├── .gitignore
├── LICENSE
├── README.md
├── config.yaml
├── configs
│   ├── ViTPose_base_coco_256x192.py
│   ├── ViTPose_base_simple_coco_256x192.py
│   ├── ViTPose_huge_coco_256x192.py
│   ├── ViTPose_huge_simple_coco_256x192.py
│   ├── ViTPose_large_coco_256x192.py
│   └── ViTPose_large_simple_coco_256x192.py
├── datasets
│   ├── COCO.py
│   └── HumanPoseEstimation.py
├── examples
│   ├── .DS_Store
│   ├── img1.jpg
│   └── img1_result.jpg
├── inference.py
├── models
│   ├── __init__.py
│   ├── backbone
│   │   └── vit.py
│   ├── head
│   │   ├── topdown_heatmap_base_head.py
│   │   └── topdown_heatmap_simple_head.py
│   ├── losses
│   │   ├── __init__.py
│   │   ├── classfication_loss.py
│   │   ├── heatmap_loss.py
│   │   ├── mesh_loss.py
│   │   ├── mse_loss.py
│   │   ├── multi_loss_factory.py
│   │   └── regression_loss.py
│   ├── model.py
│   └── optimizer.py
├── requirements.txt
├── to_onnx.ipynb
├── train.py
└── utils
    ├── __init__.py
    ├── dist_util.py
    ├── logging.py
    ├── nms
    │   ├── __init__.py
    │   ├── cpu_nms.c
    │   ├── cpu_nms.cpython-37m-x86_64-linux-gnu.so
    │   ├── cpu_nms.cpython-39-x86_64-linux-gnu.so
    │   ├── cpu_nms.pyx
    │   ├── gpu_nms.cpp
    │   ├── gpu_nms.cpython-37m-x86_64-linux-gnu.so
    │   ├── gpu_nms.cpython-39-x86_64-linux-gnu.so
    │   ├── gpu_nms.cu
    │   ├── gpu_nms.hpp
    │   ├── gpu_nms.pyx
    │   ├── nms.py
    │   ├── nms_kernel.cu
    │   ├── nms_ori.py
    │   └── setup_linux.py
    ├── post_processing
    │   ├── __init__.py
    │   ├── group.py
    │   ├── nms.py
    │   ├── one_euro_filter.py
    │   └── post_transforms.py
    ├── top_down_eval.py
    ├── train_valid_fn.py
    ├── transform.py
    ├── util.py
    └── visualization.py
/.gitignore:
--------------------------------------------------------------------------------
1 | **/*.pth
2 | **/*.pt
3 | **/__pycache__
4 | **/coco/
5 | *.onnx
6 | .DS_Store
7 | runs
8 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ViTPose (simple version w/o mmcv)
2 | An unofficial implementation of `ViTPose` [Y. Xu et al., 2022]
3 | 
4 |
5 | ## Usage
6 | ### | **Inference**
7 | ```
8 | python inference.py --image-path './examples/img1.jpg'
9 | ```
10 |
11 | ### | **Training**
12 | ```
13 | python train.py --config-path config.yaml --model-name 'b'
14 | ```
15 | - `--model-name` must be one of `b`, `l`, or `h`
16 |
17 |
18 | ## Note
19 | 1. Download the trained model (.pth)
20 | - [ViTPose-B-Multi-COCO.pth](https://1drv.ms/u/s!AimBgYV7JjTlgSrlMB093JzJtqq-?e=Jr5S3R)
21 | - [ViTPose-L-Multi-COCO.pth](https://1drv.ms/u/s!AimBgYV7JjTlgTBm3dCVmBUbHYT6?e=fHUrTq)
22 | - [ViTPose-H-Multi-COCO.pth](https://1drv.ms/u/s!AimBgYV7JjTlgS5rLeRAJiWobCdh?e=41GsDd)
23 | 2. Set the config according to the trained model
24 |    - [ViTPose-B-COCO-256x192](configs/ViTPose_base_coco_256x192.py)
25 |    - [ViTPose-L-COCO-256x192](configs/ViTPose_large_coco_256x192.py)
26 |    - [ViTPose-H-COCO-256x192](configs/ViTPose_huge_coco_256x192.py)
27 |
28 | ---
29 | ## Reference
30 | All code was written with reference to [the official ViTPose repo](https://github.com/ViTAE-Transformer/ViTPose).
--------------------------------------------------------------------------------
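For reference, the CLI flow above can also be driven programmatically through `inference.py`. A minimal sketch, assuming a downloaded ViTPose-B checkpoint saved as `vitpose-b-multi-coco.pth` (the path is an assumption):

```python
import torch

from configs.ViTPose_base_coco_256x192 import model as model_cfg, data_cfg
from inference import inference

CKPT_PATH = "vitpose-b-multi-coco.pth"  # assumption: point this at your downloaded .pth

keypoints = inference(
    img_path="examples/img1.jpg",
    img_size=data_cfg["image_size"],   # [192, 256] -> (width, height)
    model_cfg=model_cfg,
    ckpt_path=CKPT_PATH,
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    save_result=True,                  # writes examples/img1_result.jpg
)
print(keypoints.shape)                 # (1, 17, 3): one whole-image bbox, 17 COCO joints, (y, x, confidence)
```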
/config.yaml:
--------------------------------------------------------------------------------
1 | # Train config ---------------------------------------
2 | log_level: logging.INFO
3 | seed: 0
4 | deterministic: True # Whether to set deterministic options for the cuDNN backend (reproducibility)
5 | cudnn_benchmark: True # Enable the cuDNN benchmark (auto-tuner)
6 | resume_from: "/home/jaehyun/workspace/PoseEstimation/ViTPose_pytorch/vitpose-b-multi-coco.pth" # CKPT path
7 | gpu_ids: [0]
8 | launcher: 'none' # Launcher for distributed training: ['none', 'pytorch', 'slurm', 'mpi']
9 | use_amp: True
10 | validate: True
11 |
12 | autoscale_lr: True # automatically scale lr with the number of gpus
13 |
14 | dist_params:
15 | ...
16 |
--------------------------------------------------------------------------------
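`train.py` is not reproduced in this section; a minimal sketch of reading a YAML config with these keys (note that `log_level: logging.INFO` is loaded as the plain string `"logging.INFO"` and must be resolved explicitly):

```python
import logging
import yaml  # PyYAML

with open("config.yaml") as f:
    cfg = yaml.safe_load(f)

# YAML knows nothing about Python names, so map the string back to a logging level.
log_level = getattr(logging, cfg["log_level"].split(".")[-1], logging.INFO)

print(cfg["seed"], cfg["gpu_ids"], cfg["use_amp"], log_level)
```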
/configs/ViTPose_base_coco_256x192.py:
--------------------------------------------------------------------------------
1 | # _base_ = [
2 | # '../../../../_base_/default_runtime.py',
3 | # '../../../../_base_/datasets/coco.py'
4 | # ]
5 | evaluation = dict(interval=10, metric='mAP', save_best='AP')
6 |
7 | optimizer = dict(type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1,
8 | constructor='LayerDecayOptimizerConstructor',
9 | paramwise_cfg=dict(
10 | num_layers=12,
11 | layer_decay_rate=0.75,
12 | custom_keys={
13 | 'bias': dict(decay_mult=0.),
14 | 'pos_embed': dict(decay_mult=0.),
15 | 'relative_position_bias_table': dict(decay_mult=0.),
16 | 'norm': dict(decay_mult=0.)
17 | }
18 | )
19 | )
20 |
21 | optimizer_config = dict(grad_clip=dict(max_norm=1., norm_type=2))
22 |
23 | # learning policy
24 | lr_config = dict(
25 | policy='step',
26 | warmup='linear',
27 | warmup_iters=500,
28 | warmup_ratio=0.001,
29 | step=[170, 200])
30 |
31 | total_epochs = 210
32 | target_type = 'GaussianHeatmap'
33 | channel_cfg = dict(
34 | num_output_channels=17,
35 | dataset_joints=17,
36 | dataset_channel=[
37 | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
38 | ],
39 | inference_channel=[
40 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
41 | ])
42 |
43 | # model settings
44 | model = dict(
45 | type='TopDown',
46 | pretrained=None,
47 | backbone=dict(
48 | type='ViT',
49 | img_size=(256, 192),
50 | patch_size=16,
51 | embed_dim=768,
52 | depth=12,
53 | num_heads=12,
54 | ratio=1,
55 | use_checkpoint=False,
56 | mlp_ratio=4,
57 | qkv_bias=True,
58 | drop_path_rate=0.3,
59 | ),
60 | keypoint_head=dict(
61 | type='TopdownHeatmapSimpleHead',
62 | in_channels=768,
63 | num_deconv_layers=2,
64 | num_deconv_filters=(256, 256),
65 | num_deconv_kernels=(4, 4),
66 | extra=dict(final_conv_kernel=1, ),
67 | out_channels=channel_cfg['num_output_channels'],
68 | loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
69 | train_cfg=dict(),
70 | test_cfg=dict(
71 | flip_test=True,
72 | post_process='default',
73 | shift_heatmap=False,
74 | target_type=target_type,
75 | modulate_kernel=11,
76 | use_udp=True))
77 |
78 | data_cfg = dict(
79 | image_size=[192, 256],
80 | heatmap_size=[48, 64],
81 | num_output_channels=channel_cfg['num_output_channels'],
82 | num_joints=channel_cfg['dataset_joints'],
83 | dataset_channel=channel_cfg['dataset_channel'],
84 | inference_channel=channel_cfg['inference_channel'],
85 | soft_nms=False,
86 | nms_thr=1.0,
87 | oks_thr=0.9,
88 | vis_thr=0.2,
89 | use_gt_bbox=False,
90 | det_bbox_thr=0.0,
91 | bbox_file='data/coco/person_detection_results/'
92 | 'COCO_val2017_detections_AP_H_56_person.json',
93 | )
94 |
95 | train_pipeline = [
96 | dict(type='LoadImageFromFile'),
97 | dict(type='TopDownRandomFlip', flip_prob=0.5),
98 | dict(
99 | type='TopDownHalfBodyTransform',
100 | num_joints_half_body=8,
101 | prob_half_body=0.3),
102 | dict(
103 | type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
104 | dict(type='TopDownAffine', use_udp=True),
105 | dict(type='ToTensor'),
106 | dict(
107 | type='NormalizeTensor',
108 | mean=[0.485, 0.456, 0.406],
109 | std=[0.229, 0.224, 0.225]),
110 | dict(
111 | type='TopDownGenerateTarget',
112 | sigma=2,
113 | encoding='UDP',
114 | target_type=target_type),
115 | dict(
116 | type='Collect',
117 | keys=['img', 'target', 'target_weight'],
118 | meta_keys=[
119 | 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
120 | 'rotation', 'bbox_score', 'flip_pairs'
121 | ]),
122 | ]
123 |
124 | val_pipeline = [
125 | dict(type='LoadImageFromFile'),
126 | dict(type='TopDownAffine', use_udp=True),
127 | dict(type='ToTensor'),
128 | dict(
129 | type='NormalizeTensor',
130 | mean=[0.485, 0.456, 0.406],
131 | std=[0.229, 0.224, 0.225]),
132 | dict(
133 | type='Collect',
134 | keys=['img'],
135 | meta_keys=[
136 | 'image_file', 'center', 'scale', 'rotation', 'bbox_score',
137 | 'flip_pairs'
138 | ]),
139 | ]
140 |
141 | test_pipeline = val_pipeline
142 |
143 | data_root = 'datasets/coco'
144 | data = dict(
145 | samples_per_gpu=32,
146 | workers_per_gpu=4,
147 | val_dataloader=dict(samples_per_gpu=32),
148 | test_dataloader=dict(samples_per_gpu=32),
149 | train=dict(
150 | type='TopDownCocoDataset',
151 | ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
152 | img_prefix=f'{data_root}/train2017/',
153 | data_cfg=data_cfg,
154 | pipeline=train_pipeline),
155 | val=dict(
156 | type='TopDownCocoDataset',
157 | ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
158 | img_prefix=f'{data_root}/val2017/',
159 | data_cfg=data_cfg,
160 | pipeline=val_pipeline),
161 | test=dict(
162 | type='TopDownCocoDataset',
163 | ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
164 | img_prefix=f'{data_root}/val2017/',
165 | data_cfg=data_cfg,
166 | pipeline=test_pipeline)
167 | )
168 |
169 |
--------------------------------------------------------------------------------
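A quick sanity check on the spatial sizes in this config (an editor's sketch, assuming the stride-2 deconv layers of the SimpleBaselines-style head): a 256x192 input with 16x16 patches yields a 16x12 token grid, and the two deconv layers upsample it by 4x to the configured 64x48 heatmap.

```python
# Sizes implied by backbone img_size=(256, 192), patch_size=16 and num_deconv_layers=2.
img_h, img_w = 256, 192
patch = 16
feat_h, feat_w = img_h // patch, img_w // patch    # 16 x 12 token grid
for _ in range(2):                                 # each deconv assumed stride-2 (kernel 4, padding 1)
    feat_h, feat_w = feat_h * 2, feat_w * 2
assert [feat_w, feat_h] == [48, 64]                # matches data_cfg['heatmap_size'] = [48, 64]
```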
/configs/ViTPose_base_simple_coco_256x192.py:
--------------------------------------------------------------------------------
1 | _base_ = [
2 | '../../../../_base_/default_runtime.py',
3 | '../../../../_base_/datasets/coco.py'
4 | ]
5 |
6 | evaluation = dict(interval=10, metric='mAP', save_best='AP')
7 |
8 | optimizer = dict(type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1,
9 | constructor='LayerDecayOptimizerConstructor',
10 | paramwise_cfg=dict(
11 | num_layers=12,
12 | layer_decay_rate=0.75,
13 | custom_keys={
14 | 'bias': dict(decay_mult=0.),
15 | 'pos_embed': dict(decay_mult=0.),
16 | 'relative_position_bias_table': dict(decay_mult=0.),
17 | 'norm': dict(decay_mult=0.)
18 | }
19 | )
20 | )
21 |
22 | optimizer_config = dict(grad_clip=dict(max_norm=1., norm_type=2))
23 |
24 | # learning policy
25 | lr_config = dict(
26 | policy='step',
27 | warmup='linear',
28 | warmup_iters=500,
29 | warmup_ratio=0.001,
30 | step=[170, 200])
31 | total_epochs = 210
32 | target_type = 'GaussianHeatmap'
33 | channel_cfg = dict(
34 | num_output_channels=17,
35 | dataset_joints=17,
36 | dataset_channel=[
37 | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
38 | ],
39 | inference_channel=[
40 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
41 | ])
42 |
43 | # model settings
44 | model = dict(
45 | type='TopDown',
46 | pretrained=None,
47 | backbone=dict(
48 | type='ViT',
49 | img_size=(256, 192),
50 | patch_size=16,
51 | embed_dim=768,
52 | depth=12,
53 | num_heads=12,
54 | ratio=1,
55 | use_checkpoint=False,
56 | mlp_ratio=4,
57 | qkv_bias=True,
58 | drop_path_rate=0.3,
59 | ),
60 | keypoint_head=dict(
61 | type='TopdownHeatmapSimpleHead',
62 | in_channels=768,
63 | num_deconv_layers=0,
64 | num_deconv_filters=[],
65 | num_deconv_kernels=[],
66 | upsample=4,
67 | extra=dict(final_conv_kernel=3, ),
68 | out_channels=channel_cfg['num_output_channels'],
69 | loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
70 | train_cfg=dict(),
71 | test_cfg=dict(
72 | flip_test=True,
73 | post_process='default',
74 | shift_heatmap=False,
75 | target_type=target_type,
76 | modulate_kernel=11,
77 | use_udp=True))
78 |
79 | data_cfg = dict(
80 | image_size=[192, 256],
81 | heatmap_size=[48, 64],
82 | num_output_channels=channel_cfg['num_output_channels'],
83 | num_joints=channel_cfg['dataset_joints'],
84 | dataset_channel=channel_cfg['dataset_channel'],
85 | inference_channel=channel_cfg['inference_channel'],
86 | soft_nms=False,
87 | nms_thr=1.0,
88 | oks_thr=0.9,
89 | vis_thr=0.2,
90 | use_gt_bbox=False,
91 | det_bbox_thr=0.0,
92 | bbox_file='data/coco/person_detection_results/'
93 | 'COCO_val2017_detections_AP_H_56_person.json',
94 | )
95 |
96 | train_pipeline = [
97 | dict(type='LoadImageFromFile'),
98 | dict(type='TopDownRandomFlip', flip_prob=0.5),
99 | dict(
100 | type='TopDownHalfBodyTransform',
101 | num_joints_half_body=8,
102 | prob_half_body=0.3),
103 | dict(
104 | type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
105 | dict(type='TopDownAffine', use_udp=True),
106 | dict(type='ToTensor'),
107 | dict(
108 | type='NormalizeTensor',
109 | mean=[0.485, 0.456, 0.406],
110 | std=[0.229, 0.224, 0.225]),
111 | dict(
112 | type='TopDownGenerateTarget',
113 | sigma=2,
114 | encoding='UDP',
115 | target_type=target_type),
116 | dict(
117 | type='Collect',
118 | keys=['img', 'target', 'target_weight'],
119 | meta_keys=[
120 | 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
121 | 'rotation', 'bbox_score', 'flip_pairs'
122 | ]),
123 | ]
124 |
125 | val_pipeline = [
126 | dict(type='LoadImageFromFile'),
127 | dict(type='TopDownAffine', use_udp=True),
128 | dict(type='ToTensor'),
129 | dict(
130 | type='NormalizeTensor',
131 | mean=[0.485, 0.456, 0.406],
132 | std=[0.229, 0.224, 0.225]),
133 | dict(
134 | type='Collect',
135 | keys=['img'],
136 | meta_keys=[
137 | 'image_file', 'center', 'scale', 'rotation', 'bbox_score',
138 | 'flip_pairs'
139 | ]),
140 | ]
141 |
142 | test_pipeline = val_pipeline
143 |
144 | data_root = 'datasets/coco'
145 | data = dict(
146 | samples_per_gpu=64,
147 | workers_per_gpu=4,
148 | val_dataloader=dict(samples_per_gpu=32),
149 | test_dataloader=dict(samples_per_gpu=32),
150 | train=dict(
151 | type='TopDownCocoDataset',
152 | ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
153 | img_prefix=f'{data_root}/train2017/',
154 | data_cfg=data_cfg,
155 | pipeline=train_pipeline,
156 | dataset_info={{_base_.dataset_info}}),
157 | val=dict(
158 | type='TopDownCocoDataset',
159 | ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
160 | img_prefix=f'{data_root}/val2017/',
161 | data_cfg=data_cfg,
162 | pipeline=val_pipeline,
163 | dataset_info={{_base_.dataset_info}}),
164 | test=dict(
165 | type='TopDownCocoDataset',
166 | ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
167 | img_prefix=f'{data_root}/val2017/',
168 | data_cfg=data_cfg,
169 | pipeline=test_pipeline,
170 | dataset_info={{_base_.dataset_info}}),
171 | )
172 |
173 |
--------------------------------------------------------------------------------
/configs/ViTPose_huge_coco_256x192.py:
--------------------------------------------------------------------------------
1 | # _base_ = [
2 | # '../../../../_base_/default_runtime.py',
3 | # '../../../../_base_/datasets/coco.py'
4 | # ]
5 | evaluation = dict(interval=10, metric='mAP', save_best='AP')
6 |
7 | optimizer = dict(type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1,
8 | constructor='LayerDecayOptimizerConstructor',
9 | paramwise_cfg=dict(
10 | num_layers=32,
11 | layer_decay_rate=0.85,
12 | custom_keys={
13 | 'bias': dict(decay_mult=0.),
14 | 'pos_embed': dict(decay_mult=0.),
15 | 'relative_position_bias_table': dict(decay_mult=0.),
16 | 'norm': dict(decay_mult=0.)
17 | }
18 | )
19 | )
20 |
21 | optimizer_config = dict(grad_clip=dict(max_norm=1., norm_type=2))
22 |
23 | # learning policy
24 | lr_config = dict(
25 | policy='step',
26 | warmup='linear',
27 | warmup_iters=500,
28 | warmup_ratio=0.001,
29 | step=[170, 200])
30 | total_epochs = 210
31 | target_type = 'GaussianHeatmap'
32 | channel_cfg = dict(
33 | num_output_channels=17,
34 | dataset_joints=17,
35 | dataset_channel=[
36 | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
37 | ],
38 | inference_channel=[
39 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
40 | ])
41 |
42 | # model settings
43 | model = dict(
44 | type='TopDown',
45 | pretrained=None,
46 | backbone=dict(
47 | type='ViT',
48 | img_size=(256, 192),
49 | patch_size=16,
50 | embed_dim=1280,
51 | depth=32,
52 | num_heads=16,
53 | ratio=1,
54 | use_checkpoint=False,
55 | mlp_ratio=4,
56 | qkv_bias=True,
57 | drop_path_rate=0.55,
58 | ),
59 | keypoint_head=dict(
60 | type='TopdownHeatmapSimpleHead',
61 | in_channels=1280,
62 | num_deconv_layers=2,
63 | num_deconv_filters=(256, 256),
64 | num_deconv_kernels=(4, 4),
65 | extra=dict(final_conv_kernel=1, ),
66 | out_channels=channel_cfg['num_output_channels'],
67 | loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
68 | train_cfg=dict(),
69 | test_cfg=dict(
70 | flip_test=True,
71 | post_process='default',
72 | shift_heatmap=False,
73 | target_type=target_type,
74 | modulate_kernel=11,
75 | use_udp=True))
76 |
77 | data_cfg = dict(
78 | image_size=[192, 256],
79 | heatmap_size=[48, 64],
80 | num_output_channels=channel_cfg['num_output_channels'],
81 | num_joints=channel_cfg['dataset_joints'],
82 | dataset_channel=channel_cfg['dataset_channel'],
83 | inference_channel=channel_cfg['inference_channel'],
84 | soft_nms=False,
85 | nms_thr=1.0,
86 | oks_thr=0.9,
87 | vis_thr=0.2,
88 | use_gt_bbox=False,
89 | det_bbox_thr=0.0,
90 | bbox_file='data/coco/person_detection_results/'
91 | 'COCO_val2017_detections_AP_H_56_person.json',
92 | )
93 |
94 | train_pipeline = [
95 | dict(type='LoadImageFromFile'),
96 | dict(type='TopDownRandomFlip', flip_prob=0.5),
97 | dict(
98 | type='TopDownHalfBodyTransform',
99 | num_joints_half_body=8,
100 | prob_half_body=0.3),
101 | dict(
102 | type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
103 | dict(type='TopDownAffine', use_udp=True),
104 | dict(type='ToTensor'),
105 | dict(
106 | type='NormalizeTensor',
107 | mean=[0.485, 0.456, 0.406],
108 | std=[0.229, 0.224, 0.225]),
109 | dict(
110 | type='TopDownGenerateTarget',
111 | sigma=2,
112 | encoding='UDP',
113 | target_type=target_type),
114 | dict(
115 | type='Collect',
116 | keys=['img', 'target', 'target_weight'],
117 | meta_keys=[
118 | 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
119 | 'rotation', 'bbox_score', 'flip_pairs'
120 | ]),
121 | ]
122 |
123 | val_pipeline = [
124 | dict(type='LoadImageFromFile'),
125 | dict(type='TopDownAffine', use_udp=True),
126 | dict(type='ToTensor'),
127 | dict(
128 | type='NormalizeTensor',
129 | mean=[0.485, 0.456, 0.406],
130 | std=[0.229, 0.224, 0.225]),
131 | dict(
132 | type='Collect',
133 | keys=['img'],
134 | meta_keys=[
135 | 'image_file', 'center', 'scale', 'rotation', 'bbox_score',
136 | 'flip_pairs'
137 | ]),
138 | ]
139 |
140 | test_pipeline = val_pipeline
141 |
142 | data_root = 'datasets/coco'
143 | data = dict(
144 | samples_per_gpu=64,
145 | workers_per_gpu=4,
146 | val_dataloader=dict(samples_per_gpu=32),
147 | test_dataloader=dict(samples_per_gpu=32),
148 | train=dict(
149 | type='TopDownCocoDataset',
150 | ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
151 | img_prefix=f'{data_root}/train2017/',
152 | data_cfg=data_cfg,
153 | pipeline=train_pipeline),
154 | val=dict(
155 | type='TopDownCocoDataset',
156 | ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
157 | img_prefix=f'{data_root}/val2017/',
158 | data_cfg=data_cfg,
159 | pipeline=val_pipeline),
160 | test=dict(
161 | type='TopDownCocoDataset',
162 | ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
163 | img_prefix=f'{data_root}/val2017/',
164 | data_cfg=data_cfg,
165 | pipeline=test_pipeline)
166 | )
167 |
168 |
--------------------------------------------------------------------------------
/configs/ViTPose_huge_simple_coco_256x192.py:
--------------------------------------------------------------------------------
1 | _base_ = [
2 | '../../../../_base_/default_runtime.py',
3 | '../../../../_base_/datasets/coco.py'
4 | ]
5 | evaluation = dict(interval=10, metric='mAP', save_best='AP')
6 |
7 | optimizer = dict(type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1,
8 | constructor='LayerDecayOptimizerConstructor',
9 | paramwise_cfg=dict(
10 | num_layers=32,
11 | layer_decay_rate=0.85,
12 | custom_keys={
13 | 'bias': dict(decay_mult=0.),
14 | 'pos_embed': dict(decay_mult=0.),
15 | 'relative_position_bias_table': dict(decay_mult=0.),
16 | 'norm': dict(decay_mult=0.)
17 | }
18 | )
19 | )
20 |
21 | optimizer_config = dict(grad_clip=dict(max_norm=1., norm_type=2))
22 |
23 | # learning policy
24 | lr_config = dict(
25 | policy='step',
26 | warmup='linear',
27 | warmup_iters=500,
28 | warmup_ratio=0.001,
29 | step=[170, 200])
30 | total_epochs = 210
31 | target_type = 'GaussianHeatmap'
32 | channel_cfg = dict(
33 | num_output_channels=17,
34 | dataset_joints=17,
35 | dataset_channel=[
36 | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
37 | ],
38 | inference_channel=[
39 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
40 | ])
41 |
42 | # model settings
43 | model = dict(
44 | type='TopDown',
45 | pretrained=None,
46 | backbone=dict(
47 | type='ViT',
48 | img_size=(256, 192),
49 | patch_size=16,
50 | embed_dim=1280,
51 | depth=32,
52 | num_heads=16,
53 | ratio=1,
54 | use_checkpoint=False,
55 | mlp_ratio=4,
56 | qkv_bias=True,
57 | drop_path_rate=0.55,
58 | ),
59 | keypoint_head=dict(
60 | type='TopdownHeatmapSimpleHead',
61 | in_channels=1280,
62 | num_deconv_layers=0,
63 | num_deconv_filters=[],
64 | num_deconv_kernels=[],
65 | upsample=4,
66 | extra=dict(final_conv_kernel=3, ),
67 | out_channels=channel_cfg['num_output_channels'],
68 | loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
69 | train_cfg=dict(),
70 | test_cfg=dict(
71 | flip_test=True,
72 | post_process='default',
73 | shift_heatmap=False,
74 | target_type=target_type,
75 | modulate_kernel=11,
76 | use_udp=True))
77 |
78 | data_cfg = dict(
79 | image_size=[192, 256],
80 | heatmap_size=[48, 64],
81 | num_output_channels=channel_cfg['num_output_channels'],
82 | num_joints=channel_cfg['dataset_joints'],
83 | dataset_channel=channel_cfg['dataset_channel'],
84 | inference_channel=channel_cfg['inference_channel'],
85 | soft_nms=False,
86 | nms_thr=1.0,
87 | oks_thr=0.9,
88 | vis_thr=0.2,
89 | use_gt_bbox=False,
90 | det_bbox_thr=0.0,
91 | bbox_file='data/coco/person_detection_results/'
92 | 'COCO_val2017_detections_AP_H_56_person.json',
93 | )
94 |
95 | train_pipeline = [
96 | dict(type='LoadImageFromFile'),
97 | dict(type='TopDownRandomFlip', flip_prob=0.5),
98 | dict(
99 | type='TopDownHalfBodyTransform',
100 | num_joints_half_body=8,
101 | prob_half_body=0.3),
102 | dict(
103 | type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
104 | dict(type='TopDownAffine', use_udp=True),
105 | dict(type='ToTensor'),
106 | dict(
107 | type='NormalizeTensor',
108 | mean=[0.485, 0.456, 0.406],
109 | std=[0.229, 0.224, 0.225]),
110 | dict(
111 | type='TopDownGenerateTarget',
112 | sigma=2,
113 | encoding='UDP',
114 | target_type=target_type),
115 | dict(
116 | type='Collect',
117 | keys=['img', 'target', 'target_weight'],
118 | meta_keys=[
119 | 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
120 | 'rotation', 'bbox_score', 'flip_pairs'
121 | ]),
122 | ]
123 |
124 | val_pipeline = [
125 | dict(type='LoadImageFromFile'),
126 | dict(type='TopDownAffine', use_udp=True),
127 | dict(type='ToTensor'),
128 | dict(
129 | type='NormalizeTensor',
130 | mean=[0.485, 0.456, 0.406],
131 | std=[0.229, 0.224, 0.225]),
132 | dict(
133 | type='Collect',
134 | keys=['img'],
135 | meta_keys=[
136 | 'image_file', 'center', 'scale', 'rotation', 'bbox_score',
137 | 'flip_pairs'
138 | ]),
139 | ]
140 |
141 | test_pipeline = val_pipeline
142 |
143 | data_root = 'datasets/coco'
144 | data = dict(
145 | samples_per_gpu=64,
146 | workers_per_gpu=4,
147 | val_dataloader=dict(samples_per_gpu=32),
148 | test_dataloader=dict(samples_per_gpu=32),
149 | train=dict(
150 | type='TopDownCocoDataset',
151 | ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
152 | img_prefix=f'{data_root}/train2017/',
153 | data_cfg=data_cfg,
154 | pipeline=train_pipeline,
155 | dataset_info={{_base_.dataset_info}}),
156 | val=dict(
157 | type='TopDownCocoDataset',
158 | ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
159 | img_prefix=f'{data_root}/val2017/',
160 | data_cfg=data_cfg,
161 | pipeline=val_pipeline,
162 | dataset_info={{_base_.dataset_info}}),
163 | test=dict(
164 | type='TopDownCocoDataset',
165 | ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
166 | img_prefix=f'{data_root}/val2017/',
167 | data_cfg=data_cfg,
168 | pipeline=test_pipeline,
169 | dataset_info={{_base_.dataset_info}}),
170 | )
171 |
172 |
--------------------------------------------------------------------------------
/configs/ViTPose_large_coco_256x192.py:
--------------------------------------------------------------------------------
1 | # _base_ = [
2 | # '../../../../_base_/default_runtime.py',
3 | # '../../../../_base_/datasets/coco.py'
4 | # ]
5 | evaluation = dict(interval=10, metric='mAP', save_best='AP')
6 |
7 | optimizer = dict(type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1,
8 | constructor='LayerDecayOptimizerConstructor',
9 | paramwise_cfg=dict(
10 | num_layers=24,
11 | layer_decay_rate=0.8,
12 | custom_keys={
13 | 'bias': dict(decay_mult=0.),
14 | 'pos_embed': dict(decay_mult=0.),
15 | 'relative_position_bias_table': dict(decay_mult=0.),
16 | 'norm': dict(decay_mult=0.)
17 | }
18 | )
19 | )
20 |
21 | optimizer_config = dict(grad_clip=dict(max_norm=1., norm_type=2))
22 |
23 | # learning policy
24 | lr_config = dict(
25 | policy='step',
26 | warmup='linear',
27 | warmup_iters=500,
28 | warmup_ratio=0.001,
29 | step=[170, 200])
30 | total_epochs = 210
31 | target_type = 'GaussianHeatmap'
32 | channel_cfg = dict(
33 | num_output_channels=17,
34 | dataset_joints=17,
35 | dataset_channel=[
36 | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
37 | ],
38 | inference_channel=[
39 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
40 | ])
41 |
42 | # model settings
43 | model = dict(
44 | type='TopDown',
45 | pretrained=None,
46 | backbone=dict(
47 | type='ViT',
48 | img_size=(256, 192),
49 | patch_size=16,
50 | embed_dim=1024,
51 | depth=24,
52 | num_heads=16,
53 | ratio=1,
54 | use_checkpoint=False,
55 | mlp_ratio=4,
56 | qkv_bias=True,
57 | drop_path_rate=0.5,
58 | ),
59 | keypoint_head=dict(
60 | type='TopdownHeatmapSimpleHead',
61 | in_channels=1024,
62 | num_deconv_layers=2,
63 | num_deconv_filters=(256, 256),
64 | num_deconv_kernels=(4, 4),
65 | extra=dict(final_conv_kernel=1, ),
66 | out_channels=channel_cfg['num_output_channels'],
67 | loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
68 | train_cfg=dict(),
69 | test_cfg=dict(
70 | flip_test=True,
71 | post_process='default',
72 | shift_heatmap=False,
73 | target_type=target_type,
74 | modulate_kernel=11,
75 | use_udp=True))
76 |
77 | data_cfg = dict(
78 | image_size=[192, 256],
79 | heatmap_size=[48, 64],
80 | num_output_channels=channel_cfg['num_output_channels'],
81 | num_joints=channel_cfg['dataset_joints'],
82 | dataset_channel=channel_cfg['dataset_channel'],
83 | inference_channel=channel_cfg['inference_channel'],
84 | soft_nms=False,
85 | nms_thr=1.0,
86 | oks_thr=0.9,
87 | vis_thr=0.2,
88 | use_gt_bbox=False,
89 | det_bbox_thr=0.0,
90 | bbox_file='data/coco/person_detection_results/'
91 | 'COCO_val2017_detections_AP_H_56_person.json',
92 | )
93 |
94 | train_pipeline = [
95 | dict(type='LoadImageFromFile'),
96 | dict(type='TopDownRandomFlip', flip_prob=0.5),
97 | dict(
98 | type='TopDownHalfBodyTransform',
99 | num_joints_half_body=8,
100 | prob_half_body=0.3),
101 | dict(
102 | type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
103 | dict(type='TopDownAffine', use_udp=True),
104 | dict(type='ToTensor'),
105 | dict(
106 | type='NormalizeTensor',
107 | mean=[0.485, 0.456, 0.406],
108 | std=[0.229, 0.224, 0.225]),
109 | dict(
110 | type='TopDownGenerateTarget',
111 | sigma=2,
112 | encoding='UDP',
113 | target_type=target_type),
114 | dict(
115 | type='Collect',
116 | keys=['img', 'target', 'target_weight'],
117 | meta_keys=[
118 | 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
119 | 'rotation', 'bbox_score', 'flip_pairs'
120 | ]),
121 | ]
122 |
123 | val_pipeline = [
124 | dict(type='LoadImageFromFile'),
125 | dict(type='TopDownAffine', use_udp=True),
126 | dict(type='ToTensor'),
127 | dict(
128 | type='NormalizeTensor',
129 | mean=[0.485, 0.456, 0.406],
130 | std=[0.229, 0.224, 0.225]),
131 | dict(
132 | type='Collect',
133 | keys=['img'],
134 | meta_keys=[
135 | 'image_file', 'center', 'scale', 'rotation', 'bbox_score',
136 | 'flip_pairs'
137 | ]),
138 | ]
139 |
140 | test_pipeline = val_pipeline
141 |
142 | data_root = 'datasets/coco'
143 | data = dict(
144 | samples_per_gpu=64,
145 | workers_per_gpu=4,
146 | val_dataloader=dict(samples_per_gpu=32),
147 | test_dataloader=dict(samples_per_gpu=32),
148 | train=dict(
149 | type='TopDownCocoDataset',
150 | ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
151 | img_prefix=f'{data_root}/train2017/',
152 | data_cfg=data_cfg,
153 | pipeline=train_pipeline),
154 | val=dict(
155 | type='TopDownCocoDataset',
156 | ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
157 | img_prefix=f'{data_root}/val2017/',
158 | data_cfg=data_cfg,
159 | pipeline=val_pipeline),
160 | test=dict(
161 | type='TopDownCocoDataset',
162 | ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
163 | img_prefix=f'{data_root}/val2017/',
164 | data_cfg=data_cfg,
165 | pipeline=test_pipeline)
166 | )
167 |
168 |
--------------------------------------------------------------------------------
/configs/ViTPose_large_simple_coco_256x192.py:
--------------------------------------------------------------------------------
1 | _base_ = [
2 | '../../../../_base_/default_runtime.py',
3 | '../../../../_base_/datasets/coco.py'
4 | ]
5 | evaluation = dict(interval=10, metric='mAP', save_best='AP')
6 |
7 | optimizer = dict(type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1,
8 | constructor='LayerDecayOptimizerConstructor',
9 | paramwise_cfg=dict(
10 | num_layers=24,
11 | layer_decay_rate=0.8,
12 | custom_keys={
13 | 'bias': dict(decay_mult=0.),
14 | 'pos_embed': dict(decay_mult=0.),
15 | 'relative_position_bias_table': dict(decay_mult=0.),
16 | 'norm': dict(decay_mult=0.)
17 | }
18 | )
19 | )
20 |
21 | optimizer_config = dict(grad_clip=dict(max_norm=1., norm_type=2))
22 |
23 | # learning policy
24 | lr_config = dict(
25 | policy='step',
26 | warmup='linear',
27 | warmup_iters=500,
28 | warmup_ratio=0.001,
29 | step=[170, 200])
30 | total_epochs = 210
31 | target_type = 'GaussianHeatmap'
32 | channel_cfg = dict(
33 | num_output_channels=17,
34 | dataset_joints=17,
35 | dataset_channel=[
36 | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
37 | ],
38 | inference_channel=[
39 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
40 | ])
41 |
42 | # model settings
43 | model = dict(
44 | type='TopDown',
45 | pretrained=None,
46 | backbone=dict(
47 | type='ViT',
48 | img_size=(256, 192),
49 | patch_size=16,
50 | embed_dim=1024,
51 | depth=24,
52 | num_heads=16,
53 | ratio=1,
54 | use_checkpoint=False,
55 | mlp_ratio=4,
56 | qkv_bias=True,
57 | drop_path_rate=0.5,
58 | ),
59 | keypoint_head=dict(
60 | type='TopdownHeatmapSimpleHead',
61 | in_channels=1024,
62 | num_deconv_layers=0,
63 | num_deconv_filters=[],
64 | num_deconv_kernels=[],
65 | upsample=4,
66 | extra=dict(final_conv_kernel=3, ),
67 | out_channels=channel_cfg['num_output_channels'],
68 | loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
69 | train_cfg=dict(),
70 | test_cfg=dict(
71 | flip_test=True,
72 | post_process='default',
73 | shift_heatmap=False,
74 | target_type=target_type,
75 | modulate_kernel=11,
76 | use_udp=True))
77 |
78 | data_cfg = dict(
79 | image_size=[192, 256],
80 | heatmap_size=[48, 64],
81 | num_output_channels=channel_cfg['num_output_channels'],
82 | num_joints=channel_cfg['dataset_joints'],
83 | dataset_channel=channel_cfg['dataset_channel'],
84 | inference_channel=channel_cfg['inference_channel'],
85 | soft_nms=False,
86 | nms_thr=1.0,
87 | oks_thr=0.9,
88 | vis_thr=0.2,
89 | use_gt_bbox=False,
90 | det_bbox_thr=0.0,
91 | bbox_file='data/coco/person_detection_results/'
92 | 'COCO_val2017_detections_AP_H_56_person.json',
93 | )
94 |
95 | train_pipeline = [
96 | dict(type='LoadImageFromFile'),
97 | dict(type='TopDownRandomFlip', flip_prob=0.5),
98 | dict(
99 | type='TopDownHalfBodyTransform',
100 | num_joints_half_body=8,
101 | prob_half_body=0.3),
102 | dict(
103 | type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
104 | dict(type='TopDownAffine', use_udp=True),
105 | dict(type='ToTensor'),
106 | dict(
107 | type='NormalizeTensor',
108 | mean=[0.485, 0.456, 0.406],
109 | std=[0.229, 0.224, 0.225]),
110 | dict(
111 | type='TopDownGenerateTarget',
112 | sigma=2,
113 | encoding='UDP',
114 | target_type=target_type),
115 | dict(
116 | type='Collect',
117 | keys=['img', 'target', 'target_weight'],
118 | meta_keys=[
119 | 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
120 | 'rotation', 'bbox_score', 'flip_pairs'
121 | ]),
122 | ]
123 |
124 | val_pipeline = [
125 | dict(type='LoadImageFromFile'),
126 | dict(type='TopDownAffine', use_udp=True),
127 | dict(type='ToTensor'),
128 | dict(
129 | type='NormalizeTensor',
130 | mean=[0.485, 0.456, 0.406],
131 | std=[0.229, 0.224, 0.225]),
132 | dict(
133 | type='Collect',
134 | keys=['img'],
135 | meta_keys=[
136 | 'image_file', 'center', 'scale', 'rotation', 'bbox_score',
137 | 'flip_pairs'
138 | ]),
139 | ]
140 |
141 | test_pipeline = val_pipeline
142 |
143 | data_root = 'datasets/coco'
144 | data = dict(
145 | samples_per_gpu=64,
146 | workers_per_gpu=4,
147 | val_dataloader=dict(samples_per_gpu=32),
148 | test_dataloader=dict(samples_per_gpu=32),
149 | train=dict(
150 | type='TopDownCocoDataset',
151 | ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
152 | img_prefix=f'{data_root}/train2017/',
153 | data_cfg=data_cfg,
154 | pipeline=train_pipeline,
155 | dataset_info={{_base_.dataset_info}}),
156 | val=dict(
157 | type='TopDownCocoDataset',
158 | ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
159 | img_prefix=f'{data_root}/val2017/',
160 | data_cfg=data_cfg,
161 | pipeline=val_pipeline,
162 | dataset_info={{_base_.dataset_info}}),
163 | test=dict(
164 | type='TopDownCocoDataset',
165 | ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
166 | img_prefix=f'{data_root}/val2017/',
167 | data_cfg=data_cfg,
168 | pipeline=test_pipeline,
169 | dataset_info={{_base_.dataset_info}}),
170 | )
171 |
172 |
--------------------------------------------------------------------------------
/datasets/HumanPoseEstimation.py:
--------------------------------------------------------------------------------
1 | from torch.utils.data import Dataset
2 |
3 |
4 | class HumanPoseEstimationDataset(Dataset):
5 | """
6 | HumanPoseEstimationDataset class.
7 |
8 | Generic class for HPE datasets.
9 | """
10 | def __init__(self):
11 | pass
12 |
13 | def __len__(self):
14 | pass
15 |
16 | def __getitem__(self, item):
17 | pass
--------------------------------------------------------------------------------
/examples/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaehyunnn/ViTPose_pytorch/1bd3cc3982d5b976622df54bcdc2106bb16e6d16/examples/.DS_Store
--------------------------------------------------------------------------------
/examples/img1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaehyunnn/ViTPose_pytorch/1bd3cc3982d5b976622df54bcdc2106bb16e6d16/examples/img1.jpg
--------------------------------------------------------------------------------
/examples/img1_result.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaehyunnn/ViTPose_pytorch/1bd3cc3982d5b976622df54bcdc2106bb16e6d16/examples/img1_result.jpg
--------------------------------------------------------------------------------
/inference.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os.path as osp
3 |
4 | import torch
5 | from torch import Tensor
6 |
7 | from pathlib import Path
8 | import cv2
9 | import numpy as np
10 |
11 |
12 | from time import time
13 | from PIL import Image
14 | from torchvision.transforms import transforms
15 |
16 | from models.model import ViTPose
17 | from utils.visualization import draw_points_and_skeleton, joints_dict
18 | from utils.dist_util import get_dist_info, init_dist
19 | from utils.top_down_eval import keypoints_from_heatmaps
20 |
21 | __all__ = ['inference']
22 |
23 |
24 | @torch.no_grad()
25 | def inference(img_path: Path, img_size: tuple[int, int],
26 | model_cfg: dict, ckpt_path: Path, device: torch.device, save_result: bool=True) -> np.ndarray:
27 |
28 | # Prepare model
29 | vit_pose = ViTPose(model_cfg)
30 |
31 |
32 | ckpt = torch.load(ckpt_path, map_location='cpu')
33 | if 'state_dict' in ckpt:
34 | vit_pose.load_state_dict(ckpt['state_dict'])
35 | else:
36 | vit_pose.load_state_dict(ckpt)
37 | vit_pose.to(device)
38 | print(f">>> Model loaded: {ckpt_path}")
39 |
40 | # Prepare input data
41 | img = Image.open(img_path)
42 | org_w, org_h = img.size
43 | print(f">>> Original image size: {org_h} X {org_w} (height X width)")
44 | print(f">>> Resized image size: {img_size[1]} X {img_size[0]} (height X width)")
45 | print(f">>> Scale change: {org_h/img_size[1]}, {org_w/img_size[0]}")
46 | img_tensor = transforms.Compose(
47 | [transforms.Resize((img_size[1], img_size[0])),
48 | transforms.ToTensor()]
49 | )(img).unsqueeze(0).to(device)
50 |
51 |
52 | # Feed to model
53 | tic = time()
54 | heatmaps = vit_pose(img_tensor).detach().cpu().numpy() # N, 17, h/4, w/4
55 | elapsed_time = time()-tic
56 | print(f">>> Output size: {heatmaps.shape} ---> {elapsed_time:.4f} sec. elapsed [{elapsed_time**-1: .1f} fps]\n")
57 |
58 | # points = heatmap2coords(heatmaps=heatmaps, original_resolution=(org_h, org_w))
59 | points, prob = keypoints_from_heatmaps(heatmaps=heatmaps, center=np.array([[org_w//2, org_h//2]]), scale=np.array([[org_w, org_h]]),
60 | unbiased=True, use_udp=True)
61 | points = np.concatenate([points[:, :, ::-1], prob], axis=2)
62 |
63 | # Visualization
64 | if save_result:
65 | for pid, point in enumerate(points):
66 | img = np.array(img)[:, :, ::-1] # RGB to BGR for cv2 modules
67 | img = draw_points_and_skeleton(img.copy(), point, joints_dict()['coco']['skeleton'], person_index=pid,
68 | points_color_palette='gist_rainbow', skeleton_color_palette='jet',
69 | points_palette_samples=10, confidence_threshold=0.4)
70 | save_name = img_path.replace(".jpg", "_result.jpg")
71 | cv2.imwrite(save_name, img)
72 |
73 | return points
74 |
75 |
76 | if __name__ == "__main__":
77 | from configs.ViTPose_base_coco_256x192 import model as model_cfg
78 | from configs.ViTPose_base_coco_256x192 import data_cfg
79 |
80 | parser = argparse.ArgumentParser()
81 | parser.add_argument('--image-path', nargs='+', type=str, default='examples/img1.jpg', help='image path(s)')
82 | args = parser.parse_args()
83 |
84 | CUR_DIR = osp.dirname(__file__)
85 | # CKPT_PATH = f"{CUR_DIR}/vitpose-b-multi-coco.pth"
86 | CKPT_PATH = "/home/jaehyun/workspace/PoseEstimation/ViTPose_pytorch/runs/train/002/epoch010.pth"
87 |
88 | img_size = data_cfg['image_size']
89 | if type(args.image_path) != list:
90 | args.image_path = [args.image_path]
91 | for img_path in args.image_path:
92 | print(img_path)
93 | keypoints = inference(img_path=img_path, img_size=img_size, model_cfg=model_cfg, ckpt_path=CKPT_PATH,
94 | device=torch.device("cuda") if torch.cuda.is_available() else torch.device('cpu'),
95 | save_result=True)
--------------------------------------------------------------------------------
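Because of the `points[:, :, ::-1]` flip above, the array returned by `inference` holds (y, x, confidence) per joint. A small sketch of consuming it, assuming the standard COCO-17 keypoint order:

```python
# keypoints: the (num_people, 17, 3) array returned by inference() above.
COCO_KEYPOINTS = [
    "nose", "left_eye", "right_eye", "left_ear", "right_ear",
    "left_shoulder", "right_shoulder", "left_elbow", "right_elbow",
    "left_wrist", "right_wrist", "left_hip", "right_hip",
    "left_knee", "right_knee", "left_ankle", "right_ankle",
]

for person in keypoints:
    for name, (y, x, conf) in zip(COCO_KEYPOINTS, person):
        if conf > 0.4:  # same confidence threshold used for drawing
            print(f"{name}: ({x:.1f}, {y:.1f}) conf={conf:.2f}")
```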
/models/__init__.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os.path as osp
3 |
4 | sys.path.append(osp.dirname(osp.dirname(__file__)))
5 |
6 | from utils.util import load_checkpoint, resize, constant_init, normal_init
7 | from utils.top_down_eval import keypoints_from_heatmaps, pose_pck_accuracy
8 | from utils.post_processing import *
--------------------------------------------------------------------------------
/models/head/topdown_heatmap_base_head.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from abc import ABCMeta, abstractmethod
3 |
4 | import numpy as np
5 | import torch.nn as nn
6 |
7 | from .. import keypoints_from_heatmaps
8 |
9 |
10 | class TopdownHeatmapBaseHead(nn.Module):
11 | """Base class for top-down heatmap heads.
12 |
13 | All top-down heatmap heads should subclass it.
14 | All subclass should overwrite:
15 |
16 |     Methods:`get_loss`, to calculate the loss.
17 |     Methods:`get_accuracy`, to calculate the accuracy.
18 |     Methods:`forward`, to run the forward pass.
19 |     Methods:`inference_model`, to run inference.
20 | """
21 |
22 | __metaclass__ = ABCMeta
23 |
24 | @abstractmethod
25 | def get_loss(self, **kwargs):
26 | """Gets the loss."""
27 |
28 | @abstractmethod
29 | def get_accuracy(self, **kwargs):
30 | """Gets the accuracy."""
31 |
32 | @abstractmethod
33 | def forward(self, **kwargs):
34 | """Forward function."""
35 |
36 | @abstractmethod
37 | def inference_model(self, **kwargs):
38 | """Inference function."""
39 |
40 | def decode(self, img_metas, output, **kwargs):
41 | """Decode keypoints from heatmaps.
42 |
43 | Args:
44 | img_metas (list(dict)): Information about data augmentation
45 | By default this includes:
46 |
47 | - "image_file: path to the image file
48 | - "center": center of the bbox
49 | - "scale": scale of the bbox
50 | - "rotation": rotation of the bbox
51 | - "bbox_score": score of bbox
52 | output (np.ndarray[N, K, H, W]): model predicted heatmaps.
53 | """
54 | batch_size = len(img_metas)
55 |
56 | if 'bbox_id' in img_metas[0]:
57 | bbox_ids = []
58 | else:
59 | bbox_ids = None
60 |
61 | c = np.zeros((batch_size, 2), dtype=np.float32)
62 | s = np.zeros((batch_size, 2), dtype=np.float32)
63 | image_paths = []
64 | score = np.ones(batch_size)
65 | for i in range(batch_size):
66 | c[i, :] = img_metas[i]['center']
67 | s[i, :] = img_metas[i]['scale']
68 | image_paths.append(img_metas[i]['image_file'])
69 |
70 | if 'bbox_score' in img_metas[i]:
71 | score[i] = np.array(img_metas[i]['bbox_score']).reshape(-1)
72 | if bbox_ids is not None:
73 | bbox_ids.append(img_metas[i]['bbox_id'])
74 |
75 | preds, maxvals = keypoints_from_heatmaps(
76 | output,
77 | c,
78 | s,
79 | unbiased=self.test_cfg.get('unbiased_decoding', False),
80 | post_process=self.test_cfg.get('post_process', 'default'),
81 | kernel=self.test_cfg.get('modulate_kernel', 11),
82 | valid_radius_factor=self.test_cfg.get('valid_radius_factor',
83 | 0.0546875),
84 | use_udp=self.test_cfg.get('use_udp', False),
85 | target_type=self.test_cfg.get('target_type', 'GaussianHeatmap'))
86 |
87 | all_preds = np.zeros((batch_size, preds.shape[1], 3), dtype=np.float32)
88 | all_boxes = np.zeros((batch_size, 6), dtype=np.float32)
89 | all_preds[:, :, 0:2] = preds[:, :, 0:2]
90 | all_preds[:, :, 2:3] = maxvals
91 | all_boxes[:, 0:2] = c[:, 0:2]
92 | all_boxes[:, 2:4] = s[:, 0:2]
93 | all_boxes[:, 4] = np.prod(s * 200.0, axis=1)
94 | all_boxes[:, 5] = score
95 |
96 | result = {}
97 |
98 | result['preds'] = all_preds
99 | result['boxes'] = all_boxes
100 | result['image_paths'] = image_paths
101 | result['bbox_ids'] = bbox_ids
102 |
103 | return result
104 |
105 | @staticmethod
106 | def _get_deconv_cfg(deconv_kernel):
107 | """Get configurations for deconv layers."""
108 | if deconv_kernel == 4:
109 | padding = 1
110 | output_padding = 0
111 | elif deconv_kernel == 3:
112 | padding = 1
113 | output_padding = 1
114 | elif deconv_kernel == 2:
115 | padding = 0
116 | output_padding = 0
117 | else:
118 | raise ValueError(f'Not supported num_kernels ({deconv_kernel}).')
119 |
120 | return deconv_kernel, padding, output_padding
121 |
--------------------------------------------------------------------------------
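Each (kernel, padding, output_padding) triple returned by `_get_deconv_cfg` makes a stride-2 transposed convolution exactly double the spatial size, following out = (in - 1) * stride - 2 * padding + kernel + output_padding. A small check (assuming stride 2, as in the SimpleBaselines-style deconv stack):

```python
import torch
import torch.nn as nn

# Each supported kernel size should double a 16x12 feature map at stride 2.
for kernel, padding, output_padding in [(4, 1, 0), (3, 1, 1), (2, 0, 0)]:
    deconv = nn.ConvTranspose2d(1, 1, kernel_size=kernel, stride=2,
                                padding=padding, output_padding=output_padding)
    out = deconv(torch.zeros(1, 1, 16, 12))
    assert out.shape[-2:] == (32, 24)
```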
/models/head/topdown_heatmap_simple_head.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | import torch
3 | import torch.nn as nn
4 | from .. import constant_init, normal_init
5 |
6 | from .. import pose_pck_accuracy, flip_back, resize
7 | import torch.nn.functional as F
8 | from .topdown_heatmap_base_head import TopdownHeatmapBaseHead
9 |
10 |
11 | class TopdownHeatmapSimpleHead(TopdownHeatmapBaseHead):
12 | """Top-down heatmap simple head. paper ref: Bin Xiao et al. ``Simple
13 | Baselines for Human Pose Estimation and Tracking``.
14 |
15 | TopdownHeatmapSimpleHead consists of (>=0) deconv layers
16 | and a simple conv2d layer.
17 |
18 | Args:
19 | in_channels (int): Number of input channels
20 | out_channels (int): Number of output channels
21 | num_deconv_layers (int): Number of deconv layers.
22 | num_deconv_layers should >= 0. Note that 0 means
23 | no deconv layers.
24 | num_deconv_filters (list|tuple): Number of filters per deconv layer.
25 | If num_deconv_layers > 0, its length should equal num_deconv_layers.
26 | num_deconv_kernels (list|tuple): Kernel sizes of the deconv layers.
27 | in_index (int|Sequence[int]): Input feature index. Default: 0
28 | input_transform (str|None): Transformation type of input features.
29 | Options: 'resize_concat', 'multiple_select', None.
30 | Default: None.
31 |
32 | - 'resize_concat': Multiple feature maps will be resized to the
33 | same size as the first one and then concat together.
34 | Usually used in FCN head of HRNet.
35 | - 'multiple_select': Multiple feature maps will be bundled into
36 | a list and passed into decode head.
37 | - None: Only one select feature map is allowed.
38 | align_corners (bool): align_corners argument of F.interpolate.
39 | Default: False.
40 | loss_keypoint (dict): Config for keypoint loss. Default: None.
41 | """
42 |
43 | def __init__(self,
44 | in_channels,
45 | out_channels,
46 | num_deconv_layers=3,
47 | num_deconv_filters=(256, 256, 256),
48 | num_deconv_kernels=(4, 4, 4),
49 | extra=None,
50 | in_index=0,
51 | input_transform=None,
52 | align_corners=False,
53 | loss_keypoint=None,
54 | train_cfg=None,
55 | test_cfg=None,
56 | upsample=0,):
57 | super().__init__()
58 |
59 | self.in_channels = in_channels
60 | self.loss = loss_keypoint
61 | self.upsample = upsample
62 |
63 | self.train_cfg = {} if train_cfg is None else train_cfg
64 | self.test_cfg = {} if test_cfg is None else test_cfg
65 | self.target_type = self.test_cfg.get('target_type', 'GaussianHeatmap')
66 |
67 | self._init_inputs(in_channels, in_index, input_transform)
68 | self.in_index = in_index
69 | self.align_corners = align_corners
70 |
71 | if extra is not None and not isinstance(extra, dict):
72 | raise TypeError('extra should be dict or None.')
73 |
74 | if num_deconv_layers > 0:
75 | self.deconv_layers = self._make_deconv_layer(
76 | num_deconv_layers,
77 | num_deconv_filters,
78 | num_deconv_kernels,
79 | )
80 | elif num_deconv_layers == 0:
81 | self.deconv_layers = nn.Identity()
82 | else:
83 | raise ValueError(
84 |                 f'num_deconv_layers ({num_deconv_layers}) should be >= 0.')
85 |
86 | identity_final_layer = False
87 | if extra is not None and 'final_conv_kernel' in extra:
88 | assert extra['final_conv_kernel'] in [0, 1, 3]
89 | if extra['final_conv_kernel'] == 3:
90 | padding = 1
91 | elif extra['final_conv_kernel'] == 1:
92 | padding = 0
93 | else:
94 | # 0 for Identity mapping.
95 | identity_final_layer = True
96 | kernel_size = extra['final_conv_kernel']
97 | else:
98 | kernel_size = 1
99 | padding = 0
100 |
101 | if identity_final_layer:
102 | self.final_layer = nn.Identity()
103 | else:
104 | conv_channels = num_deconv_filters[
105 | -1] if num_deconv_layers > 0 else self.in_channels
106 |
107 | layers = []
108 | if extra is not None:
109 | num_conv_layers = extra.get('num_conv_layers', 0)
110 | num_conv_kernels = extra.get('num_conv_kernels',
111 | [1] * num_conv_layers)
112 |
113 | for i in range(num_conv_layers):
114 | layers.append(
115 | nn.Conv2d(in_channels=conv_channels,
116 | out_channels=conv_channels,
117 | kernel_size=num_conv_kernels[i],
118 | stride=1,
119 | padding=(num_conv_kernels[i] - 1) // 2)
120 | )
121 | layers.append(nn.BatchNorm2d(conv_channels))
122 | layers.append(nn.ReLU(inplace=True))
123 |
124 | layers.append(
125 | nn.Conv2d(in_channels=conv_channels,
126 | out_channels=out_channels,
127 | kernel_size=kernel_size,
128 | stride=1,
129 | padding=padding)
130 | )
131 |
132 | if len(layers) > 1:
133 | self.final_layer = nn.Sequential(*layers)
134 | else:
135 | self.final_layer = layers[0]
136 |
137 | def get_loss(self, output, target, target_weight):
138 | """Calculate top-down keypoint loss.
139 |
140 | Note:
141 | - batch_size: N
142 | - num_keypoints: K
143 | - heatmaps height: H
144 |             - heatmaps width: W
145 |
146 | Args:
147 | output (torch.Tensor[N,K,H,W]): Output heatmaps.
148 | target (torch.Tensor[N,K,H,W]): Target heatmaps.
149 | target_weight (torch.Tensor[N,K,1]):
150 | Weights across different joint types.
151 | """
152 |
153 | losses = dict()
154 |
155 | assert not isinstance(self.loss, nn.Sequential)
156 | assert target.dim() == 4 and target_weight.dim() == 3
157 | losses['heatmap_loss'] = self.loss(output, target, target_weight)
158 |
159 | return losses
160 |
161 | def get_accuracy(self, output, target, target_weight):
162 | """Calculate accuracy for top-down keypoint loss.
163 |
164 | Note:
165 | - batch_size: N
166 | - num_keypoints: K
167 | - heatmaps height: H
168 |             - heatmaps width: W
169 |
170 | Args:
171 | output (torch.Tensor[N,K,H,W]): Output heatmaps.
172 | target (torch.Tensor[N,K,H,W]): Target heatmaps.
173 | target_weight (torch.Tensor[N,K,1]):
174 | Weights across different joint types.
175 | """
176 |
177 | accuracy = dict()
178 |
179 | if self.target_type == 'GaussianHeatmap':
180 | _, avg_acc, _ = pose_pck_accuracy(
181 | output.detach().cpu().numpy(),
182 | target.detach().cpu().numpy(),
183 | target_weight.detach().cpu().numpy().squeeze(-1) > 0)
184 | accuracy['acc_pose'] = float(avg_acc)
185 |
186 | return accuracy
187 |
188 | def forward(self, x):
189 | """Forward function."""
190 | x = self._transform_inputs(x)
191 | x = self.deconv_layers(x)
192 | x = self.final_layer(x)
193 | return x
194 |
195 | def inference_model(self, x, flip_pairs=None):
196 | """Inference function.
197 |
198 | Returns:
199 | output_heatmap (np.ndarray): Output heatmaps.
200 |
201 | Args:
202 | x (torch.Tensor[N,K,H,W]): Input features.
203 | flip_pairs (None | list[tuple]):
204 | Pairs of keypoints which are mirrored.
205 | """
206 | output = self.forward(x)
207 |
208 | if flip_pairs is not None:
209 | output_heatmap = flip_back(
210 | output.detach().cpu().numpy(),
211 | flip_pairs,
212 | target_type=self.target_type)
213 | # feature is not aligned, shift flipped heatmap for higher accuracy
214 | if self.test_cfg.get('shift_heatmap', False):
215 | output_heatmap[:, :, :, 1:] = output_heatmap[:, :, :, :-1]
216 | else:
217 | output_heatmap = output.detach().cpu().numpy()
218 | return output_heatmap
219 |
220 | def _init_inputs(self, in_channels, in_index, input_transform):
221 | """Check and initialize input transforms.
222 |
223 | The in_channels, in_index and input_transform must match.
224 | Specifically, when input_transform is None, only single feature map
225 | will be selected. So in_channels and in_index must be of type int.
226 | When input_transform is not None, in_channels and in_index must be
227 | list or tuple, with the same length.
228 |
229 | Args:
230 | in_channels (int|Sequence[int]): Input channels.
231 | in_index (int|Sequence[int]): Input feature index.
232 | input_transform (str|None): Transformation type of input features.
233 | Options: 'resize_concat', 'multiple_select', None.
234 |
235 |             - 'resize_concat': Multiple feature maps will be resized to the
236 |                 same size as the first one and then concatenated together.
237 |                 Usually used in FCN head of HRNet.
238 |             - 'multiple_select': Multiple feature maps will be bundled into
239 | a list and passed into decode head.
240 | - None: Only one select feature map is allowed.
241 | """
242 |
243 | if input_transform is not None:
244 | assert input_transform in ['resize_concat', 'multiple_select']
245 | self.input_transform = input_transform
246 | self.in_index = in_index
247 | if input_transform is not None:
248 | assert isinstance(in_channels, (list, tuple))
249 | assert isinstance(in_index, (list, tuple))
250 | assert len(in_channels) == len(in_index)
251 | if input_transform == 'resize_concat':
252 | self.in_channels = sum(in_channels)
253 | else:
254 | self.in_channels = in_channels
255 | else:
256 | assert isinstance(in_channels, int)
257 | assert isinstance(in_index, int)
258 | self.in_channels = in_channels
259 |
260 | def _transform_inputs(self, inputs):
261 | """Transform inputs for decoder.
262 |
263 | Args:
264 | inputs (list[Tensor] | Tensor): multi-level img features.
265 |
266 | Returns:
267 | Tensor: The transformed inputs
268 | """
269 | if not isinstance(inputs, list):
270 |             # single tensor input: optionally upsample before returning
271 |             if self.upsample > 0:
272 |                 inputs = resize(
273 |                     input=F.relu(inputs),
274 |                     scale_factor=self.upsample,
275 |                     mode='bilinear',
276 |                     align_corners=self.align_corners
277 |                 )
278 | return inputs
279 |
280 | if self.input_transform == 'resize_concat':
281 | inputs = [inputs[i] for i in self.in_index]
282 | upsampled_inputs = [
283 | resize(
284 | input=x,
285 | size=inputs[0].shape[2:],
286 | mode='bilinear',
287 | align_corners=self.align_corners) for x in inputs
288 | ]
289 | inputs = torch.cat(upsampled_inputs, dim=1)
290 | elif self.input_transform == 'multiple_select':
291 | inputs = [inputs[i] for i in self.in_index]
292 | else:
293 | inputs = inputs[self.in_index]
294 |
295 | return inputs
296 |
297 | def _make_deconv_layer(self, num_layers, num_filters, num_kernels):
298 | """Make deconv layers."""
299 | if num_layers != len(num_filters):
300 | error_msg = f'num_layers({num_layers}) ' \
301 | f'!= length of num_filters({len(num_filters)})'
302 | raise ValueError(error_msg)
303 | if num_layers != len(num_kernels):
304 | error_msg = f'num_layers({num_layers}) ' \
305 | f'!= length of num_kernels({len(num_kernels)})'
306 | raise ValueError(error_msg)
307 |
308 | layers = []
309 | for i in range(num_layers):
310 | kernel, padding, output_padding = \
311 | self._get_deconv_cfg(num_kernels[i])
312 |
313 | planes = num_filters[i]
314 | layers.append(
315 | nn.ConvTranspose2d(in_channels=self.in_channels,
316 | out_channels=planes,
317 | kernel_size=kernel,
318 | stride=2,
319 | padding=padding,
320 | output_padding=output_padding,
321 | bias=False)
322 | )
323 | layers.append(nn.BatchNorm2d(planes))
324 | layers.append(nn.ReLU(inplace=True))
325 | self.in_channels = planes
326 |
327 | return nn.Sequential(*layers)
328 |
329 | def init_weights(self):
330 | """Initialize model weights."""
331 | for _, m in self.deconv_layers.named_modules():
332 | if isinstance(m, nn.ConvTranspose2d):
333 | normal_init(m, std=0.001)
334 | elif isinstance(m, nn.BatchNorm2d):
335 | constant_init(m, 1)
336 | for m in self.final_layer.modules():
337 | if isinstance(m, nn.Conv2d):
338 | normal_init(m, std=0.001, bias=0)
339 | elif isinstance(m, nn.BatchNorm2d):
340 | constant_init(m, 1)
341 |
--------------------------------------------------------------------------------
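A minimal, hedged sketch of running `TopdownHeatmapSimpleHead` on a dummy feature map. The channel counts, deconv settings, and spatial sizes below are illustrative only (they are not the repository's config values), and the import assumes the package layout shown in this repo resolves:

import torch
from models.head.topdown_heatmap_simple_head import TopdownHeatmapSimpleHead

head = TopdownHeatmapSimpleHead(in_channels=32,
                                out_channels=17,            # e.g. COCO keypoints
                                num_deconv_layers=2,
                                num_deconv_filters=(16, 16),
                                num_deconv_kernels=(4, 4))
feats = torch.randn(1, 32, 16, 12)    # N, C, H, W feature map
heatmaps = head(feats)
print(heatmaps.shape)                 # torch.Size([1, 17, 64, 48]); each deconv doubles H and W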
/models/losses/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .classfication_loss import BCELoss
3 | from .heatmap_loss import AdaptiveWingLoss
4 | from .mesh_loss import GANLoss, MeshLoss
5 | from .mse_loss import JointsMSELoss, JointsOHKMMSELoss
6 | from .multi_loss_factory import AELoss, HeatmapLoss, MultiLossFactory
7 | from .regression_loss import (BoneLoss, L1Loss, MPJPELoss, MSELoss,
8 | SemiSupervisionLoss, SmoothL1Loss, SoftWingLoss,
9 | WingLoss)
10 |
11 | __all__ = [
12 | 'JointsMSELoss', 'JointsOHKMMSELoss', 'HeatmapLoss', 'AELoss',
13 | 'MultiLossFactory', 'MeshLoss', 'GANLoss', 'SmoothL1Loss', 'WingLoss',
14 | 'MPJPELoss', 'MSELoss', 'L1Loss', 'BCELoss', 'BoneLoss',
15 | 'SemiSupervisionLoss', 'SoftWingLoss', 'AdaptiveWingLoss'
16 | ]
17 |
--------------------------------------------------------------------------------
/models/losses/classfication_loss.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 |
6 | __all__ = ['BCELoss']
7 |
8 |
9 | class BCELoss(nn.Module):
10 | """Binary Cross Entropy loss."""
11 |
12 | def __init__(self, use_target_weight=False, loss_weight=1.):
13 | super().__init__()
14 | self.criterion = F.binary_cross_entropy
15 | self.use_target_weight = use_target_weight
16 | self.loss_weight = loss_weight
17 |
18 | def forward(self, output, target, target_weight=None):
19 | """Forward function.
20 |
21 | Note:
22 | - batch_size: N
23 | - num_labels: K
24 |
25 | Args:
26 | output (torch.Tensor[N, K]): Output classification.
27 | target (torch.Tensor[N, K]): Target classification.
28 | target_weight (torch.Tensor[N, K] or torch.Tensor[N]):
29 | Weights across different labels.
30 | """
31 |
32 | if self.use_target_weight:
33 | assert target_weight is not None
34 | loss = self.criterion(output, target, reduction='none')
35 | if target_weight.dim() == 1:
36 | target_weight = target_weight[:, None]
37 | loss = (loss * target_weight).mean()
38 | else:
39 | loss = self.criterion(output, target)
40 |
41 | return loss * self.loss_weight
42 |
--------------------------------------------------------------------------------
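A minimal usage sketch of `BCELoss` with per-label target weights; shapes follow the docstring above and the values are illustrative. `F.binary_cross_entropy` expects probabilities, so the predictions are passed through a sigmoid first:

import torch
from models.losses.classfication_loss import BCELoss

criterion = BCELoss(use_target_weight=True)
output = torch.sigmoid(torch.randn(4, 5))       # N=4 samples, K=5 labels, values in (0, 1)
target = torch.randint(0, 2, (4, 5)).float()
target_weight = torch.ones(4, 5)
print(criterion(output, target, target_weight))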
/models/losses/heatmap_loss.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | import torch
3 | import torch.nn as nn
4 |
5 |
6 | class AdaptiveWingLoss(nn.Module):
7 | """Adaptive wing loss. paper ref: 'Adaptive Wing Loss for Robust Face
8 | Alignment via Heatmap Regression' Wang et al. ICCV'2019.
9 |
10 | Args:
11 | alpha (float), omega (float), epsilon (float), theta (float)
12 | are hyper-parameters.
13 | use_target_weight (bool): Option to use weighted MSE loss.
14 | Different joint types may have different target weights.
15 | loss_weight (float): Weight of the loss. Default: 1.0.
16 | """
17 |
18 | def __init__(self,
19 | alpha=2.1,
20 | omega=14,
21 | epsilon=1,
22 | theta=0.5,
23 | use_target_weight=False,
24 | loss_weight=1.):
25 | super().__init__()
26 | self.alpha = float(alpha)
27 | self.omega = float(omega)
28 | self.epsilon = float(epsilon)
29 | self.theta = float(theta)
30 | self.use_target_weight = use_target_weight
31 | self.loss_weight = loss_weight
32 |
33 | def criterion(self, pred, target):
34 | """Criterion of wingloss.
35 |
36 | Note:
37 | batch_size: N
38 | num_keypoints: K
39 |
40 | Args:
41 | pred (torch.Tensor[NxKxHxW]): Predicted heatmaps.
42 | target (torch.Tensor[NxKxHxW]): Target heatmaps.
43 | """
44 | H, W = pred.shape[2:4]
45 | delta = (target - pred).abs()
46 |
47 | A = self.omega * (
48 | 1 / (1 + torch.pow(self.theta / self.epsilon, self.alpha - target))
49 | ) * (self.alpha - target) * (torch.pow(
50 | self.theta / self.epsilon,
51 | self.alpha - target - 1)) * (1 / self.epsilon)
52 | C = self.theta * A - self.omega * torch.log(
53 | 1 + torch.pow(self.theta / self.epsilon, self.alpha - target))
54 |
55 | losses = torch.where(
56 | delta < self.theta,
57 | self.omega *
58 | torch.log(1 +
59 | torch.pow(delta / self.epsilon, self.alpha - target)),
60 | A * delta - C)
61 |
62 | return torch.mean(losses)
63 |
64 | def forward(self, output, target, target_weight):
65 | """Forward function.
66 |
67 | Note:
68 | batch_size: N
69 | num_keypoints: K
70 |
71 | Args:
72 | output (torch.Tensor[NxKxHxW]): Output heatmaps.
73 | target (torch.Tensor[NxKxHxW]): Target heatmaps.
74 | target_weight (torch.Tensor[NxKx1]):
75 | Weights across different joint types.
76 | """
77 | if self.use_target_weight:
78 | loss = self.criterion(output * target_weight.unsqueeze(-1),
79 | target * target_weight.unsqueeze(-1))
80 | else:
81 | loss = self.criterion(output, target)
82 |
83 | return loss * self.loss_weight
84 |
--------------------------------------------------------------------------------
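A minimal sketch of calling `AdaptiveWingLoss` on dummy heatmaps; shapes follow the NxKxHxW convention in the docstrings and are illustrative:

import torch
from models.losses.heatmap_loss import AdaptiveWingLoss

criterion = AdaptiveWingLoss(use_target_weight=True)
pred = torch.rand(2, 17, 64, 48)       # N, K, H, W
gt = torch.rand(2, 17, 64, 48)
target_weight = torch.ones(2, 17, 1)   # broadcast over H and W inside forward()
print(criterion(pred, gt, target_weight))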
/models/losses/mse_loss.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | import torch
3 | import torch.nn as nn
4 |
5 |
6 | __all__ = ['JointsMSELoss', 'JointsOHKMMSELoss',]
7 |
8 |
9 | class JointsMSELoss(nn.Module):
10 | """MSE loss for heatmaps.
11 |
12 | Args:
13 | use_target_weight (bool): Option to use weighted MSE loss.
14 | Different joint types may have different target weights.
15 | loss_weight (float): Weight of the loss. Default: 1.0.
16 | """
17 |
18 | def __init__(self, use_target_weight=False, loss_weight=1.):
19 | super().__init__()
20 | self.criterion = nn.MSELoss()
21 | self.use_target_weight = use_target_weight
22 | self.loss_weight = loss_weight
23 |
24 | def forward(self, output, target, target_weight):
25 | """Forward function."""
26 | batch_size = output.size(0)
27 | num_joints = output.size(1)
28 |
29 | heatmaps_pred = output.reshape(
30 | (batch_size, num_joints, -1)).split(1, 1)
31 | heatmaps_gt = target.reshape((batch_size, num_joints, -1)).split(1, 1)
32 |
33 | loss = 0.
34 |
35 | for idx in range(num_joints):
36 | heatmap_pred = heatmaps_pred[idx].squeeze(1)
37 | heatmap_gt = heatmaps_gt[idx].squeeze(1)
38 | if self.use_target_weight:
39 | loss += self.criterion(heatmap_pred * target_weight[:, idx],
40 | heatmap_gt * target_weight[:, idx])
41 | else:
42 | loss += self.criterion(heatmap_pred, heatmap_gt)
43 |
44 | return loss / num_joints * self.loss_weight
45 |
46 |
47 | class CombinedTargetMSELoss(nn.Module):
48 | """MSE loss for combined target.
49 | CombinedTarget: The combination of classification target
50 | (response map) and regression target (offset map).
51 | Paper ref: Huang et al. The Devil is in the Details: Delving into
52 | Unbiased Data Processing for Human Pose Estimation (CVPR 2020).
53 |
54 | Args:
55 | use_target_weight (bool): Option to use weighted MSE loss.
56 | Different joint types may have different target weights.
57 | loss_weight (float): Weight of the loss. Default: 1.0.
58 | """
59 |
60 | def __init__(self, use_target_weight, loss_weight=1.):
61 | super().__init__()
62 | self.criterion = nn.MSELoss(reduction='mean')
63 | self.use_target_weight = use_target_weight
64 | self.loss_weight = loss_weight
65 |
66 | def forward(self, output, target, target_weight):
67 | batch_size = output.size(0)
68 | num_channels = output.size(1)
69 | heatmaps_pred = output.reshape(
70 | (batch_size, num_channels, -1)).split(1, 1)
71 | heatmaps_gt = target.reshape(
72 | (batch_size, num_channels, -1)).split(1, 1)
73 | loss = 0.
74 | num_joints = num_channels // 3
75 | for idx in range(num_joints):
76 | heatmap_pred = heatmaps_pred[idx * 3].squeeze()
77 | heatmap_gt = heatmaps_gt[idx * 3].squeeze()
78 | offset_x_pred = heatmaps_pred[idx * 3 + 1].squeeze()
79 | offset_x_gt = heatmaps_gt[idx * 3 + 1].squeeze()
80 | offset_y_pred = heatmaps_pred[idx * 3 + 2].squeeze()
81 | offset_y_gt = heatmaps_gt[idx * 3 + 2].squeeze()
82 | if self.use_target_weight:
83 | heatmap_pred = heatmap_pred * target_weight[:, idx]
84 | heatmap_gt = heatmap_gt * target_weight[:, idx]
85 | # classification loss
86 | loss += 0.5 * self.criterion(heatmap_pred, heatmap_gt)
87 | # regression loss
88 | loss += 0.5 * self.criterion(heatmap_gt * offset_x_pred,
89 | heatmap_gt * offset_x_gt)
90 | loss += 0.5 * self.criterion(heatmap_gt * offset_y_pred,
91 | heatmap_gt * offset_y_gt)
92 | return loss / num_joints * self.loss_weight
93 |
94 |
95 | class JointsOHKMMSELoss(nn.Module):
96 | """MSE loss with online hard keypoint mining.
97 |
98 | Args:
99 | use_target_weight (bool): Option to use weighted MSE loss.
100 | Different joint types may have different target weights.
101 | topk (int): Only top k joint losses are kept.
102 | loss_weight (float): Weight of the loss. Default: 1.0.
103 | """
104 |
105 | def __init__(self, use_target_weight=False, topk=8, loss_weight=1.):
106 | super().__init__()
107 | assert topk > 0
108 | self.criterion = nn.MSELoss(reduction='none')
109 | self.use_target_weight = use_target_weight
110 | self.topk = topk
111 | self.loss_weight = loss_weight
112 |
113 | def _ohkm(self, loss):
114 | """Online hard keypoint mining."""
115 | ohkm_loss = 0.
116 | N = len(loss)
117 | for i in range(N):
118 | sub_loss = loss[i]
119 | _, topk_idx = torch.topk(
120 | sub_loss, k=self.topk, dim=0, sorted=False)
121 | tmp_loss = torch.gather(sub_loss, 0, topk_idx)
122 | ohkm_loss += torch.sum(tmp_loss) / self.topk
123 | ohkm_loss /= N
124 | return ohkm_loss
125 |
126 | def forward(self, output, target, target_weight):
127 | """Forward function."""
128 | batch_size = output.size(0)
129 | num_joints = output.size(1)
130 | if num_joints < self.topk:
131 |             raise ValueError(f'topk ({self.topk}) should not be '
132 |                              f'larger than num_joints ({num_joints}).')
133 | heatmaps_pred = output.reshape(
134 | (batch_size, num_joints, -1)).split(1, 1)
135 | heatmaps_gt = target.reshape((batch_size, num_joints, -1)).split(1, 1)
136 |
137 | losses = []
138 | for idx in range(num_joints):
139 | heatmap_pred = heatmaps_pred[idx].squeeze(1)
140 | heatmap_gt = heatmaps_gt[idx].squeeze(1)
141 | if self.use_target_weight:
142 | losses.append(
143 | self.criterion(heatmap_pred * target_weight[:, idx],
144 | heatmap_gt * target_weight[:, idx]))
145 | else:
146 | losses.append(self.criterion(heatmap_pred, heatmap_gt))
147 |
148 | losses = [loss.mean(dim=1).unsqueeze(dim=1) for loss in losses]
149 | losses = torch.cat(losses, dim=1)
150 |
151 | return self._ohkm(losses) * self.loss_weight
152 |
--------------------------------------------------------------------------------
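A minimal sketch of `JointsMSELoss`, the per-joint heatmap MSE used for top-down training; shapes are illustrative:

import torch
from models.losses.mse_loss import JointsMSELoss

criterion = JointsMSELoss(use_target_weight=True)
pred = torch.rand(2, 17, 64, 48)       # N, K, H, W predicted heatmaps
gt = torch.rand(2, 17, 64, 48)
target_weight = torch.ones(2, 17, 1)   # per-joint visibility weights
print(criterion(pred, gt, target_weight))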
/models/losses/multi_loss_factory.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------------
2 | # Adapted from https://github.com/HRNet/HigherHRNet-Human-Pose-Estimation
3 | # Original licence: Copyright (c) Microsoft, under the MIT License.
4 | # ------------------------------------------------------------------------------
5 |
6 | import torch
7 | import torch.nn as nn
8 |
9 |
10 | __all__ = ['HeatmapLoss', 'AELoss', 'MultiLossFactory']
11 |
12 |
13 | def _make_input(t, requires_grad=False, device=torch.device('cpu')):
14 | """Make zero inputs for AE loss.
15 |
16 | Args:
17 | t (torch.Tensor): input
18 | requires_grad (bool): Option to use requires_grad.
19 | device: torch device
20 |
21 | Returns:
22 | torch.Tensor: zero input.
23 | """
24 | inp = torch.autograd.Variable(t, requires_grad=requires_grad)
25 | inp = inp.sum()
26 | inp = inp.to(device)
27 | return inp
28 |
29 |
30 | class HeatmapLoss(nn.Module):
31 | """Accumulate the heatmap loss for each image in the batch.
32 |
33 | Args:
34 | supervise_empty (bool): Whether to supervise empty channels.
35 | """
36 |
37 | def __init__(self, supervise_empty=True):
38 | super().__init__()
39 | self.supervise_empty = supervise_empty
40 |
41 | def forward(self, pred, gt, mask):
42 | """Forward function.
43 |
44 | Note:
45 | - batch_size: N
46 |             - heatmaps width: W
47 | - heatmaps height: H
48 | - max_num_people: M
49 | - num_keypoints: K
50 |
51 | Args:
52 |             pred (torch.Tensor[N,K,H,W]): heatmap of output.
53 | gt (torch.Tensor[N,K,H,W]): target heatmap.
54 | mask (torch.Tensor[N,H,W]): mask of target.
55 | """
56 | assert pred.size() == gt.size(
57 | ), f'pred.size() is {pred.size()}, gt.size() is {gt.size()}'
58 |
59 | if not self.supervise_empty:
60 | empty_mask = (gt.sum(dim=[2, 3], keepdim=True) > 0).float()
61 | loss = ((pred - gt)**2) * empty_mask.expand_as(
62 | pred) * mask[:, None, :, :].expand_as(pred)
63 | else:
64 | loss = ((pred - gt)**2) * mask[:, None, :, :].expand_as(pred)
65 | loss = loss.mean(dim=3).mean(dim=2).mean(dim=1)
66 | return loss
67 |
68 |
69 | class AELoss(nn.Module):
70 | """Associative Embedding loss.
71 |
72 | `Associative Embedding: End-to-End Learning for Joint Detection and
73 |     Grouping <https://arxiv.org/abs/1611.05424>`_.
74 | """
75 |
76 | def __init__(self, loss_type):
77 | super().__init__()
78 | self.loss_type = loss_type
79 |
80 | def singleTagLoss(self, pred_tag, joints):
81 | """Associative embedding loss for one image.
82 |
83 | Note:
84 |         - heatmaps width: W
85 | - heatmaps height: H
86 | - max_num_people: M
87 | - num_keypoints: K
88 |
89 | Args:
90 | pred_tag (torch.Tensor[KxHxW,1]): tag of output for one image.
91 | joints (torch.Tensor[M,K,2]): joints information for one image.
92 | """
93 | tags = []
94 | pull = 0
95 | for joints_per_person in joints:
96 | tmp = []
97 | for joint in joints_per_person:
98 | if joint[1] > 0:
99 | tmp.append(pred_tag[joint[0]])
100 | if len(tmp) == 0:
101 | continue
102 | tmp = torch.stack(tmp)
103 | tags.append(torch.mean(tmp, dim=0))
104 | pull = pull + torch.mean((tmp - tags[-1].expand_as(tmp))**2)
105 |
106 | num_tags = len(tags)
107 | if num_tags == 0:
108 | return (
109 | _make_input(torch.zeros(1).float(), device=pred_tag.device),
110 | _make_input(torch.zeros(1).float(), device=pred_tag.device))
111 | elif num_tags == 1:
112 | return (_make_input(
113 | torch.zeros(1).float(), device=pred_tag.device), pull)
114 |
115 | tags = torch.stack(tags)
116 |
117 | size = (num_tags, num_tags)
118 | A = tags.expand(*size)
119 | B = A.permute(1, 0)
120 |
121 | diff = A - B
122 |
123 | if self.loss_type == 'exp':
124 | diff = torch.pow(diff, 2)
125 | push = torch.exp(-diff)
126 | push = torch.sum(push) - num_tags
127 | elif self.loss_type == 'max':
128 | diff = 1 - torch.abs(diff)
129 | push = torch.clamp(diff, min=0).sum() - num_tags
130 | else:
131 | raise ValueError('Unknown ae loss type')
132 |
133 | push_loss = push / ((num_tags - 1) * num_tags) * 0.5
134 | pull_loss = pull / (num_tags)
135 |
136 | return push_loss, pull_loss
137 |
138 | def forward(self, tags, joints):
139 | """Accumulate the tag loss for each image in the batch.
140 |
141 | Note:
142 | - batch_size: N
143 |             - heatmaps width: W
144 | - heatmaps height: H
145 | - max_num_people: M
146 | - num_keypoints: K
147 |
148 | Args:
149 | tags (torch.Tensor[N,KxHxW,1]): tag channels of output.
150 | joints (torch.Tensor[N,M,K,2]): joints information.
151 | """
152 | pushes, pulls = [], []
153 | joints = joints.cpu().data.numpy()
154 | batch_size = tags.size(0)
155 | for i in range(batch_size):
156 | push, pull = self.singleTagLoss(tags[i], joints[i])
157 | pushes.append(push)
158 | pulls.append(pull)
159 | return torch.stack(pushes), torch.stack(pulls)
160 |
161 |
162 | class MultiLossFactory(nn.Module):
163 | """Loss for bottom-up models.
164 |
165 | Args:
166 | num_joints (int): Number of keypoints.
167 | num_stages (int): Number of stages.
168 | ae_loss_type (str): Type of ae loss.
169 | with_ae_loss (list[bool]): Use ae loss or not in multi-heatmap.
170 | push_loss_factor (list[float]):
171 | Parameter of push loss in multi-heatmap.
172 | pull_loss_factor (list[float]):
173 | Parameter of pull loss in multi-heatmap.
174 |         with_heatmaps_loss (list[bool]):
175 | Use heatmap loss or not in multi-heatmap.
176 | heatmaps_loss_factor (list[float]):
177 | Parameter of heatmap loss in multi-heatmap.
178 | supervise_empty (bool): Whether to supervise empty channels.
179 | """
180 |
181 | def __init__(self,
182 | num_joints,
183 | num_stages,
184 | ae_loss_type,
185 | with_ae_loss,
186 | push_loss_factor,
187 | pull_loss_factor,
188 | with_heatmaps_loss,
189 | heatmaps_loss_factor,
190 | supervise_empty=True):
191 | super().__init__()
192 |
193 | assert isinstance(with_heatmaps_loss, (list, tuple)), \
194 | 'with_heatmaps_loss should be a list or tuple'
195 | assert isinstance(heatmaps_loss_factor, (list, tuple)), \
196 | 'heatmaps_loss_factor should be a list or tuple'
197 | assert isinstance(with_ae_loss, (list, tuple)), \
198 | 'with_ae_loss should be a list or tuple'
199 | assert isinstance(push_loss_factor, (list, tuple)), \
200 | 'push_loss_factor should be a list or tuple'
201 | assert isinstance(pull_loss_factor, (list, tuple)), \
202 | 'pull_loss_factor should be a list or tuple'
203 |
204 | self.num_joints = num_joints
205 | self.num_stages = num_stages
206 | self.ae_loss_type = ae_loss_type
207 | self.with_ae_loss = with_ae_loss
208 | self.push_loss_factor = push_loss_factor
209 | self.pull_loss_factor = pull_loss_factor
210 | self.with_heatmaps_loss = with_heatmaps_loss
211 | self.heatmaps_loss_factor = heatmaps_loss_factor
212 |
213 | self.heatmaps_loss = \
214 | nn.ModuleList(
215 | [
216 | HeatmapLoss(supervise_empty)
217 | if with_heatmaps_loss else None
218 | for with_heatmaps_loss in self.with_heatmaps_loss
219 | ]
220 | )
221 |
222 | self.ae_loss = \
223 | nn.ModuleList(
224 | [
225 | AELoss(self.ae_loss_type) if with_ae_loss else None
226 | for with_ae_loss in self.with_ae_loss
227 | ]
228 | )
229 |
230 | def forward(self, outputs, heatmaps, masks, joints):
231 | """Forward function to calculate losses.
232 |
233 | Note:
234 | - batch_size: N
235 |             - heatmaps width: W
236 | - heatmaps height: H
237 | - max_num_people: M
238 | - num_keypoints: K
239 |             - output_channel: C (C = 2K if AE loss is used, else K)
240 |
241 | Args:
242 | outputs (list(torch.Tensor[N,C,H,W])): outputs of stages.
243 | heatmaps (list(torch.Tensor[N,K,H,W])): target of heatmaps.
244 | masks (list(torch.Tensor[N,H,W])): masks of heatmaps.
245 | joints (list(torch.Tensor[N,M,K,2])): joints of ae loss.
246 | """
247 | heatmaps_losses = []
248 | push_losses = []
249 | pull_losses = []
250 | for idx in range(len(outputs)):
251 | offset_feat = 0
252 | if self.heatmaps_loss[idx]:
253 | heatmaps_pred = outputs[idx][:, :self.num_joints]
254 | offset_feat = self.num_joints
255 | heatmaps_loss = self.heatmaps_loss[idx](heatmaps_pred,
256 | heatmaps[idx],
257 | masks[idx])
258 | heatmaps_loss = heatmaps_loss * self.heatmaps_loss_factor[idx]
259 | heatmaps_losses.append(heatmaps_loss)
260 | else:
261 | heatmaps_losses.append(None)
262 |
263 | if self.ae_loss[idx]:
264 | tags_pred = outputs[idx][:, offset_feat:]
265 | batch_size = tags_pred.size()[0]
266 | tags_pred = tags_pred.contiguous().view(batch_size, -1, 1)
267 |
268 | push_loss, pull_loss = self.ae_loss[idx](tags_pred,
269 | joints[idx])
270 | push_loss = push_loss * self.push_loss_factor[idx]
271 | pull_loss = pull_loss * self.pull_loss_factor[idx]
272 |
273 | push_losses.append(push_loss)
274 | pull_losses.append(pull_loss)
275 | else:
276 | push_losses.append(None)
277 | pull_losses.append(None)
278 |
279 | return heatmaps_losses, push_losses, pull_losses
280 |
--------------------------------------------------------------------------------
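A minimal, hedged sketch of `MultiLossFactory` for a single-stage bottom-up output that uses both the heatmap and AE losses. All shapes, loss factors, and the empty `joints` tensor are illustrative; with AE loss enabled the output channel count is 2K (K heatmaps followed by K tag maps):

import torch
from models.losses.multi_loss_factory import MultiLossFactory

K = 17
loss_factory = MultiLossFactory(num_joints=K,
                                num_stages=1,
                                ae_loss_type='exp',
                                with_ae_loss=[True],
                                push_loss_factor=[0.001],
                                pull_loss_factor=[0.001],
                                with_heatmaps_loss=[True],
                                heatmaps_loss_factor=[1.0])
outputs = [torch.rand(2, 2 * K, 64, 64)]     # N, 2K, H, W
heatmaps = [torch.rand(2, K, 64, 64)]
masks = [torch.ones(2, 64, 64)]
joints = [torch.zeros(2, 30, K, 2).long()]   # M=30 people, none visible in this toy example
hm_losses, push_losses, pull_losses = loss_factory(outputs, heatmaps, masks, joints)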
/models/model.py:
--------------------------------------------------------------------------------
1 | import torch.nn as nn
2 |
3 | from .backbone.vit import ViT
4 | from .head.topdown_heatmap_simple_head import TopdownHeatmapSimpleHead
5 |
6 |
7 | __all__ = ['ViTPose']
8 |
9 |
10 | class ViTPose(nn.Module):
11 | def __init__(self, cfg: dict) -> None:
12 | super(ViTPose, self).__init__()
13 |
14 | backbone_cfg = {k: v for k, v in cfg['backbone'].items() if k != 'type'}
15 | head_cfg = {k: v for k, v in cfg['keypoint_head'].items() if k != 'type'}
16 |
17 | self.backbone = ViT(**backbone_cfg)
18 | self.keypoint_head = TopdownHeatmapSimpleHead(**head_cfg)
19 |
20 | def forward_features(self, x):
21 | return self.backbone(x)
22 |
23 | def forward(self, x):
24 | return self.keypoint_head(self.backbone(x))
--------------------------------------------------------------------------------
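`ViTPose.__init__` strips the registry-style 'type' key from each sub-config before forwarding the remaining items as keyword arguments to the backbone and head. A tiny sketch of that pattern (the dict values are placeholders, not the repository's actual config):

cfg = {'backbone': {'type': 'ViT', 'embed_dim': 768},
       'keypoint_head': {'type': 'TopdownHeatmapSimpleHead', 'in_channels': 768}}
backbone_cfg = {k: v for k, v in cfg['backbone'].items() if k != 'type'}
print(backbone_cfg)   # {'embed_dim': 768} -> passed as ViT(**backbone_cfg)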
/models/optimizer.py:
--------------------------------------------------------------------------------
1 | import torch.optim as optim
2 |
3 | class LayerDecayOptimizer:
4 | def __init__(self, optimizer, layerwise_decay_rate):
5 | self.optimizer = optimizer
6 | self.layerwise_decay_rate = layerwise_decay_rate
7 | self.param_groups = optimizer.param_groups
8 |
9 | def step(self, *args, **kwargs):
10 | for i, group in enumerate(self.optimizer.param_groups):
11 | group['lr'] *= self.layerwise_decay_rate[i]
12 | self.optimizer.step(*args, **kwargs)
13 |
14 | def zero_grad(self, *args, **kwargs):
15 | self.optimizer.zero_grad(*args, **kwargs)
--------------------------------------------------------------------------------
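A minimal sketch of wrapping a standard optimizer with `LayerDecayOptimizer`. Note that the wrapper multiplies each parameter group's learning rate by its decay factor on every `step()` call; the two-group setup and the decay rates below are illustrative:

import torch
import torch.optim as optim
from models.optimizer import LayerDecayOptimizer

params = [torch.nn.Parameter(torch.randn(4, 4)) for _ in range(2)]
base = optim.AdamW([{'params': [p]} for p in params], lr=5e-4)
optimizer = LayerDecayOptimizer(base, layerwise_decay_rate=[0.75, 1.0])

loss = sum((p ** 2).sum() for p in params)
loss.backward()
optimizer.step()        # lr of group 0 becomes 5e-4 * 0.75, group 1 stays at 5e-4
optimizer.zero_grad()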
/requirements.txt:
--------------------------------------------------------------------------------
1 | ffmpeg==1.4
2 | matplotlib==3.6.2
3 | munkres==1.1.4
4 | numpy==1.23.5
5 | opencv_python==4.6.0.66
6 | Pillow==9.3.0
7 | torch==1.9.0+cu111
8 | torchvision==0.10.0+cu111
9 |
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | import argparse
3 | import copy
4 | import os
5 | import os.path as osp
6 | import time
7 | import warnings
8 | import click
9 | import yaml
10 |
11 | from glob import glob
12 |
13 | import torch
14 | import torch.distributed as dist
15 |
16 | from utils.util import init_random_seed, set_random_seed
17 | from utils.dist_util import get_dist_info, init_dist
18 | from utils.logging import get_root_logger
19 |
20 | import configs.ViTPose_base_coco_256x192 as b_cfg
21 | import configs.ViTPose_large_coco_256x192 as l_cfg
22 | import configs.ViTPose_huge_coco_256x192 as h_cfg
23 |
24 | from models.model import ViTPose
25 | from datasets.COCO import COCODataset
26 | from utils.train_valid_fn import train_model
27 |
28 | CUR_PATH = osp.dirname(__file__)
29 |
30 | @click.command()
31 | @click.option('--config-path', type=click.Path(exists=True), default='config.yaml', required=True, help='train config file path')
32 | @click.option('--model-name', type=str, default='b', required=True, help='[b: ViT-B, l: ViT-L, h: ViT-H]')
33 | def main(config_path, model_name):
34 |
35 | cfg = {'b':b_cfg,
36 | 'l':l_cfg,
37 | 'h':h_cfg}.get(model_name.lower())
38 | # Load config.yaml
39 | with open(config_path, 'r') as f:
40 | cfg_yaml = yaml.load(f, Loader=yaml.SafeLoader)
41 |
42 | for k, v in cfg_yaml.items():
43 | if hasattr(cfg, k):
44 |             raise ValueError(f"Key '{k}' already exists in config")
45 | else:
46 | cfg.__setattr__(k, v)
47 |
48 | # set cudnn_benchmark
49 | if cfg.cudnn_benchmark:
50 | torch.backends.cudnn.benchmark = True
51 |
52 | # Set work directory (session-level)
53 | if not hasattr(cfg, 'work_dir'):
54 | cfg.__setattr__('work_dir', f"{CUR_PATH}/runs/train")
55 |
56 | if not osp.exists(cfg.work_dir):
57 | os.makedirs(cfg.work_dir)
58 | session_list = sorted(glob(f"{cfg.work_dir}/*"))
59 | if len(session_list) == 0:
60 | session = 1
61 | else:
62 | session = int(os.path.basename(session_list[-1])) + 1
63 | session_dir = osp.join(cfg.work_dir, str(session).zfill(3))
64 | os.makedirs(session_dir)
65 | cfg.__setattr__('work_dir', session_dir)
66 |
67 |
68 | if cfg.autoscale_lr:
69 | # apply the linear scaling rule (https://arxiv.org/abs/1706.02677)
70 | cfg.optimizer['lr'] = cfg.optimizer['lr'] * len(cfg.gpu_ids) / 8
71 |
72 | # init distributed env first, since logger depends on the dist info.
73 | if cfg.launcher == 'none':
74 | distributed = False
75 | if len(cfg.gpu_ids) > 1:
76 | warnings.warn(
77 |                 f"We treat {cfg.gpu_ids} as gpu-ids, and reset to "
78 |                 f"{cfg.gpu_ids[0:1]} as gpu-ids to avoid potential errors in "
79 |                 "non-distributed training.")
80 | cfg.gpu_ids = cfg.gpu_ids[0:1]
81 | else:
82 | distributed = True
83 | init_dist(cfg.launcher, **cfg.dist_params)
84 | # re-set gpu_ids with distributed training mode
85 | _, world_size = get_dist_info()
86 | cfg.gpu_ids = range(world_size)
87 |
88 | # init the logger before other steps
89 | timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
90 | log_file = osp.join(session_dir, f'{timestamp}.log')
91 | logger = get_root_logger(log_file=log_file)
92 |
93 | # init the meta dict to record some important information such as
94 | # environment info and seed, which will be logged
95 | meta = dict()
96 |
97 | # log some basic info
98 | logger.info(f'Distributed training: {distributed}')
99 |
100 | # set random seeds
101 | seed = init_random_seed(cfg.seed)
102 | logger.info(f"Set random seed to {seed}, "
103 | f"deterministic: {cfg.deterministic}")
104 | set_random_seed(seed, deterministic=cfg.deterministic)
105 | meta['seed'] = seed
106 |
107 | # Set model
108 | model = ViTPose(cfg.model)
109 | if cfg.resume_from:
110 | model.load_state_dict(torch.load(cfg.resume_from)['state_dict'])
111 |
112 | # Set dataset
113 | datasets_train = COCODataset(
114 | root_path=cfg.data_root,
115 | data_version="train_custom",
116 | is_train=True,
117 | use_gt_bboxes=True,
118 | image_width=192,
119 | image_height=256,
120 | scale=True,
121 | scale_factor=0.35,
122 | flip_prob=0.5,
123 | rotate_prob=0.5,
124 | rotation_factor=45.,
125 | half_body_prob=0.3,
126 | use_different_joints_weight=True,
127 | heatmap_sigma=3,
128 | soft_nms=False
129 | )
130 |
131 | datasets_valid = COCODataset(
132 | root_path=cfg.data_root,
133 | data_version="valid_custom",
134 | is_train=False,
135 | use_gt_bboxes=True,
136 | image_width=192,
137 | image_height=256,
138 | scale=False,
139 | scale_factor=0.35,
140 | flip_prob=0.5,
141 | rotate_prob=0.5,
142 | rotation_factor=45.,
143 | half_body_prob=0.3,
144 | use_different_joints_weight=True,
145 | heatmap_sigma=3,
146 | soft_nms=False
147 | )
148 |
149 | train_model(
150 | model=model,
151 | datasets_train=datasets_train,
152 | datasets_valid=datasets_valid,
153 | cfg=cfg,
154 | distributed=distributed,
155 | validate=cfg.validate,
156 | timestamp=timestamp,
157 | meta=meta
158 | )
159 |
160 |
161 | if __name__ == '__main__':
162 | main()
163 |
--------------------------------------------------------------------------------
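Training is launched through the click CLI above, e.g. `python train.py --config-path config.yaml --model-name b`. Each run gets its own zero-padded session folder under `runs/train`; a small stand-alone sketch of that numbering logic (paths and values are illustrative):

import os
from glob import glob

work_dir = 'runs/train'
os.makedirs(work_dir, exist_ok=True)
sessions = sorted(glob(f"{work_dir}/*"))
session = 1 if not sessions else int(os.path.basename(sessions[-1])) + 1
session_dir = os.path.join(work_dir, str(session).zfill(3))
print(session_dir)      # runs/train/001 on the first run, runs/train/002 on the next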
/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .util import *
2 | from .top_down_eval import *
3 | from .post_processing import *
4 | from .visualization import *
5 | from .dist_util import *
6 | from .logging import *
7 |
--------------------------------------------------------------------------------
/utils/dist_util.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
3 | import functools
4 | import os
5 | import socket
6 | import subprocess
7 | from collections import OrderedDict
8 | from typing import Callable, List, Optional, Tuple
9 |
10 | import torch
11 | import torch.multiprocessing as mp
12 | from torch import distributed as dist
13 | from torch._utils import (_flatten_dense_tensors, _take_tensors,
14 | _unflatten_dense_tensors)
15 |
16 |
17 | def is_mps_available() -> bool:
18 | """Return True if mps devices exist.
19 |
20 |     It is specialized for Apple M1 chips and requires torch 1.12 or higher.
21 | """
22 | try:
23 | import torch
24 | return hasattr(torch.backends,
25 | 'mps') and torch.backends.mps.is_available()
26 | except Exception:
27 | return False
28 |
29 | def _find_free_port() -> int:
30 | # Copied from https://github.com/facebookresearch/detectron2/blob/main/detectron2/engine/launch.py # noqa: E501
31 | sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
32 | # Binding to port 0 will cause the OS to find an available port for us
33 | sock.bind(('', 0))
34 | port = sock.getsockname()[1]
35 | sock.close()
36 | # NOTE: there is still a chance the port could be taken by other processes.
37 | return port
38 |
39 |
40 | def _is_free_port(port: int) -> bool:
41 | ips = socket.gethostbyname_ex(socket.gethostname())[-1]
42 | ips.append('localhost')
43 | with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
44 | return all(s.connect_ex((ip, port)) != 0 for ip in ips)
45 |
46 |
47 | def init_dist(launcher: str, backend: str = 'nccl', **kwargs) -> None:
48 | if mp.get_start_method(allow_none=True) is None:
49 | mp.set_start_method('spawn')
50 | if launcher == 'pytorch':
51 | _init_dist_pytorch(backend, **kwargs)
52 | elif launcher == 'mpi':
53 | _init_dist_mpi(backend, **kwargs)
54 | elif launcher == 'slurm':
55 | _init_dist_slurm(backend, **kwargs)
56 | else:
57 | raise ValueError(f'Invalid launcher type: {launcher}')
58 |
59 |
60 | def _init_dist_pytorch(backend: str, **kwargs) -> None:
61 | # TODO: use local_rank instead of rank % num_gpus
62 | rank = int(os.environ['RANK'])
63 | num_gpus = torch.cuda.device_count()
64 | torch.cuda.set_device(rank % num_gpus)
65 | dist.init_process_group(backend=backend, **kwargs)
66 |
67 |
68 | def _init_dist_mpi(backend: str, **kwargs) -> None:
69 | local_rank = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])
70 | torch.cuda.set_device(local_rank)
71 | if 'MASTER_PORT' not in os.environ:
72 | # 29500 is torch.distributed default port
73 | os.environ['MASTER_PORT'] = '29500'
74 | if 'MASTER_ADDR' not in os.environ:
75 | raise KeyError('The environment variable MASTER_ADDR is not set')
76 | os.environ['WORLD_SIZE'] = os.environ['OMPI_COMM_WORLD_SIZE']
77 | os.environ['RANK'] = os.environ['OMPI_COMM_WORLD_RANK']
78 | dist.init_process_group(backend=backend, **kwargs)
79 |
80 |
81 | def _init_dist_slurm(backend: str, port: Optional[int] = None) -> None:
82 | """Initialize slurm distributed training environment.
83 |
84 | If argument ``port`` is not specified, then the master port will be system
85 | environment variable ``MASTER_PORT``. If ``MASTER_PORT`` is not in system
86 | environment variable, then a default port ``29500`` will be used.
87 |
88 | Args:
89 | backend (str): Backend of torch.distributed.
90 | port (int, optional): Master port. Defaults to None.
91 | """
92 | proc_id = int(os.environ['SLURM_PROCID'])
93 | ntasks = int(os.environ['SLURM_NTASKS'])
94 | node_list = os.environ['SLURM_NODELIST']
95 | num_gpus = torch.cuda.device_count()
96 | torch.cuda.set_device(proc_id % num_gpus)
97 | addr = subprocess.getoutput(
98 | f'scontrol show hostname {node_list} | head -n1')
99 | # specify master port
100 | if port is not None:
101 | os.environ['MASTER_PORT'] = str(port)
102 | elif 'MASTER_PORT' in os.environ:
103 | pass # use MASTER_PORT in the environment variable
104 | else:
105 | # if torch.distributed default port(29500) is available
106 | # then use it, else find a free port
107 | if _is_free_port(29500):
108 | os.environ['MASTER_PORT'] = '29500'
109 | else:
110 | os.environ['MASTER_PORT'] = str(_find_free_port())
111 | # use MASTER_ADDR in the environment variable if it already exists
112 | if 'MASTER_ADDR' not in os.environ:
113 | os.environ['MASTER_ADDR'] = addr
114 | os.environ['WORLD_SIZE'] = str(ntasks)
115 | os.environ['LOCAL_RANK'] = str(proc_id % num_gpus)
116 | os.environ['RANK'] = str(proc_id)
117 | dist.init_process_group(backend=backend)
118 |
119 |
120 | def get_dist_info() -> Tuple[int, int]:
121 | if dist.is_available() and dist.is_initialized():
122 | rank = dist.get_rank()
123 | world_size = dist.get_world_size()
124 | else:
125 | rank = 0
126 | world_size = 1
127 | return rank, world_size
128 |
129 |
130 | def master_only(func: Callable) -> Callable:
131 |
132 | @functools.wraps(func)
133 | def wrapper(*args, **kwargs):
134 | rank, _ = get_dist_info()
135 | if rank == 0:
136 | return func(*args, **kwargs)
137 |
138 | return wrapper
139 |
140 |
141 | def allreduce_params(params: List[torch.nn.Parameter],
142 | coalesce: bool = True,
143 | bucket_size_mb: int = -1) -> None:
144 | """Allreduce parameters.
145 |
146 | Args:
147 | params (list[torch.nn.Parameter]): List of parameters or buffers
148 | of a model.
149 | coalesce (bool, optional): Whether allreduce parameters as a whole.
150 | Defaults to True.
151 | bucket_size_mb (int, optional): Size of bucket, the unit is MB.
152 | Defaults to -1.
153 | """
154 | _, world_size = get_dist_info()
155 | if world_size == 1:
156 | return
157 | params = [param.data for param in params]
158 | if coalesce:
159 | _allreduce_coalesced(params, world_size, bucket_size_mb)
160 | else:
161 | for tensor in params:
162 | dist.all_reduce(tensor.div_(world_size))
163 |
164 |
165 | def allreduce_grads(params: List[torch.nn.Parameter],
166 | coalesce: bool = True,
167 | bucket_size_mb: int = -1) -> None:
168 | """Allreduce gradients.
169 |
170 | Args:
171 | params (list[torch.nn.Parameter]): List of parameters of a model.
172 | coalesce (bool, optional): Whether allreduce parameters as a whole.
173 | Defaults to True.
174 | bucket_size_mb (int, optional): Size of bucket, the unit is MB.
175 | Defaults to -1.
176 | """
177 | grads = [
178 | param.grad.data for param in params
179 | if param.requires_grad and param.grad is not None
180 | ]
181 | _, world_size = get_dist_info()
182 | if world_size == 1:
183 | return
184 | if coalesce:
185 | _allreduce_coalesced(grads, world_size, bucket_size_mb)
186 | else:
187 | for tensor in grads:
188 | dist.all_reduce(tensor.div_(world_size))
189 |
190 |
191 | def _allreduce_coalesced(tensors: List[torch.Tensor],
192 | world_size: int,
193 | bucket_size_mb: int = -1) -> None:
194 | if bucket_size_mb > 0:
195 | bucket_size_bytes = bucket_size_mb * 1024 * 1024
196 | buckets = _take_tensors(tensors, bucket_size_bytes)
197 | else:
198 | buckets = OrderedDict()
199 | for tensor in tensors:
200 | tp = tensor.type()
201 | if tp not in buckets:
202 | buckets[tp] = []
203 | buckets[tp].append(tensor)
204 | buckets = buckets.values()
205 |
206 | for bucket in buckets:
207 | flat_tensors = _flatten_dense_tensors(bucket)
208 | dist.all_reduce(flat_tensors)
209 | flat_tensors.div_(world_size)
210 | for tensor, synced in zip(
211 | bucket, _unflatten_dense_tensors(flat_tensors, bucket)):
212 | tensor.copy_(synced)
213 |
--------------------------------------------------------------------------------
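A minimal sketch of the `master_only` decorator defined above: the wrapped function runs only on rank 0, so per-rank duplicates (checkpoint saves, console prints) are suppressed in distributed runs. The function name and path below are illustrative:

from utils.dist_util import get_dist_info, master_only

@master_only
def log_checkpoint(path):
    print(f'saving checkpoint to {path}')    # executes on rank 0 only

rank, world_size = get_dist_info()           # (0, 1) when torch.distributed is not initialized
log_checkpoint('runs/train/001/last.pth')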
/utils/logging.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | import logging
3 |
4 | import torch.distributed as dist
5 |
6 | logger_initialized: dict = {}
7 |
8 |
9 | def get_logger(name, log_file=None, log_level=logging.INFO, file_mode='w'):
10 | """Initialize and get a logger by name.
11 |
12 | If the logger has not been initialized, this method will initialize the
13 | logger by adding one or two handlers, otherwise the initialized logger will
14 | be directly returned. During initialization, a StreamHandler will always be
15 | added. If `log_file` is specified and the process rank is 0, a FileHandler
16 | will also be added.
17 |
18 | Args:
19 | name (str): Logger name.
20 | log_file (str | None): The log filename. If specified, a FileHandler
21 | will be added to the logger.
22 | log_level (int): The logger level. Note that only the process of
23 | rank 0 is affected, and other processes will set the level to
24 |             "Error" and thus be silent most of the time.
25 | file_mode (str): The file mode used in opening log file.
26 | Defaults to 'w'.
27 |
28 | Returns:
29 | logging.Logger: The expected logger.
30 | """
31 | logger = logging.getLogger(name)
32 | if name in logger_initialized:
33 | return logger
34 | # handle hierarchical names
35 | # e.g., logger "a" is initialized, then logger "a.b" will skip the
36 | # initialization since it is a child of "a".
37 | for logger_name in logger_initialized:
38 | if name.startswith(logger_name):
39 | return logger
40 |
41 | # handle duplicate logs to the console
42 | # Starting in 1.8.0, PyTorch DDP attaches a StreamHandler (NOTSET)
43 | # to the root logger. As logger.propagate is True by default, this root
44 | # level handler causes logging messages from rank>0 processes to
45 | # unexpectedly show up on the console, creating much unwanted clutter.
46 | # To fix this issue, we set the root logger's StreamHandler, if any, to log
47 | # at the ERROR level.
48 | for handler in logger.root.handlers:
49 | if type(handler) is logging.StreamHandler:
50 | handler.setLevel(logging.ERROR)
51 |
52 | stream_handler = logging.StreamHandler()
53 | handlers = [stream_handler]
54 |
55 | if dist.is_available() and dist.is_initialized():
56 | rank = dist.get_rank()
57 | else:
58 | rank = 0
59 |
60 | # only rank 0 will add a FileHandler
61 | if rank == 0 and log_file is not None:
62 |         # The stdlib FileHandler defaults to append mode ('a'); file_mode
63 |         # (default 'w' here) lets callers control how the log file is
64 |         # opened.
65 | file_handler = logging.FileHandler(log_file, file_mode)
66 | handlers.append(file_handler)
67 |
68 | formatter = logging.Formatter(
69 | '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
70 | for handler in handlers:
71 | handler.setFormatter(formatter)
72 | handler.setLevel(log_level)
73 | logger.addHandler(handler)
74 |
75 | if rank == 0:
76 | logger.setLevel(log_level)
77 | else:
78 | logger.setLevel(logging.ERROR)
79 |
80 | logger_initialized[name] = True
81 |
82 | return logger
83 |
84 |
85 | def print_log(msg, logger=None, level=logging.INFO):
86 | """Print a log message.
87 |
88 | Args:
89 | msg (str): The message to be logged.
90 | logger (logging.Logger | str | None): The logger to be used.
91 | Some special loggers are:
92 |
93 | - "silent": no message will be printed.
94 | - other str: the logger obtained with `get_root_logger(logger)`.
95 | - None: The `print()` method will be used to print log messages.
96 | level (int): Logging level. Only available when `logger` is a Logger
97 | object or "root".
98 | """
99 | if logger is None:
100 | print(msg)
101 | elif isinstance(logger, logging.Logger):
102 | logger.log(level, msg)
103 | elif logger == 'silent':
104 | pass
105 | elif isinstance(logger, str):
106 | _logger = get_logger(logger)
107 | _logger.log(level, msg)
108 | else:
109 | raise TypeError(
110 | 'logger should be either a logging.Logger object, str, '
111 | f'"silent" or None, but got {type(logger)}')
112 |
113 |
114 | def get_root_logger(log_file=None, log_level=logging.INFO):
115 |     """Use the `get_logger` method above to get the root logger.
116 |
117 | The logger will be initialized if it has not been initialized. By default a
118 | StreamHandler will be added. If `log_file` is specified, a FileHandler will
119 |     also be added. The name of the root logger is the top-level package name
120 |     of this module, e.g., "utils".
121 |
122 | Args:
123 | log_file (str | None): The log filename. If specified, a FileHandler
124 | will be added to the root logger.
125 | log_level (int): The root logger level. Note that only the process of
126 | rank 0 is affected, while other processes will set the level to
127 | "Error" and be silent most of the time.
128 |
129 | Returns:
130 | logging.Logger: The root logger.
131 | """
132 | return get_logger(__name__.split('.')[0], log_file, log_level)
133 |
134 |
--------------------------------------------------------------------------------
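A minimal usage sketch of the logger helpers above; the log message is illustrative:

from utils.logging import get_root_logger

logger = get_root_logger(log_file=None)   # pass a path to also write a log file on rank 0
logger.info('training started')           # processes with rank > 0 stay at ERROR level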
/utils/nms/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaehyunnn/ViTPose_pytorch/1bd3cc3982d5b976622df54bcdc2106bb16e6d16/utils/nms/__init__.py
--------------------------------------------------------------------------------
/utils/nms/cpu_nms.cpython-37m-x86_64-linux-gnu.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaehyunnn/ViTPose_pytorch/1bd3cc3982d5b976622df54bcdc2106bb16e6d16/utils/nms/cpu_nms.cpython-37m-x86_64-linux-gnu.so
--------------------------------------------------------------------------------
/utils/nms/cpu_nms.cpython-39-x86_64-linux-gnu.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaehyunnn/ViTPose_pytorch/1bd3cc3982d5b976622df54bcdc2106bb16e6d16/utils/nms/cpu_nms.cpython-39-x86_64-linux-gnu.so
--------------------------------------------------------------------------------
/utils/nms/cpu_nms.pyx:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------------
2 | # Copyright (c) Microsoft
3 | # Licensed under the MIT License.
4 | # Written by Bin Xiao (Bin.Xiao@microsoft.com)
5 | # ------------------------------------------------------------------------------
6 |
7 | from __future__ import absolute_import
8 | from __future__ import division
9 | from __future__ import print_function
10 |
11 | import numpy as np
12 | cimport numpy as np
13 |
14 | cdef inline np.float32_t max(np.float32_t a, np.float32_t b):
15 | return a if a >= b else b
16 |
17 | cdef inline np.float32_t min(np.float32_t a, np.float32_t b):
18 | return a if a <= b else b
19 |
20 | def cpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh):
21 | cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0]
22 | cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1]
23 | cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2]
24 | cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3]
25 | cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4]
26 |
27 | cdef np.ndarray[np.float32_t, ndim=1] areas = (x2 - x1 + 1) * (y2 - y1 + 1)
28 | cdef np.ndarray[np.int_t, ndim=1] order = scores.argsort()[::-1].astype('i')
29 |
30 | cdef int ndets = dets.shape[0]
31 | cdef np.ndarray[np.int_t, ndim=1] suppressed = \
32 |         np.zeros((ndets), dtype=np.int_)
33 |
34 | # nominal indices
35 | cdef int _i, _j
36 | # sorted indices
37 | cdef int i, j
38 | # temp variables for box i's (the box currently under consideration)
39 | cdef np.float32_t ix1, iy1, ix2, iy2, iarea
40 | # variables for computing overlap with box j (lower scoring box)
41 | cdef np.float32_t xx1, yy1, xx2, yy2
42 | cdef np.float32_t w, h
43 | cdef np.float32_t inter, ovr
44 |
45 | keep = []
46 | for _i in range(ndets):
47 | i = order[_i]
48 | if suppressed[i] == 1:
49 | continue
50 | keep.append(i)
51 | ix1 = x1[i]
52 | iy1 = y1[i]
53 | ix2 = x2[i]
54 | iy2 = y2[i]
55 | iarea = areas[i]
56 | for _j in range(_i + 1, ndets):
57 | j = order[_j]
58 | if suppressed[j] == 1:
59 | continue
60 | xx1 = max(ix1, x1[j])
61 | yy1 = max(iy1, y1[j])
62 | xx2 = min(ix2, x2[j])
63 | yy2 = min(iy2, y2[j])
64 | w = max(0.0, xx2 - xx1 + 1)
65 | h = max(0.0, yy2 - yy1 + 1)
66 | inter = w * h
67 | ovr = inter / (iarea + areas[j] - inter)
68 | if ovr >= thresh:
69 | suppressed[j] = 1
70 |
71 | return keep
72 |
--------------------------------------------------------------------------------
/utils/nms/gpu_nms.cpython-37m-x86_64-linux-gnu.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaehyunnn/ViTPose_pytorch/1bd3cc3982d5b976622df54bcdc2106bb16e6d16/utils/nms/gpu_nms.cpython-37m-x86_64-linux-gnu.so
--------------------------------------------------------------------------------
/utils/nms/gpu_nms.cpython-39-x86_64-linux-gnu.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaehyunnn/ViTPose_pytorch/1bd3cc3982d5b976622df54bcdc2106bb16e6d16/utils/nms/gpu_nms.cpython-39-x86_64-linux-gnu.so
--------------------------------------------------------------------------------
/utils/nms/gpu_nms.hpp:
--------------------------------------------------------------------------------
1 | void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num,
2 | int boxes_dim, float nms_overlap_thresh, int device_id);
3 |
--------------------------------------------------------------------------------
/utils/nms/gpu_nms.pyx:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------------
2 | # Copyright (c) Microsoft
3 | # Licensed under the MIT License.
4 | # Written by Bin Xiao (Bin.Xiao@microsoft.com)
5 | # ------------------------------------------------------------------------------
6 |
7 | from __future__ import absolute_import
8 | from __future__ import division
9 | from __future__ import print_function
10 |
11 | import numpy as np
12 | cimport numpy as np
13 |
14 | assert sizeof(int) == sizeof(np.int32_t)
15 |
16 | cdef extern from "gpu_nms.hpp":
17 | void _nms(np.int32_t*, int*, np.float32_t*, int, int, float, int)
18 |
19 | def gpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh,
20 | np.int32_t device_id=0):
21 | cdef int boxes_num = dets.shape[0]
22 | cdef int boxes_dim = dets.shape[1]
23 | cdef int num_out
24 | cdef np.ndarray[np.int32_t, ndim=1] \
25 | keep = np.zeros(boxes_num, dtype=np.int32)
26 | cdef np.ndarray[np.float32_t, ndim=1] \
27 | scores = dets[:, 4]
28 | cdef np.ndarray[np.int32_t, ndim=1] \
29 | order = scores.argsort()[::-1].astype(np.int32)
30 | cdef np.ndarray[np.float32_t, ndim=2] \
31 | sorted_dets = dets[order, :]
32 | _nms(&keep[0], &num_out, &sorted_dets[0, 0], boxes_num, boxes_dim, thresh, device_id)
33 | keep = keep[:num_out]
34 | return list(order[keep])
35 |
--------------------------------------------------------------------------------
/utils/nms/nms.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------------
2 | # Copyright (c) Microsoft
3 | # Licensed under the MIT License.
4 | # Modified from py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn)
5 | # ------------------------------------------------------------------------------
6 |
7 | from __future__ import absolute_import
8 | from __future__ import division
9 | from __future__ import print_function
10 |
11 | import numpy as np
12 |
13 | from .cpu_nms import cpu_nms
14 | from .gpu_nms import gpu_nms
15 |
16 |
17 | def py_nms_wrapper(thresh):
18 | def _nms(dets):
19 | return nms(dets, thresh)
20 | return _nms
21 |
22 |
23 | def cpu_nms_wrapper(thresh):
24 | def _nms(dets):
25 | return cpu_nms(dets, thresh)
26 | return _nms
27 |
28 |
29 | def gpu_nms_wrapper(thresh, device_id):
30 | def _nms(dets):
31 | return gpu_nms(dets, thresh, device_id)
32 | return _nms
33 |
34 |
35 | def nms(dets, thresh):
36 | """
37 | greedily select boxes with high confidence and overlap with current maximum <= thresh
38 | rule out overlap >= thresh
39 |     :param dets: [[x1, y1, x2, y2, score]]
40 | :param thresh: retain overlap < thresh
41 | :return: indexes to keep
42 | """
43 | if dets.shape[0] == 0:
44 | return []
45 |
46 | x1 = dets[:, 0]
47 | y1 = dets[:, 1]
48 | x2 = dets[:, 2]
49 | y2 = dets[:, 3]
50 | scores = dets[:, 4]
51 |
52 | areas = (x2 - x1 + 1) * (y2 - y1 + 1)
53 | order = scores.argsort()[::-1]
54 |
55 | keep = []
56 | while order.size > 0:
57 | i = order[0]
58 | keep.append(i)
59 | xx1 = np.maximum(x1[i], x1[order[1:]])
60 | yy1 = np.maximum(y1[i], y1[order[1:]])
61 | xx2 = np.minimum(x2[i], x2[order[1:]])
62 | yy2 = np.minimum(y2[i], y2[order[1:]])
63 |
64 | w = np.maximum(0.0, xx2 - xx1 + 1)
65 | h = np.maximum(0.0, yy2 - yy1 + 1)
66 | inter = w * h
67 | ovr = inter / (areas[i] + areas[order[1:]] - inter)
68 |
69 | inds = np.where(ovr <= thresh)[0]
70 | order = order[inds + 1]
71 |
72 | return keep
73 |
74 |
75 | def oks_iou(g, d, a_g, a_d, sigmas=None, in_vis_thre=None):
76 | if not isinstance(sigmas, np.ndarray):
77 | sigmas = np.array([.26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87, .89, .89]) / 10.0
78 | vars = (sigmas * 2) ** 2
79 | xg = g[0::3]
80 | yg = g[1::3]
81 | vg = g[2::3]
82 | ious = np.zeros((d.shape[0]))
83 | for n_d in range(0, d.shape[0]):
84 | xd = d[n_d, 0::3]
85 | yd = d[n_d, 1::3]
86 | vd = d[n_d, 2::3]
87 | dx = xd - xg
88 | dy = yd - yg
89 | e = (dx ** 2 + dy ** 2) / vars / ((a_g + a_d[n_d]) / 2 + np.spacing(1)) / 2
90 | if in_vis_thre is not None:
91 | ind = list(vg > in_vis_thre) and list(vd > in_vis_thre)
92 | e = e[ind]
93 | ious[n_d] = np.sum(np.exp(-e)) / e.shape[0] if e.shape[0] != 0 else 0.0
94 | return ious
95 |
96 |
97 | def oks_nms(kpts_db, thresh, sigmas=None, in_vis_thre=None):
98 | """
99 | greedily select boxes with high confidence and overlap with current maximum <= thresh
100 | rule out overlap >= thresh, overlap = oks
101 | :param kpts_db
102 | :param thresh: retain overlap < thresh
103 | :return: indexes to keep
104 | """
105 | if len(kpts_db) == 0:
106 | return []
107 |
108 | scores = np.array([kpts_db[i]['score'] for i in range(len(kpts_db))])
109 | kpts = np.array([kpts_db[i]['keypoints'].flatten() for i in range(len(kpts_db))])
110 | areas = np.array([kpts_db[i]['area'] for i in range(len(kpts_db))])
111 |
112 | order = scores.argsort()[::-1]
113 |
114 | keep = []
115 | while order.size > 0:
116 | i = order[0]
117 | keep.append(i)
118 |
119 | oks_ovr = oks_iou(kpts[i], kpts[order[1:]], areas[i], areas[order[1:]], sigmas, in_vis_thre)
120 |
121 | inds = np.where(oks_ovr <= thresh)[0]
122 | order = order[inds + 1]
123 |
124 | return keep
125 |
126 |
127 | def rescore(overlap, scores, thresh, type='gaussian'):
128 | assert overlap.shape[0] == scores.shape[0]
129 | if type == 'linear':
130 | inds = np.where(overlap >= thresh)[0]
131 | scores[inds] = scores[inds] * (1 - overlap[inds])
132 | else:
133 | scores = scores * np.exp(- overlap**2 / thresh)
134 |
135 | return scores
136 |
137 |
138 | def soft_oks_nms(kpts_db, thresh, sigmas=None, in_vis_thre=None):
139 | """
140 | greedily select boxes with high confidence and overlap with current maximum <= thresh
141 | rule out overlap >= thresh, overlap = oks
142 | :param kpts_db
143 | :param thresh: retain overlap < thresh
144 | :return: indexes to keep
145 | """
146 | if len(kpts_db) == 0:
147 | return []
148 |
149 | scores = np.array([kpts_db[i]['score'] for i in range(len(kpts_db))])
150 | kpts = np.array([kpts_db[i]['keypoints'].flatten() for i in range(len(kpts_db))])
151 | areas = np.array([kpts_db[i]['area'] for i in range(len(kpts_db))])
152 |
153 | order = scores.argsort()[::-1]
154 | scores = scores[order]
155 |
156 | # max_dets = order.size
157 | max_dets = 20
158 | keep = np.zeros(max_dets, dtype=np.intp)
159 | keep_cnt = 0
160 | while order.size > 0 and keep_cnt < max_dets:
161 | i = order[0]
162 |
163 | oks_ovr = oks_iou(kpts[i], kpts[order[1:]], areas[i], areas[order[1:]], sigmas, in_vis_thre)
164 |
165 | order = order[1:]
166 | scores = rescore(oks_ovr, scores[1:], thresh)
167 |
168 | tmp = scores.argsort()[::-1]
169 | order = order[tmp]
170 | scores = scores[tmp]
171 |
172 | keep[keep_cnt] = i
173 | keep_cnt += 1
174 |
175 | keep = keep[:keep_cnt]
176 |
177 | return keep
178 | # kpts_db = kpts_db[:keep_cnt]
179 |
180 | # return kpts_db
181 |
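Illustrative usage sketch (not a file in this repository): nms() above expects detections as rows of [x1, y1, x2, y2, score] and returns the indices of the boxes to keep. A minimal sketch, assuming the Cython/CUDA extensions in this directory have already been built (see setup_linux.py below), since importing utils.nms.nms also pulls in cpu_nms and gpu_nms:

    import numpy as np
    from utils.nms.nms import nms, gpu_nms_wrapper

    dets = np.array([
        [10, 10, 50, 50, 0.9],      # x1, y1, x2, y2, score
        [12, 12, 52, 52, 0.8],      # heavily overlaps the first box
        [100, 100, 150, 150, 0.7],
    ], dtype=np.float32)

    keep = nms(dets, 0.5)           # pure-NumPy path -> [0, 2]
    # gpu_nms = gpu_nms_wrapper(0.5, device_id=0)   # CUDA path (needs the built extension)
    # keep = gpu_nms(dets)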
--------------------------------------------------------------------------------
/utils/nms/nms_kernel.cu:
--------------------------------------------------------------------------------
1 | // ------------------------------------------------------------------
2 | // Copyright (c) Microsoft
3 | // Licensed under The MIT License
4 | // Modified from MATLAB Faster R-CNN (https://github.com/shaoqingren/faster_rcnn)
5 | // ------------------------------------------------------------------
6 |
7 | #include "gpu_nms.hpp"
8 | #include <vector>
9 | #include <iostream>
10 |
11 | #define CUDA_CHECK(condition) \
12 | /* Code block avoids redefinition of cudaError_t error */ \
13 | do { \
14 | cudaError_t error = condition; \
15 | if (error != cudaSuccess) { \
16 | std::cout << cudaGetErrorString(error) << std::endl; \
17 | } \
18 | } while (0)
19 |
20 | #define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
21 | int const threadsPerBlock = sizeof(unsigned long long) * 8;
22 |
23 | __device__ inline float devIoU(float const * const a, float const * const b) {
24 | float left = max(a[0], b[0]), right = min(a[2], b[2]);
25 | float top = max(a[1], b[1]), bottom = min(a[3], b[3]);
26 | float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
27 | float interS = width * height;
28 | float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
29 | float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
30 | return interS / (Sa + Sb - interS);
31 | }
32 |
33 | __global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh,
34 | const float *dev_boxes, unsigned long long *dev_mask) {
35 | const int row_start = blockIdx.y;
36 | const int col_start = blockIdx.x;
37 |
38 | // if (row_start > col_start) return;
39 |
40 | const int row_size =
41 | min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
42 | const int col_size =
43 | min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
44 |
45 | __shared__ float block_boxes[threadsPerBlock * 5];
46 | if (threadIdx.x < col_size) {
47 | block_boxes[threadIdx.x * 5 + 0] =
48 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0];
49 | block_boxes[threadIdx.x * 5 + 1] =
50 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1];
51 | block_boxes[threadIdx.x * 5 + 2] =
52 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2];
53 | block_boxes[threadIdx.x * 5 + 3] =
54 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3];
55 | block_boxes[threadIdx.x * 5 + 4] =
56 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4];
57 | }
58 | __syncthreads();
59 |
60 | if (threadIdx.x < row_size) {
61 | const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
62 | const float *cur_box = dev_boxes + cur_box_idx * 5;
63 | int i = 0;
64 | unsigned long long t = 0;
65 | int start = 0;
66 | if (row_start == col_start) {
67 | start = threadIdx.x + 1;
68 | }
69 | for (i = start; i < col_size; i++) {
70 | if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
71 | t |= 1ULL << i;
72 | }
73 | }
74 | const int col_blocks = DIVUP(n_boxes, threadsPerBlock);
75 | dev_mask[cur_box_idx * col_blocks + col_start] = t;
76 | }
77 | }
78 |
79 | void _set_device(int device_id) {
80 | int current_device;
81 | CUDA_CHECK(cudaGetDevice(&current_device));
82 | if (current_device == device_id) {
83 | return;
84 | }
85 | // The call to cudaSetDevice must come before any calls to Get, which
86 | // may perform initialization using the GPU.
87 | CUDA_CHECK(cudaSetDevice(device_id));
88 | }
89 |
90 | void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num,
91 | int boxes_dim, float nms_overlap_thresh, int device_id) {
92 | _set_device(device_id);
93 |
94 | float* boxes_dev = NULL;
95 | unsigned long long* mask_dev = NULL;
96 |
97 | const int col_blocks = DIVUP(boxes_num, threadsPerBlock);
98 |
99 | CUDA_CHECK(cudaMalloc(&boxes_dev,
100 | boxes_num * boxes_dim * sizeof(float)));
101 | CUDA_CHECK(cudaMemcpy(boxes_dev,
102 | boxes_host,
103 | boxes_num * boxes_dim * sizeof(float),
104 | cudaMemcpyHostToDevice));
105 |
106 | CUDA_CHECK(cudaMalloc(&mask_dev,
107 | boxes_num * col_blocks * sizeof(unsigned long long)));
108 |
109 | dim3 blocks(DIVUP(boxes_num, threadsPerBlock),
110 | DIVUP(boxes_num, threadsPerBlock));
111 | dim3 threads(threadsPerBlock);
112 | nms_kernel<<<blocks, threads>>>(boxes_num,
113 | nms_overlap_thresh,
114 | boxes_dev,
115 | mask_dev);
116 |
117 | std::vector<unsigned long long> mask_host(boxes_num * col_blocks);
118 | CUDA_CHECK(cudaMemcpy(&mask_host[0],
119 | mask_dev,
120 | sizeof(unsigned long long) * boxes_num * col_blocks,
121 | cudaMemcpyDeviceToHost));
122 |
123 | std::vector<unsigned long long> remv(col_blocks);
124 | memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);
125 |
126 | int num_to_keep = 0;
127 | for (int i = 0; i < boxes_num; i++) {
128 | int nblock = i / threadsPerBlock;
129 | int inblock = i % threadsPerBlock;
130 |
131 | if (!(remv[nblock] & (1ULL << inblock))) {
132 | keep_out[num_to_keep++] = i;
133 | unsigned long long *p = &mask_host[0] + i * col_blocks;
134 | for (int j = nblock; j < col_blocks; j++) {
135 | remv[j] |= p[j];
136 | }
137 | }
138 | }
139 | *num_out = num_to_keep;
140 |
141 | CUDA_CHECK(cudaFree(boxes_dev));
142 | CUDA_CHECK(cudaFree(mask_dev));
143 | }
144 |
--------------------------------------------------------------------------------
/utils/nms/nms_ori.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------------
2 | # Copyright (c) Microsoft
3 | # Licensed under the MIT License.
4 | # Modified from py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn)
5 | # ------------------------------------------------------------------------------
6 |
7 | from __future__ import absolute_import
8 | from __future__ import division
9 | from __future__ import print_function
10 |
11 | import numpy as np
12 |
13 | from cpu_nms import cpu_nms
14 | from gpu_nms import gpu_nms
15 |
16 |
17 | def py_nms_wrapper(thresh):
18 | def _nms(dets):
19 | return nms(dets, thresh)
20 | return _nms
21 |
22 |
23 | def cpu_nms_wrapper(thresh):
24 | def _nms(dets):
25 | return cpu_nms(dets, thresh)
26 | return _nms
27 |
28 |
29 | def gpu_nms_wrapper(thresh, device_id):
30 | def _nms(dets):
31 | return gpu_nms(dets, thresh, device_id)
32 | return _nms
33 |
34 |
35 | def nms(dets, thresh):
36 | """
37 | greedily select boxes with high confidence and overlap with current maximum <= thresh
38 | rule out overlap >= thresh
39 | :param dets: [[x1, y1, x2, y2 score]]
40 | :param thresh: retain overlap < thresh
41 | :return: indexes to keep
42 | """
43 | if dets.shape[0] == 0:
44 | return []
45 |
46 | x1 = dets[:, 0]
47 | y1 = dets[:, 1]
48 | x2 = dets[:, 2]
49 | y2 = dets[:, 3]
50 | scores = dets[:, 4]
51 |
52 | areas = (x2 - x1 + 1) * (y2 - y1 + 1)
53 | order = scores.argsort()[::-1]
54 |
55 | keep = []
56 | while order.size > 0:
57 | i = order[0]
58 | keep.append(i)
59 | xx1 = np.maximum(x1[i], x1[order[1:]])
60 | yy1 = np.maximum(y1[i], y1[order[1:]])
61 | xx2 = np.minimum(x2[i], x2[order[1:]])
62 | yy2 = np.minimum(y2[i], y2[order[1:]])
63 |
64 | w = np.maximum(0.0, xx2 - xx1 + 1)
65 | h = np.maximum(0.0, yy2 - yy1 + 1)
66 | inter = w * h
67 | ovr = inter / (areas[i] + areas[order[1:]] - inter)
68 |
69 | inds = np.where(ovr <= thresh)[0]
70 | order = order[inds + 1]
71 |
72 | return keep
73 |
74 |
75 | def oks_iou(g, d, a_g, a_d, sigmas=None, in_vis_thre=None):
76 | if not isinstance(sigmas, np.ndarray):
77 | sigmas = np.array([.26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87, .89, .89]) / 10.0
78 | vars = (sigmas * 2) ** 2
79 | xg = g[0::3]
80 | yg = g[1::3]
81 | vg = g[2::3]
82 | ious = np.zeros((d.shape[0]))
83 | for n_d in range(0, d.shape[0]):
84 | xd = d[n_d, 0::3]
85 | yd = d[n_d, 1::3]
86 | vd = d[n_d, 2::3]
87 | dx = xd - xg
88 | dy = yd - yg
89 | e = (dx ** 2 + dy ** 2) / vars / ((a_g + a_d[n_d]) / 2 + np.spacing(1)) / 2
90 | if in_vis_thre is not None:
91 | ind = list(vg > in_vis_thre) and list(vd > in_vis_thre)
92 | e = e[ind]
93 | ious[n_d] = np.sum(np.exp(-e)) / e.shape[0] if e.shape[0] != 0 else 0.0
94 | return ious
95 |
96 |
97 | def oks_nms(kpts_db, thresh, sigmas=None, in_vis_thre=None):
98 | """
99 | greedily select boxes with high confidence and overlap with current maximum <= thresh
100 | rule out overlap >= thresh, overlap = oks
101 | :param kpts_db
102 | :param thresh: retain overlap < thresh
103 | :return: indexes to keep
104 | """
105 | if len(kpts_db) == 0:
106 | return []
107 |
108 | scores = np.array([kpts_db[i]['score'] for i in range(len(kpts_db))])
109 | kpts = np.array([kpts_db[i]['keypoints'].flatten() for i in range(len(kpts_db))])
110 | areas = np.array([kpts_db[i]['area'] for i in range(len(kpts_db))])
111 |
112 | order = scores.argsort()[::-1]
113 |
114 | keep = []
115 | while order.size > 0:
116 | i = order[0]
117 | keep.append(i)
118 |
119 | oks_ovr = oks_iou(kpts[i], kpts[order[1:]], areas[i], areas[order[1:]], sigmas, in_vis_thre)
120 |
121 | inds = np.where(oks_ovr <= thresh)[0]
122 | order = order[inds + 1]
123 |
124 | return keep
125 |
126 |
127 | def rescore(overlap, scores, thresh, type='gaussian'):
128 | assert overlap.shape[0] == scores.shape[0]
129 | if type == 'linear':
130 | inds = np.where(overlap >= thresh)[0]
131 | scores[inds] = scores[inds] * (1 - overlap[inds])
132 | else:
133 | scores = scores * np.exp(- overlap**2 / thresh)
134 |
135 | return scores
136 |
137 |
138 | def soft_oks_nms(kpts_db, thresh, sigmas=None, in_vis_thre=None):
139 | """
140 | greedily select boxes with high confidence and overlap with current maximum <= thresh
141 | rule out overlap >= thresh, overlap = oks
142 | :param kpts_db
143 | :param thresh: retain overlap < thresh
144 | :return: indexes to keep
145 | """
146 | if len(kpts_db) == 0:
147 | return []
148 |
149 | scores = np.array([kpts_db[i]['score'] for i in range(len(kpts_db))])
150 | kpts = np.array([kpts_db[i]['keypoints'].flatten() for i in range(len(kpts_db))])
151 | areas = np.array([kpts_db[i]['area'] for i in range(len(kpts_db))])
152 |
153 | order = scores.argsort()[::-1]
154 | scores = scores[order]
155 |
156 | # max_dets = order.size
157 | max_dets = 20
158 | keep = np.zeros(max_dets, dtype=np.intp)
159 | keep_cnt = 0
160 | while order.size > 0 and keep_cnt < max_dets:
161 | i = order[0]
162 |
163 | oks_ovr = oks_iou(kpts[i], kpts[order[1:]], areas[i], areas[order[1:]], sigmas, in_vis_thre)
164 |
165 | order = order[1:]
166 | scores = rescore(oks_ovr, scores[1:], thresh)
167 |
168 | tmp = scores.argsort()[::-1]
169 | order = order[tmp]
170 | scores = scores[tmp]
171 |
172 | keep[keep_cnt] = i
173 | keep_cnt += 1
174 |
175 | keep = keep[:keep_cnt]
176 |
177 | return keep
178 | # kpts_db = kpts_db[:keep_cnt]
179 |
180 | # return kpts_db
181 |
--------------------------------------------------------------------------------
/utils/nms/setup_linux.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Pose.gluon
3 | # Copyright (c) 2018-present Microsoft
4 | # Licensed under The MIT License [see LICENSE for details]
5 | # Modified from py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn)
6 | # --------------------------------------------------------
7 |
8 | import os
9 | from os.path import join as pjoin
10 | from setuptools import setup
11 | from distutils.extension import Extension
12 | from Cython.Distutils import build_ext
13 | import numpy as np
14 |
15 |
16 | def find_in_path(name, path):
17 | "Find a file in a search path"
18 | # Adapted from
19 | # http://code.activestate.com/recipes/52224-find-a-file-given-a-search-path/
20 | for dir in path.split(os.pathsep):
21 | binpath = pjoin(dir, name)
22 | if os.path.exists(binpath):
23 | return os.path.abspath(binpath)
24 | return None
25 |
26 |
27 | def locate_cuda():
28 | """Locate the CUDA environment on the system
29 | Returns a dict with keys 'home', 'nvcc', 'include', and 'lib64'
30 | and values giving the absolute path to each directory.
31 | Starts by looking for the CUDAHOME env variable. If not found, everything
32 | is based on finding 'nvcc' in the PATH.
33 | """
34 |
35 | # first check if the CUDAHOME env variable is in use
36 | if 'CUDAHOME' in os.environ:
37 | home = os.environ['CUDAHOME']
38 | nvcc = pjoin(home, 'bin', 'nvcc')
39 | else:
40 | # otherwise, search the PATH for NVCC
41 | default_path = pjoin(os.sep, 'usr', 'local', 'cuda', 'bin')
42 | nvcc = find_in_path('nvcc', os.environ['PATH'] + os.pathsep + default_path)
43 | if nvcc is None:
44 | raise EnvironmentError('The nvcc binary could not be '
45 | 'located in your $PATH. Either add it to your path, or set $CUDAHOME')
46 | home = os.path.dirname(os.path.dirname(nvcc))
47 |
48 | cudaconfig = {'home':home, 'nvcc':nvcc,
49 | 'include': pjoin(home, 'include'),
50 | 'lib64': pjoin(home, 'lib64')}
51 | for k, v in cudaconfig.items():
52 | if not os.path.exists(v):
53 | raise EnvironmentError('The CUDA %s path could not be located in %s' % (k, v))
54 |
55 | return cudaconfig
56 | CUDA = locate_cuda()
57 |
58 |
59 | # Obtain the numpy include directory. This logic works across numpy versions.
60 | try:
61 | numpy_include = np.get_include()
62 | except AttributeError:
63 | numpy_include = np.get_numpy_include()
64 |
65 |
66 | def customize_compiler_for_nvcc(self):
67 | """inject deep into distutils to customize how the dispatch
68 | to gcc/nvcc works.
69 | If you subclass UnixCCompiler, it's not trivial to get your subclass
70 | injected in, and still have the right customizations (i.e.
71 | distutils.sysconfig.customize_compiler) run on it. So instead of going
72 | # the OO route, I have this. Note, it's kind of like a weird functional
73 | subclassing going on."""
74 |
75 | # tell the compiler it can process .cu
76 | self.src_extensions.append('.cu')
77 |
78 | # save references to the default compiler_so and _compile methods
79 | default_compiler_so = self.compiler_so
80 | super = self._compile
81 |
82 | # now redefine the _compile method. This gets executed for each
83 | # object but distutils doesn't have the ability to change compilers
84 | # based on source extension: we add it.
85 | def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts):
86 | if os.path.splitext(src)[1] == '.cu':
87 | # use the cuda for .cu files
88 | self.set_executable('compiler_so', CUDA['nvcc'])
89 | # use only a subset of the extra_postargs, which are 1-1 translated
90 | # from the extra_compile_args in the Extension class
91 | postargs = extra_postargs['nvcc']
92 | else:
93 | postargs = extra_postargs['gcc']
94 |
95 | super(obj, src, ext, cc_args, postargs, pp_opts)
96 | # reset the default compiler_so, which we might have changed for cuda
97 | self.compiler_so = default_compiler_so
98 |
99 | # inject our redefined _compile method into the class
100 | self._compile = _compile
101 |
102 |
103 | # run the customize_compiler
104 | class custom_build_ext(build_ext):
105 | def build_extensions(self):
106 | customize_compiler_for_nvcc(self.compiler)
107 | build_ext.build_extensions(self)
108 |
109 |
110 | ext_modules = [
111 | Extension(
112 | "cpu_nms",
113 | ["cpu_nms.pyx"],
114 | extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]},
115 | include_dirs = [numpy_include]
116 | ),
117 | Extension('gpu_nms',
118 | ['nms_kernel.cu', 'gpu_nms.pyx'],
119 | library_dirs=[CUDA['lib64']],
120 | libraries=['cudart'],
121 | language='c++',
122 | runtime_library_dirs=[CUDA['lib64']],
123 | # this syntax is specific to this build system
124 | # we're only going to use certain compiler args with nvcc and not with
125 | # gcc the implementation of this trick is in customize_compiler() below
126 | extra_compile_args={'gcc': ["-Wno-unused-function"],
127 | 'nvcc': ['-arch=sm_35',
128 | '--ptxas-options=-v',
129 | '-c',
130 | '--compiler-options',
131 | "'-fPIC'"]},
132 | include_dirs = [numpy_include, CUDA['include']]
133 | ),
134 | ]
135 |
136 | setup(
137 | name='nms',
138 | ext_modules=ext_modules,
139 | # inject our custom trigger
140 | cmdclass={'build_ext': custom_build_ext},
141 | )
142 |
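Note (not part of the repository): the cpu_nms and gpu_nms extensions imported by utils/nms/nms.py above are built from this script, typically with a command along the lines of "python setup_linux.py build_ext --inplace" run inside utils/nms. The hard-coded '-arch=sm_35' nvcc flag may need to be changed to match the compute capability of the local GPU, and CUDAHOME (or nvcc on PATH) must point at a CUDA toolkit.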
--------------------------------------------------------------------------------
/utils/post_processing/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .nms import oks_iou, oks_nms, soft_oks_nms
3 | from .one_euro_filter import OneEuroFilter
4 | from .post_transforms import (affine_transform, flip_back, fliplr_joints,
5 | fliplr_regression, get_affine_transform,
6 | get_warp_matrix, rotate_point, transform_preds,
7 | warp_affine_joints)
8 |
9 | __all__ = [
10 | 'oks_nms', 'soft_oks_nms', 'affine_transform', 'rotate_point', 'flip_back',
11 | 'fliplr_joints', 'fliplr_regression', 'transform_preds',
12 | 'get_affine_transform', 'get_warp_matrix', 'warp_affine_joints',
13 | 'OneEuroFilter', 'oks_iou'
14 | ]
15 |
--------------------------------------------------------------------------------
/utils/post_processing/group.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------------
2 | # Adapted from https://github.com/princeton-vl/pose-ae-train/
3 | # Original licence: Copyright (c) 2017, umich-vl, under BSD 3-Clause License.
4 | # ------------------------------------------------------------------------------
5 |
6 | import numpy as np
7 | import torch
8 | from munkres import Munkres
9 |
10 | from ..top_down_eval import post_dark_udp
11 |
12 |
13 | def _py_max_match(scores):
14 | """Apply munkres algorithm to get the best match.
15 |
16 | Args:
17 | scores(np.ndarray): cost matrix.
18 |
19 | Returns:
20 | np.ndarray: best match.
21 | """
22 | m = Munkres()
23 | tmp = m.compute(scores)
24 | tmp = np.array(tmp).astype(int)
25 | return tmp
26 |
27 |
28 | def _match_by_tag(inp, params):
29 | """Match joints by tags. Use Munkres algorithm to calculate the best match
30 | for keypoints grouping.
31 |
32 | Note:
33 | number of keypoints: K
34 | max number of people in an image: M (M=30 by default)
35 | dim of tags: L
36 | If use flip testing, L=2; else L=1.
37 |
38 | Args:
39 | inp(tuple):
40 | tag_k (np.ndarray[KxMxL]): tag corresponding to the
41 | top k values of feature map per keypoint.
42 | loc_k (np.ndarray[KxMx2]): top k locations of the
43 | feature maps for keypoint.
44 | val_k (np.ndarray[KxM]): top k value of the
45 | feature maps per keypoint.
46 | params(Params): class Params().
47 |
48 | Returns:
49 | np.ndarray: result of pose groups.
50 | """
51 | assert isinstance(params, _Params), 'params should be class _Params()'
52 |
53 | tag_k, loc_k, val_k = inp
54 |
55 | default_ = np.zeros((params.num_joints, 3 + tag_k.shape[2]),
56 | dtype=np.float32)
57 |
58 | joint_dict = {}
59 | tag_dict = {}
60 | for i in range(params.num_joints):
61 | idx = params.joint_order[i]
62 |
63 | tags = tag_k[idx]
64 | joints = np.concatenate((loc_k[idx], val_k[idx, :, None], tags), 1)
65 | mask = joints[:, 2] > params.detection_threshold
66 | tags = tags[mask]
67 | joints = joints[mask]
68 |
69 | if joints.shape[0] == 0:
70 | continue
71 |
72 | if i == 0 or len(joint_dict) == 0:
73 | for tag, joint in zip(tags, joints):
74 | key = tag[0]
75 | joint_dict.setdefault(key, np.copy(default_))[idx] = joint
76 | tag_dict[key] = [tag]
77 | else:
78 | grouped_keys = list(joint_dict.keys())[:params.max_num_people]
79 | grouped_tags = [np.mean(tag_dict[i], axis=0) for i in grouped_keys]
80 |
81 | if (params.ignore_too_much
82 | and len(grouped_keys) == params.max_num_people):
83 | continue
84 |
85 | diff = joints[:, None, 3:] - np.array(grouped_tags)[None, :, :]
86 | diff_normed = np.linalg.norm(diff, ord=2, axis=2)
87 | diff_saved = np.copy(diff_normed)
88 |
89 | if params.use_detection_val:
90 | diff_normed = np.round(diff_normed) * 100 - joints[:, 2:3]
91 |
92 | num_added = diff.shape[0]
93 | num_grouped = diff.shape[1]
94 |
95 | if num_added > num_grouped:
96 | diff_normed = np.concatenate(
97 | (diff_normed,
98 | np.zeros((num_added, num_added - num_grouped),
99 | dtype=np.float32) + 1e10),
100 | axis=1)
101 |
102 | pairs = _py_max_match(diff_normed)
103 | for row, col in pairs:
104 | if (row < num_added and col < num_grouped
105 | and diff_saved[row][col] < params.tag_threshold):
106 | key = grouped_keys[col]
107 | joint_dict[key][idx] = joints[row]
108 | tag_dict[key].append(tags[row])
109 | else:
110 | key = tags[row][0]
111 | joint_dict.setdefault(key, np.copy(default_))[idx] = \
112 | joints[row]
113 | tag_dict[key] = [tags[row]]
114 |
115 | results = np.array([joint_dict[i] for i in joint_dict]).astype(np.float32)
116 | return results
117 |
118 |
119 | class _Params:
120 | """A class of parameter.
121 |
122 | Args:
123 | cfg(Config): config.
124 | """
125 |
126 | def __init__(self, cfg):
127 | self.num_joints = cfg['num_joints']
128 | self.max_num_people = cfg['max_num_people']
129 |
130 | self.detection_threshold = cfg['detection_threshold']
131 | self.tag_threshold = cfg['tag_threshold']
132 | self.use_detection_val = cfg['use_detection_val']
133 | self.ignore_too_much = cfg['ignore_too_much']
134 |
135 | if self.num_joints == 17:
136 | self.joint_order = [
137 | i - 1 for i in
138 | [1, 2, 3, 4, 5, 6, 7, 12, 13, 8, 9, 10, 11, 14, 15, 16, 17]
139 | ]
140 | else:
141 | self.joint_order = list(np.arange(self.num_joints))
142 |
143 |
144 | class HeatmapParser:
145 | """The heatmap parser for post processing."""
146 |
147 | def __init__(self, cfg):
148 | self.params = _Params(cfg)
149 | self.tag_per_joint = cfg['tag_per_joint']
150 | self.pool = torch.nn.MaxPool2d(cfg['nms_kernel'], 1,
151 | cfg['nms_padding'])
152 | self.use_udp = cfg.get('use_udp', False)
153 | self.score_per_joint = cfg.get('score_per_joint', False)
154 |
155 | def nms(self, heatmaps):
156 | """Non-Maximum Suppression for heatmaps.
157 |
158 | Args:
159 | heatmap(torch.Tensor): Heatmaps before nms.
160 |
161 | Returns:
162 | torch.Tensor: Heatmaps after nms.
163 | """
164 |
165 | maxm = self.pool(heatmaps)
166 | maxm = torch.eq(maxm, heatmaps).float()
167 | heatmaps = heatmaps * maxm
168 |
169 | return heatmaps
170 |
171 | def match(self, tag_k, loc_k, val_k):
172 | """Group keypoints to human poses in a batch.
173 |
174 | Args:
175 | tag_k (np.ndarray[NxKxMxL]): tag corresponding to the
176 | top k values of feature map per keypoint.
177 | loc_k (np.ndarray[NxKxMx2]): top k locations of the
178 | feature maps for keypoint.
179 | val_k (np.ndarray[NxKxM]): top k value of the
180 | feature maps per keypoint.
181 |
182 | Returns:
183 | list
184 | """
185 |
186 | def _match(x):
187 | return _match_by_tag(x, self.params)
188 |
189 | return list(map(_match, zip(tag_k, loc_k, val_k)))
190 |
191 | def top_k(self, heatmaps, tags):
192 | """Find top_k values in an image.
193 |
194 | Note:
195 | batch size: N
196 | number of keypoints: K
197 | heatmap height: H
198 | heatmap width: W
199 | max number of people: M
200 | dim of tags: L
201 | If use flip testing, L=2; else L=1.
202 |
203 | Args:
204 | heatmaps (torch.Tensor[NxKxHxW])
205 | tags (torch.Tensor[NxKxHxWxL])
206 |
207 | Returns:
208 | dict: A dict containing top_k values.
209 |
210 | - tag_k (np.ndarray[NxKxMxL]):
211 | tag corresponding to the top k values of
212 | feature map per keypoint.
213 | - loc_k (np.ndarray[NxKxMx2]):
214 | top k location of feature map per keypoint.
215 | - val_k (np.ndarray[NxKxM]):
216 | top k value of feature map per keypoint.
217 | """
218 | heatmaps = self.nms(heatmaps)
219 | N, K, H, W = heatmaps.size()
220 | heatmaps = heatmaps.view(N, K, -1)
221 | val_k, ind = heatmaps.topk(self.params.max_num_people, dim=2)
222 |
223 | tags = tags.view(tags.size(0), tags.size(1), W * H, -1)
224 | if not self.tag_per_joint:
225 | tags = tags.expand(-1, self.params.num_joints, -1, -1)
226 |
227 | tag_k = torch.stack(
228 | [torch.gather(tags[..., i], 2, ind) for i in range(tags.size(3))],
229 | dim=3)
230 |
231 | x = ind % W
232 | y = ind // W
233 |
234 | ind_k = torch.stack((x, y), dim=3)
235 |
236 | results = {
237 | 'tag_k': tag_k.cpu().numpy(),
238 | 'loc_k': ind_k.cpu().numpy(),
239 | 'val_k': val_k.cpu().numpy()
240 | }
241 |
242 | return results
243 |
244 | @staticmethod
245 | def adjust(results, heatmaps):
246 | """Adjust the coordinates for better accuracy.
247 |
248 | Note:
249 | batch size: N
250 | number of keypoints: K
251 | heatmap height: H
252 | heatmap width: W
253 |
254 | Args:
255 | results (list(np.ndarray)): Keypoint predictions.
256 | heatmaps (torch.Tensor[NxKxHxW]): Heatmaps.
257 | """
258 | _, _, H, W = heatmaps.shape
259 | for batch_id, people in enumerate(results):
260 | for people_id, people_i in enumerate(people):
261 | for joint_id, joint in enumerate(people_i):
262 | if joint[2] > 0:
263 | x, y = joint[0:2]
264 | xx, yy = int(x), int(y)
265 | tmp = heatmaps[batch_id][joint_id]
266 | if tmp[min(H - 1, yy + 1), xx] > tmp[max(0, yy - 1),
267 | xx]:
268 | y += 0.25
269 | else:
270 | y -= 0.25
271 |
272 | if tmp[yy, min(W - 1, xx + 1)] > tmp[yy,
273 | max(0, xx - 1)]:
274 | x += 0.25
275 | else:
276 | x -= 0.25
277 | results[batch_id][people_id, joint_id,
278 | 0:2] = (x + 0.5, y + 0.5)
279 | return results
280 |
281 | @staticmethod
282 | def refine(heatmap, tag, keypoints, use_udp=False):
283 | """Given initial keypoint predictions, we identify missing joints.
284 |
285 | Note:
286 | number of keypoints: K
287 | heatmap height: H
288 | heatmap width: W
289 | dim of tags: L
290 | If use flip testing, L=2; else L=1.
291 |
292 | Args:
293 | heatmap: np.ndarray(K, H, W).
294 | tag: np.ndarray(K, H, W) | np.ndarray(K, H, W, L)
295 | keypoints: np.ndarray of size (K, 3 + L)
296 | last dim is (x, y, score, tag).
297 | use_udp: bool-unbiased data processing
298 |
299 | Returns:
300 | np.ndarray: The refined keypoints.
301 | """
302 |
303 | K, H, W = heatmap.shape
304 | if len(tag.shape) == 3:
305 | tag = tag[..., None]
306 |
307 | tags = []
308 | for i in range(K):
309 | if keypoints[i, 2] > 0:
310 | # save tag value of detected keypoint
311 | x, y = keypoints[i][:2].astype(int)
312 | x = np.clip(x, 0, W - 1)
313 | y = np.clip(y, 0, H - 1)
314 | tags.append(tag[i, y, x])
315 |
316 | # mean tag of current detected people
317 | prev_tag = np.mean(tags, axis=0)
318 | results = []
319 |
320 | for _heatmap, _tag in zip(heatmap, tag):
321 | # distance of all tag values with mean tag of
322 | # current detected people
323 | distance_tag = (((_tag -
324 | prev_tag[None, None, :])**2).sum(axis=2)**0.5)
325 | norm_heatmap = _heatmap - np.round(distance_tag)
326 |
327 | # find maximum position
328 | y, x = np.unravel_index(np.argmax(norm_heatmap), _heatmap.shape)
329 | xx = x.copy()
330 | yy = y.copy()
331 | # detection score at maximum position
332 | val = _heatmap[y, x]
333 | if not use_udp:
334 | # offset by 0.5
335 | x += 0.5
336 | y += 0.5
337 |
338 | # add a quarter offset
339 | if _heatmap[yy, min(W - 1, xx + 1)] > _heatmap[yy, max(0, xx - 1)]:
340 | x += 0.25
341 | else:
342 | x -= 0.25
343 |
344 | if _heatmap[min(H - 1, yy + 1), xx] > _heatmap[max(0, yy - 1), xx]:
345 | y += 0.25
346 | else:
347 | y -= 0.25
348 |
349 | results.append((x, y, val))
350 | results = np.array(results)
351 |
352 | if results is not None:
353 | for i in range(K):
354 | # add keypoint if it is not detected
355 | if results[i, 2] > 0 and keypoints[i, 2] == 0:
356 | keypoints[i, :3] = results[i, :3]
357 |
358 | return keypoints
359 |
360 | def parse(self, heatmaps, tags, adjust=True, refine=True):
361 | """Group keypoints into poses given heatmap and tag.
362 |
363 | Note:
364 | batch size: N
365 | number of keypoints: K
366 | heatmap height: H
367 | heatmap width: W
368 | dim of tags: L
369 | If use flip testing, L=2; else L=1.
370 |
371 | Args:
372 | heatmaps (torch.Tensor[NxKxHxW]): model output heatmaps.
373 | tags (torch.Tensor[NxKxHxWxL]): model output tagmaps.
374 |
375 | Returns:
376 | tuple: A tuple containing keypoint grouping results.
377 |
378 | - results (list(np.ndarray)): Pose results.
379 | - scores (list/list(np.ndarray)): Score of people.
380 | """
381 | results = self.match(**self.top_k(heatmaps, tags))
382 |
383 | if adjust:
384 | if self.use_udp:
385 | for i in range(len(results)):
386 | if results[i].shape[0] > 0:
387 | results[i][..., :2] = post_dark_udp(
388 | results[i][..., :2].copy(), heatmaps[i:i + 1, :])
389 | else:
390 | results = self.adjust(results, heatmaps)
391 |
392 | if self.score_per_joint:
393 | scores = [i[:, 2] for i in results[0]]
394 | else:
395 | scores = [i[:, 2].mean() for i in results[0]]
396 |
397 | if refine:
398 | results = results[0]
399 | # for every detected person
400 | for i in range(len(results)):
401 | heatmap_numpy = heatmaps[0].cpu().numpy()
402 | tag_numpy = tags[0].cpu().numpy()
403 | if not self.tag_per_joint:
404 | tag_numpy = np.tile(tag_numpy,
405 | (self.params.num_joints, 1, 1, 1))
406 | results[i] = self.refine(
407 | heatmap_numpy, tag_numpy, results[i], use_udp=self.use_udp)
408 | results = [results]
409 |
410 | return results, scores
411 |
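Illustrative usage sketch (not a file in this repository): HeatmapParser implements bottom-up associative-embedding grouping: heatmap peaks are picked per keypoint and then matched into people by tag distance with the Munkres algorithm. The config values below are made up for illustration; the dict keys mirror those read by _Params and HeatmapParser above, and the munkres package must be installed:

    import torch
    from utils.post_processing.group import HeatmapParser

    cfg = dict(
        num_joints=17,
        max_num_people=30,
        detection_threshold=0.1,
        tag_threshold=1.0,
        use_detection_val=True,
        ignore_too_much=False,
        tag_per_joint=True,
        nms_kernel=5,
        nms_padding=2,
    )
    parser = HeatmapParser(cfg)

    heatmaps = torch.rand(1, 17, 64, 48)     # N x K x H x W
    tags = torch.rand(1, 17, 64, 48, 1)      # N x K x H x W x L (L=1, no flip test)
    results, scores = parser.parse(heatmaps, tags, adjust=True, refine=True)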
--------------------------------------------------------------------------------
/utils/post_processing/nms.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------------
2 | # Adapted from https://github.com/leoxiaobin/deep-high-resolution-net.pytorch
3 | # Original licence: Copyright (c) Microsoft, under the MIT License.
4 | # ------------------------------------------------------------------------------
5 |
6 | import numpy as np
7 |
8 |
9 | def nms(dets, thr):
10 | """Greedily select boxes with high confidence and overlap <= thr.
11 |
12 | Args:
13 | dets: [[x1, y1, x2, y2, score]].
14 | thr: Retain overlap < thr.
15 |
16 | Returns:
17 | list: Indexes to keep.
18 | """
19 | if len(dets) == 0:
20 | return []
21 |
22 | x1 = dets[:, 0]
23 | y1 = dets[:, 1]
24 | x2 = dets[:, 2]
25 | y2 = dets[:, 3]
26 | scores = dets[:, 4]
27 |
28 | areas = (x2 - x1 + 1) * (y2 - y1 + 1)
29 | order = scores.argsort()[::-1]
30 |
31 | keep = []
32 | while len(order) > 0:
33 | i = order[0]
34 | keep.append(i)
35 | xx1 = np.maximum(x1[i], x1[order[1:]])
36 | yy1 = np.maximum(y1[i], y1[order[1:]])
37 | xx2 = np.minimum(x2[i], x2[order[1:]])
38 | yy2 = np.minimum(y2[i], y2[order[1:]])
39 |
40 | w = np.maximum(0.0, xx2 - xx1 + 1)
41 | h = np.maximum(0.0, yy2 - yy1 + 1)
42 | inter = w * h
43 | ovr = inter / (areas[i] + areas[order[1:]] - inter)
44 |
45 | inds = np.where(ovr <= thr)[0]
46 | order = order[inds + 1]
47 |
48 | return keep
49 |
50 |
51 | def oks_iou(g, d, a_g, a_d, sigmas=None, vis_thr=None):
52 | """Calculate oks ious.
53 |
54 | Args:
55 | g: Ground truth keypoints.
56 | d: Detected keypoints.
57 | a_g: Area of the ground truth object.
58 | a_d: Area of the detected object.
59 | sigmas: standard deviation of keypoint labelling.
60 | vis_thr: threshold of the keypoint visibility.
61 |
62 | Returns:
63 | list: The oks ious.
64 | """
65 | if sigmas is None:
66 | sigmas = np.array([
67 | .26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07, 1.07,
68 | .87, .87, .89, .89
69 | ]) / 10.0
70 | vars = (sigmas * 2)**2
71 | xg = g[0::3]
72 | yg = g[1::3]
73 | vg = g[2::3]
74 | ious = np.zeros(len(d), dtype=np.float32)
75 | for n_d in range(0, len(d)):
76 | xd = d[n_d, 0::3]
77 | yd = d[n_d, 1::3]
78 | vd = d[n_d, 2::3]
79 | dx = xd - xg
80 | dy = yd - yg
81 | e = (dx**2 + dy**2) / vars / ((a_g + a_d[n_d]) / 2 + np.spacing(1)) / 2
82 | if vis_thr is not None:
83 | ind = list(vg > vis_thr) and list(vd > vis_thr)
84 | e = e[ind]
85 | ious[n_d] = np.sum(np.exp(-e)) / len(e) if len(e) != 0 else 0.0
86 | return ious
87 |
88 |
89 | def oks_nms(kpts_db, thr, sigmas=None, vis_thr=None, score_per_joint=False):
90 | """OKS NMS implementations.
91 |
92 | Args:
93 | kpts_db: keypoints.
94 | thr: Retain overlap < thr.
95 | sigmas: standard deviation of keypoint labelling.
96 | vis_thr: threshold of the keypoint visibility.
97 | score_per_joint: the input scores (in kpts_db) are per joint scores
98 |
99 | Returns:
100 | np.ndarray: indexes to keep.
101 | """
102 | if len(kpts_db) == 0:
103 | return []
104 |
105 | if score_per_joint:
106 | scores = np.array([k['score'].mean() for k in kpts_db])
107 | else:
108 | scores = np.array([k['score'] for k in kpts_db])
109 |
110 | kpts = np.array([k['keypoints'].flatten() for k in kpts_db])
111 | areas = np.array([k['area'] for k in kpts_db])
112 |
113 | order = scores.argsort()[::-1]
114 |
115 | keep = []
116 | while len(order) > 0:
117 | i = order[0]
118 | keep.append(i)
119 |
120 | oks_ovr = oks_iou(kpts[i], kpts[order[1:]], areas[i], areas[order[1:]],
121 | sigmas, vis_thr)
122 |
123 | inds = np.where(oks_ovr <= thr)[0]
124 | order = order[inds + 1]
125 |
126 | keep = np.array(keep)
127 |
128 | return keep
129 |
130 |
131 | def _rescore(overlap, scores, thr, type='gaussian'):
132 | """Rescoring mechanism gaussian or linear.
133 |
134 | Args:
135 | overlap: calculated ious
136 | scores: target scores.
137 | thr: retain oks overlap < thr.
138 | type: 'gaussian' or 'linear'
139 |
140 | Returns:
141 | np.ndarray: indexes to keep
142 | """
143 | assert len(overlap) == len(scores)
144 | assert type in ['gaussian', 'linear']
145 |
146 | if type == 'linear':
147 | inds = np.where(overlap >= thr)[0]
148 | scores[inds] = scores[inds] * (1 - overlap[inds])
149 | else:
150 | scores = scores * np.exp(-overlap**2 / thr)
151 |
152 | return scores
153 |
154 |
155 | def soft_oks_nms(kpts_db,
156 | thr,
157 | max_dets=20,
158 | sigmas=None,
159 | vis_thr=None,
160 | score_per_joint=False):
161 | """Soft OKS NMS implementations.
162 |
163 | Args:
164 | kpts_db
165 | thr: retain oks overlap < thr.
166 | max_dets: max number of detections to keep.
167 | sigmas: Keypoint labelling uncertainty.
168 | score_per_joint: the input scores (in kpts_db) are per joint scores
169 |
170 | Returns:
171 | np.ndarray: indexes to keep.
172 | """
173 | if len(kpts_db) == 0:
174 | return []
175 |
176 | if score_per_joint:
177 | scores = np.array([k['score'].mean() for k in kpts_db])
178 | else:
179 | scores = np.array([k['score'] for k in kpts_db])
180 |
181 | kpts = np.array([k['keypoints'].flatten() for k in kpts_db])
182 | areas = np.array([k['area'] for k in kpts_db])
183 |
184 | order = scores.argsort()[::-1]
185 | scores = scores[order]
186 |
187 | keep = np.zeros(max_dets, dtype=np.intp)
188 | keep_cnt = 0
189 | while len(order) > 0 and keep_cnt < max_dets:
190 | i = order[0]
191 |
192 | oks_ovr = oks_iou(kpts[i], kpts[order[1:]], areas[i], areas[order[1:]],
193 | sigmas, vis_thr)
194 |
195 | order = order[1:]
196 | scores = _rescore(oks_ovr, scores[1:], thr)
197 |
198 | tmp = scores.argsort()[::-1]
199 | order = order[tmp]
200 | scores = scores[tmp]
201 |
202 | keep[keep_cnt] = i
203 | keep_cnt += 1
204 |
205 | keep = keep[:keep_cnt]
206 |
207 | return keep
208 |
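Illustrative usage sketch (not a file in this repository): oks_nms() and soft_oks_nms() above operate on keypoint detections rather than boxes. Each entry of kpts_db needs 'keypoints' (K x 3 in COCO (x, y, score) layout), an instance 'score' and an 'area'; the default sigmas assume the 17 COCO keypoints. A toy example with random instances:

    import numpy as np
    from utils.post_processing.nms import oks_nms, soft_oks_nms

    rng = np.random.default_rng(0)
    kpts_db = []
    for _ in range(3):
        kpts = rng.uniform(0, 192, size=(17, 3)).astype(np.float32)
        kpts[:, 2] = rng.uniform(0.5, 1.0, size=17)   # per-joint confidences
        kpts_db.append({
            'keypoints': kpts,
            'score': float(kpts[:, 2].mean()),        # instance score
            'area': 192.0 * 256.0,                    # person bounding-box area
        })

    keep = oks_nms(kpts_db, thr=0.9)                  # indices surviving OKS NMS
    keep_soft = soft_oks_nms(kpts_db, thr=0.9, max_dets=20)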
--------------------------------------------------------------------------------
/utils/post_processing/one_euro_filter.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------------
2 | # Adapted from https://github.com/HoBeom/OneEuroFilter-Numpy
3 | # Original licence: Copyright (c) HoBeom Jeon, under the MIT License.
4 | # ------------------------------------------------------------------------------
5 | from time import time
6 |
7 | import numpy as np
8 |
9 |
10 | def smoothing_factor(t_e, cutoff):
11 | r = 2 * np.pi * cutoff * t_e
12 | return r / (r + 1)
13 |
14 |
15 | def exponential_smoothing(a, x, x_prev):
16 | return a * x + (1 - a) * x_prev
17 |
18 |
19 | class OneEuroFilter:
20 |
21 | def __init__(self,
22 | x0,
23 | dx0=0.0,
24 | min_cutoff=1.7,
25 | beta=0.3,
26 | d_cutoff=30.0,
27 | fps=None):
28 | """One Euro Filter for keypoints smoothing.
29 |
30 | Args:
31 | x0 (np.ndarray[K, 2]): Initialize keypoints value
32 | dx0 (float): 0.0
33 | min_cutoff (float): parameter for one euro filter
34 | beta (float): parameter for one euro filter
35 | d_cutoff (float): Input data FPS
36 | fps (float): Video FPS for video inference
37 | """
38 |
39 | # The parameters.
40 | self.data_shape = x0.shape
41 | self.min_cutoff = np.full(x0.shape, min_cutoff)
42 | self.beta = np.full(x0.shape, beta)
43 | self.d_cutoff = np.full(x0.shape, d_cutoff)
44 | # Previous values.
45 | self.x_prev = x0.astype(np.float32)
46 | self.dx_prev = np.full(x0.shape, dx0)
47 | self.mask_prev = np.ma.masked_where(x0 <= 0, x0)
48 | self.realtime = True
49 | if fps is None:
50 | # Using in realtime inference
51 | self.t_e = None
52 | self.skip_frame_factor = d_cutoff
53 | else:
54 | # fps using video inference
55 | self.realtime = False
56 | self.d_cutoff = np.full(x0.shape, float(fps))
57 | self.t_prev = time()
58 |
59 | def __call__(self, x, t_e=1.0):
60 | """Compute the filtered signal.
61 |
62 | Hyper-parameters (cutoff, beta) are from `VNect
63 | <http://gvv.mpi-inf.mpg.de/projects/VNect/>`__ .
64 |
65 | Realtime Camera fps (d_cutoff) default 30.0
66 |
67 | Args:
68 | x (np.ndarray[K, 2]): keypoints results in frame
69 | t_e (Optional): video skip frame count for posetrack
70 | evaluation
71 | """
72 | assert x.shape == self.data_shape
73 |
74 | t = 0
75 | if self.realtime:
76 | t = time()
77 | t_e = (t - self.t_prev) * self.skip_frame_factor
78 | t_e = np.full(x.shape, t_e)
79 |
80 | # missing keypoints mask
81 | mask = np.ma.masked_where(x <= 0, x)
82 |
83 | # The filtered derivative of the signal.
84 | a_d = smoothing_factor(t_e, self.d_cutoff)
85 | dx = (x - self.x_prev) / t_e
86 | dx_hat = exponential_smoothing(a_d, dx, self.dx_prev)
87 |
88 | # The filtered signal.
89 | cutoff = self.min_cutoff + self.beta * np.abs(dx_hat)
90 | a = smoothing_factor(t_e, cutoff)
91 | x_hat = exponential_smoothing(a, x, self.x_prev)
92 |
93 | # missing keypoints remove
94 | np.copyto(x_hat, -10, where=mask.mask)
95 |
96 | # Memorize the previous values.
97 | self.x_prev = x_hat
98 | self.dx_prev = dx_hat
99 | self.t_prev = t
100 | self.mask_prev = mask
101 |
102 | return x_hat
103 |
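Illustrative usage sketch (not a file in this repository): OneEuroFilter smooths keypoint trajectories frame by frame. It is initialised with the first frame's keypoints and then called once per subsequent frame; passing fps fixes the frame interval, while omitting it makes the filter use wall-clock time between calls (the realtime path above). A sketch with random K x 2 keypoints:

    import numpy as np
    from utils.post_processing.one_euro_filter import OneEuroFilter

    kpts = np.random.rand(17, 2).astype(np.float32) * 100      # first frame, K x 2
    smoother = OneEuroFilter(kpts, fps=30)

    for _ in range(5):
        kpts = np.random.rand(17, 2).astype(np.float32) * 100  # raw keypoints for the next frame
        kpts = smoother(kpts)                                   # smoothed keypoints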
--------------------------------------------------------------------------------
/utils/post_processing/post_transforms.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------------
2 | # Adapted from https://github.com/leoxiaobin/deep-high-resolution-net.pytorch
3 | # Original licence: Copyright (c) Microsoft, under the MIT License.
4 | # ------------------------------------------------------------------------------
5 |
6 | import math
7 |
8 | import cv2
9 | import numpy as np
10 | import torch
11 |
12 |
13 | def fliplr_joints(joints_3d, joints_3d_visible, img_width, flip_pairs):
14 | """Flip human joints horizontally.
15 |
16 | Note:
17 | - num_keypoints: K
18 |
19 | Args:
20 | joints_3d (np.ndarray([K, 3])): Coordinates of keypoints.
21 | joints_3d_visible (np.ndarray([K, 1])): Visibility of keypoints.
22 | img_width (int): Image width.
23 | flip_pairs (list[tuple]): Pairs of keypoints which are mirrored
24 | (for example, left ear and right ear).
25 |
26 | Returns:
27 | tuple: Flipped human joints.
28 |
29 | - joints_3d_flipped (np.ndarray([K, 3])): Flipped joints.
30 | - joints_3d_visible_flipped (np.ndarray([K, 1])): Joint visibility.
31 | """
32 |
33 | assert len(joints_3d) == len(joints_3d_visible)
34 | assert img_width > 0
35 |
36 | joints_3d_flipped = joints_3d.copy()
37 | joints_3d_visible_flipped = joints_3d_visible.copy()
38 |
39 | # Swap left-right parts
40 | for left, right in flip_pairs:
41 | joints_3d_flipped[left, :] = joints_3d[right, :]
42 | joints_3d_flipped[right, :] = joints_3d[left, :]
43 |
44 | joints_3d_visible_flipped[left, :] = joints_3d_visible[right, :]
45 | joints_3d_visible_flipped[right, :] = joints_3d_visible[left, :]
46 |
47 | # Flip horizontally
48 | joints_3d_flipped[:, 0] = img_width - 1 - joints_3d_flipped[:, 0]
49 | joints_3d_flipped = joints_3d_flipped * joints_3d_visible_flipped
50 |
51 | return joints_3d_flipped, joints_3d_visible_flipped
52 |
53 |
54 | def fliplr_regression(regression,
55 | flip_pairs,
56 | center_mode='static',
57 | center_x=0.5,
58 | center_index=0):
59 | """Flip human joints horizontally.
60 |
61 | Note:
62 | - batch_size: N
63 | - num_keypoint: K
64 |
65 | Args:
66 | regression (np.ndarray([..., K, C])): Coordinates of keypoints, where K
67 | is the joint number and C is the dimension. Example shapes are:
68 |
69 | - [N, K, C]: a batch of keypoints where N is the batch size.
70 | - [N, T, K, C]: a batch of pose sequences, where T is the frame
71 | number.
72 | flip_pairs (list[tuple()]): Pairs of keypoints which are mirrored
73 | (for example, left ear -- right ear).
74 | center_mode (str): The mode to set the center location on the x-axis
75 | to flip around. Options are:
76 |
77 | - static: use a static x value (see center_x also)
78 | - root: use a root joint (see center_index also)
79 | center_x (float): Set the x-axis location of the flip center. Only used
80 | when center_mode=static.
81 | center_index (int): Set the index of the root joint, whose x location
82 | will be used as the flip center. Only used when center_mode=root.
83 |
84 | Returns:
85 | np.ndarray([..., K, C]): Flipped joints.
86 | """
87 | assert regression.ndim >= 2, f'Invalid pose shape {regression.shape}'
88 |
89 | allowed_center_mode = {'static', 'root'}
90 | assert center_mode in allowed_center_mode, 'Get invalid center_mode ' \
91 | f'{center_mode}, allowed choices are {allowed_center_mode}'
92 |
93 | if center_mode == 'static':
94 | x_c = center_x
95 | elif center_mode == 'root':
96 | assert regression.shape[-2] > center_index
97 | x_c = regression[..., center_index:center_index + 1, 0]
98 |
99 | regression_flipped = regression.copy()
100 | # Swap left-right parts
101 | for left, right in flip_pairs:
102 | regression_flipped[..., left, :] = regression[..., right, :]
103 | regression_flipped[..., right, :] = regression[..., left, :]
104 |
105 | # Flip horizontally
106 | regression_flipped[..., 0] = x_c * 2 - regression_flipped[..., 0]
107 | return regression_flipped
108 |
109 |
110 | def flip_back(output_flipped, flip_pairs, target_type='GaussianHeatmap'):
111 | """Flip the flipped heatmaps back to the original form.
112 |
113 | Note:
114 | - batch_size: N
115 | - num_keypoints: K
116 | - heatmap height: H
117 | - heatmap width: W
118 |
119 | Args:
120 | output_flipped (np.ndarray[N, K, H, W]): The output heatmaps obtained
121 | from the flipped images.
122 | flip_pairs (list[tuple()]): Pairs of keypoints which are mirrored
123 | (for example, left ear -- right ear).
124 | target_type (str): GaussianHeatmap or CombinedTarget
125 |
126 | Returns:
127 | np.ndarray: heatmaps that flipped back to the original image
128 | """
129 | assert output_flipped.ndim == 4, \
130 | 'output_flipped should be [batch_size, num_keypoints, height, width]'
131 | shape_ori = output_flipped.shape
132 | channels = 1
133 | if target_type.lower() == 'CombinedTarget'.lower():
134 | channels = 3
135 | output_flipped[:, 1::3, ...] = -output_flipped[:, 1::3, ...]
136 | output_flipped = output_flipped.reshape(shape_ori[0], -1, channels,
137 | shape_ori[2], shape_ori[3])
138 | output_flipped_back = output_flipped.copy()
139 |
140 | # Swap left-right parts
141 | for left, right in flip_pairs:
142 | output_flipped_back[:, left, ...] = output_flipped[:, right, ...]
143 | output_flipped_back[:, right, ...] = output_flipped[:, left, ...]
144 | output_flipped_back = output_flipped_back.reshape(shape_ori)
145 | # Flip horizontally
146 | output_flipped_back = output_flipped_back[..., ::-1]
147 | return output_flipped_back
148 |
149 |
150 | def transform_preds(coords, center, scale, output_size, use_udp=False):
151 | """Get final keypoint predictions from heatmaps and apply scaling and
152 | translation to map them back to the image.
153 |
154 | Note:
155 | num_keypoints: K
156 |
157 | Args:
158 | coords (np.ndarray[K, ndims]):
159 |
160 | * If ndims=2, coords are predicted keypoint location.
161 | * If ndims=4, coords are composed of (x, y, scores, tags)
162 | * If ndims=5, coords are composed of (x, y, scores, tags,
163 | flipped_tags)
164 |
165 | center (np.ndarray[2, ]): Center of the bounding box (x, y).
166 | scale (np.ndarray[2, ]): Scale of the bounding box
167 | wrt [width, height].
168 | output_size (np.ndarray[2, ] | list(2,)): Size of the
169 | destination heatmaps.
170 | use_udp (bool): Use unbiased data processing
171 |
172 | Returns:
173 | np.ndarray: Predicted coordinates in the images.
174 | """
175 | assert coords.shape[1] in (2, 4, 5)
176 | assert len(center) == 2
177 | assert len(scale) == 2
178 | assert len(output_size) == 2
179 |
180 | # Recover the scale which is normalized by a factor of 200.
181 | # scale = scale * 200.0
182 |
183 | if use_udp:
184 | scale_x = scale[0] / (output_size[0] - 1.0)
185 | scale_y = scale[1] / (output_size[1] - 1.0)
186 | else:
187 | scale_x = scale[0] / output_size[0]
188 | scale_y = scale[1] / output_size[1]
189 |
190 | target_coords = np.ones_like(coords)
191 | target_coords[:, 0] = coords[:, 0] * scale_x + center[0] - scale[0] * 0.5
192 | target_coords[:, 1] = coords[:, 1] * scale_y + center[1] - scale[1] * 0.5
193 |
194 | return target_coords
195 |
196 |
197 | def get_affine_transform(center,
198 | scale,
199 | rot,
200 | output_size,
201 | shift=(0., 0.),
202 | inv=False):
203 | """Get the affine transform matrix, given the center/scale/rot/output_size.
204 |
205 | Args:
206 | center (np.ndarray[2, ]): Center of the bounding box (x, y).
207 | scale (np.ndarray[2, ]): Scale of the bounding box
208 | wrt [width, height].
209 | rot (float): Rotation angle (degree).
210 | output_size (np.ndarray[2, ] | list(2,)): Size of the
211 | destination heatmaps.
212 | shift (0-100%): Shift translation ratio wrt the width/height.
213 | Default (0., 0.).
214 | inv (bool): Option to inverse the affine transform direction.
215 | (inv=False: src->dst or inv=True: dst->src)
216 |
217 | Returns:
218 | np.ndarray: The transform matrix.
219 | """
220 | assert len(center) == 2
221 | assert len(scale) == 2
222 | assert len(output_size) == 2
223 | assert len(shift) == 2
224 |
225 | # pixel_std is 200.
226 | scale_tmp = scale * 200.0
227 |
228 | shift = np.array(shift)
229 | src_w = scale_tmp[0]
230 | dst_w = output_size[0]
231 | dst_h = output_size[1]
232 |
233 | rot_rad = np.pi * rot / 180
234 | src_dir = rotate_point([0., src_w * -0.5], rot_rad)
235 | dst_dir = np.array([0., dst_w * -0.5])
236 |
237 | src = np.zeros((3, 2), dtype=np.float32)
238 | src[0, :] = center + scale_tmp * shift
239 | src[1, :] = center + src_dir + scale_tmp * shift
240 | src[2, :] = _get_3rd_point(src[0, :], src[1, :])
241 |
242 | dst = np.zeros((3, 2), dtype=np.float32)
243 | dst[0, :] = [dst_w * 0.5, dst_h * 0.5]
244 | dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir
245 | dst[2, :] = _get_3rd_point(dst[0, :], dst[1, :])
246 |
247 | if inv:
248 | trans = cv2.getAffineTransform(np.float32(dst), np.float32(src))
249 | else:
250 | trans = cv2.getAffineTransform(np.float32(src), np.float32(dst))
251 |
252 | return trans
253 |
254 |
255 | def affine_transform(pt, trans_mat):
256 | """Apply an affine transformation to the points.
257 |
258 | Args:
259 | pt (np.ndarray): a 2 dimensional point to be transformed
260 | trans_mat (np.ndarray): 2x3 matrix of an affine transform
261 |
262 | Returns:
263 | np.ndarray: Transformed points.
264 | """
265 | assert len(pt) == 2
266 | new_pt = np.array(trans_mat) @ np.array([pt[0], pt[1], 1.])
267 |
268 | return new_pt
269 |
270 |
271 | def _get_3rd_point(a, b):
272 | """To calculate the affine matrix, three pairs of points are required. This
273 | function is used to get the 3rd point, given 2D points a & b.
274 |
275 | The 3rd point is defined by rotating vector `a - b` by 90 degrees
276 | anticlockwise, using b as the rotation center.
277 |
278 | Args:
279 | a (np.ndarray): point(x,y)
280 | b (np.ndarray): point(x,y)
281 |
282 | Returns:
283 | np.ndarray: The 3rd point.
284 | """
285 | assert len(a) == 2
286 | assert len(b) == 2
287 | direction = a - b
288 | third_pt = b + np.array([-direction[1], direction[0]], dtype=np.float32)
289 |
290 | return third_pt
291 |
292 |
293 | def rotate_point(pt, angle_rad):
294 | """Rotate a point by an angle.
295 |
296 | Args:
297 | pt (list[float]): 2 dimensional point to be rotated
298 | angle_rad (float): rotation angle by radian
299 |
300 | Returns:
301 | list[float]: Rotated point.
302 | """
303 | assert len(pt) == 2
304 | sn, cs = np.sin(angle_rad), np.cos(angle_rad)
305 | new_x = pt[0] * cs - pt[1] * sn
306 | new_y = pt[0] * sn + pt[1] * cs
307 | rotated_pt = [new_x, new_y]
308 |
309 | return rotated_pt
310 |
311 |
312 | def get_warp_matrix(theta, size_input, size_dst, size_target):
313 | """Calculate the transformation matrix under the constraint of unbiased.
314 | Paper ref: Huang et al. The Devil is in the Details: Delving into Unbiased
315 | Data Processing for Human Pose Estimation (CVPR 2020).
316 |
317 | Args:
318 | theta (float): Rotation angle in degrees.
319 | size_input (np.ndarray): Size of input image [w, h].
320 | size_dst (np.ndarray): Size of output image [w, h].
321 | size_target (np.ndarray): Size of ROI in input plane [w, h].
322 |
323 | Returns:
324 | np.ndarray: A matrix for transformation.
325 | """
326 | theta = np.deg2rad(theta)
327 | matrix = np.zeros((2, 3), dtype=np.float32)
328 | scale_x = size_dst[0] / size_target[0]
329 | scale_y = size_dst[1] / size_target[1]
330 | matrix[0, 0] = math.cos(theta) * scale_x
331 | matrix[0, 1] = -math.sin(theta) * scale_x
332 | matrix[0, 2] = scale_x * (-0.5 * size_input[0] * math.cos(theta) +
333 | 0.5 * size_input[1] * math.sin(theta) +
334 | 0.5 * size_target[0])
335 | matrix[1, 0] = math.sin(theta) * scale_y
336 | matrix[1, 1] = math.cos(theta) * scale_y
337 | matrix[1, 2] = scale_y * (-0.5 * size_input[0] * math.sin(theta) -
338 | 0.5 * size_input[1] * math.cos(theta) +
339 | 0.5 * size_target[1])
340 | return matrix
341 |
342 |
343 | def warp_affine_joints(joints, mat):
344 | """Apply affine transformation defined by the transform matrix on the
345 | joints.
346 |
347 | Args:
348 | joints (np.ndarray[..., 2]): Origin coordinate of joints.
349 | mat (np.ndarray[3, 2]): The affine matrix.
350 |
351 | Returns:
352 | np.ndarray[..., 2]: Result coordinate of joints.
353 | """
354 | joints = np.array(joints)
355 | shape = joints.shape
356 | joints = joints.reshape(-1, 2)
357 | return np.dot(
358 | np.concatenate((joints, joints[:, 0:1] * 0 + 1), axis=1),
359 | mat.T).reshape(shape)
360 |
361 |
362 | def affine_transform_torch(pts, t):
363 | npts = pts.shape[0]
364 | pts_homo = torch.cat([pts, torch.ones(npts, 1, device=pts.device)], dim=1)
365 | out = torch.mm(t, torch.t(pts_homo))
366 | return torch.t(out[:2, :])
367 |
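Illustrative usage sketch (not a file in this repository): get_affine_transform() above builds the crop matrix used by the top-down pipeline, and scale is expressed in units of 200 pixels (the function multiplies it by pixel_std = 200 internally). A sketch that maps a person box to a 192 x 256 network input and projects a point into the crop; the box values are made up:

    import numpy as np
    from utils.post_processing.post_transforms import (affine_transform,
                                                       get_affine_transform)

    center = np.array([320.0, 240.0], dtype=np.float32)   # person bbox centre (x, y)
    scale = np.array([1.0, 1.33], dtype=np.float32)       # bbox (w, h) / 200
    image_size = [192, 256]                                # network input (w, h)

    trans = get_affine_transform(center, scale, 0., image_size)
    # crop = cv2.warpAffine(img, trans, (image_size[0], image_size[1]))
    pt = affine_transform(np.array([320.0, 240.0]), trans)  # bbox centre -> crop centre (96, 128)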
--------------------------------------------------------------------------------
/utils/train_valid_fn.py:
--------------------------------------------------------------------------------
1 | import os.path as osp
2 |
3 | import torch
4 | import torch.nn as nn
5 |
6 | from models.losses import JointsMSELoss
7 | from models.optimizer import LayerDecayOptimizer
8 |
9 | from torch.nn.parallel import DataParallel, DistributedDataParallel
10 | from torch.nn.utils import clip_grad_norm_
11 | from torch.optim import AdamW
12 | from torch.optim.lr_scheduler import LambdaLR, MultiStepLR
13 | from torch.utils.data import DataLoader, Dataset
14 | from torch.utils.data.distributed import DistributedSampler
15 | from torch.cuda.amp import autocast, GradScaler
16 | from tqdm import tqdm
17 | from time import time
18 |
19 | from utils.dist_util import get_dist_info, init_dist
20 | from utils.logging import get_root_logger
21 |
22 | @torch.no_grad()
23 | def valid_model(model: nn.Module, dataloaders: DataLoader, criterion: nn.Module, cfg: dict) -> float:
24 | total_loss = 0
25 | total_metric = 0
26 | model.eval()
27 | for dataloader in dataloaders:
28 | for batch_idx, batch in enumerate(dataloader):
29 | images, targets, target_weights, __ = batch
30 | images = images.to('cuda')
31 | targets = targets.to('cuda')
32 | target_weights = target_weights.to('cuda')
33 |
34 | outputs = model(images)
35 | loss = criterion(outputs, targets, target_weights)
36 | total_loss += loss.item()
37 |
38 | avg_loss = total_loss/(len(dataloader)*len(dataloaders))
39 | return avg_loss
40 |
41 | def train_model(model: nn.Module, datasets_train: Dataset, datasets_valid: Dataset, cfg: dict, distributed: bool, validate: bool, timestamp: str, meta: dict) -> None:
42 | logger = get_root_logger()
43 |
44 | # Prepare data loaders
45 | datasets_train = datasets_train if isinstance(datasets_train, (list, tuple)) else [datasets_train]
46 | datasets_valid = datasets_valid if isinstance(datasets_valid, (list, tuple)) else [datasets_valid]
47 |
48 | if distributed:
49 | samplers_train = [DistributedSampler(ds, num_replicas=len(cfg.gpu_ids), rank=torch.cuda.current_device(), shuffle=True, drop_last=False) for ds in datasets_train]
50 | samplers_valid = [DistributedSampler(ds, num_replicas=len(cfg.gpu_ids), rank=torch.cuda.current_device(), shuffle=False, drop_last=False) for ds in datasets_valid]
51 | else:
52 | samplers_train = [None for ds in datasets_train]
53 | samplers_valid = [None for ds in datasets_valid]
54 |
55 |     dataloaders_train = [DataLoader(ds, batch_size=cfg.data['samples_per_gpu'], shuffle=(sampler is None), sampler=sampler, num_workers=cfg.data['workers_per_gpu'], pin_memory=False) for ds, sampler in zip(datasets_train, samplers_train)]
56 | dataloaders_valid = [DataLoader(ds, batch_size=cfg.data['samples_per_gpu'], shuffle=False, sampler=sampler, num_workers=cfg.data['workers_per_gpu'], pin_memory=False) for ds, sampler in zip(datasets_valid, samplers_valid)]
57 |
58 | # put model on gpus
59 | if distributed:
60 | find_unused_parameters = cfg.get('find_unused_parameters', False)
61 | # Sets the `find_unused_parameters` parameter in
62 | # torch.nn.parallel.DistributedDataParallel
63 |
64 | model = DistributedDataParallel(
65 | module=model,
66 | device_ids=[torch.cuda.current_device()],
67 | broadcast_buffers=False,
68 | find_unused_parameters=find_unused_parameters)
69 | else:
70 | model = DataParallel(model, device_ids=cfg.gpu_ids)
71 |
72 | # Loss function
73 | criterion = JointsMSELoss(use_target_weight=cfg.model['keypoint_head']['loss_keypoint']['use_target_weight'])
74 |
75 | # Optimizer
76 | optimizer = AdamW(model.parameters(), lr=cfg.optimizer['lr'], betas=cfg.optimizer['betas'], weight_decay=cfg.optimizer['weight_decay'])
77 |
78 | # Layer-wise learning rate decay
79 | lr_mult = [cfg.optimizer['paramwise_cfg']['layer_decay_rate']] * cfg.optimizer['paramwise_cfg']['num_layers']
80 | layerwise_optimizer = LayerDecayOptimizer(optimizer, lr_mult)
81 |
82 |
83 | # Learning rate scheduler (MultiStepLR)
84 | milestones = cfg.lr_config['step']
85 | gamma = 0.1
86 | scheduler = MultiStepLR(optimizer, milestones, gamma)
87 |
88 | # Warm-up scheduler
89 | num_warmup_steps = cfg.lr_config['warmup_iters'] # Number of warm-up steps
90 | warmup_factor = cfg.lr_config['warmup_ratio'] # Initial learning rate = warmup_factor * learning_rate
91 | warmup_scheduler = LambdaLR(
92 | optimizer,
93 | lr_lambda=lambda step: warmup_factor + (1.0 - warmup_factor) * step / num_warmup_steps
94 | )
95 |
96 | # AMP setting
97 | if cfg.use_amp:
98 | logger.info("Using Automatic Mixed Precision (AMP) training...")
99 | # Create a GradScaler object for FP16 training
100 | scaler = GradScaler()
101 |
102 | # Logging config
103 | total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
104 | logger.info(f'''\n
105 | #========= [Train Configs] =========#
106 | # - Num GPUs: {len(cfg.gpu_ids)}
107 | # - Batch size (per gpu): {cfg.data['samples_per_gpu']}
108 | # - LR: {cfg.optimizer['lr']: .6f}
109 | # - Num params: {total_params:,d}
110 | # - AMP: {cfg.use_amp}
111 | #===================================#
112 | ''')
113 |
114 | global_step = 0
115 | for dataloader in dataloaders_train:
116 | for epoch in range(cfg.total_epochs):
117 | model.train()
118 | train_pbar = tqdm(dataloader)
119 | total_loss = 0
120 | tic = time()
121 | for batch_idx, batch in enumerate(train_pbar):
122 | layerwise_optimizer.zero_grad()
123 |
124 | images, targets, target_weights, __ = batch
125 | images = images.to('cuda')
126 | targets = targets.to('cuda')
127 | target_weights = target_weights.to('cuda')
128 |
129 | if cfg.use_amp:
130 | with autocast():
131 | outputs = model(images)
132 | loss = criterion(outputs, targets, target_weights)
133 | scaler.scale(loss).backward()
134 | clip_grad_norm_(model.parameters(), **cfg.optimizer_config['grad_clip'])
135 | scaler.step(layerwise_optimizer)
136 | scaler.update()
137 | else:
138 | outputs = model(images)
139 | loss = criterion(outputs, targets, target_weights)
140 | loss.backward()
141 | clip_grad_norm_(model.parameters(), **cfg.optimizer_config['grad_clip'])
142 | layerwise_optimizer.step()
143 |
144 | if global_step < num_warmup_steps:
145 | warmup_scheduler.step()
146 | global_step += 1
147 |
148 | total_loss += loss.item()
149 | train_pbar.set_description(f"🏋️> Epoch [{str(epoch).zfill(3)}/{str(cfg.total_epochs).zfill(3)}] | Loss {loss.item():.4f} | LR {optimizer.param_groups[0]['lr']:.6f} | Step")
150 | scheduler.step()
151 |
152 | avg_loss_train = total_loss/len(dataloader)
153 | logger.info(f"[Summary-train] Epoch [{str(epoch).zfill(3)}/{str(cfg.total_epochs).zfill(3)}] | Average Loss (train) {avg_loss_train:.4f} --- {time()-tic:.5f} sec. elapsed")
154 | ckpt_name = f"epoch{str(epoch).zfill(3)}.pth"
155 | ckpt_path = osp.join(cfg.work_dir, ckpt_name)
156 | torch.save(model.module.state_dict(), ckpt_path)
157 |
158 | # validation
159 | if validate:
160 | tic2 = time()
161 | avg_loss_valid = valid_model(model, dataloaders_valid, criterion, cfg)
162 | logger.info(f"[Summary-valid] Epoch [{str(epoch).zfill(3)}/{str(cfg.total_epochs).zfill(3)}] | Average Loss (valid) {avg_loss_valid:.4f} --- {time()-tic2:.5f} sec. elapsed")
163 |
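164 | # A minimal sketch (illustration only, not part of the original module) of the
165 | # config fields `train_model` reads above; the real values come from the config
166 | # files under configs/ (e.g. ViTPose_base_coco_256x192.py).
167 | # cfg.gpu_ids, cfg.total_epochs, cfg.work_dir, cfg.use_amp
168 | # cfg.data = {'samples_per_gpu': ..., 'workers_per_gpu': ...}
169 | # cfg.optimizer = {'lr': ..., 'betas': (..., ...), 'weight_decay': ...,
170 | #                  'paramwise_cfg': {'num_layers': ..., 'layer_decay_rate': ...}}
171 | # cfg.optimizer_config = {'grad_clip': {'max_norm': ...}}
172 | # cfg.lr_config = {'step': [...], 'warmup_iters': ..., 'warmup_ratio': ...}
173 | # cfg.model = {'keypoint_head': {'loss_keypoint': {'use_target_weight': ...}}}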
--------------------------------------------------------------------------------
/utils/transform.py:
--------------------------------------------------------------------------------
1 | import math
2 | import cv2
3 | import munkres
4 | import numpy as np
5 | import torch
6 |
7 |
8 | # solution proposed in https://github.com/pytorch/pytorch/issues/229#issuecomment-299424875
9 | def flip_tensor(tensor, dim=0):
10 | """
11 | flip the tensor on the dimension dim
12 | """
13 | inv_idx = torch.arange(tensor.shape[dim] - 1, -1, -1).to(tensor.device)
14 | return tensor.index_select(dim, inv_idx)
15 |
16 |
17 | #
18 | # derived from https://github.com/leoxiaobin/deep-high-resolution-net.pytorch
19 | def flip_back(output_flipped, matched_parts):
20 | assert len(output_flipped.shape) == 4, 'output_flipped has to be [batch_size, num_joints, height, width]'
21 |
22 | output_flipped = flip_tensor(output_flipped, dim=-1)
23 |
24 | for pair in matched_parts:
25 | tmp = output_flipped[:, pair[0]].clone()
26 | output_flipped[:, pair[0]] = output_flipped[:, pair[1]]
27 | output_flipped[:, pair[1]] = tmp
28 |
29 | return output_flipped
30 |
31 |
32 | def fliplr_joints(joints, joints_vis, width, matched_parts):
33 | # Flip horizontal
34 | joints[:, 0] = width - joints[:, 0] - 1
35 |
36 | # Change left-right parts
37 | for pair in matched_parts:
38 | joints[pair[0], :], joints[pair[1], :] = \
39 | joints[pair[1], :], joints[pair[0], :].copy()
40 | joints_vis[pair[0], :], joints_vis[pair[1], :] = \
41 | joints_vis[pair[1], :], joints_vis[pair[0], :].copy()
42 |
43 | return joints * joints_vis, joints_vis
44 |
45 |
46 | def get_affine_transform(center, scale, pixel_std, rot, output_size, shift=np.array([0, 0], dtype=np.float32), inv=0):
47 | if not isinstance(scale, np.ndarray) and not isinstance(scale, list):
48 | print(scale)
49 | scale = np.array([scale, scale])
50 |
51 | scale_tmp = scale * 1.0 * pixel_std # It was scale_tmp = scale * 200.0
52 | src_w = scale_tmp[0]
53 | dst_w = output_size[0]
54 | dst_h = output_size[1]
55 |
56 | rot_rad = np.pi * rot / 180
57 | src_dir = get_dir([0, src_w * -0.5], rot_rad)
58 | dst_dir = np.array([0, dst_w * -0.5], np.float32)
59 |
60 | src = np.zeros((3, 2), dtype=np.float32)
61 | dst = np.zeros((3, 2), dtype=np.float32)
62 | src[0, :] = center + scale_tmp * shift
63 | src[1, :] = center + src_dir + scale_tmp * shift
64 | dst[0, :] = [dst_w * 0.5, dst_h * 0.5]
65 | dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir
66 |
67 | src[2:, :] = get_3rd_point(src[0, :], src[1, :])
68 | dst[2:, :] = get_3rd_point(dst[0, :], dst[1, :])
69 |
70 | if inv:
71 | trans = cv2.getAffineTransform(np.float32(dst), np.float32(src))
72 | else:
73 | trans = cv2.getAffineTransform(np.float32(src), np.float32(dst))
74 |
75 | return trans
76 |
77 |
78 | def affine_transform(pt, t):
79 | new_pt = np.array([pt[0], pt[1], 1.]).T
80 | new_pt = np.dot(t, new_pt)
81 | return new_pt[:2]
82 |
83 |
84 | def get_3rd_point(a, b):
85 | direct = a - b
86 | return b + np.array([-direct[1], direct[0]], dtype=np.float32)
87 |
88 |
89 | def get_dir(src_point, rot_rad):
90 | sn, cs = np.sin(rot_rad), np.cos(rot_rad)
91 |
92 | src_result = [0, 0]
93 | src_result[0] = src_point[0] * cs - src_point[1] * sn
94 | src_result[1] = src_point[0] * sn + src_point[1] * cs
95 |
96 | return src_result
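97 | 
98 | # Illustrative usage sketch (not part of the original module): crop a person
99 | # patch and map a keypoint into it. `img`, `center` and `scale` are assumed to
100 | # follow the conventions used above (scale expressed in units of `pixel_std`).
101 | # trans = get_affine_transform(center=np.array([320., 240.]),
102 | #                              scale=np.array([1.2, 1.6]),
103 | #                              pixel_std=200.,
104 | #                              rot=0,
105 | #                              output_size=[192, 256])
106 | # patch = cv2.warpAffine(img, trans, (192, 256), flags=cv2.INTER_LINEAR)
107 | # kpt_in_patch = affine_transform(np.array([330., 250.]), trans)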
--------------------------------------------------------------------------------
/utils/util.py:
--------------------------------------------------------------------------------
1 | import os
2 | import warnings
3 | import random
4 | import numpy as np
5 |
6 | from collections import OrderedDict
7 | import os.path as osp
8 |
9 | import torch
10 | import torch.nn as nn
11 |
12 | from torch import distributed as dist
13 | from torch.nn.parallel import DataParallel, DistributedDataParallel
14 |
15 | from .dist_util import get_dist_info
16 |
17 | MODULE_WRAPPERS = [DataParallel, DistributedDataParallel]
18 |
19 |
20 | def init_random_seed(seed=None, device='cuda'):
21 | """Initialize random seed.
22 |
23 | If the seed is not set, the seed will be automatically randomized,
24 | and then broadcast to all processes to prevent some potential bugs.
25 |
26 | Args:
27 | seed (int, Optional): The seed. Default to None.
28 | device (str): The device where the seed will be put on.
29 | Default to 'cuda'.
30 |
31 | Returns:
32 | int: Seed to be used.
33 | """
34 | if seed is not None:
35 | return seed
36 |
37 | # Make sure all ranks share the same random seed to prevent
38 | # some potential bugs. Please refer to
39 | # https://github.com/open-mmlab/mmdetection/issues/6339
40 | rank, world_size = get_dist_info()
41 | seed = np.random.randint(2**31)
42 | if world_size == 1:
43 | return seed
44 |
45 | if rank == 0:
46 | random_num = torch.tensor(seed, dtype=torch.int32, device=device)
47 | else:
48 | random_num = torch.tensor(0, dtype=torch.int32, device=device)
49 | dist.broadcast(random_num, src=0)
50 | return random_num.item()
51 |
52 |
53 | def set_random_seed(seed: int,
54 | deterministic: bool = False,
55 | use_rank_shift: bool = False) -> None:
56 | """Set random seed.
57 |
58 | Args:
59 | seed (int): Seed to be used.
60 | deterministic (bool): Whether to set the deterministic option for
61 | CUDNN backend, i.e., set `torch.backends.cudnn.deterministic`
62 | to True and `torch.backends.cudnn.benchmark` to False.
63 | Default: False.
64 |         use_rank_shift (bool): Whether to add rank number to the random seed to
65 |             have different random seed in different ranks. Default: False.
66 | """
67 | if use_rank_shift:
68 | rank, _ = get_dist_info()
69 | seed += rank
70 | random.seed(seed)
71 | np.random.seed(seed)
72 | torch.manual_seed(seed)
73 | torch.cuda.manual_seed(seed)
74 | torch.cuda.manual_seed_all(seed)
75 | os.environ['PYTHONHASHSEED'] = str(seed)
76 | if deterministic:
77 | torch.backends.cudnn.deterministic = True
78 | torch.backends.cudnn.benchmark = False
79 |
80 | def is_module_wrapper(module: nn.Module) -> bool:
81 |     """Check whether `module` is a module wrapper.
82 | 
83 |     A wrapper is any of the classes listed in ``MODULE_WRAPPERS``
84 |     (``DataParallel`` and ``DistributedDataParallel``).
85 | 
86 |     Returns:
87 |         bool: True if `module` is an instance of one of the wrappers.
88 |     """
89 |     return isinstance(module, tuple(MODULE_WRAPPERS))
90 | 
91 |
92 | def load_state_dict(module, state_dict, strict=False, logger=None):
93 | """Load state_dict to a module.
94 |
95 | This method is modified from :meth:`torch.nn.Module.load_state_dict`.
96 | Default value for ``strict`` is set to ``False`` and the message for
97 | param mismatch will be shown even if strict is False.
98 |
99 | Args:
100 | module (Module): Module that receives the state_dict.
101 | state_dict (OrderedDict): Weights.
102 | strict (bool): whether to strictly enforce that the keys
103 | in :attr:`state_dict` match the keys returned by this module's
104 | :meth:`~torch.nn.Module.state_dict` function. Default: ``False``.
105 | logger (:obj:`logging.Logger`, optional): Logger to log the error
106 | message. If not specified, print function will be used.
107 | """
108 | unexpected_keys = []
109 | all_missing_keys = []
110 | err_msg = []
111 |
112 | metadata = getattr(state_dict, '_metadata', None)
113 | state_dict = state_dict.copy()
114 | if metadata is not None:
115 | state_dict._metadata = metadata
116 |
117 | # use _load_from_state_dict to enable checkpoint version control
118 | def load(module, prefix=''):
119 | # recursively check parallel module in case that the model has a
120 | # complicated structure, e.g., nn.Module(nn.Module(DDP))
121 | if is_module_wrapper(module):
122 | module = module.module
123 | local_metadata = {} if metadata is None else metadata.get(
124 | prefix[:-1], {})
125 | module._load_from_state_dict(state_dict, prefix, local_metadata, True,
126 | all_missing_keys, unexpected_keys,
127 | err_msg)
128 | for name, child in module._modules.items():
129 | if child is not None:
130 | load(child, prefix + name + '.')
131 |
132 | load(module)
133 | load = None # break load->load reference cycle
134 |
135 | # ignore "num_batches_tracked" of BN layers
136 | missing_keys = [
137 | key for key in all_missing_keys if 'num_batches_tracked' not in key
138 | ]
139 |
140 | if unexpected_keys:
141 | err_msg.append('unexpected key in source '
142 | f'state_dict: {", ".join(unexpected_keys)}\n')
143 | if missing_keys:
144 | err_msg.append(
145 | f'missing keys in source state_dict: {", ".join(missing_keys)}\n')
146 |
147 | rank, _ = get_dist_info()
148 | if len(err_msg) > 0 and rank == 0:
149 | err_msg.insert(
150 | 0, 'The model and loaded state dict do not match exactly\n')
151 | err_msg = '\n'.join(err_msg)
152 | if strict:
153 | raise RuntimeError(err_msg)
154 | elif logger is not None:
155 | logger.warning(err_msg)
156 | else:
157 | print(err_msg)
158 |
159 |
160 | def load_checkpoint(model,
161 | filename,
162 | map_location='cpu',
163 | strict=False,
164 | logger=None):
165 | """Load checkpoint from a file or URI.
166 |
167 | Args:
168 | model (Module): Module to load checkpoint.
169 | filename (str): Accept local filepath, URL, ``torchvision://xxx``,
170 | ``open-mmlab://xxx``.
171 | map_location (str): Same as :func:`torch.load`.
172 | strict (bool): Whether to allow different params for the model and
173 | checkpoint.
174 | logger (:mod:`logging.Logger` or None): The logger for error message.
175 |
176 | Returns:
177 | dict or OrderedDict: The loaded checkpoint.
178 | """
179 | checkpoint = torch.load(filename, map_location=map_location)
180 | # OrderedDict is a subclass of dict
181 | if not isinstance(checkpoint, dict):
182 | raise RuntimeError(
183 | f'No state_dict found in checkpoint file {filename}')
184 | # get state_dict from checkpoint
185 | if 'state_dict' in checkpoint:
186 | state_dict_tmp = checkpoint['state_dict']
187 | else:
188 | state_dict_tmp = checkpoint
189 |
190 | state_dict = OrderedDict()
191 | # strip prefix of state_dict
192 | for k, v in state_dict_tmp.items():
193 | if k.startswith('module.backbone.'):
194 | state_dict[k[16:]] = v
195 | elif k.startswith('module.'):
196 | state_dict[k[7:]] = v
197 | elif k.startswith('backbone.'):
198 | state_dict[k[9:]] = v
199 | else:
200 | state_dict[k] = v
201 | # load state_dict
202 | load_state_dict(model, state_dict, strict, logger)
203 | return checkpoint
204 |
205 |
206 | def resize(input,
207 | size=None,
208 | scale_factor=None,
209 | mode='nearest',
210 | align_corners=None,
211 | warning=True):
212 |     if warning:
213 |         if size is not None and align_corners:
214 |             input_h, input_w = tuple(int(x) for x in input.shape[2:])
215 |             output_h, output_w = tuple(int(x) for x in size)
216 |             if output_h > input_h or output_w > input_w:
217 |                 if ((output_h > 1 and output_w > 1 and input_h > 1
218 |                      and input_w > 1) and (output_h - 1) % (input_h - 1)
219 |                         and (output_w - 1) % (input_w - 1)):
220 |                     warnings.warn(
221 |                         f'When align_corners={align_corners}, '
222 |                         'the output would be more aligned if '
223 |                         f'input size {(input_h, input_w)} is `x+1` and '
224 |                         f'out size {(output_h, output_w)} is `nx+1`')
225 |     if isinstance(size, torch.Size):
226 |         size = tuple(int(x) for x in size)
227 |     return nn.functional.interpolate(input, size, scale_factor, mode, align_corners)
228 | def constant_init(module: nn.Module, val: float, bias: float = 0) -> None:
229 | if hasattr(module, 'weight') and module.weight is not None:
230 | nn.init.constant_(module.weight, val)
231 | if hasattr(module, 'bias') and module.bias is not None:
232 | nn.init.constant_(module.bias, bias)
233 |
234 | def normal_init(module: nn.Module,
235 | mean: float = 0,
236 | std: float = 1,
237 | bias: float = 0) -> None:
238 | if hasattr(module, 'weight') and module.weight is not None:
239 | nn.init.normal_(module.weight, mean, std)
240 | if hasattr(module, 'bias') and module.bias is not None:
241 | nn.init.constant_(module.bias, bias)
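242 | 
243 | # Illustrative usage sketch (not part of the original module). The checkpoint
244 | # path below is hypothetical; pass whatever .pth file you actually have.
245 | # seed = init_random_seed(None)
246 | # set_random_seed(seed, deterministic=False)
247 | # load_checkpoint(model, 'vitpose-b.pth', map_location='cpu', strict=False)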
--------------------------------------------------------------------------------
/utils/visualization.py:
--------------------------------------------------------------------------------
1 | import cv2
2 | import matplotlib.pyplot as plt
3 | import numpy as np
4 | import torch
5 | import torchvision
6 | import ffmpeg
7 |
8 |
9 | __all__ = ["joints_dict", "draw_points_and_skeleton"]
10 |
11 |
12 | def joints_dict():
13 | joints = {
14 | "coco": {
15 | "keypoints": {
16 | 0: "nose",
17 | 1: "left_eye",
18 | 2: "right_eye",
19 | 3: "left_ear",
20 | 4: "right_ear",
21 | 5: "left_shoulder",
22 | 6: "right_shoulder",
23 | 7: "left_elbow",
24 | 8: "right_elbow",
25 | 9: "left_wrist",
26 | 10: "right_wrist",
27 | 11: "left_hip",
28 | 12: "right_hip",
29 | 13: "left_knee",
30 | 14: "right_knee",
31 | 15: "left_ankle",
32 | 16: "right_ankle"
33 | },
34 | "skeleton": [
35 | # # [16, 14], [14, 12], [17, 15], [15, 13], [12, 13], [6, 12], [7, 13], [6, 7], [6, 8],
36 | # # [7, 9], [8, 10], [9, 11], [2, 3], [1, 2], [1, 3], [2, 4], [3, 5], [4, 6], [5, 7]
37 | # [15, 13], [13, 11], [16, 14], [14, 12], [11, 12], [5, 11], [6, 12], [5, 6], [5, 7],
38 | # [6, 8], [7, 9], [8, 10], [1, 2], [0, 1], [0, 2], [1, 3], [2, 4], [3, 5], [4, 6]
39 | [15, 13], [13, 11], [16, 14], [14, 12], [11, 12], [5, 11], [6, 12], [5, 6], [5, 7],
40 | [6, 8], [7, 9], [8, 10], [1, 2], [0, 1], [0, 2], [1, 3], [2, 4], # [3, 5], [4, 6]
41 | [0, 5], [0, 6]
42 | ]
43 | },
44 | "mpii": {
45 | "keypoints": {
46 | 0: "right_ankle",
47 | 1: "right_knee",
48 | 2: "right_hip",
49 | 3: "left_hip",
50 | 4: "left_knee",
51 | 5: "left_ankle",
52 | 6: "pelvis",
53 | 7: "thorax",
54 | 8: "upper_neck",
55 | 9: "head top",
56 | 10: "right_wrist",
57 | 11: "right_elbow",
58 | 12: "right_shoulder",
59 | 13: "left_shoulder",
60 | 14: "left_elbow",
61 | 15: "left_wrist"
62 | },
63 | "skeleton": [
64 | # [5, 4], [4, 3], [0, 1], [1, 2], [3, 2], [13, 3], [12, 2], [13, 12], [13, 14],
65 | # [12, 11], [14, 15], [11, 10], # [2, 3], [1, 2], [1, 3], [2, 4], [3, 5], [4, 6], [5, 7]
66 | [5, 4], [4, 3], [0, 1], [1, 2], [3, 2], [3, 6], [2, 6], [6, 7], [7, 8], [8, 9],
67 | [13, 7], [12, 7], [13, 14], [12, 11], [14, 15], [11, 10],
68 | ]
69 | },
70 | }
71 | return joints
72 |
73 |
74 | def draw_points(image, points, color_palette='tab20', palette_samples=16, confidence_threshold=0.5):
75 | """
76 | Draws `points` on `image`.
77 |
78 | Args:
79 | image: image in opencv format
80 | points: list of points to be drawn.
81 | Shape: (nof_points, 3)
82 | Format: each point should contain (y, x, confidence)
83 | color_palette: name of a matplotlib color palette
84 | Default: 'tab20'
85 | palette_samples: number of different colors sampled from the `color_palette`
86 | Default: 16
87 | confidence_threshold: only points with a confidence higher than this threshold will be drawn. Range: [0, 1]
88 | Default: 0.5
89 |
90 | Returns:
91 | A new image with overlaid points
92 |
93 | """
94 | try:
95 | colors = np.round(
96 | np.array(plt.get_cmap(color_palette).colors) * 255
97 | ).astype(np.uint8)[:, ::-1].tolist()
98 |     except AttributeError:  # if palette has no pre-defined colors
99 | colors = np.round(
100 | np.array(plt.get_cmap(color_palette)(np.linspace(0, 1, palette_samples))) * 255
101 | ).astype(np.uint8)[:, -2::-1].tolist()
102 |
103 | circle_size = max(1, min(image.shape[:2]) // 150) # ToDo Shape it taking into account the size of the detection
104 | # circle_size = max(2, int(np.sqrt(np.max(np.max(points, axis=0) - np.min(points, axis=0)) // 16)))
105 |
106 | for i, pt in enumerate(points):
107 | if pt[2] > confidence_threshold:
108 | image = cv2.circle(image, (int(pt[1]), int(pt[0])), circle_size, tuple(colors[i % len(colors)]), -1)
109 |
110 | return image
111 |
112 |
113 | def draw_skeleton(image, points, skeleton, color_palette='Set2', palette_samples=8, person_index=0,
114 | confidence_threshold=0.5):
115 | """
116 | Draws a `skeleton` on `image`.
117 |
118 | Args:
119 | image: image in opencv format
120 | points: list of points to be drawn.
121 | Shape: (nof_points, 3)
122 | Format: each point should contain (y, x, confidence)
123 | skeleton: list of joints to be drawn
124 | Shape: (nof_joints, 2)
125 | Format: each joint should contain (point_a, point_b) where `point_a` and `point_b` are an index in `points`
126 | color_palette: name of a matplotlib color palette
127 | Default: 'Set2'
128 | palette_samples: number of different colors sampled from the `color_palette`
129 | Default: 8
130 | person_index: index of the person in `image`
131 | Default: 0
132 | confidence_threshold: only points with a confidence higher than this threshold will be drawn. Range: [0, 1]
133 | Default: 0.5
134 |
135 | Returns:
136 | A new image with overlaid joints
137 |
138 | """
139 | try:
140 | colors = np.round(
141 | np.array(plt.get_cmap(color_palette).colors) * 255
142 | ).astype(np.uint8)[:, ::-1].tolist()
143 |     except AttributeError:  # if palette has no pre-defined colors
144 | colors = np.round(
145 | np.array(plt.get_cmap(color_palette)(np.linspace(0, 1, palette_samples))) * 255
146 | ).astype(np.uint8)[:, -2::-1].tolist()
147 |
148 | for i, joint in enumerate(skeleton):
149 | pt1, pt2 = points[joint]
150 | if pt1[2] > confidence_threshold and pt2[2] > confidence_threshold:
151 | image = cv2.line(
152 | image, (int(pt1[1]), int(pt1[0])), (int(pt2[1]), int(pt2[0])),
153 | tuple(colors[person_index % len(colors)]), 2
154 | )
155 |
156 | return image
157 |
158 |
159 | def draw_points_and_skeleton(image, points, skeleton, points_color_palette='tab20', points_palette_samples=16,
160 | skeleton_color_palette='Set2', skeleton_palette_samples=8, person_index=0,
161 | confidence_threshold=0.5):
162 | """
163 | Draws `points` and `skeleton` on `image`.
164 |
165 | Args:
166 | image: image in opencv format
167 | points: list of points to be drawn.
168 | Shape: (nof_points, 3)
169 | Format: each point should contain (y, x, confidence)
170 | skeleton: list of joints to be drawn
171 | Shape: (nof_joints, 2)
172 | Format: each joint should contain (point_a, point_b) where `point_a` and `point_b` are an index in `points`
173 | points_color_palette: name of a matplotlib color palette
174 | Default: 'tab20'
175 | points_palette_samples: number of different colors sampled from the `color_palette`
176 | Default: 16
177 | skeleton_color_palette: name of a matplotlib color palette
178 | Default: 'Set2'
179 | skeleton_palette_samples: number of different colors sampled from the `color_palette`
180 | Default: 8
181 | person_index: index of the person in `image`
182 | Default: 0
183 | confidence_threshold: only points with a confidence higher than this threshold will be drawn. Range: [0, 1]
184 | Default: 0.5
185 |
186 | Returns:
187 | A new image with overlaid joints
188 |
189 | """
190 | image = draw_skeleton(image, points, skeleton, color_palette=skeleton_color_palette,
191 | palette_samples=skeleton_palette_samples, person_index=person_index,
192 | confidence_threshold=confidence_threshold)
193 | image = draw_points(image, points, color_palette=points_color_palette, palette_samples=points_palette_samples,
194 | confidence_threshold=confidence_threshold)
195 | return image
196 |
197 |
198 | def save_images(images, target, joint_target, output, joint_output, joint_visibility, summary_writer=None, step=0,
199 | prefix=''):
200 | """
201 | Creates a grid of images with gt joints and a grid with predicted joints.
202 | This is a basic function for debugging purposes only.
203 |
204 | If summary_writer is not None, the grid will be written in that SummaryWriter with name "{prefix}_images" and
205 | "{prefix}_predictions".
206 |
207 | Args:
208 | images (torch.Tensor): a tensor of images with shape (batch x channels x height x width).
209 | target (torch.Tensor): a tensor of gt heatmaps with shape (batch x channels x height x width).
210 | joint_target (torch.Tensor): a tensor of gt joints with shape (batch x joints x 2).
211 | output (torch.Tensor): a tensor of predicted heatmaps with shape (batch x channels x height x width).
212 | joint_output (torch.Tensor): a tensor of predicted joints with shape (batch x joints x 2).
213 | joint_visibility (torch.Tensor): a tensor of joint visibility with shape (batch x joints).
214 | summary_writer (tb.SummaryWriter): a SummaryWriter where write the grids.
215 | Default: None
216 | step (int): summary_writer step.
217 | Default: 0
218 | prefix (str): summary_writer name prefix.
219 | Default: ""
220 |
221 | Returns:
222 | A pair of images which are built from torchvision.utils.make_grid
223 | """
224 | # Input images with gt
225 | images_ok = images.detach().clone()
226 | images_ok[:, 0].mul_(0.229).add_(0.485)
227 | images_ok[:, 1].mul_(0.224).add_(0.456)
228 | images_ok[:, 2].mul_(0.225).add_(0.406)
229 | for i in range(images.shape[0]):
230 | joints = joint_target[i] * 4.
231 | joints_vis = joint_visibility[i]
232 |
233 | for joint, joint_vis in zip(joints, joints_vis):
234 | if joint_vis[0]:
235 | a = int(joint[1].item())
236 | b = int(joint[0].item())
237 | # images_ok[i][:, a-1:a+1, b-1:b+1] = torch.tensor([1, 0, 0])
238 | images_ok[i][0, a - 1:a + 1, b - 1:b + 1] = 1
239 | images_ok[i][1:, a - 1:a + 1, b - 1:b + 1] = 0
240 | grid_gt = torchvision.utils.make_grid(images_ok, nrow=int(images_ok.shape[0] ** 0.5), padding=2, normalize=False)
241 | if summary_writer is not None:
242 | summary_writer.add_image(prefix + 'images', grid_gt, global_step=step)
243 |
244 | # Input images with prediction
245 | images_ok = images.detach().clone()
246 | images_ok[:, 0].mul_(0.229).add_(0.485)
247 | images_ok[:, 1].mul_(0.224).add_(0.456)
248 | images_ok[:, 2].mul_(0.225).add_(0.406)
249 | for i in range(images.shape[0]):
250 | joints = joint_output[i] * 4.
251 | joints_vis = joint_visibility[i]
252 |
253 | for joint, joint_vis in zip(joints, joints_vis):
254 | if joint_vis[0]:
255 | a = int(joint[1].item())
256 | b = int(joint[0].item())
257 | # images_ok[i][:, a-1:a+1, b-1:b+1] = torch.tensor([1, 0, 0])
258 | images_ok[i][0, a - 1:a + 1, b - 1:b + 1] = 1
259 | images_ok[i][1:, a - 1:a + 1, b - 1:b + 1] = 0
260 | grid_pred = torchvision.utils.make_grid(images_ok, nrow=int(images_ok.shape[0] ** 0.5), padding=2, normalize=False)
261 | if summary_writer is not None:
262 | summary_writer.add_image(prefix + 'predictions', grid_pred, global_step=step)
263 |
264 | # Heatmaps
265 | # ToDo
266 | # for h in range(0,17):
267 | # heatmap = torchvision.utils.make_grid(output[h].detach(), nrow=int(np.sqrt(output.shape[0])),
268 | # padding=2, normalize=True, range=(0, 1))
269 | # summary_writer.add_image('train_heatmap_%d' % h, heatmap, global_step=step + epoch*len_dl_train)
270 |
271 | return grid_gt, grid_pred
272 |
273 |
274 | def check_video_rotation(filename):
275 | # thanks to
276 | # https://stackoverflow.com/questions/53097092/frame-from-video-is-upside-down-after-extracting/55747773#55747773
277 |
278 | # this returns meta-data of the video file in form of a dictionary
279 | meta_dict = ffmpeg.probe(filename)
280 |
281 | # from the dictionary, meta_dict['streams'][0]['tags']['rotate'] is the key
282 | # we are looking for
283 | rotation_code = None
284 | try:
285 | if int(meta_dict['streams'][0]['tags']['rotate']) == 90:
286 | rotation_code = cv2.ROTATE_90_CLOCKWISE
287 | elif int(meta_dict['streams'][0]['tags']['rotate']) == 180:
288 | rotation_code = cv2.ROTATE_180
289 | elif int(meta_dict['streams'][0]['tags']['rotate']) == 270:
290 | rotation_code = cv2.ROTATE_90_COUNTERCLOCKWISE
291 | else:
292 | raise ValueError
293 |     except (KeyError, ValueError):  # no rotation tag, or an unsupported rotation value
294 | pass
295 |
296 | return rotation_code
297 |
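298 | # Illustrative usage sketch (not part of the original module): overlay one
299 | # person's keypoints on a BGR frame. `frame` is assumed to be an HxWx3 uint8
300 | # image and `pts` an array of shape (17, 3) in (y, x, confidence) order,
301 | # matching the "coco" entry above.
302 | # skeleton = joints_dict()["coco"]["skeleton"]
303 | # frame = draw_points_and_skeleton(frame, pts, skeleton,
304 | #                                  person_index=0, confidence_threshold=0.5)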
--------------------------------------------------------------------------------