├── .editorconfig ├── .gitignore ├── LICENSE ├── README.md ├── assets ├── ho-cap-demo-all-cameras.gif ├── image_label_viewer.png ├── sequence_3d_viewer.gif ├── sequence_pose_viewer.png ├── sequence_renderer_color.png ├── sequence_renderer_mask.png └── vis_labels.png ├── config ├── .gitignore ├── benchmarks │ └── benchmark_downloader.py ├── hocap_benchmarks.yaml ├── hocap_hpe.json ├── hocap_info.yaml ├── hocap_odet.json ├── hocap_ope.json ├── hocap_recordings.yaml └── mano_info.yaml ├── datasets └── .gitignore ├── examples ├── evaluate_hand_pose.py ├── evaluate_object_detection.py ├── evaluate_object_pose.py ├── image_label_viewer.py ├── sequence_3d_viewer.py ├── sequence_pose_viewer.py └── sequence_renderer.py ├── hocap_toolkit ├── benchmarks │ ├── __init__.py │ └── groundtruth_generator.py ├── factory │ ├── __init__.py │ └── dataset_factory.py ├── layers │ ├── __init__.py │ ├── mano_group_layer.py │ ├── mano_layer.py │ ├── object_group_layer.py │ └── object_layer.py ├── loaders │ ├── __init__.py │ └── sequence_loader.py ├── renderers │ ├── __init__.py │ ├── renderer_pyrd.py │ └── sequence_renderer.py └── utils │ ├── __init__.py │ ├── color_info.py │ ├── common_imports.py │ ├── cv_utils.py │ ├── io.py │ ├── mano_info.py │ ├── misc.py │ └── transforms.py ├── pyproject.toml ├── results └── .gitignore └── tools ├── hocap_dataset_split.py └── hocap_downloader.py /.editorconfig: -------------------------------------------------------------------------------- 1 | # .editorconfig 2 | # Check http://editorconfig.org for more information 3 | # This file is for unifying the coding style for different editors and IDEs 4 | 5 | # top-most EditorConfig file 6 | root = true 7 | 8 | # Unix-style newlines with a newline ending every file 9 | [*] 10 | end_of_line = lf 11 | insert_final_newline = true 12 | trim_trailing_whitespace = true 13 | indent_style = space 14 | charset = utf-8 15 | 16 | [*.py] 17 | indent_size = 4 18 | 19 | [Makefile] 20 | indent_style = tab 21 | 22 | [*.md] 23 | trim_trailing_whitespace = false 24 | tab_width = 2 25 | indent_size = 2 26 | 27 | [*.{json,yml,yaml,xml,sh,launch}] 28 | indent_size = 2 29 | tab_width = 2 30 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.zip 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | *.egg-info/ 13 | 14 | # IDEs and editors 15 | .idea 16 | .eclipse 17 | .vscode 18 | 19 | # Mac 20 | .DS_Store 21 | 22 | # Environments 23 | .env 24 | .venv 25 | env/ 26 | venv/ 27 | ENV/ 28 | env.bak/ 29 | venv.bak/ 30 | 31 | # Jupyter Notebook 32 | .ipynb_checkpoints 33 | 34 | # Others 35 | .*_history 36 | build.log 37 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # HOCap Toolkit 2 | 3 | [![Python 3.10](https://img.shields.io/badge/Python-3.10-3776AB.svg)](https://www.python.org/downloads/release/python-31015/) [![PyTorch 2.3.1](https://img.shields.io/badge/PyTorch-2.3.1-EE4C2C.svg)](https://pytorch.org/) [![CUDA 11.8](https://img.shields.io/badge/CUDA-11.8-76B900.svg)](https://developer.nvidia.com/cuda-toolkit/) [![ROS Melodic](https://img.shields.io/badge/ROS-Melodic-22314E.svg)](http://wiki.ros.org/melodic/) ![GPLv3.0 License](https://img.shields.io/badge/License-GPL--3.0-3DA639.svg) 4 
| 5 | The HOCap Toolkit is a Python package that provides evaluation and visualization tools for the HO-Cap dataset. 6 | 7 | --- 8 | 9 | **HO-Cap: A Capture System and Dataset for 3D Reconstruction and Pose Tracking of Hand-Object Interaction** 10 | 11 | Jikai Wang, Qifan Zhang, Yu-Wei Chao, Bowen Wen, Xiaohu Guo, Yu Xiang 12 | 13 | [ [arXiv](https://arxiv.org/abs/2406.06843) ] [ [Project page](https://irvlutd.github.io/HOCap/) ] 14 | 15 | ![hocap-demo-video](./assets/ho-cap-demo-all-cameras.gif) 16 | 17 | --- 18 | 19 | ## Contents 20 | 21 | - [HOCap Toolkit](#hocap-toolkit) 22 | - [Contents](#contents) 23 | - [News](#news) 24 | - [BibTeX Citation](#bibtex-citation) 25 | - [License](#license) 26 | - [Installation](#installation) 27 | - [Download the HOCap Dataset](#download-the-hocap-dataset) 28 | - [Labels in the HOCap Dataset](#labels-in-the-hocap-dataset) 29 | - [Loading Dataset and Visualizing Samples](#loading-dataset-and-visualizing-samples) 30 | - [Evaluation](#evaluation) 31 | - [Hand Pose Estimation Evaluation](#hand-pose-estimation-evaluation) 32 | - [Object Pose Estimation Evaluation](#object-pose-estimation-evaluation) 33 | - [Object Detection Evaluation](#object-detection-evaluation) 34 | - [HOCap Dataset Split for Training and Testing](#hocap-dataset-split-for-training-and-testing) 35 | 36 | ## News 37 | - :warning::warning: **2025-01-13**: We fixed the bug in image labels for "hand_joints_3d" and "hand_joints_2d". Please **re-download** the [labels](https://utdallas.box.com/s/ayd4st2wo588z2yqbuxalptxnz2qxlj5) and **regenerate** the HPE split dataset. 38 | - **2025-01-13**: The code for image label visualization is added! Please check [here](#loading-dataset-and-visualizing-samples) (item 4). 39 | - **2024-12-15**: The training codes and datasets for YOLO11 and RT-DETR are added! Please check [here](#training-yolo11-and-rt-detr-for-object-detection). 40 | - **2024-12-15**: The Object Collection dataset is added! Please check the [project page](https://irvlutd.github.io/HOCap/) for more details. 42 | - **2024-12-14**: The HO-Cap dataset is updated! Please check the [project page](https://irvlutd.github.io/HOCap/) for more details. 43 | - **2024-06-24**: The HO-Cap dataset is released! Please check the [project page](https://irvlutd.github.io/HOCap/) for more details. 44 | 45 | ## BibTeX Citation 46 | 47 | If HO-Cap helps your research, please consider citing the following: 48 | 49 | ``` 50 | @misc{wang2024hocapcapturedataset3d, 51 | title={HO-Cap: A Capture System and Dataset for 3D Reconstruction and Pose Tracking of Hand-Object Interaction}, 52 | author={Jikai Wang and Qifan Zhang and Yu-Wei Chao and Bowen Wen and Xiaohu Guo and Yu Xiang}, 53 | year={2024}, 54 | eprint={2406.06843}, 55 | archivePrefix={arXiv}, 56 | primaryClass={cs.CV}, 57 | url={https://arxiv.org/abs/2406.06843}, 58 | } 59 | ``` 60 | 61 | ## License 62 | 63 | HOCap Toolkit is released under the [GNU General Public License v3.0](./LICENSE). 64 | 65 | ## Installation 66 | 67 | This code is tested with [Python 3.10](https://docs.python.org/3.10) and [CUDA 11.8](https://developer.nvidia.com/cuda-11-8-0-download-archive) on [Ubuntu 20.04](https://releases.ubuntu.com/focal/). **Make sure CUDA 11.8 is installed on your system before running the code.** 68 | 69 | 1. Clone the HO-Cap repository from GitHub.
70 | 71 | ```bash 72 | git clone https://github.com/IRVLUTD/HO-Cap.git 73 | ``` 74 | 75 | 2. Change the current directory to the cloned repository. 76 | 77 | ```bash 78 | cd HO-Cap 79 | ``` 80 | 81 | 3. Create the conda environment. 82 | 83 | ```bash 84 | conda create -n hocap-toolkit python=3.10 85 | ``` 86 | 87 | 4. Activate the conda environment. 88 | 89 | ```bash 90 | conda activate hocap-toolkit 91 | ``` 92 | 93 | 5. Install PyTorch and torchvision. 94 | 95 | ```bash 96 | python -m pip install torch==2.3.1 torchvision==0.18.1 --index-url https://download.pytorch.org/whl/cu118 --no-cache-dir 97 | ``` 98 | 99 | 6. Install the hocap-toolkit package. 100 | 101 | ```bash 102 | python -m pip install -e . 103 | ``` 104 | 105 | 7. Download the MANO models and code (`mano_v1_2.zip`) from the [MANO website](https://mano.is.tue.mpg.de) and place the extracted `.pkl` files under the `config/mano_models` directory. The directory should look like this: 106 | 107 | ``` 108 | ./config/mano_models 109 | ├── MANO_LEFT.pkl 110 | └── MANO_RIGHT.pkl 111 | ``` 112 | 113 | ## Download the HOCap Dataset 114 | 115 | 1. Run the code below to download the whole dataset: 116 | 117 | ```bash 118 | python tools/hocap_downloader.py --subject_id all 119 | ``` 120 | 121 | 2. Or download the dataset for a specific subject only: 122 | 123 | ```bash 124 | python tools/hocap_downloader.py --subject_id subject_1 125 | ``` 126 | 127 | 3. The downloaded `.zip` files will be extracted to the `./datasets` directory, which should then look like this: 128 | 129 | ```bash 130 | ./datasets 131 | ├── calibration 132 | ├── models 133 | ├── subject_1 134 | │   ├── 20231025_165502 135 | │   │   ├── 037522251142 136 | │   │   │   ├── color_000000.jpg 137 | │   │   │   ├── depth_000000.png 138 | │   │   │   ├── label_000000.npz 139 | │   │   │   └── ... 140 | │   │   ├── 043422252387 141 | │   │   ├── ... 142 | │   │   ├── hololens_kv5h72 143 | │   │   ├── meta.yaml 144 | │   │   ├── poses_m.npy 145 | │   │   ├── poses_o.npy 146 | │   │   └── poses_pv.npy 147 | │   ├── 20231025_165502 148 | │   └── ... 149 | ├── ... 150 | └── subject_9 151 | ``` 152 | 153 | ## Labels in the HOCap Dataset 154 | 155 | The HOCap dataset provides the following labels: 156 | 157 | - 3D hand keypoints 158 | - 2D hand keypoints 159 | - hand bounding boxes 160 | - hand sides 161 | - hand MANO poses 162 | - object 6D poses 163 | - segmentation masks 164 | 165 | ![vis_labels](./assets/vis_labels.png) 166 | 167 | ## Loading Dataset and Visualizing Samples 168 | 169 | 1. The example below shows how to visualize the pose annotations of one frame: 170 | 171 | ```bash 172 | python examples/sequence_pose_viewer.py 173 | ``` 174 | 175 | ![sequence_pose_viewer](./assets/sequence_pose_viewer.png) 176 | 177 | 2. The example below shows how to visualize a sequence in the interactive 3D viewer: 178 | 179 | ```bash 180 | python examples/sequence_3d_viewer.py 181 | ``` 182 | 183 | ![sequence_3d_viewer](./assets/sequence_3d_viewer.gif) 184 | 185 | The 3D viewer provides the following functionalities: 186 | 187 | - `Background`: change the background color. 188 | - `Point Size`: change the point size. 189 | - `Show Skybox`: display/hide the skybox. 190 | - `Show Axes`: display/hide the world coordinate axes. 191 | - `Crop Points`: crop the points outside the table area. 192 | - `Point Clouds`: display/hide the point clouds. 193 | - `Hand Mesh`: display/hide the hand mesh. 194 | - `Object Mesh`: display/hide the object mesh. 195 | - `Frame Slider`: change the frame index.
196 | - `Reset`: reset the camera view and the frame index. 197 | - `Pause/Play`: pause/play the sequence. 198 | - `Exit`: close the viewer. 199 | - `Help Tab`: show the help information. 200 | 201 | 3. The example below shows how to render a sequence offline: 202 | 203 | ```bash 204 | python examples/sequence_renderer.py 205 | ``` 206 | 207 | This will render the color image and segmentation map for all frames in the sequence. The rendered images will be saved in the `/renders/` directory. 208 | 209 | ![sequence_renderer_color](./assets/sequence_renderer_color.png) 210 | ![sequence_renderer_mask](./assets/sequence_renderer_mask.png) 211 | 212 | 4. The example below shows how to visualize the image labels: 213 | 214 | ```bash 215 | python examples/image_label_viewer.py 216 | ``` 217 | 218 | ![image_label_viewer](./assets/image_label_viewer.png) 219 | 220 | ## Evaluation 221 | 222 | HO-Cap provides benchmark evaluations for three tasks: 223 | 224 | - **Hand Pose Estimation (HPE)** (A2J-Transformer[^1] and HaMeR[^2]) 225 | - **Object Pose Estimation (OPE)** (MegaPose[^3] and FoundationPose[^4]) 226 | - **Object Detection (ODET)** (CNOS[^5], GroundingDINO[^6], YOLO11[^7], and RT-DETR[^8]) 227 | 228 | Run the code below to download the example evaluation results: 229 | 230 | ```bash 231 | python config/benchmarks/benchmark_downloader.py 232 | ``` 233 | 234 | If your evaluation results are saved in the same format, the evaluation scripts below can be used to score them. 235 | 236 | ### Hand Pose Estimation Evaluation 237 | 238 | - Evaluate the hand pose estimation performance: 239 | 240 | ```bash 241 | python examples/evaluate_hand_pose.py 242 | ``` 243 | 244 |
245 | You should see the following output: 246 | 247 | ``` 248 | PCK (0.05) PCK (0.10) PCK (0.15) PCK (0.20) MPJPE (mm) 249 | 45.319048 81.247619 91.357143 95.080952 25.657379 250 | ``` 251 | 252 |
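For reference, the PCK and MPJPE numbers above are computed roughly as in the sketch below. This is a simplified stand-in for the logic in `examples/evaluate_hand_pose.py` (the input arrays are random placeholders, and the real script additionally skips missing keypoints):

```python
# Simplified HPE metrics: bbox-normalized PCK and MPJPE (placeholder inputs).
import numpy as np

N, J = 100, 21                     # number of samples, hand joints per sample
pred_3d = np.random.rand(N, J, 3)  # predicted 3D joints (mm)
gt_3d = np.random.rand(N, J, 3)    # ground-truth 3D joints (mm)
pred_2d = np.random.rand(N, J, 2)  # predicted 2D joints (pixels)
gt_2d = np.random.rand(N, J, 2)    # ground-truth 2D joints (pixels)
bboxes = np.array([[0, 0, 640, 480]] * N, dtype=np.float32)  # hand bboxes (x1, y1, x2, y2)

# MPJPE: Euclidean distance per joint, averaged over joints and samples.
mpjpe = np.linalg.norm(pred_3d - gt_3d, axis=2).mean()

# PCK: fraction of joints whose 2D error, normalized by the bbox size, is below a threshold.
box_wh = np.stack([bboxes[:, 2] - bboxes[:, 0], bboxes[:, 3] - bboxes[:, 1]], axis=1)
dists = np.linalg.norm((pred_2d - gt_2d) / box_wh[:, None, :], axis=-1)
for thr in (0.05, 0.10, 0.15, 0.20):
    print(f"PCK ({thr:.2f}): {(dists < thr).mean() * 100:.3f}")
print(f"MPJPE (mm): {mpjpe:.3f}")
```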
253 | 254 | ### Object Pose Estimation Evaluation 255 | 256 | - Evaluate the novel object pose estimation performance: 257 | 258 | ```bash 259 | python examples/evaluate_object_pose.py 260 | ``` 261 | 262 |
263 | You should see the following output: 264 | 265 | ``` 266 | Object_ID ADD-S_err (cm) ADD_err (cm) ADD-S_AUC (%) ADD_AUC (%) 267 | |-------------- |-------------- |-------------- |-------------- |-------------- | 268 | G01_1 0.42 0.72 95.79 92.82 269 | G01_2 0.37 0.69 96.39 93.38 270 | G01_3 0.45 0.82 95.72 92.08 271 | G01_4 0.61 2.73 94.14 74.19 272 | Average 0.46 1.24 95.43 88.04 273 | ``` 274 | 275 |
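The ADD/ADD-S errors in the table are, in essence, the quantities sketched below; this is a simplified version of the logic in `examples/evaluate_object_pose.py`, with random placeholder poses and model points (the AUC columns integrate these per-frame errors up to a 10 cm threshold):

```python
# Simplified ADD / ADD-S computation for one frame (placeholder inputs).
import numpy as np
from scipy.spatial import cKDTree

model_pts = np.random.rand(1000, 3)    # object model points (metres)
gt_pose = np.eye(4, dtype=np.float32)  # ground-truth object-to-camera pose
pred_pose = np.eye(4, dtype=np.float32)
pred_pose[:3, 3] += 0.002              # pretend 2 mm translation error

def apply_pose(pose, pts):
    """Transform Nx3 points by a 4x4 rigid pose."""
    return (pose[:3, :3] @ pts.T).T + pose[:3, 3]

pred_pts = apply_pose(pred_pose, model_pts)
gt_pts = apply_pose(gt_pose, model_pts)

# ADD: mean distance between corresponding model points.
add = np.linalg.norm(pred_pts - gt_pts, axis=1).mean()

# ADD-S: mean distance from each ground-truth point to its nearest predicted point.
adds = cKDTree(pred_pts).query(gt_pts, k=1)[0].mean()

print(f"ADD: {add * 100:.2f} cm, ADD-S: {adds * 100:.2f} cm")
```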
276 | 277 | ### Object Detection Evaluation 278 | 279 | - Evaluate the object detection performance: 280 | 281 | ```bash 282 | python examples/evaluate_object_detection.py 283 | ``` 284 | 285 |
286 | You should see the following output: (click to expand) 287 | 288 | ``` 289 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.016 290 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.023 291 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.018 292 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.002 293 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.018 294 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.014 295 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.036 296 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.036 297 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.036 298 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.005 299 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.037 300 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.017 301 | AP: 0.016 | AP_50: 0.023 | AP_75: 0.018 | AP_s: 0.002 | AP_m: 0.018 | AP_l: 0.014 302 | ``` 303 | 304 |
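The detection results are loaded with pycocotools' `COCO.loadRes()`, so any predictions saved in the standard COCO detection-results format can be scored the same way. A minimal sketch of that format is shown below (the file name, ids, boxes, and scores are made-up placeholders; the `image_id` and `category_id` values must match the ground-truth file `config/benchmarks/odet_gt.json`):

```python
# Write detections in the COCO results format accepted by COCO.loadRes().
import json

detections = [
    {
        "image_id": 0,                       # image id from the ground-truth COCO file
        "category_id": 1,                    # object class id from the ground-truth file
        "bbox": [100.0, 150.0, 80.0, 60.0],  # [x, y, width, height] in pixels
        "score": 0.92,                       # detection confidence
    },
    # ... one entry per detection ...
]

with open("results/my_odet_results.json", "w") as f:  # hypothetical output path
    json.dump(detections, f)
```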
305 | 306 | ## HOCap Dataset Split for Training and Testing 307 | 308 | The train/valid/test split is defined separately for each task (HPE, ODET, OPE) by the files `config/hocap_hpe.json`, `config/hocap_odet.json`, and `config/hocap_ope.json`. Each configuration file has the following structure: 309 | 310 | ```json 311 | { 312 | "train": [[0, 0, 0, 0], ...], 313 | "valid": [...], 314 | "test": [...] 315 | } 316 | ``` 317 | 318 | Each item is in the format `[subject_index, sequence_index, camera_index, frame_index]`. For example, `[0, 0, 0, 0]` refers to the `subject_1/20231022_190534/105322251564` folder and the frames `color_000000.jpg`/`depth_000000.png`. 319 | 320 | To save time, we provide pre-defined splits for each task; the split datasets can be downloaded [here](https://utdallas.box.com/s/dt19tcvhwitz223cjqa5riot6zcf6yba). 321 | 322 | Alternatively, run the code below to split the HOCap dataset manually; the split dataset will be saved in the `./datasets` directory. 323 | 324 | - Hand Pose Estimation (HPE) task: 325 | 326 | ```bash 327 | python tools/hocap_dataset_split.py --task hpe 328 | ``` 329 | 330 | - Object Pose Estimation (OPE) task: 331 | 332 | ```bash 333 | python tools/hocap_dataset_split.py --task ope 334 | ``` 335 | 336 | - Object Detection (ODET) task: 337 | - COCO annotation type: 338 | ```bash 339 | python tools/hocap_dataset_split.py --task odet --anno_type coco 340 | ``` 341 | - YOLO annotation type: 342 | ```bash 343 | python tools/hocap_dataset_split.py --task odet --anno_type yolo 344 | ``` 345 | 346 | [^1]: [A2J-Transformer: Anchor-to-Joint Transformer Network for 3D Interacting Hand Pose Estimation from a Single RGB Image](https://arxiv.org/abs/2304.03635) 347 | [^2]: [Reconstructing Hands in 3D with Transformers](https://arxiv.org/abs/2312.05251) 348 | [^3]: [MegaPose: 6D Pose Estimation of Novel Objects via Render & Compare](https://arxiv.org/abs/2212.06870) 349 | [^4]: [FoundationPose: Unified 6D Pose Estimation and Tracking of Novel Objects](https://arxiv.org/abs/2312.08344) 350 | [^5]: [CNOS: A Strong Baseline for CAD-based Novel Object Segmentation](http://arxiv.org/abs/2307.11067) 351 | [^6]: [Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection](https://arxiv.org/abs/2303.05499) 352 | [^7]: [YOLOv11: An Overview of the Key Architectural Enhancements](https://arxiv.org/html/2410.17725v1) 353 | [^8]: [DETRs Beat YOLOs on Real-time Object Detection](https://arxiv.org/abs/2304.08069) 354 | -------------------------------------------------------------------------------- /assets/ho-cap-demo-all-cameras.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IRVLUTD/HO-Cap/723ae6f8f5291f074ae309f179eb8556f67ffd19/assets/ho-cap-demo-all-cameras.gif -------------------------------------------------------------------------------- /assets/image_label_viewer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IRVLUTD/HO-Cap/723ae6f8f5291f074ae309f179eb8556f67ffd19/assets/image_label_viewer.png -------------------------------------------------------------------------------- /assets/sequence_3d_viewer.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IRVLUTD/HO-Cap/723ae6f8f5291f074ae309f179eb8556f67ffd19/assets/sequence_3d_viewer.gif -------------------------------------------------------------------------------- /assets/sequence_pose_viewer.png:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/IRVLUTD/HO-Cap/723ae6f8f5291f074ae309f179eb8556f67ffd19/assets/sequence_pose_viewer.png -------------------------------------------------------------------------------- /assets/sequence_renderer_color.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IRVLUTD/HO-Cap/723ae6f8f5291f074ae309f179eb8556f67ffd19/assets/sequence_renderer_color.png -------------------------------------------------------------------------------- /assets/sequence_renderer_mask.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IRVLUTD/HO-Cap/723ae6f8f5291f074ae309f179eb8556f67ffd19/assets/sequence_renderer_mask.png -------------------------------------------------------------------------------- /assets/vis_labels.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IRVLUTD/HO-Cap/723ae6f8f5291f074ae309f179eb8556f67ffd19/assets/vis_labels.png -------------------------------------------------------------------------------- /config/.gitignore: -------------------------------------------------------------------------------- 1 | *.task 2 | *.ckpt 3 | *.pth 4 | *.pkl 5 | benchmarks/*[.json, .txt] -------------------------------------------------------------------------------- /config/benchmarks/benchmark_downloader.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from hocap_toolkit.utils import * 3 | 4 | PROJ_ROOT = Path(__file__).parent.parent.parent 5 | 6 | 7 | def download_box_file(box_link, output_file): 8 | output_path = Path(output_file) 9 | file_name = output_file.name 10 | 11 | resume_header = {} 12 | downloaded_size = 0 13 | 14 | with requests.get(box_link, headers=resume_header, stream=True) as response: 15 | # Check if the request was successful 16 | if response.status_code == 200: 17 | total_size = int(response.headers.get("content-length", 0)) 18 | else: 19 | print(f"Failed to retrieve file info. Status code: {response.status_code}") 20 | return 21 | 22 | if output_path.exists(): 23 | downloaded_size = output_path.stat().st_size 24 | # Check if there's a partial download and get its size 25 | resume_header = {"Range": f"bytes={downloaded_size}-"} 26 | 27 | # Check if the file is already fully downloaded 28 | if downloaded_size == total_size: 29 | tqdm.write(f" ** {file_name} is already downloaded.") 30 | return 31 | 32 | # Send a GET request with the range header if needed 33 | with requests.get(box_link, headers=resume_header, stream=True) as response: 34 | # Check if the request was successful 35 | if response.status_code in [200, 206]: 36 | # Initialize tqdm progress bar 37 | with tqdm( 38 | total=total_size, 39 | initial=downloaded_size, 40 | unit="B", 41 | unit_scale=True, 42 | ncols=80, 43 | ) as pbar: 44 | # Download the file in chunks 45 | with output_path.open("ab") as file: 46 | for chunk in response.iter_content( 47 | chunk_size=1024 * 1024 48 | ): # 1 MB chunks 49 | if chunk: 50 | file.write(chunk) 51 | pbar.update(len(chunk)) 52 | else: 53 | print(f"Failed to download file. 
Status code: {response.status_code}") 54 | 55 | 56 | if __name__ == "__main__": 57 | behchmark_data = read_data_from_yaml("config/hocap_benchmarks.yaml") 58 | 59 | for file_name, file_link in behchmark_data.items(): 60 | tqdm.write(f"- Downloading {file_name}...") 61 | if "demo" in file_name: 62 | save_path = PROJ_ROOT / "results" / f"{file_name}.json" 63 | else: 64 | save_path = PROJ_ROOT / "config" / "benchmarks" / f"{file_name}.json" 65 | download_box_file(file_link, save_path) 66 | -------------------------------------------------------------------------------- /config/hocap_benchmarks.yaml: -------------------------------------------------------------------------------- 1 | hpe_gt: https://utdallas.box.com/shared/static/bjt8jty6ngwjj76nbuj8tncpnfnjrp2i.json 2 | odet_gt_images: https://utdallas.box.com/shared/static/6hd03ii4fzpqgjq1d8jekrp94ohvfo8z.json 3 | odet_gt: https://utdallas.box.com/shared/static/iq005s6yc8g3ktc08wfi28vxu7lerime.json 4 | ope_gt: https://utdallas.box.com/shared/static/w3akltk94wnz5nj25r371rpqbemmkd53.json 5 | hpe_demo: https://utdallas.box.com/shared/static/evtuw2iyk4okkpuv23z4ur0r4you2w74.json 6 | odet_demo: https://utdallas.box.com/shared/static/024x6jcjgfhu0dum89shy5urxgy5fmps.json 7 | ope_demo: https://utdallas.box.com/shared/static/mg67undy02c0roeasxv3hnjalg9059n5.json 8 | -------------------------------------------------------------------------------- /config/hocap_info.yaml: -------------------------------------------------------------------------------- 1 | subject_ids: 2 | - subject_1 3 | - subject_2 4 | - subject_3 5 | - subject_4 6 | - subject_5 7 | - subject_6 8 | - subject_7 9 | - subject_8 10 | - subject_9 11 | object_classes: 12 | - G01_1 13 | - G01_2 14 | - G01_3 15 | - G01_4 16 | - G02_1 17 | - G02_2 18 | - G02_3 19 | - G02_4 20 | - G04_1 21 | - G04_2 22 | - G04_3 23 | - G04_4 24 | - G05_1 25 | - G05_2 26 | - G05_3 27 | - G05_4 28 | - G06_1 29 | - G06_2 30 | - G06_3 31 | - G06_4 32 | - G07_1 33 | - G07_2 34 | - G07_3 35 | - G07_4 36 | - G09_1 37 | - G09_2 38 | - G09_3 39 | - G09_4 40 | - G10_1 41 | - G10_2 42 | - G10_3 43 | - G10_4 44 | - G11_1 45 | - G11_2 46 | - G11_3 47 | - G11_4 48 | - G15_1 49 | - G15_2 50 | - G15_3 51 | - G15_4 52 | - G16_1 53 | - G16_2 54 | - G16_3 55 | - G16_4 56 | - G18_1 57 | - G18_2 58 | - G18_3 59 | - G18_4 60 | - G19_1 61 | - G19_2 62 | - G19_3 63 | - G19_4 64 | - G20_1 65 | - G20_2 66 | - G20_3 67 | - G20_4 68 | - G21_1 69 | - G21_2 70 | - G21_3 71 | - G21_4 72 | - G22_1 73 | - G22_2 74 | - G22_3 75 | - G22_4 76 | - RIGHT_HAND 77 | - LEFT_HAND 78 | object_descriptors: 79 | G01_1: fruit_snacks 80 | G01_2: water_softener_bottle 81 | G01_3: coconut_milk_carton 82 | G01_4: hammer 83 | G02_1: whole_milk_carton 84 | G02_2: cooked_ham 85 | G02_3: chocolate_drink_powder 86 | G02_4: sauce_bottle 87 | G04_1: flapjack_mix 88 | G04_2: chocolate_fudge 89 | G04_3: herring_fillets_can 90 | G04_4: crackers_box 91 | G05_1: vegetable_oil_spread 92 | G05_2: body_lotion 93 | G05_3: peanut_chocolate_box 94 | G05_4: sponge 95 | G06_1: dvd 96 | G06_2: pumpkin_creamer 97 | G06_3: yellow_mustard 98 | G06_4: dish_brush 99 | G07_1: game_controller 100 | G07_2: hot_cocoa_mix 101 | G07_3: dandruff_shampoo 102 | G07_4: toy_axe 103 | G09_1: candy_box 104 | G09_2: toy_car 105 | G09_3: toothpaste_box 106 | G09_4: chocolate_syrup_bottle 107 | G10_1: soup_mix 108 | G10_2: gel_toothpaste 109 | G10_3: chocolate_biscuit_sticks 110 | G10_4: body_wash 111 | G11_1: coconut_water 112 | G11_2: baby_powder 113 | G11_3: baking_soda 114 | G11_4: chocolate_bar 
115 | G15_1: mens_body_wash 116 | G15_2: dandelion_tea 117 | G15_3: cooking_spray 118 | G15_4: joy_controller 119 | G16_1: toilet_cleaner 120 | G16_2: laundry_detergent 121 | G16_3: small_coconut_water 122 | G16_4: fabric_softener_sheets 123 | G18_1: green_tea_latte_mix 124 | G18_2: projector_remote 125 | G18_3: right_shoe 126 | G18_4: left_shoe 127 | G19_1: electric_screwdriver 128 | G19_2: blue_spatula 129 | G19_3: deodorant 130 | G19_4: ping_pong_paddle 131 | G20_1: cappuccino_mix 132 | G20_2: mustard_bottle 133 | G20_3: toilet_cleaner 134 | G20_4: dog_toy_bone 135 | G21_1: moisturizing_lotion 136 | G21_2: playing_cards 137 | G21_3: pink_spatula 138 | G21_4: blue_brush 139 | G22_1: chocolate_bar 140 | G22_2: mayomust_sauce 141 | G22_3: soup_mix 142 | G22_4: gray_spatula 143 | RIGHT_HAND: right_hand 144 | LEFT_HAND: left_hand 145 | sequence_ids: 146 | - '20231022_190534' 147 | - '20231022_192832' 148 | - '20231022_193506' 149 | - '20231022_193630' 150 | - '20231022_193809' 151 | - '20231022_200657' 152 | - '20231022_201316' 153 | - '20231022_201449' 154 | - '20231022_201556' 155 | - '20231022_201942' 156 | - '20231022_202115' 157 | - '20231022_202617' 158 | - '20231022_203100' 159 | - '20231023_162803' 160 | - '20231023_163653' 161 | - '20231023_163929' 162 | - '20231023_164242' 163 | - '20231023_164741' 164 | - '20231023_170018' 165 | - '20231024_154531' 166 | - '20231024_154810' 167 | - '20231024_155008' 168 | - '20231024_161209' 169 | - '20231024_161306' 170 | - '20231024_161937' 171 | - '20231024_162028' 172 | - '20231024_162327' 173 | - '20231024_162409' 174 | - '20231024_162756' 175 | - '20231024_162842' 176 | - '20231024_180111' 177 | - '20231024_180651' 178 | - '20231024_180733' 179 | - '20231024_181413' 180 | - '20231025_110646' 181 | - '20231025_110808' 182 | - '20231025_111118' 183 | - '20231025_111357' 184 | - '20231025_112229' 185 | - '20231025_112332' 186 | - '20231025_112546' 187 | - '20231025_165502' 188 | - '20231025_165807' 189 | - '20231025_170105' 190 | - '20231025_170231' 191 | - '20231025_170650' 192 | - '20231025_170959' 193 | - '20231025_171117' 194 | - '20231026_162155' 195 | - '20231026_162248' 196 | - '20231026_163223' 197 | - '20231026_164131' 198 | - '20231026_164812' 199 | - '20231026_164958' 200 | - '20231027_112303' 201 | - '20231027_113202' 202 | - '20231027_113535' 203 | - '20231027_123403' 204 | - '20231027_123725' 205 | - '20231027_123814' 206 | - '20231027_124057' 207 | - '20231027_124926' 208 | - '20231027_125019' 209 | - '20231027_125315' 210 | device_serials: 211 | - '105322251564' 212 | - '043422252387' 213 | - '037522251142' 214 | - '105322251225' 215 | - '108222250342' 216 | - '117222250549' 217 | - '046122250168' 218 | - '115422250549' 219 | - hololens_kv5h72 220 | -------------------------------------------------------------------------------- /config/hocap_recordings.yaml: -------------------------------------------------------------------------------- 1 | models: https://utdallas.box.com/shared/static/con44iqej33weg9f3rpxof61eh3x2x21.zip 2 | calibration: https://utdallas.box.com/shared/static/nlp4c6vtd0n8o0entxlh1vxdpcdeh0h8.zip 3 | subject_1: https://utdallas.box.com/shared/static/w0voy9bixtxyclo52841xyamock2lxpt.zip 4 | subject_2: https://utdallas.box.com/shared/static/j498kxxrkvaf674tvmt4su4ad0bz9s9f.zip 5 | subject_3: https://utdallas.box.com/shared/static/shklq33yaoozh9gm681nxwnq0o3y0y1d.zip 6 | subject_4: https://utdallas.box.com/shared/static/dew68k7b3ya09t40818gpfxm95oa4yeq.zip 7 | subject_5: 
https://utdallas.box.com/shared/static/mutor2a09kudze1yw173gsfetsru7ces.zip 8 | subject_6: https://utdallas.box.com/shared/static/iyja7rdbjx2ksgjhmdu6mx3zqvaurdni.zip 9 | subject_7: https://utdallas.box.com/shared/static/4g5qyig6i4uz1rgrzkcu9n4mhdjs74m2.zip 10 | subject_8: https://utdallas.box.com/shared/static/khrb5guy8rdwnoqi4euk2w0mk5lslxkn.zip 11 | subject_9: https://utdallas.box.com/shared/static/3x5yitydmbmwolq9bty5dd2udu5v52fc.zip 12 | poses: https://utdallas.box.com/shared/static/2lofbp2yd005d8o213ns77mdrtxg8eep.zip 13 | labels: https://utdallas.box.com/shared/static/ayd4st2wo588z2yqbuxalptxnz2qxlj5.zip 14 | -------------------------------------------------------------------------------- /config/mano_info.yaml: -------------------------------------------------------------------------------- 1 | joint_names: 2 | - wrist 3 | - thumb_mcp 4 | - thumb_pip 5 | - thumb_dip 6 | - thumb_tip 7 | - index_mcp 8 | - index_pip 9 | - index_dip 10 | - index_tip 11 | - middle_mcp 12 | - middle_pip 13 | - middle_dip 14 | - middle_tip 15 | - ring_mcp 16 | - ring_pip 17 | - ring_dip 18 | - ring_tip 19 | - little_mcp 20 | - little_pip 21 | - little_dip 22 | - little_tip 23 | joint_connections: 24 | - - 0 25 | - 1 26 | - - 1 27 | - 2 28 | - - 2 29 | - 3 30 | - - 3 31 | - 4 32 | - - 0 33 | - 5 34 | - - 5 35 | - 6 36 | - - 6 37 | - 7 38 | - - 7 39 | - 8 40 | - - 0 41 | - 9 42 | - - 9 43 | - 10 44 | - - 10 45 | - 11 46 | - - 11 47 | - 12 48 | - - 0 49 | - 13 50 | - - 13 51 | - 14 52 | - - 14 53 | - 15 54 | - - 15 55 | - 16 56 | - - 0 57 | - 17 58 | - - 17 59 | - 18 60 | - - 18 61 | - 19 62 | - - 19 63 | - 20 64 | -------------------------------------------------------------------------------- /datasets/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | -------------------------------------------------------------------------------- /examples/evaluate_hand_pose.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from hocap_toolkit.utils import * 3 | 4 | PCK_THRESH = [0.05, 0.1, 0.15, 0.2] # Distance threshold for PCK calculation 5 | 6 | r_or_l = ["left", "right"] 7 | 8 | 9 | def calculate_mpjpe_3d(predicted, ground_truth): 10 | """ 11 | Calculate the Mean Per Joint Position Error (MPJPE) between predicted and ground truth 3D joint positions. 12 | 13 | Parameters: 14 | predicted (numpy.ndarray): The predicted 3D joint positions with shape (N, J, 3). 15 | ground_truth (numpy.ndarray): The ground truth 3D joint positions with shape (N, J, 3). 16 | 17 | Returns: 18 | float: The MPJPE value. 19 | """ 20 | # Calculate the Euclidean distance between the predicted and ground truth positions for each joint 21 | errors = np.linalg.norm(predicted - ground_truth, axis=2) 22 | 23 | # Calculate the mean distance across all joints for each sample 24 | sample_mpjpes = np.mean(errors, axis=1) 25 | 26 | # Calculate the mean MPJPE across all samples 27 | mpjpe = np.mean(sample_mpjpes) 28 | 29 | return mpjpe 30 | 31 | 32 | # def calculate_pck(predicted, ground_truth, bboxes, threshold, normalize): 33 | # """ 34 | # Calculate the Percentage of Correct Keypoints (PCK) for 2D hand pose estimation. 35 | 36 | # Parameters: 37 | # predicted (numpy.ndarray): The predicted 2D joint positions with shape (N, J, 2). 38 | # ground_truth (numpy.ndarray): The ground truth 2D joint positions with shape (N, J, 2). 39 | # bboxes (numpy.ndarray): Bounding boxes of the hands with shape (N, 4). 
40 | # threshold (float): The distance threshold within which a predicted keypoint is considered correct. 41 | # normalize (numpy.ndarray): Normalization factors for distances with shape (N, 2). 42 | 43 | # Returns: 44 | # float: The PCK value (percentage of correct keypoints). 45 | # """ 46 | # N, K, _ = predicted.shape 47 | 48 | # predicted = predicted.astype(np.float32) 49 | # ground_truth = ground_truth.astype(np.float32) 50 | # normalize = normalize.astype(np.float32) 51 | 52 | # box_s = np.zeros((N, 2), dtype=np.float32) 53 | # for i in range(N): 54 | # box_s[i, 0] = bboxes[i, 2] - bboxes[i, 0] 55 | # box_s[i, 1] = bboxes[i, 3] - bboxes[i, 1] 56 | 57 | # distances = np.linalg.norm((predicted - ground_truth) / box_s[:, None, :], axis=-1) 58 | 59 | # acc = np.array([acc_distance(d, threshold) for d in distances.T]) 60 | # valid_acc = acc[acc >= 0] 61 | # cnt = len(valid_acc) 62 | # avg_acc = valid_acc.mean() if cnt > 0 else 0 63 | 64 | # return avg_acc * 100 65 | 66 | 67 | def calculate_pck(predicted, ground_truth, bboxes, thresholds, normalize): 68 | """ 69 | Calculate the Percentage of Correct Keypoints (PCK) for 2D hand pose estimation. 70 | 71 | Parameters: 72 | predicted (numpy.ndarray): The predicted 2D joint positions with shape (N, J, 2). 73 | ground_truth (numpy.ndarray): The ground truth 2D joint positions with shape (N, J, 2). 74 | bboxes (numpy.ndarray): Bounding boxes of the hands with shape (N, 4). 75 | thresholds (list[float]): A list of distance thresholds within which a predicted keypoint is considered correct. 76 | normalize (numpy.ndarray): Normalization factors for distances with shape (N, 2). 77 | 78 | Returns: 79 | dict: A dictionary where the keys are thresholds and the values are the PCK values for each threshold. 80 | """ 81 | N, K, _ = predicted.shape 82 | 83 | predicted = predicted.astype(np.float32) 84 | ground_truth = ground_truth.astype(np.float32) 85 | normalize = normalize.astype(np.float32) 86 | 87 | box_s = np.zeros((N, 2), dtype=np.float32) 88 | for i in range(N): 89 | box_s[i, 0] = bboxes[i, 2] - bboxes[i, 0] 90 | box_s[i, 1] = bboxes[i, 3] - bboxes[i, 1] 91 | 92 | # Normalize the predicted and ground truth keypoints 93 | distances = np.linalg.norm((predicted - ground_truth) / box_s[:, None, :], axis=-1) 94 | 95 | pck_results = [] 96 | for threshold in thresholds: 97 | acc = np.array([acc_distance(d, threshold) for d in distances.T]) 98 | valid_acc = acc[acc >= 0] 99 | cnt = len(valid_acc) 100 | avg_acc = valid_acc.mean() if cnt > 0 else 0 101 | pck_results.append(avg_acc) 102 | 103 | return pck_results 104 | 105 | 106 | def acc_distance(distances, thr=0.5): 107 | """ 108 | Return the percentage below the distance threshold, while ignoring 109 | distances values with -1. 110 | 111 | Parameters: 112 | distances (np.ndarray[N, ]): The normalized distances. 113 | thr (float): Threshold of the distances. 114 | 115 | Returns: 116 | float: Percentage of distances below the threshold. 117 | If all target keypoints are missing, return -1. 
118 | """ 119 | distance_valid = distances != -1 120 | num_distance_valid = distance_valid.sum() 121 | if num_distance_valid > 0: 122 | return (distances[distance_valid] < thr).sum() / num_distance_valid 123 | return -1 124 | 125 | 126 | def get_hand_pose_evaluation(gt_file, pred_file): 127 | gt_result_file = Path(gt_file) 128 | pred_result_file = Path(pred_file) 129 | 130 | hand_json = read_data_from_json(gt_result_file) 131 | hamer_out_json = read_data_from_json(pred_result_file) 132 | 133 | all_pred_keypoints_3d = [] 134 | all_gt_keypoints_3d_full = [] 135 | all_gt_keypoints_2d_full = [] 136 | all_pred_keypoints_2d_full = [] 137 | all_gt_bboxes = [] 138 | 139 | for out_id, out_data in hamer_out_json.items(): 140 | if not out_data: 141 | continue 142 | gt_data = hand_json[out_id] 143 | is_right = np.array(out_data["is_right"], dtype=bool) 144 | pred_keypoints_2d_full = out_data["landmarks_2d"] 145 | pred_keypoints_3d = out_data["landmarks_3d"] 146 | 147 | gt_keypoints_2d_full = [] 148 | gt_bboxes = [] 149 | gt_keypoints_3d_full = [] 150 | pred_keypoints_3d_r_and_l = [] 151 | pred_keypoints_2d_r_and_l = [] 152 | 153 | for n in range(is_right.shape[0]): 154 | rl = r_or_l[int(is_right[n])] 155 | 156 | gt_s_keypoints_2d_full = gt_data["landmarks_2d"][rl] 157 | gt_keypoints_2d_full.append(np.array(gt_s_keypoints_2d_full)) 158 | gt_s_bboxes = np.array(gt_data["bbox"][rl]) 159 | gt_bboxes.append(gt_s_bboxes) 160 | 161 | gt_s_keypoints_3d_full = gt_data["landmarks_3d"][rl] 162 | gt_keypoints_3d_full.append(np.array(gt_s_keypoints_3d_full)) 163 | 164 | pred_keypoints_2d_r_and_l.append(np.array(pred_keypoints_2d_full[rl])) 165 | 166 | gt_keypoints_2d_full = np.stack(gt_keypoints_2d_full) 167 | gt_bboxes = np.stack(gt_bboxes) 168 | 169 | gt_keypoints_3d_full = np.stack(gt_keypoints_3d_full) * 1000 170 | 171 | for n in range(is_right.shape[0]): 172 | rl = r_or_l[int(is_right[n])] 173 | pred_keypoints_3d[rl] = np.array(pred_keypoints_3d[rl]) 174 | if not is_right[n]: 175 | pred_keypoints_3d[rl][:, 0] = -pred_keypoints_3d[rl][:, 0] 176 | 177 | align = pred_keypoints_3d[rl][0] - gt_keypoints_3d_full[n][0] 178 | gt_keypoints_3d_full[n] += align 179 | 180 | pred_keypoints_3d_r_and_l.append(pred_keypoints_3d[rl]) 181 | 182 | all_pred_keypoints_3d.append(np.stack(pred_keypoints_3d_r_and_l)) 183 | all_gt_keypoints_3d_full.append(gt_keypoints_3d_full) 184 | 185 | all_pred_keypoints_2d_full.append(np.stack(pred_keypoints_2d_r_and_l)) 186 | all_gt_keypoints_2d_full.append(gt_keypoints_2d_full) 187 | 188 | all_gt_bboxes.append(gt_bboxes) 189 | 190 | all_pred_keypoints_3d = np.concatenate(all_pred_keypoints_3d, axis=0) 191 | all_gt_keypoints_3d_full = np.concatenate(all_gt_keypoints_3d_full, axis=0) 192 | 193 | all_pred_keypoints_2d_full = np.concatenate(all_pred_keypoints_2d_full, axis=0) 194 | all_gt_keypoints_2d_full = np.concatenate(all_gt_keypoints_2d_full, axis=0) 195 | 196 | all_gt_bboxes = np.concatenate(all_gt_bboxes, axis=0) 197 | 198 | # Calculate PCK for each threshold in PCK_THRESH 199 | pcks = calculate_pck( 200 | all_pred_keypoints_2d_full, 201 | all_gt_keypoints_2d_full, 202 | all_gt_bboxes, 203 | PCK_THRESH, 204 | normalize=np.ones((len(all_pred_keypoints_2d_full), 2)), 205 | ) 206 | 207 | # Calculate MPJPE 208 | mpjpe = calculate_mpjpe_3d(all_pred_keypoints_3d, all_gt_keypoints_3d_full) 209 | 210 | # Prepare data for the DataFrame 211 | pd_data = {} 212 | for i, thresh in enumerate(PCK_THRESH): 213 | pd_data[f"PCK ({thresh:.2f})"] = pcks[i] * 100 214 | 215 | pd_data["MPJPE (mm)"] = mpjpe 216 | 
217 | # Convert the data to a DataFrame and print it 218 | df = pd.DataFrame([pd_data]) 219 | result_str = df.to_string(index=False) 220 | 221 | print(result_str) 222 | 223 | # save to txt 224 | save_txt_file = pred_result_file.parent / f"{pred_result_file.stem}_pck_mpjpe.txt" 225 | save_txt_file.write_text(result_str) 226 | tqdm.write(f" * Results saved to {save_txt_file}") 227 | 228 | 229 | if __name__ == "__main__": 230 | gt_file = "config/benchmarks/hpe_gt.json" 231 | pred_file = "results/hpe_demo.json" 232 | 233 | tqdm.write("- Evaluating Hand Pose Estimation results...") 234 | get_hand_pose_evaluation(gt_file, pred_file) 235 | tqdm.write("- Evaluation Done...") 236 | -------------------------------------------------------------------------------- /examples/evaluate_object_detection.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from pycocotools.coco import COCO 3 | from pycocotools.cocoeval import COCOeval 4 | from hocap_toolkit.utils import * 5 | 6 | 7 | def evaluate_object_detection_results(gt_file, pred_file): 8 | coco_gt = COCO(str(gt_file)) 9 | coco_dt = coco_gt.loadRes(str(pred_file)) 10 | 11 | coco_eval = COCOeval(coco_gt, coco_dt, "bbox") 12 | coco_eval.evaluate() 13 | coco_eval.accumulate() 14 | coco_eval.summarize() 15 | 16 | ap_metrics = { 17 | "AP": coco_eval.stats[0], 18 | "AP50": coco_eval.stats[1], 19 | "AP75": coco_eval.stats[2], 20 | "APs": coco_eval.stats[3], 21 | "APm": coco_eval.stats[4], 22 | "APl": coco_eval.stats[5], 23 | "AR1": coco_eval.stats[6], 24 | "AR10": coco_eval.stats[7], 25 | "AR100": coco_eval.stats[8], 26 | "ARs": coco_eval.stats[9], 27 | "ARm": coco_eval.stats[10], 28 | "ARl": coco_eval.stats[11], 29 | } 30 | print( 31 | f"AP: {ap_metrics['AP']:.3f} | AP_50: {ap_metrics['AP50']:.3f} | AP_75: {ap_metrics['AP75']:.3f} | AP_s: {ap_metrics['APs']:.3f} | AP_m: {ap_metrics['APm']:.3f} | AP_l: {ap_metrics['APl']:.3f}" 32 | ) 33 | 34 | # Save to csv 35 | df = pd.DataFrame([ap_metrics]) 36 | save_csv_file = Path(pred_file).parent / f"{Path(pred_file).stem}_ap.csv" 37 | df.to_csv(save_csv_file, index=False) 38 | 39 | # Save to txt 40 | str_metrics = [ 41 | f" Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = {coco_eval.stats[0]*100:.2f}", 42 | f" Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = {coco_eval.stats[1]*100:.2f}", 43 | f" Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = {coco_eval.stats[2]*100:.2f}", 44 | f" Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = {coco_eval.stats[3]*100:.2f}", 45 | f" Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = {coco_eval.stats[4]*100:.2f}", 46 | f" Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = {coco_eval.stats[5]*100:.2f}", 47 | f" Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = {coco_eval.stats[6]*100:.2f}", 48 | f" Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = {coco_eval.stats[7]*100:.2f}", 49 | f" Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = {coco_eval.stats[8]*100:.2f}", 50 | f" Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = {coco_eval.stats[9]*100:.2f}", 51 | f" Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = {coco_eval.stats[10]*100:.2f}", 52 | f" Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = {coco_eval.stats[11]*100:.2f}", 53 | ] 54 | str_metrics = "\n".join(str_metrics) 55 | save_txt_file = 
Path(pred_file).parent / f"{Path(pred_file).stem}_ap.txt" 56 | save_txt_file.write_text(str_metrics) 57 | print(f"AP metrics saved to '{save_csv_file}' and '{save_txt_file}'") 58 | 59 | 60 | if __name__ == "__main__": 61 | gt_file = "config/benchmarks/odet_gt.json" 62 | pred_file = "results/odet_demo.json" 63 | 64 | tqdm.write("- Evaluating Object Detection results...") 65 | evaluate_object_detection_results(gt_file, pred_file) 66 | tqdm.write("- Evaluation Done...") 67 | -------------------------------------------------------------------------------- /examples/evaluate_object_pose.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from scipy.spatial import cKDTree 3 | from hocap_toolkit.utils import * 4 | 5 | PROJ_ROOT = Path(__file__).parent.parent 6 | 7 | 8 | def to_homo(pts): 9 | """ 10 | @pts: (N,3 or 2) will homogeneliaze the last dimension 11 | """ 12 | assert len(pts.shape) == 2, f"pts.shape: {pts.shape}" 13 | homo = np.concatenate((pts, np.ones((pts.shape[0], 1))), axis=-1) 14 | return homo 15 | 16 | 17 | def add_err(pred, gt, model_pts): 18 | """ 19 | Average Distance of Model Points for objects with no indistinguishable views 20 | - by Hinterstoisser et al. (ACCV 2012). 21 | """ 22 | pred_pts = (pred @ to_homo(model_pts).T).T[:, :3] 23 | gt_pts = (gt @ to_homo(model_pts).T).T[:, :3] 24 | e = np.linalg.norm(pred_pts - gt_pts, axis=1).mean() 25 | return e 26 | 27 | 28 | def adi_err(pred, gt, model_pts): 29 | """ 30 | @pred: 4x4 mat 31 | @gt: 32 | @model: (N,3) 33 | """ 34 | pred_pts = (pred @ to_homo(model_pts).T).T[:, :3] 35 | gt_pts = (gt @ to_homo(model_pts).T).T[:, :3] 36 | nn_index = cKDTree(pred_pts) 37 | nn_dists, _ = nn_index.query(gt_pts, k=1, workers=-1) 38 | e = nn_dists.mean() 39 | return e 40 | 41 | 42 | def compute_auc(rec, max_val=0.1): 43 | """ 44 | Compute the Area Under Curve (AUC) for precision-recall curve up to a maximum recall value. 45 | 46 | This function calculates the AUC considering only the part of the precision-recall curve 47 | where the recall value is less than `max_val`. This is useful for scenarios where recall beyond 48 | a certain threshold is not relevant. 49 | 50 | Parameters: 51 | - rec (list or np.array): The recall values for different thresholds. 52 | - max_val (float): The maximum recall value to consider for AUC calculation. 53 | 54 | Returns: 55 | - float: The computed AUC value. 
56 | 57 | Reference: 58 | - https://github.com/wenbowen123/iros20-6d-pose-tracking/blob/main/eval_ycb.py 59 | """ 60 | if len(rec) == 0: 61 | return 0 62 | 63 | rec = np.sort(np.array(rec)) 64 | n = len(rec) 65 | 66 | # Compute precision values based on the recall array 67 | prec = np.arange(1, n + 1) / n 68 | 69 | # Filter recall and precision arrays to include only recall values less than `max_val` 70 | valid_indices = np.where(rec < max_val)[0] 71 | rec = rec[valid_indices] 72 | prec = prec[valid_indices] 73 | 74 | # Prepare modified recall and precision arrays for AUC calculation 75 | mrec = np.concatenate(([0], rec, [max_val])) 76 | mpre = np.concatenate(([0], prec, [prec[-1] if len(prec) > 0 else 0])) 77 | 78 | # Ensure precision is non-decreasing 79 | for i in range(1, len(mpre)): 80 | mpre[i] = max(mpre[i], mpre[i - 1]) 81 | 82 | # Calculate the differences in recall 83 | i = np.where(mrec[1:] != mrec[:-1])[0] + 1 84 | ap = np.sum((mrec[i] - mrec[i - 1]) * mpre[i]) 85 | 86 | return ap / max_val 87 | 88 | 89 | def get_object_pose_evaluation(gt_file, pred_file): 90 | gt_results_file = Path(gt_file) 91 | pred_results_file = Path(pred_file) 92 | 93 | gt_poses = read_data_from_json(gt_results_file) 94 | pred_poses = read_data_from_json(pred_results_file) 95 | object_ids = sorted(pred_poses.keys()) 96 | 97 | pd_data = { 98 | "Object_ID": [], 99 | "ADD-S_err (cm)": [], 100 | "ADD_err (cm)": [], 101 | "ADD-S_AUC (%)": [], 102 | "ADD_AUC (%)": [], 103 | } 104 | adi_errs = [] 105 | add_errs = [] 106 | 107 | for object_id in tqdm(object_ids, total=len(object_ids), ncols=60): 108 | if object_id not in gt_poses: 109 | continue 110 | 111 | object_mesh = trimesh.load( 112 | PROJ_ROOT / "datasets" / f"models/{object_id}/cleaned_mesh_10000.obj", 113 | process=False, 114 | ) 115 | vertices = object_mesh.vertices.astype(np.float32) 116 | 117 | adi_errs_obj = [] 118 | add_errs_obj = [] 119 | for key in sorted(pred_poses[object_id].keys()): 120 | if key not in gt_poses[object_id]: 121 | continue 122 | 123 | gt_ob_in_cam = np.array(gt_poses[object_id][key], dtype=np.float32) 124 | pred_ob_in_cam = np.array(pred_poses[object_id][key], dtype=np.float32) 125 | 126 | adi = adi_err(pred_ob_in_cam, gt_ob_in_cam, vertices.copy()) 127 | add = add_err(pred_ob_in_cam, gt_ob_in_cam, vertices.copy()) 128 | 129 | adi_errs_obj.append(adi) 130 | add_errs_obj.append(add) 131 | 132 | adi_errs.append(adi) 133 | add_errs.append(add) 134 | 135 | ADDS_ERR = np.mean(adi_errs_obj) * 100 136 | ADD_ERR = np.mean(add_errs_obj) * 100 137 | ADDS_AUC = compute_auc(adi_errs_obj, max_val=0.1) * 100 138 | ADD_AUC = compute_auc(add_errs_obj, max_val=0.1) * 100 139 | 140 | pd_data["Object_ID"].append(object_id) 141 | pd_data["ADD-S_err (cm)"].append(ADDS_ERR) 142 | pd_data["ADD_err (cm)"].append(ADD_ERR) 143 | pd_data["ADD-S_AUC (%)"].append(ADDS_AUC) 144 | pd_data["ADD_AUC (%)"].append(ADD_AUC) 145 | 146 | # Average 147 | ADDS_ERR = np.mean(adi_errs) * 100 148 | ADD_ERR = np.mean(add_errs) * 100 149 | ADDS_AUC = compute_auc(adi_errs, max_val=0.1) * 100 150 | ADD_AUC = compute_auc(add_errs, max_val=0.1) * 100 151 | pd_data["Object_ID"].append("Average") 152 | pd_data["ADD-S_err (cm)"].append(ADDS_ERR) 153 | pd_data["ADD_err (cm)"].append(ADD_ERR) 154 | pd_data["ADD-S_AUC (%)"].append(ADDS_AUC) 155 | pd_data["ADD_AUC (%)"].append(ADD_AUC) 156 | 157 | df = pd.DataFrame(pd_data) 158 | 159 | # Save to csv 160 | save_csv_file = pred_results_file.parent / f"{pred_results_file.stem}_add_adds.csv" 161 | df.to_csv(save_csv_file, index=False) 162 
| 163 | # Save to txt 164 | iStr = "{:>15} {:>15} {:>15} {:>15} {:>15}" 165 | result_str = [ 166 | iStr.format( 167 | "Object_ID", 168 | "ADD-S_err (cm)", 169 | "ADD_err (cm)", 170 | "ADD-S_AUC (%)", 171 | "ADD_AUC (%)", 172 | ), 173 | iStr.format( 174 | "|" + "-" * 14, 175 | "|" + "-" * 14, 176 | "|" + "-" * 14, 177 | "|" + "-" * 14, 178 | "|" + "-" * 14 + " |", 179 | ), 180 | ] 181 | for i in range(len(pd_data["Object_ID"])): 182 | result_str.append( 183 | iStr.format( 184 | pd_data["Object_ID"][i], 185 | f"{pd_data['ADD-S_err (cm)'][i]:.2f}", 186 | f"{pd_data['ADD_err (cm)'][i]:.2f}", 187 | f"{pd_data['ADD-S_AUC (%)'][i]:.2f}", 188 | f"{pd_data['ADD_AUC (%)'][i]:.2f}", 189 | ) 190 | ) 191 | result_str = "\n".join(result_str) 192 | save_txt_file = pred_results_file.parent / f"{pred_results_file.stem}_add_adds.txt" 193 | save_txt_file.write_text(result_str) 194 | tqdm.write(f" * Results saved to {save_csv_file}, {save_txt_file}") 195 | 196 | print(result_str) 197 | 198 | 199 | if __name__ == "__main__": 200 | gt_file = "config/benchmarks/ope_gt.json" 201 | pred_file = "results/ope_demo.json" 202 | 203 | tqdm.write(f"- Evaluating Object Pose Estimation results...") 204 | get_object_pose_evaluation(gt_file, pred_file) 205 | 206 | tqdm.write("- Evaluation Done...") 207 | -------------------------------------------------------------------------------- /examples/image_label_viewer.py: -------------------------------------------------------------------------------- 1 | """Example of visualizing hand and object poses of one frame in a sequence.""" 2 | 3 | import os 4 | 5 | os.environ["PYOPENGL_PLATFORM"] = "egl" # GPU-based offscreen rendering 6 | 7 | from hocap_toolkit.utils import * 8 | from hocap_toolkit.loaders import SequenceLoader 9 | from hocap_toolkit.renderers import OffscreenRenderer 10 | 11 | PROJ_ROOT = Path(__file__).parent.parent 12 | 13 | 14 | if __name__ == "__main__": 15 | sequence_folder = PROJ_ROOT / "datasets/HOCap/subject_2/20231022_201449" 16 | 17 | data_loader = SequenceLoader(str(sequence_folder), device="cuda") 18 | rs_serials = data_loader.rs_serials 19 | rs_height = data_loader.rs_height 20 | rs_width = data_loader.rs_width 21 | num_frames = data_loader.num_frames 22 | mano_sides = data_loader.mano_sides 23 | obj_meshes = [trimesh.load(p) for p in data_loader.object_textured_mesh_files] 24 | 25 | # Initialize renderer 26 | renderer = OffscreenRenderer(rs_width, rs_height) 27 | 28 | for frame_id in range(num_frames): 29 | for serial in rs_serials: 30 | image_color = data_loader.get_rgb_image(frame_id, serial) 31 | image_label = data_loader.get_image_label(frame_id, serial) 32 | 33 | if image_label: 34 | cam_K = image_label["cam_K"] 35 | obj_poses = image_label["obj_poses"] 36 | hand_joints_3d = image_label["hand_joints_3d"] 37 | hand_joints_2d = image_label["hand_joints_2d"] 38 | segmentation_mask = image_label["seg_mask"] 39 | obj_class_inds = image_label["obj_class_inds"].astype(int) 40 | obj_class_names = image_label["obj_class_names"].astype(str) 41 | 42 | # Render object poses 43 | render_color, render_depth = renderer.get_render_image( 44 | obj_meshes, obj_poses, cam_K 45 | ) 46 | image_pose = draw_image_overlay(image_color, render_color) 47 | 48 | # Draw hand joints 49 | image_handmarks = image_color.copy() 50 | for idx, marks in enumerate(hand_joints_2d): 51 | side = mano_sides[idx] 52 | image_handmarks = draw_hand_landmarks(image_handmarks, marks, side) 53 | 54 | # Draw segmentation visualization 55 | image_seg = np.zeros_like(image_color) 56 | for idx in 
np.unique(segmentation_mask): 57 | if idx == 0: # skip background 58 | continue 59 | image_seg[segmentation_mask == idx] = HO_CAP_SEG_COLOR[idx].rgb 60 | image_seg = draw_image_overlay(image_color, image_seg) 61 | 62 | labels_vis = draw_image_grid( 63 | [image_pose, image_handmarks, image_seg], 64 | ["ObjectPose", "Handmarks", "Segmentation"], 65 | ) 66 | 67 | # Display visualization 68 | plt.imshow(labels_vis) 69 | plt.title(f"{serial} - frame_{frame_id:06d}") 70 | plt.axis("off") 71 | plt.tight_layout() 72 | plt.show() 73 | plt.close() 74 | 75 | exit() 76 | -------------------------------------------------------------------------------- /examples/sequence_3d_viewer.py: -------------------------------------------------------------------------------- 1 | import open3d as o3d 2 | import open3d.core as o3c 3 | import open3d.visualization.gui as gui 4 | import open3d.visualization.rendering as rendering 5 | 6 | from time import sleep 7 | from torch.utils import dlpack 8 | from hocap_toolkit.utils import * 9 | from hocap_toolkit.loaders import SequenceLoader 10 | from hocap_toolkit.layers import MANOGroupLayer 11 | 12 | PROJ_ROOT = Path(__file__).parents[1] 13 | 14 | HELP_INFO = """ 15 | ============================= 16 | Keyboard commands: 17 | ============================= 18 | H: display control panel 19 | SPACE: pause 20 | Q: quit 21 | R: reset camera 22 | ============================= 23 | """ 24 | 25 | 26 | class SequenceViewer: 27 | def __init__(self, sequence_folder, device="cuda") -> None: 28 | self._data_folder = Path(sequence_folder) 29 | self._device = device 30 | self._logger = get_logger(self.__class__.__name__) 31 | 32 | self._loader = SequenceLoader(sequence_folder, device=device) 33 | self._num_frames = self._loader.num_frames 34 | self._rs_serials = self._loader.rs_serials 35 | self._rs_master = self._loader.rs_master 36 | self._master_id = self._rs_serials.index(self._rs_master) 37 | self._num_cameras = len(self._rs_serials) 38 | self._rs_height = self._loader.rs_height 39 | self._rs_width = self._loader.rs_width 40 | self._rs_Ks = self._loader.rs_Ks.cpu().numpy() 41 | self._rs_RTs = self._loader.rs_RTs.cpu().numpy() 42 | self._mano_sides = self._loader.mano_sides 43 | 44 | self._mano_group_layer = self._init_mano_group_layer() 45 | self._mano_verts = self._get_mano_verts() 46 | self._mano_faces = self._get_mano_faces() 47 | self._mano_colors = self._get_mano_colors() 48 | 49 | self._poses_o = self._load_poses_o() 50 | 51 | def run(self): 52 | self._is_done = False 53 | self._frame_id = -1 54 | 55 | # rendering settings 56 | self._bg_color = (0.0, 0.0, 0.0, 1.0) # black 57 | self._point_size = 1 58 | self._update_flag = ( 59 | rendering.Scene.UPDATE_POINTS_FLAG | rendering.Scene.UPDATE_COLORS_FLAG 60 | ) # update points and colors 61 | 62 | # control flags 63 | self._cropped = False # crop points 64 | self._is_paused = False # pause 65 | self._show_skybox = False # show skybox background 66 | self._show_axes = False # show axes frame 67 | self._show_pcds = True # show point clouds 68 | self._show_mano = False # show mano mesh 69 | self._show_object = False # show object mesh 70 | self._cam_id = self._rs_serials.index(self._rs_master) # camera view 71 | 72 | # materials 73 | self._mat_pcd = rendering.MaterialRecord() 74 | self._mat_pcd.shader = "defaultUnlit" 75 | self._mat_pcd.point_size = self._point_size 76 | self._mat_mesh = rendering.MaterialRecord() 77 | self._mat_mesh.shader = "defaultUnlit" 78 | self._mat_line = rendering.MaterialRecord() 79 | self._mat_line.shader = 
"unlitLine" 80 | 81 | # dummy geometry 82 | zeros = o3c.Tensor.zeros( 83 | (self._rs_width * self._rs_height * self._num_cameras, 3), dtype=o3c.float32 84 | ) 85 | self._pcd = o3d.t.geometry.PointCloud() 86 | self._pcd.point.positions = zeros 87 | self._pcd.point.colors = zeros 88 | self._pcd.point.normals = zeros 89 | 90 | mano_mesh = o3d.geometry.TriangleMesh() 91 | mano_mesh.vertices = o3d.utility.Vector3dVector(self._mano_verts[0].numpy()) 92 | mano_mesh.triangles = o3d.utility.Vector3iVector(self._mano_faces) 93 | mano_mesh.vertex_colors = o3d.utility.Vector3dVector(self._mano_colors) 94 | mano_mesh.compute_vertex_normals() 95 | mano_ls = o3d.geometry.LineSet.create_from_triangle_mesh(mano_mesh) 96 | mano_ls.paint_uniform_color((0.0, 0.0, 0.0)) # black 97 | self._mano_mesh = o3d.t.geometry.TriangleMesh.from_legacy(mano_mesh) 98 | self._mano_ls = o3d.t.geometry.LineSet.from_legacy(mano_ls) 99 | 100 | # init gui 101 | self._app = gui.Application.instance 102 | self._app.initialize() 103 | 104 | # create window 105 | self._window = self._create_window() 106 | 107 | # set callbacks 108 | self._window.set_on_layout(self._on_layout) 109 | self._window.set_on_key(self._on_key) 110 | self._window.set_on_close(self._on_close) 111 | 112 | # add initial dummy geometry 113 | self._widget3d.scene.add_geometry("pcd", self._pcd, self._mat_pcd) 114 | self._widget3d.scene.show_geometry("pcd", self._show_pcds) 115 | self._widget3d.scene.add_geometry("mano", self._mano_mesh, self._mat_mesh) 116 | self._widget3d.scene.add_geometry("mano_ls", self._mano_ls, self._mat_line) 117 | self._widget3d.scene.show_geometry("mano", self._show_mano) 118 | self._widget3d.scene.show_geometry("mano_ls", self._show_mano) 119 | for i, mesh_file in enumerate(self._loader.object_textured_mesh_files): 120 | self._widget3d.scene.add_model( 121 | f"object_{i}", o3d.io.read_triangle_model(mesh_file) 122 | ) 123 | self._widget3d.scene.show_geometry(f"object_{i}", self._show_object) 124 | 125 | # update camera 126 | self._reset_camera() 127 | 128 | # run 129 | self._app.run_in_thread(self.update) 130 | self._app.run() 131 | 132 | def _create_window(self, title="Sequence Viewer", width=1280, height=720): 133 | # create window 134 | window = self._app.create_window(title, width, height) 135 | 136 | ## add widget3d 137 | self._widget3d = gui.SceneWidget() 138 | self._widget3d.scene = rendering.Open3DScene(window.renderer) 139 | self._widget3d.scene.set_background(self._bg_color) 140 | self._widget3d.scene.scene.enable_sun_light(False) 141 | self._widget3d.scene.scene.enable_indirect_light(True) 142 | point_light_postions = [ 143 | np.array([0.5, 0.5, 1.0]).astype(np.float32), 144 | np.array([-0.5, 0.5, 1.0]).astype(np.float32), 145 | np.array([-0.5, -0.5, 1.0]).astype(np.float32), 146 | np.array([0.5, -0.5, 1.0]).astype(np.float32), 147 | np.array([0.5, -0.5, 0.0]).astype(np.float32), 148 | np.array([0.5, 0.5, 0.0]).astype(np.float32), 149 | np.array([-0.5, 0.5, 0.0]).astype(np.float32), 150 | np.array([-0.5, -0.5, 0.0]).astype(np.float32), 151 | ] 152 | for idx, pos in enumerate(point_light_postions): 153 | self._widget3d.scene.scene.add_point_light( 154 | name=f"light_{idx}", 155 | color=np.array([1.0, 1.0, 1.0]).astype(np.float32), 156 | position=pos, 157 | intensity=1e6, 158 | falloff=1e2, 159 | cast_shadows=False, 160 | ) 161 | 162 | view = self._widget3d.scene.view 163 | view.set_post_processing(False) 164 | window.add_child(self._widget3d) 165 | 166 | ## add settings panel 167 | em = window.theme.font_size 168 | margin = 0.25 
* em 169 | self._panel = gui.Vert(margin, gui.Margins(margin, margin, margin, margin)) 170 | 171 | ### render settings 172 | settings = gui.CollapsableVert( 173 | "Render Settings", margin, gui.Margins(margin, margin, margin, margin) 174 | ) 175 | settings.set_is_open(True) 176 | render_blk = gui.VGrid(2, margin) 177 | self._bg_color_edit = gui.ColorEdit() 178 | self._bg_color_edit.color_value = gui.Color(*self._bg_color) 179 | self._bg_color_edit.set_on_value_changed(self._on_bg_color) 180 | render_blk.add_child(gui.Label("Background Color")) 181 | render_blk.add_child(self._bg_color_edit) 182 | point_size = gui.Slider(gui.Slider.INT) 183 | point_size.double_value = self._point_size 184 | point_size.set_limits(1, 10) 185 | point_size.set_on_value_changed(self._on_point_size) 186 | render_blk.add_child(gui.Label("Point Size")) 187 | render_blk.add_child(point_size) 188 | chk_box = gui.Checkbox("Show Skybox") 189 | chk_box.checked = self._show_skybox 190 | chk_box.set_on_checked(self._on_skybox) 191 | render_blk.add_child(chk_box) 192 | chk_box = gui.Checkbox("Show Axes") 193 | chk_box.checked = self._show_axes 194 | chk_box.set_on_checked(self._on_axes) 195 | render_blk.add_child(chk_box) 196 | crop_box = gui.Checkbox("Crop Points") 197 | crop_box.checked = self._cropped 198 | crop_box.set_on_checked(self._on_crop) 199 | render_blk.add_child(crop_box) 200 | settings.add_child(render_blk) 201 | self._panel.add_child(settings) 202 | ### geometry settings 203 | settings = gui.CollapsableVert( 204 | "Geometry Settings", margin, gui.Margins(margin, margin, margin, margin) 205 | ) 206 | settings.set_is_open(True) 207 | geo_blk = gui.Vert(margin, gui.Margins(margin, margin, margin, margin)) 208 | chk_box = gui.Checkbox("Point Clouds") 209 | chk_box.checked = self._show_pcds 210 | chk_box.set_on_checked(self._on_pcds) 211 | geo_blk.add_child(chk_box) 212 | chk_box = gui.Checkbox("Hand Mesh") 213 | chk_box.enabled = True 214 | chk_box.checked = self._show_mano 215 | chk_box.set_on_checked(self._on_mano) 216 | geo_blk.add_child(chk_box) 217 | chk_box = gui.Checkbox("Object Mesh") 218 | chk_box.enabled = True 219 | chk_box.checked = self._show_object 220 | chk_box.set_on_checked(self._on_object) 221 | geo_blk.add_child(chk_box) 222 | settings.add_child(geo_blk) 223 | self._panel.add_child(settings) 224 | ### progress bar 225 | bar = gui.VGrid(3, margin) 226 | self._slider = gui.Slider(gui.Slider.INT) 227 | self._slider.set_limits(0, self._num_frames - 1) 228 | self._slider.set_on_value_changed(self._on_progress_slider) 229 | self._num_edit = gui.NumberEdit(gui.NumberEdit.INT) 230 | self._num_edit.set_limits(0, self._num_frames - 1) 231 | self._num_edit.set_on_value_changed(self._on_progress_slider) 232 | bar.add_child(gui.Label("Frame Slider")) 233 | bar.add_child(self._slider) 234 | bar.add_child(self._num_edit) 235 | self._panel.add_child(bar) 236 | ### reset button 237 | btns = gui.Horiz(margin, gui.Margins(margin, margin, margin, margin)) 238 | botton1 = gui.Button("Reset") 239 | botton1.set_on_clicked(self._on_reset) 240 | botton2 = gui.Button("Pause/Play") 241 | botton2.set_on_clicked(self._on_pause) 242 | botton3 = gui.Button("Exit") 243 | botton3.set_on_clicked(self._on_exit) 244 | btns.add_stretch() 245 | btns.add_child(botton1) 246 | btns.add_child(botton2) 247 | btns.add_child(botton3) 248 | btns.add_stretch() 249 | self._panel.add_child(btns) 250 | 251 | self._panel.add_stretch() 252 | #################### 253 | # add tab control 254 | self._tabs = gui.TabControl() 255 | help_tab = 
gui.Vert(margin, gui.Margins(margin, margin, margin, margin)) 256 | help_info = gui.VGrid(2, margin) 257 | help_info.add_child(gui.Label(HELP_INFO)) 258 | help_tab.add_child(help_info) 259 | self._tabs.add_tab("Settings", self._panel) 260 | self._tabs.add_tab("Help", help_tab) 261 | 262 | # add tabs 263 | window.add_child(self._tabs) 264 | 265 | return window 266 | 267 | def _on_layout(self, ctx): 268 | r = self._window.content_rect 269 | panel_size = self._tabs.calc_preferred_size(ctx, gui.Widget.Constraints()) 270 | if (r.width < self._rs_width + panel_size.width) or r.height < self._rs_height: 271 | self._window.size = gui.Size( 272 | self._rs_width + panel_size.width, self._rs_height 273 | ) 274 | self._width = r.width - panel_size.width 275 | self._height = r.height 276 | self._widget3d.frame = gui.Rect(0, 0, self._width, self._height) 277 | self._tabs.frame = gui.Rect( 278 | self._widget3d.frame.get_right(), 0, panel_size.width, self._height 279 | ) 280 | self._update_camera_K() 281 | 282 | def _on_close(self): 283 | self._is_done = True 284 | sleep(0.10) 285 | return True 286 | 287 | def _on_key(self, event): 288 | if event.key == gui.KeyName.Q: # quit 289 | if event.type == gui.KeyEvent.DOWN: 290 | self._window.close() 291 | return True 292 | 293 | if event.key == gui.KeyName.SPACE: # pause 294 | if event.type == gui.KeyEvent.DOWN: 295 | self._on_pause() 296 | return True 297 | 298 | if event.key == gui.KeyName.R: # reset camera 299 | if event.type == gui.KeyEvent.DOWN: 300 | self._reset_camera() 301 | return True 302 | 303 | return False 304 | 305 | def _on_exit(self): 306 | self._window.close() 307 | self._app.quit() 308 | 309 | def _on_pause(self): 310 | self._is_paused = not self._is_paused 311 | 312 | def _on_reset(self): 313 | self._cam_id = self._rs_serials.index(self._rs_master) 314 | self._reset_camera() 315 | self._frame_id = -1 316 | self._slider.int_value = 0 317 | self._num_edit.int_value = 0 318 | 319 | def _on_progress_slider(self, value): 320 | value = int(value) % self._num_frames 321 | self._frame_id = value 322 | self._num_edit.int_value = value 323 | 324 | def _on_bg_color(self, color): 325 | self._bg_color_edit.color_value = color 326 | self._widget3d.scene.set_background( 327 | [color.red, color.green, color.blue, color.alpha] 328 | ) 329 | 330 | def _on_skybox(self, checked): 331 | self._widget3d.scene.show_skybox(checked) 332 | 333 | def _on_axes(self, checked): 334 | self._widget3d.scene.show_axes(checked) 335 | 336 | def _on_crop(self, checked): 337 | self._cropped = checked 338 | 339 | def _on_pcds(self, checked): 340 | self._show_pcds = checked 341 | self._widget3d.scene.show_geometry("pcd", checked) 342 | 343 | def _on_mano(self, checked): 344 | self._show_mano = checked 345 | self._widget3d.scene.show_geometry("mano", checked) 346 | self._widget3d.scene.show_geometry("mano_ls", checked) 347 | 348 | def _on_object(self, checked): 349 | self._show_object = checked 350 | for i in range(len(self._loader.object_textured_mesh_files)): 351 | self._widget3d.scene.show_geometry(f"object_{i}", checked) 352 | 353 | def _on_point_size(self, value): 354 | self._mat_pcd.point_size = int(value) 355 | self._widget3d.scene.modify_geometry_material("pcd", self._mat_pcd) 356 | 357 | def _reset_camera(self): 358 | self._widget3d.scene.camera.look_at([0, 0, 0], [0, 0, 0.8], [0, -1, 0]) 359 | 360 | def _update_camera_K(self): 361 | def create_K_matrix(image_width, image_height, fov_degrees): 362 | # The principal point is at the center of the image. 
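            # Note: with cx = W / 2, the focal length computed below follows the pinhole
            # relation fx = (W / 2) / tan(FOV / 2); fy is set equal to fx (square pixels),
            # giving K = [[fx, 0, cx], [0, fy, cy], [0, 0, 1]].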
363 | cx = image_width / 2.0 364 | cy = image_height / 2.0 365 | # Compute the focal length from the field of view. 366 | fov_rad = np.deg2rad(fov_degrees) 367 | fx = fy = cx / np.tan(fov_rad / 2) 368 | # Create the intrinsic matrix. 369 | K = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]], dtype=np.float32) 370 | return K 371 | 372 | K = create_K_matrix(self._width, self._height, 90) 373 | self._widget3d.scene.camera.set_projection( 374 | K, 0.001, 1000.0, self._width, self._height 375 | ) 376 | 377 | def _update_camera_pose(self): 378 | def extrinsics_to_look_at(pose): 379 | R = pose[:3, :3] 380 | T = pose[:3, 3] 381 | # The camera's position (eye) is the negative rotation by R of T. 382 | eye = -np.matmul(R.T, T) 383 | # The center point is one unit down the z-axis in the camera's space, 384 | # then transformed to world space by the pose matrix. 385 | center = np.matmul(R.T, np.array([0, 0, 1])) + eye 386 | # The up vector is the y-axis in the camera's space, then transformed to world space by the rotation matrix. 387 | # This assumes that the y-axis is the down-direction in the camera's local space. 388 | up = np.matmul(R.T, np.array([0, -1, 0])) 389 | return center, eye, up 390 | 391 | extrinsics = self._rs_RTs[self._cam_id] 392 | center, eye, up = extrinsics_to_look_at(extrinsics) 393 | # self._widget3d.scene.camera.look_at(center, eye, up) 394 | self._widget3d.look_at(center, eye, up) 395 | 396 | def _init_mano_group_layer(self): 397 | betas = [self._loader.mano_beta.cpu().numpy() for _ in self._mano_sides] 398 | return MANOGroupLayer(self._mano_sides, betas).to(self._device) 399 | 400 | def _load_poses_m(self): 401 | poses = np.load(self._data_folder / "poses_m.npy").astype(np.float32) 402 | poses = np.concatenate( 403 | [poses[0 if side == "right" else 1] for side in self._mano_sides], axis=1 404 | ) 405 | poses = torch.from_numpy(poses).to(self._device) 406 | return poses 407 | 408 | def _load_poses_o(self): 409 | poses = np.load(self._data_folder / "poses_o.npy").astype(np.float32) 410 | poses = np.stack( 411 | [quat_to_mat(p) for p in poses], axis=1 412 | ) # (num_frames, num_objects, 4, 4) 413 | print(f"poses_o: {poses.shape}") 414 | return poses 415 | 416 | def _get_mano_verts(self): 417 | pose_file = self._data_folder / "poses_m.npy" 418 | poses = np.load(pose_file).astype(np.float32) 419 | poses = np.concatenate( 420 | [poses[0 if side == "right" else 1] for side in self._mano_sides], axis=1 421 | ) 422 | poses = torch.from_numpy(poses).to(self._device) 423 | verts, _ = self._mano_group_layer(poses) 424 | return verts.cpu() 425 | 426 | def _get_mano_faces(self): 427 | faces = [self._mano_group_layer.f.cpu().numpy()] 428 | for i, side in enumerate(self._mano_sides): 429 | faces.append(np.array(NEW_MANO_FACES[side]) + i * NUM_MANO_VERTS) 430 | faces = np.concatenate(faces, axis=0).astype(np.int64) 431 | return faces 432 | 433 | def _get_mano_colors(self): 434 | colors = np.stack( 435 | [ 436 | [HAND_COLORS[1 if side == "right" else 2].rgb_norm] * NUM_MANO_VERTS 437 | for side in self._mano_sides 438 | ] 439 | ).reshape(-1, 3) 440 | return colors 441 | 442 | def step(self): 443 | if not self._is_paused: 444 | self._frame_id = (self._frame_id + 1) % self._num_frames 445 | self._slider.int_value = self._frame_id 446 | self._num_edit.int_value = self._frame_id 447 | self._loader.step_by_frame_id(self._frame_id) 448 | 449 | def update(self): 450 | def update(): 451 | if self._show_pcds: 452 | points = self._loader.points 453 | colors = self._loader.colors 454 | masks = 
self._loader.masks 455 | if self._cropped: 456 | points[~masks] = 0.0 457 | colors[~masks] = 0.0 458 | self._pcd.point.positions = o3c.Tensor.from_dlpack( 459 | dlpack.to_dlpack(points.cpu().view((-1, 3))) 460 | ) 461 | self._pcd.point.colors = o3c.Tensor.from_dlpack( 462 | dlpack.to_dlpack(colors.cpu().view((-1, 3))) 463 | ) 464 | self._widget3d.scene.scene.update_geometry( 465 | "pcd", self._pcd, self._update_flag 466 | ) 467 | 468 | if self._show_mano: 469 | self._mano_mesh.vertex.positions = self._mano_ls.point.positions = ( 470 | o3c.Tensor.from_dlpack( 471 | dlpack.to_dlpack(self._mano_verts[self._frame_id]) 472 | ) 473 | ) 474 | self._widget3d.scene.remove_geometry("mano") 475 | self._widget3d.scene.add_geometry( 476 | "mano", self._mano_mesh, self._mat_mesh 477 | ) 478 | self._widget3d.scene.remove_geometry("mano_ls") 479 | self._widget3d.scene.add_geometry( 480 | "mano_ls", self._mano_ls, self._mat_line 481 | ) 482 | 483 | if self._show_object: 484 | for i, pose in enumerate(self._poses_o[self._frame_id]): 485 | self._widget3d.scene.set_geometry_transform(f"object_{i}", pose) 486 | 487 | while not self._is_done: 488 | sleep(0.067) 489 | if not self._is_done: 490 | self.step() 491 | self._app.post_to_main_thread(self._window, update) 492 | 493 | 494 | if __name__ == "__main__": 495 | sequence_folder = "datasets/subject_1/20231025_165502" 496 | device = "cuda" if torch.cuda.is_available() else "cpu" 497 | 498 | viewer = SequenceViewer(sequence_folder, device=device) 499 | viewer.run() 500 | -------------------------------------------------------------------------------- /examples/sequence_pose_viewer.py: -------------------------------------------------------------------------------- 1 | """Example of visualizing hand and object poses of one frame in a sequence.""" 2 | 3 | import os 4 | 5 | os.environ["PYOPENGL_PLATFORM"] = "egl" # GPU-based offscreen rendering 6 | 7 | from hocap_toolkit.utils import * 8 | from hocap_toolkit.renderers import SequenceRenderer 9 | 10 | PROJ_ROOT = Path(__file__).parent.parent 11 | 12 | if __name__ == "__main__": 13 | sequence_folder = PROJ_ROOT / "datasets/subject_1/20231025_165502" 14 | renderer = SequenceRenderer(sequence_folder, device="cuda") 15 | 16 | frame_id = 70 17 | 18 | # Render the scene and get the rendered images 19 | renderer.create_scene(frame_id) 20 | render_colors = renderer.get_render_colors() 21 | 22 | # Display the rendered images 23 | overlays = [ 24 | cv2.addWeighted( 25 | renderer.get_rgb_image(frame_id, serial), 0.4, render_color, 0.6, 0 26 | ) 27 | for serial, render_color in render_colors.items() 28 | ] 29 | 30 | draw_all_camera_images(overlays, list(render_colors.keys()), show_only=True) 31 | -------------------------------------------------------------------------------- /examples/sequence_renderer.py: -------------------------------------------------------------------------------- 1 | """Example of rendering a sequence.""" 2 | 3 | import os 4 | 5 | os.environ["PYOPENGL_PLATFORM"] = "egl" # GPU-based offscreen rendering 6 | 7 | from hocap_toolkit.utils import * 8 | from hocap_toolkit.renderers import SequenceRenderer 9 | 10 | 11 | if __name__ == "__main__": 12 | sequence_folder = "datasets/subject_1/20231025_165502" 13 | 14 | renderer = SequenceRenderer(sequence_folder, device="cuda") 15 | 16 | for frame_id in tqdm(range(renderer.num_frames), desc="Rendering", ncols=80): 17 | # Render the scene and get the rendered images 18 | renderer.create_scene(frame_id) 19 | render_colors = renderer.get_render_colors() 20 | 
render_masks = renderer.get_render_masks() 21 | overlays = { 22 | serial: cv2.addWeighted( 23 | renderer.get_rgb_image(frame_id, serial), 0.4, render_color, 0.6, 0 24 | ) 25 | for serial, render_color in render_colors.items() 26 | } 27 | 28 | # Save the rendered images 29 | for serial in render_colors: 30 | save_folder = Path(sequence_folder) / "renders" / serial 31 | save_folder.mkdir(parents=True, exist_ok=True) 32 | write_rgb_image(save_folder / f"vis_{frame_id:06d}.png", overlays[serial]) 33 | write_rgb_image( 34 | save_folder / f"color_{frame_id:06d}.png", render_colors[serial] 35 | ) 36 | write_mask_image( 37 | save_folder / f"seg_{frame_id:06d}.png", render_masks[serial] 38 | ) 39 | -------------------------------------------------------------------------------- /hocap_toolkit/benchmarks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IRVLUTD/HO-Cap/723ae6f8f5291f074ae309f179eb8556f67ffd19/hocap_toolkit/benchmarks/__init__.py -------------------------------------------------------------------------------- /hocap_toolkit/benchmarks/groundtruth_generator.py: -------------------------------------------------------------------------------- 1 | from ..utils import * 2 | from ..utils.common import * 3 | from ..loaders import SequenceLoader 4 | 5 | 6 | class BenchmarkGTGenerator: 7 | def __init__(self): 8 | self._data_root = PROJ_ROOT / "data" 9 | 10 | def generate_hand_pose_gt(self): 11 | keys_file = ( 12 | self._data_root / "data/benchmarks/hand_pose_benchmark_gt_demo_keys.json" 13 | ) 14 | keys = read_data_from_json(keys_file) 15 | -------------------------------------------------------------------------------- /hocap_toolkit/factory/__init__.py: -------------------------------------------------------------------------------- 1 | from .dataset_factory import HOCapFactory 2 | -------------------------------------------------------------------------------- /hocap_toolkit/factory/dataset_factory.py: -------------------------------------------------------------------------------- 1 | import pycocotools.mask as mask_util 2 | from hocap_toolkit.utils import * 3 | 4 | PROJ_ROOT = Path(__file__).parents[2] 5 | HOCAP_DATASET_ROOT = PROJ_ROOT / "datasets" 6 | 7 | HOCAP_INFO = read_data_from_yaml(PROJ_ROOT / "config/hocap_info.yaml") 8 | 9 | # The train/valid/test split is defined separately for each task (HPE, ODET, OPE) 10 | # - The split is defined as a list of items 11 | # - Each item is a list in the format [subject_index, sequence_index, camera_index, frame_index] 12 | # - For example, [0, 0, 0, 0] refers "subject_1/20231022_190534/105322251564" folder and frame "color_000000.jpg" & "depth_000000.png" 13 | HPE_CONFIG = read_data_from_json(PROJ_ROOT / "config/hocap_hpe.json") 14 | ODET_CONFIG = read_data_from_json(PROJ_ROOT / "config/hocap_odet.json") 15 | OPE_CONFIG = read_data_from_json(PROJ_ROOT / "config/hocap_ope.json") 16 | 17 | COCO_CATEGORIES = [ 18 | { 19 | "id": i + 1, 20 | "name": obj_class, 21 | "supercategory": "object", 22 | } 23 | for i, obj_class in enumerate(HOCAP_INFO["object_classes"]) 24 | if "HAND" not in obj_class 25 | ] 26 | 27 | YOLO_CLASSES = [ 28 | obj_class for obj_class in HOCAP_INFO["object_classes"] if "HAND" not in obj_class 29 | ] 30 | 31 | 32 | class HOCapFactory: 33 | def __init__(self) -> None: 34 | self._logger = get_logger(__class__.__name__) 35 | 36 | self._calib_dir = HOCAP_DATASET_ROOT / "calibration" 37 | self._models_dir = HOCAP_DATASET_ROOT / "models" 38 | self._rs_width 
= 640 39 | self._rs_height = 480 40 | self._mano_betas = [ 41 | self._read_mano_beta(sub_id) for sub_id in HOCAP_INFO["subject_ids"] 42 | ] 43 | self._rs_RTs = self._load_rs_cam_RTs() 44 | self._rs_RTs_inv = [np.linalg.inv(RT) for RT in self._rs_RTs] 45 | 46 | def _world_mano_pose_to_camera(self, mano_pose, cam_RT_inv): 47 | if np.all(mano_pose == -1): 48 | return mano_pose 49 | 50 | pose_c = mano_pose.copy() 51 | rvt_w = np.concatenate([pose_c[:3], pose_c[-3:]], axis=0) 52 | mat_w = rvt_to_mat(rvt_w) 53 | mat_c = cam_RT_inv @ mat_w 54 | rvt_c = mat_to_rvt(mat_c) 55 | pose_c[:3] = rvt_c[:3] 56 | pose_c[-3:] = rvt_c[-3:] 57 | return pose_c 58 | 59 | def _read_mano_beta(self, sub_id): 60 | file_path = self._calib_dir / "mano" / f"{sub_id}.yaml" 61 | mano_data = read_data_from_yaml(file_path) 62 | mano_betas = np.array(mano_data["betas"]).astype(np.float32) 63 | return mano_betas 64 | 65 | def _load_pose_m(self, sub_id, seq_id): 66 | file_path = HOCAP_DATASET_ROOT / sub_id / seq_id / "poses_m.npy" 67 | poses_m = np.load(file_path).astype(np.float32) 68 | return poses_m 69 | 70 | def _load_rs_cam_RTs(self): 71 | def create_mat(values): 72 | return np.array( 73 | [values[0:4], values[4:8], values[8:12], [0, 0, 0, 1]], dtype=np.float32 74 | ) 75 | 76 | file_path = self._calib_dir / f"extrinsics/extrinsics_20231014.yaml" 77 | extrinsics = read_data_from_yaml(file_path)["extrinsics"] 78 | tag_1 = create_mat(extrinsics["tag_1"]) 79 | tag_1_inv = np.linalg.inv(tag_1) 80 | rs_RTs_master = [ 81 | create_mat(extrinsics[serial]) 82 | for serial in HOCAP_INFO["device_serials"][:-1] # Exclude the hololens 83 | ] 84 | rs_RTs_world = [tag_1_inv @ RT for RT in rs_RTs_master] 85 | return rs_RTs_world 86 | 87 | def _get_obj_model_path(self, obj_id): 88 | mesh_file = self._models_dir / obj_id / "textured_mesh.obj" 89 | texture_file = self._models_dir / obj_id / "textured_mesh_0.png" 90 | material_file = self._models_dir / obj_id / "textured_mesh.mtl" 91 | return (mesh_file, texture_file, material_file) 92 | 93 | def _load_object_vertices(self): 94 | object_vertices = {} 95 | for obj_id in HOCAP_INFO["object_classes"]: 96 | if "HAND" in obj_id: 97 | continue # Exclude hands 98 | mesh_file, _, _ = self._get_obj_model_path(obj_id) 99 | mesh = trimesh.load(mesh_file) 100 | object_vertices[obj_id] = mesh.vertices.astype(np.float32) 101 | return object_vertices 102 | 103 | def _calculate_model_info(self, mesh): 104 | # Diameter (approximate) as the max distance between any two vertices 105 | diameter = mesh.bounding_sphere.primitive.radius * 2 106 | # Bounding box dimensions 107 | min_bounds, max_bounds = mesh.bounds 108 | size = max_bounds - min_bounds 109 | return { 110 | "diameter": float(diameter), 111 | "min_x": float(min_bounds[0]), 112 | "min_y": float(min_bounds[1]), 113 | "min_z": float(min_bounds[2]), 114 | "size_x": float(size[0]), 115 | "size_y": float(size[1]), 116 | "size_z": float(size[2]), 117 | } 118 | 119 | def _calculate_projected_bbox(self, cam_K, obj_pose, object_vertices): 120 | """Calculate the 2D bounding box of the projected 3D object mesh.""" 121 | # Transform vertices to camera space 122 | object_vertices_homogeneous = np.hstack( 123 | (object_vertices, np.ones((object_vertices.shape[0], 1))) 124 | ) 125 | vertices_cam = (obj_pose @ object_vertices_homogeneous.T).T[:, :3] 126 | 127 | # Project vertices into 2D 128 | vertices_2d = (cam_K @ vertices_cam.T).T 129 | vertices_2d = ( 130 | vertices_2d[:, :2] / vertices_2d[:, 2:3] 131 | ) # Normalize by depth to get 2D coordinates 132 | 133 | # 
Get min/max x and y for the bounding box 134 | x_min, y_min = np.min(vertices_2d, axis=0) 135 | x_max, y_max = np.max(vertices_2d, axis=0) 136 | 137 | # Return bbox as [x_min, y_min, width, height] 138 | bbox = [int(x_min), int(y_min), int(x_max - x_min), int(y_max - y_min)] 139 | return bbox 140 | 141 | def _binary_mask_to_rle(self, mask): 142 | """ 143 | Convert binary mask to COCO RLE format using pycocotools. 144 | """ 145 | # Ensure mask is uint8 146 | binary_mask = mask.astype(np.uint8) 147 | 148 | rle = mask_util.encode(np.asfortranarray(binary_mask)) 149 | rle["counts"] = rle["counts"].decode("utf-8") # Convert to string (COCO format) 150 | return rle 151 | 152 | def create_odet_dataset(self, dataset_type): 153 | if dataset_type not in ["coco", "yolo"]: 154 | msg = f"Invalid dataset type: {dataset_type}, choose from 'coco' or 'yolo'" 155 | self._logger.error(msg) 156 | raise ValueError(msg) 157 | 158 | self._logger.info( 159 | f">>>>>>>>>> Creating HOCap Object Detection Dataset ({dataset_type})..." 160 | ) 161 | output_dir = HOCAP_DATASET_ROOT / f"hocap_odet_{dataset_type}" 162 | make_clean_folder(output_dir) 163 | 164 | if dataset_type == "yolo": 165 | yolo_classes = [ 166 | obj_c for obj_c in HOCAP_INFO["object_classes"] if "HAND" not in obj_c 167 | ] 168 | dataset_info = { 169 | "train": "../train/images", 170 | "val": "../valid/images", 171 | "test": "../test/images", 172 | "nc": len(yolo_classes), 173 | "names": yolo_classes, 174 | } 175 | write_data_to_yaml(output_dir / "data.yaml", dataset_info) 176 | 177 | for split, split_data in ODET_CONFIG.items(): 178 | self._logger.info(f"Extracting {split} data...") 179 | 180 | save_image_dir = output_dir / split / "images" 181 | make_clean_folder(save_image_dir) 182 | 183 | save_label_dir = output_dir / split / "labels" 184 | make_clean_folder(save_label_dir) 185 | 186 | tqbar = tqdm(total=len(split_data), ncols=100) 187 | for image_idx, (sub_idx, seq_idx, cam_idx, frame_idx) in enumerate( 188 | split_data 189 | ): 190 | sub_id = HOCAP_INFO["subject_ids"][sub_idx] 191 | seq_id = HOCAP_INFO["sequence_ids"][seq_idx] 192 | cam_id = HOCAP_INFO["device_serials"][cam_idx] 193 | 194 | # Copy image 195 | src_img_path = ( 196 | HOCAP_DATASET_ROOT 197 | / f"{sub_id}/{seq_id}/{cam_id}/color_{frame_idx:06d}.jpg" 198 | ) 199 | save_img_name = f"sub{sub_idx:02d}_seq{seq_idx:02d}_cam{cam_idx:02d}_frame{frame_idx:06d}.jpg" 200 | shutil.copy(src_img_path, save_image_dir / save_img_name) 201 | 202 | # Generate yolo annotations 203 | yolo_annotations = [] 204 | label_data = np.load( 205 | src_img_path.parent / f"label_{frame_idx:06d}.npz" 206 | ) 207 | seg_mask = label_data["seg_mask"] 208 | obj_class_inds = label_data["obj_class_inds"] 209 | obj_class_names = label_data["obj_class_names"] 210 | for idx, mask_i in enumerate(np.unique(seg_mask)): 211 | if mask_i == 0: # Background 212 | continue 213 | mask = seg_mask == mask_i 214 | if mask.sum() < 10: 215 | continue # Ignore tiny/noisy masks 216 | 217 | x, y, w, h = cv2.boundingRect(mask.astype(np.uint8)) 218 | category_id = obj_class_inds[idx - 1].item() 219 | cx = (x + w / 2) / self._rs_width 220 | cy = (y + h / 2) / self._rs_height 221 | w /= self._rs_width 222 | h /= self._rs_height 223 | 224 | yolo_annotations.append( 225 | f"{category_id} {cx:.6f} {cy:.6f} {w:.6f} {h:.6f}" 226 | ) 227 | 228 | # Save yolo annotations 229 | save_label_name = f"sub{sub_idx:02d}_seq{seq_idx:02d}_cam{cam_idx:02d}_frame{frame_idx:06d}.txt" 230 | (save_label_dir / save_label_name).write_text( 231 | 
"\n".join(yolo_annotations) 232 | ) 233 | 234 | tqbar.update(1) 235 | tqbar.close() 236 | 237 | elif dataset_type == "coco": 238 | save_anno_dir = output_dir / "annotations" 239 | make_clean_folder(save_anno_dir) 240 | 241 | for split, split_data in ODET_CONFIG.items(): 242 | self._logger.info(f"Extracting {split} data...") 243 | save_image_dir = output_dir / split 244 | make_clean_folder(save_image_dir) 245 | 246 | save_anno_path = save_anno_dir / f"instances_{split}HOCap.json" 247 | 248 | annotations = [] 249 | images = [] 250 | tqbar = tqdm(total=len(split_data), ncols=100) 251 | for image_idx, (sub_idx, seq_idx, cam_idx, frame_idx) in enumerate( 252 | split_data 253 | ): 254 | sub_id = HOCAP_INFO["subject_ids"][sub_idx] 255 | seq_id = HOCAP_INFO["sequence_ids"][seq_idx] 256 | cam_id = HOCAP_INFO["device_serials"][cam_idx] 257 | 258 | src_img_path = ( 259 | HOCAP_DATASET_ROOT 260 | / f"{sub_id}/{seq_id}/{cam_id}/color_{frame_idx:06d}.jpg" 261 | ) 262 | 263 | # Copy image 264 | save_img_name = f"sub{sub_idx:02d}_seq{seq_idx:02d}_cam{cam_idx:02d}_frame{frame_idx:06d}.jpg" 265 | shutil.copy(src_img_path, save_image_dir / save_img_name) 266 | 267 | # Generate COCO annotations 268 | images.append( 269 | { 270 | "id": image_idx, 271 | "file_name": save_img_name, 272 | "height": self._rs_height, 273 | "width": self._rs_width, 274 | } 275 | ) 276 | 277 | label_data = np.load( 278 | src_img_path.parent / f"label_{frame_idx:06d}.npz" 279 | ) 280 | seg_mask = label_data["seg_mask"] 281 | obj_class_inds = label_data["obj_class_inds"] 282 | 283 | for idx, mask_i in enumerate(np.unique(seg_mask)): 284 | if mask_i == 0: 285 | continue # Background 286 | mask = (seg_mask == mask_i).astype(np.uint8) 287 | area = np.sum(mask).item() 288 | if area < 10: 289 | continue # Ignore tiny/noisy masks 290 | 291 | x, y, w, h = cv2.boundingRect(mask) 292 | category_id = ( 293 | obj_class_inds[idx - 1].item() + 1 294 | ) # COCO category id starts from 1 295 | 296 | annotations.append( 297 | { 298 | "id": len(annotations), 299 | "image_id": image_idx, 300 | "category_id": category_id, 301 | "bbox": [x, y, w, h], 302 | "area": area, 303 | "iscrowd": 0, 304 | "segmentation": self._binary_mask_to_rle(mask), 305 | } 306 | ) 307 | tqbar.update(1) 308 | tqbar.close() 309 | 310 | coco_data = { 311 | "images": images, 312 | "annotations": annotations, 313 | "categories": COCO_CATEGORIES, 314 | } 315 | with open(save_anno_path, "w") as f: 316 | json.dump(coco_data, f) 317 | 318 | def create_hpe_dataset(self): 319 | self._logger.info(">>>>>>>>>> Creating HOCap Hand Pose Estimation Dataset...") 320 | output_dir = HOCAP_DATASET_ROOT / "hocap_hpe" 321 | make_clean_folder(output_dir) 322 | 323 | for split, split_data in HPE_CONFIG.items(): 324 | self._logger.info(f"Extracting {split} data...") 325 | 326 | # Create directories 327 | save_image_dir = output_dir / split / "images" 328 | save_image_dir.mkdir(parents=True, exist_ok=True) 329 | save_label_dir = output_dir / split / "labels" 330 | save_label_dir.mkdir(parents=True, exist_ok=True) 331 | 332 | tqbar = tqdm(total=len(split_data), ncols=100) 333 | for image_idx, (sub_idx, seq_idx, cam_idx, frame_idx) in enumerate( 334 | split_data 335 | ): 336 | sub_id = HOCAP_INFO["subject_ids"][sub_idx] 337 | seq_id = HOCAP_INFO["sequence_ids"][seq_idx] 338 | cam_id = HOCAP_INFO["device_serials"][cam_idx] 339 | 340 | # Copy image 341 | src_img_path = ( 342 | HOCAP_DATASET_ROOT 343 | / f"{sub_id}/{seq_id}/{cam_id}/color_{frame_idx:06d}.jpg" 344 | ) 345 | save_img_name = 
f"sub{sub_idx:02d}_seq{seq_idx:02d}_cam{cam_idx:02d}_frame{frame_idx:06d}.jpg" 346 | shutil.copy(src_img_path, save_image_dir / save_img_name) 347 | 348 | # Generate hand pose annotations 349 | label_data = np.load(src_img_path.parent / f"label_{frame_idx:06d}.npz") 350 | cam_K = label_data["cam_K"] 351 | hand_joints_2d = label_data["hand_joints_2d"] 352 | hand_joints_3d = label_data["hand_joints_3d"] 353 | mano_shape = self._mano_betas[sub_idx] 354 | mano_poses_w = self._load_pose_m(sub_id, seq_id)[:, frame_idx] 355 | cam_RT_inv = self._rs_RTs_inv[cam_idx] 356 | mano_poses = np.stack( 357 | [ 358 | self._world_mano_pose_to_camera(p, cam_RT_inv) 359 | for p in mano_poses_w 360 | ], 361 | axis=0, 362 | ) 363 | 364 | save_label_name = f"sub{sub_idx:02d}_seq{seq_idx:02d}_cam{cam_idx:02d}_frame{frame_idx:06d}.npz" 365 | np.savez_compressed( 366 | save_label_dir / save_label_name, 367 | cam_K=cam_K, 368 | keypoints_2d=hand_joints_2d, 369 | keypoints_3d=hand_joints_3d, 370 | mano_betas=mano_shape, 371 | mano_poses=mano_poses, 372 | ) 373 | tqbar.update(1) 374 | tqbar.close() 375 | 376 | def create_ope_dataset(self): 377 | self._logger.info(">>>>>>>>>> Creating HOCap Object Pose Estimation Dataset...") 378 | output_dir = HOCAP_DATASET_ROOT / "hocap_ope" 379 | make_clean_folder(output_dir) 380 | 381 | object_vertices = self._load_object_vertices() 382 | 383 | for split, split_data in OPE_CONFIG.items(): 384 | self._logger.info(f"Extracting {split} data...") 385 | split_dir = output_dir / split 386 | split_dir.mkdir(parents=True, exist_ok=True) 387 | 388 | tqbar = tqdm(total=len(split_data), ncols=100) 389 | gt_data = {} 390 | for image_idx, (sub_idx, seq_idx, cam_idx, frame_idx) in enumerate( 391 | split_data 392 | ): 393 | sub_id = HOCAP_INFO["subject_ids"][sub_idx] 394 | seq_id = HOCAP_INFO["sequence_ids"][seq_idx] 395 | cam_id = HOCAP_INFO["device_serials"][cam_idx] 396 | 397 | save_rgb_dir = split_dir / f"{seq_idx:02d}_{cam_idx:02d}" / "rgb" 398 | save_mask_dir = split_dir / f"{seq_idx:02d}_{cam_idx:02d}" / "mask" 399 | save_depth_dir = split_dir / f"{seq_idx:02d}_{cam_idx:02d}" / "depth" 400 | save_rgb_dir.mkdir(parents=True, exist_ok=True) 401 | save_mask_dir.mkdir(parents=True, exist_ok=True) 402 | save_depth_dir.mkdir(parents=True, exist_ok=True) 403 | 404 | src_rgb_path = ( 405 | HOCAP_DATASET_ROOT 406 | / f"{sub_id}/{seq_id}/{cam_id}/color_{frame_idx:06d}.jpg" 407 | ) 408 | src_depth_path = ( 409 | HOCAP_DATASET_ROOT 410 | / f"{sub_id}/{seq_id}/{cam_id}/depth_{frame_idx:06d}.png" 411 | ) 412 | label_path = ( 413 | HOCAP_DATASET_ROOT 414 | / f"{sub_id}/{seq_id}/{cam_id}/label_{frame_idx:06d}.npz" 415 | ) 416 | 417 | # Generate gt data 418 | gt_info = [] 419 | label_data = np.load(label_path) 420 | cam_K = label_data["cam_K"] 421 | obj_poses = label_data["obj_poses"] 422 | seg_mask = label_data["seg_mask"] 423 | obj_class_inds = label_data["obj_class_inds"] 424 | obj_class_names = label_data["obj_class_names"] 425 | obj_seg_mask = np.zeros_like(seg_mask) 426 | 427 | for idx, mask_id in enumerate(np.unique(seg_mask)): 428 | if mask_id == 0: # Background 429 | continue 430 | obj_idx = ( 431 | obj_class_inds[idx - 1].item() + 1 432 | ) # BOP format starts from 1 433 | obj_name = obj_class_names[idx - 1] 434 | 435 | if "HAND" in obj_name: # Exclude hands 436 | continue 437 | 438 | obj_pose = obj_poses[idx - 1] 439 | # update obj_seg_mask 440 | obj_seg_mask[seg_mask == mask_id] = obj_idx 441 | 442 | gt_entry = { 443 | "cam_R_m2c": obj_pose[:3, :3].tolist(), 444 | "cam_t_m2c": obj_pose[:3, 
3].tolist(), 445 | "obj_id": obj_idx, 446 | "cam_K": cam_K.tolist(), 447 | "obj_bb": self._calculate_projected_bbox( 448 | cam_K, obj_pose, object_vertices[obj_name] 449 | ), 450 | } 451 | gt_info.append(gt_entry) 452 | 453 | # Save gt data 454 | shutil.copy( 455 | src_rgb_path, save_rgb_dir / f"{sub_idx:02d}_{frame_idx:06d}.jpg" 456 | ) 457 | shutil.copy( 458 | src_depth_path, 459 | save_depth_dir / f"{sub_idx:02d}_{frame_idx:06d}.png", 460 | ) 461 | write_mask_image( 462 | save_mask_dir / f"{sub_idx:02d}_{frame_idx:06d}.png", obj_seg_mask 463 | ) 464 | 465 | # Add gt.yaml 466 | if f"{seq_idx:02d}_{cam_idx:02d}" not in gt_data: 467 | gt_data[f"{seq_idx:02d}_{cam_idx:02d}"] = {} 468 | gt_data[f"{seq_idx:02d}_{cam_idx:02d}"][ 469 | f"{sub_idx:02d}_{frame_idx:06d}" 470 | ] = gt_info 471 | 472 | tqbar.update(1) 473 | tqbar.close() 474 | 475 | # Save gt.yaml 476 | self._logger.info(f"Saving gt.yaml for {split} split...") 477 | for key in gt_data.keys(): 478 | write_data_to_yaml(split_dir / f"{key}/gt.yaml", gt_data[key]) 479 | 480 | # Copy object models 481 | self._logger.info("Generating object models...") 482 | save_model_dir = output_dir / "models" 483 | save_model_dir.mkdir(parents=True, exist_ok=True) 484 | 485 | model_info = {} 486 | for obj_idx, obj_id in enumerate(HOCAP_INFO["object_classes"]): 487 | if "HAND" in obj_id: 488 | continue # Exclude hands 489 | mesh_file, _, _ = self._get_obj_model_path(obj_id) 490 | # Convert obj to ply 491 | mesh = trimesh.load(mesh_file) 492 | mesh.export(save_model_dir / f"{obj_id}.ply") 493 | model_info[obj_idx] = self._calculate_model_info(mesh) 494 | write_data_to_json(save_model_dir / "model_info.json", model_info) 495 | -------------------------------------------------------------------------------- /hocap_toolkit/layers/__init__.py: -------------------------------------------------------------------------------- 1 | from .mano_layer import MANOLayer 2 | from .mano_group_layer import MANOGroupLayer 3 | from .object_layer import ObjectLayer 4 | from .object_group_layer import ObjectGroupLayer 5 | 6 | 7 | __all__ = ["MANOLayer", "MANOGroupLayer", "ObjectLayer", "ObjectGroupLayer"] 8 | -------------------------------------------------------------------------------- /hocap_toolkit/layers/mano_group_layer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from torch.nn import Module, ModuleList 4 | from .mano_layer import MANOLayer 5 | 6 | 7 | class MANOGroupLayer(Module): 8 | """Wrapper layer to hold a group of MANOLayers.""" 9 | 10 | def __init__(self, sides: list[str], betas: list[np.ndarray]): 11 | """ 12 | Constructor. 13 | 14 | Args: 15 | sides (list[str]): A list of MANO sides. 'right' or 'left'. 16 | betas (list[np.ndarray]): A list of numpy arrays of shape [10] containing the betas. 
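        Example (illustrative sketch only; the zero betas and pose are placeholders):
            betas = [np.zeros(10, dtype=np.float32) for _ in ("right", "left")]
            layer = MANOGroupLayer(["right", "left"], betas)
            pose = torch.zeros((1, 51 * 2))  # per hand: 48 pose (3 global + 45 PCA) + 3 trans
            verts, joints = layer(pose)  # verts: [1, 2*778, 3], joints: [1, 2*21, 3]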
17 | """ 18 | super(MANOGroupLayer, self).__init__() 19 | 20 | self._sides = sides 21 | self._betas = betas 22 | self._num_obj = len(self._sides) 23 | 24 | self._layers = ModuleList( 25 | [MANOLayer(s, b) for s, b in zip(self._sides, self._betas)] 26 | ) 27 | 28 | # Register buffer for faces 29 | f = torch.cat([self._layers[i].f + 778 * i for i in range(self._num_obj)]) 30 | self.register_buffer("f", f) 31 | 32 | # Register buffer for root translation 33 | r = torch.cat([l.root_trans for l in self._layers]) 34 | self.register_buffer("root_trans", r) 35 | 36 | def forward( 37 | self, p: torch.Tensor, inds: list[int] = None 38 | ) -> tuple[torch.Tensor, torch.Tensor]: 39 | """ 40 | Forward function. 41 | 42 | Args: 43 | p (torch.Tensor): A tensor of shape [B, D] containing the pose vectors. 44 | inds (list[int], optional): A list of sub-layer indices. Default is None. 45 | 46 | Returns: 47 | tuple[torch.Tensor, torch.Tensor]: 48 | v: A tensor of shape [B, N, 3] containing the vertices. 49 | j: A tensor of shape [B, J, 3] containing the joints. 50 | """ 51 | if inds is None: 52 | inds = range(self._num_obj) 53 | v = [torch.zeros((p.size(0), 0, 3), dtype=torch.float32, device=self.f.device)] 54 | j = [torch.zeros((p.size(0), 0, 3), dtype=torch.float32, device=self.f.device)] 55 | p, t = self.pose2pt(p) 56 | for i in inds: 57 | y = self._layers[i](p[:, i], t[:, i]) 58 | v.append(y[0]) 59 | j.append(y[1]) 60 | v = torch.cat(v, dim=1) 61 | j = torch.cat(j, dim=1) 62 | return v, j 63 | 64 | def pose2pt(self, pose: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: 65 | """ 66 | Extracts pose and trans from pose vectors. 67 | 68 | Args: 69 | pose (torch.Tensor): A tensor of shape [B, D] containing the pose vectors. 70 | 71 | Returns: 72 | tuple[torch.Tensor, torch.Tensor]: 73 | p: A tensor of shape [B, O, 48] containing the pose. 74 | t: A tensor of shape [B, O, 3] containing the trans. 75 | """ 76 | p = torch.stack( 77 | [pose[:, 51 * i : 51 * i + 48] for i in range(self._num_obj)], dim=1 78 | ) 79 | t = torch.stack( 80 | [pose[:, 51 * i + 48 : 51 * i + 51] for i in range(self._num_obj)], dim=1 81 | ) 82 | return p, t 83 | 84 | def get_f_from_inds(self, inds: list[int]) -> tuple[torch.Tensor, torch.Tensor]: 85 | """ 86 | Gets faces from sub-layer indices. 87 | 88 | Args: 89 | inds (list[int]): A list of sub-layer indices. 90 | 91 | Returns: 92 | tuple[torch.Tensor, torch.Tensor]: 93 | f: A tensor of shape [F, 3] containing the faces. 94 | m: A tensor of shape [F] containing the face to index mapping. 
95 | """ 96 | f = [torch.zeros((0, 3), dtype=self.f.dtype, device=self.f.device)] 97 | m = [torch.zeros((0,), dtype=torch.int64, device=self.f.device)] 98 | for i, x in enumerate(inds): 99 | f.append(self._layers[x].f + 778 * i) 100 | m.append( 101 | x 102 | * torch.ones( 103 | self._layers[x].f.size(0), dtype=torch.int64, device=self.f.device 104 | ) 105 | ) 106 | f = torch.cat(f) 107 | m = torch.cat(m) 108 | return f, m 109 | 110 | @property 111 | def num_obj(self) -> int: 112 | """Return the number of objects.""" 113 | return self._num_obj 114 | -------------------------------------------------------------------------------- /hocap_toolkit/layers/mano_layer.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import numpy as np 3 | import torch 4 | from torch.nn import Module 5 | from manopth.manolayer import ManoLayer 6 | 7 | PROJ_ROOT = Path(__file__).parent.parent.parent 8 | 9 | 10 | class MANOLayer(Module): 11 | """Wrapper layer for manopth ManoLayer.""" 12 | 13 | def __init__(self, side: str, betas: np.ndarray): 14 | """ 15 | Constructor for MANOLayer. 16 | 17 | Args: 18 | side (str): MANO hand type. 'right' or 'left'. 19 | betas (np.ndarray): A numpy array of shape [10] containing the betas. 20 | """ 21 | super(MANOLayer, self).__init__() 22 | 23 | self._side = side 24 | self._betas = betas 25 | 26 | self._mano_layer = ManoLayer( 27 | side=side, 28 | mano_root=PROJ_ROOT / "config/mano_models", 29 | flat_hand_mean=False, 30 | ncomps=45, 31 | use_pca=True, 32 | ) 33 | 34 | # Register buffer for betas 35 | b = torch.from_numpy(betas).unsqueeze(0).float() 36 | self.register_buffer("b", b) 37 | 38 | # Register buffer for faces 39 | self.register_buffer("f", self._mano_layer.th_faces) 40 | 41 | # Register buffer for root translation 42 | v = ( 43 | torch.matmul(self._mano_layer.th_shapedirs, self.b.transpose(0, 1)).permute( 44 | 2, 0, 1 45 | ) 46 | + self._mano_layer.th_v_template 47 | ) 48 | r = torch.matmul(self._mano_layer.th_J_regressor[0], v) 49 | self.register_buffer("root_trans", r) 50 | 51 | def forward( 52 | self, p: torch.Tensor, t: torch.Tensor 53 | ) -> tuple[torch.Tensor, torch.Tensor]: 54 | """ 55 | Forward function. 56 | 57 | Args: 58 | p (torch.Tensor): A tensor of shape [B, 48] containing the pose. 59 | t (torch.Tensor): A tensor of shape [B, 3] containing the translation. 60 | 61 | Returns: 62 | tuple[torch.Tensor, torch.Tensor]: 63 | v: A tensor of shape [B, 778, 3] containing the vertices. 64 | j: A tensor of shape [B, 21, 3] containing the joints. 65 | """ 66 | v, j = self._mano_layer(p, self.b.expand(p.size(0), -1), t) 67 | 68 | # Convert to meters. 
69 | v /= 1000.0 70 | j /= 1000.0 71 | return v, j 72 | 73 | @property 74 | def th_hands_mean(self) -> torch.Tensor: 75 | """Return the hand mean tensor.""" 76 | return self._mano_layer.th_hands_mean 77 | 78 | @property 79 | def th_selected_comps(self) -> torch.Tensor: 80 | """Return the selected components tensor.""" 81 | return self._mano_layer.th_selected_comps 82 | 83 | @property 84 | def th_v_template(self) -> torch.Tensor: 85 | """Return the vertex template tensor.""" 86 | return self._mano_layer.th_v_template 87 | 88 | @property 89 | def side(self) -> str: 90 | """Return the side of the hand.""" 91 | return self._side 92 | 93 | @property 94 | def num_verts(self) -> int: 95 | """Return the number of vertices.""" 96 | return 778 97 | -------------------------------------------------------------------------------- /hocap_toolkit/layers/object_group_layer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import Module, ModuleList 3 | from .object_layer import ObjectLayer 4 | import numpy as np 5 | 6 | 7 | class ObjectGroupLayer(Module): 8 | """Wrapper layer to hold a group of ObjectLayers.""" 9 | 10 | def __init__( 11 | self, 12 | verts: list[np.ndarray], 13 | faces: list[np.ndarray], 14 | normals: list[np.ndarray], 15 | ): 16 | """ 17 | Constructor. 18 | 19 | Args: 20 | verts (list[np.ndarray]): A list of numpy arrays of shape [N, 3] containing the vertices. 21 | faces (list[np.ndarray]): A list of numpy arrays of shape [N, 3] containing the faces. 22 | normals (list[np.ndarray]): A list of numpy arrays of shape [N, 3] containing the normals. 23 | """ 24 | super(ObjectGroupLayer, self).__init__() 25 | 26 | self._layers = ModuleList( 27 | [ObjectLayer(v, f, n) for v, f, n in zip(verts, faces, normals)] 28 | ) 29 | self._num_obj = len(verts) 30 | self._num_verts = [v.shape[0] for v in verts] 31 | 32 | # Initialize faces with offsets 33 | f = [] 34 | offset = 0 35 | for i in range(self._num_obj): 36 | if i > 0: 37 | offset += self._layers[i - 1].v.size(1) 38 | f.append(self._layers[i].f + offset) 39 | f = torch.cat(f) 40 | self.register_buffer("f", f) 41 | 42 | @property 43 | def num_obj(self) -> int: 44 | """Return the number of objects.""" 45 | return self._num_obj 46 | 47 | @property 48 | def num_verts(self) -> list[int]: 49 | """Return the number of vertices for each object.""" 50 | return self._num_verts 51 | 52 | @property 53 | def count(self) -> list[int]: 54 | """Return the number of faces for each object.""" 55 | return [l.f.numel() for l in self._layers] 56 | 57 | def forward( 58 | self, p: torch.Tensor, inds: list[int] = None 59 | ) -> tuple[torch.Tensor, torch.Tensor]: 60 | """ 61 | Forward function. 62 | 63 | Args: 64 | p (torch.Tensor): A tensor of shape [B, D] containing the pose vectors. 65 | inds (list[int], optional): A list of sub-layer indices. Default is None. 66 | 67 | Returns: 68 | tuple[torch.Tensor, torch.Tensor]: 69 | v: A tensor of shape [B, N, 3] containing the transformed vertices. 70 | n: A tensor of shape [B, N, 3] containing the transformed normals. 
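        Note:
            D = 6 * num_obj: each object contributes a 3-value axis-angle rotation
            followed by a 3-value translation; see pose2rt() for the exact packing.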
71 | """ 72 | if inds is None: 73 | inds = range(self._num_obj) 74 | v = [torch.zeros((p.size(0), 0, 3), dtype=torch.float32, device=self.f.device)] 75 | n = [torch.zeros((p.size(0), 0, 3), dtype=torch.float32, device=self.f.device)] 76 | r, t = self.pose2rt(p) 77 | for i in inds: 78 | y = self._layers[i](r[:, i], t[:, i]) 79 | v.append(y[0]) 80 | n.append(y[1]) 81 | v = torch.cat(v, dim=1) 82 | n = torch.cat(n, dim=1) 83 | return v, n 84 | 85 | def pose2rt(self, pose: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: 86 | """ 87 | Extracts rotations and translations from pose vectors. 88 | 89 | Args: 90 | pose (torch.Tensor): A tensor of shape [B, D] containing the pose vectors. 91 | 92 | Returns: 93 | tuple[torch.Tensor, torch.Tensor]: 94 | r: A tensor of shape [B, O, 3] containing the rotation vectors. 95 | t: A tensor of shape [B, O, 3] containing the translations. 96 | """ 97 | r = torch.stack( 98 | [pose[:, 6 * i : 6 * i + 3] for i in range(self._num_obj)], dim=1 99 | ) 100 | t = torch.stack( 101 | [pose[:, 6 * i + 3 : 6 * i + 6] for i in range(self._num_obj)], dim=1 102 | ) 103 | return r, t 104 | 105 | def get_f_from_inds(self, inds: list[int]) -> tuple[torch.Tensor, torch.Tensor]: 106 | """ 107 | Gets faces from sub-layer indices. 108 | 109 | Args: 110 | inds (list[int]): A list of sub-layer indices. 111 | 112 | Returns: 113 | tuple[torch.Tensor, torch.Tensor]: 114 | f: A tensor of shape [F, 3] containing the faces. 115 | m: A tensor of shape [F] containing the face to index mapping. 116 | """ 117 | f = [torch.zeros((0, 3), dtype=self.f.dtype, device=self.f.device)] 118 | m = [torch.zeros((0,), dtype=torch.int64, device=self.f.device)] 119 | offset = 0 120 | for i, x in enumerate(inds): 121 | if i > 0: 122 | offset += self._layers[inds[i - 1]].v.size(1) 123 | f.append(self._layers[x].f + offset) 124 | m.append( 125 | x 126 | * torch.ones( 127 | self._layers[x].f.size(0), dtype=torch.int64, device=self.f.device 128 | ) 129 | ) 130 | f = torch.cat(f) 131 | m = torch.cat(m) 132 | return f, m 133 | 134 | def get_num_verts_from_inds(self, inds: list[int]) -> int: 135 | """ 136 | Gets number of vertices from sub-layer indices. 137 | 138 | Args: 139 | inds (list[int]): A non-empty list of sub-layer indices. 140 | 141 | Returns: 142 | int: The number of vertices. 143 | """ 144 | return sum(self._layers[i].v.size(1) for i in inds) 145 | 146 | def get_vert_inds_from_inds( 147 | self, inds: list[int] 148 | ) -> tuple[torch.Tensor, torch.Tensor]: 149 | """ 150 | Gets vertices from sub-layer indices. 151 | 152 | Args: 153 | inds (list[int]): A list of sub-layer indices. 154 | 155 | Returns: 156 | tuple[torch.Tensor, torch.Tensor]: 157 | idx: A tensor of shape [N] containing the vertices. 158 | m: A tensor of shape [N] containing the vertex to index mapping. 
159 | """ 160 | idx = [torch.zeros((0,), dtype=torch.int64, device=self.f.device)] 161 | m = [torch.zeros((0,), dtype=torch.int64, device=self.f.device)] 162 | offset = 0 163 | for i in range(self._num_obj): 164 | if i > 0: 165 | offset += self._layers[i - 1].v.size(1) 166 | idx.append( 167 | torch.arange(self._layers[i].v.size(1), device=self.f.device) + offset 168 | ) 169 | m.append( 170 | i 171 | * torch.ones( 172 | self._layers[i].v.size(1), dtype=torch.int64, device=self.f.device 173 | ) 174 | ) 175 | idx = torch.cat(idx) 176 | m = torch.cat(m) 177 | return idx, m 178 | -------------------------------------------------------------------------------- /hocap_toolkit/layers/object_layer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from torch.nn import Module 4 | 5 | 6 | class ObjectLayer(Module): 7 | def __init__(self, verts: np.ndarray, faces: np.ndarray, normals: np.ndarray): 8 | """ 9 | Initializes the object layer. 10 | 11 | Args: 12 | verts (np.ndarray): A numpy array of shape [N, 3] containing the vertices. 13 | faces (np.ndarray): A numpy array of shape [N, 3] containing the faces. 14 | normals (np.ndarray): A numpy array of shape [N, 3] containing the normals. 15 | """ 16 | super().__init__() 17 | self._num_verts = verts.shape[0] 18 | 19 | # Convert numpy arrays to torch tensors 20 | v = torch.from_numpy(verts.astype(np.float32).T) 21 | n = torch.from_numpy(normals.astype(np.float32).T) 22 | f = torch.from_numpy(faces.astype(np.int64).reshape((-1, 3))) 23 | 24 | # Register buffers for vertices, normals, and faces 25 | self.register_buffer("v", v) 26 | self.register_buffer("n", n) 27 | self.register_buffer("f", f) 28 | 29 | def forward( 30 | self, r: torch.Tensor, t: torch.Tensor 31 | ) -> tuple[torch.Tensor, torch.Tensor]: 32 | """ 33 | Forward function. 34 | 35 | Args: 36 | r (torch.Tensor): A tensor of shape [B, 3] containing the rotation in axis-angle. 37 | t (torch.Tensor): A tensor of shape [B, 3] containing the translation. 38 | 39 | Returns: 40 | tuple[torch.Tensor, torch.Tensor]: 41 | v: A tensor of shape [B, N, 3] containing the transformed vertices. 42 | n: A tensor of shape [B, N, 3] containing the transformed normals. 43 | """ 44 | R = self.rv2dcm(r) 45 | v = torch.matmul(R, self.v).permute(0, 2, 1) + t.unsqueeze(1) 46 | n = torch.matmul(R, self.n).permute(0, 2, 1) 47 | return v, n 48 | 49 | def rv2dcm(self, rv: torch.Tensor) -> torch.Tensor: 50 | """ 51 | Converts rotation vectors to direction cosine matrices. 52 | 53 | Args: 54 | rv (torch.Tensor): A tensor of shape [B, 3] containing the rotation vectors. 55 | 56 | Returns: 57 | torch.Tensor: A tensor of shape [B, 3, 3] containing the direction cosine matrices. 
58 | """ 59 | angle = torch.norm(rv + 1e-8, p=2, dim=1) 60 | axis = rv / angle.unsqueeze(1) 61 | s = torch.sin(angle).unsqueeze(1).unsqueeze(2) 62 | c = torch.cos(angle).unsqueeze(1).unsqueeze(2) 63 | I = torch.eye(3, device=rv.device).expand(rv.size(0), -1, -1) 64 | z = torch.zeros_like(angle) 65 | K = torch.stack( 66 | ( 67 | torch.stack((z, -axis[:, 2], axis[:, 1]), dim=1), 68 | torch.stack((axis[:, 2], z, -axis[:, 0]), dim=1), 69 | torch.stack((-axis[:, 1], axis[:, 0], z), dim=1), 70 | ), 71 | dim=1, 72 | ) 73 | dcm = I + s * K + (1 - c) * torch.bmm(K, K) 74 | return dcm 75 | 76 | @property 77 | def num_verts(self) -> int: 78 | """Return the number of vertices.""" 79 | return self._num_verts 80 | -------------------------------------------------------------------------------- /hocap_toolkit/loaders/__init__.py: -------------------------------------------------------------------------------- 1 | from .sequence_loader import SequenceLoader 2 | 3 | __all__ = ["SequenceLoader"] 4 | -------------------------------------------------------------------------------- /hocap_toolkit/loaders/sequence_loader.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from pathlib import Path 4 | from hocap_toolkit.utils import read_data_from_yaml, read_rgb_image, read_depth_image 5 | 6 | 7 | class SequenceLoader: 8 | """ 9 | Class for loading and processing sequence data. 10 | 11 | Supports loading MANO and object layers, along with their poses, intrinsics, 12 | extrinsics, and metadata required for 3D reconstruction and analysis. 13 | """ 14 | 15 | def __init__(self, sequence_folder: str, device: str = "cuda"): 16 | """ 17 | Initializes the SequenceLoader object. 18 | 19 | Args: 20 | sequence_folder (str): The path to the sequence folder. 21 | device (str): The device to run computations on ('cpu' or 'cuda'). Defaults to 'cpu'. 
22 | """ 23 | self._data_folder = Path(sequence_folder) 24 | self._calib_folder = self._data_folder.parent.parent / "calibration" 25 | self._models_folder = self._data_folder.parent.parent / "models" 26 | self._device = device 27 | 28 | # Crop limits in world frame, [x_min, x_max, y_min, y_max, z_min, z_max] 29 | self._crop_lim = [-0.60, +0.60, -0.35, +0.35, -0.01, +0.80] 30 | 31 | # Load metadata 32 | self._load_metadata() 33 | 34 | # Create mapping from 2D coordinates to 3D rays 35 | self._rays = self._create_3d_rays() 36 | 37 | # Create projection matrices from camera to master/world 38 | self._M2world = torch.bmm(self._rs_Ks, self._rs_RTs_inv[:, :3, :]) 39 | 40 | # Initialize points, colors, and masks 41 | self._frame_id = -1 42 | self._points = torch.zeros( 43 | (len(self._rs_serials), self._rs_height * self._rs_width, 3), 44 | dtype=torch.float32, 45 | device=self._device, 46 | ) 47 | self._colors = torch.zeros( 48 | (len(self._rs_serials), self._rs_height * self._rs_width, 3), 49 | dtype=torch.float32, 50 | device=self._device, 51 | ) 52 | self._masks = torch.zeros( 53 | (len(self._rs_serials), self._rs_height * self._rs_width), 54 | dtype=torch.bool, 55 | device=self._device, 56 | ) 57 | 58 | def _load_metadata(self): 59 | data = read_data_from_yaml(self._data_folder / "meta.yaml") 60 | 61 | self._num_frames = data["num_frames"] 62 | self._object_ids = data["object_ids"] 63 | self._mano_sides = data["mano_sides"] 64 | self._task_id = data["task_id"] 65 | self._subject_id = data["subject_id"] 66 | # RealSense camera metadata 67 | self._rs_serials = data["realsense"]["serials"] 68 | self._rs_width = data["realsense"]["width"] 69 | self._rs_height = data["realsense"]["height"] 70 | self._num_cams = len(self._rs_serials) 71 | # HoloLens metadata 72 | self._hl_serial = data["hololens"]["serial"] 73 | self._hl_pv_width = data["hololens"]["pv_width"] 74 | self._hl_pv_height = data["hololens"]["pv_height"] 75 | # Object models file paths 76 | self._object_textured_files = [ 77 | self._models_folder / obj_id / "textured_mesh.obj" 78 | for obj_id in self._object_ids 79 | ] 80 | self._object_cleaned_files = [ 81 | self._models_folder / obj_id / "cleaned_mesh_10000.obj" 82 | for obj_id in self._object_ids 83 | ] 84 | 85 | # Load camera intrinsics 86 | self._load_intrinsics() 87 | 88 | # Load rs camera extrinsics 89 | self._load_extrinsics(data["extrinsics"]) 90 | 91 | # Load MANO shape parameters 92 | self._mano_beta = self._load_mano_beta() 93 | 94 | def _load_intrinsics(self): 95 | def read_K_from_yaml(serial, cam_type="color"): 96 | yaml_file = self._calib_folder / "intrinsics" / f"{serial}.yaml" 97 | data = read_data_from_yaml(yaml_file)[cam_type] 98 | K = np.array( 99 | [ 100 | [data["fx"], 0.0, data["ppx"]], 101 | [0.0, data["fy"], data["ppy"]], 102 | [0.0, 0.0, 1.0], 103 | ], 104 | dtype=np.float32, 105 | ) 106 | return K 107 | 108 | rs_Ks = np.stack( 109 | [read_K_from_yaml(serial) for serial in self._rs_serials], axis=0 110 | ) 111 | rs_Ks_inv = np.stack([np.linalg.inv(K) for K in rs_Ks], axis=0) 112 | 113 | hl_K = read_K_from_yaml(self._hl_serial) 114 | hl_K_inv = np.linalg.inv(hl_K) 115 | 116 | # Convert intrinsics to torch tensors 117 | self._rs_Ks = torch.from_numpy(rs_Ks).to(self._device) 118 | self._rs_Ks_inv = torch.from_numpy(rs_Ks_inv).to(self._device) 119 | self._hl_K = torch.from_numpy(hl_K).to(self._device) 120 | self._hl_K_inv = torch.from_numpy(hl_K_inv).to(self._device) 121 | 122 | def _load_extrinsics(self, file_name): 123 | def create_mat(values): 124 | return 
np.array( 125 | [values[0:4], values[4:8], values[8:12], [0, 0, 0, 1]], dtype=np.float32 126 | ) 127 | 128 | data = read_data_from_yaml(self._calib_folder / "extrinsics" / f"{file_name}") 129 | 130 | # Read rs_master serial 131 | self._rs_master = data["rs_master"] 132 | 133 | # Create extrinsics matrices 134 | extrinsics = data["extrinsics"] 135 | tag_0 = create_mat(extrinsics["tag_0"]) 136 | tag_0_inv = np.linalg.inv(tag_0) 137 | tag_1 = create_mat(extrinsics["tag_1"]) 138 | tag_1_inv = np.linalg.inv(tag_1) 139 | extr2master = np.stack( 140 | [create_mat(extrinsics[s]) for s in self._rs_serials], axis=0 141 | ) 142 | extr2master_inv = np.stack([np.linalg.inv(t) for t in extr2master], axis=0) 143 | extr2world = np.stack([tag_1_inv @ t for t in extr2master], axis=0) 144 | extr2world_inv = np.stack([np.linalg.inv(t) for t in extr2world], axis=0) 145 | 146 | # Convert extrinsics to torch tensors 147 | self._tag_0 = torch.from_numpy(tag_0).to(self._device) 148 | self._tag_0_inv = torch.from_numpy(tag_0_inv).to(self._device) 149 | self._tag_1 = torch.from_numpy(tag_1).to(self._device) 150 | self._tag_1_inv = torch.from_numpy(tag_1_inv).to(self._device) 151 | self._extr2master = torch.from_numpy(extr2master).to(self._device) 152 | self._extr2master_inv = torch.from_numpy(extr2master_inv).to(self._device) 153 | self._rs_RTs = torch.from_numpy(extr2world).to(self._device) 154 | self._rs_RTs_inv = torch.from_numpy(extr2world_inv).to(self._device) 155 | 156 | def _load_mano_beta(self) -> torch.Tensor: 157 | file_path = self._calib_folder / "mano" / f"{self._subject_id}.yaml" 158 | data = read_data_from_yaml(file_path) 159 | return torch.tensor(data["betas"], dtype=torch.float32, device=self._device) 160 | 161 | def _create_3d_rays(self) -> torch.Tensor: 162 | """Creates 3D rays for deprojecting depth images to 3D space.""" 163 | 164 | def create_2d_coords() -> torch.Tensor: 165 | xv, yv = torch.meshgrid( 166 | torch.arange(self._rs_width), 167 | torch.arange(self._rs_height), 168 | indexing="xy", 169 | ) 170 | coord_2d = torch.stack( 171 | (xv, yv, torch.ones_like(xv)), dim=0 172 | ).float() # (3, H, W) 173 | coords_2d = ( 174 | coord_2d.unsqueeze(0) 175 | .repeat(self._num_cams, 1, 1, 1) 176 | .view(self._num_cams, 3, -1) 177 | ) # (N, 3, H*W) 178 | coords_2d = coords_2d.to(self._device) 179 | return coords_2d 180 | 181 | coords_2d = create_2d_coords() 182 | return torch.bmm(self._rs_Ks_inv, coords_2d) # (N, 3, H*W) 183 | 184 | def _deproject(self, colors, depths) -> tuple: 185 | """ 186 | Deprojects depth images to 3D points. 187 | 188 | Args: 189 | colors (np.ndarray): List of color images, [N, H, W, 3], dtype=float32. 190 | depths (np.ndarray): List of depth images, [N, H, W], dtype=np.float32. 191 | 192 | Returns: 193 | tuple: Colors, 3D points, and masks. 
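        Note:
            Camera-frame points are recovered as depth * (K^-1 @ [u, v, 1]^T) using
            the precomputed rays, then mapped to the world frame with the
            camera-to-world extrinsics (p_world = R @ p_cam + t). The returned masks
            keep only points inside the world-frame crop limits in self._crop_lim.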
194 | """ 195 | # Process color images 196 | colors = torch.from_numpy(colors.reshape(self._num_cams, -1, 3)).to( 197 | self._device 198 | ) # [N, H*W, 3] 199 | 200 | # Process depth images 201 | depths = torch.from_numpy(depths.reshape(self._num_cams, 1, -1)).to( 202 | self._device 203 | ) # [N, 1, H*W] 204 | 205 | # Deproject depth images to 3D points in camera frame 206 | pts_c = self._rays * depths # [N, 3, H*W] 207 | # Transform 3D points from camera frame to world frame 208 | pts = torch.baddbmm( 209 | self._rs_RTs[:, :3, 3].unsqueeze(2), 210 | self._rs_RTs[:, :3, :3], 211 | pts_c, 212 | ).permute( 213 | 0, 2, 1 214 | ) # (N, H*W, 3) 215 | 216 | # Crop 3D points 217 | mx1 = pts[..., 0] > self._crop_lim[0] 218 | mx2 = pts[..., 0] < self._crop_lim[1] 219 | my1 = pts[..., 1] > self._crop_lim[2] 220 | my2 = pts[..., 1] < self._crop_lim[3] 221 | mz1 = pts[..., 2] > self._crop_lim[4] 222 | mz2 = pts[..., 2] < self._crop_lim[5] 223 | masks = mx1 & mx2 & my1 & my2 & mz1 & mz2 224 | 225 | return colors, pts, masks 226 | 227 | def _update_pcd(self, frame_id: int): 228 | """Update point cloud data.""" 229 | colors, points, masks = self._deproject( 230 | np.stack( 231 | [self.get_rgb_image(frame_id, serial) for serial in self._rs_serials], 232 | axis=0, 233 | dtype=np.float32, 234 | ) 235 | / 255.0, 236 | np.stack( 237 | [self.get_depth_image(frame_id, serial) for serial in self._rs_serials], 238 | axis=0, 239 | dtype=np.float32, 240 | ), 241 | ) 242 | self._points.copy_(points) 243 | self._colors.copy_(colors) 244 | self._masks.copy_(masks) 245 | 246 | def get_rgb_image(self, frame_id: int, serial: str) -> np.ndarray: 247 | """Get RGB image in numpy format, dtype=uint8, [H, W, 3].""" 248 | image_file = self._data_folder / f"{serial}/color_{frame_id:06d}.jpg" 249 | return read_rgb_image(image_file) 250 | 251 | def get_depth_image(self, frame_id: int, serial: str) -> np.ndarray: 252 | """Get depth image in numpy format, dtype=uint16, [H, W].""" 253 | image_file = self._data_folder / f"{serial}/depth_{frame_id:06d}.png" 254 | return read_depth_image(image_file, scale=1000.0) 255 | 256 | def get_image_label(self, frame_id: int, serial: str) -> dict: 257 | """Get image label data.""" 258 | label_file = self._data_folder / f"{serial}/label_{frame_id:06d}.npz" 259 | if not label_file.exists(): 260 | return {} 261 | return np.load(label_file) 262 | 263 | def step(self): 264 | """Step to the next frame.""" 265 | self._frame_id = (self._frame_id + 1) % self._num_frames 266 | self._update_pcd(self._frame_id) 267 | 268 | def step_by_frame_id(self, frame_id: int): 269 | """Step to a specific frame.""" 270 | self._frame_id = frame_id % self._num_frames 271 | self._update_pcd(self._frame_id) 272 | 273 | @property 274 | def object_ids(self) -> list: 275 | return self._object_ids 276 | 277 | @property 278 | def subject_id(self) -> str: 279 | return self._subject_id 280 | 281 | @property 282 | def num_frames(self) -> int: 283 | return self._num_frames 284 | 285 | @property 286 | def rs_width(self) -> int: 287 | return self._rs_width 288 | 289 | @property 290 | def rs_height(self) -> int: 291 | return self._rs_height 292 | 293 | @property 294 | def rs_serials(self) -> list: 295 | return self._rs_serials 296 | 297 | @property 298 | def rs_master(self) -> str: 299 | return self._rs_master 300 | 301 | @property 302 | def holo_pv_width(self) -> int: 303 | return self._hl_pv_width 304 | 305 | @property 306 | def holo_pv_height(self) -> int: 307 | return self._hl_pv_height 308 | 309 | @property 310 | def 
holo_serial(self) -> list: 311 | return self._hl_serial 312 | 313 | @property 314 | def mano_beta(self) -> torch.Tensor: 315 | return self._mano_beta 316 | 317 | @property 318 | def mano_sides(self) -> list: 319 | return self._mano_sides 320 | 321 | @property 322 | def rs_Ks(self) -> torch.Tensor: 323 | return self._rs_Ks 324 | 325 | @property 326 | def rs_Ks_inv(self) -> torch.Tensor: 327 | return self._rs_Ks_inv 328 | 329 | @property 330 | def rs_RTs(self) -> torch.Tensor: 331 | return self._rs_RTs 332 | 333 | @property 334 | def rs_RTs_inv(self) -> torch.Tensor: 335 | return self._rs_RTs_inv 336 | 337 | @property 338 | def tag_0(self) -> torch.Tensor: 339 | """tag_0 to rs_master transformation matrix""" 340 | return self._tag_0 341 | 342 | @property 343 | def tag_0_inv(self) -> torch.Tensor: 344 | """rs_master to tag_0 transformation matrix""" 345 | return self._tag_0_inv 346 | 347 | @property 348 | def tag_1(self) -> torch.Tensor: 349 | """tag_1 to rs_master transformation matrix""" 350 | return self._tag_1 351 | 352 | @property 353 | def tag_1_inv(self) -> torch.Tensor: 354 | """rs_master to tag_1 transformation matrix""" 355 | return self._tag_1_inv 356 | 357 | @property 358 | def M(self) -> torch.Tensor: 359 | """camera to world transformation matrix""" 360 | return self._M2world 361 | 362 | @property 363 | def frame_id(self) -> int: 364 | return self._frame_id 365 | 366 | @property 367 | def object_textured_mesh_files(self) -> list: 368 | return [ 369 | str(self._models_folder / f"{object_id}/textured_mesh.obj") 370 | for object_id in self._object_ids 371 | ] 372 | 373 | @property 374 | def object_cleaned_mesh_files(self) -> list: 375 | return [ 376 | str(self._models_folder / f"{object_id}/cleaned_mesh_10000.obj") 377 | for object_id in self._object_ids 378 | ] 379 | 380 | @property 381 | def points(self) -> torch.Tensor: 382 | return self._points 383 | 384 | @property 385 | def colors(self) -> torch.Tensor: 386 | return self._colors 387 | 388 | @property 389 | def masks(self) -> torch.Tensor: 390 | return self._masks 391 | -------------------------------------------------------------------------------- /hocap_toolkit/renderers/__init__.py: -------------------------------------------------------------------------------- 1 | from .sequence_renderer import SequenceRenderer 2 | from .renderer_pyrd import OffscreenRenderer 3 | -------------------------------------------------------------------------------- /hocap_toolkit/renderers/renderer_pyrd.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | os.environ["PYOPENGL_PLATFORM"] = "egl" # GPU-based offscreen rendering 4 | 5 | import numpy as np 6 | import pyrender 7 | from pyrender import RenderFlags 8 | 9 | # OpenGL RH y UP (Pyrender) 10 | # y 11 | # | 12 | # +---x 13 | # / 14 | # z 15 | 16 | # CV Camera RH y DOWN, x RIGHT, z FRONT 17 | # z 18 | # / 19 | # +---x 20 | # | 21 | # y 22 | 23 | 24 | class OffscreenRenderer: 25 | def __init__(self, width, height, znear=0.001, zfar=1000.0, pose_type="cv") -> None: 26 | assert pose_type in ["cv", "gl"], "Invalid pose type. 
Must be 'cv' or 'gl'" 27 | self._pose_type = pose_type 28 | self._width = width 29 | self._height = height 30 | self._znear = znear 31 | self._zfar = zfar 32 | self._bg_color = np.array([0.0, 0.0, 0.0, 1.0]) 33 | self._ambient_light = np.array([1.0, 1.0, 1.0, 1.0]) 34 | 35 | self._cam = pyrender.PerspectiveCamera( 36 | yfov=np.pi / 3.0, znear=self._znear, zfar=self._zfar 37 | ) 38 | 39 | self._glcam2cvcam = np.array( 40 | [[1, 0, 0, 0], [0, -1, 0, 0], [0, 0, -1, 0], [0, 0, 0, 1]] 41 | ) 42 | self._cvcam2glcam = np.linalg.inv(self._glcam2cvcam) 43 | 44 | def get_render_image(self, meshes, mesh_poses=None, cam_K=None, cam_pose=None): 45 | poses_m = ( 46 | mesh_poses 47 | if mesh_poses is not None 48 | else [np.eye(4) for _ in range(len(meshes))] 49 | ) 50 | pose_c = cam_pose if cam_pose is not None else np.eye(4) 51 | if self._pose_type == "cv": 52 | pose_c = pose_c @ self._cvcam2glcam 53 | 54 | scene = pyrender.Scene( 55 | bg_color=self._bg_color, ambient_light=self._ambient_light 56 | ) 57 | 58 | # add dummy world node 59 | world_node = scene.add(pyrender.PerspectiveCamera(yfov=np.pi / 3.0)) 60 | 61 | # add camera 62 | scene.main_camera_node = scene.add( 63 | pyrender.IntrinsicsCamera( 64 | fx=cam_K[0, 0], 65 | fy=cam_K[1, 1], 66 | cx=cam_K[0, 2], 67 | cy=cam_K[1, 2], 68 | znear=self._znear, 69 | zfar=self._zfar, 70 | ), 71 | name="camera", 72 | pose=pose_c, 73 | parent_node=world_node, 74 | ) 75 | 76 | # add meshes 77 | for i, mesh in enumerate(meshes): 78 | scene.add( 79 | pyrender.Mesh.from_trimesh(mesh), 80 | name=f"mesh_{i}", 81 | pose=poses_m[i], 82 | parent_node=world_node, 83 | ) 84 | # render 85 | r = pyrender.OffscreenRenderer(self._width, self._height) 86 | color, depth = r.render(scene, flags=RenderFlags.ALL_SOLID) 87 | r.delete() 88 | 89 | return color, depth 90 | -------------------------------------------------------------------------------- /hocap_toolkit/renderers/sequence_renderer.py: -------------------------------------------------------------------------------- 1 | from hocap_toolkit.utils import * 2 | from hocap_toolkit.loaders import SequenceLoader 3 | from hocap_toolkit.layers import MANOGroupLayer 4 | 5 | cvcam_in_glcam = np.array([[1, 0, 0, 0], [0, -1, 0, 0], [0, 0, -1, 0], [0, 0, 0, 1]]) 6 | 7 | 8 | class SequenceRenderer: 9 | def __init__(self, sequence_folder, device="cpu") -> None: 10 | self._seq_folder = Path(sequence_folder).resolve() 11 | self._device = device 12 | self._loader = SequenceLoader(sequence_folder, device=device) 13 | self._num_frames = self._loader.num_frames 14 | self._object_ids = self._loader.object_ids 15 | self._mano_sides = self._loader.mano_sides 16 | self._mano_group_layer = self._init_mano_group_layer() 17 | # Realsense cameras 18 | self._rs_serials = self._loader.rs_serials 19 | self._rs_width = self._loader.rs_width 20 | self._rs_height = self._loader.rs_height 21 | self._rs_intrinsics = self._loader.rs_Ks.cpu().numpy() 22 | self._rs_extrinsics = self._loader.rs_RTs.cpu().numpy() 23 | # Hololens cameras 24 | self._hl_serial = self._loader.holo_serial 25 | self._hl_pv_width = self._loader.holo_pv_width 26 | self._hl_pv_height = self._loader.holo_pv_height 27 | self._hl_pv_intrinsics = self._loader._hl_K.cpu().numpy() 28 | 29 | # Load poses 30 | self._poses_o = self._load_object_poses() 31 | self._poses_m = self._load_mano_poses() 32 | self._poses_pv = self._load_holo_poses() 33 | 34 | # Load object meshes 35 | self._obj_meshes = [ 36 | pyrender.Mesh.from_trimesh(trimesh.load_mesh(f, process=False)) 37 | for f in 
self._loader.object_textured_mesh_files 38 | ] 39 | 40 | # Get verts, faces, colors for MANO 41 | self._mano_verts = self._get_mano_verts() 42 | self._mano_faces = self._get_mano_faces() 43 | self._mano_colors = self._get_mano_colors() 44 | 45 | # Rendering flags 46 | self._rgb_flags = ( 47 | pyrender.RenderFlags.OFFSCREEN | pyrender.RenderFlags.SHADOWS_ALL 48 | ) 49 | self._depth_flags = ( 50 | pyrender.RenderFlags.OFFSCREEN | pyrender.RenderFlags.DEPTH_ONLY 51 | ) 52 | self._mask_flags = pyrender.RenderFlags.OFFSCREEN | pyrender.RenderFlags.SEG 53 | 54 | def _load_holo_pv_intrinsics(self, serial): 55 | K = np.fromfile( 56 | self._loader._calib_folder 57 | / f"hololens/{serial}/personal_video" 58 | / f"1000_{self._hl_pv_width}_{self._hl_pv_height}/intrinsics.bin", 59 | dtype=np.float32, 60 | ).reshape(4, 4)[:3, :3] 61 | K[0, 0] = -K[0, 0] 62 | return K.T 63 | 64 | def _load_object_poses(self): 65 | pose_file = self._seq_folder / "poses_o.npy" 66 | poses = np.load(pose_file) 67 | poses = np.stack([quat_to_mat(p) for p in poses], axis=0) 68 | return poses 69 | 70 | def _load_mano_poses(self): 71 | pose_file = self._seq_folder / "poses_m.npy" 72 | poses = np.load(pose_file) 73 | poses = [ 74 | torch.from_numpy(poses[0 if side == "right" else 1]).to(self._device) 75 | for side in self._mano_sides 76 | ] 77 | return poses 78 | 79 | def _load_holo_poses(self): 80 | pose_file = self._seq_folder / "poses_pv.npy" 81 | poses = quat_to_mat(np.load(pose_file)) 82 | return poses 83 | 84 | def _init_mano_group_layer(self): 85 | beta = self._loader.mano_beta.cpu().numpy() 86 | return MANOGroupLayer(self._mano_sides, [beta for _ in self._mano_sides]).to( 87 | self._device 88 | ) 89 | 90 | def _get_mano_verts(self): 91 | p = torch.cat(self._poses_m, dim=1) 92 | v, _ = self._mano_group_layer(p) 93 | if p.size(0) == 1: 94 | v = v[0] 95 | return v.cpu().numpy() 96 | 97 | def _get_mano_faces(self): 98 | mano_faces = self._mano_group_layer.f.cpu().numpy() 99 | mano_faces = [ 100 | np.concatenate( 101 | [ 102 | mano_faces[idx * NUM_MANO_FACES : (idx + 1) * NUM_MANO_FACES] 103 | - idx * NUM_MANO_VERTS, 104 | NEW_MANO_FACES[side], 105 | ] 106 | ) 107 | for idx, side in enumerate(self._mano_sides) 108 | ] 109 | return mano_faces 110 | 111 | def _get_mano_colors(self): 112 | mano_colors = [ 113 | HAND_COLORS[1].rgb if side == "right" else HAND_COLORS[2].rgb 114 | for side in self._mano_sides 115 | ] 116 | return mano_colors 117 | 118 | def _get_mano_meshes(self, frame_id): 119 | meshes = [ 120 | trimesh.Trimesh( 121 | vertices=self._mano_verts[frame_id][ 122 | i * NUM_MANO_VERTS : (i + 1) * NUM_MANO_VERTS 123 | ], 124 | faces=self._mano_faces[i], 125 | vertex_colors=self._mano_colors[i], 126 | ) 127 | for i in range(len(self._mano_sides)) 128 | ] 129 | meshes = [pyrender.Mesh.from_trimesh(mesh) for mesh in meshes] 130 | return meshes 131 | 132 | def create_scene(self, frame_id): 133 | self._scene = pyrender.Scene( 134 | bg_color=[0.0, 0.0, 0.0], ambient_light=[1.0, 1.0, 1.0] 135 | ) 136 | 137 | # Add world node 138 | world_node = self._scene.add_node(pyrender.Node(name="world")) 139 | 140 | # Add realsense camera nodes 141 | self._camera_nodes = { 142 | serial: self._scene.add( 143 | pyrender.IntrinsicsCamera( 144 | fx=cam_K[0, 0], 145 | fy=cam_K[1, 1], 146 | cx=cam_K[0, 2], 147 | cy=cam_K[1, 2], 148 | znear=0.01, 149 | zfar=10.0, 150 | ), 151 | parent_node=world_node, 152 | name=f"cam_{serial}", 153 | pose=cam_RT @ cvcam_in_glcam, 154 | ) 155 | for serial, cam_K, cam_RT in zip( 156 | self._rs_serials, 
self._rs_intrinsics, self._rs_extrinsics 157 | ) 158 | } 159 | 160 | # Add hololens camera node 161 | self._camera_nodes[self._hl_serial] = self._scene.add( 162 | pyrender.IntrinsicsCamera( 163 | fx=self._hl_pv_intrinsics[0, 0], 164 | fy=self._hl_pv_intrinsics[1, 1], 165 | cx=self._hl_pv_intrinsics[0, 2], 166 | cy=self._hl_pv_intrinsics[1, 2], 167 | znear=0.01, 168 | zfar=10.0, 169 | ), 170 | parent_node=world_node, 171 | name=f"cam_{self._hl_serial}", 172 | pose=self._poses_pv[frame_id] @ cvcam_in_glcam, 173 | ) 174 | 175 | # Add object nodes 176 | self._object_nodes = [ 177 | self._scene.add( 178 | obj_mesh, 179 | parent_node=world_node, 180 | name=f"obj_{self._object_ids[i]}", 181 | pose=self._poses_o[i, frame_id], 182 | ) 183 | for i, obj_mesh in enumerate(self._obj_meshes) 184 | ] 185 | 186 | # Add MANO nodes 187 | self._mano_nodes = [ 188 | self._scene.add( 189 | mano_mesh, 190 | parent_node=world_node, 191 | name=f"mano_{self._mano_sides[i]}", 192 | pose=np.eye(4), 193 | ) 194 | for i, mano_mesh in enumerate(self._get_mano_meshes(frame_id)) 195 | ] 196 | 197 | self._seg_node_map = {} 198 | for i, obj_node in enumerate(self._object_nodes): 199 | self._seg_node_map[obj_node] = OBJ_CLASS_COLORS[i + 1].rgb 200 | 201 | for i, side in enumerate(self._mano_sides): 202 | hand_color_idx = 1 if side == "right" else 2 203 | self._seg_node_map[self._mano_nodes[i]] = HAND_COLORS[hand_color_idx].rgb 204 | 205 | def get_rgb_image(self, frame_id, serial): 206 | return self._loader.get_rgb_image(frame_id, serial) 207 | 208 | def get_render_colors(self): 209 | color_images = {} 210 | # Render color images for realsense cameras 211 | r = pyrender.OffscreenRenderer(self._rs_width, self._rs_height) 212 | for serial in self._rs_serials: 213 | self._scene.main_camera_node = self._camera_nodes[serial] 214 | color, _ = r.render(self._scene, flags=self._rgb_flags) 215 | color_images[serial] = color 216 | r.delete() 217 | # Render color image for hololens camera 218 | r = pyrender.OffscreenRenderer(self._hl_pv_width, self._hl_pv_height) 219 | self._scene.main_camera_node = self._camera_nodes[self._hl_serial] 220 | color, _ = r.render(self._scene, flags=self._rgb_flags) 221 | color_images[self._hl_serial] = color 222 | r.delete() 223 | return color_images 224 | 225 | def get_render_depths(self): 226 | depth_images = {} 227 | # Render depth images for realsense cameras 228 | r = pyrender.OffscreenRenderer(self._rs_width, self._rs_height) 229 | for serial in self._rs_serials: 230 | self._scene.main_camera_node = self._camera_nodes[serial] 231 | depth = r.render(self._scene, flags=self._depth_flags) 232 | depth_images[serial] = depth 233 | r.delete() 234 | # Render depth image for hololens camera 235 | r = pyrender.OffscreenRenderer(self._hl_pv_width, self._hl_pv_height) 236 | self._scene.main_camera_node = self._camera_nodes[self._hl_serial] 237 | depth = r.render(self._scene, flags=self._depth_flags) 238 | depth_images[self._hl_serial] = depth 239 | r.delete() 240 | return depth_images 241 | 242 | def get_render_masks(self): 243 | mask_images = {} 244 | # Render mask images for realsense cameras 245 | r = pyrender.OffscreenRenderer(self._rs_width, self._rs_height) 246 | for serial in self._rs_serials: 247 | self._scene.main_camera_node = self._camera_nodes[serial] 248 | mask, _ = r.render( 249 | self._scene, flags=self._mask_flags, seg_node_map=self._seg_node_map 250 | ) 251 | mask_images[serial] = mask 252 | r.delete() 253 | # Render mask image for hololens camera 254 | r = 
pyrender.OffscreenRenderer(self._hl_pv_width, self._hl_pv_height) 255 | self._scene.main_camera_node = self._camera_nodes[self._hl_serial] 256 | mask, _ = r.render( 257 | self._scene, flags=self._mask_flags, seg_node_map=self._seg_node_map 258 | ) 259 | mask_images[self._hl_serial] = mask 260 | r.delete() 261 | return mask_images 262 | 263 | @property 264 | def num_frames(self): 265 | return self._num_frames 266 | 267 | @property 268 | def rs_serials(self): 269 | return self._rs_serials 270 | 271 | @property 272 | def holo_serial(self): 273 | return self._hl_serial 274 | 275 | 276 | def plot_and_save_images(images): 277 | """ 278 | Plot the images in the specified layout and save as 1080P PNG. 279 | 280 | Parameters: 281 | images (list of numpy arrays): List of 10 images to be displayed. 282 | frame_id (int): The frame ID to be used in the filename. 283 | output_folder (str): The folder where the output images will be saved. 284 | """ 285 | if len(images) != 10: 286 | raise ValueError("The function expects exactly 10 images.") 287 | 288 | # Create a figure with 1920x1080 resolution 289 | fig = plt.figure( 290 | figsize=(19.2, 10.8), dpi=100 291 | ) # figsize in inches, dpi=100 for 1920x1080 pixels 292 | 293 | # Create a GridSpec with 3 rows and 4 columns 294 | gs = fig.add_gridspec(3, 4, height_ratios=[1, 1, 1.5]) 295 | 296 | # Plot the first 8 images in a 2x4 grid 297 | for i in range(8): 298 | ax = fig.add_subplot(gs[i // 4, i % 4]) 299 | ax.imshow(images[i]) 300 | ax.axis("off") # Hide the axes 301 | 302 | # Plot the 9th image on the bottom left 303 | ax = fig.add_subplot(gs[2, :2]) 304 | ax.imshow(images[8]) 305 | ax.axis("off") # Hide the axes 306 | 307 | # Plot the 10th image on the bottom right 308 | ax = fig.add_subplot(gs[2, 2:]) 309 | ax.imshow(images[9]) 310 | ax.axis("off") # Hide the axes 311 | 312 | # Display the plot 313 | plt.tight_layout() 314 | plt.show() 315 | -------------------------------------------------------------------------------- /hocap_toolkit/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .common_imports import * 2 | from .io import * 3 | from .cv_utils import * 4 | from .transforms import * 5 | from .mano_info import NEW_MANO_FACES, NUM_MANO_VERTS, NUM_MANO_FACES 6 | 7 | 8 | def add_path(path): 9 | if str(path) not in sys.path: 10 | sys.path.insert(0, str(path)) 11 | 12 | 13 | def get_logger(log_name="HOCapToolkit", log_level="INFO", log_file=None): 14 | """Create and return a logger with console and optional file output.""" 15 | logger = logging.getLogger(log_name) 16 | logger.setLevel(logging.DEBUG) 17 | formatter = logging.Formatter( 18 | "[%(asctime)s] [%(name)s:%(funcName)s] [%(levelname).3s] %(message)s", 19 | datefmt="%Y%m%d;%H:%M:%S", 20 | ) 21 | if not logger.hasHandlers(): 22 | if log_file: 23 | fh = logging.FileHandler(log_file) 24 | fh.setLevel(logging.DEBUG) 25 | fh.setFormatter(formatter) 26 | logger.addHandler(fh) 27 | # Console handler 28 | ch = logging.StreamHandler() 29 | ch.setLevel(getattr(logging, log_level.upper(), logging.INFO)) 30 | ch.setFormatter(formatter) 31 | logger.addHandler(ch) 32 | return logger 33 | -------------------------------------------------------------------------------- /hocap_toolkit/utils/color_info.py: -------------------------------------------------------------------------------- 1 | class RGBA: 2 | def __init__(self, red, green, blue, alpha=255): 3 | """ 4 | Initialize an RGBA color. 
5 | :param red: Red channel (0-255) 6 | :param green: Green channel (0-255) 7 | :param blue: Blue channel (0-255) 8 | :param alpha: Alpha channel (0-255), default is 255 (opaque) 9 | """ 10 | self.red = red 11 | self.green = green 12 | self.blue = blue 13 | self.alpha = alpha 14 | 15 | def __str__(self): 16 | return "({},{},{},{})".format(self.red, self.green, self.blue, self.alpha) 17 | 18 | @property 19 | def hex(self): 20 | """Return the hexadecimal representation of the color.""" 21 | return "#{:02X}{:02X}{:02X}".format(self.red, self.green, self.blue) 22 | 23 | @property 24 | def rgba(self): 25 | """Return a tuple of the RGBA values.""" 26 | return (self.red, self.green, self.blue, self.alpha) 27 | 28 | @property 29 | def rgb(self): 30 | """Return a tuple of the RGB values.""" 31 | return (self.red, self.green, self.blue) 32 | 33 | @property 34 | def bgra(self): 35 | """Return a tuple of the BGRA values (Blue, Green, Red, Alpha).""" 36 | return (self.blue, self.green, self.red, self.alpha) 37 | 38 | @property 39 | def bgr(self): 40 | """Return a tuple of the BGR values (Blue, Green, Red).""" 41 | return (self.blue, self.green, self.red) 42 | 43 | @property 44 | def rgba_norm(self): 45 | """Return normalized RGBA values (0 to 1).""" 46 | return ( 47 | self.red / 255.0, 48 | self.green / 255.0, 49 | self.blue / 255.0, 50 | self.alpha / 255.0, 51 | ) 52 | 53 | @property 54 | def rgb_norm(self): 55 | """Return normalized RGB values (0 to 1).""" 56 | return (self.red / 255.0, self.green / 255.0, self.blue / 255.0) 57 | 58 | @property 59 | def bgra_norm(self): 60 | """Return normalized BGRA values (0 to 1).""" 61 | return ( 62 | self.blue / 255.0, 63 | self.green / 255.0, 64 | self.red / 255.0, 65 | self.alpha / 255.0, 66 | ) 67 | 68 | @property 69 | def bgr_norm(self): 70 | """Return normalized BGR values (0 to 1).""" 71 | return (self.blue / 255.0, self.green / 255.0, self.red / 255.0) 72 | 73 | 74 | COLORS = { 75 | "red": RGBA(255, 0, 0), 76 | "dark_red": RGBA(139, 0, 0), 77 | "green": RGBA(0, 255, 0), 78 | "dark_green": RGBA(0, 100, 0), 79 | "blue": RGBA(0, 0, 255), 80 | "yellow": RGBA(255, 255, 0), 81 | "magenta": RGBA(255, 0, 255), 82 | "cyan": RGBA(0, 255, 255), 83 | "orange": RGBA(255, 165, 0), 84 | "purple": RGBA(128, 0, 128), 85 | "brown": RGBA(165, 42, 42), 86 | "pink": RGBA(255, 192, 203), 87 | "lime": RGBA(0, 255, 0), 88 | "navy": RGBA(0, 0, 128), 89 | "teal": RGBA(0, 128, 128), 90 | "olive": RGBA(128, 128, 0), 91 | "maroon": RGBA(128, 0, 0), 92 | "coral": RGBA(255, 127, 80), 93 | "turquoise": RGBA(64, 224, 208), 94 | "indigo": RGBA(75, 0, 130), 95 | "violet": RGBA(238, 130, 238), 96 | "gold": RGBA(255, 215, 0), 97 | "skin": RGBA(255, 219, 172), 98 | "white": RGBA(255, 255, 255), 99 | "black": RGBA(0, 0, 0), 100 | "gray": RGBA(128, 128, 128), 101 | "darkgray": RGBA(64, 64, 64), 102 | "lightgray": RGBA(211, 211, 211), 103 | "tomato": RGBA(255, 99, 71), 104 | "deepskyblue": RGBA(0, 128, 255), 105 | # Tab10 colors 106 | "tab10_0": RGBA(31, 119, 180), 107 | "tab10_1": RGBA(255, 127, 14), 108 | "tab10_2": RGBA(44, 160, 44), 109 | "tab10_3": RGBA(214, 39, 40), 110 | "tab10_4": RGBA(148, 103, 189), 111 | "tab10_5": RGBA(140, 86, 75), 112 | "tab10_6": RGBA(227, 119, 194), 113 | "tab10_7": RGBA(127, 127, 127), 114 | "tab10_8": RGBA(188, 189, 34), 115 | "tab10_9": RGBA(23, 190, 207), 116 | } 117 | 118 | # RGB colors for Object classes 119 | OBJ_CLASS_COLORS = [ 120 | COLORS["black"], # background 121 | COLORS["tab10_0"], # object 1 122 | COLORS["tab10_1"], # object 2 123 | 
COLORS["tab10_2"], # object 3 124 | COLORS["tab10_3"], # object 4 125 | ] 126 | 127 | # RGB colors for Hands 128 | HAND_COLORS = [ 129 | COLORS["black"], # background 130 | COLORS["tab10_5"], # right hand 131 | COLORS["tab10_8"], # left hand 132 | ] 133 | 134 | # RGB colors for HOCap Dataset Segmentation 135 | HO_CAP_SEG_COLOR = [ 136 | COLORS["black"], # background 137 | OBJ_CLASS_COLORS[1], # object 1 138 | OBJ_CLASS_COLORS[2], # object 2 139 | OBJ_CLASS_COLORS[3], # object 3 140 | OBJ_CLASS_COLORS[4], # object 4 141 | HAND_COLORS[1], # right hand 142 | HAND_COLORS[2], # left hand 143 | ] 144 | 145 | # RGB colors for Hand Bones 146 | HAND_BONE_COLORS = [ 147 | # Palm connections 148 | COLORS["gray"], # (0, 1) 149 | COLORS["gray"], # (0, 5) 150 | COLORS["gray"], # (0, 17) 151 | COLORS["gray"], # (5, 9) 152 | COLORS["gray"], # (9, 13) 153 | COLORS["gray"], # (13, 17) 154 | # Thumb 155 | COLORS["red"], # (1, 2) 156 | COLORS["red"], # (2, 3) 157 | COLORS["red"], # (3, 4) 158 | # Index 159 | COLORS["green"], # (5, 6) 160 | COLORS["green"], # (6, 7) 161 | COLORS["green"], # (7, 8) 162 | # Middle 163 | COLORS["blue"], # (9, 10) 164 | COLORS["blue"], # (10, 11) 165 | COLORS["blue"], # (11, 12) 166 | # Ring 167 | COLORS["yellow"], # (13, 14) 168 | COLORS["yellow"], # (14, 15) 169 | COLORS["yellow"], # (15, 16) 170 | # Pinky 171 | COLORS["pink"], # (17, 18) 172 | COLORS["pink"], # (18, 19) 173 | COLORS["pink"], # (19, 20) 174 | ] 175 | 176 | # RGB colors for Hand Joints 177 | HAND_JOINT_COLORS = [ 178 | # Wrist (root) 179 | COLORS["black"], # 0 180 | # Thumb joints 181 | COLORS["red"], # 1 182 | COLORS["red"], # 2 183 | COLORS["red"], # 3 184 | COLORS["red"], # 4 185 | # Index joints 186 | COLORS["green"], # 5 187 | COLORS["green"], # 6 188 | COLORS["green"], # 7 189 | COLORS["green"], # 8 190 | # Middle joints 191 | COLORS["blue"], # 9 192 | COLORS["blue"], # 10 193 | COLORS["blue"], # 11 194 | COLORS["blue"], # 12 195 | # Ring joints 196 | COLORS["yellow"], # 13 197 | COLORS["yellow"], # 14 198 | COLORS["yellow"], # 15 199 | COLORS["yellow"], # 16 200 | # Pinky joints 201 | COLORS["pink"], # 17 202 | COLORS["pink"], # 18 203 | COLORS["pink"], # 19 204 | COLORS["pink"], # 20 205 | ] 206 | -------------------------------------------------------------------------------- /hocap_toolkit/utils/common_imports.py: -------------------------------------------------------------------------------- 1 | # Standard library imports 2 | import os 3 | import sys 4 | import gc 5 | import time 6 | import math 7 | import json 8 | import shutil 9 | import pickle as pkl 10 | import argparse 11 | import itertools 12 | import multiprocessing 13 | from pathlib import Path 14 | from typing import List, Tuple, Dict, Any, Union, Optional 15 | import logging 16 | import concurrent.futures 17 | 18 | # Third-party libraries 19 | from ruamel.yaml import YAML 20 | from tqdm import tqdm 21 | import numpy as np 22 | from scipy.spatial.transform import Rotation as R 23 | from scipy.interpolate import interp1d, CubicSpline 24 | import cv2 25 | import matplotlib.pyplot as plt 26 | from matplotlib.gridspec import GridSpec 27 | from matplotlib.patches import Circle, Rectangle 28 | from matplotlib.lines import Line2D 29 | import open3d as o3d 30 | import open3d.core as o3c 31 | import trimesh 32 | import pyrender 33 | import av 34 | import torch 35 | 36 | yaml = YAML() 37 | yaml.default_flow_style = False 38 | yaml.indent(mapping=2, sequence=4, offset=2) 39 | 
-------------------------------------------------------------------------------- /hocap_toolkit/utils/cv_utils.py: -------------------------------------------------------------------------------- 1 | from .common_imports import * 2 | from .color_info import ( 3 | COLORS, 4 | OBJ_CLASS_COLORS, 5 | HAND_COLORS, 6 | HAND_BONE_COLORS, 7 | HAND_JOINT_COLORS, 8 | HO_CAP_SEG_COLOR, 9 | ) 10 | from .mano_info import HAND_BONES 11 | 12 | 13 | def _apply_morphology( 14 | mask: np.ndarray, operation: str, kernel_size: int = 3, iterations: int = 1 15 | ) -> np.ndarray: 16 | """Helper function to apply a morphological operation (erode/dilate) on the mask.""" 17 | if mask.ndim not in [2, 3]: 18 | raise ValueError("Mask must be a 2D or 3D numpy array.") 19 | if kernel_size <= 1: 20 | raise ValueError("Kernel size must be greater than 1.") 21 | kernel = np.ones((kernel_size, kernel_size), np.uint8) 22 | mask_dtype = mask.dtype 23 | mask = mask.astype(np.uint8) 24 | if operation == "erode": 25 | result = cv2.erode(mask, kernel, iterations=iterations) 26 | elif operation == "dilate": 27 | result = cv2.dilate(mask, kernel, iterations=iterations) 28 | else: 29 | raise ValueError(f"Invalid operation: {operation}. Use 'erode' or 'dilate'.") 30 | return result.astype(mask_dtype) 31 | 32 | 33 | def _plot_image(ax, image, name, facecolor, titlecolor, fontsize): 34 | """Helper function to plot an image in the grid.""" 35 | if image.ndim == 3 and image.shape[2] == 3: # RGB image 36 | ax.imshow(image) 37 | elif image.ndim == 2 and image.dtype == np.uint8: # Grayscale/mask image 38 | unique_values = np.unique(image) 39 | cmap = "tab10" if len(unique_values) <= 10 else "gray" 40 | ax.imshow(image, cmap=cmap) 41 | elif image.ndim == 2 and image.dtype == bool: # Binary image 42 | ax.imshow(image, cmap="gray") 43 | else: # Depth or other image 44 | ax.imshow(image, cmap="viridis") 45 | 46 | if name: 47 | ax.text( 48 | 5, 49 | 5, 50 | name, 51 | fontsize=fontsize, 52 | color=titlecolor, 53 | verticalalignment="top", 54 | horizontalalignment="left", 55 | bbox=dict(facecolor=facecolor, alpha=0.5, edgecolor="none", pad=3), 56 | ) 57 | 58 | 59 | def erode_mask( 60 | mask: np.ndarray, kernel_size: int = 3, iterations: int = 1 61 | ) -> np.ndarray: 62 | """Apply erosion to the mask.""" 63 | return _apply_morphology( 64 | mask, operation="erode", kernel_size=kernel_size, iterations=iterations 65 | ) 66 | 67 | 68 | def dilate_mask( 69 | mask: np.ndarray, kernel_size: int = 3, iterations: int = 1 70 | ) -> np.ndarray: 71 | """Apply dilation to the mask.""" 72 | return _apply_morphology( 73 | mask, operation="dilate", kernel_size=kernel_size, iterations=iterations 74 | ) 75 | 76 | 77 | def get_depth_colormap(image: np.ndarray) -> np.ndarray: 78 | """Convert a depth image to a colormap representation.""" 79 | if image.ndim != 2: 80 | raise ValueError("Input image must be a 2D array.") 81 | d_min, d_max = image.min(), image.max() 82 | if d_min == d_max: 83 | return np.zeros_like(image, dtype=np.uint8) 84 | # Normalize the depth image to range [0, 255] 85 | img = (image - d_min) / (d_max - d_min) * 255 86 | img = img.astype(np.uint8) 87 | img = cv2.applyColorMap(img, cv2.COLORMAP_VIRIDIS) 88 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 89 | return img 90 | 91 | 92 | def draw_image_overlay( 93 | rgb_image: np.ndarray, overlay_image: np.ndarray, alpha: float = 0.5 94 | ) -> np.ndarray: 95 | """Draw an overlay image on top of an RGB image.""" 96 | return cv2.addWeighted(rgb_image, 1 - alpha, overlay_image, alpha, 0) 97 | 98 | 99 | def 
draw_image_grid( 100 | images, 101 | names=None, 102 | figsize=(1920, 1080), 103 | max_cols=4, 104 | facecolor="white", 105 | titlecolor="black", 106 | fontsize=12, 107 | bar_width=0.2, 108 | ): 109 | """Display a list of images in a grid and draw the title name on each image's top-left corner.""" 110 | num_images = len(images) 111 | if num_images == 0: 112 | raise ValueError("No images provided to display.") 113 | num_cols = min(num_images, max_cols) 114 | num_rows = (num_images + num_cols - 1) // num_cols 115 | # Default to no names if not provided 116 | if names is None or len(names) != num_images: 117 | names = [None] * num_images 118 | # Create figure and axis grid 119 | fig, axs = plt.subplots( 120 | num_rows, 121 | num_cols, 122 | figsize=(figsize[0] / 100.0, figsize[1] / 100.0), 123 | dpi=100, 124 | facecolor=facecolor, 125 | ) 126 | axs = np.atleast_1d(axs).flat # Ensure axs is always iterable 127 | # Plot each image 128 | for i, (image, name) in enumerate(zip(images, names)): 129 | _plot_image(axs[i], image, name, facecolor, titlecolor, fontsize) 130 | axs[i].axis("off") 131 | # Hide unused axes 132 | for ax in axs[i + 1 :]: 133 | ax.axis("off") 134 | # Adjust layout and spacing 135 | plt.tight_layout(pad=bar_width, h_pad=bar_width, w_pad=bar_width) 136 | # Convert the figure to an RGB array 137 | fig.canvas.draw() 138 | rgb_image = np.array(fig.canvas.buffer_rgba())[:, :, :3] 139 | # Close the figure 140 | plt.close(fig) 141 | return rgb_image 142 | 143 | 144 | def draw_hand_landmarks(rgb_image, landmarks, hand_side=None, box=None): 145 | """Draw hand landmarks on an image.""" 146 | img = rgb_image.copy() 147 | # draw bones 148 | for idx, bone in enumerate(HAND_BONES): 149 | if np.any(landmarks[bone[0]] == -1) or np.any(landmarks[bone[1]] == -1): 150 | continue 151 | cv2.line( 152 | img, 153 | landmarks[bone[0]], 154 | landmarks[bone[1]], 155 | HAND_BONE_COLORS[idx].rgb, 156 | 2, 157 | ) 158 | # draw joints 159 | for idx, mark in enumerate(landmarks): 160 | if np.any(mark == -1): 161 | continue 162 | cv2.circle(img, mark, 5, [255, 255, 255], -1) 163 | cv2.circle( 164 | img, 165 | mark, 166 | 3, 167 | HAND_JOINT_COLORS[idx].rgb, 168 | -1, 169 | ) 170 | 171 | # draw hand box 172 | if box is not None: 173 | cv2.rectangle(img, (box[0], box[1]), (box[2], box[3]), (0, 255, 0), 2) 174 | 175 | # draw hand side text 176 | if hand_side is not None: 177 | text = hand_side.lower() 178 | text_x = np.min(landmarks[:, 0]) 179 | text_y = np.min(landmarks[:, 1]) - 5 # add margin to top 180 | text_color = HAND_COLORS[1] if text == "right" else HAND_COLORS[2] 181 | cv2.putText( 182 | img, 183 | text, 184 | (text_x, text_y), 185 | cv2.FONT_HERSHEY_DUPLEX, 186 | 1, 187 | text_color.rgb, 188 | 1, 189 | cv2.LINE_AA, 190 | ) 191 | return img 192 | 193 | 194 | def draw_all_camera_images( 195 | images, 196 | names=None, 197 | figsize=(1920, 1080), 198 | facecolor="white", 199 | titlecolor="black", 200 | fontsize=12, 201 | bar_width=0.2, 202 | show_only=False, 203 | ): 204 | """Draw nine images in a grid (8 from RealSense cameras and 1 from HoloLens) in a 3x4 layout. 205 | 206 | Args: 207 | images (list of np.ndarray): List of 9 images to be displayed. 208 | names (list of str, optional): List of image names to display on top-left. Defaults to None. 209 | figsize (tuple, optional): Figure size in pixels. Defaults to (1920, 1080). 210 | facecolor (str, optional): Background color of the figure. Defaults to "white". 211 | titlecolor (str, optional): Color of the image titles. Defaults to "black". 
212 | fontsize (int, optional): Font size for the image titles. Defaults to 12. 213 | bar_width (float, optional): Padding between subplots. Defaults to 0.2. 214 | 215 | Returns: 216 | np.ndarray: The final figure rendered as an RGB image, or None if show_only is True. 217 | """ 218 | num_images = len(images) 219 | if num_images != 9: 220 | raise ValueError(f"Expected exactly 9 images, but got {num_images}.") 221 | if names is None: 222 | names = [None] * num_images 223 | if len(names) != num_images: 224 | raise ValueError( 225 | f"Number of 'names' must match the number of images. Expected 9, but got {len(names)}." 226 | ) 227 | fig = plt.figure( 228 | figsize=(figsize[0] / 100.0, figsize[1] / 100.0), dpi=100, facecolor=facecolor 229 | ) 230 | gs = GridSpec(3, 4, figure=fig) 231 | # Plot the first eight images in a 2x4 grid 232 | for i in range(8): 233 | row, col = divmod(i, 4) # Divide by 4 to get row, modulo 4 to get column 234 | ax = fig.add_subplot(gs[row, col]) 235 | _plot_image(ax, images[i], names[i], facecolor, titlecolor, fontsize) 236 | ax.axis("off") 237 | # Plot the ninth image in the third row, spanning columns 1 and 2 238 | center_ax = fig.add_subplot(gs[2, 1:3]) 239 | _plot_image(center_ax, images[8], names[8], facecolor, titlecolor, fontsize) 240 | center_ax.axis("off") 241 | # Adjust layout and spacing 242 | plt.tight_layout(pad=bar_width, h_pad=bar_width, w_pad=bar_width) 243 | if show_only: 244 | plt.show() 245 | plt.close(fig) 246 | return 247 | 248 | # Convert figure to RGB image 249 | fig.canvas.draw() 250 | rgb_image = np.array(fig.canvas.buffer_rgba())[:, :, :3] 251 | # Close the figure to free memory 252 | plt.close(fig) 253 | return rgb_image 254 | 255 | 256 | def get_rgb_difference(rgb1, rgb2, scale=255.0): 257 | """Compute the normalized L2 error between two RGB images.""" 258 | # Convert to float32 and normalize 259 | im1 = rgb1.astype(np.float32) / scale 260 | im2 = rgb2.astype(np.float32) / scale 261 | # Compute the normalized L2 error 262 | diff = np.sqrt(np.mean((im1 - im2) ** 2)) 263 | return diff 264 | 265 | 266 | def get_mask_iou(mask1, mask2): 267 | """Compute Intersection over Union (IoU) between two binary masks.""" 268 | # Convert to boolean masks 269 | m1 = mask1.astype(bool) 270 | m2 = mask2.astype(bool) 271 | # Compute intersection and union 272 | intersection = np.logical_and(m1, m2).sum() 273 | union = np.logical_or(m1, m2).sum() 274 | # Calculate IoU score 275 | score = intersection / union if union != 0 else 0.0 276 | return score 277 | 278 | 279 | def get_mask_dice_coefficient(mask1, mask2): 280 | """Compute Dice coefficient between two binary masks.""" 281 | # Convert to boolean masks 282 | m1 = mask1.astype(bool) 283 | m2 = mask2.astype(bool) 284 | # Compute intersection and sum of masks 285 | intersection = np.logical_and(m1, m2).sum() 286 | sum_masks = m1.sum() + m2.sum() 287 | # Calculate Dice coefficient 288 | score = 2 * intersection / sum_masks if sum_masks != 0 else 0.0 289 | return score 290 | 291 | 292 | def create_video_from_rgb_images( 293 | file_path: Union[str, Path], rgb_images: List[np.ndarray], fps: int = 30 294 | ) -> None: 295 | """Create a video from a list of RGB images.""" 296 | if not rgb_images: 297 | raise ValueError("The list of RGB images is empty.") 298 | height, width = rgb_images[0].shape[:2] 299 | container = None 300 | try: 301 | container = av.open(str(file_path), mode="w") 302 | stream = container.add_stream("h264", rate=fps) 303 | stream.width = width 304 | stream.height = height 305 | stream.pix_fmt = "yuv420p" 306 | stream.thread_type = "FRAME" # 
Parallel processing of frames 307 | stream.thread_count = os.cpu_count() # Number of threads to use 308 | for image in rgb_images: 309 | frame = av.VideoFrame.from_ndarray(image, format="rgb24") 310 | for packet in stream.encode(frame): 311 | container.mux(packet) 312 | for packet in stream.encode(): 313 | container.mux(packet) 314 | except Exception as e: 315 | raise IOError(f"Failed to write video to '{file_path}': {e}") 316 | finally: 317 | if container: 318 | container.close() 319 | 320 | 321 | def create_video_from_depth_images( 322 | file_path: Union[str, Path], depth_images: list[np.ndarray], fps: int = 30 323 | ) -> None: 324 | """Create a video from a list of depth images.""" 325 | # Validate image dimensions 326 | height, width = depth_images[0].shape[:2] 327 | container = None 328 | try: 329 | container = av.open(str(file_path), mode="w") 330 | stream = container.add_stream("h264", rate=fps) 331 | stream.width = width 332 | stream.height = height 333 | stream.pix_fmt = "yuv420p" 334 | stream.thread_type = "FRAME" # Parallel processing of frames 335 | stream.thread_count = os.cpu_count() # Number of threads to use 336 | 337 | for depth_image in depth_images: 338 | image = get_depth_colormap(depth_image) 339 | frame = av.VideoFrame.from_ndarray(image, format="rgb24") 340 | for packet in stream.encode(frame): 341 | container.mux(packet) 342 | for packet in stream.encode(): 343 | container.mux(packet) 344 | except Exception as e: 345 | raise IOError(f"Failed to write video to '{file_path}': {e}") 346 | finally: 347 | if container: 348 | container.close() 349 | 350 | 351 | def create_video_from_image_files( 352 | file_path: Union[str, Path], 353 | image_files: List[Union[str, Path]], 354 | fps: int = 30, 355 | preload: bool = False, 356 | ) -> None: 357 | """Create a video from a list of image files (RGB or Depth images). 358 | 359 | Args: 360 | file_path (str | Path): Path to save the output video. 361 | image_files (list[str | Path]): List of image file paths. 362 | fps (int, optional): Frames per second for the video. Defaults to 30. 363 | preload (bool, optional): Preload all images into memory before creating the video. Defaults to False. 
364 | """ 365 | 366 | def worker_read_image_file(image_file): 367 | """Helper to read the image file, handle depth images, and return an RGB image.""" 368 | img = cv2.imread(str(image_file), cv2.IMREAD_UNCHANGED) 369 | if img is None: 370 | raise ValueError(f"Failed to read image file: {image_file}") 371 | # If depth image (2D), apply colormap, otherwise assume it's an RGB image 372 | if img.ndim == 2: 373 | img = get_depth_colormap(img) 374 | return cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 375 | 376 | if not image_files: 377 | raise ValueError("The list of image files is empty.") 378 | 379 | # Load all images into memory if preload is True 380 | if preload: 381 | images = [None] * len(image_files) 382 | with concurrent.futures.ThreadPoolExecutor() as executor: 383 | futures = { 384 | executor.submit(worker_read_image_file, image_file): i 385 | for i, image_file in enumerate(image_files) 386 | } 387 | for future in concurrent.futures.as_completed(futures): 388 | i = futures[future] 389 | try: 390 | images[i] = future.result() 391 | except Exception as e: 392 | raise ValueError(f"Error loading image: {e}") 393 | else: 394 | images = None 395 | 396 | first_image = worker_read_image_file(image_files[0]) 397 | height, width = first_image.shape[:2] 398 | container = None 399 | try: 400 | container = av.open(str(file_path), mode="w") 401 | stream = container.add_stream("h264", rate=fps) 402 | stream.width = width 403 | stream.height = height 404 | stream.pix_fmt = "yuv420p" 405 | stream.thread_type = "FRAME" # Parallel processing of frames 406 | stream.thread_count = os.cpu_count() # Number of threads to use 407 | for i in range(len(image_files)): 408 | image = images[i] if preload else worker_read_image_file(image_files[i]) 409 | frame = av.VideoFrame.from_ndarray(image, format="rgb24") 410 | for packet in stream.encode(frame): 411 | container.mux(packet) 412 | for packet in stream.encode(): 413 | container.mux(packet) 414 | except Exception as e: 415 | raise IOError(f"Failed to write video to '{file_path}': {e}") 416 | finally: 417 | if container: 418 | container.close() 419 | 420 | 421 | def write_points_to_ply( 422 | points: np.ndarray, save_path: Union[str, Path], colors: np.ndarray = None 423 | ) -> None: 424 | """Write a point cloud to a PLY file.""" 425 | if colors is None: # Default to green color 426 | colors = np.tile([0, 1, 0], (points.shape[0], 1)).astype(np.float32) 427 | pcd = o3d.geometry.PointCloud() 428 | pcd.points = o3d.utility.Vector3dVector(points) 429 | pcd.colors = o3d.utility.Vector3dVector(colors) 430 | o3d.io.write_point_cloud(str(save_path), pcd, write_ascii=True) 431 | 432 | 433 | def read_points_from_ply(file_path: Union[str, Path]) -> np.ndarray: 434 | """Read a point cloud from a PLY file.""" 435 | pcd = o3d.io.read_point_cloud(str(file_path)) 436 | points = np.asarray(pcd.points, dtype=np.float32) 437 | return points 438 | 439 | 440 | def get_xyz_from_uvd(u, v, d, fx, fy, cx, cy): 441 | if d == 0: # Handle division by zero 442 | return [0.0, 0.0, 0.0] 443 | x = (u - cx) * d / fx 444 | y = (v - cy) * d / fy 445 | z = d 446 | return [x, y, z] 447 | 448 | 449 | def get_uv_from_xyz(x, y, z, fx, fy, cx, cy): 450 | if z == 0: # Prevent division by zero 451 | return [-1.0, -1.0] 452 | u = x * fx / z + cx 453 | v = y * fy / z + cy 454 | return [u, v] 455 | 456 | 457 | def get_bbox_from_landmarks(landmarks, width, height, margin=3): 458 | """Get the xyxy bounding box from hand landmarks.""" 459 | # Filter landmarks where both x and y are valid (i.e., not -1) 460 | marks = 
np.array(landmarks) 461 | valid_mask = ~np.all(marks == -1, axis=1) 462 | if valid_mask.sum() == 0: 463 | # If no valid landmarks, return a full image bounding box 464 | return [-1, -1, -1, -1] 465 | # Get the bounding box using cv2.boundingRect 466 | x, y, w, h = cv2.boundingRect(marks[valid_mask]) 467 | bbox = np.array([x, y, x + w, y + h]) 468 | # Apply margin while ensuring the bounding box stays within image bounds 469 | bbox[0] = max(0, bbox[0] - margin) 470 | bbox[1] = max(0, bbox[1] - margin) 471 | bbox[2] = min(width - 1, bbox[2] + margin) 472 | bbox[3] = min(height - 1, bbox[3] + margin) 473 | return bbox.astype(int).tolist() 474 | 475 | 476 | def get_bbox_from_mask(mask, margin=3): 477 | """Get the xyxy bounding box from a binary mask.""" 478 | height, width = mask.shape[:2] 479 | if not np.any(mask): 480 | return [-1.0, -1.0, -1.0, -1.0] 481 | x, y, w, h = cv2.boundingRect(mask.astype(np.uint8)) 482 | bbox = np.array([x, y, x + w, y + h]) 483 | bbox[0] = max(0, bbox[0] - margin) 484 | bbox[1] = max(0, bbox[1] - margin) 485 | bbox[2] = min(width - 1, bbox[2] + margin) 486 | bbox[3] = min(height - 1, bbox[3] + margin) 487 | return bbox.astype(float).tolist() 488 | 489 | 490 | def get_mask_from_seg_image(seg_img, color_to_idx_map): 491 | H, W, _ = seg_img.shape 492 | flat_seg_img = seg_img.reshape(-1, 3) 493 | flat_mask_img = np.zeros((H * W), dtype=np.uint8) 494 | for color, idx in color_to_idx_map.items(): 495 | matching_pixels = np.all(flat_seg_img == color, axis=1) 496 | flat_mask_img[matching_pixels] = idx 497 | mask_img = flat_mask_img.reshape(H, W) 498 | return mask_img 499 | 500 | 501 | def draw_debug_image( 502 | rgb_image, 503 | hand_mask=None, 504 | object_mask=None, 505 | prompt_points=None, 506 | prompt_labels=None, 507 | hand_marks=None, 508 | alpha=0.5, 509 | draw_boxes=False, 510 | draw_hand_sides=False, 511 | reduce_background=False, 512 | ): 513 | """ 514 | Draws debug information on an RGB image. 515 | 516 | Args: 517 | rgb_image (np.ndarray): The original RGB image. 518 | hand_mask (np.ndarray, optional): Mask of the hands. 519 | object_mask (np.ndarray, optional): Mask of the objects. 520 | prompt_points (list, optional): Points to be drawn on the image. 521 | prompt_labels (list, optional): Labels for the prompt points. 522 | hand_marks (list, optional): Hand landmark points. 523 | alpha (float, optional): Transparency factor for overlay. Defaults to 0.5. 524 | reduce_background (bool, optional): Whether to reduce the background visibility. Defaults to False. 525 | draw_boxes (bool, optional): Whether to draw bounding boxes around hands and objects. Defaults to False. 526 | draw_hand_sides (bool, optional): Whether to draw text indicating left/right hand. Defaults to False. 527 | 528 | Returns: 529 | np.ndarray: The image with debug information drawn on it. 
530 | """ 531 | height, width = rgb_image.shape[:2] 532 | overlay = np.zeros_like(rgb_image) if reduce_background else rgb_image.copy() 533 | 534 | def apply_mask(mask, colors): 535 | for label in np.unique(mask): 536 | if label == 0: 537 | continue 538 | overlay[mask == label] = colors[label].rgb 539 | 540 | def draw_boxes_from_mask(mask, colors): 541 | for label in np.unique(mask): 542 | if label == 0: 543 | continue 544 | box = get_bbox_from_mask(mask == label) 545 | cv2.rectangle( 546 | overlay, (int(box[0]), int(box[1])), (int(box[2]), int(box[3])), colors[label].rgb, 2 547 | ) 548 | 549 | # Draw hand mask 550 | if hand_mask is not None: 551 | apply_mask(hand_mask, HAND_COLORS) 552 | 553 | # Draw object mask 554 | if object_mask is not None: 555 | apply_mask(object_mask, OBJ_CLASS_COLORS) 556 | 557 | # Draw bounding boxes 558 | if draw_boxes: 559 | if hand_mask is not None: 560 | draw_boxes_from_mask(hand_mask, HAND_COLORS) 561 | if object_mask is not None: 562 | draw_boxes_from_mask(object_mask, OBJ_CLASS_COLORS) 563 | 564 | # Draw prompt points 565 | if prompt_points is not None and prompt_labels is not None: 566 | points = np.array(prompt_points, dtype=np.int32).reshape(-1, 2) 567 | labels = np.array(prompt_labels, dtype=np.int32).reshape(-1) 568 | for point, label in zip(points, labels): 569 | color = COLORS["dark_red"] if label == 0 else COLORS["dark_green"] 570 | cv2.circle(overlay, tuple(point), 3, color.rgb, -1) 571 | 572 | overlay = cv2.addWeighted(rgb_image, 1 - alpha, overlay, alpha, 0) 573 | 574 | # Draw hand sides 575 | if draw_hand_sides and hand_mask is not None and hand_marks is None: 576 | for label in np.unique(hand_mask): 577 | if label == 0: 578 | continue 579 | mask = hand_mask == label 580 | color = HAND_COLORS[label] 581 | text = "right" if label == 1 else "left" 582 | x, y, _, _ = cv2.boundingRect(mask.astype(np.uint8)) 583 | cv2.putText( 584 | overlay, 585 | text, 586 | (x, y - 5), 587 | cv2.FONT_HERSHEY_DUPLEX, 588 | 1, 589 | color.rgb, 590 | 1, 591 | cv2.LINE_AA, 592 | ) 593 | 594 | # Draw hand landmarks 595 | if hand_marks is not None: 596 | for ind, marks in enumerate(hand_marks): 597 | if np.all(marks == -1): 598 | continue 599 | 600 | # Draw bones 601 | for bone_idx, (start, end) in enumerate(HAND_BONES): 602 | if np.any(marks[start] == -1) or np.any(marks[end] == -1): 603 | continue 604 | color = HAND_BONE_COLORS[bone_idx] 605 | cv2.line(overlay, tuple(marks[start]), tuple(marks[end]), color.rgb, 2) 606 | 607 | # Draw joints 608 | for i, mark in enumerate(marks): 609 | if np.any(mark == -1): 610 | continue 611 | color = HAND_JOINT_COLORS[i] 612 | cv2.circle(overlay, tuple(mark), 5, (255, 255, 255), -1) 613 | cv2.circle(overlay, tuple(mark), 3, color.rgb, -1) 614 | 615 | if draw_boxes: 616 | box = get_bbox_from_landmarks(marks, width, height, margin=10) 617 | color = HAND_COLORS[1] if ind == 0 else HAND_COLORS[2] 618 | cv2.rectangle(overlay, (box[0], box[1]), (box[2], box[3]), color.rgb, 2) 619 | 620 | if draw_hand_sides: 621 | text = "right" if ind == 0 else "left" 622 | color = HAND_COLORS[1] if ind == 0 else HAND_COLORS[2] 623 | x, y, _, _ = cv2.boundingRect( 624 | np.array([m for m in marks if np.all(m != -1)], dtype=np.int64) 625 | ) 626 | cv2.putText( 627 | overlay, 628 | text, 629 | (x, y - 11), 630 | cv2.FONT_HERSHEY_DUPLEX, 631 | .8, 632 | color.rgb, 633 | 1, 634 | cv2.LINE_AA, 635 | ) 636 | 637 | return overlay 638 | -------------------------------------------------------------------------------- /hocap_toolkit/utils/io.py: 
-------------------------------------------------------------------------------- 1 | from .common_imports import * 2 | 3 | 4 | def make_clean_folder(folder_path: Union[str, Path]) -> None: 5 | """Delete the folder if it exists and create a new one.""" 6 | if Path(folder_path).is_dir(): 7 | shutil.rmtree(str(folder_path)) 8 | try: 9 | Path(folder_path).mkdir(parents=True, exist_ok=True) 10 | except OSError as e: 11 | raise OSError(f"Failed to create folder '{folder_path}': {e}") 12 | 13 | 14 | def read_data_from_json(file_path: Union[str, Path]) -> Any: 15 | """Read data from a JSON file and return it.""" 16 | if not Path(file_path).is_file(): 17 | raise FileNotFoundError(f"File not found: {file_path}") 18 | try: 19 | with open(str(file_path), "r", encoding="utf-8") as f: 20 | return json.load(f) 21 | except json.JSONDecodeError as e: 22 | raise ValueError(f"Error parsing JSON from {file_path}: {e}") 23 | 24 | 25 | def write_data_to_json(file_path: Union[str, Path], data: Union[list, Dict]) -> None: 26 | """Write data to a JSON file.""" 27 | try: 28 | with open(str(file_path), "w", encoding="utf-8") as f: 29 | json.dump(data, f, indent=2, ensure_ascii=False, sort_keys=False) 30 | except IOError as e: 31 | raise IOError(f"Failed to write JSON data to {file_path}: {e}") 32 | 33 | 34 | def read_data_from_yaml(file_path: Union[str, Path]) -> Any: 35 | """Read data from a YAML file and return it.""" 36 | if not Path(file_path).is_file(): 37 | raise FileNotFoundError(f"File not found: {file_path}") 38 | try: 39 | with open(str(file_path), "r", encoding="utf-8") as f: 40 | return yaml.load(f) 41 | except FileNotFoundError: 42 | raise FileNotFoundError(f"File not found: {file_path}") 43 | except Exception as e: 44 | raise ValueError(f"Error reading YAML file from {file_path}: {e}") 45 | 46 | 47 | def write_data_to_yaml(file_path: Union[str, Path], data: Any) -> None: 48 | """Write data to a YAML file.""" 49 | try: 50 | with open(str(file_path), "w", encoding="utf-8") as f: 51 | yaml.dump(data, f) 52 | except IOError as e: 53 | raise IOError(f"Failed to write YAML data to {file_path}: {e}") 54 | 55 | 56 | def read_rgb_image(file_path: Union[str, Path]) -> np.ndarray: 57 | """Read an RGB image from the specified file path.""" 58 | if not Path(file_path).exists(): 59 | raise FileNotFoundError(f"Image file '{file_path}' does not exist.") 60 | image = cv2.imread(str(file_path)) 61 | if image is None: 62 | raise ValueError(f"Failed to load image from '{file_path}'.") 63 | image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) 64 | return image 65 | 66 | 67 | def write_rgb_image(file_path: Union[str, Path], image: np.ndarray) -> None: 68 | """Write an RGB image to the specified file path.""" 69 | if image.ndim != 3 or image.shape[2] != 3: 70 | raise ValueError("Input image must be an RGB image with 3 channels.") 71 | success = cv2.imwrite(str(file_path), cv2.cvtColor(image, cv2.COLOR_RGB2BGR)) 72 | if not success: 73 | raise ValueError(f"Failed to write RGB image to '{file_path}'.") 74 | 75 | 76 | def read_depth_image(file_path: Union[str, Path], scale: float = 1.0) -> np.ndarray: 77 | """Read a depth image from the specified file path.""" 78 | if not Path(file_path).exists(): 79 | raise FileNotFoundError(f"Depth image file '{file_path}' does not exist.") 80 | image = cv2.imread(str(file_path), cv2.IMREAD_ANYDEPTH) 81 | if image is None: 82 | raise ValueError(f"Failed to load depth image from '{file_path}'.") 83 | image = image.astype(np.float32) / scale 84 | return image 85 | 86 | 87 | def 
write_depth_image(file_path: Union[str, Path], image: np.ndarray) -> None: 88 | """Write a depth image to the specified file path.""" 89 | if image.dtype not in [np.uint16, np.uint8]: 90 | raise ValueError("Depth image must be of type uint16 or uint8.") 91 | success = cv2.imwrite(str(file_path), image) 92 | if not success: 93 | raise ValueError(f"Failed to write depth image to '{file_path}'.") 94 | 95 | 96 | def read_mask_image(file_path: Union[str, Path]) -> np.ndarray: 97 | """Read a mask image from the specified file path.""" 98 | if not Path(file_path).exists(): 99 | raise FileNotFoundError(f"Mask image file '{file_path}' does not exist.") 100 | image = cv2.imread(str(file_path), cv2.IMREAD_GRAYSCALE) 101 | if image is None: 102 | raise ValueError(f"Failed to load mask image from '{file_path}'.") 103 | return image 104 | 105 | 106 | def write_mask_image(file_path: Union[str, Path], image: np.ndarray) -> None: 107 | """Write a mask image to the specified file path.""" 108 | success = cv2.imwrite(str(file_path), image) 109 | if not success: 110 | raise ValueError(f"Failed to write mask image to '{file_path}'.") 111 | -------------------------------------------------------------------------------- /hocap_toolkit/utils/mano_info.py: -------------------------------------------------------------------------------- 1 | """MediaPipe Hands connections and MANO hand model enhancements.""" 2 | 3 | # Connections for the hand palm, thumb, and fingers 4 | HAND_PALM_CONNECTIONS = ((0, 1), (0, 5), (0, 17), (5, 9), (9, 13), (13, 17)) 5 | HAND_THUMB_CONNECTIONS = ((1, 2), (2, 3), (3, 4)) 6 | HAND_INDEX_FINGER_CONNECTIONS = ((5, 6), (6, 7), (7, 8)) 7 | HAND_MIDDLE_FINGER_CONNECTIONS = ((9, 10), (10, 11), (11, 12)) 8 | HAND_RING_FINGER_CONNECTIONS = ((13, 14), (14, 15), (15, 16)) 9 | HAND_PINKY_FINGER_CONNECTIONS = ((17, 18), (18, 19), (19, 20)) 10 | 11 | # All hand bone connections combined 12 | HAND_BONES = ( 13 | HAND_PALM_CONNECTIONS 14 | + HAND_THUMB_CONNECTIONS 15 | + HAND_INDEX_FINGER_CONNECTIONS 16 | + HAND_MIDDLE_FINGER_CONNECTIONS 17 | + HAND_RING_FINGER_CONNECTIONS 18 | + HAND_PINKY_FINGER_CONNECTIONS 19 | ) 20 | 21 | # Hand joint names as per the typical skeleton structure 22 | HAND_JOINT_NAMES = ( 23 | "WRIST", 24 | "THUMB_CMC", 25 | "THUMB_MCP", 26 | "THUMB_IP", 27 | "THUMB_TIP", 28 | "INDEX_MCP", 29 | "INDEX_PIP", 30 | "INDEX_DIP", 31 | "INDEX_TIP", 32 | "MIDDLE_MCP", 33 | "MIDDLE_PIP", 34 | "MIDDLE_DIP", 35 | "MIDDLE_TIP", 36 | "RING_MCP", 37 | "RING_PIP", 38 | "RING_DIP", 39 | "RING_TIP", 40 | "PINKY_MCP", 41 | "PINKY_PIP", 42 | "PINKY_DIP", 43 | "PINKY_TIP", 44 | ) 45 | 46 | # Parent-child relationships of hand joints (index refers to HAND_JOINT_NAMES) 47 | # -1 indicates no parent (root node) 48 | HAND_JOINT_PARENTS = [ 49 | -1, # WRIST 50 | 0, # THUMB_CMC 51 | 1, # THUMB_MCP 52 | 2, # THUMB_IP 53 | 3, # THUMB_TIP 54 | 0, # INDEX_MCP 55 | 5, # INDEX_PIP 56 | 6, # INDEX_DIP 57 | 7, # INDEX_TIP 58 | 0, # MIDDLE_MCP 59 | 9, # MIDDLE_PIP 60 | 10, # MIDDLE_DIP 61 | 11, # MIDDLE_TIP 62 | 0, # RING_MCP 63 | 13, # RING_PIP 64 | 14, # RING_DIP 65 | 15, # RING_TIP 66 | 0, # PINKY_MCP 67 | 17, # PINKY_PIP 68 | 18, # PINKY_DIP 69 | 19, # PINKY_TIP 70 | ] 71 | 72 | # Additional faces added to the MANO hand mesh for watertightness 73 | NEW_MANO_FACES = { 74 | "right": [ 75 | [92, 38, 234], 76 | [234, 38, 239], 77 | [38, 122, 239], 78 | [239, 122, 279], 79 | [122, 118, 279], 80 | [279, 118, 215], 81 | [118, 117, 215], 82 | [215, 117, 214], 83 | [117, 119, 214], 84 | [214, 119, 121], 85 | [119, 120, 
121], 86 | [121, 120, 78], 87 | [120, 108, 78], 88 | [78, 108, 79], 89 | ], 90 | "left": [ 91 | [234, 38, 92], 92 | [239, 38, 234], 93 | [239, 122, 38], 94 | [279, 122, 239], 95 | [279, 118, 122], 96 | [215, 118, 279], 97 | [215, 117, 118], 98 | [214, 117, 215], 99 | [214, 119, 117], 100 | [121, 119, 214], 101 | [121, 120, 119], 102 | [78, 120, 121], 103 | [78, 108, 120], 104 | [79, 108, 78], 105 | ], 106 | } 107 | 108 | # Number of vertices and faces in the MANO model 109 | NUM_MANO_VERTS = 778 110 | NUM_MANO_FACES = 1538 111 | -------------------------------------------------------------------------------- /hocap_toolkit/utils/misc.py: -------------------------------------------------------------------------------- 1 | from .common_imports import Union, Path, sys, logging, Optional, json, os 2 | 3 | 4 | def add_path(path): 5 | if str(path) not in sys.path: 6 | sys.path.insert(0, str(path)) 7 | 8 | 9 | def get_logger(log_name="HOCapToolkit", log_level="INFO", log_file=None): 10 | """Create and return a logger with console and optional file output.""" 11 | logger = logging.getLogger(log_name) 12 | logger.setLevel(logging.DEBUG) 13 | formatter = logging.Formatter( 14 | "[%(asctime)s] [%(name)s:%(funcName)s] [%(levelname).3s] %(message)s", 15 | datefmt="%Y%m%d;%H:%M:%S", 16 | ) 17 | if not logger.hasHandlers(): 18 | if log_file: 19 | fh = logging.FileHandler(log_file) 20 | fh.setLevel(logging.DEBUG) 21 | fh.setFormatter(formatter) 22 | logger.addHandler(fh) 23 | # Console handler 24 | ch = logging.StreamHandler() 25 | ch.setLevel(getattr(logging, log_level.upper(), logging.INFO)) 26 | ch.setFormatter(formatter) 27 | logger.addHandler(ch) 28 | return logger 29 | -------------------------------------------------------------------------------- /hocap_toolkit/utils/transforms.py: -------------------------------------------------------------------------------- 1 | from .common_imports import * 2 | 3 | 4 | def average_quats(quats: np.ndarray) -> np.ndarray: 5 | """ 6 | Calculate the average quaternion from a set of quaternions. 7 | 8 | Args: 9 | quats (np.ndarray): An array of quaternions of shape (N, 4), where N is the number of quaternions. 10 | 11 | Returns: 12 | np.ndarray: The averaged quaternion of shape (4,). 13 | """ 14 | if not isinstance(quats, np.ndarray) or quats.shape[-1] != 4: 15 | raise ValueError("Input must be a numpy array of shape (N, 4).") 16 | 17 | rotations = R.from_quat(quats) 18 | avg_quat = rotations.mean().as_quat().astype(np.float32) 19 | return avg_quat 20 | 21 | 22 | def normalize_quats(qs: np.ndarray) -> np.ndarray: 23 | """ 24 | Normalize quaternions to have unit length. 25 | 26 | Args: 27 | qs (np.ndarray): Input quaternion, shape (4,) or (N, 4) where each quaternion is (qx, qy, qz, qw). 28 | 29 | Returns: 30 | np.ndarray: Normalized quaternion(s), same shape as input. 31 | """ 32 | # Compute the norm of the quaternion 33 | norms = np.linalg.norm(qs, axis=-1, keepdims=True) 34 | if np.any(norms == 0): 35 | raise ValueError("Quaternion norms cannot be zero.") 36 | return qs / norms 37 | 38 | 39 | def rvt_to_quat(rvt: np.ndarray) -> np.ndarray: 40 | """ 41 | Convert rotation vector and translation vector to quaternion and translation vector. 42 | 43 | Args: 44 | rvt (np.ndarray): Rotation vector and translation vector, shape (6,) for single or (N, 6) for batch. 45 | 46 | Returns: 47 | np.ndarray: Quaternion and translation vector, shape (7,) for single or (N, 7) for batch, 48 | in the format [qx, qy, qz, qw, tx, ty, tz]. 
49 | """ 50 | # Ensure the input has the correct shape 51 | if rvt.ndim == 1 and rvt.shape[0] == 6: 52 | rv = rvt[:3] 53 | t = rvt[3:] 54 | q = R.from_rotvec(rv).as_quat() 55 | return np.concatenate([q, t], dtype=np.float32) 56 | 57 | elif rvt.ndim == 2 and rvt.shape[1] == 6: 58 | rv = rvt[:, :3] 59 | t = rvt[:, 3:] 60 | q = R.from_rotvec(rv).as_quat() # Batch process 61 | return np.concatenate([q, t], axis=-1).astype(np.float32) 62 | 63 | else: 64 | raise ValueError("Input must be of shape (6,) or (N, 6).") 65 | 66 | 67 | def quat_to_rvt(quat: np.ndarray) -> np.ndarray: 68 | """ 69 | Convert quaternion and translation vector to rotation vector and translation vector. 70 | 71 | Args: 72 | quat (np.ndarray): Quaternion and translation vector. Shape can be (7,) for single input 73 | or (N, 7) for batched input. 74 | 75 | Returns: 76 | np.ndarray: Rotation vector and translation vector. Shape will be (6,) for single input 77 | or (N, 6) for batched input. 78 | 79 | Raises: 80 | ValueError: If the input does not have the expected shape or dimensions. 81 | """ 82 | # Validate input shape 83 | if not isinstance(quat, np.ndarray): 84 | raise TypeError("Input must be a numpy array.") 85 | 86 | if quat.ndim == 1 and quat.shape[0] == 7: 87 | batch_mode = False 88 | elif quat.ndim == 2 and quat.shape[1] == 7: 89 | batch_mode = True 90 | else: 91 | raise ValueError( 92 | "Input must have shape (7,) for a single quaternion or (N, 7) for a batch of quaternions." 93 | ) 94 | 95 | # Extract quaternion (q) and translation (t) 96 | q = quat[..., :4] # Quaternion (4 elements) 97 | t = quat[..., 4:] # Translation (3 elements) 98 | 99 | # Convert quaternion to rotation vector 100 | r = R.from_quat(q) 101 | rv = r.as_rotvec() # Convert to rotation vector (3 elements) 102 | 103 | # Concatenate rotation vector and translation vector 104 | return np.concatenate([rv, t], axis=-1).astype(np.float32) 105 | 106 | 107 | def rvt_to_mat(rvt: np.ndarray) -> np.ndarray: 108 | """ 109 | Convert rotation vector and translation vector to pose matrix. 110 | 111 | Args: 112 | rvt (np.ndarray): Rotation vector and translation vector, shape (6,) for single or (N, 6) for batch. 113 | 114 | Returns: 115 | np.ndarray: Pose matrix, shape (4, 4) for single or (N, 4, 4) for batch. 116 | """ 117 | # Single input case (shape (6,)) 118 | if rvt.ndim == 1 and rvt.shape[0] == 6: 119 | p = np.eye(4) 120 | rv = rvt[:3] 121 | t = rvt[3:] 122 | r = R.from_rotvec(rv) 123 | p[:3, :3] = r.as_matrix() 124 | p[:3, 3] = t 125 | return p.astype(np.float32) 126 | 127 | # Batched input case (shape (N, 6)) 128 | elif rvt.ndim == 2 and rvt.shape[1] == 6: 129 | N = rvt.shape[0] 130 | p = np.tile(np.eye(4), (N, 1, 1)) # Create an identity matrix for each batch 131 | rv = rvt[:, :3] # Rotation vectors (N, 3) 132 | t = rvt[:, 3:] # Translation vectors (N, 3) 133 | r = R.from_rotvec(rv) 134 | p[:, :3, :3] = r.as_matrix() # Set rotation matrices for each batch 135 | p[:, :3, 3] = t # Set translation vectors for each batch 136 | return p.astype(np.float32) 137 | 138 | else: 139 | raise ValueError("Input must be of shape (6,) or (N, 6).") 140 | 141 | 142 | def mat_to_rvt(mat_4x4: np.ndarray) -> np.ndarray: 143 | """ 144 | Convert pose matrix to rotation vector and translation vector. 145 | 146 | Args: 147 | mat_4x4 (np.ndarray): Pose matrix, shape (4, 4) for single input 148 | or (N, 4, 4) for batched input. 149 | 150 | Returns: 151 | np.ndarray: Rotation vector and translation vector, shape (6,) for single input 152 | or (N, 6) for batched input. 
153 | """ 154 | # Single input case (shape (4, 4)) 155 | if mat_4x4.ndim == 2 and mat_4x4.shape == (4, 4): 156 | r = R.from_matrix(mat_4x4[:3, :3]) 157 | rv = r.as_rotvec() 158 | t = mat_4x4[:3, 3] 159 | return np.concatenate([rv, t], dtype=np.float32) 160 | 161 | # Batched input case (shape (N, 4, 4)) 162 | elif mat_4x4.ndim == 3 and mat_4x4.shape[1:] == (4, 4): 163 | rv = R.from_matrix(mat_4x4[:, :3, :3]).as_rotvec() # Batch process rotations 164 | t = mat_4x4[:, :3, 3] # Batch process translations 165 | return np.concatenate([rv, t], axis=-1).astype(np.float32) 166 | 167 | else: 168 | raise ValueError("Input must be of shape (4, 4) or (N, 4, 4).") 169 | 170 | 171 | def mat_to_quat(mat_4x4: np.ndarray) -> np.ndarray: 172 | """ 173 | Convert pose matrix to quaternion and translation vector. 174 | 175 | Args: 176 | mat_4x4 (np.ndarray): Pose matrix, shape (4, 4) for single input or (N, 4, 4) for batched input. 177 | 178 | Returns: 179 | np.ndarray: Quaternion and translation vector, shape (7,) for single input or (N, 7) for batched input. 180 | 181 | Raises: 182 | ValueError: If the input does not have the expected shape or dimensions. 183 | """ 184 | if not isinstance(mat_4x4, np.ndarray) or mat_4x4.shape[-2:] != (4, 4): 185 | raise ValueError("Input must be a numpy array with shape (4, 4) or (N, 4, 4).") 186 | 187 | if mat_4x4.ndim == 2: # Single matrix (shape (4, 4)) 188 | r = R.from_matrix(mat_4x4[:3, :3]) 189 | q = r.as_quat() # Quaternion (shape (4,)) 190 | t = mat_4x4[:3, 3] # Translation (shape (3,)) 191 | return np.concatenate([q, t], dtype=np.float32) 192 | 193 | elif mat_4x4.ndim == 3: # Batch of matrices (shape (N, 4, 4)) 194 | r = R.from_matrix(mat_4x4[:, :3, :3]) # Handle batch of rotation matrices 195 | q = r.as_quat() # Quaternions (shape (N, 4)) 196 | t = mat_4x4[:, :3, 3] # Translations (shape (N, 3)) 197 | return np.concatenate([q, t], axis=-1).astype(np.float32) # Shape (N, 7) 198 | 199 | else: 200 | raise ValueError("Input dimension is not valid. Must be 2D or 3D.") 201 | 202 | 203 | def quat_to_mat(quat: np.ndarray) -> np.ndarray: 204 | """ 205 | Convert quaternion and translation vector to a pose matrix. 206 | 207 | This function supports converting a single quaternion or a batch of quaternions. 208 | 209 | Args: 210 | quat (np.ndarray): Quaternion and translation vector. Shape can be (7,) for a single quaternion 211 | or (N, 7) for a batch of quaternions, where N is the batch size. 212 | 213 | Returns: 214 | np.ndarray: Pose matrix. Shape will be (4, 4) for a single quaternion or (N, 4, 4) for a batch of quaternions. 215 | 216 | Raises: 217 | ValueError: If the input does not have the expected shape or dimensions. 218 | """ 219 | # Validate input shape 220 | if not isinstance(quat, np.ndarray): 221 | raise TypeError("Input must be a numpy array.") 222 | 223 | if quat.ndim == 1 and quat.shape[0] == 7: 224 | batch_mode = False 225 | elif quat.ndim == 2 and quat.shape[1] == 7: 226 | batch_mode = True 227 | else: 228 | raise ValueError( 229 | "Input must have shape (7,) for a single quaternion or (N, 7) for a batch of quaternions." 
230 |     )
231 | 
232 |     # Extract quaternion (q) and translation (t)
233 |     q = quat[..., :4]  # Quaternion (4 elements)
234 |     t = quat[..., 4:]  # Translation (3 elements)
235 | 
236 |     # Prepare the pose matrix
237 |     if batch_mode:
238 |         N = quat.shape[0]
239 |         p = np.tile(np.eye(4), (N, 1, 1))  # Create N identity matrices
240 |     else:
241 |         p = np.eye(4)  # Single identity matrix
242 | 
243 |     # Convert quaternion to rotation matrix and fill in the pose matrix
244 |     r = R.from_quat(q)
245 |     p[..., :3, :3] = r.as_matrix()  # Fill rotation part
246 |     p[..., :3, 3] = t  # Fill translation part
247 | 
248 |     return p.astype(np.float32)
249 | 
250 | 
251 | def quat_distance(
252 |     q1: np.ndarray, q2: np.ndarray, in_degree: bool = False
253 | ) -> Union[float, np.ndarray]:
254 |     """
255 |     Calculate the shortest angular distance between paired quaternions.
256 | 
257 |     Args:
258 |         q1 (np.ndarray): First quaternion(s), shape (4,) or (N, 4).
259 |         q2 (np.ndarray): Second quaternion(s), shape (4,) or (N, 4).
260 |         in_degree (bool): If True, return the distance in degrees; otherwise in radians.
261 |     Returns:
262 |         float or np.ndarray: Angular distance in radians (degrees if in_degree is True), scalar if single pair, array if multiple pairs.
263 |     """
264 |     # Validate input shapes
265 |     if q1.ndim not in {1, 2} or q2.ndim not in {1, 2}:
266 |         raise ValueError("q1 and q2 must be 1D or 2D arrays.")
267 |     if q1.shape[-1] != 4 or q2.shape[-1] != 4:
268 |         raise ValueError("Each quaternion must have 4 components (qx, qy, qz, qw).")
269 |     if q1.shape != q2.shape:
270 |         raise ValueError("q1 and q2 must have the same shape.")
271 | 
272 |     # Normalize quaternions to ensure they are unit quaternions
273 |     q1 = q1 / np.linalg.norm(q1, axis=-1, keepdims=True)
274 |     q2 = q2 / np.linalg.norm(q2, axis=-1, keepdims=True)
275 | 
276 |     # Compute the dot product between paired quaternions
277 |     dot_product = np.sum(q1 * q2, axis=-1)
278 | 
279 |     # Clamp the dot product to the range [-1, 1] to handle numerical precision issues
280 |     dot_product = np.clip(dot_product, -1.0, 1.0)
281 | 
282 |     # Calculate the shortest angular distance in radians
283 |     angular_distance = 2 * np.arccos(np.abs(dot_product))
284 | 
285 |     # Convert to degrees if needed
286 |     if in_degree:
287 |         return np.degrees(angular_distance)
288 |     return angular_distance
289 | 
290 | 
291 | def trans_distance(t1, t2):
292 |     """Calculate the Euclidean distance between two translation vectors or arrays of translation vectors.
293 | 
294 |     Args:
295 |         t1 (np.ndarray): First translation vector(s) in shape (3,) or (N, 3), where N is the number of vectors.
296 |         t2 (np.ndarray): Second translation vector(s) in shape (3,) or (N, 3), where N is the number of vectors.
297 | 
298 |     Returns:
299 |         float or np.ndarray: Euclidean distance. Returns a scalar if inputs are 1D vectors, or an array of distances if inputs are 2D arrays.
300 |     Raises:
301 |         ValueError: If the inputs are not valid translation vectors or if their shapes are incompatible.
302 |     """
303 | 
304 |     # Ensure both inputs are NumPy arrays
305 |     t1 = np.asarray(t1, dtype=np.float32)
306 |     t2 = np.asarray(t2, dtype=np.float32)
307 | 
308 |     # Check if the shapes of t1 and t2 are compatible
309 |     if t1.shape != t2.shape:
310 |         raise ValueError(
311 |             f"Shape mismatch: t1.shape {t1.shape} and t2.shape {t2.shape} must be the same."
312 | ) 313 | 314 | # Check for valid shapes: (3,) for a single vector or (N, 3) for multiple vectors 315 | if t1.shape[-1] != 3: 316 | raise ValueError("Each translation vector must have 3 components (tx, ty, tz).") 317 | 318 | # Compute Euclidean distance 319 | return np.linalg.norm(t1 - t2, axis=-1) 320 | 321 | 322 | def angular_difference(q1: np.ndarray, q2: np.ndarray) -> Union[float, np.ndarray]: 323 | """ 324 | Calculate the angular difference in degrees between two quaternions or arrays of quaternions. 325 | 326 | Args: 327 | q1 (np.ndarray): First quaternion(s) in [qx, qy, qz, qw] or [N, qx, qy, qz, qw] format. 328 | q2 (np.ndarray): Second quaternion(s) in [qx, qy, qz, qw] or [N, qx, qy, qz, qw] format. 329 | 330 | Returns: 331 | float or np.ndarray: Angular difference in degrees, scalar if single pair or array if multiple pairs. 332 | """ 333 | dim = q1.ndim 334 | if dim == 1: 335 | q1 = q1 / np.linalg.norm(q1) 336 | q2 = q2 / np.linalg.norm(q2) 337 | else: 338 | q1 = q1 / np.linalg.norm(q1, axis=1, keepdims=True) 339 | q2 = q2 / np.linalg.norm(q2, axis=1, keepdims=True) 340 | 341 | q1 = R.from_quat(q1) 342 | q2 = R.from_quat(q2) 343 | delta_q = q1.inv() * q2 344 | delta_q_quat = delta_q.as_quat() 345 | 346 | if dim == 1: 347 | if delta_q_quat[3] < 0: 348 | delta_q_quat = -delta_q_quat 349 | else: 350 | negative_indices = delta_q_quat[:, 3] < 0 351 | delta_q_quat[negative_indices] = -delta_q_quat[negative_indices] 352 | 353 | if dim == 1: 354 | angular_diff = 2 * np.arccos(np.clip(delta_q_quat[3], -1.0, 1.0)) 355 | else: 356 | angular_diff = 2 * np.arccos(np.clip(delta_q_quat[:, 3], -1.0, 1.0)) 357 | 358 | return np.degrees(angular_diff) 359 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=64.0", "torch>=2.3.1"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "hocap-toolkit" 7 | version = "1.0.0" 8 | description = "Python package providing evaluation and visualization tools for the HoCap dataset." 
9 | requires-python = ">=3.10" 10 | license = { "text" = "GPL-3.0" } 11 | authors = [ 12 | { name = "Jikai Wang", email = "jikai.wang@utdallas.edu" } 13 | ] 14 | 15 | dependencies = [ 16 | "numpy>=1.26.4,<2", 17 | "scipy>=1.13.1", 18 | "matplotlib>=3.9.1", 19 | "ruamel.yaml>=0.18.5", 20 | "tqdm>=4.66.4", 21 | "ninja>=1.11.1.1", 22 | "opencv-python>=4.7.0", 23 | "open3d>=0.18.0", 24 | "av>=12.2.0", 25 | "pyglet<2", 26 | "trimesh==4.4.1", 27 | "pyrender==0.1.45", 28 | "pyOpenGL>=3.1.0", 29 | "pyopengl-accelerate>=3.1.0; sys_platform != 'darwin'", 30 | "mediapipe==0.10.14", 31 | "gdown>=5.2.0", 32 | "pycocotools>=2.0.7", 33 | "chumpy @ git+https://github.com/gobanana520/chumpy.git", 34 | "manopth @ git+https://github.com/gobanana520/manopth.git" 35 | ] 36 | 37 | [tool.setuptools.packages] 38 | find = { include = ["hocap_toolkit"] } 39 | -------------------------------------------------------------------------------- /results/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | -------------------------------------------------------------------------------- /tools/hocap_dataset_split.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from hocap_toolkit.factory import HOCapFactory 3 | 4 | 5 | if __name__ == "__main__": 6 | # Parse arguments 7 | parser = argparse.ArgumentParser(description="Split HOCAP dataset") 8 | parser.add_argument( 9 | "--task", 10 | type=str, 11 | choices=["hpe", "odet", "ope"], 12 | required=True, 13 | help="Dataset task (hpe, odet, ope)", 14 | ) 15 | parser.add_argument( 16 | "--anno_type", 17 | type=str, 18 | default="coco", 19 | choices=["coco", "yolo"], 20 | help="Annotation type for odet (coco, yolo)", 21 | ) 22 | args = parser.parse_args() 23 | 24 | factory = HOCapFactory() 25 | 26 | if args.task == "hpe": 27 | factory.create_hpe_dataset() 28 | 29 | if args.task == "odet": 30 | factory.create_odet_dataset(args.anno_type) 31 | 32 | if args.task == "ope": 33 | factory.create_ope_dataset() 34 | -------------------------------------------------------------------------------- /tools/hocap_downloader.py: -------------------------------------------------------------------------------- 1 | import zipfile 2 | import requests 3 | from hocap_toolkit.utils import * 4 | 5 | PROJ_ROOT = Path(__file__).parent.parent 6 | 7 | 8 | def download_box_file(box_link, save_file_path): 9 | output_path = Path(save_file_path) 10 | resume_header = {} 11 | downloaded_size = 0 12 | 13 | with requests.get(box_link, headers=resume_header, stream=True) as response: 14 | # Check if the request was successful 15 | if response.status_code == 200: 16 | total_size = int(response.headers.get("content-length", 0)) 17 | else: 18 | print(f"Failed to retrieve file info. 
Status code: {response.status_code}") 19 | return 20 | 21 | if output_path.exists(): 22 | downloaded_size = output_path.stat().st_size 23 | # Check if there's a partial download and get its size 24 | resume_header = {"Range": f"bytes={downloaded_size}-"} 25 | 26 | # Check if the file is already fully downloaded 27 | if downloaded_size == total_size: 28 | tqdm.write(f" ** {output_path.name} is already downloaded.") 29 | return 30 | 31 | # Send a GET request with the range header if needed 32 | with requests.get(box_link, headers=resume_header, stream=True) as response: 33 | # Check if the request was successful 34 | if response.status_code in [200, 206]: 35 | # Initialize tqdm progress bar 36 | with tqdm( 37 | total=total_size, 38 | initial=downloaded_size, 39 | unit="B", 40 | unit_scale=True, 41 | ncols=80, 42 | ) as pbar: 43 | # Download the file in chunks 44 | with output_path.open("ab") as file: 45 | for chunk in response.iter_content( 46 | chunk_size=1024 * 1024 47 | ): # 1 MB chunks 48 | if chunk: 49 | file.write(chunk) 50 | pbar.update(len(chunk)) 51 | else: 52 | print(f"Failed to download file. Status code: {response.status_code}") 53 | 54 | 55 | def unzip_file(zip_file, output_dir): 56 | zip_file = Path(zip_file) 57 | output_dir = Path(output_dir) 58 | 59 | if not output_dir.exists(): 60 | output_dir.mkdir(parents=True) 61 | 62 | with zipfile.ZipFile(zip_file, "r") as zip_ref: 63 | zip_ref.extractall(output_dir) 64 | 65 | 66 | def main(): 67 | dataset_files = read_data_from_yaml(PROJ_ROOT / "config/hocap_recordings.yaml") 68 | 69 | tqdm.write(f"- Downloading 'calibration.zip'...") 70 | download_box_file( 71 | dataset_files["calibration"], PROJ_ROOT / "datasets/calibration.zip" 72 | ) 73 | 74 | tqdm.write(f"- Downloading 'models.zip'...") 75 | download_box_file(dataset_files["models"], PROJ_ROOT / "datasets/models.zip") 76 | 77 | tqdm.write(f"- Downloading 'poses.zip'...") 78 | download_box_file(dataset_files["poses"], PROJ_ROOT / "datasets/poses.zip") 79 | 80 | tqdm.write(f"- Downloading 'labels.zip'...") 81 | download_box_file(dataset_files["labels"], PROJ_ROOT / "datasets/labels.zip") 82 | 83 | subject_ids = ( 84 | [f"subject_{i}" for i in range(1, 10)] 85 | if args.subject_id == "all" 86 | else [args.subject_id] 87 | ) 88 | 89 | for subject_id in subject_ids: 90 | tqdm.write(f"- Downloading '{subject_id}.zip'...") 91 | download_box_file( 92 | dataset_files[subject_id], PROJ_ROOT / "datasets" / f"{subject_id}.zip" 93 | ) 94 | 95 | # Extract the downloaded zip files 96 | zip_files = list(PROJ_ROOT.glob("datasets/*.zip")) 97 | tqdm.write(f"- Extracting downloaded zip files...") 98 | for zip_file in zip_files: 99 | tqdm.write(f" ** Extracting '{zip_file.name}'...") 100 | unzip_file(zip_file, zip_file.parent) 101 | 102 | 103 | if __name__ == "__main__": 104 | parser = argparse.ArgumentParser(description="Download dataset files") 105 | parser.add_argument( 106 | "--subject_id", 107 | type=str, 108 | default="all", 109 | choices=[ 110 | "all", 111 | "subject_1", 112 | "subject_2", 113 | "subject_3", 114 | "subject_4", 115 | "subject_5", 116 | "subject_6", 117 | "subject_7", 118 | "subject_8", 119 | "subject_9", 120 | ], 121 | help="The subject id to download", 122 | ) 123 | args = parser.parse_args() 124 | 125 | main() 126 | --------------------------------------------------------------------------------
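The pose utilities in hocap_toolkit/utils/transforms.py are the pieces most likely to be called directly from user code. The short sketch below shows one way to chain them; it relies only on the signatures defined above (rvt_to_mat, mat_to_quat, quat_distance, trans_distance) and assumes the package has been installed so that hocap_toolkit is importable. It is an illustrative sketch, not a file from the repository.

# Illustrative sketch only (not part of the repository): exercises the pose
# helpers defined in hocap_toolkit/utils/transforms.py, assuming the package
# is installed and importable as `hocap_toolkit`.
import numpy as np

from hocap_toolkit.utils.transforms import (
    mat_to_quat,
    quat_distance,
    rvt_to_mat,
    trans_distance,
)

# A toy pose: 90-degree rotation about Z plus a small translation,
# encoded as [rx, ry, rz, tx, ty, tz] (rotation vector + translation).
rvt = np.array([0.0, 0.0, np.pi / 2, 0.10, 0.00, 0.05], dtype=np.float32)

# (6,) rotation-vector pose -> (4, 4) homogeneous pose matrix.
pose_mat = rvt_to_mat(rvt)

# (4, 4) pose matrix -> (7,) [qx, qy, qz, qw, tx, ty, tz].
pose_quat = mat_to_quat(pose_mat)

# Compare a pose against itself: the rotation error is 0 (radians by default,
# pass in_degree=True for degrees) and the translation error is 0.
rot_err = quat_distance(pose_quat[:4], pose_quat[:4])
tra_err = trans_distance(pose_quat[4:], pose_quat[4:])
print(f"rotation error: {rot_err:.6f} rad, translation error: {tra_err:.6f}")

# The same helpers accept batched input.
batch_rvt = np.stack([rvt, rvt], axis=0)  # shape (2, 6)
batch_mat = rvt_to_mat(batch_rvt)         # shape (2, 4, 4)
batch_quat = mat_to_quat(batch_mat)       # shape (2, 7)
print(batch_quat.shape)                   # -> (2, 7)

The I/O helpers in hocap_toolkit/utils/io.py follow the same error-handling pattern (read_* raises on missing or unreadable files, write_* raises on failure), so they compose naturally with these transforms when post-processing pose annotations.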