├── .gitmodules ├── LICENSE.md ├── README.md ├── data └── .gitkeep ├── data_preprocess ├── rgbd_dog │ ├── detect_dog_mask2former.py │ └── preprocess.py ├── robot │ └── preprocess.py └── zju │ ├── detect_person.py │ ├── diff.patch │ ├── preprocess.py │ └── read_smpl.py ├── environment.yml ├── figures ├── reconstruction_rotate_motion_0_0.gif ├── repose_rotate_0_0.gif └── robot_example.jpg ├── install.sh └── src ├── confs ├── atlas.yml ├── atlas_merge.yml ├── baxter.yml ├── baxter_merge.yml ├── cassie.yml ├── cassie_merge.yml ├── default.yml ├── dog.yml ├── dog_merge.yml ├── iiwa.yml ├── iiwa_merge.yml ├── nao.yml ├── nao_merge.yml ├── pandas.yml ├── pandas_merge.yml ├── spot.yml ├── spot_merge.yml ├── zju366.yml ├── zju366_merge.yml ├── zju377.yml ├── zju377_merge.yml ├── zju381.yml ├── zju381_merge.yml ├── zju384.yml ├── zju384_merge.yml ├── zju387.yml └── zju387_merge.yml ├── datasets └── dataset.py ├── demo_notebook.ipynb ├── models ├── decoder.py ├── loss.py └── model.py ├── train_single_video.py ├── utils ├── config.py ├── get_args.py ├── graph_utils.py ├── model_utils.py ├── render_utils.py ├── sdf_utils.py ├── train_utils.py ├── trainer.py └── visualization_utils.py ├── validation ├── SMPL_regression.py ├── lpips_ssim.py └── reconstruction.py └── visualize ├── create_reconstruction_video.py ├── create_repose_video.py ├── part_merging.py ├── repose_configs ├── cassie.yml ├── iiwa.yml └── spot.yml └── repose_person_by_driving_pose.py /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "data_preprocess/rgbd_dog/RGBD-Dog"] 2 | path = data_preprocess/rgbd_dog/RGBD-Dog 3 | url = git@github.com:CAMERA-Bath/RGBD-Dog.git 4 | [submodule "data_preprocess/rgbd_dog/RGBD_Dog"] 5 | path = data_preprocess/rgbd_dog/RGBD_Dog 6 | url = git@github.com:CAMERA-Bath/RGBD-Dog.git 7 | [submodule "data_preprocess/zju/AdeliDet"] 8 | path = data_preprocess/zju/AdeliDet 9 | url = git@github.com:aim-uofa/AdelaiDet.git 10 | [submodule "data_preprocess/rgbd_dog/Mask2Former"] 11 | path = data_preprocess/rgbd_dog/Mask2Former 12 | url = git@github.com:facebookresearch/Mask2Former.git 13 | [submodule "data_preprocess/zju/EasyMocap"] 14 | path = data_preprocess/zju/EasyMocap 15 | url = git@github.com:zju3dv/EasyMocap.git 16 | [submodule "AdelaiDet"] 17 | path = AdelaiDet 18 | url = https://github.com/aim-uofa/AdelaiDet.git 19 | [submodule "Mask2Former"] 20 | path = Mask2Former 21 | url = git@github.com:facebookresearch/Mask2Former.git 22 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | ## NVIDIA License 2 | 3 | ### 1. Definitions 4 | 5 | “Licensor” means any person or entity that distributes its Work. 6 | “Work” means (a) the original work of authorship made available under this license, which may include software, documentation, or other files, and (b) any additions to or derivative works thereof that are made available under this license. 7 | The terms “reproduce,” “reproduction,” “derivative works,” and “distribution” have the meaning as provided under U.S. copyright law; provided, however, that for the purposes of this license, derivative works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work. 
8 | Works are “made available” under this license by including in or with the Work either (a) a copyright notice referencing the applicability of this license to the Work, or (b) a copy of this license. 9 | 10 | ### 2. License Grant 11 | 12 | 2.1 Copyright Grant. Subject to the terms and conditions of this license, each Licensor grants to you a perpetual, worldwide, non-exclusive, royalty-free, copyright license to use, reproduce, prepare derivative works of, publicly display, publicly perform, sublicense and distribute its Work and any resulting derivative works in any form. 13 | 14 | ### 3. Limitations 15 | 16 | 3.1 Redistribution. You may reproduce or distribute the Work only if (a) you do so under this license, (b) you include a complete copy of this license with your distribution, and (c) you retain without modification any copyright, patent, trademark, or attribution notices that are present in the Work. 17 | 18 | 3.2 Derivative Works. You may specify that additional or different terms apply to the use, reproduction, and distribution of your derivative works of the Work (“Your Terms”) only if (a) Your Terms provide that the use limitation in Section 3.3 applies to your derivative works, and (b) you identify the specific derivative works that are subject to Your Terms. Notwithstanding Your Terms, this license (including the redistribution requirements in Section 3.1) will continue to apply to the Work itself. 19 | 20 | 3.3 Use Limitation. The Work and any derivative works thereof only may be used or intended for use non-commercially. Notwithstanding the foregoing, NVIDIA Corporation and its affiliates may use the Work and any derivative works commercially. As used herein, “non-commercially” means for research or evaluation purposes only. 21 | 22 | 3.4 Patent Claims. If you bring or threaten to bring a patent claim against any Licensor (including any claim, cross-claim or counterclaim in a lawsuit) to enforce any patents that you allege are infringed by any Work, then your rights under this license from such Licensor (including the grant in Section 2.1) will terminate immediately. 23 | 24 | 3.5 Trademarks. This license does not grant any rights to use any Licensor’s or its affiliates’ names, logos, or trademarks, except as necessary to reproduce the notices described in this license. 25 | 26 | 3.6 Termination. If you violate any term of this license, then your rights under this license (including the grant in Section 2.1) will terminate immediately. 27 | 28 | ### 4. Disclaimer of Warranty. 29 | 30 | THE WORK IS PROVIDED “AS IS” WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF 31 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER THIS LICENSE. 32 | 33 | ### 5. Limitation of Liability. 34 | 35 | EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK (INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION, LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 
36 | 37 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Watch It Move 2 | 3 | Official implementation of the IEEE/CVF CVPR 2022 paper 4 | 5 | **Watch It Move: Unsupervised Discovery of 3D Joints for Re-Posing of Articulated Objects**\ 6 | Atsuhiro Noguchi, Umar Iqbal, Jonathan Tremblay, Tatsuya Harada, Orazio Gallo\ 7 | [Project page](https://nvlabs.github.io/watch-it-move/) / [Paper](https://arxiv.org/abs/2112.11347) 8 | / [Video](https://www.youtube.com/watch?v=oRnnuCVV89o) 9 | 10 | Abstract: Rendering articulated objects while controlling their poses is critical to applications such as virtual 11 | reality or animation for movies. Manipulating the pose of an object, however, requires the understanding of its 12 | underlying structure, that is, its joints and how they interact with each other. Unfortunately, assuming the structure 13 | to be known, as existing methods do, precludes the ability to work on new object categories. We propose to learn both 14 | the appearance and the structure of previously unseen articulated objects by observing them move from multiple views, 15 | with no joints annotation supervision, or information about the structure. We observe that 3D points that are static 16 | relative to one another should belong to the same part, and that adjacent parts that move relative to each other must be 17 | connected by a joint. To leverage this insight, we model the object parts in 3D as ellipsoids, which allows us to 18 | identify joints. We combine this explicit representation with an implicit one that compensates for the approximation 19 | introduced. We show that our method works for different structures, from quadrupeds, to single-arm robots, to humans. 20 | 21 | ## Table of content 22 | * [Setup](#setup) 23 | * [Steps to replicate the teaser video for spot](#steps-to-replicate-the-teaser-video-for-spot) 24 | * [Steps to train for spot](#steps-to-train-for-spot) 25 | * [The WIM dataset](#the-wim-dataset) 26 | * [Dataset Preprocessing](#dataset-preprocessing) 27 | * [Training](#training) 28 | * [Pretrained Models](#pretrained-models) 29 | * [Demo](#demo) 30 | * [Evaluation (ZJU only)](#evaluation--zju-only-) 31 | * [Visualization](#visualization) 32 | * [Citation](#citation) 33 | 34 | ## Setup 35 | Clone this repository and create the environment. 36 | ```angular2html 37 | git clone --recursive git@github.com:NVlabs/watch-it-move.git 38 | cd watch-it-move 39 | bash install.sh 40 | 41 | # To run the training and rendering examples below, download the data for Spot 42 | mkdir -p data/robots/spot 43 | gdown https://drive.google.com/u/1/uc\?id\=1HNzCa8olJgedpKe6jBCIi-_LffLX9f8R\&export\=download -O data/robots/spot/cache.pickle 44 | ``` 45 | ### Disclaimer 46 | We have only tested the following code on NVIDIA A100 GPUs. 
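### Checking the downloaded data (optional)
The snippet below is a minimal sketch for sanity-checking the `cache.pickle` downloaded above. It assumes the file follows the cache format written by `data_preprocess/robot/preprocess.py` (`img` and `mask` hold blosc-packed arrays, while the camera parameters are plain numpy arrays); it is not part of the training pipeline itself.
```python
# Minimal sketch: peek inside a downloaded cache.pickle.
# Assumes the format produced by data_preprocess/robot/preprocess.py.
import pickle

import blosc

with open("data/robots/spot/cache.pickle", "rb") as f:
    cache = pickle.load(f)

print(sorted(cache.keys()))  # camera_*, frame_id, img, mask, ...
img = blosc.unpack_array(cache["img"][0])    # first frame, (3, H, W) uint8
mask = blosc.unpack_array(cache["mask"][0])  # first mask, (H, W) bool
print(img.shape, mask.shape)
print(cache["camera_intrinsic"][0])          # 3x3 intrinsics of the first sample
```
If the shapes print without errors, the data is in place for the steps below.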
47 | 48 | ## Steps to replicate the teaser video for spot 49 | 50 | ```angular2html 51 | # download pretrained model for spot 52 | mkdir -p data/output/result/spot_merge 53 | gdown https://drive.google.com/u/1/uc\?id\=12_K-x-daAGqvIoDd3tRvRIe0LKLOALyC\&export\=download -O data/output/result/spot_merge/snapshot_latest.pth 54 | 55 | cd /src 56 | # save the reconstruction video 57 | python visualize/create_reconstruction_video.py --exp_name spot_merge 58 | # save the re-pose video 59 | python visualize/create_repose_video.py --exp_name spot_merge --repose_config visualize/repose_configs/spot.yml --rotate 60 | ``` 61 | Videos in mp4 format and a PNG image for each frame will be saved to `/data/output/result/spot_merge/`
64 | 65 |
67 | ## Steps to train for spot 68 | ```angular2html 69 | cd /src 70 | CUDA_VISIBLE_DEVICES=[gpu_id] python train_single_video.py --config confs/spot.yml --default_config confs/default.yml 71 | CUDA_VISIBLE_DEVICES=[gpu_id] python train_single_video.py --config confs/spot_merge.yml --default_config confs/default.yml 72 | ``` 73 | ## The WIM dataset 74 | 75 | 76 | We provide multiview videos for seven different moving robots [here](https://drive.google.com/drive/folders/1i5rWanA8FgVLrWPO4bl0aaGKBYwhY6IQ) (see [LICENSE.md](LICENSE.md) for terms of use). 77 | We provide both raw video data and preprocessed data. Please follow the instructions below to download and preprocess the data. 78 | It includes 1000-frame videos of each moving robot captured from 20 different viewpoints, as well as preprocessed data covering 300 frames from 5 chosen viewpoints. 79 | 80 | 81 | ## Dataset Preprocessing 82 | ### WIM Dataset 83 | - The WIM dataset is available [here](https://drive.google.com/drive/folders/1i5rWanA8FgVLrWPO4bl0aaGKBYwhY6IQ). 84 | - We provide preprocessed data in the directory named [preprocessed](https://drive.google.com/drive/folders/1toiwb06VggqH1FOS9OnKYlqRFk3H6T9g). Download, uncompress, and place it in 85 | ```angular2html 86 | data/robots/<robot_name>/cache.pickle 87 | ``` 88 | - If you want to run the pre-processing on your own, download the tar.gz files from [here](https://drive.google.com/drive/folders/1i5rWanA8FgVLrWPO4bl0aaGKBYwhY6IQ), uncompress them, place them as 89 | ``` 90 | data/robots/<robot_name>/cam_<camera_id>.json 91 | data/robots/<robot_name>/frame_<frame_id>_cam_<camera_id>.png 92 | ``` 93 | and run 94 | ```angular2html 95 | cd /data_preprocess/robot 96 | python preprocess.py --data_root ../../data/robots --robot_name atlas --robot_name baxter --robot_name spot --robot_name cassie --robot_name iiwa --robot_name nao --robot_name pandas 97 | ``` 98 | 99 | ### ZJU MOCAP 100 | 101 | - Requirements (installed by `install.sh`): [Adet](https://github.com/aim-uofa/AdelaiDet), [EasyMocap](https://github.com/zju3dv/EasyMocap) 102 | 103 | - Download the COCO instance segmentation model 104 | named [R_101_dcni3_5x](https://github.com/aim-uofa/AdelaiDet#coco-instance-segmentation-baselines-with-blendmask) from 105 | Adet and copy it to `data_preprocess/zju/R_101_dcni3_5x.pth`. 106 | - Download the [ZJU MOCAP LightStage dataset](https://github.com/zju3dv/EasyMocap#zju-mocap) and copy it in 107 | ``` 108 | /data/ 109 | └── zju_mocap 110 | ├── 366 111 | ├── 377 112 | ├── 381 113 | ├── 384 114 | └── 387 115 | ``` 116 | - Download the SMPL models 117 | following [EasyMocap installation](https://github.com/zju3dv/EasyMocap/blob/master/doc/installation.md). You only need to download the smplx models.
118 | ``` 119 | /data 120 | └── smplx 121 | ├── J_regressor_body25.npy 122 | ├── J_regressor_body25_smplh.txt 123 | ├── J_regressor_body25_smplx.txt 124 | ├── J_regressor_mano_LEFT.txt 125 | ├── J_regressor_mano_RIGHT.txt 126 | └── smplx 127 | ├── SMPLX_FEMALE.pkl 128 | ├── SMPLX_MALE.pkl 129 | └── SMPLX_NEUTRAL.pkl 130 | ``` 131 | - Run 132 | ```angular2html 133 | cd /data_preprocess/zju 134 | python preprocess.py --smpl_model_path ../../data/smplx --zju_path ../../data/zju_mocap --person_id 366 --person_id 377 --person_id 381 --person_id 384 --person_id 387 135 | ``` 136 | 137 | ### Dog dataset 138 | - Requirement (installed by `install.sh`): [mask2former](https://github.com/facebookresearch/Mask2Former) 139 | - Download the `Mask2Former (200 queries)` model from https://github.com/facebookresearch/Mask2Former/blob/main/MODEL_ZOO.md#instance-segmentation and copy it to `/data_preprocess/rgbd_dog/model_final_e5f453.pkl`. 140 | - Download the [RGBD-Dog dataset](https://github.com/CAMERA-Bath/RGBD-Dog) as 141 | ``` 142 | /data/rgbd_dog 143 | └── dog1 144 | └── motion_testSeq 145 | ├── kinect_depth 146 | ├── kinect_rgb 147 | ├── motion_capture 148 | └── sony 149 | ``` 150 | We used `motion_testSeq` for training. 151 | - Run 152 | ```angular2html 153 | cd /data_preprocess/rgbd_dog 154 | python preprocess.py --data_root ../../data/rgbd_dog/dog1/motion_testSeq 155 | ``` 156 | 157 | ## Training 158 | Run the following commands to train the model. Please specify the experiment name in `[exp_name]`. 159 | ``` 160 | cd /src 161 | CUDA_VISIBLE_DEVICES=[gpu_id] python train_single_video.py --config confs/[exp_name].yml --default_config confs/default.yml 162 | ``` 163 | 164 | ## Pretrained Models 165 | Pretrained models for the ZJU mocap, robot, and dog datasets are available [here](https://drive.google.com/drive/folders/1gmkkHXRr5-1w5W-kCSHcsInMY8ODEqyK). 166 | The name of each directory corresponds to the name of a config file under `src/confs`. 167 | Please download and place these directories in `data/output/result`. 168 | ```angular2html 169 | /data/output/result 170 | ├── atlas 171 | │ └── snapshot_latest.pth 172 | ├── baxter 173 | ... 174 | ``` 175 | 176 | ## Demo 177 | Visualization code is available in `/src/demo_notebook.ipynb`. 178 | 179 | ## Evaluation (ZJU only) 180 | 181 | ### LPIPS and SSIM 182 | Calculate LPIPS and SSIM between generated and ground-truth images. 183 | ```angular2html 184 | cd /src 185 | python validation/reconstruction.py --exp_name zju366 --exp_name zju377 186 | python validation/lpips_ssim.py --exp_name zju366 --exp_name zju377 187 | ``` 188 | Results will be saved to `[output_dir]/result/[exp_name]/validation` 189 | 190 | ### Pose Regression 191 | Calculate MPJPE (mm) between ground-truth and regressed joint locations. 192 | ```angular2html 193 | cd /src 194 | python validation/SMPL_regression.py --exp_name zju366 --exp_name zju377 195 | ``` 196 | 197 | ## Visualization 198 | ### Reconstruction video 199 | Results will be saved to `/data/output/result/[exp_name]/reconstruction_...`. 200 | ```angular2html 201 | cd /src 202 | python visualize/create_reconstruction_video.py --exp_name zju366 --exp_name zju377 203 | ``` 204 | 205 | ### Manual re-posing 206 | Results will be saved to `/data/output/result/[exp_name]/repose_...`.
207 | ```angular2html 208 | cd /src 209 | python visualize/create_repose_video.py --exp_name spot --repose_config visualize/repose_configs/spot.yml --rotate 210 | ``` 211 | repose_config (e.g., `/src/visualize/repose_configs/spot.yml`) includes the following parameters: 212 | ```angular2html 213 | camera_id: camera id of the reference frame 214 | frame_id: frame id of the reference frame 215 | root: part id of the root. 216 | first: part id and its rotation in rodrigues form for the first quarter of the video. 217 | second: part id and its rotation in rodrigues form for the next quarter of the video. 218 | ``` 219 | `root`, `first`, and `second` vary depending on the training results, even when trained on the same data. 220 | For your pretrained models, please follow `/src/demo_notebook.ipynb` to adapt them. 221 | 222 | ### Merge Parts 223 | Images of the merged structure will be saved to `/data/output/result/[exp_name]/merge`. 224 | ```angular2html 225 | cd /src 226 | python visualize/part_merging.py --exp_name spot --camera_id 0 227 | ``` 228 | 229 | ### Re-posing by driving frames (ZJU only) 230 | Re-pose the reconstructed person using poses from test frames. 231 | ```angular2html 232 | cd /src 233 | python visualize/repose_person_by_driving_pose.py --exp_name zju366 --camera_id 0 --num_video_frames 50 234 | ``` 235 | Results will be saved to `/data/output/result/[exp_name]/drive_...`. 236 | 237 | # Citation 238 | ```bibtex 239 | @inproceedings{noguchi2022watch, 240 | title = {Watch It Move: {U}nsupervised Discovery of {3D} Joints for Re-Posing of Articulated Objects}, 241 | author = {Atsuhiro Noguchi and Umar Iqbal and Jonathan Tremblay and Tatsuya Harada and Orazio Gallo}, 242 | booktitle = {CVPR}, 243 | year = {2022}, 244 | } 245 | ``` 246 | -------------------------------------------------------------------------------- /data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/watch-it-move/5fdec2b71b07f9d4a5492fb3dad6bfcc7d9a9f8b/data/.gitkeep -------------------------------------------------------------------------------- /data_preprocess/rgbd_dog/detect_dog_mask2former.py: -------------------------------------------------------------------------------- 1 | """ 2 | SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | SPDX-License-Identifier: LicenseRef-NvidiaProprietary 4 | NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 5 | property and proprietary rights in and to this material, related 6 | documentation and any modifications thereto. Any use, reproduction, 7 | disclosure or distribution of this material and related documentation 8 | without an express license agreement from NVIDIA CORPORATION or 9 | its affiliates is strictly prohibited.
10 | """ 11 | 12 | import sys 13 | from typing import Any, List 14 | 15 | import numpy as np 16 | import torch 17 | from detectron2.config import get_cfg 18 | from detectron2.engine.defaults import DefaultPredictor 19 | from detectron2.projects.deeplab import add_deeplab_config 20 | from tqdm import tqdm 21 | 22 | sys.path.append("Mask2Former") 23 | from mask2former import add_maskformer2_config 24 | 25 | 26 | def setup_cfg(): 27 | # load config from file and command-line arguments 28 | cfg = get_cfg() 29 | add_deeplab_config(cfg) 30 | add_maskformer2_config(cfg) 31 | cfg.merge_from_file( 32 | "Mask2Former/configs/coco/instance-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_100ep.yaml") 33 | cfg.merge_from_list(["MODEL.WEIGHTS", "model_final_e5f453.pkl"]) 34 | cfg.freeze() 35 | 36 | return cfg 37 | 38 | 39 | class DogDetector(object): 40 | def __init__(self): 41 | cfg = setup_cfg() 42 | self.cpu_device = torch.device("cpu") 43 | self.vis_text = cfg.MODEL.ROI_HEADS.NAME == "TextHead" 44 | 45 | self.predictor = DefaultPredictor(cfg) 46 | 47 | def run_on_video(self, video: List[np.ndarray]) -> List[Any]: 48 | """ 49 | Visualizes predictions on frames of the input video. 50 | Args: 51 | video (np.array): 52 | Returns: 53 | ndarray: RGB 54 | """ 55 | 56 | def process_predictions(predictions: Any): 57 | predictions = predictions["instances"].to(self.cpu_device) 58 | return predictions 59 | 60 | detected_video = [] 61 | for frame in tqdm(video): 62 | detected_video.append(process_predictions(self.predictor(frame))) 63 | 64 | return detected_video 65 | -------------------------------------------------------------------------------- /data_preprocess/robot/preprocess.py: -------------------------------------------------------------------------------- 1 | """ 2 | SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | SPDX-License-Identifier: LicenseRef-NvidiaProprietary 4 | NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 5 | property and proprietary rights in and to this material, related 6 | documentation and any modifications thereto. Any use, reproduction, 7 | disclosure or distribution of this material and related documentation 8 | without an express license agreement from NVIDIA CORPORATION or 9 | its affiliates is strictly prohibited. 
10 | """ 11 | import argparse 12 | import json 13 | import pickle 14 | from typing import List 15 | 16 | import blosc 17 | import cv2 18 | import numpy as np 19 | from tqdm import tqdm 20 | 21 | 22 | def read_frames(chosen_camera_id: List[int], video_len: int, data_dir: str): 23 | """ 24 | 25 | Args: 26 | chosen_camera_id: 27 | video_len: 28 | data_dir: 29 | 30 | Returns: 31 | 32 | """ 33 | all_video = [] 34 | all_mask = [] 35 | all_camera_intrinsic = [] 36 | all_camera_rotation = [] 37 | all_camera_translation = [] 38 | for c_id in tqdm(chosen_camera_id): 39 | for f_id in range(video_len): 40 | img_path = f"{data_dir}/frame_{f_id:0>5}_cam_{c_id:0>3}.png" 41 | config_path = f"{data_dir}/cam_{c_id:0>3}.json" 42 | img = cv2.imread(img_path, cv2.IMREAD_UNCHANGED) 43 | img_scale = 1 44 | if img.shape[0] != 512: 45 | img_scale = 512 / img.shape[0] 46 | img = cv2.resize(img, (512, 512), interpolation=cv2.INTER_AREA) 47 | with open(config_path, "r") as f: 48 | config = json.load(f) 49 | frame = img[:, :, [2, 1, 0]] 50 | mask = img[:, :, 3] > 127.5 51 | all_video.append(frame) 52 | all_mask.append(mask) 53 | 54 | intrinsic = config["camera_data"]["intrinsics"] 55 | camera_intrinsic = np.zeros((3, 3), dtype="float32") 56 | camera_intrinsic[0, 0] = intrinsic['fx'] * img_scale 57 | camera_intrinsic[1, 1] = intrinsic['fy'] * img_scale 58 | camera_intrinsic[0, 2] = intrinsic['cx'] * img_scale 59 | camera_intrinsic[1, 2] = intrinsic['cy'] * img_scale 60 | camera_intrinsic[2, 2] = 1 61 | all_camera_intrinsic.append(camera_intrinsic) 62 | 63 | extrinsic = np.array(config["camera_data"]["camera_view_matrix"]) 64 | extrinsic[:, 1] = -extrinsic[:, 1] 65 | extrinsic[:, 2] = -extrinsic[:, 2] 66 | all_camera_rotation.append(extrinsic[:3, :3].transpose()) # (3, 3) 67 | all_camera_translation.append(extrinsic[3, :3, None]) # (3, 1) 68 | 69 | all_video = np.array(all_video) 70 | all_mask = np.array(all_mask) 71 | all_camera_intrinsic = np.array(all_camera_intrinsic) 72 | all_camera_rotation = np.array(all_camera_rotation) 73 | all_camera_translation = np.array(all_camera_translation) 74 | 75 | frame_id = [np.arange(video_len) for cam in chosen_camera_id] 76 | frame_id = np.concatenate(frame_id, axis=0) 77 | 78 | camera_id = [np.ones(video_len, dtype="int") * (cam - 1) for cam in chosen_camera_id] 79 | camera_id = np.concatenate(camera_id, axis=0) 80 | 81 | return all_video, all_mask, frame_id, camera_id, all_camera_intrinsic, all_camera_rotation, all_camera_translation 82 | 83 | 84 | def preprocess_robot(robot_name: str): 85 | if robot_name == "atlas": 86 | data_dir = f"{DATA_ROOT}/atlas" 87 | chosen_camera_id = [0, 2, 5, 13, 18] 88 | video_len = 300 89 | elif robot_name == "baxter": 90 | data_dir = f"{DATA_ROOT}/baxter" 91 | chosen_camera_id = [0, 2, 5, 13, 18] 92 | video_len = 300 93 | elif robot_name == "spot": 94 | data_dir = f"{DATA_ROOT}/spot" 95 | chosen_camera_id = [0, 2, 5, 10, 13] 96 | video_len = 300 97 | elif robot_name == "cassie": 98 | data_dir = f"{DATA_ROOT}/cassie" 99 | chosen_camera_id = [0, 3, 5, 10, 13] 100 | video_len = 300 101 | elif robot_name == "iiwa": 102 | data_dir = f"{DATA_ROOT}/iiwa" 103 | chosen_camera_id = [0, 2, 4, 5, 13] 104 | video_len = 300 105 | elif robot_name == "nao": 106 | data_dir = f"{DATA_ROOT}/nao" 107 | chosen_camera_id = [7, 10, 11, 14, 15] 108 | video_len = 300 109 | elif robot_name == "pandas": 110 | data_dir = f"{DATA_ROOT}/pandas" 111 | chosen_camera_id = [0, 2, 5, 10, 13] 112 | video_len = 300 113 | else: 114 | raise ValueError("invalid robot name") 115 | 116 | 
data_dict = {} 117 | 118 | # read frame 119 | all_video, all_mask, frame_id, camera_id, all_intrinsic, all_rot, all_trans = read_frames(chosen_camera_id, 120 | video_len, data_dir) 121 | 122 | data_dict["frame_id"] = frame_id 123 | data_dict["img"] = np.array([blosc.pack_array(frame.transpose(2, 0, 1)) for frame in tqdm(all_video)], 124 | dtype="object") 125 | data_dict["mask"] = np.array([blosc.pack_array(mask) for mask in tqdm(all_mask)], dtype="object") 126 | data_dict["camera_intrinsic"] = all_intrinsic 127 | data_dict["camera_rotation"] = all_rot 128 | data_dict["camera_translation"] = all_trans 129 | 130 | data_dict["camera_id"] = np.arange(len(all_video)) // (len(all_video) // len(chosen_camera_id)) 131 | 132 | with open(data_dir + '/cache.pickle', 'wb') as f: 133 | pickle.dump(data_dict, f) 134 | 135 | 136 | if __name__ == "__main__": 137 | parser = argparse.ArgumentParser(description='Robot data preprocessing') 138 | parser.add_argument('--data_root', type=str, required=True) 139 | parser.add_argument('--robot_name', action='append', 140 | required=True) 141 | args = parser.parse_args() 142 | 143 | DATA_ROOT = args.data_root 144 | robot_names = args.robot_name 145 | 146 | for robot_name in robot_names: 147 | preprocess_robot(robot_name) 148 | -------------------------------------------------------------------------------- /data_preprocess/zju/detect_person.py: -------------------------------------------------------------------------------- 1 | """ 2 | SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | SPDX-License-Identifier: LicenseRef-NvidiaProprietary 4 | NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 5 | property and proprietary rights in and to this material, related 6 | documentation and any modifications thereto. Any use, reproduction, 7 | disclosure or distribution of this material and related documentation 8 | without an express license agreement from NVIDIA CORPORATION or 9 | its affiliates is strictly prohibited. 
10 | """ 11 | from typing import Any, List 12 | 13 | import numpy as np 14 | import torch 15 | from adet.config import get_cfg 16 | from detectron2.data import MetadataCatalog 17 | from detectron2.engine.defaults import DefaultPredictor 18 | from tqdm import tqdm 19 | 20 | 21 | def setup_cfg(): 22 | # load config from file and command-line arguments 23 | confidence_threshold = 0.3 24 | cfg = get_cfg() 25 | cfg.merge_from_file("AdeliDet/configs/BlendMask/R_101_dcni3_5x.yaml") 26 | cfg.merge_from_list(["MODEL.WEIGHTS", "R_101_dcni3_5x.pth"]) 27 | # Set score_threshold for builtin models 28 | cfg.MODEL.RETINANET.SCORE_THRESH_TEST = confidence_threshold 29 | cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = confidence_threshold 30 | cfg.MODEL.FCOS.INFERENCE_TH_TEST = confidence_threshold 31 | cfg.MODEL.MEInst.INFERENCE_TH_TEST = confidence_threshold 32 | cfg.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH = confidence_threshold 33 | cfg.freeze() 34 | 35 | return cfg 36 | 37 | 38 | class PersonDetector(object): 39 | def __init__(self): 40 | cfg = setup_cfg() 41 | self.metadata = MetadataCatalog.get( 42 | cfg.DATASETS.TEST[0] if len(cfg.DATASETS.TEST) else "__unused" 43 | ) 44 | self.cpu_device = torch.device("cpu") 45 | self.vis_text = cfg.MODEL.ROI_HEADS.NAME == "TextHead" 46 | 47 | self.predictor = DefaultPredictor(cfg) 48 | 49 | def process_predictions(self, frame: np.ndarray, predictions: Any) -> np.ndarray: 50 | """ 51 | 52 | Args: 53 | frame: 54 | predictions: 55 | 56 | Returns: 57 | 58 | """ 59 | predictions = predictions["instances"].to(self.cpu_device) 60 | if predictions.pred_masks.shape[0] == 0: 61 | print("No mask detected") 62 | return np.zeros((frame.shape[0], frame.shape[1], 1)) 63 | 64 | mask = predictions.pred_masks[0, :, :, None].cpu().numpy() 65 | 66 | return mask 67 | 68 | def run_on_video(self, video: np.ndarray) -> List[np.ndarray]: 69 | """ 70 | Detect person from video 71 | Args: 72 | video: 73 | 74 | Returns: 75 | 76 | """ 77 | detected_video = [] 78 | for frame in tqdm(video): 79 | detected_video.append(self.process_predictions(frame, self.predictor(frame))) 80 | 81 | return detected_video 82 | -------------------------------------------------------------------------------- /data_preprocess/zju/diff.patch: -------------------------------------------------------------------------------- 1 | diff --git a/easymocap/smplmodel/lbs.py b/easymocap/smplmodel/lbs.py 2 | index 4c82dd2..6cde76a 100644 3 | --- a/easymocap/smplmodel/lbs.py 4 | +++ b/easymocap/smplmodel/lbs.py 5 | @@ -215,0 +215,5 @@ def 6 | + # Calculate for only the parts required for WatchItMove 7 | + num_rots = rot_mats.shape[1] # Modification for WatchItMove 8 | + rot_mats = rot_mats[:, :num_rots] # Modification for WatchItMove 9 | + J = J[:, :num_rots] # Modification for WatchItMove 10 | + parents = parents[:num_rots] # Modification for WatchItMove 11 | @@ -216,0 +221,3 @@ def 12 | + # return joint locations and transformation matrices for WatchItMove 13 | + if lbs_weights is None: # Modification for WatchItMove 14 | + return J_transformed, A # Modification for WatchItMove 15 | -------------------------------------------------------------------------------- /data_preprocess/zju/preprocess.py: -------------------------------------------------------------------------------- 1 | """ 2 | SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
3 | SPDX-License-Identifier: LicenseRef-NvidiaProprietary 4 | NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 5 | property and proprietary rights in and to this material, related 6 | documentation and any modifications thereto. Any use, reproduction, 7 | disclosure or distribution of this material and related documentation 8 | without an express license agreement from NVIDIA CORPORATION or 9 | its affiliates is strictly prohibited. 10 | """ 11 | import argparse 12 | import glob 13 | import json 14 | import os 15 | import pickle 16 | from typing import Tuple, List, Dict 17 | 18 | import blosc 19 | import cv2 20 | import numpy as np 21 | import torch 22 | from easymocap.smplmodel import SMPLlayer 23 | from tqdm import tqdm 24 | 25 | from detect_person import PersonDetector 26 | from read_smpl import PoseLoader 27 | 28 | 29 | def read_frames(person_id: int, save_size: int, crop_size: int, chosen_camera_id: np.ndarray 30 | ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, int]: 31 | """ 32 | 33 | Args: 34 | person_id: 35 | save_size: 36 | crop_size: 37 | chosen_camera_id: 38 | 39 | Returns: 40 | 41 | """ 42 | all_video = [] 43 | for cam in tqdm(chosen_camera_id): 44 | video_path = f"{ZJU_PATH}/{person_id}/videos/{cam:0>2}.mp4" 45 | video = cv2.VideoCapture(video_path) 46 | frames = [] 47 | while True: 48 | ret, frame = video.read() 49 | if not ret: 50 | break 51 | frame = frame[:crop_size, :crop_size] 52 | frame = cv2.resize(frame, (save_size, save_size), interpolation=cv2.INTER_CUBIC) 53 | frames.append(frame[:, :, ::-1]) 54 | frames = np.array(frames) 55 | all_video.append(frames) 56 | video_len = np.array([video.shape[0] for video in all_video]) 57 | assert (video_len == video_len[0]).all() 58 | frame_id = [np.arange(video_len[0]) for _ in range(NUM_CAMERA)] 59 | frame_id = np.stack(frame_id, axis=0) 60 | 61 | all_video = np.stack(all_video, axis=0) 62 | 63 | camera_id = [np.ones(video_len[0], dtype="int") * (cam - 1) for cam in chosen_camera_id] 64 | camera_id = np.stack(camera_id, axis=0) 65 | 66 | return all_video, frame_id, camera_id, video_len[0] 67 | 68 | 69 | class DetectPerson: 70 | def __init__(self): 71 | self.detector = PersonDetector() 72 | 73 | def __call__(self, all_video: np.ndarray): 74 | """ 75 | 76 | Args: 77 | all_video: 78 | 79 | Returns: 80 | 81 | """ 82 | detected = self.detector.run_on_video(all_video) 83 | 84 | return detected 85 | 86 | 87 | def read_intrinsic(person_id: int, save_scale: float) -> np.ndarray: 88 | """ 89 | 90 | Args: 91 | person_id: 92 | save_scale: 93 | 94 | Returns: 95 | 96 | """ 97 | fs = cv2.FileStorage(f"{ZJU_PATH}/{person_id}/intri.yml", cv2.FILE_STORAGE_READ) 98 | all_intrinsic = [] 99 | for cam in range(1, NUM_CAMERA + 1): 100 | matrix = fs.getNode(f"K_{cam:0>2}").mat() 101 | matrix = np.array(matrix).reshape(3, 3) 102 | all_intrinsic.append(matrix) 103 | all_intrinsic = np.array(all_intrinsic) 104 | all_intrinsic[:, :2] /= save_scale 105 | 106 | return all_intrinsic 107 | 108 | 109 | def read_extrinsic(person_id: int) -> Tuple[np.ndarray, np.ndarray]: 110 | """ 111 | 112 | Args: 113 | person_id: 114 | 115 | Returns: 116 | 117 | """ 118 | fs = cv2.FileStorage(f"{ZJU_PATH}/{person_id}/extri.yml", cv2.FILE_STORAGE_READ) 119 | all_rot = [] 120 | all_trans = [] 121 | for cam in range(1, NUM_CAMERA + 1): 122 | rot = fs.getNode(f"Rot_{cam:0>2}").mat() 123 | rot = np.array(rot).reshape(3, 3) 124 | trans = fs.getNode(f"T_{cam:0>2}").mat() 125 | trans = np.array(trans).reshape(3, 1) 126 | all_rot.append(rot) 127 | 
all_trans.append(trans) 128 | all_rot = np.array(all_rot) 129 | all_trans = np.array(all_trans) 130 | 131 | return all_rot, all_trans 132 | 133 | 134 | def read_smpl_parameters(person_id: int, video_len: int) -> np.ndarray: 135 | """ 136 | 137 | Args: 138 | person_id: 139 | video_len: 140 | 141 | Returns: 142 | 143 | """ 144 | all_smpl_param = [] 145 | for frame_id in tqdm(range(video_len)): 146 | smpl_path = f"{ZJU_PATH}/{person_id}/smplx/{frame_id:0>6}.json" 147 | 148 | with open(smpl_path, "r") as f: 149 | smpl_param = json.load(f)[0] 150 | 151 | smpl_param = pose_loader(smpl_param) 152 | all_smpl_param.append(smpl_param) 153 | all_smpl_param = np.array(all_smpl_param) 154 | 155 | return all_smpl_param 156 | 157 | 158 | def read_smpl_verts(person_id: int, smpllayer: SMPLlayer) -> np.ndarray: 159 | """ 160 | 161 | Args: 162 | person_id: 163 | smpllayer: 164 | 165 | Returns: 166 | 167 | """ 168 | all_smpl_verts = [] 169 | smpl_paths = sorted(glob.glob(f"{ZJU_PATH}/{person_id}/smplx/*.json")) 170 | for frame_id in tqdm(range(len(smpl_paths))): 171 | smpl_path = smpl_paths[frame_id] 172 | 173 | with open(smpl_path, "r") as f: 174 | smpl_param = json.load(f)[0] 175 | with torch.no_grad(): 176 | Rh = torch.tensor(np.array(smpl_param["Rh"])).float() # 1 x 3 177 | Th = torch.tensor(np.array(smpl_param["Th"])).float() # 1 x 3 178 | poses = torch.tensor(np.array(smpl_param["poses"])).float() # 1 x 72 179 | shapes = torch.tensor(smpl_param["shapes"]).float() # 1 x 10 180 | expression = torch.tensor(smpl_param["expression"]).float() # 1 x 10 181 | verts = smpllayer(poses, shapes, Rh, Th, expression) 182 | 183 | all_smpl_verts.append(verts[0].cpu().numpy()) 184 | all_smpl_verts = np.array(all_smpl_verts) 185 | 186 | return all_smpl_verts # (video_len, n_verts, 3) 187 | 188 | 189 | def create_dict(video: np.ndarray, mask: List[np.ndarray], frame: np.ndarray, camera: np.ndarray, 190 | all_intrinsic: np.ndarray, all_rot: np.ndarray, all_trans: np.ndarray, smpl: np.ndarray, set_size: int 191 | ) -> Dict[str, np.ndarray]: 192 | """ 193 | 194 | Args: 195 | video: 196 | mask: 197 | frame: 198 | camera: 199 | all_intrinsic: 200 | all_rot: 201 | all_trans: 202 | smpl: 203 | set_size: 204 | 205 | Returns: 206 | 207 | """ 208 | data_dict = {} 209 | data_dict["frame_id"] = frame.reshape(-1) 210 | data_dict["img"] = np.array([blosc.pack_array(frame.transpose(2, 0, 1)) for frame in tqdm(video)], 211 | dtype="object") 212 | 213 | data_dict["mask"] = np.array([blosc.pack_array(det[:, :, 0]) for det in tqdm(mask)], 214 | dtype="object") 215 | data_dict["camera_intrinsic"] = all_intrinsic[camera] 216 | data_dict["camera_rotation"] = all_rot[camera] 217 | data_dict["camera_translation"] = all_trans[camera] 218 | 219 | data_dict["camera_id"] = np.arange(len(frame)) // set_size 220 | data_dict["smpl_pose"] = smpl 221 | 222 | return data_dict 223 | 224 | 225 | def process_train_set(person_id: int, all_video, all_intrinsic: np.ndarray, all_rot: np.ndarray, all_trans: np.ndarray, 226 | all_smpl_param: np.ndarray, frame_id, camera_id, video_len: int, train_set_rate: float) -> int: 227 | """ 228 | 229 | Args: 230 | person_id: 231 | all_video: 232 | all_intrinsic: 233 | all_rot: 234 | all_trans: 235 | all_smpl_param: 236 | frame_id: 237 | camera_id: 238 | video_len: 239 | train_set_rate: 240 | 241 | Returns: 242 | 243 | """ 244 | train_set_size = int(video_len * train_set_rate) 245 | train_video = all_video[TRAIN_CAMERA_ID - 1, :train_set_size].reshape(-1, *all_video.shape[2:]) 246 | train_frame = frame_id[TRAIN_CAMERA_ID - 
1, :train_set_size].reshape(-1, *frame_id.shape[2:]) 247 | train_camera = camera_id[TRAIN_CAMERA_ID - 1, :train_set_size].reshape(-1, *camera_id.shape[2:]) 248 | train_mask = person_detector(train_video) 249 | 250 | train_dict = create_dict(train_video, train_mask, train_frame, train_camera, all_intrinsic, 251 | all_rot, all_trans, all_smpl_param, train_set_size) 252 | 253 | with open(f'{ZJU_PATH}/cache{SAVE_SIZE}/{person_id}/cache_train.pickle', 'wb') as f: 254 | pickle.dump(train_dict, f) 255 | 256 | print("person id:", person_id, "train set size", train_set_size) 257 | 258 | return train_set_size 259 | 260 | 261 | def process_test_set(person_id: int, train_set_size: int, test_set_size: int, video_len: int, all_video: np.ndarray, 262 | all_intrinsic: np.ndarray, all_rot: np.ndarray, all_trans: np.ndarray, 263 | all_smpl_param: np.ndarray, frame_id: np.ndarray, camera_id: np.ndarray, mode: str) -> None: 264 | """ 265 | 266 | Args: 267 | person_id: 268 | train_set_size: 269 | test_set_size: 270 | video_len: 271 | all_video: 272 | all_intrinsic: 273 | all_rot: 274 | all_trans: 275 | all_smpl_param: 276 | frame_id: 277 | camera_id: 278 | mode: 279 | 280 | Returns: 281 | 282 | """ 283 | if mode == "novel_view": 284 | test_frame_id = np.linspace(0, train_set_size - 1, test_set_size).astype("int") 285 | camera_idx = TEST_CAMERA_ID 286 | cache_name = "cache_test" 287 | elif mode == "novel_pose": 288 | test_frame_id = np.linspace(train_set_size, video_len - 1, test_set_size).astype("int") 289 | camera_idx = ALL_CAMERA_ID 290 | cache_name = "cache_novel_pose" 291 | else: 292 | raise ValueError() 293 | 294 | test_video = all_video[camera_idx - 1][:, test_frame_id].reshape(-1, *all_video.shape[2:]) 295 | test_frame = frame_id[camera_idx - 1][:, test_frame_id].reshape(-1, *frame_id.shape[2:]) 296 | test_camera = camera_id[camera_idx - 1][:, test_frame_id].reshape(-1, *camera_id.shape[2:]) 297 | test_mask = person_detector(test_video) 298 | 299 | test_dict = create_dict(test_video, test_mask, test_frame, test_camera, all_intrinsic, 300 | all_rot, all_trans, all_smpl_param, test_set_size) 301 | 302 | with open(f'{ZJU_PATH}/cache{SAVE_SIZE}/{person_id}/{cache_name}.pickle', 'wb') as f: 303 | pickle.dump(test_dict, f) 304 | 305 | 306 | def main(): 307 | person_ids = args.person_id 308 | train_set_rate = 0.8 309 | test_set_size = 20 310 | 311 | for person_id in person_ids: 312 | # smpl verts 313 | all_verts = read_smpl_verts(person_id, smpllayer) 314 | data_dict = {"smpl_verts": all_verts} 315 | os.makedirs(f'{ZJU_PATH}/cache{SAVE_SIZE}/{person_id}', exist_ok=True) 316 | with open(f'{ZJU_PATH}/cache{SAVE_SIZE}/{person_id}/smpl_verts.pickle', 'wb') as f: 317 | pickle.dump(data_dict, f) 318 | 319 | # read frame 320 | all_video, frame_id, camera_id, video_len = read_frames(person_id, SAVE_SIZE, CROP_SIZE, ALL_CAMERA_ID) 321 | all_smpl_param = read_smpl_parameters(person_id, video_len) 322 | all_smpl_param = all_smpl_param[:, :23] 323 | all_intrinsic = read_intrinsic(person_id, SAVE_SCALE) 324 | all_rot, all_trans = read_extrinsic(person_id) 325 | 326 | # train set 327 | train_set_size = process_train_set(person_id, all_video, all_intrinsic, all_rot, all_trans, all_smpl_param, 328 | frame_id, camera_id, video_len, train_set_rate) 329 | 330 | # novel view 331 | process_test_set(person_id, train_set_size, test_set_size, video_len, all_video, all_intrinsic, all_rot, 332 | all_trans, all_smpl_param, frame_id, camera_id, "novel_view") 333 | 334 | # novel pose 335 | process_test_set(person_id, train_set_size, 
test_set_size, video_len, all_video, all_intrinsic, all_rot, 336 | all_trans, all_smpl_param, frame_id, camera_id, "novel_pose") 337 | 338 | 339 | if __name__ == "__main__": 340 | parser = argparse.ArgumentParser(description='ZJU data preprocessing') 341 | parser.add_argument('--smpl_model_path', type=str, required=True) 342 | parser.add_argument('--zju_path', type=str, required=True) 343 | parser.add_argument('--person_id', action='append', 344 | required=True) 345 | args = parser.parse_args() 346 | 347 | SMPL_MODEL_PATH = args.smpl_model_path 348 | 349 | ZJU_PATH = args.zju_path 350 | SAVE_SCALE = 2 351 | CROP_SIZE = 1024 352 | NUM_CAMERA = 23 353 | SAVE_SIZE = CROP_SIZE // SAVE_SCALE 354 | 355 | TRAIN_CAMERA_ID = np.array([1, 5, 9, 13, 17, 21]) 356 | TEST_CAMERA_ID = np.array([2, 3, 4, 6, 7, 8, 10, 11, 12, 14, 15, 16, 18, 19, 20, 22, 23]) 357 | ALL_CAMERA_ID = np.arange(1, NUM_CAMERA + 1) 358 | 359 | pose_loader = PoseLoader(SMPL_MODEL_PATH) 360 | smpllayer = SMPLlayer(SMPL_MODEL_PATH + "/smplx", model_type='smplx') 361 | person_detector = DetectPerson() 362 | 363 | main() 364 | -------------------------------------------------------------------------------- /data_preprocess/zju/read_smpl.py: -------------------------------------------------------------------------------- 1 | """ 2 | SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | SPDX-License-Identifier: LicenseRef-NvidiaProprietary 4 | NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 5 | property and proprietary rights in and to this material, related 6 | documentation and any modifications thereto. Any use, reproduction, 7 | disclosure or distribution of this material and related documentation 8 | without an express license agreement from NVIDIA CORPORATION or 9 | its affiliates is strictly prohibited. 
10 | """ 11 | from typing import Any 12 | 13 | import cv2 14 | import numpy as np 15 | import torch 16 | from easymocap.smplmodel import load_model 17 | 18 | from EasyMocap.easymocap.smplmodel.lbs import lbs as extract_bone 19 | 20 | 21 | class PoseLoader: 22 | def __init__(self, smpl_model_path: str): 23 | """ 24 | 25 | Args: 26 | smpl_model_path: 27 | """ 28 | self.body_model = load_model( 29 | gender="neutral", 30 | model_type="smplx", 31 | model_path=smpl_model_path, 32 | device="cpu") 33 | 34 | def __call__(self, smpl_param: Any) -> np.ndarray: 35 | """ 36 | 37 | Args: 38 | smpl_param: 39 | 40 | Returns: 41 | 42 | """ 43 | Rh = np.array(smpl_param["Rh"]) # 1 x 3 44 | Th = np.array(smpl_param["Th"]) # 1 x 3 45 | poses = np.array(smpl_param["poses"])[:, :72] # 1 x 72 46 | shapes = smpl_param["shapes"] # 1 x 10 47 | expression = smpl_param["expression"] # 1 x 10 48 | 49 | shapes = torch.tensor(shapes).float() 50 | expression = torch.tensor(expression).float() 51 | shapes = torch.cat([shapes, expression], dim=1) 52 | poses = torch.tensor(poses).float() 53 | v_template = self.body_model.j_v_template 54 | joints, transformation = extract_bone(shapes, poses, v_template, 55 | self.body_model.j_shapedirs, None, 56 | self.body_model.j_J_regressor, self.body_model.parents, 57 | None, dtype=self.body_model.dtype, 58 | use_pose_blending=False) 59 | bone_pose = transformation.clone() 60 | bone_pose[:, :, :3, 3] = joints 61 | 62 | trans = np.eye(4) 63 | trans[:3, :3] = cv2.Rodrigues(Rh[0])[0] 64 | trans[:3, 3] = Th 65 | 66 | bone_pose_world = np.matmul(trans, bone_pose.numpy()[0]) 67 | 68 | return bone_pose_world 69 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: watch_it_move 2 | channels: 3 | - pytorch 4 | - defaults 5 | - conda-forge 6 | dependencies: 7 | - python=3.9.12 8 | - pip==22.0.4 9 | - cudatoolkit=11.1.1 10 | - easydict=1.9 11 | - ninja=1.10.2 12 | - numpy=1.21.5 13 | - pytorch=1.10.1 14 | - torchvision=0.11.2 15 | - pip: 16 | - blosc==1.10.6 17 | - lpips==0.1.4 18 | - matplotlib==3.5.1 19 | - opencv-contrib-python==4.5.5.64 20 | - opencv-python==4.5.5.64 21 | - scikit-image==0.19.2 22 | - smplx==0.1.28 23 | - tensorboardx==2.5 24 | - cython==0.29.28 25 | - timm==0.5.4 26 | - h5py==3.6.0 27 | - submitit==1.4.2 28 | - gdown==4.4.0 29 | -------------------------------------------------------------------------------- /figures/reconstruction_rotate_motion_0_0.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/watch-it-move/5fdec2b71b07f9d4a5492fb3dad6bfcc7d9a9f8b/figures/reconstruction_rotate_motion_0_0.gif -------------------------------------------------------------------------------- /figures/repose_rotate_0_0.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/watch-it-move/5fdec2b71b07f9d4a5492fb3dad6bfcc7d9a9f8b/figures/repose_rotate_0_0.gif -------------------------------------------------------------------------------- /figures/robot_example.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/watch-it-move/5fdec2b71b07f9d4a5492fb3dad6bfcc7d9a9f8b/figures/robot_example.jpg -------------------------------------------------------------------------------- /install.sh: 
-------------------------------------------------------------------------------- 1 | # Create the conda environment 2 | 3 | conda env create --file environment.yml 4 | eval "$(conda shell.bash hook)" 5 | conda activate watch_it_move 6 | pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu111/torch1.10/index.html 7 | pip install "git+https://github.com/facebookresearch/pytorch3d.git@v0.6.2" 8 | 9 | # Initialize the submodules 10 | 11 | cd data_preprocess/zju/EasyMocap 12 | python setup.py develop 13 | # Patch lbs.py to expose a local variable we need use 14 | patch -p1 < ../diff.patch 15 | cd ../../.. 16 | 17 | cd AdelaiDet 18 | python setup.py build develop 19 | cd .. 20 | 21 | cd Mask2Former/mask2former/modeling/pixel_decoder/ops 22 | sh make.sh 23 | -------------------------------------------------------------------------------- /src/confs/atlas.yml: -------------------------------------------------------------------------------- 1 | #SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | #SPDX-License-Identifier: LicenseRef-NvidiaProprietary 3 | #NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 4 | #property and proprietary rights in and to this material, related 5 | #documentation and any modifications thereto. Any use, reproduction, 6 | #disclosure or distribution of this material and related documentation 7 | #without an express license agreement from NVIDIA CORPORATION or 8 | #its affiliates is strictly prohibited. 9 | output_dir: ../data/output 10 | exp_name: atlas 11 | 12 | 13 | dataset: 14 | data_root: "../data/robots/atlas" 15 | batchsize: 16 16 | size: 512 17 | num_parts: 20 18 | num_workers: 0 # dataloader 19 | num_frames: 300 20 | num_view: 5 21 | coordinate_scale: 1 22 | prob_sample_latest: 0.2 23 | thin_out_interval: 1 24 | background_color: 1 25 | 26 | network_params: 27 | pixel_sampler: foreground 28 | decoder_params: 29 | sdf_residual_range: 0.02 30 | child_root: [[-0.75, 0, 0], [0, 0.75, 0], [0, -0.75, 0], 31 | [0, 0, 0.75], [0, 0, -0.75], [0.75, 0, 0]] 32 | sdf_scale: 600 33 | initial_sdf_weight: 30. 34 | trajectory_params: 35 | dct: 36 | hidden_dim: 256 37 | n_mlp: 4 38 | k: 30 39 | n_split: 1 40 | surface_loss: true 41 | structure_loss: true 42 | center_coef_for_structure_loss: 0.02 43 | 44 | loss_params: 45 | mask_loss_multiplier: 1 46 | surface_loss_coef: 400 47 | structure_loss_coef: 1 48 | initial_structure_loss_coef: 1 49 | max_structure_loss_coef: 50 50 | joint_2d_loss_coef: 1000 51 | joint_3d_separation_loss_coef: 1 52 | sdf_loss_coef: 0.02 53 | 54 | train_setting: 55 | num_iter: 200000 56 | optimizer: AdamW 57 | lr: 0.0003 58 | decay: 0.005 59 | clip_grad: true 60 | scheduler_gamma: 0.99995 61 | resume: False 62 | 63 | val_interval: 5000 64 | save_interval: 500 65 | log_interval: 100 66 | 67 | frame_schedule: 68 | incremental: 69 | initial_frame: 10 70 | start: 10000 71 | incremental_period: 70000 72 | 73 | render_setting: 74 | num_ray: 384 75 | -------------------------------------------------------------------------------- /src/confs/atlas_merge.yml: -------------------------------------------------------------------------------- 1 | #SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
2 | #SPDX-License-Identifier: LicenseRef-NvidiaProprietary 3 | #NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 4 | #property and proprietary rights in and to this material, related 5 | #documentation and any modifications thereto. Any use, reproduction, 6 | #disclosure or distribution of this material and related documentation 7 | #without an express license agreement from NVIDIA CORPORATION or 8 | #its affiliates is strictly prohibited. 9 | output_dir: ../data/output 10 | exp_name: atlas_merge 11 | 12 | resume_model_path: ../data/output/result/atlas/snapshot_150000.pth 13 | load_optimizer: false 14 | 15 | dataset: 16 | data_root: "../data/robots/atlas" 17 | batchsize: 16 18 | size: 512 19 | num_parts: 20 20 | num_workers: 0 # dataloader 21 | num_frames: 300 22 | num_view: 5 23 | coordinate_scale: 1 24 | prob_sample_latest: 0.2 25 | thin_out_interval: 1 26 | background_color: 1 27 | 28 | network_params: 29 | pixel_sampler: foreground 30 | decoder_params: 31 | sdf_residual_range: 0.02 32 | child_root: [[-0.75, 0, 0], [0, 0.75, 0], [0, -0.75, 0], 33 | [0, 0, 0.75], [0, 0, -0.75], [0.75, 0, 0]] 34 | sdf_scale: 600 35 | initial_sdf_weight: 30. 36 | trajectory_params: 37 | dct: 38 | hidden_dim: 256 39 | n_mlp: 4 40 | k: 30 41 | n_split: 1 42 | surface_loss: true 43 | structure_loss: true 44 | center_coef_for_structure_loss: 0.02 45 | 46 | loss_params: 47 | mask_loss_multiplier: 1 48 | surface_loss_coef: 400 49 | structure_loss_coef: 1 50 | initial_structure_loss_coef: 1 51 | max_structure_loss_coef: 50 52 | joint_2d_loss_coef: 1000 53 | joint_3d_separation_loss_coef: 1 54 | sdf_loss_coef: 0.02 55 | pull_rigid_parts_loss_coef: 5 56 | 57 | train_setting: 58 | num_iter: 200000 59 | optimizer: AdamW 60 | lr: 0.0003 61 | decay: 0.005 62 | clip_grad: true 63 | scheduler_gamma: 0.99995 64 | resume: False 65 | 66 | val_interval: 5000 67 | save_interval: 500 68 | log_interval: 100 69 | 70 | frame_schedule: 71 | incremental: 72 | initial_frame: 10 73 | start: 10000 74 | incremental_period: 70000 75 | 76 | render_setting: 77 | num_ray: 384 78 | -------------------------------------------------------------------------------- /src/confs/baxter.yml: -------------------------------------------------------------------------------- 1 | #SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | #SPDX-License-Identifier: LicenseRef-NvidiaProprietary 3 | #NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 4 | #property and proprietary rights in and to this material, related 5 | #documentation and any modifications thereto. Any use, reproduction, 6 | #disclosure or distribution of this material and related documentation 7 | #without an express license agreement from NVIDIA CORPORATION or 8 | #its affiliates is strictly prohibited. 9 | output_dir: ../data/output 10 | exp_name: baxter 11 | 12 | 13 | dataset: 14 | data_root: "../data/robots/baxter" 15 | batchsize: 16 16 | size: 512 17 | num_parts: 20 18 | num_workers: 0 # dataloader 19 | num_frames: 300 20 | num_view: 5 21 | coordinate_scale: 1 22 | prob_sample_latest: 0.2 23 | thin_out_interval: 1 24 | background_color: 1 25 | 26 | network_params: 27 | pixel_sampler: foreground 28 | decoder_params: 29 | sdf_residual_range: 0.02 30 | child_root: [[-0.75, 0, 0], [0, 0.75, 0], [0, -0.75, 0], 31 | [0, 0, 0.75], [0, 0, -0.75], [0.75, 0, 0]] 32 | sdf_scale: 600 33 | initial_sdf_weight: 30. 
34 | trajectory_params: 35 | dct: 36 | hidden_dim: 256 37 | n_mlp: 4 38 | k: 30 39 | n_split: 1 40 | surface_loss: true 41 | structure_loss: true 42 | center_coef_for_structure_loss: 0.02 43 | 44 | loss_params: 45 | mask_loss_multiplier: 1 46 | surface_loss_coef: 400 47 | structure_loss_coef: 1 48 | initial_structure_loss_coef: 1 49 | max_structure_loss_coef: 50 50 | joint_2d_loss_coef: 1000 51 | joint_3d_separation_loss_coef: 1 52 | sdf_loss_coef: 0.02 53 | 54 | train_setting: 55 | num_iter: 200000 56 | optimizer: AdamW 57 | lr: 0.0003 58 | decay: 0.005 59 | clip_grad: true 60 | scheduler_gamma: 0.99995 61 | resume: False 62 | 63 | val_interval: 5000 64 | save_interval: 500 65 | log_interval: 100 66 | 67 | frame_schedule: 68 | incremental: 69 | initial_frame: 10 70 | start: 10000 71 | incremental_period: 70000 72 | 73 | render_setting: 74 | num_ray: 384 75 | -------------------------------------------------------------------------------- /src/confs/baxter_merge.yml: -------------------------------------------------------------------------------- 1 | #SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | #SPDX-License-Identifier: LicenseRef-NvidiaProprietary 3 | #NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 4 | #property and proprietary rights in and to this material, related 5 | #documentation and any modifications thereto. Any use, reproduction, 6 | #disclosure or distribution of this material and related documentation 7 | #without an express license agreement from NVIDIA CORPORATION or 8 | #its affiliates is strictly prohibited. 9 | output_dir: ../data/output 10 | exp_name: baxter_merge 11 | 12 | resume_model_path: ../data/output/result/baxter/snapshot_150000.pth 13 | load_optimizer: false 14 | 15 | dataset: 16 | data_root: "../data/robots/baxter" 17 | batchsize: 16 18 | size: 512 19 | num_parts: 20 20 | num_workers: 0 # dataloader 21 | num_frames: 300 22 | num_view: 5 23 | coordinate_scale: 1 24 | prob_sample_latest: 0.2 25 | thin_out_interval: 1 26 | background_color: 1 27 | 28 | network_params: 29 | pixel_sampler: foreground 30 | decoder_params: 31 | sdf_residual_range: 0.02 32 | child_root: [[-0.75, 0, 0], [0, 0.75, 0], [0, -0.75, 0], 33 | [0, 0, 0.75], [0, 0, -0.75], [0.75, 0, 0]] 34 | sdf_scale: 600 35 | initial_sdf_weight: 30. 36 | trajectory_params: 37 | dct: 38 | hidden_dim: 256 39 | n_mlp: 4 40 | k: 30 41 | n_split: 1 42 | surface_loss: true 43 | structure_loss: true 44 | center_coef_for_structure_loss: 0.02 45 | 46 | loss_params: 47 | mask_loss_multiplier: 1 48 | surface_loss_coef: 400 49 | structure_loss_coef: 1 50 | initial_structure_loss_coef: 1 51 | max_structure_loss_coef: 50 52 | joint_2d_loss_coef: 1000 53 | joint_3d_separation_loss_coef: 1 54 | sdf_loss_coef: 0.02 55 | pull_rigid_parts_loss_coef: 5 56 | 57 | train_setting: 58 | num_iter: 200000 59 | optimizer: AdamW 60 | lr: 0.0003 61 | decay: 0.005 62 | clip_grad: true 63 | scheduler_gamma: 0.99995 64 | resume: False 65 | 66 | val_interval: 5000 67 | save_interval: 500 68 | log_interval: 100 69 | 70 | frame_schedule: 71 | incremental: 72 | initial_frame: 10 73 | start: 10000 74 | incremental_period: 70000 75 | 76 | render_setting: 77 | num_ray: 384 78 | -------------------------------------------------------------------------------- /src/confs/cassie.yml: -------------------------------------------------------------------------------- 1 | #SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
2 | #SPDX-License-Identifier: LicenseRef-NvidiaProprietary 3 | #NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 4 | #property and proprietary rights in and to this material, related 5 | #documentation and any modifications thereto. Any use, reproduction, 6 | #disclosure or distribution of this material and related documentation 7 | #without an express license agreement from NVIDIA CORPORATION or 8 | #its affiliates is strictly prohibited. 9 | output_dir: ../data/output 10 | exp_name: cassie 11 | 12 | 13 | dataset: 14 | data_root: "../data/robots/cassie" 15 | batchsize: 16 16 | size: 512 17 | num_parts: 15 18 | num_workers: 0 # dataloader 19 | num_frames: 300 20 | num_view: 5 21 | coordinate_scale: 1 22 | prob_sample_latest: 0.2 23 | thin_out_interval: 1 24 | background_color: 1 25 | 26 | network_params: 27 | pixel_sampler: foreground 28 | decoder_params: 29 | sdf_residual_range: 0.02 30 | child_root: [[-0.75, 0, 0], [0, 0.75, 0], [0, -0.75, 0], 31 | [0, 0, 0.75], [0, 0, -0.75], [0.75, 0, 0]] 32 | sdf_scale: 600 33 | initial_sdf_weight: 30. 34 | trajectory_params: 35 | dct: 36 | hidden_dim: 256 37 | n_mlp: 4 38 | k: 30 39 | n_split: 1 40 | surface_loss: true 41 | structure_loss: true 42 | center_coef_for_structure_loss: 0.02 43 | 44 | loss_params: 45 | mask_loss_multiplier: 1 46 | surface_loss_coef: 400 47 | structure_loss_coef: 1 48 | initial_structure_loss_coef: 1 49 | max_structure_loss_coef: 50 50 | joint_2d_loss_coef: 1000 51 | joint_3d_separation_loss_coef: 1 52 | sdf_loss_coef: 0.02 53 | 54 | train_setting: 55 | num_iter: 200000 56 | optimizer: AdamW 57 | lr: 0.0003 58 | decay: 0.005 59 | clip_grad: true 60 | scheduler_gamma: 0.99995 61 | resume: False 62 | 63 | val_interval: 5000 64 | save_interval: 500 65 | log_interval: 100 66 | 67 | frame_schedule: 68 | incremental: 69 | initial_frame: 10 70 | start: 10000 71 | incremental_period: 70000 72 | 73 | render_setting: 74 | num_ray: 512 75 | -------------------------------------------------------------------------------- /src/confs/cassie_merge.yml: -------------------------------------------------------------------------------- 1 | #SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | #SPDX-License-Identifier: LicenseRef-NvidiaProprietary 3 | #NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 4 | #property and proprietary rights in and to this material, related 5 | #documentation and any modifications thereto. Any use, reproduction, 6 | #disclosure or distribution of this material and related documentation 7 | #without an express license agreement from NVIDIA CORPORATION or 8 | #its affiliates is strictly prohibited. 9 | output_dir: ../data/output 10 | exp_name: cassie_merge 11 | 12 | resume_model_path: ../data/output/result/cassie/snapshot_150000.pth 13 | load_optimizer: false 14 | 15 | dataset: 16 | data_root: "../data/robots/cassie" 17 | batchsize: 16 18 | size: 512 19 | num_parts: 15 20 | num_workers: 0 # dataloader 21 | num_frames: 300 22 | num_view: 5 23 | coordinate_scale: 1 24 | prob_sample_latest: 0.2 25 | thin_out_interval: 1 26 | background_color: 1 27 | 28 | network_params: 29 | pixel_sampler: foreground 30 | decoder_params: 31 | sdf_residual_range: 0.02 32 | child_root: [[-0.75, 0, 0], [0, 0.75, 0], [0, -0.75, 0], 33 | [0, 0, 0.75], [0, 0, -0.75], [0.75, 0, 0]] 34 | sdf_scale: 600 35 | initial_sdf_weight: 30. 
36 | trajectory_params: 37 | dct: 38 | hidden_dim: 256 39 | n_mlp: 4 40 | k: 30 41 | n_split: 1 42 | surface_loss: true 43 | structure_loss: true 44 | center_coef_for_structure_loss: 0.02 45 | 46 | loss_params: 47 | mask_loss_multiplier: 1 48 | surface_loss_coef: 400 49 | structure_loss_coef: 1 50 | initial_structure_loss_coef: 1 51 | max_structure_loss_coef: 50 52 | joint_2d_loss_coef: 1000 53 | joint_3d_separation_loss_coef: 1 54 | sdf_loss_coef: 0.02 55 | pull_rigid_parts_loss_coef: 5 56 | 57 | train_setting: 58 | num_iter: 200000 59 | optimizer: AdamW 60 | lr: 0.0003 61 | decay: 0.005 62 | clip_grad: true 63 | scheduler_gamma: 0.99995 64 | resume: False 65 | 66 | val_interval: 5000 67 | save_interval: 500 68 | log_interval: 100 69 | 70 | frame_schedule: 71 | incremental: 72 | initial_frame: 10 73 | start: 10000 74 | incremental_period: 70000 75 | 76 | render_setting: 77 | num_ray: 512 78 | -------------------------------------------------------------------------------- /src/confs/default.yml: -------------------------------------------------------------------------------- 1 | #SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | #SPDX-License-Identifier: LicenseRef-NvidiaProprietary 3 | #NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 4 | #property and proprietary rights in and to this material, related 5 | #documentation and any modifications thereto. Any use, reproduction, 6 | #disclosure or distribution of this material and related documentation 7 | #without an express license agreement from NVIDIA CORPORATION or 8 | #its affiliates is strictly prohibited. 9 | output_dir: 10 | exp_name: 11 | 12 | fp16: false 13 | resume_model_path: 14 | load_optimizer: true 15 | iteration: 16 | 17 | dataset: 18 | data_root: "" 19 | batchsize: 4 20 | size: 256 21 | set_name: 22 | num_parts: 24 23 | num_workers: 2 # dataloader 24 | n_repetition_in_epoch: 1000 25 | coordinate_scale: 1500 26 | num_frames: 27 | num_view: 28 | prob_sample_latest: 0.2 29 | thin_out_interval: 1 30 | background_color: -1 31 | compression: True 32 | 33 | test_dataset: 34 | data_root: "" 35 | batchsize: 4 36 | size: 256 37 | num_parts: 24 38 | num_workers: 2 # dataloader 39 | n_repetition_in_epoch: 1 40 | coordinate_scale: 1500 41 | 42 | network_params: 43 | pixel_sampler: uniform 44 | decoder_params: 45 | hidden_dim: 256 46 | n_power: 6 47 | num_layers: 8 48 | sdf_residual_range: 0.05 49 | child_root: [ [ -0.75, 0, 0 ], [ 0, 0.75, 0 ], [ 0, -0.75, 0 ], 50 | [ 0, 0, 0.75 ], [ 0, 0, -0.75 ], [ 0.75, 0, 0 ] ] 51 | sdf_scale: 100 52 | initial_sdf_weight: 1. 
53 | residual_sdf: true 54 | trajectory_params: 55 | dct: 56 | hidden_dim: 256 57 | n_mlp: 4 58 | k: 200 59 | n_split: 1 60 | surface_loss: false 61 | structure_loss: false 62 | center_coef_for_structure_loss: 0 63 | 64 | loss_params: 65 | mask_loss_multiplier: 1 66 | surface_loss_coef: 0 67 | structure_loss_coef: 0 68 | initial_structure_loss_coef: 0 69 | max_structure_loss_coef: 0 70 | joint_2d_loss_coef: 0 71 | joint_3d_separation_loss_coef: 0 72 | sdf_loss_coef: 0 73 | pull_rigid_parts_loss_coef: 0 74 | 75 | train_setting: 76 | num_iter: 100000 77 | optimizer: Adam 78 | lr: 0.001 79 | decay: 0 80 | clip_grad: false 81 | scheduler_gamma: 1 82 | resume: False 83 | 84 | val_interval: 5000 85 | save_interval: 5000 86 | log_interval: 100 87 | 88 | # for DDP 89 | master_addr: localhost 90 | master_port: '12355' 91 | backend: nccl 92 | 93 | dataset_schedule_type: "incremental" # incremental 94 | frame_schedule: 95 | incremental: 96 | initial_frame: 2932 97 | start: 98 | incremental_period: 99 | 100 | 101 | render_setting: 102 | num_ray: 100 103 | -------------------------------------------------------------------------------- /src/confs/dog.yml: -------------------------------------------------------------------------------- 1 | #SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | #SPDX-License-Identifier: LicenseRef-NvidiaProprietary 3 | #NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 4 | #property and proprietary rights in and to this material, related 5 | #documentation and any modifications thereto. Any use, reproduction, 6 | #disclosure or distribution of this material and related documentation 7 | #without an express license agreement from NVIDIA CORPORATION or 8 | #its affiliates is strictly prohibited. 9 | output_dir: ../data/output 10 | exp_name: dog 11 | 12 | 13 | dataset: 14 | data_root: "../data/rgbd_dog/dog1/motion_testSeq" 15 | batchsize: 16 16 | size: 512 17 | set_name: 18 | num_parts: 20 19 | num_workers: 0 # dataloader 20 | num_frames: 601 21 | num_view: 8 22 | coordinate_scale: 1000 23 | prob_sample_latest: 0.2 24 | thin_out_interval: 1 25 | background_color: -1 26 | 27 | network_params: 28 | pixel_sampler: foreground 29 | decoder_params: 30 | sdf_residual_range: 0.02 31 | child_root: [[-0.75, 0, 0], [0, 0.75, 0], [0, -0.75, 0], 32 | [0, 0, 0.75], [0, 0, -0.75], [0.75, 0, 0]] 33 | sdf_scale: 600 34 | initial_sdf_weight: 30. 
35 | trajectory_params: 36 | dct: 37 | hidden_dim: 256 38 | n_mlp: 4 39 | k: 50 40 | n_split: 1 41 | surface_loss: true 42 | structure_loss: true 43 | center_coef_for_structure_loss: 0.02 44 | 45 | loss_params: 46 | mask_loss_multiplier: 1 47 | surface_loss_coef: 600 48 | structure_loss_coef: 2 49 | initial_structure_loss_coef: 2 50 | max_structure_loss_coef: 50 51 | joint_2d_loss_coef: 0 52 | joint_3d_separation_loss_coef: 1 53 | sdf_loss_coef: 0.2 54 | 55 | train_setting: 56 | num_iter: 200000 57 | optimizer: AdamW 58 | lr: 0.0003 59 | decay: 0.005 60 | clip_grad: true 61 | scheduler_gamma: 0.99995 62 | resume: False 63 | 64 | val_interval: 5000 65 | save_interval: 500 66 | log_interval: 100 67 | 68 | frame_schedule: 69 | incremental: 70 | initial_frame: 10 71 | start: 10000 72 | incremental_period: 70000 73 | 74 | render_setting: 75 | num_ray: 384 76 | -------------------------------------------------------------------------------- /src/confs/dog_merge.yml: -------------------------------------------------------------------------------- 1 | #SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | #SPDX-License-Identifier: LicenseRef-NvidiaProprietary 3 | #NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 4 | #property and proprietary rights in and to this material, related 5 | #documentation and any modifications thereto. Any use, reproduction, 6 | #disclosure or distribution of this material and related documentation 7 | #without an express license agreement from NVIDIA CORPORATION or 8 | #its affiliates is strictly prohibited. 9 | output_dir: ../data/output 10 | exp_name: dog_merge 11 | 12 | resume_model_path: ../data/output/result/dog/snapshot_150000.pth 13 | load_optimizer: false 14 | 15 | dataset: 16 | data_root: "../data/rgbd_dog/dog1/motion_testSeq" 17 | batchsize: 16 18 | size: 512 19 | set_name: 20 | num_parts: 20 21 | num_workers: 0 # dataloader 22 | num_frames: 601 23 | num_view: 8 24 | coordinate_scale: 1000 25 | prob_sample_latest: 0.2 26 | thin_out_interval: 1 27 | background_color: -1 28 | 29 | network_params: 30 | pixel_sampler: foreground 31 | decoder_params: 32 | sdf_residual_range: 0.02 33 | child_root: [ [ -0.75, 0, 0 ], [ 0, 0.75, 0 ], [ 0, -0.75, 0 ], 34 | [ 0, 0, 0.75 ], [ 0, 0, -0.75 ], [ 0.75, 0, 0 ] ] 35 | sdf_scale: 600 36 | initial_sdf_weight: 30. 37 | trajectory_params: 38 | dct: 39 | hidden_dim: 256 40 | n_mlp: 4 41 | k: 50 42 | n_split: 1 43 | surface_loss: true 44 | structure_loss: true 45 | center_coef_for_structure_loss: 0.02 46 | 47 | loss_params: 48 | mask_loss_multiplier: 1 49 | surface_loss_coef: 600 50 | structure_loss_coef: 2 51 | initial_structure_loss_coef: 2 52 | max_structure_loss_coef: 50 53 | joint_2d_loss_coef: 0 54 | joint_3d_separation_loss_coef: 1 55 | sdf_loss_coef: 0.2 56 | pull_rigid_parts_loss_coef: 5 57 | 58 | train_setting: 59 | num_iter: 200000 60 | optimizer: AdamW 61 | lr: 0.0003 62 | decay: 0.005 63 | clip_grad: true 64 | scheduler_gamma: 0.99995 65 | resume: False 66 | 67 | val_interval: 5000 68 | save_interval: 500 69 | log_interval: 100 70 | 71 | frame_schedule: 72 | incremental: 73 | initial_frame: 10 74 | start: 10000 75 | incremental_period: 70000 76 | 77 | render_setting: 78 | num_ray: 384 79 | -------------------------------------------------------------------------------- /src/confs/iiwa.yml: -------------------------------------------------------------------------------- 1 | #SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. 2 | #SPDX-License-Identifier: LicenseRef-NvidiaProprietary 3 | #NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 4 | #property and proprietary rights in and to this material, related 5 | #documentation and any modifications thereto. Any use, reproduction, 6 | #disclosure or distribution of this material and related documentation 7 | #without an express license agreement from NVIDIA CORPORATION or 8 | #its affiliates is strictly prohibited. 9 | output_dir: ../data/output 10 | exp_name: iiwa 11 | 12 | 13 | dataset: 14 | data_root: "../data/robots/iiwa" 15 | batchsize: 16 16 | size: 512 17 | num_parts: 8 18 | num_workers: 0 # dataloader 19 | num_frames: 300 20 | num_view: 5 21 | coordinate_scale: 1 22 | prob_sample_latest: 0.2 23 | thin_out_interval: 1 24 | background_color: 1 25 | 26 | network_params: 27 | pixel_sampler: foreground 28 | decoder_params: 29 | sdf_residual_range: 0.02 30 | child_root: [[-0.75, 0, 0], [0, 0.75, 0], [0, -0.75, 0], 31 | [0, 0, 0.75], [0, 0, -0.75], [0.75, 0, 0]] 32 | sdf_scale: 600 33 | initial_sdf_weight: 30. 34 | trajectory_params: 35 | dct: 36 | hidden_dim: 256 37 | n_mlp: 4 38 | k: 30 39 | n_split: 1 40 | surface_loss: true 41 | structure_loss: true 42 | center_coef_for_structure_loss: 0.02 43 | 44 | loss_params: 45 | mask_loss_multiplier: 1 46 | surface_loss_coef: 400 47 | structure_loss_coef: 1 48 | initial_structure_loss_coef: 1 49 | max_structure_loss_coef: 50 50 | joint_2d_loss_coef: 1000 51 | joint_3d_separation_loss_coef: 1 52 | sdf_loss_coef: 0.02 53 | 54 | train_setting: 55 | num_iter: 200000 56 | optimizer: AdamW 57 | lr: 0.0003 58 | decay: 0.005 59 | clip_grad: true 60 | scheduler_gamma: 0.99995 61 | resume: False 62 | 63 | val_interval: 5000 64 | save_interval: 500 65 | log_interval: 100 66 | 67 | frame_schedule: 68 | incremental: 69 | initial_frame: 10 70 | start: 10000 71 | incremental_period: 70000 72 | 73 | render_setting: 74 | num_ray: 512 75 | -------------------------------------------------------------------------------- /src/confs/iiwa_merge.yml: -------------------------------------------------------------------------------- 1 | #SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | #SPDX-License-Identifier: LicenseRef-NvidiaProprietary 3 | #NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 4 | #property and proprietary rights in and to this material, related 5 | #documentation and any modifications thereto. Any use, reproduction, 6 | #disclosure or distribution of this material and related documentation 7 | #without an express license agreement from NVIDIA CORPORATION or 8 | #its affiliates is strictly prohibited. 9 | output_dir: ../data/output 10 | exp_name: iiwa_merge 11 | 12 | resume_model_path: ../data/output/result/iiwa/snapshot_150000.pth 13 | load_optimizer: false 14 | 15 | dataset: 16 | data_root: "../data/robots/iiwa" 17 | batchsize: 16 18 | size: 512 19 | num_parts: 8 20 | num_workers: 0 # dataloader 21 | num_frames: 300 22 | num_view: 5 23 | coordinate_scale: 1 24 | prob_sample_latest: 0.2 25 | thin_out_interval: 1 26 | background_color: 1 27 | 28 | network_params: 29 | pixel_sampler: foreground 30 | decoder_params: 31 | sdf_residual_range: 0.02 32 | child_root: [[-0.75, 0, 0], [0, 0.75, 0], [0, -0.75, 0], 33 | [0, 0, 0.75], [0, 0, -0.75], [0.75, 0, 0]] 34 | sdf_scale: 600 35 | initial_sdf_weight: 30. 
36 | trajectory_params: 37 | dct: 38 | hidden_dim: 256 39 | n_mlp: 4 40 | k: 30 41 | n_split: 1 42 | surface_loss: true 43 | structure_loss: true 44 | center_coef_for_structure_loss: 0.02 45 | 46 | loss_params: 47 | mask_loss_multiplier: 1 48 | surface_loss_coef: 400 49 | structure_loss_coef: 1 50 | initial_structure_loss_coef: 1 51 | max_structure_loss_coef: 50 52 | joint_2d_loss_coef: 1000 53 | joint_3d_separation_loss_coef: 1 54 | sdf_loss_coef: 0.02 55 | pull_rigid_parts_loss_coef: 5 56 | 57 | train_setting: 58 | num_iter: 200000 59 | optimizer: AdamW 60 | lr: 0.0003 61 | decay: 0.005 62 | clip_grad: true 63 | scheduler_gamma: 0.99995 64 | resume: False 65 | 66 | val_interval: 5000 67 | save_interval: 500 68 | log_interval: 100 69 | 70 | frame_schedule: 71 | incremental: 72 | initial_frame: 10 73 | start: 10000 74 | incremental_period: 70000 75 | 76 | render_setting: 77 | num_ray: 512 78 | -------------------------------------------------------------------------------- /src/confs/nao.yml: -------------------------------------------------------------------------------- 1 | #SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | #SPDX-License-Identifier: LicenseRef-NvidiaProprietary 3 | #NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 4 | #property and proprietary rights in and to this material, related 5 | #documentation and any modifications thereto. Any use, reproduction, 6 | #disclosure or distribution of this material and related documentation 7 | #without an express license agreement from NVIDIA CORPORATION or 8 | #its affiliates is strictly prohibited. 9 | output_dir: ../data/output 10 | exp_name: nao 11 | 12 | 13 | dataset: 14 | data_root: "../data/robots/nao" 15 | batchsize: 16 16 | size: 512 17 | num_parts: 20 18 | num_workers: 0 # dataloader 19 | num_frames: 300 20 | num_view: 5 21 | coordinate_scale: 0.333 22 | prob_sample_latest: 0.2 23 | thin_out_interval: 1 24 | background_color: 1 25 | 26 | network_params: 27 | pixel_sampler: foreground 28 | decoder_params: 29 | sdf_residual_range: 0.02 30 | child_root: [[-0.75, 0, 0], [0, 0.75, 0], [0, -0.75, 0], 31 | [0, 0, 0.75], [0, 0, -0.75], [0.75, 0, 0]] 32 | sdf_scale: 600 33 | initial_sdf_weight: 30. 34 | trajectory_params: 35 | dct: 36 | hidden_dim: 256 37 | n_mlp: 4 38 | k: 30 39 | n_split: 1 40 | surface_loss: true 41 | structure_loss: true 42 | center_coef_for_structure_loss: 0.02 43 | 44 | loss_params: 45 | mask_loss_multiplier: 1 46 | surface_loss_coef: 600 47 | structure_loss_coef: 1 48 | initial_structure_loss_coef: 1 49 | max_structure_loss_coef: 50 50 | joint_2d_loss_coef: 0 51 | joint_3d_separation_loss_coef: 1 52 | sdf_loss_coef: 0.2 53 | 54 | train_setting: 55 | num_iter: 200000 56 | optimizer: AdamW 57 | lr: 0.0003 58 | decay: 0.005 59 | clip_grad: true 60 | scheduler_gamma: 0.99995 61 | resume: False 62 | 63 | val_interval: 5000 64 | save_interval: 500 65 | log_interval: 100 66 | 67 | frame_schedule: 68 | incremental: 69 | initial_frame: 10 70 | start: 10000 71 | incremental_period: 70000 72 | 73 | render_setting: 74 | num_ray: 384 75 | -------------------------------------------------------------------------------- /src/confs/nao_merge.yml: -------------------------------------------------------------------------------- 1 | #SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
2 | #SPDX-License-Identifier: LicenseRef-NvidiaProprietary 3 | #NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 4 | #property and proprietary rights in and to this material, related 5 | #documentation and any modifications thereto. Any use, reproduction, 6 | #disclosure or distribution of this material and related documentation 7 | #without an express license agreement from NVIDIA CORPORATION or 8 | #its affiliates is strictly prohibited. 9 | output_dir: ../data/output 10 | exp_name: nao_merge 11 | 12 | resume_model_path: ../data/output/result/nao/snapshot_150000.pth 13 | load_optimizer: false 14 | 15 | 16 | dataset: 17 | data_root: "../data/robots/nao" 18 | batchsize: 16 19 | size: 512 20 | num_parts: 20 21 | num_workers: 0 # dataloader 22 | num_frames: 300 23 | num_view: 5 24 | coordinate_scale: 0.333 25 | prob_sample_latest: 0.2 26 | thin_out_interval: 1 27 | background_color: 1 28 | 29 | network_params: 30 | pixel_sampler: foreground 31 | decoder_params: 32 | sdf_residual_range: 0.02 33 | child_root: [[-0.75, 0, 0], [0, 0.75, 0], [0, -0.75, 0], 34 | [0, 0, 0.75], [0, 0, -0.75], [0.75, 0, 0]] 35 | sdf_scale: 600 36 | initial_sdf_weight: 30. 37 | trajectory_params: 38 | dct: 39 | hidden_dim: 256 40 | n_mlp: 4 41 | k: 30 42 | n_split: 1 43 | surface_loss: true 44 | structure_loss: true 45 | center_coef_for_structure_loss: 0.02 46 | 47 | loss_params: 48 | mask_loss_multiplier: 1 49 | surface_loss_coef: 600 50 | structure_loss_coef: 1 51 | initial_structure_loss_coef: 1 52 | max_structure_loss_coef: 50 53 | joint_2d_loss_coef: 0 54 | joint_3d_separation_loss_coef: 1 55 | sdf_loss_coef: 0.2 56 | pull_rigid_parts_loss_coef: 5 57 | 58 | train_setting: 59 | num_iter: 200000 60 | optimizer: AdamW 61 | lr: 0.0003 62 | decay: 0.005 63 | clip_grad: true 64 | scheduler_gamma: 0.99995 65 | resume: False 66 | 67 | val_interval: 5000 68 | save_interval: 500 69 | log_interval: 100 70 | 71 | frame_schedule: 72 | incremental: 73 | initial_frame: 10 74 | start: 10000 75 | incremental_period: 70000 76 | 77 | render_setting: 78 | num_ray: 384 79 | -------------------------------------------------------------------------------- /src/confs/pandas.yml: -------------------------------------------------------------------------------- 1 | #SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | #SPDX-License-Identifier: LicenseRef-NvidiaProprietary 3 | #NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 4 | #property and proprietary rights in and to this material, related 5 | #documentation and any modifications thereto. Any use, reproduction, 6 | #disclosure or distribution of this material and related documentation 7 | #without an express license agreement from NVIDIA CORPORATION or 8 | #its affiliates is strictly prohibited. 9 | output_dir: ../data/output 10 | exp_name: pandas 11 | 12 | 13 | dataset: 14 | data_root: "../data/robots/pandas" 15 | batchsize: 16 16 | size: 512 17 | num_parts: 10 18 | num_workers: 0 # dataloader 19 | num_frames: 300 20 | num_view: 5 21 | coordinate_scale: 1 22 | prob_sample_latest: 0.2 23 | thin_out_interval: 1 24 | background_color: 1 25 | 26 | network_params: 27 | pixel_sampler: foreground 28 | decoder_params: 29 | sdf_residual_range: 0.02 30 | child_root: [[-0.75, 0, 0], [0, 0.75, 0], [0, -0.75, 0], 31 | [0, 0, 0.75], [0, 0, -0.75], [0.75, 0, 0]] 32 | sdf_scale: 600 33 | initial_sdf_weight: 30. 
34 | trajectory_params: 35 | dct: 36 | hidden_dim: 256 37 | n_mlp: 4 38 | k: 30 39 | n_split: 1 40 | surface_loss: true 41 | structure_loss: true 42 | center_coef_for_structure_loss: 0.02 43 | 44 | loss_params: 45 | mask_loss_multiplier: 1 46 | surface_loss_coef: 400 47 | structure_loss_coef: 1 48 | initial_structure_loss_coef: 1 49 | max_structure_loss_coef: 50 50 | joint_2d_loss_coef: 1000 51 | joint_3d_separation_loss_coef: 1 52 | sdf_loss_coef: 0.02 53 | 54 | train_setting: 55 | num_iter: 200000 56 | optimizer: AdamW 57 | lr: 0.0003 58 | decay: 0.005 59 | clip_grad: true 60 | scheduler_gamma: 0.99995 61 | resume: False 62 | 63 | val_interval: 5000 64 | save_interval: 500 65 | log_interval: 100 66 | 67 | frame_schedule: 68 | incremental: 69 | initial_frame: 10 70 | start: 10000 71 | incremental_period: 70000 72 | 73 | render_setting: 74 | num_ray: 512 75 | -------------------------------------------------------------------------------- /src/confs/pandas_merge.yml: -------------------------------------------------------------------------------- 1 | #SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | #SPDX-License-Identifier: LicenseRef-NvidiaProprietary 3 | #NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 4 | #property and proprietary rights in and to this material, related 5 | #documentation and any modifications thereto. Any use, reproduction, 6 | #disclosure or distribution of this material and related documentation 7 | #without an express license agreement from NVIDIA CORPORATION or 8 | #its affiliates is strictly prohibited. 9 | output_dir: ../data/output 10 | exp_name: pandas_merge 11 | 12 | resume_model_path: ../data/output/result/pandas/snapshot_150000.pth 13 | load_optimizer: false 14 | 15 | dataset: 16 | data_root: "../data/robots/pandas" 17 | batchsize: 16 18 | size: 512 19 | num_parts: 10 20 | num_workers: 0 # dataloader 21 | num_frames: 300 22 | num_view: 5 23 | coordinate_scale: 1 24 | prob_sample_latest: 0.2 25 | thin_out_interval: 1 26 | background_color: 1 27 | 28 | network_params: 29 | pixel_sampler: foreground 30 | decoder_params: 31 | sdf_residual_range: 0.02 32 | child_root: [[-0.75, 0, 0], [0, 0.75, 0], [0, -0.75, 0], 33 | [0, 0, 0.75], [0, 0, -0.75], [0.75, 0, 0]] 34 | sdf_scale: 600 35 | initial_sdf_weight: 30. 36 | trajectory_params: 37 | dct: 38 | hidden_dim: 256 39 | n_mlp: 4 40 | k: 30 41 | n_split: 1 42 | surface_loss: true 43 | structure_loss: true 44 | center_coef_for_structure_loss: 0.02 45 | 46 | loss_params: 47 | mask_loss_multiplier: 1 48 | surface_loss_coef: 400 49 | structure_loss_coef: 1 50 | initial_structure_loss_coef: 1 51 | max_structure_loss_coef: 50 52 | joint_2d_loss_coef: 1000 53 | joint_3d_separation_loss_coef: 1 54 | sdf_loss_coef: 0.02 55 | pull_rigid_parts_loss_coef: 5 56 | 57 | train_setting: 58 | num_iter: 200000 59 | optimizer: AdamW 60 | lr: 0.0003 61 | decay: 0.005 62 | clip_grad: true 63 | scheduler_gamma: 0.99995 64 | resume: False 65 | 66 | val_interval: 5000 67 | save_interval: 500 68 | log_interval: 100 69 | 70 | frame_schedule: 71 | incremental: 72 | initial_frame: 10 73 | start: 10000 74 | incremental_period: 70000 75 | 76 | render_setting: 77 | num_ray: 512 78 | -------------------------------------------------------------------------------- /src/confs/spot.yml: -------------------------------------------------------------------------------- 1 | #SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
2 | #SPDX-License-Identifier: LicenseRef-NvidiaProprietary 3 | #NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 4 | #property and proprietary rights in and to this material, related 5 | #documentation and any modifications thereto. Any use, reproduction, 6 | #disclosure or distribution of this material and related documentation 7 | #without an express license agreement from NVIDIA CORPORATION or 8 | #its affiliates is strictly prohibited. 9 | output_dir: ../data/output 10 | exp_name: spot 11 | 12 | 13 | dataset: 14 | data_root: "../data/robots/spot" 15 | batchsize: 16 16 | size: 512 17 | num_parts: 15 18 | num_workers: 0 # dataloader 19 | num_frames: 300 20 | num_view: 5 21 | coordinate_scale: 1 22 | prob_sample_latest: 0.2 23 | thin_out_interval: 1 24 | background_color: 1 25 | 26 | network_params: 27 | pixel_sampler: foreground 28 | decoder_params: 29 | sdf_residual_range: 0.02 30 | child_root: [[-0.75, 0, 0], [0, 0.75, 0], [0, -0.75, 0], 31 | [0, 0, 0.75], [0, 0, -0.75], [0.75, 0, 0]] 32 | sdf_scale: 600 33 | initial_sdf_weight: 30. 34 | trajectory_params: 35 | dct: 36 | hidden_dim: 256 37 | n_mlp: 4 38 | k: 30 39 | n_split: 1 40 | surface_loss: true 41 | structure_loss: true 42 | center_coef_for_structure_loss: 0.02 43 | 44 | loss_params: 45 | mask_loss_multiplier: 1 46 | surface_loss_coef: 400 47 | structure_loss_coef: 1 48 | initial_structure_loss_coef: 1 49 | max_structure_loss_coef: 50 50 | joint_2d_loss_coef: 1000 51 | joint_3d_separation_loss_coef: 1 52 | sdf_loss_coef: 0.02 53 | 54 | train_setting: 55 | num_iter: 200000 56 | optimizer: AdamW 57 | lr: 0.0003 58 | decay: 0.005 59 | clip_grad: true 60 | scheduler_gamma: 0.99995 61 | resume: False 62 | 63 | val_interval: 5000 64 | save_interval: 500 65 | log_interval: 100 66 | 67 | frame_schedule: 68 | incremental: 69 | initial_frame: 10 70 | start: 10000 71 | incremental_period: 70000 72 | 73 | render_setting: 74 | num_ray: 512 75 | -------------------------------------------------------------------------------- /src/confs/spot_merge.yml: -------------------------------------------------------------------------------- 1 | #SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | #SPDX-License-Identifier: LicenseRef-NvidiaProprietary 3 | #NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 4 | #property and proprietary rights in and to this material, related 5 | #documentation and any modifications thereto. Any use, reproduction, 6 | #disclosure or distribution of this material and related documentation 7 | #without an express license agreement from NVIDIA CORPORATION or 8 | #its affiliates is strictly prohibited. 9 | output_dir: ../data/output 10 | exp_name: spot_merge 11 | 12 | resume_model_path: ../data/output/result/spot/snapshot_150000.pth 13 | load_optimizer: false 14 | 15 | dataset: 16 | data_root: "../data/robots/spot" 17 | batchsize: 16 18 | size: 512 19 | num_parts: 15 20 | num_workers: 0 # dataloader 21 | num_frames: 300 22 | num_view: 5 23 | coordinate_scale: 1 24 | prob_sample_latest: 0.2 25 | thin_out_interval: 1 26 | background_color: 1 27 | 28 | network_params: 29 | pixel_sampler: foreground 30 | decoder_params: 31 | sdf_residual_range: 0.02 32 | child_root: [[-0.75, 0, 0], [0, 0.75, 0], [0, -0.75, 0], 33 | [0, 0, 0.75], [0, 0, -0.75], [0.75, 0, 0]] 34 | sdf_scale: 600 35 | initial_sdf_weight: 30. 
36 | trajectory_params: 37 | dct: 38 | hidden_dim: 256 39 | n_mlp: 4 40 | k: 30 41 | n_split: 1 42 | surface_loss: true 43 | structure_loss: true 44 | center_coef_for_structure_loss: 0.02 45 | 46 | loss_params: 47 | mask_loss_multiplier: 1 48 | surface_loss_coef: 400 49 | structure_loss_coef: 1 50 | initial_structure_loss_coef: 1 51 | max_structure_loss_coef: 50 52 | joint_2d_loss_coef: 1000 53 | joint_3d_separation_loss_coef: 1 54 | sdf_loss_coef: 0.02 55 | pull_rigid_parts_loss_coef: 5 56 | 57 | train_setting: 58 | num_iter: 200000 59 | optimizer: AdamW 60 | lr: 0.0003 61 | decay: 0.005 62 | clip_grad: true 63 | scheduler_gamma: 0.99995 64 | resume: False 65 | 66 | val_interval: 5000 67 | save_interval: 500 68 | log_interval: 100 69 | 70 | frame_schedule: 71 | incremental: 72 | initial_frame: 10 73 | start: 10000 74 | incremental_period: 70000 75 | 76 | render_setting: 77 | num_ray: 512 78 | -------------------------------------------------------------------------------- /src/confs/zju366.yml: -------------------------------------------------------------------------------- 1 | #SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | #SPDX-License-Identifier: LicenseRef-NvidiaProprietary 3 | #NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 4 | #property and proprietary rights in and to this material, related 5 | #documentation and any modifications thereto. Any use, reproduction, 6 | #disclosure or distribution of this material and related documentation 7 | #without an express license agreement from NVIDIA CORPORATION or 8 | #its affiliates is strictly prohibited. 9 | output_dir: ../data/output 10 | exp_name: zju366 11 | 12 | 13 | dataset: 14 | data_root: "../data/zju_mocap/cache512/366" 15 | batchsize: 16 16 | size: 512 17 | set_name: train 18 | num_parts: 20 19 | num_workers: 0 # dataloader 20 | num_frames: 623 21 | num_view: 6 22 | coordinate_scale: 1.5 23 | prob_sample_latest: 0.2 24 | thin_out_interval: 1 25 | background_color: -1 26 | 27 | network_params: 28 | pixel_sampler: foreground 29 | decoder_params: 30 | sdf_residual_range: 0.02 31 | child_root: [[-0.75, 0, 0], [0, 0.75, 0], [0, -0.75, 0], 32 | [0, 0, 0.75], [0, 0, -0.75], [0.75, 0, 0]] 33 | sdf_scale: 600 34 | initial_sdf_weight: 30. 35 | trajectory_params: 36 | dct: 37 | hidden_dim: 256 38 | n_mlp: 4 39 | k: 50 40 | n_split: 1 41 | surface_loss: true 42 | structure_loss: true 43 | center_coef_for_structure_loss: 0.02 44 | 45 | loss_params: 46 | mask_loss_multiplier: 1 47 | surface_loss_coef: 600 48 | structure_loss_coef: 2 49 | initial_structure_loss_coef: 2 50 | max_structure_loss_coef: 50 51 | joint_2d_loss_coef: 1000 52 | joint_3d_separation_loss_coef: 1 53 | sdf_loss_coef: 0.2 54 | 55 | train_setting: 56 | num_iter: 200000 57 | optimizer: AdamW 58 | lr: 0.0003 59 | decay: 0.005 60 | clip_grad: true 61 | scheduler_gamma: 0.99995 62 | resume: False 63 | 64 | val_interval: 5000 65 | save_interval: 500 66 | log_interval: 100 67 | 68 | frame_schedule: 69 | incremental: 70 | initial_frame: 10 71 | start: 10000 72 | incremental_period: 70000 73 | 74 | render_setting: 75 | num_ray: 384 76 | -------------------------------------------------------------------------------- /src/confs/zju366_merge.yml: -------------------------------------------------------------------------------- 1 | #SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
2 | #SPDX-License-Identifier: LicenseRef-NvidiaProprietary 3 | #NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 4 | #property and proprietary rights in and to this material, related 5 | #documentation and any modifications thereto. Any use, reproduction, 6 | #disclosure or distribution of this material and related documentation 7 | #without an express license agreement from NVIDIA CORPORATION or 8 | #its affiliates is strictly prohibited. 9 | output_dir: ../data/output 10 | exp_name: zju366_merge 11 | 12 | resume_model_path: ../data/output/result/zju366/snapshot_150000.pth 13 | load_optimizer: false 14 | 15 | dataset: 16 | data_root: "../data/zju_mocap/cache512/366" 17 | batchsize: 16 18 | size: 512 19 | set_name: train 20 | num_parts: 20 21 | num_workers: 0 # dataloader 22 | num_frames: 623 23 | num_view: 6 24 | coordinate_scale: 1.5 25 | prob_sample_latest: 0.2 26 | thin_out_interval: 1 27 | background_color: -1 28 | 29 | network_params: 30 | pixel_sampler: foreground 31 | decoder_params: 32 | sdf_residual_range: 0.02 33 | child_root: [[-0.75, 0, 0], [0, 0.75, 0], [0, -0.75, 0], 34 | [0, 0, 0.75], [0, 0, -0.75], [0.75, 0, 0]] 35 | sdf_scale: 600 36 | initial_sdf_weight: 30. 37 | trajectory_params: 38 | dct: 39 | hidden_dim: 256 40 | n_mlp: 4 41 | k: 50 42 | n_split: 1 43 | surface_loss: true 44 | structure_loss: true 45 | center_coef_for_structure_loss: 0.02 46 | 47 | loss_params: 48 | mask_loss_multiplier: 1 49 | surface_loss_coef: 600 50 | structure_loss_coef: 2 51 | initial_structure_loss_coef: 2 52 | max_structure_loss_coef: 50 53 | joint_2d_loss_coef: 1000 54 | joint_3d_separation_loss_coef: 1 55 | sdf_loss_coef: 0.2 56 | pull_rigid_parts_loss_coef: 5 57 | 58 | train_setting: 59 | num_iter: 200000 60 | optimizer: AdamW 61 | lr: 0.0003 62 | decay: 0.005 63 | clip_grad: true 64 | scheduler_gamma: 0.99995 65 | resume: False 66 | 67 | val_interval: 5000 68 | save_interval: 500 69 | log_interval: 100 70 | 71 | frame_schedule: 72 | incremental: 73 | initial_frame: 10 74 | start: 10000 75 | incremental_period: 70000 76 | 77 | render_setting: 78 | num_ray: 384 79 | -------------------------------------------------------------------------------- /src/confs/zju377.yml: -------------------------------------------------------------------------------- 1 | #SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | #SPDX-License-Identifier: LicenseRef-NvidiaProprietary 3 | #NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 4 | #property and proprietary rights in and to this material, related 5 | #documentation and any modifications thereto. Any use, reproduction, 6 | #disclosure or distribution of this material and related documentation 7 | #without an express license agreement from NVIDIA CORPORATION or 8 | #its affiliates is strictly prohibited. 9 | output_dir: ../data/output 10 | exp_name: zju377 11 | 12 | 13 | dataset: 14 | data_root: "../data/zju_mocap/cache512/377" 15 | batchsize: 16 16 | size: 512 17 | set_name: train 18 | num_parts: 20 19 | num_workers: 0 # dataloader 20 | num_frames: 493 21 | num_view: 6 22 | coordinate_scale: 1.5 23 | prob_sample_latest: 0.2 24 | thin_out_interval: 1 25 | background_color: -1 26 | 27 | network_params: 28 | pixel_sampler: foreground 29 | decoder_params: 30 | sdf_residual_range: 0.02 31 | child_root: [[-0.75, 0, 0], [0, 0.75, 0], [0, -0.75, 0], 32 | [0, 0, 0.75], [0, 0, -0.75], [0.75, 0, 0]] 33 | sdf_scale: 600 34 | initial_sdf_weight: 30. 
35 | trajectory_params: 36 | dct: 37 | hidden_dim: 256 38 | n_mlp: 4 39 | k: 50 40 | n_split: 1 41 | surface_loss: true 42 | structure_loss: true 43 | center_coef_for_structure_loss: 0.02 44 | 45 | loss_params: 46 | mask_loss_multiplier: 1 47 | surface_loss_coef: 600 48 | structure_loss_coef: 2 49 | initial_structure_loss_coef: 2 50 | max_structure_loss_coef: 50 51 | joint_2d_loss_coef: 1000 52 | joint_3d_separation_loss_coef: 1 53 | sdf_loss_coef: 0.2 54 | 55 | train_setting: 56 | num_iter: 200000 57 | optimizer: AdamW 58 | lr: 0.0003 59 | decay: 0.005 60 | clip_grad: true 61 | scheduler_gamma: 0.99995 62 | resume: False 63 | 64 | val_interval: 5000 65 | save_interval: 500 66 | log_interval: 100 67 | 68 | frame_schedule: 69 | incremental: 70 | initial_frame: 10 71 | start: 10000 72 | incremental_period: 70000 73 | 74 | render_setting: 75 | num_ray: 384 76 | -------------------------------------------------------------------------------- /src/confs/zju377_merge.yml: -------------------------------------------------------------------------------- 1 | #SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | #SPDX-License-Identifier: LicenseRef-NvidiaProprietary 3 | #NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 4 | #property and proprietary rights in and to this material, related 5 | #documentation and any modifications thereto. Any use, reproduction, 6 | #disclosure or distribution of this material and related documentation 7 | #without an express license agreement from NVIDIA CORPORATION or 8 | #its affiliates is strictly prohibited. 9 | output_dir: ../data/output 10 | exp_name: zju377_merge 11 | 12 | resume_model_path: ../data/output/result/zju377/snapshot_150000.pth 13 | load_optimizer: false 14 | 15 | dataset: 16 | data_root: "../data/zju_mocap/cache512/377" 17 | batchsize: 16 18 | size: 512 19 | set_name: train 20 | num_parts: 20 21 | num_workers: 0 # dataloader 22 | num_frames: 493 23 | num_view: 6 24 | coordinate_scale: 1.5 25 | prob_sample_latest: 0.2 26 | thin_out_interval: 1 27 | background_color: -1 28 | 29 | network_params: 30 | pixel_sampler: foreground 31 | decoder_params: 32 | sdf_residual_range: 0.02 33 | child_root: [[-0.75, 0, 0], [0, 0.75, 0], [0, -0.75, 0], 34 | [0, 0, 0.75], [0, 0, -0.75], [0.75, 0, 0]] 35 | sdf_scale: 600 36 | initial_sdf_weight: 30. 
37 | trajectory_params: 38 | dct: 39 | hidden_dim: 256 40 | n_mlp: 4 41 | k: 50 42 | n_split: 1 43 | surface_loss: true 44 | structure_loss: true 45 | center_coef_for_structure_loss: 0.02 46 | 47 | loss_params: 48 | mask_loss_multiplier: 1 49 | surface_loss_coef: 600 50 | structure_loss_coef: 2 51 | initial_structure_loss_coef: 2 52 | max_structure_loss_coef: 50 53 | joint_2d_loss_coef: 1000 54 | joint_3d_separation_loss_coef: 1 55 | sdf_loss_coef: 0.2 56 | pull_rigid_parts_loss_coef: 5 57 | 58 | train_setting: 59 | num_iter: 200000 60 | optimizer: AdamW 61 | lr: 0.0003 62 | decay: 0.005 63 | clip_grad: true 64 | scheduler_gamma: 0.99995 65 | resume: False 66 | 67 | val_interval: 5000 68 | save_interval: 500 69 | log_interval: 100 70 | 71 | frame_schedule: 72 | incremental: 73 | initial_frame: 10 74 | start: 10000 75 | incremental_period: 70000 76 | 77 | render_setting: 78 | num_ray: 384 79 | -------------------------------------------------------------------------------- /src/confs/zju381.yml: -------------------------------------------------------------------------------- 1 | #SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | #SPDX-License-Identifier: LicenseRef-NvidiaProprietary 3 | #NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 4 | #property and proprietary rights in and to this material, related 5 | #documentation and any modifications thereto. Any use, reproduction, 6 | #disclosure or distribution of this material and related documentation 7 | #without an express license agreement from NVIDIA CORPORATION or 8 | #its affiliates is strictly prohibited. 9 | output_dir: ../data/output 10 | exp_name: zju381 11 | 12 | resume_model_path: ../data/output/result/zju381_str25/snapshot_150000.pth 13 | load_optimizer: false 14 | 15 | dataset: 16 | data_root: "../data/zju_mocap/cache512/381" 17 | batchsize: 16 18 | size: 512 19 | set_name: train 20 | num_parts: 20 21 | num_workers: 0 # dataloader 22 | num_frames: 500 23 | num_view: 6 24 | coordinate_scale: 1.5 25 | prob_sample_latest: 0.2 26 | thin_out_interval: 1 27 | background_color: -1 28 | 29 | network_params: 30 | pixel_sampler: foreground 31 | decoder_params: 32 | sdf_residual_range: 0.02 33 | child_root: [[-0.75, 0, 0], [0, 0.75, 0], [0, -0.75, 0], 34 | [0, 0, 0.75], [0, 0, -0.75], [0.75, 0, 0]] 35 | sdf_scale: 600 36 | initial_sdf_weight: 30. 
37 | trajectory_params: 38 | dct: 39 | hidden_dim: 256 40 | n_mlp: 4 41 | k: 50 42 | n_split: 1 43 | surface_loss: true 44 | structure_loss: true 45 | center_coef_for_structure_loss: 0.02 46 | 47 | loss_params: 48 | mask_loss_multiplier: 1 49 | surface_loss_coef: 600 50 | structure_loss_coef: 2 51 | initial_structure_loss_coef: 2 52 | max_structure_loss_coef: 50 53 | joint_2d_loss_coef: 1000 54 | joint_3d_separation_loss_coef: 1 55 | sdf_loss_coef: 0.2 56 | 57 | train_setting: 58 | num_iter: 200000 59 | optimizer: AdamW 60 | lr: 0.0003 61 | decay: 0.005 62 | clip_grad: true 63 | scheduler_gamma: 0.99995 64 | resume: False 65 | 66 | val_interval: 5000 67 | save_interval: 500 68 | log_interval: 100 69 | 70 | frame_schedule: 71 | incremental: 72 | initial_frame: 10 73 | start: 10000 74 | incremental_period: 70000 75 | 76 | render_setting: 77 | num_ray: 384 78 | -------------------------------------------------------------------------------- /src/confs/zju381_merge.yml: -------------------------------------------------------------------------------- 1 | #SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | #SPDX-License-Identifier: LicenseRef-NvidiaProprietary 3 | #NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 4 | #property and proprietary rights in and to this material, related 5 | #documentation and any modifications thereto. Any use, reproduction, 6 | #disclosure or distribution of this material and related documentation 7 | #without an express license agreement from NVIDIA CORPORATION or 8 | #its affiliates is strictly prohibited. 9 | output_dir: ../data/output 10 | exp_name: zju381_merge 11 | 12 | resume_model_path: ../data/output/result/zju381/snapshot_150000.pth 13 | load_optimizer: false 14 | 15 | dataset: 16 | data_root: "../data/zju_mocap/cache512/381" 17 | batchsize: 16 18 | size: 512 19 | set_name: train 20 | num_parts: 20 21 | num_workers: 0 # dataloader 22 | num_frames: 500 23 | num_view: 6 24 | coordinate_scale: 1.5 25 | prob_sample_latest: 0.2 26 | thin_out_interval: 1 27 | background_color: -1 28 | 29 | network_params: 30 | pixel_sampler: foreground 31 | decoder_params: 32 | sdf_residual_range: 0.02 33 | child_root: [[-0.75, 0, 0], [0, 0.75, 0], [0, -0.75, 0], 34 | [0, 0, 0.75], [0, 0, -0.75], [0.75, 0, 0]] 35 | sdf_scale: 600 36 | initial_sdf_weight: 30. 
37 | trajectory_params: 38 | dct: 39 | hidden_dim: 256 40 | n_mlp: 4 41 | k: 50 42 | n_split: 1 43 | surface_loss: true 44 | structure_loss: true 45 | center_coef_for_structure_loss: 0.02 46 | 47 | loss_params: 48 | mask_loss_multiplier: 1 49 | surface_loss_coef: 600 50 | structure_loss_coef: 2 51 | initial_structure_loss_coef: 2 52 | max_structure_loss_coef: 50 53 | joint_2d_loss_coef: 1000 54 | joint_3d_separation_loss_coef: 1 55 | sdf_loss_coef: 0.2 56 | pull_rigid_parts_loss_coef: 5 57 | 58 | train_setting: 59 | num_iter: 200000 60 | optimizer: AdamW 61 | lr: 0.0003 62 | decay: 0.005 63 | clip_grad: true 64 | scheduler_gamma: 0.99995 65 | resume: False 66 | 67 | val_interval: 5000 68 | save_interval: 500 69 | log_interval: 100 70 | 71 | frame_schedule: 72 | incremental: 73 | initial_frame: 10 74 | start: 10000 75 | incremental_period: 70000 76 | 77 | render_setting: 78 | num_ray: 384 79 | -------------------------------------------------------------------------------- /src/confs/zju384.yml: -------------------------------------------------------------------------------- 1 | #SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | #SPDX-License-Identifier: LicenseRef-NvidiaProprietary 3 | #NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 4 | #property and proprietary rights in and to this material, related 5 | #documentation and any modifications thereto. Any use, reproduction, 6 | #disclosure or distribution of this material and related documentation 7 | #without an express license agreement from NVIDIA CORPORATION or 8 | #its affiliates is strictly prohibited. 9 | output_dir: ../data/output 10 | exp_name: zju384 11 | 12 | 13 | dataset: 14 | data_root: "../data/zju_mocap/cache512/384" 15 | batchsize: 16 16 | size: 512 17 | set_name: train 18 | num_parts: 20 19 | num_workers: 0 # dataloader 20 | num_frames: 756 21 | num_view: 6 22 | coordinate_scale: 1.5 23 | prob_sample_latest: 0.2 24 | thin_out_interval: 1 25 | background_color: -1 26 | 27 | network_params: 28 | pixel_sampler: foreground 29 | decoder_params: 30 | sdf_residual_range: 0.02 31 | child_root: [[-0.75, 0, 0], [0, 0.75, 0], [0, -0.75, 0], 32 | [0, 0, 0.75], [0, 0, -0.75], [0.75, 0, 0]] 33 | sdf_scale: 600 34 | initial_sdf_weight: 30. 35 | trajectory_params: 36 | dct: 37 | hidden_dim: 256 38 | n_mlp: 4 39 | k: 50 40 | n_split: 1 41 | surface_loss: true 42 | structure_loss: true 43 | center_coef_for_structure_loss: 0.02 44 | 45 | loss_params: 46 | mask_loss_multiplier: 1 47 | surface_loss_coef: 600 48 | structure_loss_coef: 2 49 | initial_structure_loss_coef: 2 50 | max_structure_loss_coef: 50 51 | joint_2d_loss_coef: 1000 52 | joint_3d_separation_loss_coef: 1 53 | sdf_loss_coef: 0.2 54 | 55 | train_setting: 56 | num_iter: 200000 57 | optimizer: AdamW 58 | lr: 0.0003 59 | decay: 0.005 60 | clip_grad: true 61 | scheduler_gamma: 0.99995 62 | resume: False 63 | 64 | val_interval: 5000 65 | save_interval: 500 66 | log_interval: 100 67 | 68 | frame_schedule: 69 | incremental: 70 | initial_frame: 10 71 | start: 10000 72 | incremental_period: 70000 73 | 74 | render_setting: 75 | num_ray: 384 76 | -------------------------------------------------------------------------------- /src/confs/zju384_merge.yml: -------------------------------------------------------------------------------- 1 | #SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
2 | #SPDX-License-Identifier: LicenseRef-NvidiaProprietary 3 | #NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 4 | #property and proprietary rights in and to this material, related 5 | #documentation and any modifications thereto. Any use, reproduction, 6 | #disclosure or distribution of this material and related documentation 7 | #without an express license agreement from NVIDIA CORPORATION or 8 | #its affiliates is strictly prohibited. 9 | output_dir: ../data/output 10 | exp_name: zju384_merge 11 | 12 | resume_model_path: ../data/output/result/zju384/snapshot_150000.pth 13 | load_optimizer: false 14 | 15 | dataset: 16 | data_root: "../data/zju_mocap/cache512/384" 17 | batchsize: 16 18 | size: 512 19 | set_name: train 20 | num_parts: 20 21 | num_workers: 0 # dataloader 22 | num_frames: 756 23 | num_view: 6 24 | coordinate_scale: 1.5 25 | prob_sample_latest: 0.2 26 | thin_out_interval: 1 27 | background_color: -1 28 | 29 | network_params: 30 | pixel_sampler: foreground 31 | decoder_params: 32 | sdf_residual_range: 0.02 33 | child_root: [[-0.75, 0, 0], [0, 0.75, 0], [0, -0.75, 0], 34 | [0, 0, 0.75], [0, 0, -0.75], [0.75, 0, 0]] 35 | sdf_scale: 600 36 | initial_sdf_weight: 30. 37 | trajectory_params: 38 | dct: 39 | hidden_dim: 256 40 | n_mlp: 4 41 | k: 50 42 | n_split: 1 43 | surface_loss: true 44 | structure_loss: true 45 | center_coef_for_structure_loss: 0.02 46 | 47 | loss_params: 48 | mask_loss_multiplier: 1 49 | surface_loss_coef: 600 50 | structure_loss_coef: 2 51 | initial_structure_loss_coef: 2 52 | max_structure_loss_coef: 50 53 | joint_2d_loss_coef: 1000 54 | joint_3d_separation_loss_coef: 1 55 | sdf_loss_coef: 0.2 56 | pull_rigid_parts_loss_coef: 5 57 | 58 | train_setting: 59 | num_iter: 200000 60 | optimizer: AdamW 61 | lr: 0.0003 62 | decay: 0.005 63 | clip_grad: true 64 | scheduler_gamma: 0.99995 65 | resume: False 66 | 67 | val_interval: 5000 68 | save_interval: 500 69 | log_interval: 100 70 | 71 | frame_schedule: 72 | incremental: 73 | initial_frame: 10 74 | start: 10000 75 | incremental_period: 70000 76 | 77 | render_setting: 78 | num_ray: 384 79 | -------------------------------------------------------------------------------- /src/confs/zju387.yml: -------------------------------------------------------------------------------- 1 | #SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | #SPDX-License-Identifier: LicenseRef-NvidiaProprietary 3 | #NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 4 | #property and proprietary rights in and to this material, related 5 | #documentation and any modifications thereto. Any use, reproduction, 6 | #disclosure or distribution of this material and related documentation 7 | #without an express license agreement from NVIDIA CORPORATION or 8 | #its affiliates is strictly prohibited. 9 | output_dir: ../data/output 10 | exp_name: zju387 11 | 12 | 13 | dataset: 14 | data_root: "../data/zju_mocap/cache512/387" 15 | batchsize: 16 16 | size: 512 17 | set_name: train 18 | num_parts: 20 19 | num_workers: 0 # dataloader 20 | num_frames: 523 21 | num_view: 6 22 | coordinate_scale: 1.5 23 | prob_sample_latest: 0.2 24 | thin_out_interval: 1 25 | background_color: -1 26 | 27 | network_params: 28 | pixel_sampler: foreground 29 | decoder_params: 30 | sdf_residual_range: 0.02 31 | child_root: [[-0.75, 0, 0], [0, 0.75, 0], [0, -0.75, 0], 32 | [0, 0, 0.75], [0, 0, -0.75], [0.75, 0, 0]] 33 | sdf_scale: 600 34 | initial_sdf_weight: 30. 
35 | trajectory_params: 36 | dct: 37 | hidden_dim: 256 38 | n_mlp: 4 39 | k: 50 40 | n_split: 1 41 | surface_loss: true 42 | structure_loss: true 43 | center_coef_for_structure_loss: 0.02 44 | 45 | loss_params: 46 | mask_loss_multiplier: 1 47 | surface_loss_coef: 600 48 | structure_loss_coef: 2 49 | initial_structure_loss_coef: 2 50 | max_structure_loss_coef: 50 51 | joint_2d_loss_coef: 1000 52 | joint_3d_separation_loss_coef: 1 53 | sdf_loss_coef: 0.2 54 | 55 | train_setting: 56 | num_iter: 200000 57 | optimizer: AdamW 58 | lr: 0.0003 59 | decay: 0.005 60 | clip_grad: true 61 | scheduler_gamma: 0.99995 62 | resume: False 63 | 64 | val_interval: 5000 65 | save_interval: 500 66 | log_interval: 100 67 | 68 | frame_schedule: 69 | incremental: 70 | initial_frame: 10 71 | start: 10000 72 | incremental_period: 70000 73 | 74 | render_setting: 75 | num_ray: 384 76 | -------------------------------------------------------------------------------- /src/confs/zju387_merge.yml: -------------------------------------------------------------------------------- 1 | #SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | #SPDX-License-Identifier: LicenseRef-NvidiaProprietary 3 | #NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 4 | #property and proprietary rights in and to this material, related 5 | #documentation and any modifications thereto. Any use, reproduction, 6 | #disclosure or distribution of this material and related documentation 7 | #without an express license agreement from NVIDIA CORPORATION or 8 | #its affiliates is strictly prohibited. 9 | output_dir: ../data/output 10 | exp_name: zju387_merge 11 | 12 | resume_model_path: ../data/output/result/zju387/snapshot_150000.pth 13 | load_optimizer: false 14 | 15 | dataset: 16 | data_root: "../data/zju_mocap/cache512/387" 17 | batchsize: 16 18 | size: 512 19 | set_name: train 20 | num_parts: 20 21 | num_workers: 0 # dataloader 22 | num_frames: 523 23 | num_view: 6 24 | coordinate_scale: 1.5 25 | prob_sample_latest: 0.2 26 | thin_out_interval: 1 27 | background_color: -1 28 | 29 | network_params: 30 | pixel_sampler: foreground 31 | decoder_params: 32 | sdf_residual_range: 0.02 33 | child_root: [[-0.75, 0, 0], [0, 0.75, 0], [0, -0.75, 0], 34 | [0, 0, 0.75], [0, 0, -0.75], [0.75, 0, 0]] 35 | sdf_scale: 600 36 | initial_sdf_weight: 30. 
37 | trajectory_params: 38 | dct: 39 | hidden_dim: 256 40 | n_mlp: 4 41 | k: 50 42 | n_split: 1 43 | surface_loss: true 44 | structure_loss: true 45 | center_coef_for_structure_loss: 0.02 46 | 47 | loss_params: 48 | mask_loss_multiplier: 1 49 | surface_loss_coef: 600 50 | structure_loss_coef: 2 51 | initial_structure_loss_coef: 2 52 | max_structure_loss_coef: 50 53 | joint_2d_loss_coef: 1000 54 | joint_3d_separation_loss_coef: 1 55 | sdf_loss_coef: 0.2 56 | pull_rigid_parts_loss_coef: 5 57 | 58 | train_setting: 59 | num_iter: 200000 60 | optimizer: AdamW 61 | lr: 0.0003 62 | decay: 0.005 63 | clip_grad: true 64 | scheduler_gamma: 0.99995 65 | resume: False 66 | 67 | val_interval: 5000 68 | save_interval: 500 69 | log_interval: 100 70 | 71 | frame_schedule: 72 | incremental: 73 | initial_frame: 10 74 | start: 10000 75 | incremental_period: 70000 76 | 77 | render_setting: 78 | num_ray: 384 79 | -------------------------------------------------------------------------------- /src/datasets/dataset.py: -------------------------------------------------------------------------------- 1 | """ 2 | SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | SPDX-License-Identifier: LicenseRef-NvidiaProprietary 4 | NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 5 | property and proprietary rights in and to this material, related 6 | documentation and any modifications thereto. Any use, reproduction, 7 | disclosure or distribution of this material and related documentation 8 | without an express license agreement from NVIDIA CORPORATION or 9 | its affiliates is strictly prohibited. 10 | """ 11 | 12 | import pickle 13 | import random 14 | from typing import Dict, Any, Optional 15 | 16 | import blosc 17 | import numpy as np 18 | 19 | 20 | class SingleVideoDataset: 21 | def __init__(self, config: Dict[str, Any]) -> None: 22 | """ 23 | 24 | Args: 25 | config: 26 | """ 27 | self.config = config 28 | self.num_parts = config.num_parts 29 | self.num_frames = config.num_frames 30 | self.num_view = config.num_view 31 | self.img_size = config.size 32 | self.thin_out_interval = config.thin_out_interval 33 | self.return_neighboring_frames = config.return_neighboring_frames 34 | self.return_random_frames = config.return_random_frames 35 | self.compression = config.compression 36 | self.video_cache = self.cache_data(config.set_name) 37 | self.n_repetition_in_epoch = config.n_repetition_in_epoch 38 | self.coordinate_scale = config.coordinate_scale 39 | self.current_max_frame_id = self.num_frames // self.thin_out_interval 40 | self.current_min_frame_id = 0 41 | self.prob_sample_latest = config.prob_sample_latest 42 | self.background_color = config.background_color 43 | 44 | @staticmethod 45 | def seed(): 46 | np.random.seed() 47 | random.seed() 48 | 49 | def cache_data(self, set_name: Optional[str] = None) -> Dict: 50 | """ 51 | cache data into a dictionary of numpy array 52 | Args: 53 | set_name: 54 | 55 | Returns: 56 | video_cache (dict): cached data 57 | """ 58 | file_name = "cache.pickle" if set_name is None else f"cache_{set_name}.pickle" 59 | cache_path = f"{self.config.data_root}/{file_name}" 60 | with open(cache_path, "rb") as f: 61 | video_cache = pickle.load(f) 62 | 63 | return video_cache 64 | 65 | def __len__(self) -> int: 66 | return self.num_frames * self.num_view // self.thin_out_interval * \ 67 | self.n_repetition_in_epoch # number of frames 68 | 69 | def get_index(self, index: int) -> np.ndarray: 70 | """ 71 | 72 | Args: 73 | index: 74 
| 75 | Returns: 76 | 77 | """ 78 | num_frames = self.num_frames // self.thin_out_interval 79 | if self.current_max_frame_id >= num_frames: 80 | index = index // self.n_repetition_in_epoch 81 | else: 82 | current_max_frame_id = min(num_frames, self.current_max_frame_id) 83 | current_min_frame_id = self.current_min_frame_id 84 | if random.random() < self.prob_sample_latest: 85 | min_frame_id = max(0, current_max_frame_id - 6) 86 | frame_id = random.randint(min_frame_id, current_max_frame_id - 1) 87 | else: 88 | frame_id = random.randint(current_min_frame_id, current_max_frame_id - 1) 89 | camera_id = random.randint(0, self.num_view - 1) 90 | index = self.num_frames * camera_id + frame_id * self.thin_out_interval 91 | 92 | return index 93 | 94 | def __getitem__(self, index: int) -> dict: 95 | """ 96 | 97 | Args: 98 | index: 99 | 100 | Returns: 101 | 102 | """ 103 | self.seed() 104 | index = self.get_index(index) 105 | 106 | frame_id = self.video_cache["frame_id"][index] 107 | 108 | img = self.video_cache["img"][index] 109 | mask = self.video_cache["mask"][index] 110 | if self.compression: 111 | img = blosc.unpack_array(img) 112 | mask = blosc.unpack_array(mask) 113 | img = img / 127.5 - 1 114 | 115 | # remove background 116 | fg_mask = (mask == 1) # ignore unreliable pixels 117 | img = img * fg_mask + (1 - fg_mask) * self.background_color 118 | 119 | camera_rotation = self.video_cache["camera_rotation"][index] 120 | camera_translation = self.video_cache["camera_translation"][index] / self.coordinate_scale 121 | camera_id = self.video_cache["camera_id"][index] 122 | camera_intrinsic = self.video_cache["camera_intrinsic"][index] 123 | minibatch = {"frame_id": frame_id, 124 | "img": img.astype("float32"), 125 | "mask": mask.astype("float32"), 126 | "camera_rotation": camera_rotation.astype("float32"), 127 | "camera_translation": camera_translation.astype("float32"), 128 | "camera_id": camera_id, 129 | "camera_intrinsic": camera_intrinsic.astype("float32")} 130 | 131 | return minibatch 132 | -------------------------------------------------------------------------------- /src/train_single_video.py: -------------------------------------------------------------------------------- 1 | """ 2 | SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | SPDX-License-Identifier: LicenseRef-NvidiaProprietary 4 | NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 5 | property and proprietary rights in and to this material, related 6 | documentation and any modifications thereto. Any use, reproduction, 7 | disclosure or distribution of this material and related documentation 8 | without an express license agreement from NVIDIA CORPORATION or 9 | its affiliates is strictly prohibited. 
10 | """ 11 | 12 | from typing import Tuple, Dict, Any 13 | 14 | import torch 15 | from torch import nn 16 | 17 | from easydict import EasyDict as edict 18 | from models.loss import SupervisedLoss 19 | from models.model import SingleVideoPartDecomposition 20 | from utils.get_args import get_args 21 | from utils.train_utils import (create_optimizer, 22 | send_model_to_gpu) 23 | from utils.trainer import TrainerBase 24 | 25 | mse = nn.MSELoss() 26 | 27 | 28 | def loss_reconstruction_based(minibatch: dict, model: nn.Module, loss_func: SupervisedLoss, config: edict, 29 | pull_rigid_parts: bool = False) -> Tuple[torch.Tensor, Dict[str, float]]: 30 | """ 31 | 32 | Args: 33 | minibatch: 34 | model: 35 | loss_func: 36 | config: 37 | pull_rigid_parts: 38 | 39 | Returns: 40 | 41 | """ 42 | img = minibatch["img"] 43 | mask = minibatch["mask"] 44 | camera_rotation = minibatch["camera_rotation"] 45 | camera_translation = minibatch["camera_translation"] 46 | inv_intrinsics = torch.inverse(minibatch["camera_intrinsic"]) 47 | frame_id = minibatch["frame_id"] 48 | 49 | model_output_dict = model(frame_id, camera_rotation, camera_translation, 50 | inv_intrinsics, num_ray=config.render_setting.num_ray, mask=mask) 51 | 52 | loss, loss_dict = loss_func(img, mask, model_output_dict, pull_rigid_parts=pull_rigid_parts) 53 | 54 | if "sdf_grad" in model_output_dict and config.loss_params.sdf_loss_coef > 0: 55 | sdf_grad = model_output_dict["sdf_grad"] 56 | sdf_loss = mse(torch.norm(sdf_grad, dim=1), torch.ones_like(sdf_grad[:, 0])) 57 | loss += sdf_loss * config.loss_params.sdf_loss_coef 58 | loss_dict["sdf_loss"] = sdf_loss.item() 59 | 60 | return loss, loss_dict 61 | 62 | 63 | class Trainer(TrainerBase): 64 | def __init__(self): 65 | self.snapshot_prefix = "snapshot" 66 | self.only_update_joint = False 67 | self.pull_rigid_parts = False 68 | 69 | def prepare_model_and_optimizer(self, config: edict, rank: int, ddp: int) -> Tuple[nn.Module, nn.Module, Any]: 70 | """ 71 | 72 | Args: 73 | config: 74 | rank: 75 | ddp: 76 | 77 | Returns: 78 | 79 | """ 80 | self.config = config 81 | model = SingleVideoPartDecomposition(config.network_params) 82 | optimizer = create_optimizer(config.train_setting, model) # optimizer works locally, define before DDP_model 83 | 84 | model, model_module = send_model_to_gpu(rank, model, ddp) 85 | return model, model_module, optimizer 86 | 87 | def define_loss_func(self, config: edict, model_module: nn.Module, ddp: bool) -> None: 88 | """ 89 | 90 | Args: 91 | config: 92 | model_module: 93 | ddp: 94 | 95 | Returns: 96 | 97 | """ 98 | self.reconstruction_loss_func = SupervisedLoss(config.loss_params, model_module, ddp, coarse_rate=64) 99 | 100 | def process_incremental(self, schedule_config: edict, iteration: int) -> None: 101 | """ 102 | 103 | Args: 104 | schedule_config: 105 | iteration: 106 | 107 | Returns: 108 | 109 | """ 110 | initial_frame = schedule_config.initial_frame 111 | start = schedule_config.start 112 | incremental_period = schedule_config.incremental_period 113 | num_frames = self.config.dataset.num_frames // self.config.dataset.thin_out_interval 114 | 115 | if start is None: 116 | start = 1e10 117 | if incremental_period is None: 118 | incremental_period = 1e10 119 | self.train_loader.dataset.current_max_frame_id = \ 120 | int(initial_frame + min(max(0, iteration - start), incremental_period) * 121 | (num_frames - initial_frame) / incremental_period) 122 | self.model.current_max_frame_id = self.train_loader.dataset.current_max_frame_id 123 | 124 | loss_config = 
self.reconstruction_loss_func.config 125 | if loss_config.initial_structure_loss_coef > 0: 126 | if iteration > start: 127 | loss_config.structure_loss_coef = loss_config.max_structure_loss_coef 128 | else: 129 | loss_config.structure_loss_coef = loss_config.initial_structure_loss_coef * (1 - iteration / start) + \ 130 | loss_config.max_structure_loss_coef * (iteration / start) 131 | 132 | if iteration > start + incremental_period: 133 | self.pull_rigid_parts = True 134 | assert self.train_loader.num_workers == 0 135 | 136 | def process_before_train_step(self, iteration: int) -> None: 137 | """ 138 | 139 | Args: 140 | iteration: 141 | 142 | Returns: 143 | 144 | """ 145 | dataset_schedule_type = self.config.train_setting.dataset_schedule_type 146 | schedule_config = self.config.train_setting.frame_schedule[dataset_schedule_type] 147 | if dataset_schedule_type == "incremental": 148 | self.process_incremental(schedule_config, iteration) 149 | else: 150 | raise ValueError("Invalid dataset schedule type") 151 | 152 | def lossfunc(self, config: edict, minibatch: dict, model: nn.Module, model_module: nn.Module, 153 | pull_rigid_parts: bool = False) -> Tuple[torch.Tensor, dict]: 154 | """ 155 | 156 | Args: 157 | config: 158 | minibatch: 159 | model: 160 | model_module: 161 | pull_rigid_parts: 162 | 163 | Returns: 164 | 165 | """ 166 | loss_dict = {} 167 | 168 | # reconstruction branch 169 | recon_loss_func = self.reconstruction_loss_func 170 | loss, _loss_dict = loss_reconstruction_based(minibatch, model, recon_loss_func, config, 171 | pull_rigid_parts=self.pull_rigid_parts) 172 | loss_dict.update(_loss_dict) 173 | 174 | return loss, loss_dict 175 | 176 | 177 | if __name__ == "__main__": 178 | args, config = get_args() 179 | 180 | trainer = Trainer() 181 | trainer.run(config) 182 | -------------------------------------------------------------------------------- /src/utils/config.py: -------------------------------------------------------------------------------- 1 | """ 2 | SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | SPDX-License-Identifier: LicenseRef-NvidiaProprietary 4 | NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 5 | property and proprietary rights in and to this material, related 6 | documentation and any modifications thereto. Any use, reproduction, 7 | disclosure or distribution of this material and related documentation 8 | without an express license agreement from NVIDIA CORPORATION or 9 | its affiliates is strictly prohibited. 
10 | """ 11 | 12 | import yaml 13 | from easydict import EasyDict as edict 14 | 15 | 16 | def check_config(config: edict): 17 | """ 18 | 19 | Args: 20 | config: 21 | 22 | Returns: 23 | 24 | """ 25 | if "temporal_consistency_loss_coef" in config.loss_params: 26 | assert (config.loss_params.temporal_consistency_loss_coef > 0) == config.network_params.temporal_consistency 27 | assert not config.network_params.temporal_consistency or (config.dataset.video_len > 1) 28 | 29 | if "surface_loss" in config.network_params: 30 | assert (config.loss_params.surface_loss_coef > 0) == config.network_params.surface_loss 31 | if "structure_loss" in config.network_params: 32 | assert (config.loss_params.structure_loss_coef > 0) == config.network_params.structure_loss 33 | 34 | if "transformation_equivariance" in config.network_params: 35 | assert ((config.loss_params.heatmap_2d_equivariance_loss_coef > 36 | 0) == config.network_params.transformation_equivariance) or \ 37 | ((config.loss_params.depth_map_equivariance_loss_coef > 38 | 0) == config.network_params.transformation_equivariance) or \ 39 | ((config.loss_params.pose_equivariance_loss_coef > 40 | 0) == config.network_params.transformation_equivariance) 41 | 42 | 43 | def yaml_config(config_path: str, default_config_path: str) -> edict: 44 | """ 45 | 46 | Args: 47 | config_path: 48 | default_config_path: 49 | 50 | Returns: 51 | 52 | """ 53 | default_config = edict(yaml.load(open(default_config_path), Loader=yaml.SafeLoader)) 54 | current_config = edict(yaml.load(open(config_path), Loader=yaml.SafeLoader)) 55 | 56 | def _copy(conf: dict, default_conf: dict): 57 | for key in conf: 58 | if isinstance(default_conf[key], edict): 59 | _copy(conf[key], default_conf[key]) 60 | else: 61 | default_conf[key] = conf[key] 62 | 63 | _copy(current_config, default_config) 64 | 65 | # copy params 66 | default_config.network_params.size = default_config.dataset.size 67 | default_config.network_params.num_parts = default_config.dataset.num_parts 68 | 69 | if "video_len" in default_config.dataset: 70 | default_config.network_params.video_len = default_config.dataset.video_len 71 | 72 | if "transformation_equivariance" in default_config.network_params: 73 | default_config.dataset.transformation_equivariance = default_config.network_params.transformation_equivariance 74 | default_config.test_dataset.transformation_equivariance = False 75 | 76 | if "decoder_params" in default_config.network_params: 77 | default_config.network_params.decoder_params.num_parts = default_config.dataset.num_parts 78 | default_config.network_params.decoder_params.num_camera = default_config.dataset.num_view 79 | 80 | if "multiview" in default_config.dataset: 81 | default_config.network_params.multiview = default_config.dataset.multiview 82 | 83 | if "num_frames" in default_config.dataset: 84 | default_config.network_params.video_length = default_config.dataset.num_frames 85 | default_config.network_params.num_view = default_config.dataset.num_view 86 | 87 | return_neighboring_frames = False 88 | 89 | default_config.dataset.return_neighboring_frames = return_neighboring_frames 90 | default_config.test_dataset.return_neighboring_frames = return_neighboring_frames 91 | 92 | return_random_frames = False 93 | 94 | default_config.dataset.return_random_frames = return_random_frames 95 | default_config.test_dataset.return_random_frames = return_random_frames 96 | 97 | default_config.network_params.background_color = default_config.dataset.background_color 98 | 99 | check_config(default_config) 100 | 101 
| return default_config 102 | -------------------------------------------------------------------------------- /src/utils/get_args.py: -------------------------------------------------------------------------------- 1 | """ 2 | SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | SPDX-License-Identifier: LicenseRef-NvidiaProprietary 4 | NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 5 | property and proprietary rights in and to this material, related 6 | documentation and any modifications thereto. Any use, reproduction, 7 | disclosure or distribution of this material and related documentation 8 | without an express license agreement from NVIDIA CORPORATION or 9 | its affiliates is strictly prohibited. 10 | """ 11 | 12 | import argparse 13 | from typing import Optional, Any, Tuple 14 | 15 | from easydict import EasyDict as edict 16 | 17 | from .config import yaml_config 18 | 19 | 20 | def get_config(args: Any) -> edict: 21 | """ 22 | 23 | Args: 24 | args: 25 | 26 | Returns: 27 | 28 | """ 29 | config = yaml_config(args.config, args.default_config) 30 | config.resume_latest = args.resume_latest 31 | if config.resume_model_path is None: 32 | config.resume_model_path = args.resume_model_path 33 | 34 | return config 35 | 36 | 37 | def get_args(config_path: Optional[str] = None) -> Tuple[Any, edict]: 38 | """ 39 | 40 | Args: 41 | config_path: 42 | 43 | Returns: 44 | 45 | """ 46 | parser = argparse.ArgumentParser() 47 | parser.add_argument('--config', type=str, default="confs/default.yml") 48 | parser.add_argument('--default_config', type=str, default="confs/default.yml") 49 | parser.add_argument('--resume_latest', action='store_true') 50 | parser.add_argument('--resume_model_path', type=str, default=None) 51 | 52 | args = parser.parse_args() 53 | if config_path is not None: 54 | args.config = config_path 55 | 56 | config = get_config(args) 57 | 58 | return args, config 59 | 60 | 61 | def get_ddp_args(config_path: Optional[str] = None) -> Tuple[Any, edict]: 62 | """ 63 | 64 | Args: 65 | config_path: 66 | 67 | Returns: 68 | 69 | """ 70 | parser = argparse.ArgumentParser() 71 | parser.add_argument('--config', type=str, default="confs/default.yml") 72 | parser.add_argument('--default_config', type=str, default="confs/default.yml") 73 | parser.add_argument('--resume_latest', action='store_true') 74 | parser.add_argument('--resume_model_path', type=str, default=None) 75 | parser.add_argument('--gpus', type=int, default=1) 76 | parser.add_argument('--nodes', type=int, default=1) 77 | 78 | args = parser.parse_args() 79 | if config_path is not None: 80 | args.config = config_path 81 | 82 | config = get_config(args) 83 | 84 | return args, config 85 | 86 | 87 | def get_args_jupyter(config_path: str = "confs/default.yml", default_config: str = "confs/default.yml" 88 | ) -> Tuple[None, edict]: 89 | """ 90 | 91 | Args: 92 | config_path: 93 | default_config: 94 | 95 | Returns: 96 | 97 | """ 98 | config = yaml_config(config_path, default_config) 99 | 100 | return None, config 101 | -------------------------------------------------------------------------------- /src/utils/graph_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | SPDX-License-Identifier: LicenseRef-NvidiaProprietary 4 | NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 5 | property and proprietary rights in and to this material, related 6 | documentation and any modifications thereto. Any use, reproduction, 7 | disclosure or distribution of this material and related documentation 8 | without an express license agreement from NVIDIA CORPORATION or 9 | its affiliates is strictly prohibited. 10 | """ 11 | 12 | 13 | from typing import Tuple, List 14 | from copy import deepcopy 15 | import numpy as np 16 | import queue 17 | 18 | 19 | def get_parent_and_children_id(num_parts: int, joint_connection: np.ndarray, selected_candidate_id: np.ndarray, 20 | root_id: int) -> Tuple[np.ndarray, List[List], List[List]]: 21 | """ 22 | Get parent and children id of each part 23 | Args: 24 | num_parts: 25 | joint_connection: 26 | selected_candidate_id: 27 | root_id: 28 | 29 | Returns: 30 | 31 | """ 32 | parent_id = -np.ones(num_parts, dtype="int64") # initialize parent id with -1 33 | children_id = [[_] for _ in range(num_parts)] 34 | connected_to = [[] for _ in range(num_parts)] 35 | for j1, j2 in joint_connection: 36 | connected_to[j1].append(j2) 37 | connected_to[j2].append(j1) 38 | 39 | que = queue.Queue() 40 | visited = [root_id] 41 | [que.put(ct) for ct in connected_to[root_id]] 42 | while True: 43 | current = que.get() 44 | visited.append(current) 45 | parent_id[current] = list(set(visited) & set(connected_to[current]))[0] 46 | if len(visited) == num_parts: 47 | break 48 | not_visited = list(set(connected_to[current]) - set(visited)) 49 | [que.put(ct) for ct in not_visited] 50 | 51 | for idx in reversed(visited): 52 | if parent_id[idx] >= 0: 53 | children_id[parent_id[idx]] += deepcopy(children_id[idx]) 54 | 55 | selected_candidate = [] 56 | for i in range(num_parts): 57 | parent = parent_id[i] 58 | if parent < 0: 59 | cand_i, cand_parent = -1, -1 60 | else: 61 | matched = (joint_connection == np.array([i, parent])).all(axis=1) 62 | 63 | if matched.any(): 64 | assert matched.sum() == 1 65 | cand_i, cand_parent = selected_candidate_id[matched][0] 66 | else: 67 | matched = (joint_connection == np.array([parent, i])).all(axis=1) 68 | assert matched.sum() == 1 69 | cand_parent, cand_i = selected_candidate_id[matched][0] 70 | 71 | selected_candidate.append([cand_i, cand_parent]) 72 | 73 | return parent_id, children_id, selected_candidate 74 | -------------------------------------------------------------------------------- /src/utils/model_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | SPDX-License-Identifier: LicenseRef-NvidiaProprietary 4 | NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 5 | property and proprietary rights in and to this material, related 6 | documentation and any modifications thereto. Any use, reproduction, 7 | disclosure or distribution of this material and related documentation 8 | without an express license agreement from NVIDIA CORPORATION or 9 | its affiliates is strictly prohibited. 
10 | """ 11 | 12 | from typing import Optional, Tuple 13 | 14 | import numpy as np 15 | import torch 16 | import torch.nn.functional as F 17 | from easydict import EasyDict as edict 18 | from pytorch3d.transforms.rotation_conversions import rotation_6d_to_matrix 19 | from torch import nn 20 | 21 | 22 | def expand_mask(mask: torch.Tensor, coarse_rate: int = 32, stride: int = 32): 23 | """ 24 | Expand mask by max pooling 25 | Args: 26 | mask: 27 | coarse_rate: 28 | stride: 29 | 30 | Returns: 31 | 32 | """ 33 | pad = 0 if coarse_rate == stride else (coarse_rate - 1) // 2 34 | dilate_mask = F.max_pool2d(mask[:, None], coarse_rate, stride, pad) 35 | if stride > 1: 36 | dilate_mask = F.interpolate(dilate_mask, scale_factor=stride, mode="nearest") 37 | 38 | return dilate_mask 39 | 40 | 41 | def foreground_sampler(img_size: int, num_ray: int, mask: torch.Tensor, coarse_rate: int = 32, 42 | stride: int = 32, dim: int = 1, expand=True) -> torch.Tensor: 43 | """uniformly sample around foreground mask 44 | 45 | Args: 46 | img_size (int): image size 47 | num_ray (int): number of points to sample 48 | mask (int): shape: (B, img_size, img_size) 49 | coarse_rate 50 | stride 51 | dim 52 | expand 53 | 54 | Returns: 55 | torch.Tensor: sampled coordinates, shape: (B, 2, num_ray) if dim==1 56 | """ 57 | if expand: 58 | dilate_mask = expand_mask(mask, coarse_rate, stride).squeeze(1) > 0.5 59 | else: 60 | dilate_mask = mask > 0.5 61 | unreliable_mask = mask > 1 62 | dilate_mask = dilate_mask.float() - unreliable_mask * 2 63 | noised_dilate_mask = dilate_mask + torch.empty_like(dilate_mask, dtype=torch.float).uniform_() 64 | noised_dilate_mask = noised_dilate_mask.reshape(-1, img_size ** 2) 65 | _, coordinates = torch.topk(noised_dilate_mask, num_ray, dim=1, sorted=False) 66 | coordinates = torch.stack([coordinates % img_size, 67 | torch.div(coordinates, img_size, rounding_mode='trunc')], dim=dim) 68 | 69 | return coordinates 70 | 71 | 72 | def patch_sampler(img_size: int, num_ray: int, mask: torch.Tensor, coarse_rate: int = 32, 73 | dim: int = 1, expand=True) -> torch.Tensor: 74 | """sample patch 75 | 76 | Args: 77 | img_size (int): image size 78 | num_ray (int): number of points to sample 79 | mask (torch.Tensor): shape: (B, img_size, img_size) 80 | coarse_rate 81 | dim 82 | expand 83 | 84 | Returns: 85 | torch.Tensor: sampled coordinates, shape: (B, 2, num_ray) if dim==1 86 | """ 87 | assert (num_ray ** 0.5).is_integer() 88 | assert expand 89 | 90 | patch_size = int(num_ray ** 0.5) 91 | expansion_size = max(0, coarse_rate - patch_size // 2) 92 | 93 | dilate_mask = expand_mask(mask, expansion_size + 1, 1) 94 | 95 | noised_dilate_mask = (dilate_mask > 0.5) + torch.empty_like(dilate_mask, dtype=torch.float).uniform_() 96 | noised_dilate_mask = noised_dilate_mask.reshape(-1, img_size ** 2) 97 | patch_center = torch.argmax(noised_dilate_mask, dim=1, keepdim=True) 98 | 99 | device = mask.device 100 | coordinates = torch.stack([patch_center % img_size, 101 | torch.div(patch_center, img_size, rounding_mode='trunc')], dim=dim) 102 | coordinates = coordinates.clamp(patch_size // 2, img_size - patch_size // 2 - 1) 103 | 104 | grid = torch.meshgrid(torch.arange(-patch_size // 2, patch_size // 2, device=device), 105 | torch.arange(-patch_size // 2, patch_size // 2, device=device), indexing='ij') 106 | grid = torch.stack([grid[1].reshape(1, -1), grid[0].reshape(1, -1)], dim=dim) 107 | 108 | coordinates = coordinates + grid 109 | 110 | return coordinates 111 | 112 | 113 | class PixelSampler: 114 | def __init__(self, 
sample_strategy: str = "uniform"): 115 | """ 116 | 117 | Args: 118 | sample_strategy: 119 | """ 120 | self.sample_strategy = sample_strategy 121 | 122 | @staticmethod 123 | def unifrom_sampler(img_size: int, num_ray: int, batchsize: int) -> torch.Tensor: 124 | """uniformly sample pixel coordinates 125 | 126 | Args: 127 | img_size (int): image size 128 | num_ray (int): number of points to sample 129 | batchsize: 130 | 131 | Returns: 132 | torch.Tensor: sampled coordinates, shape: (B, 2, num_ray) 133 | """ 134 | coordinates = torch.randint(high=img_size, size=(batchsize, 2, num_ray), device="cuda") 135 | 136 | return coordinates 137 | 138 | def __call__(self, img_size: int, num_ray: int, batchsize: int, 139 | mask: Optional[torch.Tensor] = None, expand=True, 140 | coarse_rate: int = 32, stride: int = 32) -> torch.Tensor: 141 | """ 142 | 143 | Args: 144 | img_size: 145 | num_ray: 146 | batchsize: 147 | mask: 148 | expand: 149 | coarse_rate: 150 | stride: 151 | 152 | Returns: 153 | 154 | """ 155 | if self.sample_strategy == "uniform": 156 | return self.unifrom_sampler(img_size, num_ray, batchsize) 157 | elif self.sample_strategy == "foreground": 158 | return foreground_sampler(img_size, num_ray, mask, expand=expand, 159 | coarse_rate=coarse_rate, stride=stride) 160 | elif self.sample_strategy == "patch": 161 | return patch_sampler(img_size, num_ray, mask, expand=expand, 162 | coarse_rate=coarse_rate) 163 | else: 164 | raise ValueError() 165 | 166 | 167 | class PoseTrajectoryMLP(nn.Module): 168 | def __init__(self, video_len: int, n_keypoints: int, hidden_dim: int = 128, n_mlp: int = 4, k: int = 100, 169 | n_split: int = 1, **kwargs): 170 | """ 171 | 172 | Args: 173 | video_len: 174 | n_keypoints: 175 | hidden_dim: 176 | n_mlp: 177 | k: 178 | n_split: 179 | **kwargs: 180 | """ 181 | super(PoseTrajectoryMLP, self).__init__() 182 | self.video_len = video_len 183 | self.n_keypoints = n_keypoints 184 | self.k = k 185 | self.n_split = n_split 186 | if n_split > 1: 187 | split_loc = [-1 / n_split] + [(i + 1) / n_split for i in range(n_split - 1)] + [1 + 1 / n_split] 188 | self.split_loc = np.array(split_loc) 189 | 190 | layers = [nn.Conv1d(self.k, hidden_dim * n_split, 1), nn.ELU(inplace=True)] 191 | for i in range(n_mlp - 1): 192 | layers.append(nn.Conv1d(hidden_dim * n_split, hidden_dim * n_split, 1, groups=n_split)) 193 | layers.append(nn.ELU(inplace=True)) 194 | 195 | layers.append(nn.Conv1d(hidden_dim * n_split, n_keypoints * 9 * n_split, 1, groups=n_split)) 196 | else: 197 | layers = [nn.Linear(self.k, hidden_dim), nn.ELU(inplace=True)] 198 | for i in range(n_mlp - 1): 199 | layers.append(nn.Linear(hidden_dim, hidden_dim)) 200 | layers.append(nn.ELU(inplace=True)) 201 | 202 | layers.append(nn.Linear(hidden_dim, n_keypoints * 9)) 203 | 204 | self.model = nn.Sequential(*layers) 205 | 206 | def backbone(self, t: torch.Tensor) -> torch.Tensor: 207 | """ 208 | 209 | Args: 210 | t: 211 | 212 | Returns: 213 | 214 | """ 215 | batchsize = t.shape[0] 216 | device = t.device 217 | freq = (t[:, None] + 0.5 / self.video_len) * np.pi * torch.arange(0, self.k, device=device) # (B, k) 218 | if self.n_split > 1: 219 | freq = freq[:, :, None] # (B, k) 220 | 221 | trajectory = self.model(torch.cos(freq)) # (B, n_kpts * 9) 222 | 223 | if self.n_split > 1: 224 | trajectory = trajectory.reshape(batchsize, self.n_split, self.n_keypoints * 9) 225 | split_loc = torch.tensor(self.split_loc, device=device, dtype=torch.float) 226 | sigmoid_scale = 12 * self.n_split 227 | weight = torch.sigmoid((t[:, None] - 
split_loc[None, :-1]) * sigmoid_scale) * \ 228 | torch.sigmoid(-(t[:, None] - split_loc[None, 1:]) * sigmoid_scale) 229 | trajectory = torch.sum(trajectory * weight[:, :, None], dim=1) 230 | 231 | return trajectory 232 | 233 | def forward(self, idx: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: 234 | """ 235 | Compute rotation and translation matrices from positionally encoded time 236 | Args: 237 | idx: frame index: (B, ) 238 | 239 | Returns: 240 | joint rotation: (B, n_kpts, 3, 3) 241 | joint_translation: (B, n_kpts, 3, 1) 242 | 243 | """ 244 | batchsize = idx.shape[0] 245 | 246 | t = idx / self.video_len 247 | 248 | trajectory = self.backbone(t) 249 | 250 | rot, trans = torch.split(trajectory, [6 * self.n_keypoints, 251 | 3 * self.n_keypoints], dim=1) 252 | rot = rotation_6d_to_matrix(rot.reshape(batchsize * self.n_keypoints, 6)) 253 | rot = rot.reshape(batchsize, self.n_keypoints, 3, 3) 254 | trans = trans.reshape(batchsize, self.n_keypoints, 3, 1) 255 | 256 | return rot, trans 257 | 258 | 259 | def get_pose_trajectory(config: edict) -> PoseTrajectoryMLP: 260 | """ 261 | 262 | Args: 263 | config: 264 | 265 | Returns: 266 | 267 | """ 268 | video_len = config.video_length 269 | num_parts = config.num_parts 270 | params = config.trajectory_params.dct 271 | 272 | return PoseTrajectoryMLP(video_len, num_parts, **params) 273 | -------------------------------------------------------------------------------- /src/utils/render_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | SPDX-License-Identifier: LicenseRef-NvidiaProprietary 4 | NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 5 | property and proprietary rights in and to this material, related 6 | documentation and any modifications thereto. Any use, reproduction, 7 | disclosure or distribution of this material and related documentation 8 | without an express license agreement from NVIDIA CORPORATION or 9 | its affiliates is strictly prohibited. 10 | """ 11 | 12 | from typing import Tuple 13 | 14 | import numpy as np 15 | import torch 16 | import torch.nn.functional as F 17 | from torch import nn 18 | 19 | 20 | def _get_ray_direction(pixel_location: torch.Tensor, inv_intrinsics: torch.Tensor) -> torch.Tensor: 21 | """ 22 | 23 | Args: 24 | pixel_location: 25 | inv_intrinsics: 26 | 27 | Returns: 28 | 29 | """ 30 | batchsize, _, num_ray = pixel_location.shape 31 | # + 0.5 is required 32 | homogeneous = torch.cat( 33 | [pixel_location + 0.5, torch.ones(batchsize, 1, num_ray, device="cuda")], dim=1) 34 | 35 | ray_direction = torch.matmul(inv_intrinsics, 36 | homogeneous) # shape: (B, 3, num_ray), not unit vector 37 | 38 | return ray_direction 39 | 40 | 41 | def _coarse_sample(pixel_location: torch.Tensor, inv_intrinsics: torch.Tensor, 42 | joint_translation: torch.Tensor = None, near_plane: float = 1, 43 | far_plane: float = 5, num_coarse: int = 64 44 | ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: 45 | """coarse sampling. uniform sampling in camera frustum 46 | 47 | Args: 48 | pixel_location (torch.Tensor): 2D location on image, 49 | shape: (B, 2, num_ray) 50 | inv_intrinsics (torch.Tensor): inverse of camera intrinsics 51 | shape: (B, 3, 3) 52 | joint_translation (torch.Tensor, optional): (B, num_parts, 3, 1) 53 | near_plane (float, optional): distance to the near sampling plane. Defaults to 1. 54 | far_plane (float, optional): distance to the far sampling plane. Defaults to 5.
55 | num_coarse (int, optional): number of sample in each ray 56 | 57 | Returns: 58 | coarse_location (torch.Tensor): shape: (B, 3, num_ray, num_coarse) 59 | coarse_depth (torch.Tensor): shape: (B, 1, num_ray, num_coarse) 60 | ray_direction (torch.Tensor): shape: (B, 3, num_ray) 61 | """ 62 | batchsize, _, num_ray = pixel_location.shape 63 | 64 | ray_direction = _get_ray_direction(pixel_location, inv_intrinsics) 65 | 66 | if joint_translation is None: 67 | uniform_depth = torch.linspace(near_plane, far_plane, num_coarse, device="cuda") 68 | coarse_depth = uniform_depth[None, None, None].repeat(batchsize, 1, num_ray, 1) 69 | else: 70 | max_depth = joint_translation[:, :, 2, 0].max(dim=1)[0] 71 | min_depth = joint_translation[:, :, 2, 0].min(dim=1)[0] 72 | 73 | far = max_depth + 0.5 74 | near = torch.clamp_min(min_depth - 0.5, near_plane) 75 | eps = torch.linspace(0, 1, num_coarse, device="cuda") 76 | uniform_depth = near[:, None] * (1 - eps) + far[:, None] * eps 77 | uniform_depth = uniform_depth[:, None, None] 78 | coarse_depth = uniform_depth.repeat(1, 1, num_ray, 1) 79 | 80 | coarse_location = ray_direction[:, :, :, None] * uniform_depth 81 | ray_direction = F.normalize(ray_direction, dim=1) 82 | 83 | return coarse_location, coarse_depth, ray_direction, 84 | 85 | 86 | def _weight_for_volume_rendering(density: torch.Tensor, depth: torch.Tensor) -> torch.Tensor: 87 | """weight for volume rendering 88 | 89 | Args: 90 | density (torch.Tensor): [description] 91 | depth (torch.Tensor): [description] 92 | 93 | Returns: 94 | torch.Tensor: weight for each coarse bin, shape: (B, 1, 1, num_ray, num - 1) 95 | """ 96 | assert density.ndim == 4 97 | assert depth.ndim == 4 98 | sigmoid = torch.sigmoid(density) # shape: (B, 1, num_ray, num_on_ray) 99 | alpha = torch.clamp_min((sigmoid[..., :-1] - sigmoid[..., 1:]) / (sigmoid[..., :-1] + 1e-10), 0) 100 | _alpha = torch.cat([torch.zeros_like(alpha[..., :1]), alpha], dim=-1) 101 | alpha_ = torch.cat([alpha, torch.zeros_like(alpha[..., :1])], dim=-1) 102 | T_i = torch.cumprod(1 - _alpha, dim=-1) 103 | weights = T_i * alpha_ 104 | 105 | return weights 106 | 107 | 108 | def _multinomial_sample(weights: torch.Tensor, num_fine: int) -> torch.Tensor: 109 | """multinomial sample for fine sampling 110 | 111 | Args: 112 | weights (torch.Tensor): [description] 113 | num_fine (int): [description] 114 | 115 | Returns: 116 | torch.Tensor: normalized sampled position 117 | """ 118 | batchsize, _, num_ray, num_coarse = weights.shape 119 | weights = weights.reshape(batchsize * num_ray, num_coarse) 120 | sampled_bins = torch.multinomial(torch.clamp_min(weights, 1e-8), num_fine, 121 | replacement=True).reshape(batchsize, 1, 1, num_ray, num_fine) / (num_coarse - 1) 122 | offset_in_bins = torch.cuda.FloatTensor( 123 | batchsize, 1, 1, num_ray, num_fine).uniform_() / (num_coarse - 1) 124 | sampled_normalized_depth = sampled_bins + offset_in_bins 125 | 126 | return sampled_normalized_depth 127 | 128 | 129 | def _get_fine_location(coarse_location: torch.Tensor, coarse_depth: torch.Tensor, 130 | sampled_normalized_depth: torch.Tensor 131 | ) -> Tuple[torch.Tensor, torch.Tensor]: 132 | """ 133 | 134 | Args: 135 | coarse_location: 136 | coarse_depth: 137 | sampled_normalized_depth: 138 | 139 | Returns: 140 | 141 | """ 142 | near_location = coarse_location[:, :, :, :1] 143 | far_location = coarse_location[:, :, :, -1:] 144 | fine_location = (near_location * (1 - sampled_normalized_depth) + 145 | far_location * sampled_normalized_depth) 146 | 147 | near_depth = coarse_depth[:, :, :, 
:1] 148 | far_depth = coarse_depth[:, :, :, -1:] 149 | fine_depth = (near_depth * (1 - sampled_normalized_depth) + 150 | far_depth * sampled_normalized_depth) 151 | 152 | fine_location = torch.cat([coarse_location, fine_location], dim=3) 153 | fine_depth = torch.cat([coarse_depth, fine_depth], dim=3) 154 | 155 | return fine_location, fine_depth 156 | 157 | 158 | def fine_sample(implicit_model: nn.Module, joint_rotation, 159 | joint_translation: torch.Tensor, pixel_location: torch.Tensor, 160 | inv_intrinsics: torch.Tensor, near_plane: float = 1, far_plane: float = 4, 161 | num_coarse: int = 64, num_fine: int = 64 162 | ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: 163 | """fine sampling for nerf 164 | 165 | Args: 166 | implicit_model (nn.Module): implicit decoder 167 | joint_rotation (torch.Tensor): rotation matrix for each part 168 | joint_translation (torch.Tensor): translation for each part 169 | pixel_location (torch.Tensor): location of pixels 170 | inv_intrinsics (torch.Tensor): inverse of intrinsic matrix 171 | near_plane (float, optional): Defaults to 1. 172 | far_plane (float, optional): Defaults to 4. 173 | num_coarse (int, optional): number of sampling points for coarse sampling. Defaults to 64. 174 | num_fine (int, optional): number of sampling points for fine sampling. Defaults to 64. 175 | 176 | Returns: 177 | Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: fine locations, fine depths, and ray directions (shapes noted at the return statement) 178 | """ 179 | (coarse_location, coarse_depth, ray_direction) = _coarse_sample(pixel_location, inv_intrinsics, joint_translation, 180 | near_plane, far_plane, num_coarse) 181 | 182 | _, _, _, num_coarse = coarse_location.shape 183 | with torch.no_grad(): 184 | decoder_output = implicit_model(coarse_location, joint_rotation, joint_translation, 185 | coarse_sample=True) 186 | 187 | coarse_density = decoder_output["density"] 188 | sdf_scale = decoder_output.get("sdf_scale") 189 | 190 | if sdf_scale is not None: 191 | coarse_density = coarse_density * sdf_scale 192 | 193 | weights = _weight_for_volume_rendering(coarse_density, coarse_depth) 194 | 195 | # normalised fine points, shape: (B, 1, num_ray, num_fine) 196 | sampled_normalized_depth = _multinomial_sample(weights, num_fine)[:, 0] 197 | 198 | # fine points, shape: (B, 3, num_ray, num_fine) 199 | fine_location, fine_depth = _get_fine_location(coarse_location, coarse_depth, sampled_normalized_depth) 200 | 201 | sort_idx = torch.argsort(fine_depth, dim=3) 202 | fine_location = torch.gather(fine_location, dim=3, index=sort_idx.repeat(1, 3, 1, 1)) 203 | fine_depth = torch.gather(fine_depth, dim=3, index=sort_idx) 204 | 205 | return (fine_location, # (batchsize, 3, num_ray, (num_coarse + num_fine)) 206 | fine_depth, # (batchsize, 1, num_ray, (num_coarse + num_fine)) 207 | ray_direction) # (batchsize, 3, num_ray) 208 | 209 | 210 | def volume_rendering(density: torch.Tensor, color: torch.Tensor, depth: torch.Tensor 211 | ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: 212 | """volume rendering 213 | 214 | Args: 215 | density (torch.Tensor): shape: (B, 1, num_ray, num_coarse + num_fine) 216 | color (torch.Tensor): shape: (B, 3, num_ray, num_coarse + num_fine) 217 | depth (torch.Tensor): shape: (B, 1, num_ray, num_coarse + num_fine) 218 | 219 | Returns: 220 | rendered_color (torch.Tensor): shape: (B, 3, num_ray) 221 | rendered_mask (torch.Tensor): shape: (B, num_ray) 222 | rendered_disparity (torch.Tensor): shape: (B, num_ray) 223 | weights 224 | """ 225 | weights = _weight_for_volume_rendering(density, depth) 226 | 227 | rendered_color = torch.sum(weights * color, dim=3)
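# `weights` holds the per-sample contributions T_i * alpha_i along each ray, here of shape
# (B, 1, num_ray, num_coarse + num_fine), so the sum over the sample axis below serves as the
# accumulated opacity (the rendered mask) and the 1 / depth weighting gives the expected disparity.
# A minimal shape sketch, assuming a hypothetical batch of 2 images, 8 rays and 16 samples per ray:
#   density = torch.randn(2, 1, 8, 16); color = torch.rand(2, 3, 8, 16)
#   depth = torch.linspace(1, 4, 16).view(1, 1, 1, 16).expand(2, 1, 8, 16)
#   rgb, mask, disp, w = volume_rendering(density, color, depth)
#   # rgb: (2, 3, 8), mask: (2, 8), disp: (2, 8), w: (2, 1, 8, 16)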
228 | rendered_mask = torch.sum(weights, dim=3).squeeze(1) 229 | rendered_disparity = torch.sum(weights / depth, dim=3).squeeze(1) 230 | 231 | return (rendered_color, # (B, 3, num_ray) 232 | rendered_mask, # (B, num_ray) 233 | rendered_disparity, # (B, num_ray) 234 | weights) # (B, 1, 1, num_ray, num_points) 235 | 236 | 237 | def gather_pixel(img: torch.Tensor, pixel_location: torch.Tensor) -> torch.Tensor: 238 | """ 239 | 240 | Args: 241 | img: 242 | pixel_location: 243 | 244 | Returns: 245 | 246 | """ 247 | single_channel = (img.ndim == 3) 248 | if single_channel: 249 | img = img[:, None] 250 | 251 | batchsize, ch, height, width = img.shape 252 | 253 | if pixel_location.dtype == torch.int64: # pixel index 254 | img = img.reshape(batchsize, ch, height * width) 255 | # gather pixel values from pixel_location 256 | x_coord = pixel_location[:, 0] 257 | y_coord = pixel_location[:, 1] 258 | flattened_location = y_coord * width + x_coord # (B, num_ray) 259 | gathered_img = torch.gather(img, dim=2, index=flattened_location[:, None].repeat(1, ch, 1)) 260 | elif pixel_location.dtype == torch.float32: # in pixel index space (top-left = (0, 0)) 261 | _pixel_location = pixel_location.permute(0, 2, 1)[:, :, None] + 0.5 # (B, n_rays, 1, 2) 262 | _pixel_location = _pixel_location / (height / 2) - 1 263 | gathered_img = F.grid_sample(img, _pixel_location, mode='bicubic') # (B, ch, n_rays, 1) 264 | gathered_img = gathered_img.squeeze(3) 265 | else: 266 | raise TypeError("Invalid type for pixel_location") 267 | if single_channel: 268 | gathered_img = gathered_img[:, 0] 269 | 270 | return gathered_img 271 | 272 | 273 | def rotation_matrix(theta: float, axis: str = "y") -> torch.Tensor: 274 | """ 275 | 276 | Args: 277 | theta: 278 | axis: 279 | 280 | Returns: 281 | R: rotation matrix 282 | """ 283 | c = np.cos(theta) 284 | s = np.sin(theta) 285 | if axis == "y": 286 | R = torch.tensor(np.array([[c, 0, -s, 0], 287 | [0, 1, 0, 0], 288 | [s, 0, c, 0], 289 | [0, 0, 0, 1]])).float().cuda() 290 | elif axis == "z": 291 | R = torch.tensor(np.array([[c, -s, 0, 0], 292 | [s, c, 0, 0], 293 | [0, 0, 1, 0], 294 | [0, 0, 0, 1]])).float().cuda() 295 | else: 296 | raise ValueError("invalid axis") 297 | 298 | return R 299 | 300 | 301 | def rotate_pose(pose_camera: torch.Tensor, R: torch.Tensor) -> torch.Tensor: 302 | """ 303 | 304 | Args: 305 | pose_camera: 306 | R: 307 | 308 | Returns: 309 | 310 | """ 311 | center = torch.zeros(4, 4, device=R.device, dtype=torch.float) 312 | center[:3, 3] = pose_camera[0, :, :3, 3].mean(dim=0) 313 | center[3, 3] = 1 314 | rotated_pose = torch.matmul(R, (pose_camera - center)) + center 315 | 316 | return rotated_pose 317 | -------------------------------------------------------------------------------- /src/utils/sdf_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | SPDX-License-Identifier: LicenseRef-NvidiaProprietary 4 | NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 5 | property and proprietary rights in and to this material, related 6 | documentation and any modifications thereto. Any use, reproduction, 7 | disclosure or distribution of this material and related documentation 8 | without an express license agreement from NVIDIA CORPORATION or 9 | its affiliates is strictly prohibited. 
10 | """ 11 | 12 | from typing import Tuple 13 | 14 | import torch 15 | 16 | 17 | def f(radius: torch.Tensor, x: torch.Tensor, lam: torch.Tensor) -> torch.Tensor: 18 | """func to optimize 19 | 20 | Args: 21 | radius: radius of ellipsoids, (..., n_part, 3, 1) 22 | x: position, (..., n_part, 3, n_pts) 23 | lam: Lagrange multiplier, (..., n_part, n_pts) 24 | 25 | Returns: 26 | 27 | """ 28 | lam = lam.unsqueeze(-2) 29 | h = radius.square() * x.square() / torch.clamp_min((radius.square() + lam).square(), 1e-15) 30 | h = torch.sum(h, dim=-2) 31 | 32 | return h # (..., n_part, n_pts) 33 | 34 | 35 | def d_f(radius: torch.Tensor, x: torch.Tensor, lam: torch.Tensor) -> torch.Tensor: 36 | """derivative of f 37 | 38 | Args: 39 | radius: radius of ellipsoids, (..., n_part, 3, 1) 40 | x: position, (..., n_part, 3, n_pts) 41 | lam: Lagrange multiplier, (..., n_part, n_pts) 42 | 43 | Returns: 44 | 45 | """ 46 | lam = lam.unsqueeze(-2) 47 | eps = (((radius.square() + lam) > 0) * 2 - 1) * 1e-20 48 | h = radius.square() * x.square() / ((radius.square() + lam) ** 3 + eps) 49 | h = -2 * torch.sum(h, dim=-2) 50 | 51 | return h # (..., n_part, n_pts) 52 | 53 | 54 | def newton_step(radius: torch.Tensor, x: torch.Tensor, lam: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: 55 | """ 56 | 57 | Args: 58 | radius: radius of ellipsoids, (..., n_part, 3, 1) 59 | x: position, (..., n_part, 3, n_pts) 60 | lam: Lagrange multiplier, (..., n_part, n_pts) 61 | 62 | Returns: 63 | 64 | """ 65 | with torch.no_grad(): 66 | diff = 1 - f(radius, x, lam) 67 | df = d_f(radius, x, lam) 68 | eps = ((df > 0) * 2 - 1) * 1e-15 69 | update = diff / (df + eps) 70 | 71 | return lam + update, diff 72 | 73 | 74 | def search_lam(radius: torch.Tensor, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: 75 | """ 76 | 77 | Args: 78 | radius: radius of ellipsoids, (..., n_part, 3) 79 | x: position, (..., n_part, 3, n_pts) 80 | 81 | Returns: 82 | 83 | """ 84 | radius = radius.unsqueeze(-1) 85 | lam = torch.max(radius * x.abs() - radius.square(), dim=-2)[0] # (..., n_part, n_pts) 86 | diff = torch.tensor(0) 87 | for _ in range(10): 88 | lam, diff = newton_step(radius, x, lam) 89 | 90 | valid = torch.lt(diff, 1e-5) 91 | 92 | valid_lam = torch.lt(-torch.square(radius.min(dim=-2)[0]), lam) 93 | valid = torch.logical_and(valid, valid_lam) 94 | 95 | return lam, valid 96 | 97 | 98 | def lam_to_sdf(radius: torch.Tensor, x: torch.Tensor, lam: torch.Tensor) -> torch.Tensor: 99 | """ 100 | 101 | Args: 102 | radius: radius of ellipsoids, (..., n_part, 3) 103 | x: position, (..., n_part, 3, n_pts) 104 | lam: Lagrange multiplier, (..., n_part, n_pts) 105 | 106 | Returns: 107 | 108 | """ 109 | with torch.no_grad(): 110 | radius = radius.unsqueeze(-1) 111 | lam = lam.unsqueeze(-2) 112 | foot_on_sphere = radius / (radius.square() + lam) * x 113 | 114 | # differentiable from here! 
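# As written, `lam` (found by the Newton iterations above), the intermediate foot point, and the
# sign computed below are all obtained under torch.no_grad() and act as constants; the distance
# between the rescaled foot point and the query `x` is an ordinary autograd op, so the returned
# signed distance can still pass gradients back to `x` through that term.
# A minimal usage sketch of the full pipeline through `ellipsoid_sdf` below, with hypothetical shapes:
#   radius = torch.rand(4, 3) + 0.1   # 4 ellipsoid parts, positive radii
#   x = torch.randn(4, 3, 100)        # 100 query points per part
#   sdf = ellipsoid_sdf(radius, x)    # -> (4, 100), negative inside each ellipsoid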
115 | foot_on_ellipsoid = foot_on_sphere * radius # (..., n_part, 3, n_pts) 116 | with torch.no_grad(): 117 | sign = torch.sign(torch.sum(x.square() / radius.square(), dim=-2) - 1) 118 | sdf = torch.norm(foot_on_ellipsoid - x, dim=-2) * sign # (..., n_part, n_pts) 119 | 120 | return sdf 121 | 122 | 123 | @torch.jit.script 124 | def ellipsoid_sdf(radius: torch.Tensor, x: torch.Tensor) -> torch.Tensor: 125 | """ 126 | 127 | Args: 128 | radius: radius of ellipsoids, (..., n_part, 3) 129 | x: position, (..., n_part, 3, n_pts) 130 | 131 | Returns: 132 | 133 | """ 134 | lam, valid = search_lam(radius, x) 135 | sdf = lam_to_sdf(radius, x, lam) 136 | min_sdf = -radius.min(dim=-1)[0].unsqueeze(-1) 137 | sdf = torch.where(valid, sdf, min_sdf) 138 | sdf = torch.where(sdf < min_sdf, min_sdf, sdf) 139 | 140 | return sdf 141 | -------------------------------------------------------------------------------- /src/utils/train_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | SPDX-License-Identifier: LicenseRef-NvidiaProprietary 4 | NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 5 | property and proprietary rights in and to this material, related 6 | documentation and any modifications thereto. Any use, reproduction, 7 | disclosure or distribution of this material and related documentation 8 | without an express license agreement from NVIDIA CORPORATION or 9 | its affiliates is strictly prohibited. 10 | """ 11 | 12 | import os 13 | from typing import Any, Tuple 14 | 15 | import torch 16 | import torch.optim as optim 17 | from easydict import EasyDict as edict 18 | from torch import nn 19 | from torch.utils.data import DataLoader, Dataset 20 | 21 | from datasets.dataset import SingleVideoDataset as HumanVideoDataset 22 | 23 | 24 | def create_dataloaders(config: dict, shuffle: bool = True) -> DataLoader: 25 | """create train and val dataloaders 26 | 27 | Args: 28 | config (dict): config.dataset 29 | shuffle 30 | 31 | Returns: 32 | data_loader (DataLoader): dataloader 33 | """ 34 | dataset = HumanVideoDataset(config) 35 | 36 | batchsize = config.batchsize 37 | num_workers = config.num_workers 38 | data_loader = DataLoader(dataset, batch_size=batchsize, num_workers=num_workers, 39 | shuffle=shuffle, drop_last=True, pin_memory=True) 40 | 41 | return data_loader 42 | 43 | 44 | def ddp_data_sampler(dataset: Dataset, rank: int, world_size: int, shuffle: bool, drop_last: bool 45 | ) -> torch.utils.data.distributed.DistributedSampler: 46 | """ 47 | 48 | Args: 49 | dataset: 50 | rank: 51 | world_size: 52 | shuffle: 53 | drop_last: 54 | 55 | Returns: 56 | 57 | """ 58 | dist_sampler = torch.utils.data.distributed.DistributedSampler( 59 | dataset, rank=rank, num_replicas=world_size, shuffle=shuffle, 60 | drop_last=drop_last) 61 | 62 | return dist_sampler 63 | 64 | 65 | def create_ddp_dataloaders(config: dict, rank: int, world_size: int) -> DataLoader: 66 | """create train and val dataloaders for ddp 67 | 68 | Args: 69 | config (dict): config.dataset 70 | rank 71 | world_size 72 | 73 | Returns: 74 | train_loader (DataLoader): dataloader for train 75 | val_loader (DataLoader): dataloader for val 76 | """ 77 | 78 | dataset = HumanVideoDataset(config) 79 | 80 | batchsize = config.batchsize 81 | num_workers = config.num_workers 82 | ddp_sampler = ddp_data_sampler(dataset, rank, world_size, shuffle=True, drop_last=True) 83 | data_loader = 
DataLoader(dataset, batch_size=batchsize, num_workers=num_workers, 84 | sampler=ddp_sampler, pin_memory=True) 85 | 86 | return data_loader 87 | 88 | 89 | def to_gpu(minibatch: dict) -> dict: 90 | """send minibatch dict to gpu 91 | 92 | Args: 93 | minibatch (dict): [description] 94 | 95 | Returns: 96 | dict: [description] 97 | """ 98 | 99 | return {key: val.cuda(non_blocking=True) for key, val in minibatch.items()} 100 | 101 | 102 | def to_tensor(minibatch: dict) -> dict: 103 | """numpy to torch.tensor 104 | Args: 105 | minibatch (dict): [description] 106 | 107 | Returns: 108 | dict: [description] 109 | """ 110 | 111 | return {key: torch.tensor(val).cuda(non_blocking=True).float() for key, val in minibatch.items()} 112 | 113 | 114 | def cat_dim0(tensor: torch.Tensor) -> torch.Tensor: 115 | """ 116 | 117 | Args: 118 | tensor: 119 | 120 | Returns: 121 | 122 | """ 123 | shape = tensor.shape 124 | 125 | return tensor.reshape((shape[0] * shape[1],) + shape[2:]) 126 | 127 | 128 | def cat_dim0_dict(minibatch: dict) -> dict: 129 | """ 130 | 131 | Args: 132 | minibatch: 133 | 134 | Returns: 135 | 136 | """ 137 | out_dict = {} 138 | for key, val in minibatch.items(): 139 | shape = val.shape 140 | if len(shape) <= 2: 141 | reshaped = val.reshape(-1) 142 | else: 143 | reshaped = val.reshape((shape[0] * shape[1],) + shape[2:]) 144 | out_dict[key] = reshaped 145 | 146 | return out_dict 147 | 148 | 149 | def set_port(config: edict) -> None: 150 | """ 151 | 152 | Args: 153 | config: 154 | 155 | Returns: 156 | 157 | """ 158 | master_addr = config.master_addr 159 | master_port = config.master_port 160 | os.environ['MASTER_ADDR'] = master_addr 161 | os.environ['MASTER_PORT'] = master_port 162 | 163 | 164 | def all_reduce_scalar(scalar: float) -> float: 165 | """ 166 | 167 | Args: 168 | scalar: 169 | 170 | Returns: 171 | 172 | """ 173 | scalar = torch.tensor(scalar).cuda(non_blocking=True) 174 | torch.distributed.all_reduce(scalar) 175 | 176 | return scalar.item() 177 | 178 | 179 | def all_reduce_dict(dictionary: dict, world_size: int) -> dict: 180 | """ 181 | 182 | Args: 183 | dictionary: 184 | world_size: 185 | 186 | Returns: 187 | 188 | """ 189 | reduced_dict = {} 190 | for key, val in dictionary.items(): 191 | reduced_dict[key] = all_reduce_scalar(val) / world_size 192 | 193 | return reduced_dict 194 | 195 | 196 | def grid_coordinates(size: int, device: str, scale: int = 2) -> torch.Tensor: 197 | """ 198 | 199 | Args: 200 | size: 201 | device: 202 | scale: 203 | 204 | Returns: 205 | 206 | """ 207 | grid = torch.meshgrid(torch.arange(size, device=device), 208 | torch.arange(size, device=device), indexing='ij')[::-1] 209 | grid = torch.stack(grid, dim=-1) * scale + 0.5 210 | grid = grid.reshape(1, size ** 2, 2) 211 | 212 | return grid 213 | 214 | 215 | def check_nan(model: nn.Module) -> bool: 216 | """ 217 | 218 | Args: 219 | model: 220 | 221 | Returns: 222 | 223 | """ 224 | state_dict = model.state_dict() 225 | 226 | isnan = False 227 | for val in state_dict.values(): 228 | if val.isnan().any(): 229 | isnan = True 230 | break 231 | 232 | return isnan 233 | 234 | 235 | def load_snapshot(model: nn.Module, optimizer, path: str, load_optimizer: bool = True) -> int: 236 | """ 237 | 238 | Args: 239 | model: 240 | optimizer: 241 | path: 242 | load_optimizer: 243 | 244 | Returns: 245 | 246 | """ 247 | snapshot = torch.load(path, map_location=lambda storage, loc: storage) # avoid OOM 248 | 249 | name_in_model = [n for n, _ in model.named_parameters()] 250 | for name in list(snapshot["model"].keys()): 251 | if name 
not in name_in_model: 252 | snapshot["model"].pop(name) 253 | 254 | model.load_state_dict(snapshot["model"], strict=False) 255 | 256 | if load_optimizer: 257 | optimizer.load_state_dict(snapshot["optimizer"]) 258 | iter = snapshot["iteration"] 259 | del snapshot 260 | torch.cuda.empty_cache() # remove cache for resuming 261 | 262 | return iter 263 | 264 | 265 | def create_optimizer(config: dict, model: nn.Module) -> Any: 266 | """create optimizer 267 | 268 | Args: 269 | config (dict): config.tran_setting 270 | model (nn.Module): target model 271 | 272 | Returns: 273 | [type]: optimizer 274 | """ 275 | if config.optimizer == "Adam": 276 | lr = config.lr 277 | decay = config.decay 278 | optimizer = optim.Adam(model.parameters(), lr=lr, betas=(0.9, 0.999), eps=1e-20, weight_decay=decay) 279 | elif config.optimizer == "AdamW": 280 | lr = config.lr 281 | decay = config.decay 282 | print("adamw", decay) 283 | optimizer = optim.AdamW(model.parameters(), lr=lr, betas=(0.9, 0.999), eps=1e-20, weight_decay=decay) 284 | else: 285 | raise ValueError() 286 | 287 | return optimizer 288 | 289 | 290 | def send_model_to_gpu(rank: int, model: nn.Module, ddp: bool) -> Tuple[nn.Module, nn.Module]: 291 | """ 292 | 293 | Args: 294 | rank: 295 | model: 296 | ddp: 297 | 298 | Returns: 299 | 300 | """ 301 | num_gpus = torch.cuda.device_count() 302 | n_gpu = rank % num_gpus 303 | 304 | torch.cuda.set_device(n_gpu) 305 | model.cuda(n_gpu) 306 | 307 | if ddp: 308 | print(n_gpu) 309 | model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) 310 | model = nn.parallel.DistributedDataParallel(model, device_ids=[n_gpu], find_unused_parameters=True) 311 | model_module = model.module 312 | else: 313 | model_module = model 314 | 315 | return model, model_module, 316 | 317 | 318 | def save_model(model_module: nn.Module, optimizer, save_dir: str, iteration: int, rank: int, 319 | snapshot_prefix: str = "snapshot") -> int: 320 | """ 321 | Save model. If nan is detected, load the latest snapshot 322 | Args: 323 | model_module: 324 | optimizer: 325 | save_dir: 326 | iteration: 327 | rank: 328 | snapshot_prefix: 329 | 330 | Returns: 331 | 332 | """ 333 | isnan = check_nan(model_module) 334 | 335 | if isnan: 336 | print("nan detected") 337 | model_path = os.path.join(save_dir, f"{snapshot_prefix}_latest.pth") 338 | assert os.path.exists(model_path), "model snapshot is not saved" 339 | 340 | iteration = load_snapshot(model_module, optimizer, model_path) 341 | else: 342 | if rank == 0: 343 | params_to_save = {"iteration": iteration, 344 | "model": model_module.state_dict(), 345 | "optimizer": optimizer.state_dict()} 346 | torch.save(params_to_save, os.path.join(save_dir, f"{snapshot_prefix}_latest.pth")) 347 | torch.save( 348 | params_to_save, os.path.join( 349 | save_dir, f"{snapshot_prefix}_{(iteration // 10000 + 1) * 10000}.pth")) 350 | 351 | return iteration 352 | -------------------------------------------------------------------------------- /src/utils/trainer.py: -------------------------------------------------------------------------------- 1 | """ 2 | SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | SPDX-License-Identifier: LicenseRef-NvidiaProprietary 4 | NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 5 | property and proprietary rights in and to this material, related 6 | documentation and any modifications thereto. 
Any use, reproduction, 7 | disclosure or distribution of this material and related documentation 8 | without an express license agreement from NVIDIA CORPORATION or 9 | its affiliates is strictly prohibited. 10 | """ 11 | 12 | 13 | import os 14 | 15 | import tensorboardX as tbx 16 | import torch 17 | import torch.distributed as dist 18 | from torch.cuda.amp import GradScaler 19 | 20 | from utils.train_utils import (all_reduce_dict, cat_dim0_dict, create_dataloaders, create_ddp_dataloaders, 21 | load_snapshot, to_gpu, save_model) 22 | from utils.train_utils import set_port 23 | 24 | 25 | class TrainerBase: 26 | def run(self, config: dict, rank: int = 0, world_size: int = 1) -> None: 27 | """ 28 | 29 | Args: 30 | config: 31 | rank: 32 | world_size: 33 | 34 | Returns: 35 | 36 | """ 37 | torch.backends.cudnn.benchmark = True 38 | ddp = False 39 | assert world_size == 1 40 | train_loader = create_dataloaders(config.dataset) 41 | 42 | self.train_loader = train_loader 43 | 44 | self.train_func(config, train_loader, None, rank=rank, ddp=ddp, world_size=world_size) 45 | 46 | def ddp_run(self, rank: int, config: dict, world_size: int = 1) -> None: 47 | """ 48 | 49 | Args: 50 | rank: 51 | config: 52 | world_size: 53 | 54 | Returns: 55 | 56 | """ 57 | assert world_size > 1 58 | torch.backends.cudnn.benchmark = True 59 | ddp = True 60 | 61 | set_port(config.train_setting) 62 | backend = config.train_setting.backend 63 | dist.init_process_group(backend=backend, init_method='env://', rank=rank, 64 | world_size=world_size) 65 | torch.manual_seed(0) 66 | 67 | train_loader = create_ddp_dataloaders(config.dataset, rank, world_size) 68 | 69 | self.train_loader = train_loader 70 | 71 | try: 72 | self.train_func(config, train_loader, None, rank=rank, ddp=ddp, world_size=world_size) 73 | except KeyboardInterrupt: 74 | print('interrupted') 75 | 76 | dist.destroy_process_group() 77 | 78 | def prepare_model_and_optimizer(self, *args, **kwargs): 79 | raise NotImplementedError("Please implement prepare_model_and_optimizer") 80 | 81 | def define_loss_func(self, *args, **kwargs): 82 | raise NotImplementedError("Please implement define_loss_func") 83 | 84 | def lossfunc(self, *args, **kwargs): 85 | raise NotImplementedError("Please implement lossfunc") 86 | 87 | def process_before_train_step(self, iteration: int): 88 | pass 89 | 90 | def train_func(self, config: dict, train_loader, val_loader=None, rank: int = 0, 91 | ddp: bool = False, world_size: int = 1) -> None: 92 | """ 93 | 94 | Args: 95 | config: 96 | train_loader: 97 | val_loader: 98 | rank: 99 | ddp: 100 | world_size: 101 | 102 | Returns: 103 | 104 | """ 105 | num_iter = config.train_setting.num_iter 106 | log_interval = config.train_setting.log_interval 107 | save_interval = config.train_setting.save_interval 108 | out_dir = config.output_dir 109 | exp_name = config.exp_name 110 | 111 | model, model_module, optimizer = self.prepare_model_and_optimizer(config, rank, ddp) 112 | self.model = model 113 | 114 | save_dir = os.path.join(out_dir, "result", exp_name) 115 | if rank == 0: 116 | writer = tbx.SummaryWriter(os.path.join(out_dir, "tensorboard", exp_name)) 117 | os.makedirs(save_dir, exist_ok=True) 118 | os.chmod(save_dir, 0o755) 119 | 120 | iteration = 0 121 | 122 | if config.resume_model_path or config.resume_latest: 123 | if config.resume_model_path is not None: 124 | model_path = config.resume_model_path 125 | else: 126 | model_path = os.path.join(save_dir, f"{self.snapshot_prefix}_latest.pth") 127 | 128 | iteration = load_snapshot(model_module, 
optimizer, model_path, load_optimizer=config.load_optimizer) 129 | if config.iteration is not None: 130 | iteration = config.iteration 131 | 132 | # define loss 133 | self.define_loss_func(config, model_module, ddp) 134 | 135 | self.process_before_train_step(iteration) 136 | while iteration < num_iter: 137 | for i, minibatch in enumerate(train_loader): 138 | self.process_before_train_step(iteration) 139 | 140 | iteration += 1 141 | model.train() 142 | minibatch = to_gpu(minibatch) 143 | 144 | if minibatch["img"].ndim == 5: 145 | # reshape (B, video_len, *) -> (B * video_len, *) 146 | minibatch = cat_dim0_dict(minibatch) 147 | 148 | optimizer.zero_grad(set_to_none=True) 149 | 150 | # loss calculation 151 | loss, loss_dict = self.lossfunc(config, minibatch, model, model_module) 152 | 153 | if config.fp16: 154 | scaler = GradScaler() 155 | scaler.scale(loss).backward() 156 | scaler.step(optimizer) 157 | scaler.update() 158 | else: 159 | 160 | # with torch.autograd.detect_anomaly(): 161 | loss.backward() 162 | 163 | # detect nan 164 | nan = any([p.grad.isnan().any() for p in model_module.parameters() if p.grad is not None]) 165 | if nan: 166 | print("NaN is detected!!!!") 167 | del loss 168 | torch.cuda.empty_cache() 169 | else: 170 | if config.train_setting.clip_grad: 171 | torch.nn.utils.clip_grad_norm_(model.parameters(), 172 | max_norm=2.0, norm_type=2) 173 | 174 | optimizer.step() 175 | 176 | if ddp: 177 | loss_dict = all_reduce_dict(loss_dict, world_size) 178 | 179 | if iteration % 10 == 0 and rank == 0: 180 | print(iteration, loss_dict) 181 | # tensorboard 182 | if iteration % log_interval == 0 and rank == 0: 183 | print("log") 184 | for key, val in loss_dict.items(): 185 | writer.add_scalar("metrics/" + key, val, iteration) 186 | 187 | if iteration % save_interval == 0: 188 | iteration = save_model(model_module, optimizer, save_dir, iteration, rank, self.snapshot_prefix) 189 | -------------------------------------------------------------------------------- /src/validation/SMPL_regression.py: -------------------------------------------------------------------------------- 1 | """ 2 | SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | SPDX-License-Identifier: LicenseRef-NvidiaProprietary 4 | NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 5 | property and proprietary rights in and to this material, related 6 | documentation and any modifications thereto. Any use, reproduction, 7 | disclosure or distribution of this material and related documentation 8 | without an express license agreement from NVIDIA CORPORATION or 9 | its affiliates is strictly prohibited. 
10 | """ 11 | 12 | 13 | import argparse 14 | import os 15 | import sys 16 | from typing import Dict, Tuple 17 | 18 | import torch 19 | 20 | sys.path.append(".") 21 | 22 | from models.model import SingleVideoPartDecomposition 23 | from datasets.dataset import SingleVideoDataset as HumanVideoDataset 24 | from utils.get_args import get_args_jupyter 25 | from utils.train_utils import create_dataloaders 26 | 27 | 28 | def regress(model: SingleVideoPartDecomposition, config: Dict, dataset: HumanVideoDataset 29 | ) -> Tuple[torch.Tensor, torch.Tensor, float]: 30 | """ 31 | 32 | Args: 33 | model: 34 | config: 35 | dataset: 36 | 37 | Returns: 38 | 39 | """ 40 | with torch.no_grad(): 41 | frame_id = torch.arange(config.dataset.num_frames, dtype=torch.float, device="cuda") 42 | trajectory = model.joint_trajectory(frame_id) 43 | 44 | rotation, translation = trajectory 45 | video_len = rotation.shape[0] 46 | 47 | smpl_pose = torch.tensor(dataset.video_cache["smpl_pose"], device=rotation.device, 48 | dtype=torch.float) / dataset.coordinate_scale 49 | 50 | child_root, self_root = model.decoder.joint_root_locations(rotation, translation) 51 | child_root = child_root.permute(0, 1, 3, 2).reshape(video_len, -1, 3) 52 | estimated_keypoints = torch.cat([child_root, self_root.squeeze(-1)], dim=1) 53 | smpl_keypoints = smpl_pose[:, :, :3, 3] 54 | 55 | # train test split 56 | train_idx = range(0, video_len, 10) 57 | test_idx = [i for i in range(video_len) if i % 10 != 0] 58 | estimated_keypoints_train = estimated_keypoints[train_idx] 59 | estimated_keypoints_test = estimated_keypoints[test_idx] 60 | smpl_keypoints_train = smpl_keypoints[train_idx] 61 | smpl_keypoints_test = smpl_keypoints[test_idx] 62 | 63 | estimated_keypoints_train = estimated_keypoints_train.permute(0, 2, 1).reshape(len(train_idx) * 3, -1).cpu() 64 | estimated_keypoints_test = estimated_keypoints_test.permute(0, 2, 1).reshape(len(test_idx) * 3, -1).cpu() 65 | smpl_keypoints_train = smpl_keypoints_train.permute(0, 2, 1).reshape(len(train_idx) * 3, -1).cpu() 66 | smpl_keypoints_test = smpl_keypoints_test.permute(0, 2, 1).reshape(len(test_idx) * 3, -1).cpu() 67 | 68 | lstsq_result_train = torch.linalg.lstsq(estimated_keypoints_train, smpl_keypoints_train, driver="gelsd") 69 | j2s_mapping_train = lstsq_result_train.solution 70 | test_error = estimated_keypoints_test @ j2s_mapping_train - smpl_keypoints_test 71 | test_error = test_error.reshape(len(test_idx), 3, smpl_keypoints_test.shape[-1]) 72 | test_error = test_error.norm(dim=1).mean() 73 | 74 | return j2s_mapping_train, estimated_keypoints, test_error * dataset.coordinate_scale * 1000 # millimeter 75 | 76 | 77 | def smpl_regression(config_path: str, default_config: str) -> None: 78 | """ 79 | 80 | Args: 81 | config_path: 82 | default_config: 83 | 84 | Returns: 85 | 86 | """ 87 | args, config = get_args_jupyter(config_path, default_config) 88 | config.dataset.batchsize = 1 89 | 90 | train_dataset: HumanVideoDataset = create_dataloaders(config.dataset, shuffle=True).dataset 91 | 92 | out_dir = config.output_dir 93 | exp_name = config.exp_name 94 | 95 | # model 96 | model = SingleVideoPartDecomposition(config.network_params) 97 | model.cuda() 98 | 99 | save_dir = os.path.join(out_dir, "result", exp_name) 100 | model_path = os.path.join(save_dir, "snapshot_latest.pth") 101 | if os.path.exists(model_path): 102 | snapshot = torch.load(model_path) 103 | state_dict = snapshot["model"] 104 | model.load_state_dict(state_dict, strict=False) 105 | else: 106 | assert False, "model is not loaded" 107 | 108 
| _, _, mpjpe = regress(model, config, train_dataset) 109 | 110 | print(f"{exp_name}: MPJPE={mpjpe:.4f}mm") 111 | 112 | 113 | if __name__ == "__main__": 114 | parser = argparse.ArgumentParser(description='SMPL regression evaluation') 115 | parser.add_argument('--exp_name', action='append', required=True) 116 | args = parser.parse_args() 117 | default_config = "confs/default.yml" 118 | 119 | exp_names = args.exp_name 120 | for exp_name in exp_names: 121 | config_path = f"confs/{exp_name}.yml" 122 | smpl_regression(config_path, default_config) 123 | -------------------------------------------------------------------------------- /src/validation/lpips_ssim.py: -------------------------------------------------------------------------------- 1 | """ 2 | SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | SPDX-License-Identifier: LicenseRef-NvidiaProprietary 4 | NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 5 | property and proprietary rights in and to this material, related 6 | documentation and any modifications thereto. Any use, reproduction, 7 | disclosure or distribution of this material and related documentation 8 | without an express license agreement from NVIDIA CORPORATION or 9 | its affiliates is strictly prohibited. 10 | """ 11 | import sys 12 | import argparse 13 | import os 14 | import pickle 15 | from typing import Tuple 16 | 17 | import lpips 18 | import numpy as np 19 | import torch 20 | from skimage.metrics import structural_similarity as ssim 21 | 22 | sys.path.append(".") 23 | from utils.get_args import get_args_jupyter 24 | 25 | loss_fn_vgg = lpips.LPIPS(net='vgg').cuda() 26 | 27 | 28 | def evaluate(path: str) -> Tuple[float, float]: 29 | """Compute the mean LPIPS and SSIM between ground-truth and generated images stored in a pickle file. 30 | 31 | Args: 32 | path: path to a reconstruction pickle containing "gt_img" and "gen_img" arrays 33 | 34 | Returns: 35 | mean LPIPS and mean SSIM over all images 36 | """ 37 | with open(path, "rb") as f: 38 | data = pickle.load(f) 39 | 40 | gt_img = np.clip(data["gt_img"], 0, 255) / 127.5 - 1 41 | gen_img = np.clip(data["gen_img"], 0, 255) / 127.5 - 1 42 | 43 | batchsize = 16 44 | lpips_vals = [] 45 | with torch.no_grad(): 46 | for i in range(0, gt_img.shape[0], batchsize): 47 | lpips_val = loss_fn_vgg(torch.tensor(gt_img[i:i + batchsize]).cuda(), 48 | torch.tensor(gen_img[i:i + batchsize]).cuda()).squeeze().cpu().numpy() 49 | lpips_vals.append(lpips_val) 50 | 51 | mean_lpips = np.concatenate(lpips_vals).mean() 52 | 53 | ssim_vals = [] 54 | for i in range(gt_img.shape[0]): 55 | gt = gt_img[i].transpose(1, 2, 0) 56 | gen = gen_img[i].transpose(1, 2, 0) 57 | ssim_vals.append(ssim(gt, gen, data_range=gt.max() - gt.min(), multichannel=True)) 58 | 59 | mean_ssim = np.array(ssim_vals).mean() 60 | 61 | return mean_lpips, mean_ssim 62 | 63 | 64 | def eval_all(exp_name: str) -> None: 65 | """Evaluate novel-view and novel-pose reconstructions of one experiment and save the LPIPS/SSIM results. 66 | 67 | Args: 68 | exp_name: experiment name, matching confs/<exp_name>.yml 69 | 70 | Returns: 71 | 72 | """ 73 | default_config = "confs/default.yml" 74 | config_path = f"confs/{exp_name}.yml" 75 | args, config = get_args_jupyter(config_path, default_config) 76 | out_dir = config.output_dir 77 | exp_name = config.exp_name 78 | root = os.path.join(out_dir, "result") 79 | result = {} 80 | validation_dir_name = f"{root}/{exp_name}/validation" 81 | mean_lpips, mean_ssim = evaluate(f"{validation_dir_name}/reconstruction_test.pkl") 82 | print(exp_name) 83 | print("NV", mean_lpips, mean_ssim) 84 | result["novel_view"] = {"lpips": mean_lpips, "ssim": mean_ssim} 85 | mean_lpips, mean_ssim = evaluate(f"{validation_dir_name}/reconstruction_novel_pose.pkl") 86 | print("NP", mean_lpips, mean_ssim) 87 | result["novel_pose"] = {"lpips": 
mean_lpips, "ssim": mean_ssim} 88 | 89 | with open(f"{validation_dir_name}/lpips_ssim.pkl", "wb") as f: 90 | pickle.dump(result, f) 91 | 92 | 93 | if __name__ == "__main__": 94 | parser = argparse.ArgumentParser(description='Compute lpips and ssim') 95 | parser.add_argument('--exp_name', action='append', required=True) 96 | args = parser.parse_args() 97 | 98 | exp_names = args.exp_name 99 | 100 | for exp_name in exp_names: 101 | eval_all(exp_name) 102 | -------------------------------------------------------------------------------- /src/validation/reconstruction.py: -------------------------------------------------------------------------------- 1 | """ 2 | SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | SPDX-License-Identifier: LicenseRef-NvidiaProprietary 4 | NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 5 | property and proprietary rights in and to this material, related 6 | documentation and any modifications thereto. Any use, reproduction, 7 | disclosure or distribution of this material and related documentation 8 | without an express license agreement from NVIDIA CORPORATION or 9 | its affiliates is strictly prohibited. 10 | """ 11 | 12 | import argparse 13 | import os 14 | import pickle 15 | import sys 16 | from typing import Tuple, Any, Optional 17 | 18 | import numpy as np 19 | import torch 20 | from easydict import EasyDict as edict 21 | from torch import nn 22 | from torch.utils.data import Dataset 23 | from tqdm import tqdm 24 | 25 | sys.path.append(".") 26 | from models.model import SingleVideoPartDecomposition 27 | from utils.get_args import get_args_jupyter 28 | from utils.train_utils import to_gpu, create_dataloaders 29 | 30 | 31 | def render(model: nn.Module, test_dataset: Dataset, data_idx: int, bg_color: np.ndarray, 32 | part_pose: Optional[Tuple[Any, Any]] = None) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: 33 | """ 34 | 35 | Args: 36 | model: 37 | test_dataset: 38 | data_idx: 39 | bg_color: 40 | part_pose: 41 | 42 | Returns: 43 | 44 | """ 45 | minibatch = test_dataset[data_idx] 46 | minibatch = {k: torch.tensor(v) for k, v in minibatch.items()} 47 | minibatch = to_gpu(minibatch) 48 | img = minibatch["img"] 49 | gt_mask = minibatch["mask"] 50 | camera_rotation = minibatch["camera_rotation"][None] 51 | camera_translation = minibatch["camera_translation"][None] 52 | inv_intrinsics = torch.inverse(minibatch["camera_intrinsic"])[None] 53 | frame_id = minibatch["frame_id"][None] 54 | 55 | model.eval() 56 | 57 | with torch.no_grad(): 58 | if part_pose is not None: 59 | _part_pose = (part_pose[0][frame_id], part_pose[1][frame_id]) 60 | else: 61 | _part_pose = None 62 | 63 | rendered_dict = model.render_entire_img(frame_id, camera_rotation, camera_translation, 64 | inv_intrinsics, segmentation_label=False, 65 | ray_batchsize=16384, 66 | rotate_angle=0, manipulate_pose_config=None, 67 | part_pose=_part_pose) 68 | color = rendered_dict["rendered_colors"] 69 | mask = rendered_dict["rendered_masks"] 70 | 71 | color = color + (1 - mask) * bg_color 72 | 73 | img = img.cpu().numpy() * 127.5 + 127.5 74 | gt_mask = gt_mask.cpu().numpy() 75 | color = color.cpu().numpy() * 127.5 + 127.5 76 | mask = mask.cpu().numpy() 77 | 78 | return img, gt_mask, color, mask 79 | 80 | 81 | def test(config_path: str, default_config: str, mode: str = "test") -> None: 82 | """ 83 | 84 | Args: 85 | config_path: 86 | default_config: 87 | mode: 88 | 89 | Returns: 90 | 91 | """ 92 | assert mode in ["test", 
"novel_pose"] 93 | 94 | args, config = get_args_jupyter(config_path, default_config) 95 | config.dataset.batchsize = 1 96 | 97 | config.dataset.set_name = mode # test and novel_pose 98 | 99 | test_dataset = create_dataloaders(config.dataset, shuffle=True).dataset 100 | num_train_data = test_dataset.num_frames 101 | 102 | test_dataset.n_repetition_in_epoch = 1 103 | test_dataset.color_augmentation = False 104 | test_dataset.camera_dir_augmentation = False 105 | test_dataset.thin_out_interval = 1 106 | 107 | num_test_data = len(test_dataset.video_cache["img"]) 108 | test_dataset.current_max_frame_id = 100000000 109 | test_dataset.num_frames = 20 110 | num_test_view = num_test_data // test_dataset.num_frames 111 | num_camera_to_use = 5 112 | camera_for_test = np.linspace(0, num_test_view, num_camera_to_use, endpoint=False, dtype="int") 113 | 114 | out_dir = config.output_dir 115 | exp_name = config.exp_name 116 | save_dir = os.path.join(out_dir, "result", exp_name) 117 | 118 | # model 119 | model = SingleVideoPartDecomposition(config.network_params) 120 | model.cuda() 121 | 122 | model_path = os.path.join(save_dir, "snapshot_latest.pth") 123 | if os.path.exists(model_path): 124 | snapshot = torch.load(model_path) 125 | state_dict = snapshot["model"] 126 | model.load_state_dict(state_dict, strict=False) 127 | else: 128 | raise FileNotFoundError() 129 | 130 | if mode == "novel_pose": 131 | part_pose = regress_learned_from_smpl(num_train_data, model, test_dataset, config, use_smpl_verts=True) 132 | else: 133 | part_pose = None 134 | 135 | background_color = config.dataset.background_color 136 | 137 | gt_imgs, gt_masks, gen_imgs, gen_masks = [], [], [], [] 138 | for fra_idx in tqdm(range(test_dataset.num_frames)): 139 | for cam_idx in camera_for_test: 140 | data_idx = fra_idx + cam_idx * test_dataset.num_frames 141 | gt_img, gt_mask, gen_img, gen_mask = render(model, test_dataset, data_idx, background_color, part_pose) 142 | gt_imgs.append(gt_img) 143 | gt_masks.append(gt_mask) 144 | gen_imgs.append(gen_img) 145 | gen_masks.append(gen_mask) 146 | 147 | gt_imgs = np.array(gt_imgs) 148 | gt_masks = np.array(gt_masks) 149 | gen_imgs = np.array(gen_imgs) 150 | gen_masks = np.array(gen_masks) 151 | 152 | save_dict = {"gt_img": gt_imgs, "gt_mask": gt_masks, "gen_img": gen_imgs, "gen_mask": gen_masks} 153 | 154 | os.makedirs(f"{save_dir}/validation", exist_ok=True) 155 | with open(f"{save_dir}/validation/reconstruction_{mode}.pkl", "wb") as f: 156 | pickle.dump(save_dict, f) 157 | 158 | 159 | def regress_learned_from_smpl(num_train_data: int, model: nn.Module, test_dataset: Dataset, config: edict, 160 | use_smpl_verts: bool = True) -> Tuple[torch.Tensor, torch.Tensor]: 161 | """ 162 | 163 | Args: 164 | num_train_data: 165 | model: 166 | test_dataset: 167 | config: 168 | use_smpl_verts: 169 | 170 | Returns: 171 | 172 | """ 173 | with torch.no_grad(): 174 | frame_id = torch.arange(num_train_data, dtype=torch.float, device="cuda") 175 | trajectory = model.joint_trajectory(frame_id) 176 | 177 | rotation, translation = trajectory 178 | video_len = rotation.shape[0] 179 | 180 | child_root, self_root = model.decoder.joint_root_locations(rotation, translation) 181 | estimated_keypoints = torch.cat([child_root, self_root], dim=-1) 182 | 183 | estimated_keypoints = estimated_keypoints.permute(0, 1, 3, 2) 184 | 185 | if use_smpl_verts: 186 | smpl_verts_path = os.path.join(config.dataset.data_root, "smpl_verts.pickle") 187 | with open(smpl_verts_path, "rb") as f: 188 | smpl_keypoints = pickle.load(f)["smpl_verts"] 
189 | smpl_keypoints = smpl_keypoints / 1.5 190 | smpl_keypoints = torch.tensor(smpl_keypoints, dtype=torch.float) 191 | smpl_translation = smpl_keypoints # (L, n_verts, 3) 192 | smpl_keypoints = smpl_keypoints[:num_train_data].cpu() 193 | else: 194 | smpl_pose = torch.tensor(test_dataset.video_cache["smpl_pose"], device="cpu", dtype=torch.float).clone() 195 | smpl_pose[:, :, :3, 3] /= 1.5 196 | smpl_keypoints = smpl_pose[:num_train_data, :, :3, 3] 197 | 198 | smpl_translation = smpl_pose[:, :, :3, 3] # (L, 22, 3) 199 | 200 | _estimated_keypoints = estimated_keypoints.reshape(video_len, -1, 3).permute(0, 2, 1).reshape(video_len * 3, 201 | -1).cpu() 202 | _smpl_keypoints = smpl_keypoints.permute(0, 2, 1).reshape(video_len * 3, -1) 203 | 204 | lam = 5e-1 if use_smpl_verts else 1e-1 205 | s2j_mapping = torch.inverse( 206 | _smpl_keypoints.T.matmul(_smpl_keypoints) + torch.eye(_smpl_keypoints.shape[1]) * lam).matmul( 207 | _smpl_keypoints.T).matmul(_estimated_keypoints) 208 | 209 | regressed = s2j_mapping.T @ smpl_translation # (L, 3, 140) 210 | 211 | # canonical pose 212 | with torch.no_grad(): 213 | child_root_can, self_root_can = model.decoder.joint_root_locations(torch.eye(3, device="cuda", 214 | dtype=torch.float), 215 | torch.zeros(model.num_parts, 3, 1, 216 | device="cuda", 217 | dtype=torch.float)) 218 | estimated_keypoints_can = torch.cat([child_root_can, self_root_can], dim=-1) 219 | estimated_keypoints_can = estimated_keypoints_can.permute(0, 2, 1).cpu() 220 | 221 | regressed_translation = regressed.reshape(regressed.shape[0], model.num_parts, 7, 3) 222 | U, S, Vh = torch.linalg.svd(regressed_translation.permute(0, 1, 3, 2) @ estimated_keypoints_can) 223 | R = Vh.permute(0, 1, 3, 2) @ U.permute(0, 1, 3, 2) 224 | det = torch.linalg.det(R) 225 | Vh[:, :, 2] = Vh[:, :, 2] * det[:, :, None] 226 | R = Vh.permute(0, 1, 3, 2) @ U.permute(0, 1, 3, 2) 227 | joint_rotation = R.permute(0, 1, 3, 2).cuda() 228 | joint_translation = regressed_translation[:, :, -1, :, None].cuda() 229 | 230 | return joint_rotation, joint_translation # (L, num_parts, 3, 3) 231 | 232 | 233 | if __name__ == "__main__": 234 | # evaluate novel view and novel pose reconstruction 235 | # novel view -> learned pose, new camera 236 | # novel pose -> novel pose, all camera. Requires smpl regression 237 | parser = argparse.ArgumentParser(description='Save reconstructed images') 238 | parser.add_argument('--exp_name', action='append', required=True) 239 | args = parser.parse_args() 240 | exp_names = args.exp_name 241 | 242 | default_config = "confs/default.yml" 243 | for exp_name in exp_names: 244 | config_path = f"confs/{exp_name}.yml" 245 | test(config_path, default_config, mode="test") # novel view 246 | test(config_path, default_config, mode="novel_pose") 247 | -------------------------------------------------------------------------------- /src/visualize/create_reconstruction_video.py: -------------------------------------------------------------------------------- 1 | """ 2 | SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | SPDX-License-Identifier: LicenseRef-NvidiaProprietary 4 | NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 5 | property and proprietary rights in and to this material, related 6 | documentation and any modifications thereto. 
Any use, reproduction, 7 | disclosure or distribution of this material and related documentation 8 | without an express license agreement from NVIDIA CORPORATION or 9 | its affiliates is strictly prohibited. 10 | """ 11 | 12 | 13 | import argparse 14 | import sys 15 | 16 | sys.path.append(".") 17 | from utils.visualization_utils import GenerateVideoFromConfig 18 | 19 | if __name__ == "__main__": 20 | parser = argparse.ArgumentParser(description='Save reconstruction video') 21 | parser.add_argument('--exp_name', action='append', required=True) 22 | parser.add_argument('--camera_id', type=int, default=0) 23 | parser.add_argument('--num_video_frames', type=int, default=50) 24 | args = parser.parse_args() 25 | 26 | exp_names = args.exp_name 27 | default_config = "confs/default.yml" 28 | camera_id = args.camera_id 29 | num_video_frames = args.num_video_frames 30 | 31 | for exp_name in exp_names: 32 | config_path = f"confs/{exp_name}.yml" 33 | generate_video_from_conf = GenerateVideoFromConfig(config_path, default_config) 34 | generate_video_from_conf(rotate=True, increment=True, camera_id=camera_id, num_video_frames=num_video_frames) 35 | -------------------------------------------------------------------------------- /src/visualize/create_repose_video.py: -------------------------------------------------------------------------------- 1 | """ 2 | SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | SPDX-License-Identifier: LicenseRef-NvidiaProprietary 4 | NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 5 | property and proprietary rights in and to this material, related 6 | documentation and any modifications thereto. Any use, reproduction, 7 | disclosure or distribution of this material and related documentation 8 | without an express license agreement from NVIDIA CORPORATION or 9 | its affiliates is strictly prohibited.
10 | """ 11 | 12 | import argparse 13 | import sys 14 | 15 | import yaml 16 | from easydict import EasyDict as edict 17 | 18 | sys.path.append(".") 19 | from utils.visualization_utils import GenerateVideoFromConfig 20 | 21 | if __name__ == "__main__": 22 | parser = argparse.ArgumentParser(description='Manual re-posing') 23 | parser.add_argument('--exp_name', required=True, type=str) 24 | parser.add_argument('--repose_config', required=True, type=str) 25 | parser.add_argument('--rotate', action="store_true") 26 | parser.add_argument('--num_video_frames', type=int, default=20) 27 | parser.add_argument('--iteration', type=int, default=-1) 28 | 29 | args = parser.parse_args() 30 | 31 | config_path = f"confs/{args.exp_name}.yml" 32 | 33 | default_config = "confs/default.yml" 34 | 35 | repose_config = edict(yaml.load(open(args.repose_config), Loader=yaml.SafeLoader)) 36 | frame_id = repose_config.frame_id 37 | camera_id = repose_config.camera_id 38 | root = repose_config.root 39 | first = repose_config.first 40 | second = repose_config.second 41 | rotate = args.rotate 42 | num_video_frames = args.num_video_frames 43 | iteration = args.iteration 44 | 45 | generate_vide_from_conf = GenerateVideoFromConfig(config_path, default_config, iteration=iteration) 46 | generate_vide_from_conf.repose(root, first, second, frame_id, camera_id, 47 | rotate=rotate, num_video_frames=num_video_frames) 48 | -------------------------------------------------------------------------------- /src/visualize/part_merging.py: -------------------------------------------------------------------------------- 1 | """ 2 | SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | SPDX-License-Identifier: LicenseRef-NvidiaProprietary 4 | NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 5 | property and proprietary rights in and to this material, related 6 | documentation and any modifications thereto. Any use, reproduction, 7 | disclosure or distribution of this material and related documentation 8 | without an express license agreement from NVIDIA CORPORATION or 9 | its affiliates is strictly prohibited. 
10 | """ 11 | 12 | import argparse 13 | import io 14 | import os 15 | import sys 16 | from copy import deepcopy 17 | from typing import List, Tuple 18 | 19 | import cv2 20 | import matplotlib.pyplot as plt 21 | import numpy as np 22 | import torch 23 | 24 | sys.path.append(".") 25 | from utils.visualization_utils import GenerateVideoFromConfig, render_and_get_joints 26 | 27 | 28 | def draw_polygon(joints: np.ndarray, color: str) -> None: 29 | """ 30 | 31 | Args: 32 | joints: (n, 2) 33 | color: 34 | 35 | Returns: 36 | 37 | """ 38 | indices = [] 39 | 40 | visited = np.zeros(len(joints)) 41 | current_idx = joints[:, 0].argmin() 42 | current_angle = np.pi / 2 43 | indices.append(current_idx) 44 | 45 | while visited[current_idx] == 0: 46 | visited[current_idx] = 1 47 | vec = joints - joints[current_idx] 48 | angle = np.arctan2(vec[:, 1], vec[:, 0]) 49 | rel_angle = (current_angle - angle) % (2 * np.pi) 50 | rel_angle[current_idx] = 1e4 # ignore self 51 | current_idx = rel_angle.argmin() 52 | current_angle = angle[current_idx] 53 | indices.append(current_idx) 54 | indices.append(indices[0]) 55 | 56 | plt.plot(joints[indices, 0], 57 | joints[indices, 1], c=f"#{color}") 58 | 59 | 60 | def main(config_path: str, default_config: str, cam_id: int) -> None: 61 | """ 62 | 63 | Args: 64 | config_path: 65 | default_config: 66 | cam_id: 67 | 68 | Returns: 69 | 70 | """ 71 | self = GenerateVideoFromConfig(config_path, default_config, args.iteration) 72 | model = self.model 73 | train_loader = self.train_loader 74 | 75 | with torch.no_grad(): 76 | frame_id = torch.arange(train_loader.dataset.num_frames, dtype=torch.float, device="cuda") 77 | trajectory = model.joint_trajectory(frame_id) 78 | 79 | rotation, translation = trajectory 80 | relative_rotation = torch.matmul(rotation[:, :, None].transpose(-1, -2), rotation[:, None]) 81 | relative_translation = torch.matmul(rotation[:, :, None].transpose(-1, -2), 82 | translation[:, None] - translation[:, :, None]) 83 | mat = relative_rotation.std(dim=0).mean(dim=(2, 3)) + relative_translation.std(dim=0).mean(dim=(2, 3)) * 3 84 | mat = mat + mat.transpose(0, 1) 85 | mat = mat 86 | 87 | asort = (mat + torch.eye(model.num_parts, device="cuda") * 1e10).reshape(-1).argsort() 88 | merged = torch.stack([torch.div(asort, model.num_parts, rounding_mode='trunc'), 89 | asort % model.num_parts], dim=1)[::2] 90 | val = mat[merged[:, 0], merged[:, 1]] 91 | for i in range(len(merged[:10])): 92 | print(merged[i].cpu().numpy(), val[i].item()) 93 | 94 | frame_id = 0 95 | color_each_view = [] 96 | mask_each_view = [] 97 | segmentation_each_view = [] 98 | gt_img_each_view = [] 99 | gt_mask_each_view = [] 100 | joint_2d_each_view = [] 101 | background_each_view = [] 102 | disparity_each_view = [] 103 | child_root_each_view = [] 104 | self_root_each_view = [] 105 | 106 | (img, gt_mask, color, mask, disparity, segmentation, joint_2d, child_root, 107 | self_root, background) = render_and_get_joints(model, train_loader, frame_id, cam_id) 108 | color_each_view.append(color) 109 | mask_each_view.append(mask) 110 | segmentation_each_view.append(segmentation) 111 | gt_img_each_view.append(img) 112 | gt_mask_each_view.append(gt_mask) 113 | joint_2d_each_view.append(joint_2d) 114 | background_each_view.append(background) 115 | disparity_each_view.append(disparity) 116 | child_root_each_view.append(child_root) 117 | self_root_each_view.append(self_root) 118 | 119 | num_prune = 40 120 | 121 | i = 0 122 | 123 | joint_2d = joint_2d_each_view[i][0, :, :] 124 | child_root = child_root_each_view[i] 
125 | 126 | joint_connection = model.joint_connection.cpu().numpy() 127 | child_ids = model.child_ids.cpu().numpy() 128 | 129 | new_to_old_ = {_: [_] for _ in range(model.num_parts)} 130 | old_to_new_ = {_: _ for _ in range(model.num_parts)} 131 | 132 | def create_figures(text: bool) -> Tuple[List[np.ndarray], List[np.ndarray]]: 133 | """ 134 | 135 | Args: 136 | text: 137 | 138 | Returns: 139 | 140 | """ 141 | old_to_new = deepcopy(old_to_new_) 142 | new_to_old = deepcopy(new_to_old_) 143 | joint_figure = [] 144 | center_figure = [] 145 | fig_id = 0 146 | for _ in range(num_prune): 147 | should_merge = True 148 | if _ > 0: 149 | merged_idx = merged[_ - 1].cpu().numpy() 150 | From = np.max(merged_idx) 151 | To = np.min(merged_idx) 152 | if old_to_new[To] != old_to_new[From]: 153 | new_to_old[old_to_new[To]] += new_to_old[old_to_new[From]].copy() 154 | new_to_old[old_to_new[From]] = [] 155 | old_to_new = {} 156 | for ii in range(model.num_parts): 157 | connected_to_ii = new_to_old[ii] 158 | for jj in connected_to_ii: 159 | old_to_new[jj] = ii 160 | else: 161 | should_merge = False 162 | 163 | if should_merge: 164 | fig_id += 1 165 | out = 1 - mask_each_view[i].cpu().numpy()[:, :, None][:, :, [0, 0, 0]] / 2 166 | plt.imshow(out, vmin=0, vmax=1, alpha=0.2) 167 | joint_location = (child_root[0, joint_connection[:, 0], :, child_ids[:, 0]] + 168 | child_root[0, joint_connection[:, 1], :, child_ids[:, 1]]) / 2 169 | new_joint_connection = np.array([[old_to_new[jc[0]], old_to_new[jc[1]]] for jc in joint_connection]) 170 | for j in range(model.num_parts): 171 | if len(new_to_old[j]) > 0: 172 | joints = joint_location[np.where((new_joint_connection == old_to_new[j]) & ( 173 | new_joint_connection[:, :1] != new_joint_connection[:, 1:]))[0]] 174 | if len(joints) > 1: 175 | color = format(j * 600000, '06x') 176 | draw_polygon(joints, color) 177 | elif len(joints) == 1: 178 | plt.plot([joints[0, 0], joint_2d[j, 0]], 179 | [joints[0, 1], joint_2d[j, 1]], c="b") 180 | else: 181 | break 182 | if text: 183 | plt.text(joint_2d[j, 0], joint_2d[j, 1], j, fontsize="small") 184 | 185 | plt.axis("off") 186 | plt.subplots_adjust(left=0, right=1, bottom=0, top=1) 187 | 188 | buf = io.BytesIO() 189 | plt.savefig(buf, format='png', dpi=150) 190 | enc = np.frombuffer(buf.getvalue(), dtype=np.uint8) 191 | dst = cv2.imdecode(enc, 1)[:, :, ::-1] 192 | joint_figure.append(dst) 193 | plt.clf() 194 | 195 | out = 1 - mask_each_view[i].cpu().numpy()[:, :, None][:, :, [0, 0, 0]] / 2 196 | plt.imshow(out, vmin=0, vmax=1, alpha=0.2) 197 | 198 | new_joint_2d = [np.mean(joint_2d[new_to_old[_]], axis=0) if len(new_to_old[_]) > 0 else None 199 | for _ in range(model.num_parts)] 200 | for j in range(model.num_parts): 201 | if len(new_to_old[j]) > 0: 202 | if text: 203 | plt.text(new_joint_2d[old_to_new[j]][0], new_joint_2d[old_to_new[j]][1], j, 204 | fontsize="small") 205 | if j == model.num_parts - 1: 206 | break 207 | for njc in new_joint_connection: 208 | if njc[0] != njc[1]: 209 | plt.plot([new_joint_2d[njc[0]][0], new_joint_2d[njc[1]][0]], 210 | [new_joint_2d[njc[0]][1], new_joint_2d[njc[1]][1]]) 211 | 212 | plt.axis("off") 213 | plt.subplots_adjust(left=0, right=1, bottom=0, top=1) 214 | 215 | buf = io.BytesIO() 216 | plt.savefig(buf, format='png', dpi=150) 217 | enc = np.frombuffer(buf.getvalue(), dtype=np.uint8) 218 | dst = cv2.imdecode(enc, 1)[:, :, ::-1] 219 | center_figure.append(dst) 220 | plt.clf() 221 | 222 | return joint_figure, center_figure 223 | 224 | joint_figure, center_figure = create_figures(text=True) 225 | 
os.makedirs(f"{self.save_dir}/merge", exist_ok=True) 226 | for idx, jf in enumerate(joint_figure): 227 | cv2.imwrite(f"{self.save_dir}/merge/joints_{idx:0>4}.png", jf) 228 | for idx, cf in enumerate(center_figure): 229 | cv2.imwrite(f"{self.save_dir}/merge/centers_{idx:0>4}.png", cf) 230 | 231 | joint_figure, center_figure = create_figures(text=False) 232 | os.makedirs(f"{self.save_dir}/merge", exist_ok=True) 233 | for idx, jf in enumerate(joint_figure): 234 | cv2.imwrite(f"{self.save_dir}/merge/joints_notext_{idx:0>4}.png", jf) 235 | for idx, cf in enumerate(center_figure): 236 | cv2.imwrite(f"{self.save_dir}/merge/centers_notext_{idx:0>4}.png", cf) 237 | 238 | 239 | if __name__ == "__main__": 240 | parser = argparse.ArgumentParser(description='Part merging') 241 | parser.add_argument('--exp_name', required=True, type=str) 242 | parser.add_argument('--camera_id', required=True, type=int) 243 | parser.add_argument('--iteration', default=-1, type=int) 244 | args = parser.parse_args() 245 | 246 | exp_name = args.exp_name 247 | camera_id = args.camera_id 248 | config_path = f"confs/{exp_name}.yml" 249 | 250 | default_config = "confs/default.yml" 251 | 252 | main(config_path, default_config, camera_id) 253 | -------------------------------------------------------------------------------- /src/visualize/repose_configs/cassie.yml: -------------------------------------------------------------------------------- 1 | #SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | #SPDX-License-Identifier: LicenseRef-NvidiaProprietary 3 | #NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 4 | #property and proprietary rights in and to this material, related 5 | #documentation and any modifications thereto. Any use, reproduction, 6 | #disclosure or distribution of this material and related documentation 7 | #without an express license agreement from NVIDIA CORPORATION or 8 | #its affiliates is strictly prohibited. 9 | 10 | 11 | camera_id: 3 12 | frame_id: 0 13 | root: 8 14 | first: [ [ 0, [ 0, -2.0943, 0 ] ], [ 6, [ 0, 1.0471, 0 ] ] ] 15 | second: [ [ 3, [ 0, -2.0943, 0 ] ], [ 4, [ 0, 1.0471, 0 ] ] ] -------------------------------------------------------------------------------- /src/visualize/repose_configs/iiwa.yml: -------------------------------------------------------------------------------- 1 | #SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | #SPDX-License-Identifier: LicenseRef-NvidiaProprietary 3 | #NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 4 | #property and proprietary rights in and to this material, related 5 | #documentation and any modifications thereto. Any use, reproduction, 6 | #disclosure or distribution of this material and related documentation 7 | #without an express license agreement from NVIDIA CORPORATION or 8 | #its affiliates is strictly prohibited. 9 | 10 | 11 | camera_id: 1 12 | frame_id: 0 13 | root: 7 14 | first: [ [ 6, [ 0, -2.0943, 0 ] ] ] 15 | second: [ [ 5, [ 0, 2.0943, 0 ] ] ] -------------------------------------------------------------------------------- /src/visualize/repose_configs/spot.yml: -------------------------------------------------------------------------------- 1 | #SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
2 | #SPDX-License-Identifier: LicenseRef-NvidiaProprietary 3 | #NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 4 | #property and proprietary rights in and to this material, related 5 | #documentation and any modifications thereto. Any use, reproduction, 6 | #disclosure or distribution of this material and related documentation 7 | #without an express license agreement from NVIDIA CORPORATION or 8 | #its affiliates is strictly prohibited. 9 | 10 | 11 | camera_id: 0 12 | frame_id: 0 13 | root: 2 14 | first: [ [ 0, [ 0, -2.0943, 0 ] ],[ 4, [ 0, 1.0471, 0 ] ],[ 7, [ 0, -2.0943, 0 ] ],[ 11, [ 0, 1.0471, 0 ] ] ] 15 | second: [ [ 12, [ 0, -2.0943, 0 ] ],[ 8, [ 0, 1.0471, 0 ] ],[ 3, [ 0, -2.0943, 0 ] ],[ 9, [ 0, 1.0471, 0 ] ] ] -------------------------------------------------------------------------------- /src/visualize/repose_person_by_driving_pose.py: -------------------------------------------------------------------------------- 1 | """ 2 | SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | SPDX-License-Identifier: LicenseRef-NvidiaProprietary 4 | NVIDIA CORPORATION, its affiliates and licensors retain all intellectual 5 | property and proprietary rights in and to this material, related 6 | documentation and any modifications thereto. Any use, reproduction, 7 | disclosure or distribution of this material and related documentation 8 | without an express license agreement from NVIDIA CORPORATION or 9 | its affiliates is strictly prohibited. 10 | """ 11 | 12 | import argparse 13 | import os 14 | import pickle 15 | import sys 16 | from typing import List 17 | 18 | import cv2 19 | import numpy as np 20 | import torch 21 | from easydict import EasyDict as edict 22 | from torch import nn 23 | from torch.utils.data import DataLoader 24 | from tqdm import tqdm 25 | 26 | sys.path.append(".") 27 | from models.model import SingleVideoPartDecomposition 28 | from utils.get_args import get_args_jupyter 29 | from utils.train_utils import to_gpu, create_dataloaders 30 | 31 | 32 | def generate_video_from_pose(model: nn.Module, train_loader: DataLoader, num_video_frames: int, camera_id: int, 33 | joint_rotation: torch.Tensor, joint_translation: torch.Tensor, 34 | rotate: bool = False, frame_id: int = 0): 35 | """ 36 | 37 | Args: 38 | model: 39 | train_loader: 40 | num_video_frames: 41 | camera_id: 42 | joint_rotation: 43 | joint_translation: 44 | rotate: 45 | frame_id: 46 | 47 | Returns: 48 | 49 | """ 50 | num_frames = train_loader.dataset.num_frames 51 | minibatch = train_loader.dataset[camera_id * num_frames] 52 | minibatch = {k: torch.tensor(v) for k, v in minibatch.items()} 53 | minibatch = to_gpu(minibatch) 54 | camera_rotation = minibatch["camera_rotation"][None] 55 | camera_translation = minibatch["camera_translation"][None] 56 | inv_intrinsics = torch.inverse(minibatch["camera_intrinsic"])[None] 57 | img = minibatch["img"].cpu().numpy() 58 | 59 | video = [] 60 | 61 | model.eval() 62 | rotate_angle = 0 63 | frame_interval = 5 64 | with torch.no_grad(): 65 | for i in tqdm(range(num_video_frames)): 66 | frame_id += frame_interval 67 | if rotate: 68 | rotate_angle = i / 20 * (2 * np.pi) 69 | if frame_id >= len(joint_translation): 70 | break 71 | out_dict = model.render_entire_img(None, camera_rotation, camera_translation, 72 | inv_intrinsics, segmentation_label=False, 73 | rotate_angle=rotate_angle, ray_batchsize=10000, 74 | part_pose=(joint_rotation[frame_id], joint_translation[frame_id])) 75 | color = out_dict["rendered_colors"] 
76 | mask = out_dict["rendered_masks"] 77 | segmentation = out_dict["segmentation_colors"] 78 | 79 | color = (color + (1 - mask[None])).cpu().numpy().transpose(1, 2, 0) 80 | segmentation = (segmentation + (1 - mask[None])).cpu().numpy().transpose(1, 2, 0) 81 | color = np.concatenate([img.transpose(1, 2, 0), color, segmentation], axis=1) 82 | video.append(np.clip(color * 127.5 + 127.5, 0, 255).astype("uint8")) 83 | 84 | return video 85 | 86 | 87 | def save_video(frames: List[np.ndarray], file_name: str, fps: int = 10, n_repeat: int = 10): 88 | """ 89 | 90 | Args: 91 | frames: 92 | file_name: 93 | fps: 94 | n_repeat: 95 | 96 | Returns: 97 | 98 | """ 99 | size = (frames[0].shape[-2], frames[0].shape[-3]) 100 | 101 | fmt = cv2.VideoWriter_fourcc('m', 'p', '4', 'v') 102 | writer = cv2.VideoWriter(file_name, fmt, fps, size) 103 | 104 | for i in range(n_repeat): 105 | for frame in frames: 106 | writer.write(frame[:, :, ::-1]) 107 | 108 | writer.release() 109 | 110 | 111 | def save_png(frames: List[np.ndarray], dir_name: str): 112 | """ 113 | 114 | Args: 115 | frames: 116 | dir_name: 117 | 118 | Returns: 119 | 120 | """ 121 | os.makedirs(dir_name, exist_ok=True) 122 | for i, frame in enumerate(frames): 123 | img_size = frame.shape[0] 124 | cv2.imwrite(f'{dir_name}/gt_{i:0>5}.png', frame[:, :img_size, ::-1]) 125 | cv2.imwrite(f'{dir_name}/gen_{i:0>5}.png', frame[:, img_size:img_size * 2, ::-1]) 126 | cv2.imwrite(f'{dir_name}/seg_{i:0>5}.png', frame[:, img_size * 2:, ::-1]) 127 | 128 | 129 | def get_mapping(model: nn.Module, config: edict, train_loader: DataLoader, use_smpl_verts: bool) -> torch.Tensor: 130 | """ 131 | 132 | Args: 133 | model: 134 | config: 135 | train_loader: 136 | use_smpl_verts: 137 | 138 | Returns: 139 | 140 | """ 141 | with torch.no_grad(): 142 | num_frames = config.dataset.num_frames 143 | coordinate_scale = train_loader.dataset.coordinate_scale 144 | frame_id = torch.arange(config.dataset.num_frames, dtype=torch.float, device="cuda") 145 | trajectory = model.joint_trajectory(frame_id) 146 | 147 | rotation, translation = trajectory 148 | video_len = rotation.shape[0] 149 | 150 | child_root, self_root = model.decoder.joint_root_locations(rotation, translation) 151 | estimated_keypoints = torch.cat([child_root, self_root], dim=-1) 152 | 153 | estimated_keypoints = estimated_keypoints.permute(0, 1, 3, 2) 154 | 155 | if use_smpl_verts: 156 | smpl_verts_path = os.path.join(config.dataset.data_root, "smpl_verts.pickle") 157 | with open(smpl_verts_path, "rb") as f: 158 | smpl_keypoints = pickle.load(f)["smpl_verts"] 159 | smpl_keypoints = smpl_keypoints / coordinate_scale 160 | smpl_keypoints = torch.tensor(smpl_keypoints, device=rotation.device, dtype=torch.float) 161 | smpl_keypoints = smpl_keypoints[:num_frames] 162 | 163 | else: 164 | smpl_pose = torch.tensor(train_loader.dataset.video_cache["smpl_pose"], device=rotation.device, 165 | dtype=torch.float).clone() 166 | smpl_pose[:, :, :3, 3] /= coordinate_scale 167 | 168 | smpl_keypoints = smpl_pose[:num_frames, :, :3].clone() 169 | smpl_keypoints[:, :, :3, :3] *= 0.1 170 | smpl_keypoints[:, :, :3, :3] += smpl_keypoints[:, :, :3, 3:] 171 | smpl_keypoints = smpl_keypoints.permute(0, 1, 3, 2).reshape(video_len, -1, 3) 172 | 173 | _estimated_keypoints = estimated_keypoints.reshape(video_len, -1, 3).permute(0, 2, 1).reshape(video_len * 3, 174 | -1).cpu() 175 | _smpl_keypoints = smpl_keypoints.permute(0, 2, 1).reshape(video_len * 3, -1).cpu() 176 | 177 | lam = 5e-1 if use_smpl_verts else 1e-1 178 | s2j_mapping = torch.inverse( 179 | 
_smpl_keypoints.T.matmul(_smpl_keypoints) + torch.eye(_smpl_keypoints.shape[1]) * lam).matmul( 180 | _smpl_keypoints.T).matmul(_estimated_keypoints) 181 | 182 | return s2j_mapping 183 | 184 | 185 | def repose(config_path: str, default_config: str, driving_person_id: int, num_video_frames: int, use_smpl_verts: bool 186 | ) -> None: 187 | """ 188 | 189 | Args: 190 | config_path: 191 | default_config: 192 | driving_person_id: 193 | num_video_frames: 194 | use_smpl_verts: use smpl vertices for regression or not 195 | 196 | Returns: 197 | 198 | """ 199 | args, config = get_args_jupyter(config_path, default_config) 200 | config.dataset.batchsize = 1 201 | 202 | train_loader = create_dataloaders(config.dataset, shuffle=True) 203 | 204 | train_loader.dataset.n_repetition_in_epoch = 1 205 | train_loader.dataset.color_augmentation = False 206 | train_loader.dataset.camera_dir_augmentation = False 207 | train_loader.dataset.background_color = 1 208 | 209 | out_dir = config.output_dir 210 | exp_name = config.exp_name 211 | 212 | same_person = driving_person_id == -1 213 | 214 | # model 215 | model = SingleVideoPartDecomposition(config.network_params) 216 | model.cuda() 217 | 218 | save_dir = os.path.join(out_dir, "result", exp_name) 219 | model_path = os.path.join(save_dir, "snapshot_latest.pth") 220 | if os.path.exists(model_path): 221 | snapshot = torch.load(model_path) 222 | state_dict = snapshot["model"] 223 | model.load_state_dict(state_dict, strict=False) 224 | else: 225 | print("model is not loaded") 226 | 227 | train_loader.dataset.current_max_frame_id = train_loader.dataset.num_frames 228 | 229 | num_frames = train_loader.dataset.num_frames 230 | coordinate_scale = train_loader.dataset.coordinate_scale 231 | 232 | s2j_mapping = get_mapping(model, config, train_loader, use_smpl_verts) 233 | 234 | if use_smpl_verts: 235 | if same_person: 236 | data_root = config.dataset.data_root 237 | else: 238 | person_id = str(driving_person_id) 239 | data_root = f"../data/zju_mocap/cache512/{person_id}/" 240 | smpl_verts_path = os.path.join(data_root, "smpl_verts.pickle") 241 | with open(smpl_verts_path, "rb") as f: 242 | smpl_keypoints = pickle.load(f)["smpl_verts"] 243 | smpl_keypoints = smpl_keypoints / coordinate_scale 244 | smpl_translation = torch.tensor(smpl_keypoints, dtype=torch.float) 245 | else: 246 | if same_person: 247 | smpl_pose = torch.tensor(train_loader.dataset.video_cache["smpl_pose"], dtype=torch.float).clone() 248 | else: 249 | # read other smpl sequence 250 | person_id = str(driving_person_id) 251 | with open(f"../data/zju_mocap/cache512/{person_id}/cache_train.pickle", "rb") as f: 252 | smpl_pose = torch.tensor(pickle.load(f)["smpl_pose"], dtype=torch.float).clone() 253 | 254 | smpl_pose[:, :, :3, 3] /= coordinate_scale 255 | 256 | smpl_translation = smpl_pose[:, :, :3].clone() 257 | smpl_translation[:, :, :3, :3] *= 0.1 258 | smpl_translation[:, :, :3, :3] += smpl_translation[:, :, :3, 3:] 259 | smpl_translation = smpl_translation.permute(0, 1, 3, 2).reshape(-1, 23 * 4, 3) 260 | 261 | estimated_367_translation = s2j_mapping.T @ smpl_translation # (L, 140, 3) 262 | 263 | # org pose 264 | with torch.no_grad(): 265 | child_root_org, self_root_org = model.decoder.joint_root_locations(torch.eye(3, device="cuda", 266 | dtype=torch.float), 267 | torch.zeros(model.num_parts, 3, 1, 268 | device="cuda", 269 | dtype=torch.float)) 270 | estimated_keypoints_org = torch.cat([child_root_org, self_root_org], dim=-1) 271 | estimated_keypoints_org = estimated_keypoints_org.permute(0, 2, 1).cpu() 272 | 
273 | estimated_367_translation = estimated_367_translation.reshape(estimated_367_translation.shape[0], model.num_parts, 274 | 7, 3) 275 | estimated_367_translation_centered = estimated_367_translation - estimated_367_translation[:, :, -1:] 276 | U, S, Vh = torch.linalg.svd(estimated_367_translation_centered.permute(0, 1, 3, 2) @ estimated_keypoints_org) 277 | R = Vh.permute(0, 1, 3, 2) @ U.permute(0, 1, 3, 2) 278 | det = torch.linalg.det(R) 279 | Vh[:, :, 2] = Vh[:, :, 2] * det[:, :, None] 280 | R = Vh.permute(0, 1, 3, 2) @ U.permute(0, 1, 3, 2) 281 | 282 | frame_id = num_frames if same_person else 0 283 | camera_id = 0 284 | rotate = False 285 | 286 | video = generate_video_from_pose(model, train_loader, num_video_frames, camera_id, 287 | rotate=rotate, 288 | joint_rotation=R.permute(0, 1, 3, 2).cuda(), 289 | joint_translation=estimated_367_translation[:, :, -1, :, None].cuda(), 290 | frame_id=frame_id) 291 | save_video(video, os.path.join( 292 | save_dir, 293 | 'drive_' + 'verts_' * use_smpl_verts + f'{driving_person_id}' * (not same_person) + f'_{camera_id}.mp4')) 294 | 295 | save_png(video, os.path.join( 296 | save_dir, 297 | 'drive_' + 'verts_' * use_smpl_verts + f'{driving_person_id}' * (not same_person) + f'_{camera_id}')) 298 | 299 | 300 | if __name__ == "__main__": 301 | parser = argparse.ArgumentParser(description='Create reposing videos with test poses') 302 | parser.add_argument('--exp_name', required=True, type=str) 303 | parser.add_argument('--camera_id', required=True, type=int) 304 | parser.add_argument('--num_video_frames', type=int, default=100) 305 | parser.add_argument('--driving_person_id', type=int, default=-1, 306 | help="Driving person id is same as input person id if -1") 307 | args = parser.parse_args() 308 | 309 | exp_name = args.exp_name 310 | num_video_frames = args.num_video_frames 311 | driving_person_id = args.driving_person_id 312 | 313 | default_config = "confs/default.yml" 314 | 315 | config_path = f"confs/{exp_name}.yml" 316 | repose(config_path, default_config, driving_person_id, num_video_frames, use_smpl_verts=True) 317 | --------------------------------------------------------------------------------