├── lib ├── core │ ├── __init__.py │ ├── config.py │ ├── evaluate.py │ └── loss.py ├── utils │ ├── __init__.py │ ├── one_euro_filter.py │ ├── smooth_pose.py │ ├── pose_tracker.py │ ├── smooth_bbox.py │ ├── renderer.py │ ├── utils.py │ ├── eval_utils.py │ └── demo_utils.py ├── models │ ├── __init__.py │ ├── attention.py │ ├── motion_discriminator.py │ ├── smpl.py │ └── vibe.py ├── dataset │ ├── __init__.py │ ├── penn_action.py │ ├── posetrack.py │ ├── threedpw.py │ ├── mpii3d.py │ ├── amass.py │ ├── insta.py │ ├── loaders.py │ ├── inference.py │ ├── dataset_2d.py │ └── dataset_3d.py ├── data_utils │ ├── feature_extractor.py │ ├── penn_action_utils.py │ ├── amass_utils.py │ ├── posetrack_utils.py │ └── threedpw_utils.py └── smplify │ ├── prior.py │ ├── losses.py │ └── temporal_smplify.py ├── girl_dance.mp4 ├── .gitignore ├── scripts ├── prepare_data.sh ├── install_pip.sh ├── install_conda.sh └── prepare_training_data.sh ├── doc ├── eval.md ├── train.md └── demo.md ├── requirements.txt ├── .github └── ISSUE_TEMPLATE │ ├── bug-report.md │ └── feature_request.md ├── configs ├── config.yaml └── config_wo_3dpw.yaml ├── tests ├── test_2d_datasets.py └── test_3d_datasets.py ├── eval.py ├── vibe_demo.ipynb ├── train.py ├── LICENSE └── README.md /lib/core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /girl_dance.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cedro3/VIBE/master/girl_dance.mp4 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | data 3 | __pycache__/ 4 | vibe-env/ 5 | output/ 6 | *.mp4 7 | results 8 | -------------------------------------------------------------------------------- /lib/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .vibe import VIBE 2 | from .motion_discriminator import MotionDiscriminator 3 | -------------------------------------------------------------------------------- /lib/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | from .dataset_2d import Dataset2D 2 | from .dataset_3d import Dataset3D 3 | 4 | from .insta import Insta 5 | from .amass import AMASS 6 | from .mpii3d import MPII3D 7 | from .threedpw import ThreeDPW 8 | from .posetrack import PoseTrack 9 | from .penn_action import PennAction 10 | 11 | -------------------------------------------------------------------------------- /scripts/prepare_data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | mkdir -p data 4 | cd data 5 | gdown "https://drive.google.com/uc?id=1untXhYOLQtpNEy4GTY_0fL_H-k6cTf_r" 6 | unzip vibe_data.zip 7 | rm vibe_data.zip 8 | cd .. 9 | mv data/vibe_data/sample_video.mp4 . 
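# place the YOLOv3 detector weights in torch's default model cache ($HOME/.torch/models), created below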
10 | mkdir -p $HOME/.torch/models/ 11 | mv data/vibe_data/yolov3.weights $HOME/.torch/models/ 12 | -------------------------------------------------------------------------------- /scripts/install_pip.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | echo "Creating virtual environment" 4 | python3.7 -m venv vibe-env 5 | echo "Activating virtual environment" 6 | 7 | source $PWD/vibe-env/bin/activate 8 | 9 | $PWD/vibe-env/bin/pip install numpy==1.17.5 torch==1.4.0 torchvision==0.5.0 10 | $PWD/vibe-env/bin/pip install git+https://github.com/giacaglia/pytube.git --upgrade 11 | $PWD/vibe-env/bin/pip install -r requirements.txt 12 | -------------------------------------------------------------------------------- /scripts/install_conda.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export CONDA_ENV_NAME=vibe-env 4 | echo $CONDA_ENV_NAME 5 | 6 | conda create -n $CONDA_ENV_NAME python=3.7 7 | 8 | eval "$(conda shell.bash hook)" 9 | conda activate $CONDA_ENV_NAME 10 | 11 | which python 12 | which pip 13 | 14 | pip install numpy==1.17.5 torch==1.4.0 torchvision==0.5.0 15 | pip install git+https://github.com/giacaglia/pytube.git --upgrade 16 | pip install -r requirements.txt 17 | -------------------------------------------------------------------------------- /doc/eval.md: -------------------------------------------------------------------------------- 1 | # Evaluation 2 | 3 | Run the command below to evaluate a pretrained model. 4 | 5 | ```shell script 6 | python eval.py --cfg configs/config.yaml 7 | ``` 8 | 9 | Change the `TRAIN.PRETRAINED` field of the config file to the checkpoint you would like to evaluate. 10 | You should be able to obtain the output below: 11 | 12 | ```shell script 13 | # TRAIN.PRETRAINED = 'data/vibe_data/vibe_model_wo_3dpw.pth.tar' 14 | ...Evaluating on 3DPW test set... 15 | MPJPE: 93.5881, PA-MPJPE: 56.5608, PVE: 113.4118, ACCEL: 27.1242, ACCEL_ERR: 27.9877 16 | ``` 17 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | tqdm==4.28.1 2 | yacs==0.1.6 3 | h5py==2.10.0 4 | numpy==1.17.5 5 | scipy==1.4.1 6 | numba==0.47.0 7 | smplx==0.1.13 8 | gdown==3.6.4 9 | PyYAML==5.3.1 10 | joblib==0.14.1 11 | pillow==6.2.1 12 | trimesh==3.5.25 13 | pyrender==0.1.36 14 | progress==1.5 15 | filterpy==1.4.5 16 | matplotlib==3.1.3 17 | tensorflow==1.15.4 18 | tensorboard==2.1.0 19 | torchvision==0.5.0 20 | scikit-image==0.16.2 21 | scikit-video==1.1.11 22 | opencv-python==4.1.2.30 23 | llvmlite==0.32.1 24 | git+https://github.com/mattloper/chumpy.git 25 | git+https://github.com/mkocabas/yolov3-pytorch.git 26 | git+https://github.com/mkocabas/multi-person-tracker.git 27 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug-report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug Report 3 | about: Use this to report bugs 4 | title: "[BUG]" 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | Thanks for your interest in our research! 11 | 12 | If you have problems running our code, please include: 13 | 14 | 1. your operating system and its version, 15 | 2. your python version, 16 | 3. your pytorch version, 17 | 4.
the stack trace of the error that you see, 18 | 19 | Specifically, if you have an issue with pyrender or OpenGL setup & installation, please refer to pyrender [docs](https://pyrender.readthedocs.io/en/latest/) or [github issues](https://github.com/mmatl/pyrender/issues). 20 | -------------------------------------------------------------------------------- /scripts/prepare_training_data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | mkdir -p ./data/vibe_db 4 | export PYTHONPATH="./:$PYTHONPATH" 5 | 6 | # AMASS 7 | python lib/data_utils/amass_utils.py --dir ./data/amass 8 | 9 | # InstaVariety 10 | # Comment this if you already downloaded the preprocessed file 11 | python lib/data_utils/insta_utils.py --dir ./data/insta_variety 12 | 13 | # 3DPW 14 | python lib/data_utils/threedpw_utils.py --dir ./data/3dpw 15 | 16 | # MPI-INF-3D-HP 17 | python lib/data_utils/mpii3d_utils.py --dir ./data/mpi_inf_3dhp 18 | 19 | # PoseTrack 20 | python lib/data_utils/posetrack_utils.py --dir ./data/posetrack 21 | 22 | # PennAction 23 | python lib/data_utils/penn_action_utils.py --dir ./data/penn_action 24 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Use this to suggest an idea for this project 4 | title: "[FEATURE]" 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 
21 | -------------------------------------------------------------------------------- /configs/config.yaml: -------------------------------------------------------------------------------- 1 | DEBUG: false 2 | DEBUG_FREQ: 5 3 | LOGDIR: '' 4 | DEVICE: 'cuda' 5 | EXP_NAME: 'vibe' 6 | OUTPUT_DIR: 'results/vibe_tests' 7 | NUM_WORKERS: 8 8 | SEED_VALUE: -1 9 | DATASET: 10 | SEQLEN: 16 11 | LOSS: 12 | KP_2D_W: 300.0 13 | KP_3D_W: 300.0 14 | SHAPE_W: 0.06 15 | POSE_W: 60.0 16 | D_MOTION_LOSS_W: 0.5 17 | TRAIN: 18 | BATCH_SIZE: 32 19 | NUM_ITERS_PER_EPOCH: 500 20 | PRETRAINED: '' 21 | PRETRAINED_REGRESSOR: 'data/vibe_data/spin_model_checkpoint.pth.tar' 22 | RESUME: '' 23 | START_EPOCH: 0 24 | END_EPOCH: 30 25 | LR_PATIENCE: 5 26 | DATA_2D_RATIO: 0.6 27 | DATASETS_2D: 28 | - 'Insta' 29 | # - 'PoseTrack' 30 | # - 'PennAction' 31 | DATASETS_3D: 32 | # - 'ThreeDPW' 33 | - 'MPII3D' 34 | DATASET_EVAL: 'ThreeDPW' 35 | GEN_LR: 0.00005 36 | GEN_WD: 0.0 37 | MOT_DISCR: 38 | OPTIM: 'Adam' 39 | LR: 0.0001 40 | WD: 0.0001 41 | MOMENTUM: 0.9 42 | HIDDEN_SIZE: 1024 43 | NUM_LAYERS: 2 44 | FEATURE_POOL: 'attention' 45 | ATT: 46 | LAYERS: 3 47 | SIZE: 1024 48 | DROPOUT: 0.2 49 | MODEL: 50 | TEMPORAL_TYPE: 'gru' 51 | TGRU: 52 | NUM_LAYERS: 2 53 | ADD_LINEAR: true 54 | RESIDUAL: true 55 | BIDIRECTIONAL: false 56 | HIDDEN_SIZE: 1024 -------------------------------------------------------------------------------- /configs/config_wo_3dpw.yaml: -------------------------------------------------------------------------------- 1 | DEBUG: false 2 | DEBUG_FREQ: 5 3 | LOGDIR: '' 4 | DEVICE: 'cuda' 5 | EXP_NAME: 'vibe' 6 | OUTPUT_DIR: 'results/vibe_wo_3dpw' 7 | NUM_WORKERS: 8 8 | SEED_VALUE: -1 9 | DATASET: 10 | SEQLEN: 16 11 | LOSS: 12 | KP_2D_W: 300.0 13 | KP_3D_W: 300.0 14 | SHAPE_W: 0.06 15 | POSE_W: 60.0 16 | D_MOTION_LOSS_W: 0.5 17 | TRAIN: 18 | BATCH_SIZE: 32 19 | NUM_ITERS_PER_EPOCH: 500 20 | PRETRAINED: '' 21 | PRETRAINED_REGRESSOR: 'data/vibe_data/spin_model_checkpoint.pth.tar' 22 | RESUME: '' 23 | START_EPOCH: 0 24 | END_EPOCH: 30 25 | LR_PATIENCE: 5 26 | DATA_2D_RATIO: 0.6 27 | DATASETS_2D: 28 | - 'Insta' 29 | # - 'PoseTrack' 30 | # - 'PennAction' 31 | DATASETS_3D: 32 | # - 'ThreeDPW' 33 | - 'MPII3D' 34 | DATASET_EVAL: 'ThreeDPW' 35 | GEN_LR: 0.00005 36 | GEN_WD: 0.0 37 | MOT_DISCR: 38 | OPTIM: 'Adam' 39 | LR: 0.0001 40 | WD: 0.0001 41 | MOMENTUM: 0.9 42 | HIDDEN_SIZE: 1024 43 | NUM_LAYERS: 2 44 | FEATURE_POOL: 'attention' 45 | ATT: 46 | LAYERS: 3 47 | SIZE: 1024 48 | DROPOUT: 0.2 49 | MODEL: 50 | TEMPORAL_TYPE: 'gru' 51 | TGRU: 52 | NUM_LAYERS: 2 53 | ADD_LINEAR: true 54 | RESIDUAL: true 55 | BIDIRECTIONAL: false 56 | HIDDEN_SIZE: 1024 -------------------------------------------------------------------------------- /lib/dataset/penn_action.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is 4 | # holder of all proprietary rights on this computer program. 5 | # You can only use this computer program if you have closed 6 | # a license agreement with MPG or you get the right to use the computer 7 | # program from someone who is authorized to grant you that right. 8 | # Any use of the computer program without a valid license is prohibited and 9 | # liable to prosecution. 10 | # 11 | # Copyright©2019 Max-Planck-Gesellschaft zur Förderung 12 | # der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute 13 | # for Intelligent Systems. All rights reserved. 
14 | # 15 | # Contact: ps-license@tuebingen.mpg.de 16 | 17 | from lib.dataset import Dataset2D 18 | from lib.core.config import PENNACTION_DIR 19 | 20 | 21 | class PennAction(Dataset2D): 22 | def __init__(self, seqlen, overlap=0.75, debug=False): 23 | db_name = 'pennaction' 24 | 25 | super(PennAction, self).__init__( 26 | seqlen=seqlen, 27 | folder=PENNACTION_DIR, 28 | dataset_name=db_name, 29 | debug=debug, 30 | overlap=overlap, 31 | ) 32 | print(f'{db_name} - number of dataset objects {self.__len__()}') 33 | -------------------------------------------------------------------------------- /lib/dataset/posetrack.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is 4 | # holder of all proprietary rights on this computer program. 5 | # You can only use this computer program if you have closed 6 | # a license agreement with MPG or you get the right to use the computer 7 | # program from someone who is authorized to grant you that right. 8 | # Any use of the computer program without a valid license is prohibited and 9 | # liable to prosecution. 10 | # 11 | # Copyright©2019 Max-Planck-Gesellschaft zur Förderung 12 | # der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute 13 | # for Intelligent Systems. All rights reserved. 14 | # 15 | # Contact: ps-license@tuebingen.mpg.de 16 | 17 | from lib.dataset import Dataset2D 18 | from lib.core.config import POSETRACK_DIR 19 | 20 | 21 | class PoseTrack(Dataset2D): 22 | def __init__(self, seqlen, overlap=0.75, folder=None, debug=False): 23 | db_name = 'posetrack' 24 | super(PoseTrack, self).__init__( 25 | seqlen=seqlen, 26 | folder=POSETRACK_DIR, 27 | dataset_name=db_name, 28 | debug=debug, 29 | overlap=overlap, 30 | ) 31 | print(f'{db_name} - number of dataset objects {self.__len__()}') 32 | -------------------------------------------------------------------------------- /tests/test_2d_datasets.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('.') 3 | 4 | import torch 5 | import numpy as np 6 | import skimage.io as io 7 | import matplotlib.pyplot as plt 8 | from torch.utils.data import DataLoader 9 | 10 | from lib.dataset import * 11 | from lib.utils.vis import batch_draw_skeleton, batch_visualize_preds 12 | 13 | 14 | def debug_2d_data(dataset, DEBUG=True): 15 | is_train = True 16 | seqlen = 32 17 | batch_size = 1 18 | db = eval(dataset)(seqlen=seqlen, debug=DEBUG) 19 | 20 | dataloader = DataLoader( 21 | dataset=db, 22 | batch_size=batch_size, 23 | shuffle=True, 24 | num_workers=1, 25 | ) 26 | 27 | for i, target in enumerate(dataloader): 28 | for k, v in target.items(): 29 | print(k, v.shape) 30 | 31 | if DEBUG: 32 | if dataset == 'Insta': # compare strings by value, not identity 33 | input = torch.ones(batch_size, seqlen, 3, 224, 224)[0] 34 | else: 35 | input = target['video'][0] 36 | single_target = {k: v[0] for k, v in target.items()} 37 | 38 | dataset_name = 'spin' 39 | plt.figure(figsize=(19.2,10.8)) 40 | images = batch_draw_skeleton(input, single_target, dataset=dataset_name, max_images=4) 41 | plt.imshow(images) 42 | plt.show() 43 | 44 | if i == 20: 45 | break 46 | 47 | 48 | if __name__ == '__main__': 49 | debug_2d_data('Insta', DEBUG=True) 50 | -------------------------------------------------------------------------------- /lib/utils/one_euro_filter.py: -------------------------------------------------------------------------------- 1 | import math 2 |
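# One Euro filter (Casiez et al., CHI 2012): a first-order low-pass filter whose cutoff frequency adapts to the signal's speed -- cutoff = min_cutoff + beta * |dx_hat| -- suppressing jitter at low speeds while keeping lag small during fast motion.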
import numpy as np 3 | 4 | 5 | def smoothing_factor(t_e, cutoff): 6 | r = 2 * math.pi * cutoff * t_e 7 | return r / (r + 1) 8 | 9 | 10 | def exponential_smoothing(a, x, x_prev): 11 | return a * x + (1 - a) * x_prev 12 | 13 | 14 | class OneEuroFilter: 15 | def __init__(self, t0, x0, dx0=0.0, min_cutoff=1.0, beta=0.0, 16 | d_cutoff=1.0): 17 | """Initialize the one euro filter.""" 18 | # The parameters. 19 | self.min_cutoff = float(min_cutoff) 20 | self.beta = float(beta) 21 | self.d_cutoff = float(d_cutoff) 22 | # Previous values. 23 | self.x_prev = x0 24 | self.dx_prev = dx0 25 | self.t_prev = t0 26 | 27 | def __call__(self, t, x): 28 | """Compute the filtered signal.""" 29 | t_e = t - self.t_prev 30 | 31 | # The filtered derivative of the signal. 32 | a_d = smoothing_factor(t_e, self.d_cutoff) 33 | dx = (x - self.x_prev) / t_e 34 | dx_hat = exponential_smoothing(a_d, dx, self.dx_prev) 35 | 36 | # The filtered signal. 37 | cutoff = self.min_cutoff + self.beta * np.abs(dx_hat) 38 | a = smoothing_factor(t_e, cutoff) 39 | x_hat = exponential_smoothing(a, x, self.x_prev) 40 | 41 | # Memorize the previous values. 42 | self.x_prev = x_hat 43 | self.dx_prev = dx_hat 44 | self.t_prev = t 45 | 46 | return x_hat 47 | -------------------------------------------------------------------------------- /lib/dataset/threedpw.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is 4 | # holder of all proprietary rights on this computer program. 5 | # You can only use this computer program if you have closed 6 | # a license agreement with MPG or you get the right to use the computer 7 | # program from someone who is authorized to grant you that right. 8 | # Any use of the computer program without a valid license is prohibited and 9 | # liable to prosecution. 10 | # 11 | # Copyright©2019 Max-Planck-Gesellschaft zur Förderung 12 | # der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute 13 | # for Intelligent Systems. All rights reserved. 14 | # 15 | # Contact: ps-license@tuebingen.mpg.de 16 | 17 | from lib.dataset import Dataset3D 18 | from lib.core.config import THREEDPW_DIR 19 | 20 | class ThreeDPW(Dataset3D): 21 | def __init__(self, set, seqlen, overlap=0.75, debug=False): 22 | db_name = '3dpw' 23 | 24 | # during testing we don't need data augmentation 25 | # but we can use it as an ensemble 26 | is_train = False 27 | overlap = overlap if is_train else 0. 28 | print('3DPW Dataset overlap ratio: ', overlap) 29 | super(ThreeDPW, self).__init__( 30 | set=set, 31 | folder=THREEDPW_DIR, 32 | seqlen=seqlen, 33 | overlap=overlap, 34 | dataset_name=db_name, 35 | debug=debug, 36 | ) 37 | print(f'{db_name} - number of dataset objects {self.__len__()}') -------------------------------------------------------------------------------- /lib/dataset/mpii3d.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is 4 | # holder of all proprietary rights on this computer program. 5 | # You can only use this computer program if you have closed 6 | # a license agreement with MPG or you get the right to use the computer 7 | # program from someone who is authorized to grant you that right. 8 | # Any use of the computer program without a valid license is prohibited and 9 | # liable to prosecution. 
10 | # 11 | # Copyright©2019 Max-Planck-Gesellschaft zur Förderung 12 | # der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute 13 | # for Intelligent Systems. All rights reserved. 14 | # 15 | # Contact: ps-license@tuebingen.mpg.de 16 | 17 | from lib.dataset import Dataset3D 18 | from lib.core.config import MPII3D_DIR 19 | 20 | 21 | class MPII3D(Dataset3D): 22 | def __init__(self, set, seqlen, overlap=0, debug=False): 23 | db_name = 'mpii3d' 24 | 25 | # during testing we don't need data augmentation 26 | # but we can use it as an ensemble 27 | is_train = set == 'train' 28 | overlap = overlap if is_train else 0. 29 | print('MPII3D Dataset overlap ratio: ', overlap) 30 | super(MPII3D, self).__init__( 31 | set = set, 32 | folder=MPII3D_DIR, 33 | seqlen=seqlen, 34 | overlap=overlap, 35 | dataset_name=db_name, 36 | debug=debug, 37 | ) 38 | print(f'{db_name} - number of dataset objects {self.__len__()}') -------------------------------------------------------------------------------- /eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | 4 | from lib.dataset import ThreeDPW 5 | from lib.models import VIBE 6 | from lib.core.evaluate import Evaluator 7 | from lib.core.config import parse_args 8 | from torch.utils.data import DataLoader 9 | 10 | 11 | def main(cfg): 12 | print('...Evaluating on 3DPW test set...') 13 | 14 | model = VIBE( 15 | n_layers=cfg.MODEL.TGRU.NUM_LAYERS, 16 | batch_size=cfg.TRAIN.BATCH_SIZE, 17 | seqlen=cfg.DATASET.SEQLEN, 18 | hidden_size=cfg.MODEL.TGRU.HIDDEN_SIZE, 19 | pretrained=cfg.TRAIN.PRETRAINED_REGRESSOR, 20 | add_linear=cfg.MODEL.TGRU.ADD_LINEAR, 21 | bidirectional=cfg.MODEL.TGRU.BIDIRECTIONAL, 22 | use_residual=cfg.MODEL.TGRU.RESIDUAL, 23 | ).to(cfg.DEVICE) 24 | 25 | if cfg.TRAIN.PRETRAINED != '' and os.path.isfile(cfg.TRAIN.PRETRAINED): 26 | checkpoint = torch.load(cfg.TRAIN.PRETRAINED) 27 | best_performance = checkpoint['performance'] 28 | model.load_state_dict(checkpoint['gen_state_dict']) 29 | print(f'==> Loaded pretrained model from {cfg.TRAIN.PRETRAINED}...') 30 | print(f'Performance on 3DPW test set {best_performance}') 31 | else: 32 | print(f'{cfg.TRAIN.PRETRAINED} is not a pretrained model!!!!') 33 | exit() 34 | 35 | test_db = ThreeDPW(set='test', seqlen=cfg.DATASET.SEQLEN, debug=cfg.DEBUG) 36 | 37 | test_loader = DataLoader( 38 | dataset=test_db, 39 | batch_size=cfg.TRAIN.BATCH_SIZE, 40 | shuffle=False, 41 | num_workers=cfg.NUM_WORKERS, 42 | ) 43 | 44 | Evaluator( 45 | model=model, 46 | device=cfg.DEVICE, 47 | test_loader=test_loader, 48 | ).run() 49 | 50 | 51 | if __name__ == '__main__': 52 | cfg, cfg_file = parse_args() 53 | 54 | main(cfg) 55 | -------------------------------------------------------------------------------- /tests/test_3d_datasets.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('.') 3 | import time 4 | from lib.dataset import * 5 | import matplotlib.pyplot as plt 6 | from torch.utils.data import DataLoader 7 | from lib.models.smpl import SMPL, SMPL_MODEL_DIR 8 | from lib.utils.vis import batch_draw_skeleton, batch_visualize_preds 9 | 10 | dataset = 'MPII3D' 11 | seqlen = 16 12 | DEBUG = True 13 | 14 | db = eval(dataset)(set='val', seqlen=seqlen, debug=DEBUG) 15 | 16 | dataloader = DataLoader( 17 | dataset=db, 18 | batch_size=4, 19 | shuffle=True, 20 | num_workers=1, 21 | ) 22 | 23 | smpl = SMPL(SMPL_MODEL_DIR) 24 | 25 | start = time.time() 26 | for i, target in 
enumerate(dataloader): 27 | data_time = time.time() - start 28 | start = time.time() 29 | print(f'Data loading time {data_time:.4f}') 30 | 31 | for k, v in target.items(): 32 | print(k, v.shape) 33 | 34 | if DEBUG: 35 | input = target['video'][0] 36 | single_target = {k: v[0] for k, v in target.items()} 37 | 38 | if dataset == 'MPII3D': 39 | images = batch_draw_skeleton(input, single_target, dataset='spin', max_images=4) 40 | plt.imshow(images) 41 | plt.show() 42 | else: 43 | theta = single_target['theta'] 44 | pose, shape = theta[:, 3:75], theta[:, 75:] 45 | 46 | # verts, j3d, smpl_j3d = smpl(pose, shape) 47 | 48 | pred_output = smpl(betas=shape, body_pose=pose[:, 3:], global_orient=pose[:, :3], pose2rot=True) 49 | 50 | single_target['verts'] = pred_output.vertices 51 | 52 | images = batch_visualize_preds(input, single_target, single_target, max_images=4, dataset='spin') 53 | # images = batch_draw_skeleton(input, single_target, dataset='common', max_images=10) 54 | plt.imshow(images) 55 | plt.show() 56 | 57 | if i == 100: 58 | break -------------------------------------------------------------------------------- /lib/dataset/amass.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is 4 | # holder of all proprietary rights on this computer program. 5 | # You can only use this computer program if you have closed 6 | # a license agreement with MPG or you get the right to use the computer 7 | # program from someone who is authorized to grant you that right. 8 | # Any use of the computer program without a valid license is prohibited and 9 | # liable to prosecution. 10 | # 11 | # Copyright©2019 Max-Planck-Gesellschaft zur Förderung 12 | # der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute 13 | # for Intelligent Systems. All rights reserved. 14 | # 15 | # Contact: ps-license@tuebingen.mpg.de 16 | 17 | import torch 18 | import joblib 19 | import numpy as np 20 | import os.path as osp 21 | from torch.utils.data import Dataset 22 | 23 | from lib.core.config import VIBE_DB_DIR 24 | from lib.data_utils.img_utils import split_into_chunks 25 | 26 | class AMASS(Dataset): 27 | def __init__(self, seqlen): 28 | self.seqlen = seqlen 29 | 30 | self.stride = seqlen 31 | 32 | self.db = self.load_db() 33 | self.vid_indices = split_into_chunks(self.db['vid_name'], self.seqlen, self.stride) 34 | del self.db['vid_name'] 35 | print(f'AMASS dataset number of videos: {len(self.vid_indices)}') 36 | 37 | def __len__(self): 38 | return len(self.vid_indices) 39 | 40 | def __getitem__(self, index): 41 | return self.get_single_item(index) 42 | 43 | def load_db(self): 44 | db_file = osp.join(VIBE_DB_DIR, 'amass_db.pt') 45 | db = joblib.load(db_file) 46 | return db 47 | 48 | def get_single_item(self, index): 49 | start_index, end_index = self.vid_indices[index] 50 | thetas = self.db['theta'][start_index:end_index+1] 51 | 52 | cam = np.array([1., 0., 0.])[None, ...] 
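# dummy weak-perspective camera (scale = 1, zero translation), tiled below to one entry per frame so the AMASS theta matches the (cam, pose, shape) layout used by the video datasets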
53 | cam = np.repeat(cam, thetas.shape[0], axis=0) 54 | theta = np.concatenate([cam, thetas], axis=-1) 55 | 56 | target = { 57 | 'theta': torch.from_numpy(theta).float(), # cam, pose and shape 58 | } 59 | return target 60 | 61 | 62 | 63 | -------------------------------------------------------------------------------- /lib/utils/smooth_pose.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is 4 | # holder of all proprietary rights on this computer program. 5 | # You can only use this computer program if you have closed 6 | # a license agreement with MPG or you get the right to use the computer 7 | # program from someone who is authorized to grant you that right. 8 | # Any use of the computer program without a valid license is prohibited and 9 | # liable to prosecution. 10 | # 11 | # Copyright©2019 Max-Planck-Gesellschaft zur Förderung 12 | # der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute 13 | # for Intelligent Systems. All rights reserved. 14 | # 15 | # Contact: ps-license@tuebingen.mpg.de 16 | 17 | import torch 18 | import numpy as np 19 | 20 | from lib.models.smpl import SMPL, SMPL_MODEL_DIR 21 | from lib.utils.one_euro_filter import OneEuroFilter 22 | 23 | 24 | def smooth_pose(pred_pose, pred_betas, min_cutoff=0.004, beta=0.7): 25 | # min_cutoff: Decreasing the minimum cutoff frequency decreases slow speed jitter 26 | # beta: Increasing the speed coefficient(beta) decreases speed lag. 27 | 28 | one_euro_filter = OneEuroFilter( 29 | np.zeros_like(pred_pose[0]), 30 | pred_pose[0], 31 | min_cutoff=min_cutoff, 32 | beta=beta, 33 | ) 34 | 35 | smpl = SMPL(model_path=SMPL_MODEL_DIR) 36 | 37 | pred_pose_hat = np.zeros_like(pred_pose) 38 | 39 | # initialize 40 | pred_pose_hat[0] = pred_pose[0] 41 | 42 | pred_verts_hat = [] 43 | pred_joints3d_hat = [] 44 | 45 | smpl_output = smpl( 46 | betas=torch.from_numpy(pred_betas[0]).unsqueeze(0), 47 | body_pose=torch.from_numpy(pred_pose[0, 1:]).unsqueeze(0), 48 | global_orient=torch.from_numpy(pred_pose[0, 0:1]).unsqueeze(0), 49 | ) 50 | pred_verts_hat.append(smpl_output.vertices.detach().cpu().numpy()) 51 | pred_joints3d_hat.append(smpl_output.joints.detach().cpu().numpy()) 52 | 53 | for idx, pose in enumerate(pred_pose[1:]): 54 | idx += 1 55 | 56 | t = np.ones_like(pose) * idx 57 | pose = one_euro_filter(t, pose) 58 | pred_pose_hat[idx] = pose 59 | 60 | smpl_output = smpl( 61 | betas=torch.from_numpy(pred_betas[idx]).unsqueeze(0), 62 | body_pose=torch.from_numpy(pred_pose_hat[idx, 1:]).unsqueeze(0), 63 | global_orient=torch.from_numpy(pred_pose_hat[idx, 0:1]).unsqueeze(0), 64 | ) 65 | pred_verts_hat.append(smpl_output.vertices.detach().cpu().numpy()) 66 | pred_joints3d_hat.append(smpl_output.joints.detach().cpu().numpy()) 67 | 68 | return np.vstack(pred_verts_hat), pred_pose_hat, np.vstack(pred_joints3d_hat) -------------------------------------------------------------------------------- /lib/dataset/insta.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is 4 | # holder of all proprietary rights on this computer program. 5 | # You can only use this computer program if you have closed 6 | # a license agreement with MPG or you get the right to use the computer 7 | # program from someone who is authorized to grant you that right. 
8 | # Any use of the computer program without a valid license is prohibited and 9 | # liable to prosecution. 10 | # 11 | # Copyright©2019 Max-Planck-Gesellschaft zur Förderung 12 | # der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute 13 | # for Intelligent Systems. All rights reserved. 14 | # 15 | # Contact: ps-license@tuebingen.mpg.de 16 | 17 | import h5py 18 | import torch 19 | import logging 20 | import numpy as np 21 | import os.path as osp 22 | 23 | from torch.utils.data import Dataset 24 | from lib.core.config import VIBE_DB_DIR 25 | from lib.data_utils.kp_utils import convert_kps 26 | from lib.data_utils.img_utils import normalize_2d_kp, split_into_chunks 27 | 28 | logger = logging.getLogger(__name__) 29 | 30 | class Insta(Dataset): 31 | def __init__(self, seqlen, overlap=0., debug=False): 32 | self.seqlen = seqlen 33 | self.stride = int(seqlen * (1-overlap)) 34 | 35 | self.h5_file = osp.join(VIBE_DB_DIR, 'insta_train_db.h5') 36 | 37 | with h5py.File(self.h5_file, 'r') as db: 38 | self.db = db 39 | self.vid_indices = split_into_chunks(self.db['vid_name'], self.seqlen, self.stride) 40 | 41 | print(f'InstaVariety number of dataset objects {self.__len__()}') 42 | 43 | def __len__(self): 44 | return len(self.vid_indices) 45 | 46 | def __getitem__(self, index): 47 | return self.get_single_item(index) 48 | 49 | def get_single_item(self, index): 50 | start_index, end_index = self.vid_indices[index] 51 | 52 | with h5py.File(self.h5_file, 'r') as db: 53 | self.db = db 54 | 55 | kp_2d = self.db['joints2D'][start_index:end_index + 1] 56 | kp_2d = convert_kps(kp_2d, src='insta', dst='spin') 57 | kp_2d_tensor = np.ones((self.seqlen, 49, 3), dtype=np.float16) 58 | 59 | 60 | input = torch.from_numpy(self.db['features'][start_index:end_index+1]).float() 61 | 62 | vid_name = self.db['vid_name'][start_index:end_index + 1] 63 | frame_id = self.db['frame_id'][start_index:end_index + 1].astype(str) 64 | instance_id = np.array([v.decode('ascii') + f for v, f in zip(vid_name, frame_id)]) 65 | 66 | for idx in range(self.seqlen): 67 | kp_2d[idx,:,:2] = normalize_2d_kp(kp_2d[idx,:,:2], 224) 68 | kp_2d_tensor[idx] = kp_2d[idx] 69 | 70 | target = { 71 | 'features': input, 72 | 'kp_2d': torch.from_numpy(kp_2d_tensor).float(), # 2D keypoints transformed according to bbox cropping 73 | # 'instance_id': instance_id 74 | } 75 | 76 | return target -------------------------------------------------------------------------------- /lib/dataset/loaders.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is 4 | # holder of all proprietary rights on this computer program. 5 | # You can only use this computer program if you have closed 6 | # a license agreement with MPG or you get the right to use the computer 7 | # program from someone who is authorized to grant you that right. 8 | # Any use of the computer program without a valid license is prohibited and 9 | # liable to prosecution. 10 | # 11 | # Copyright©2019 Max-Planck-Gesellschaft zur Förderung 12 | # der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute 13 | # for Intelligent Systems. All rights reserved. 
14 | # 15 | # Contact: ps-license@tuebingen.mpg.de 16 | 17 | from torch.utils.data import ConcatDataset, DataLoader 18 | 19 | from lib.dataset import * 20 | 21 | 22 | def get_data_loaders(cfg): 23 | def get_2d_datasets(dataset_names): 24 | datasets = [] 25 | for dataset_name in dataset_names: 26 | db = eval(dataset_name)(seqlen=cfg.DATASET.SEQLEN, debug=cfg.DEBUG) 27 | datasets.append(db) 28 | return ConcatDataset(datasets) 29 | 30 | def get_3d_datasets(dataset_names): 31 | datasets = [] 32 | for dataset_name in dataset_names: 33 | db = eval(dataset_name)(set='train', seqlen=cfg.DATASET.SEQLEN, debug=cfg.DEBUG) 34 | datasets.append(db) 35 | return ConcatDataset(datasets) 36 | 37 | # ===== 2D keypoint datasets ===== 38 | train_2d_dataset_names = cfg.TRAIN.DATASETS_2D 39 | train_2d_db = get_2d_datasets(train_2d_dataset_names) 40 | 41 | data_2d_batch_size = int(cfg.TRAIN.BATCH_SIZE * cfg.TRAIN.DATA_2D_RATIO) 42 | data_3d_batch_size = cfg.TRAIN.BATCH_SIZE - data_2d_batch_size 43 | 44 | train_2d_loader = DataLoader( 45 | dataset=train_2d_db, 46 | batch_size=data_2d_batch_size, 47 | shuffle=True, 48 | num_workers=cfg.NUM_WORKERS, 49 | ) 50 | 51 | # ===== 3D keypoint datasets ===== 52 | train_3d_dataset_names = cfg.TRAIN.DATASETS_3D 53 | train_3d_db = get_3d_datasets(train_3d_dataset_names) 54 | 55 | train_3d_loader = DataLoader( 56 | dataset=train_3d_db, 57 | batch_size=data_3d_batch_size, 58 | shuffle=True, 59 | num_workers=cfg.NUM_WORKERS, 60 | ) 61 | 62 | # ===== Motion Discriminator dataset ===== 63 | motion_disc_db = AMASS(seqlen=cfg.DATASET.SEQLEN) 64 | 65 | motion_disc_loader = DataLoader( 66 | dataset=motion_disc_db, 67 | batch_size=cfg.TRAIN.BATCH_SIZE, 68 | shuffle=True, 69 | num_workers=cfg.NUM_WORKERS, 70 | ) 71 | 72 | # ===== Evaluation dataset ===== 73 | valid_db = eval(cfg.TRAIN.DATASET_EVAL)(set='val', seqlen=cfg.DATASET.SEQLEN, debug=cfg.DEBUG) 74 | 75 | valid_loader = DataLoader( 76 | dataset=valid_db, 77 | batch_size=cfg.TRAIN.BATCH_SIZE, 78 | shuffle=False, 79 | num_workers=cfg.NUM_WORKERS, 80 | ) 81 | 82 | return train_2d_loader, train_3d_loader, motion_disc_loader, valid_loader -------------------------------------------------------------------------------- /lib/models/attention.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is 4 | # holder of all proprietary rights on this computer program. 5 | # You can only use this computer program if you have closed 6 | # a license agreement with MPG or you get the right to use the computer 7 | # program from someone who is authorized to grant you that right. 8 | # Any use of the computer program without a valid license is prohibited and 9 | # liable to prosecution. 10 | # 11 | # Copyright©2019 Max-Planck-Gesellschaft zur Förderung 12 | # der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute 13 | # for Intelligent Systems. All rights reserved. 
14 | # 15 | # Contact: ps-license@tuebingen.mpg.de 16 | 17 | import torch 18 | from torch import nn 19 | 20 | def init_weights(m): 21 | if type(m) == nn.Linear: 22 | torch.nn.init.uniform_(m.weight, -0.1, 0.1) 23 | m.bias.data.fill_(0.01) 24 | 25 | class SelfAttention(nn.Module): 26 | def __init__(self, attention_size, 27 | batch_first=False, 28 | layers=1, 29 | dropout=.0, 30 | non_linearity="tanh"): 31 | super(SelfAttention, self).__init__() 32 | 33 | self.batch_first = batch_first 34 | 35 | if non_linearity == "relu": 36 | activation = nn.ReLU() 37 | else: 38 | activation = nn.Tanh() 39 | 40 | modules = [] 41 | for i in range(layers - 1): 42 | modules.append(nn.Linear(attention_size, attention_size)) 43 | modules.append(activation) 44 | modules.append(nn.Dropout(dropout)) 45 | 46 | # last attention layer must output 1 47 | modules.append(nn.Linear(attention_size, 1)) 48 | modules.append(activation) 49 | modules.append(nn.Dropout(dropout)) 50 | 51 | self.attention = nn.Sequential(*modules) 52 | self.attention.apply(init_weights) 53 | self.softmax = nn.Softmax(dim=-1) 54 | 55 | 56 | def forward(self, inputs): 57 | 58 | ################################################################## 59 | # STEP 1 - perform dot product 60 | # of the attention vector and each hidden state 61 | ################################################################## 62 | 63 | # inputs is a 3D Tensor: batch, len, hidden_size 64 | # scores is a 2D Tensor: batch, len 65 | scores = self.attention(inputs).squeeze() 66 | scores = self.softmax(scores) 67 | 68 | ################################################################## 69 | # Step 2 - Weighted sum of hidden states, by the attention scores 70 | ################################################################## 71 | 72 | # multiply each hidden state with the attention weights 73 | weighted = torch.mul(inputs, scores.unsqueeze(-1).expand_as(inputs)) 74 | 75 | # sum the hidden states 76 | # representations = weighted.sum(1).squeeze() 77 | representations = weighted.sum(1).squeeze() 78 | return representations, scores 79 | 80 | -------------------------------------------------------------------------------- /vibe_demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "colab_type": "text", 7 | "id": "view-in-github" 8 | }, 9 | "source": [ 10 | "\"Open" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": { 16 | "id": "kJe1q2JFK4LZ" 17 | }, 18 | "source": [ 19 | "# Setup" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": { 26 | "id": "Tvd4cfPk5a0e" 27 | }, 28 | "outputs": [], 29 | "source": [ 30 | "# Clone the code from GitHub\n", 31 | "!git clone https://github.com/cedro3/VIBE.git\n", 32 | "%cd VIBE/\n", 33 | "\n", 34 | "# Install the required libraries\n", 35 | "!pip install torch==1.4.0 numpy==1.17.5\n", 36 | "!pip install git+https://github.com/giacaglia/pytube.git --upgrade\n", 37 | "!pip install -r requirements.txt\n", 38 | "\n", 39 | "# Download the pretrained weights and the SMPL data\n", 40 | "!source scripts/prepare_data.sh" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": { 46 | "id": "nflTgaAWLqsu" 47 | }, 48 | "source": [ 49 | "# Running the demo\n", 50 | "Adding the --sideview option at the end also renders the estimated body from a side view." 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": { 57 | "id": "qVNszfLQ7rC9" 58 | }, 59 | "outputs": [], 60 | "source": [ 61 | "# Run the demo\n", 62 | "!python demo.py --vid_file girl_dance.mp4 --output_folder output/ " 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": { 69 | "id": "j8zxBa_K-FJf" 70 | }, 71 | "outputs": [], 72 | "source": [ 73 | "# Play the generated video\n", 74 | "from IPython.display import HTML\n", 75 | "from base64 import b64encode\n", 76 | "\n", 77 | "def video(path):\n", 78 | " mp4 = open(path,'rb').read()\n", 79 | " data_url = \"data:video/mp4;base64,\" + b64encode(mp4).decode()\n", 80 | " return HTML('<video width=400 controls><source src=\"%s\" type=\"video/mp4\"></video>' % data_url)\n", 81 | "\n", 82 | "video('output/girl_dance/girl_dance_vibe_result.mp4') " 83 | ] 84 | } 85 | ], 86 | "metadata": { 87 | "accelerator": "GPU", 88 | "colab": { 89 | "collapsed_sections": [], 90 | "include_colab_link": true, 91 | "name": "vibe_demo", 92 | "provenance": [], 93 | "toc_visible": true 94 | }, 95 | "kernelspec": { 96 | "display_name": "Python 3", 97 | "language": "python", 98 | "name": "python3" 99 | }, 100 | "language_info": { 101 | "codemirror_mode": { 102 | "name": "ipython", 103 | "version": 3 104 | }, 105 | "file_extension": ".py", 106 | "mimetype": "text/x-python", 107 | "name": "python", 108 | "nbconvert_exporter": "python", 109 | "pygments_lexer": "ipython3", 110 | "version": "3.7.9" 111 | } 112 | }, 113 | "nbformat": 4, 114 | "nbformat_minor": 1 115 | } 116 | -------------------------------------------------------------------------------- /lib/utils/pose_tracker.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is 4 | # holder of all proprietary rights on this computer program. 5 | # You can only use this computer program if you have closed 6 | # a license agreement with MPG or you get the right to use the computer 7 | # program from someone who is authorized to grant you that right. 8 | # Any use of the computer program without a valid license is prohibited and 9 | # liable to prosecution. 10 | # 11 | # Copyright©2019 Max-Planck-Gesellschaft zur Förderung 12 | # der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute 13 | # for Intelligent Systems. All rights reserved.
14 | # 15 | # Contact: ps-license@tuebingen.mpg.de 16 | 17 | import os 18 | import json 19 | import shutil 20 | import subprocess 21 | import numpy as np 22 | import os.path as osp 23 | 24 | 25 | def run_openpose( 26 | video_file, 27 | output_folder, 28 | staf_folder, 29 | vis=False, 30 | ): 31 | pwd = os.getcwd() 32 | 33 | os.chdir(staf_folder) 34 | 35 | render = 1 if vis else 0 36 | display = 2 if vis else 0 37 | cmd = [ 38 | 'build/examples/openpose/openpose.bin', 39 | '--model_pose', 'BODY_21A', 40 | '--tracking', '1', 41 | '--render_pose', str(render), 42 | '--video', video_file, 43 | '--write_json', output_folder, 44 | '--display', str(display) 45 | ] 46 | 47 | print('Executing', ' '.join(cmd)) 48 | subprocess.call(cmd) 49 | os.chdir(pwd) 50 | 51 | 52 | def read_posetrack_keypoints(output_folder): 53 | 54 | people = dict() 55 | 56 | for idx, result_file in enumerate(sorted(os.listdir(output_folder))): 57 | json_file = osp.join(output_folder, result_file) 58 | data = json.load(open(json_file)) 59 | # print(idx, data) 60 | for person in data['people']: 61 | person_id = person['person_id'][0] 62 | joints2d = person['pose_keypoints_2d'] 63 | if person_id in people.keys(): 64 | people[person_id]['joints2d'].append(joints2d) 65 | people[person_id]['frames'].append(idx) 66 | else: 67 | people[person_id] = { 68 | 'joints2d': [], 69 | 'frames': [], 70 | } 71 | people[person_id]['joints2d'].append(joints2d) 72 | people[person_id]['frames'].append(idx) 73 | 74 | for k in people.keys(): 75 | people[k]['joints2d'] = np.array(people[k]['joints2d']).reshape((len(people[k]['joints2d']), -1, 3)) 76 | people[k]['frames'] = np.array(people[k]['frames']) 77 | 78 | return people 79 | 80 | 81 | def run_posetracker(video_file, staf_folder, posetrack_output_folder='/tmp', display=False): 82 | posetrack_output_folder = os.path.join( 83 | posetrack_output_folder, 84 | f'{os.path.basename(video_file)}_posetrack' 85 | ) 86 | 87 | # run posetrack on video 88 | run_openpose( 89 | video_file, 90 | posetrack_output_folder, 91 | vis=display, 92 | staf_folder=staf_folder 93 | ) 94 | 95 | people_dict = read_posetrack_keypoints(posetrack_output_folder) 96 | 97 | shutil.rmtree(posetrack_output_folder) 98 | 99 | return people_dict -------------------------------------------------------------------------------- /lib/models/motion_discriminator.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is 4 | # holder of all proprietary rights on this computer program. 5 | # You can only use this computer program if you have closed 6 | # a license agreement with MPG or you get the right to use the computer 7 | # program from someone who is authorized to grant you that right. 8 | # Any use of the computer program without a valid license is prohibited and 9 | # liable to prosecution. 10 | # 11 | # Copyright©2019 Max-Planck-Gesellschaft zur Förderung 12 | # der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute 13 | # for Intelligent Systems. All rights reserved. 
14 | # 15 | # Contact: ps-license@tuebingen.mpg.de 16 | 17 | import torch 18 | import torch.nn as nn 19 | import torch.nn.functional as F 20 | from torch.nn.utils import spectral_norm 21 | from lib.models.attention import SelfAttention 22 | 23 | class MotionDiscriminator(nn.Module): 24 | 25 | def __init__(self, 26 | rnn_size, 27 | input_size, 28 | num_layers, 29 | output_size=2, 30 | feature_pool="concat", 31 | use_spectral_norm=False, 32 | attention_size=1024, 33 | attention_layers=1, 34 | attention_dropout=0.5): 35 | 36 | super(MotionDiscriminator, self).__init__() 37 | self.input_size = input_size 38 | self.rnn_size = rnn_size 39 | self.feature_pool = feature_pool 40 | self.num_layers = num_layers 41 | self.attention_size = attention_size 42 | self.attention_layers = attention_layers 43 | self.attention_dropout = attention_dropout 44 | 45 | self.gru = nn.GRU(self.input_size, self.rnn_size, num_layers=num_layers) 46 | 47 | linear_size = self.rnn_size if not feature_pool == "concat" else self.rnn_size * 2 48 | 49 | if feature_pool == "attention" : 50 | self.attention = SelfAttention(attention_size=self.attention_size, 51 | layers=self.attention_layers, 52 | dropout=self.attention_dropout) 53 | if use_spectral_norm: 54 | self.fc = spectral_norm(nn.Linear(linear_size, output_size)) 55 | else: 56 | self.fc = nn.Linear(linear_size, output_size) 57 | 58 | def forward(self, sequence): 59 | """ 60 | sequence: of shape [batch_size, seq_len, input_size] 61 | """ 62 | batchsize, seqlen, input_size = sequence.shape 63 | sequence = torch.transpose(sequence, 0, 1) 64 | 65 | outputs, state = self.gru(sequence) 66 | 67 | if self.feature_pool == "concat": 68 | outputs = F.relu(outputs) 69 | avg_pool = F.adaptive_avg_pool1d(outputs.permute(1, 2, 0), 1).view(batchsize, -1) 70 | max_pool = F.adaptive_max_pool1d(outputs.permute(1, 2, 0), 1).view(batchsize, -1) 71 | output = self.fc(torch.cat([avg_pool, max_pool], dim=1)) 72 | elif self.feature_pool == "attention": 73 | outputs = outputs.permute(1, 0, 2) 74 | y, attentions = self.attention(outputs) 75 | output = self.fc(y) 76 | else: 77 | output = self.fc(outputs[-1]) 78 | 79 | return output 80 | -------------------------------------------------------------------------------- /lib/dataset/inference.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is 4 | # holder of all proprietary rights on this computer program. 5 | # You can only use this computer program if you have closed 6 | # a license agreement with MPG or you get the right to use the computer 7 | # program from someone who is authorized to grant you that right. 8 | # Any use of the computer program without a valid license is prohibited and 9 | # liable to prosecution. 10 | # 11 | # Copyright©2019 Max-Planck-Gesellschaft zur Förderung 12 | # der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute 13 | # for Intelligent Systems. All rights reserved. 
14 | # 15 | # Contact: ps-license@tuebingen.mpg.de 16 | 17 | import os 18 | import cv2 19 | import numpy as np 20 | import os.path as osp 21 | from torch.utils.data import Dataset 22 | from torchvision.transforms.functional import to_tensor 23 | 24 | from lib.utils.smooth_bbox import get_all_bbox_params 25 | from lib.data_utils.img_utils import get_single_image_crop_demo 26 | 27 | 28 | class Inference(Dataset): 29 | def __init__(self, image_folder, frames, bboxes=None, joints2d=None, scale=1.0, crop_size=224): 30 | self.image_file_names = [ 31 | osp.join(image_folder, x) 32 | for x in os.listdir(image_folder) 33 | if x.endswith('.png') or x.endswith('.jpg') 34 | ] 35 | self.image_file_names = sorted(self.image_file_names) 36 | self.image_file_names = np.array(self.image_file_names)[frames] 37 | self.bboxes = bboxes 38 | self.joints2d = joints2d 39 | self.scale = scale 40 | self.crop_size = crop_size 41 | self.frames = frames 42 | self.has_keypoints = joints2d is not None 43 | 44 | self.norm_joints2d = np.zeros_like(self.joints2d) 45 | 46 | if self.has_keypoints: 47 | bboxes, time_pt1, time_pt2 = get_all_bbox_params(joints2d, vis_thresh=0.3) 48 | bboxes[:, 2:] = 150. / bboxes[:, 2:] 49 | self.bboxes = np.stack([bboxes[:, 0], bboxes[:, 1], bboxes[:, 2], bboxes[:, 2]]).T 50 | 51 | self.image_file_names = self.image_file_names[time_pt1:time_pt2] 52 | self.joints2d = joints2d[time_pt1:time_pt2] 53 | self.frames = frames[time_pt1:time_pt2] 54 | 55 | def __len__(self): 56 | return len(self.image_file_names) 57 | 58 | def __getitem__(self, idx): 59 | img = cv2.cvtColor(cv2.imread(self.image_file_names[idx]), cv2.COLOR_BGR2RGB) 60 | 61 | bbox = self.bboxes[idx] 62 | 63 | j2d = self.joints2d[idx] if self.has_keypoints else None 64 | 65 | norm_img, raw_img, kp_2d = get_single_image_crop_demo( 66 | img, 67 | bbox, 68 | kp_2d=j2d, 69 | scale=self.scale, 70 | crop_size=self.crop_size) 71 | if self.has_keypoints: 72 | return norm_img, kp_2d 73 | else: 74 | return norm_img 75 | 76 | 77 | class ImageFolder(Dataset): 78 | def __init__(self, image_folder): 79 | self.image_file_names = [ 80 | osp.join(image_folder, x) 81 | for x in os.listdir(image_folder) 82 | if x.endswith('.png') or x.endswith('.jpg') 83 | ] 84 | self.image_file_names = sorted(self.image_file_names) 85 | 86 | def __len__(self): 87 | return len(self.image_file_names) 88 | 89 | def __getitem__(self, idx): 90 | img = cv2.cvtColor(cv2.imread(self.image_file_names[idx]), cv2.COLOR_BGR2RGB) 91 | return to_tensor(img) 92 | -------------------------------------------------------------------------------- /doc/train.md: -------------------------------------------------------------------------------- 1 | # Training Instructions 2 | 3 | Throughout the documentation we refer to the VIBE root folder as `$ROOT`. 4 | 5 | ## Data Preparation 6 | During training, VIBE uses precomputed image features to reduce training time. Thus, we process the datasets into a 7 | standard format before using them for training. To obtain these standard training files, you need to run: 8 | 9 | ```shell script 10 | source scripts/prepare_training_data.sh 11 | ``` 12 | 13 | This script will first create a folder for the dataset files `$ROOT/data/vibe_db`, then process each dataset and save 14 | output files to this directory. Before proceeding, you need to download each of the datasets listed 15 | below, then modify the `--dir` argument in the script to point to the 16 | directory of each dataset.
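As a quick sanity check before preprocessing, a minimal sketch along these lines (not part of the repo) can confirm the expected layout; the path list mirrors the `--dir` arguments in `scripts/prepare_training_data.sh` and the defaults in `lib/core/config.py`:

```python
# hypothetical helper: check that the raw dataset folders referenced by
# scripts/prepare_training_data.sh exist before kicking off preprocessing
import os.path as osp

expected_dirs = {
    'AMASS': 'data/amass',
    'InstaVariety': 'data/insta_variety',
    '3DPW': 'data/3dpw',
    'MPI-INF-3DHP': 'data/mpi_inf_3dhp',
    'PoseTrack': 'data/posetrack',
    'PennAction': 'data/penn_action',
}

for name, path in expected_dirs.items():
    status = 'found' if osp.isdir(path) else 'MISSING'
    print(f'{name:13s} {status:7s} {path}')
```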
17 | 18 | 19 | 20 | ## Datasets 21 | 22 | - **AMASS** (https://amass.is.tue.mpg.de) 23 | 24 | Directory structure: 25 | 26 | ```shell script 27 | amass 28 | |-- ACCAD 29 | |-- BioMotionLab_NTroje 30 | |-- CMU 31 | |-- ... 32 | `-- Transitions_mocap 33 | ``` 34 | 35 | - **InstaVariety** 36 | 37 | For your convenience, we uploaded the preprocessed InstaVariety data 38 | [here](https://owncloud.tuebingen.mpg.de/index.php/s/MKLnHtPjwn24y9C) (size: 18 GB). 39 | After downloading the file, put it under 40 | `$ROOT/data/vibe_db`. Do not forget to verify the checksum as a sanity check: 41 | ``` 42 | md5sum : 8ec335d1d48bd54687ad5c9a6eeb2999 43 | sha256sum : 7eadff77043cd85b49cbba8bfc9111c4305792ca64da1b92fb40fa702689dfa9 44 | ``` 45 | 46 | You may also preprocess the dataset yourself by downloading the 47 | [preprocessed tfrecords](https://github.com/akanazawa/human_dynamics/blob/master/doc/insta_variety.md#pre-processed-tfrecords) 48 | provided by the authors of Temporal HMR. 49 | 50 | Directory structure: 51 | ```shell script 52 | insta_variety 53 | |-- train 54 | | |-- insta_variety_00_copy00_hmr_noS5.ckpt-642561.tfrecord 55 | | |-- insta_variety_01_copy00_hmr_noS5.ckpt-642561.tfrecord 56 | | `-- ... 57 | `-- test 58 | |-- insta_variety_00_copy00_hmr_noS5.ckpt-642561.tfrecord 59 | |-- insta_variety_01_copy00_hmr_noS5.ckpt-642561.tfrecord 60 | `-- ... 61 | ``` 62 | 63 | - **MPI-3D-HP** (http://gvv.mpi-inf.mpg.de/3dhp-dataset) 64 | 65 | Download the dataset using the bash script provided by the authors. We will be using standard cameras only, so wall and ceiling 66 | cameras aren't needed. Then, run this 67 | [script](https://gist.github.com/mkocabas/cc6fe78aac51f97859e45f46476882b6) to extract frames from the videos. 68 | 69 | Directory structure: 70 | ```shell script 71 | 72 | mpi_inf_3dhp 73 | |-- S1 74 | | |-- Seq1 75 | | |-- Seq2 76 | |-- S2 77 | | |-- Seq1 78 | | |-- Seq2 79 | |-- ... 80 | `-- util 81 | ``` 82 | 83 | - **3DPW** (https://virtualhumans.mpi-inf.mpg.de/3DPW) 84 | 85 | Directory structure: 86 | ```shell script 87 | 3dpw 88 | |-- imageFiles 89 | | |-- courtyard_arguing_00 90 | | |-- courtyard_backpack_00 91 | | |-- ... 92 | `-- sequenceFiles 93 | |-- test 94 | |-- train 95 | `-- validation 96 | ``` 97 | 98 | - **PennAction** (http://dreamdragon.github.io/PennAction/) 99 | 100 | Directory structure: 101 | ```shell script 102 | pennaction 103 | |-- frames 104 | | |-- 0000 105 | | |-- 0001 106 | | |-- ... 107 | `-- labels 108 | |-- 0000.mat 109 | |-- 0001.mat 110 | `-- ... 111 | ``` 112 | 113 | - **PoseTrack** (https://posetrack.net/) 114 | 115 | Directory structure: 116 | ```shell script 117 | posetrack 118 | |-- images 119 | | |-- train 120 | | |-- val 121 | | |-- test 122 | `-- posetrack_data 123 | `-- annotations 124 | |-- train 125 | |-- val 126 | `-- test 127 | ``` 128 | 129 | 130 | 131 | ## Training 132 | Run the command below to start training. 133 | 134 | ```shell script 135 | python train.py --cfg configs/config.yaml 136 | ``` 137 | 138 | See [`configs/config.yaml`](configs/config.yaml) or [`config.py`](lib/core/config.py) to 139 | play with different configurations.
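The config can also be loaded and tweaked programmatically. A minimal sketch using the helpers defined in [`config.py`](lib/core/config.py); the overridden values here are only examples:

```python
# load the defaults, merge configs/config.yaml on top, then override in code
from lib.core.config import update_cfg

cfg = update_cfg('configs/config.yaml')
cfg.TRAIN.BATCH_SIZE = 16   # e.g. shrink the batch to fit a smaller GPU
cfg.TRAIN.END_EPOCH = 10
print(cfg.TRAIN)
```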
140 | -------------------------------------------------------------------------------- /lib/models/smpl.py: -------------------------------------------------------------------------------- 1 | # This script is borrowed and extended from https://github.com/nkolot/SPIN/blob/master/models/hmr.py 2 | # Adhere to their licence to use this script 3 | 4 | import torch 5 | import numpy as np 6 | import os.path as osp 7 | from smplx import SMPL as _SMPL 8 | from smplx.body_models import ModelOutput 9 | from smplx.lbs import vertices2joints 10 | 11 | from lib.core.config import VIBE_DATA_DIR 12 | 13 | # Map joints to SMPL joints 14 | JOINT_MAP = { 15 | 'OP Nose': 24, 'OP Neck': 12, 'OP RShoulder': 17, 16 | 'OP RElbow': 19, 'OP RWrist': 21, 'OP LShoulder': 16, 17 | 'OP LElbow': 18, 'OP LWrist': 20, 'OP MidHip': 0, 18 | 'OP RHip': 2, 'OP RKnee': 5, 'OP RAnkle': 8, 19 | 'OP LHip': 1, 'OP LKnee': 4, 'OP LAnkle': 7, 20 | 'OP REye': 25, 'OP LEye': 26, 'OP REar': 27, 21 | 'OP LEar': 28, 'OP LBigToe': 29, 'OP LSmallToe': 30, 22 | 'OP LHeel': 31, 'OP RBigToe': 32, 'OP RSmallToe': 33, 'OP RHeel': 34, 23 | 'Right Ankle': 8, 'Right Knee': 5, 'Right Hip': 45, 24 | 'Left Hip': 46, 'Left Knee': 4, 'Left Ankle': 7, 25 | 'Right Wrist': 21, 'Right Elbow': 19, 'Right Shoulder': 17, 26 | 'Left Shoulder': 16, 'Left Elbow': 18, 'Left Wrist': 20, 27 | 'Neck (LSP)': 47, 'Top of Head (LSP)': 48, 28 | 'Pelvis (MPII)': 49, 'Thorax (MPII)': 50, 29 | 'Spine (H36M)': 51, 'Jaw (H36M)': 52, 30 | 'Head (H36M)': 53, 'Nose': 24, 'Left Eye': 26, 31 | 'Right Eye': 25, 'Left Ear': 28, 'Right Ear': 27 32 | } 33 | JOINT_NAMES = [ 34 | 'OP Nose', 'OP Neck', 'OP RShoulder', 35 | 'OP RElbow', 'OP RWrist', 'OP LShoulder', 36 | 'OP LElbow', 'OP LWrist', 'OP MidHip', 37 | 'OP RHip', 'OP RKnee', 'OP RAnkle', 38 | 'OP LHip', 'OP LKnee', 'OP LAnkle', 39 | 'OP REye', 'OP LEye', 'OP REar', 40 | 'OP LEar', 'OP LBigToe', 'OP LSmallToe', 41 | 'OP LHeel', 'OP RBigToe', 'OP RSmallToe', 'OP RHeel', 42 | 'Right Ankle', 'Right Knee', 'Right Hip', 43 | 'Left Hip', 'Left Knee', 'Left Ankle', 44 | 'Right Wrist', 'Right Elbow', 'Right Shoulder', 45 | 'Left Shoulder', 'Left Elbow', 'Left Wrist', 46 | 'Neck (LSP)', 'Top of Head (LSP)', 47 | 'Pelvis (MPII)', 'Thorax (MPII)', 48 | 'Spine (H36M)', 'Jaw (H36M)', 49 | 'Head (H36M)', 'Nose', 'Left Eye', 50 | 'Right Eye', 'Left Ear', 'Right Ear' 51 | ] 52 | 53 | JOINT_IDS = {JOINT_NAMES[i]: i for i in range(len(JOINT_NAMES))} 54 | JOINT_REGRESSOR_TRAIN_EXTRA = osp.join(VIBE_DATA_DIR, 'J_regressor_extra.npy') 55 | SMPL_MEAN_PARAMS = osp.join(VIBE_DATA_DIR, 'smpl_mean_params.npz') 56 | SMPL_MODEL_DIR = VIBE_DATA_DIR 57 | H36M_TO_J17 = [6, 5, 4, 1, 2, 3, 16, 15, 14, 11, 12, 13, 8, 10, 0, 7, 9] 58 | H36M_TO_J14 = H36M_TO_J17[:14] 59 | 60 | 61 | class SMPL(_SMPL): 62 | """ Extension of the official SMPL implementation to support more joints """ 63 | 64 | def __init__(self, *args, **kwargs): 65 | super(SMPL, self).__init__(*args, **kwargs) 66 | joints = [JOINT_MAP[i] for i in JOINT_NAMES] 67 | J_regressor_extra = np.load(JOINT_REGRESSOR_TRAIN_EXTRA) 68 | self.register_buffer('J_regressor_extra', torch.tensor(J_regressor_extra, dtype=torch.float32)) 69 | self.joint_map = torch.tensor(joints, dtype=torch.long) 70 | 71 | def forward(self, *args, **kwargs): 72 | kwargs['get_skin'] = True 73 | smpl_output = super(SMPL, self).forward(*args, **kwargs) 74 | extra_joints = vertices2joints(self.J_regressor_extra, smpl_output.vertices) 75 | joints = torch.cat([smpl_output.joints, extra_joints], dim=1) 76 | joints = joints[:, 
self.joint_map, :] 77 | output = ModelOutput(vertices=smpl_output.vertices, 78 | global_orient=smpl_output.global_orient, 79 | body_pose=smpl_output.body_pose, 80 | joints=joints, 81 | betas=smpl_output.betas, 82 | full_pose=smpl_output.full_pose) 83 | return output 84 | 85 | 86 | def get_smpl_faces(): 87 | smpl = SMPL(SMPL_MODEL_DIR, batch_size=1, create_transl=False) 88 | return smpl.faces -------------------------------------------------------------------------------- /lib/core/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is 4 | # holder of all proprietary rights on this computer program. 5 | # You can only use this computer program if you have closed 6 | # a license agreement with MPG or you get the right to use the computer 7 | # program from someone who is authorized to grant you that right. 8 | # Any use of the computer program without a valid license is prohibited and 9 | # liable to prosecution. 10 | # 11 | # Copyright©2019 Max-Planck-Gesellschaft zur Förderung 12 | # der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute 13 | # for Intelligent Systems. All rights reserved. 14 | # 15 | # Contact: ps-license@tuebingen.mpg.de 16 | 17 | import argparse 18 | from yacs.config import CfgNode as CN 19 | 20 | # CONSTANTS 21 | # You may modify them at will 22 | VIBE_DB_DIR = 'data/vibe_db' 23 | AMASS_DIR = 'data/amass' 24 | INSTA_DIR = 'data/insta_variety' 25 | MPII3D_DIR = 'data/mpi_inf_3dhp' 26 | THREEDPW_DIR = 'data/3dpw' 27 | PENNACTION_DIR = 'data/penn_action' 28 | POSETRACK_DIR = 'data/posetrack' 29 | VIBE_DATA_DIR = 'data/vibe_data' 30 | 31 | # Configuration variables 32 | cfg = CN() 33 | 34 | cfg.OUTPUT_DIR = 'results' 35 | cfg.EXP_NAME = 'default' 36 | cfg.DEVICE = 'cuda' 37 | cfg.DEBUG = True 38 | cfg.LOGDIR = '' 39 | cfg.NUM_WORKERS = 8 40 | cfg.DEBUG_FREQ = 1000 41 | cfg.SEED_VALUE = -1 42 | 43 | cfg.CUDNN = CN() 44 | cfg.CUDNN.BENCHMARK = True 45 | cfg.CUDNN.DETERMINISTIC = False 46 | cfg.CUDNN.ENABLED = True 47 | 48 | cfg.TRAIN = CN() 49 | cfg.TRAIN.DATASETS_2D = ['Insta'] 50 | cfg.TRAIN.DATASETS_3D = ['MPII3D'] 51 | cfg.TRAIN.DATASET_EVAL = 'ThreeDPW' 52 | cfg.TRAIN.BATCH_SIZE = 32 53 | cfg.TRAIN.DATA_2D_RATIO = 0.5 54 | cfg.TRAIN.START_EPOCH = 0 55 | cfg.TRAIN.END_EPOCH = 5 56 | cfg.TRAIN.PRETRAINED_REGRESSOR = '' 57 | cfg.TRAIN.PRETRAINED = '' 58 | cfg.TRAIN.RESUME = '' 59 | cfg.TRAIN.NUM_ITERS_PER_EPOCH = 1000 60 | cfg.TRAIN.LR_PATIENCE = 5 61 | 62 | # <====== generator optimizer 63 | cfg.TRAIN.GEN_OPTIM = 'Adam' 64 | cfg.TRAIN.GEN_LR = 1e-4 65 | cfg.TRAIN.GEN_WD = 1e-4 66 | cfg.TRAIN.GEN_MOMENTUM = 0.9 67 | 68 | # <====== motion discriminator optimizer 69 | cfg.TRAIN.MOT_DISCR = CN() 70 | cfg.TRAIN.MOT_DISCR.OPTIM = 'SGD' 71 | cfg.TRAIN.MOT_DISCR.LR = 1e-2 72 | cfg.TRAIN.MOT_DISCR.WD = 1e-4 73 | cfg.TRAIN.MOT_DISCR.MOMENTUM = 0.9 74 | cfg.TRAIN.MOT_DISCR.UPDATE_STEPS = 1 75 | cfg.TRAIN.MOT_DISCR.FEATURE_POOL = 'concat' 76 | cfg.TRAIN.MOT_DISCR.HIDDEN_SIZE = 1024 77 | cfg.TRAIN.MOT_DISCR.NUM_LAYERS = 1 78 | cfg.TRAIN.MOT_DISCR.ATT = CN() 79 | cfg.TRAIN.MOT_DISCR.ATT.SIZE = 1024 80 | cfg.TRAIN.MOT_DISCR.ATT.LAYERS = 1 81 | cfg.TRAIN.MOT_DISCR.ATT.DROPOUT = 0.1 82 | 83 | cfg.DATASET = CN() 84 | cfg.DATASET.SEQLEN = 20 85 | cfg.DATASET.OVERLAP = 0.5 86 | 87 | cfg.LOSS = CN() 88 | cfg.LOSS.KP_2D_W = 60. 89 | cfg.LOSS.KP_3D_W = 30. 
90 | cfg.LOSS.SHAPE_W = 0.001 91 | cfg.LOSS.POSE_W = 1.0 92 | cfg.LOSS.D_MOTION_LOSS_W = 1. 93 | 94 | cfg.MODEL = CN() 95 | 96 | cfg.MODEL.TEMPORAL_TYPE = 'gru' 97 | 98 | # GRU model hyperparams 99 | cfg.MODEL.TGRU = CN() 100 | cfg.MODEL.TGRU.NUM_LAYERS = 1 101 | cfg.MODEL.TGRU.ADD_LINEAR = False 102 | cfg.MODEL.TGRU.RESIDUAL = False 103 | cfg.MODEL.TGRU.HIDDEN_SIZE = 2048 104 | cfg.MODEL.TGRU.BIDIRECTIONAL = False 105 | 106 | 107 | def get_cfg_defaults(): 108 | """Get a yacs CfgNode object with default values for my_project.""" 109 | # Return a clone so that the defaults will not be altered 110 | # This is for the "local variable" use pattern 111 | return cfg.clone() 112 | 113 | 114 | def update_cfg(cfg_file): 115 | cfg = get_cfg_defaults() 116 | cfg.merge_from_file(cfg_file) 117 | return cfg.clone() 118 | 119 | 120 | def parse_args(): 121 | parser = argparse.ArgumentParser() 122 | parser.add_argument('--cfg', type=str, help='cfg file path') 123 | 124 | args = parser.parse_args() 125 | print(args, end='\n\n') 126 | 127 | cfg_file = args.cfg 128 | if args.cfg is not None: 129 | cfg = update_cfg(args.cfg) 130 | else: 131 | cfg = get_cfg_defaults() 132 | 133 | return cfg, cfg_file 134 | -------------------------------------------------------------------------------- /lib/data_utils/feature_extractor.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is 4 | # holder of all proprietary rights on this computer program. 5 | # You can only use this computer program if you have closed 6 | # a license agreement with MPG or you get the right to use the computer 7 | # program from someone who is authorized to grant you that right. 8 | # Any use of the computer program without a valid license is prohibited and 9 | # liable to prosecution. 10 | # 11 | # Copyright©2019 Max-Planck-Gesellschaft zur Förderung 12 | # der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute 13 | # for Intelligent Systems. All rights reserved. 
14 | #
15 | # Contact: ps-license@tuebingen.mpg.de
16 | 
17 | import os
18 | import torch
19 | import torchvision
20 | import numpy as np
21 | import matplotlib.pyplot as plt
22 | 
23 | from lib.utils.vis import batch_visualize_preds
24 | from lib.data_utils.img_utils import get_single_image_crop, convert_cvimg_to_tensor
25 | 
26 | 
27 | def extract_features(model, video, bbox, debug=False, batch_size=200, kp_2d=None, dataset=None, scale=1.3):
28 |     '''
29 |     :param model: pretrained HMR model, use lib/models/hmr.py:get_pretrained_hmr()
30 |     :param video: video filename or torch.Tensor in shape (num_frames,W,H,C)
31 |     :param bbox: bbox array in shape (num_frames,4)
32 |     :param debug: boolean, true if you want to debug HMR predictions
33 |     :param batch_size: batch size for HMR input
34 |     :return: features: resnet50 features np.ndarray -> shape (num_frames, 2048)
35 |     '''
36 |     device = 'cuda'
37 | 
38 |     if isinstance(video, torch.Tensor) or isinstance(video, np.ndarray):
39 |         pass  # already a sequence of frames
40 |     elif isinstance(video, str):
41 |         if os.path.isfile(video):
42 |             video, _, _ = torchvision.io.read_video(video)
43 |         else:
44 |             raise ValueError(f'{video} is not a valid file.')
45 |     else:
46 |         raise ValueError(f'Unknown type {type(video)} for video object')
47 | 
48 |     # For debugging ground truth 2d keypoints
49 |     if debug and kp_2d is not None:
50 |         import cv2
51 |         if isinstance(video[0], np.str_):
52 |             print(video[0])
53 |             frame = cv2.cvtColor(cv2.imread(video[0]), cv2.COLOR_BGR2RGB)
54 |         elif isinstance(video[0], np.ndarray):
55 |             frame = video[0]
56 |         else:
57 |             frame = video[0].numpy()
58 |         for i in range(kp_2d.shape[1]):
59 |             frame = cv2.circle(
60 |                 frame.copy(),
61 |                 (int(kp_2d[0,i,0]), int(kp_2d[0,i,1])),
62 |                 thickness=3,
63 |                 color=(255,0,0),
64 |                 radius=3,
65 |             )
66 | 
67 |         plt.imshow(frame)
68 |         plt.show()
69 | 
70 |     if dataset == 'insta':
71 |         video = torch.cat(
72 |             [convert_cvimg_to_tensor(image).unsqueeze(0) for image in video], dim=0
73 |         ).to(device)
74 |     else:
75 |         # crop bbox locations
76 |         video = torch.cat(
77 |             [get_single_image_crop(image, bbox, scale=scale).unsqueeze(0) for image, bbox in zip(video, bbox)], dim=0
78 |         ).to(device)
79 | 
80 |     features = []
81 | 
82 |     # split video into batches of frames
83 |     frames = torch.split(video, batch_size)
84 | 
85 |     with torch.no_grad():
86 |         for images in frames:
87 | 
88 |             if not debug:
89 |                 pred = model.feature_extractor(images)
90 |                 features.append(pred.cpu())
91 |                 del pred, images
92 |             else:
93 |                 preds = model(images)
94 |                 dataset = 'spin'  # dataset if dataset else 'common'
95 |                 result_image = batch_visualize_preds(
96 |                     images,
97 |                     preds[-1],
98 |                     target_exists=False,
99 |                     max_images=4,
100 |                     dataset=dataset,
101 |                 )
102 | 
103 |                 plt.figure(figsize=(19.2, 10.8))
104 |                 plt.axis('off')
105 |                 plt.imshow(result_image)
106 |                 plt.show()
107 | 
108 |                 del preds, images
109 |                 return 0
110 | 
111 |     features = torch.cat(features, dim=0)
112 | 
113 |     return features.numpy()
114 | 
--------------------------------------------------------------------------------
/lib/data_utils/penn_action_utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is
4 | # holder of all proprietary rights on this computer program.
5 | # You can only use this computer program if you have closed
6 | # a license agreement with MPG or you get the right to use the computer
7 | # program from someone who is authorized to grant you that right.
8 | # Any use of the computer program without a valid license is prohibited and 9 | # liable to prosecution. 10 | # 11 | # Copyright©2019 Max-Planck-Gesellschaft zur Förderung 12 | # der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute 13 | # for Intelligent Systems. All rights reserved. 14 | # 15 | # Contact: ps-license@tuebingen.mpg.de 16 | 17 | import sys 18 | sys.path.append('.') 19 | 20 | import glob 21 | import torch 22 | import joblib 23 | import argparse 24 | from tqdm import tqdm 25 | import os.path as osp 26 | from skimage import io 27 | from scipy.io import loadmat 28 | 29 | from lib.models import spin 30 | from lib.data_utils.kp_utils import * 31 | from lib.core.config import VIBE_DB_DIR 32 | from lib.data_utils.img_utils import get_bbox_from_kp2d 33 | from lib.data_utils.feature_extractor import extract_features 34 | 35 | 36 | def calc_kpt_bound(kp_2d): 37 | MAX_COORD = 10000 38 | x = kp_2d[:, 0] 39 | y = kp_2d[:, 1] 40 | z = kp_2d[:, 2] 41 | u = MAX_COORD 42 | d = -1 43 | l = MAX_COORD 44 | r = -1 45 | for idx, vis in enumerate(z): 46 | if vis == 0: # skip invisible joint 47 | continue 48 | u = min(u, y[idx]) 49 | d = max(d, y[idx]) 50 | l = min(l, x[idx]) 51 | r = max(r, x[idx]) 52 | return u, d, l, r 53 | 54 | 55 | def load_mat(path): 56 | mat = loadmat(path) 57 | del mat['pose'], mat['__header__'], mat['__globals__'], mat['__version__'], mat['train'], mat['action'] 58 | mat['nframes'] = mat['nframes'][0][0] 59 | 60 | return mat 61 | 62 | 63 | def read_data(folder): 64 | dataset = { 65 | 'img_name' : [], 66 | 'joints2D': [], 67 | 'bbox': [], 68 | 'vid_name': [], 69 | 'features': [], 70 | } 71 | 72 | model = spin.get_pretrained_hmr() 73 | 74 | file_names = sorted(glob.glob(folder + '/labels/'+'*.mat')) 75 | 76 | for fname in tqdm(file_names): 77 | vid_dict=load_mat(fname) 78 | imgs = sorted(glob.glob(folder + '/frames/'+ fname.strip().split('/')[-1].split('.')[0]+'/*.jpg')) 79 | kp_2d = np.zeros((vid_dict['nframes'], 13, 3)) 80 | perm_idxs = get_perm_idxs('pennaction', 'common') 81 | 82 | kp_2d[:, :, 0] = vid_dict['x'] 83 | kp_2d[:, :, 1] = vid_dict['y'] 84 | kp_2d[:, :, 2] = vid_dict['visibility'] 85 | kp_2d = kp_2d[:, perm_idxs, :] 86 | 87 | # fix inconsistency 88 | n_kp_2d = np.zeros((kp_2d.shape[0], 14, 3)) 89 | n_kp_2d[:, :12, :] = kp_2d[:, :-1, :] 90 | n_kp_2d[:, 13, :] = kp_2d[:, 12, :] 91 | kp_2d = n_kp_2d 92 | 93 | bbox = np.zeros((vid_dict['nframes'], 4)) 94 | 95 | for fr_id, fr in enumerate(kp_2d): 96 | u, d, l, r = calc_kpt_bound(fr) 97 | center = np.array([(l + r) * 0.5, (u + d) * 0.5], dtype=np.float32) 98 | c_x, c_y = center[0], center[1] 99 | w, h = r - l, d - u 100 | w = h = np.where(w / h > 1, w, h) 101 | 102 | bbox[fr_id,:] = np.array([c_x, c_y, w, h]) 103 | 104 | dataset['vid_name'].append(np.array([f'{fname}']* vid_dict['nframes'])) 105 | dataset['img_name'].append(np.array(imgs)) 106 | dataset['joints2D'].append(kp_2d) 107 | dataset['bbox'].append(bbox) 108 | 109 | features = extract_features(model, np.array(imgs) , bbox, dataset='pennaction', debug=False) 110 | dataset['features'].append(features) 111 | 112 | for k in dataset.keys(): 113 | dataset[k] = np.array(dataset[k]) 114 | for k in dataset.keys(): 115 | dataset[k] = np.concatenate(dataset[k]) 116 | 117 | return dataset 118 | 119 | 120 | if __name__ == '__main__': 121 | parser = argparse.ArgumentParser() 122 | parser.add_argument('--dir', type=str, help='dataset directory', default='data/pennaction') 123 | args = parser.parse_args() 124 | 125 | dataset = read_data(args.dir) 
126 | joblib.dump(dataset, osp.join(VIBE_DB_DIR, 'pennaction_train_db.pt')) 127 | 128 | -------------------------------------------------------------------------------- /lib/utils/smooth_bbox.py: -------------------------------------------------------------------------------- 1 | # This script is borrowed from https://github.com/akanazawa/human_dynamics/blob/master/src/util/smooth_bbox.py 2 | # Adhere to their licence to use this script 3 | 4 | import numpy as np 5 | import scipy.signal as signal 6 | from scipy.ndimage.filters import gaussian_filter1d 7 | 8 | 9 | def get_smooth_bbox_params(kps, vis_thresh=2, kernel_size=11, sigma=3): 10 | """ 11 | Computes smooth bounding box parameters from keypoints: 12 | 1. Computes bbox by rescaling the person to be around 150 px. 13 | 2. Linearly interpolates bbox params for missing annotations. 14 | 3. Median filtering 15 | 4. Gaussian filtering. 16 | 17 | Recommended thresholds: 18 | * detect-and-track: 0 19 | * 3DPW: 0.1 20 | 21 | Args: 22 | kps (list): List of kps (Nx3) or None. 23 | vis_thresh (float): Threshold for visibility. 24 | kernel_size (int): Kernel size for median filtering (must be odd). 25 | sigma (float): Sigma for gaussian smoothing. 26 | 27 | Returns: 28 | Smooth bbox params [cx, cy, scale], start index, end index 29 | """ 30 | bbox_params, start, end = get_all_bbox_params(kps, vis_thresh) 31 | smoothed = smooth_bbox_params(bbox_params, kernel_size, sigma) 32 | smoothed = np.vstack((np.zeros((start, 3)), smoothed)) 33 | return smoothed, start, end 34 | 35 | 36 | def kp_to_bbox_param(kp, vis_thresh): 37 | """ 38 | Finds the bounding box parameters from the 2D keypoints. 39 | 40 | Args: 41 | kp (Kx3): 2D Keypoints. 42 | vis_thresh (float): Threshold for visibility. 43 | 44 | Returns: 45 | [center_x, center_y, scale] 46 | """ 47 | if kp is None: 48 | return 49 | vis = kp[:, 2] > vis_thresh 50 | if not np.any(vis): 51 | return 52 | min_pt = np.min(kp[vis, :2], axis=0) 53 | max_pt = np.max(kp[vis, :2], axis=0) 54 | person_height = np.linalg.norm(max_pt - min_pt) 55 | if person_height < 0.5: 56 | return 57 | center = (min_pt + max_pt) / 2. 58 | scale = 150. / person_height 59 | return np.append(center, scale) 60 | 61 | 62 | def get_all_bbox_params(kps, vis_thresh=2): 63 | """ 64 | Finds bounding box parameters for all keypoints. 65 | 66 | Look for sequences in the middle with no predictions and linearly 67 | interpolate the bbox params for those 68 | 69 | Args: 70 | kps (list): List of kps (Kx3) or None. 71 | vis_thresh (float): Threshold for visibility. 72 | 73 | Returns: 74 | bbox_params, start_index (incl), end_index (excl) 75 | """ 76 | # keeps track of how many indices in a row with no prediction 77 | num_to_interpolate = 0 78 | start_index = -1 79 | bbox_params = np.empty(shape=(0, 3), dtype=np.float32) 80 | 81 | for i, kp in enumerate(kps): 82 | bbox_param = kp_to_bbox_param(kp, vis_thresh=vis_thresh) 83 | if bbox_param is None: 84 | num_to_interpolate += 1 85 | continue 86 | 87 | if start_index == -1: 88 | # Found the first index with a prediction! 89 | start_index = i 90 | num_to_interpolate = 0 91 | 92 | if num_to_interpolate > 0: 93 | # Linearly interpolate each param. 
94 | previous = bbox_params[-1] 95 | # This will be 3x(n+2) 96 | interpolated = np.array( 97 | [np.linspace(prev, curr, num_to_interpolate + 2) 98 | for prev, curr in zip(previous, bbox_param)]) 99 | bbox_params = np.vstack((bbox_params, interpolated.T[1:-1])) 100 | num_to_interpolate = 0 101 | bbox_params = np.vstack((bbox_params, bbox_param)) 102 | 103 | return bbox_params, start_index, i - num_to_interpolate + 1 104 | 105 | 106 | def smooth_bbox_params(bbox_params, kernel_size=11, sigma=8): 107 | """ 108 | Applies median filtering and then gaussian filtering to bounding box 109 | parameters. 110 | 111 | Args: 112 | bbox_params (Nx3): [cx, cy, scale]. 113 | kernel_size (int): Kernel size for median filtering (must be odd). 114 | sigma (float): Sigma for gaussian smoothing. 115 | 116 | Returns: 117 | Smoothed bounding box parameters (Nx3). 118 | """ 119 | smoothed = np.array([signal.medfilt(param, kernel_size) 120 | for param in bbox_params.T]).T 121 | return np.array([gaussian_filter1d(traj, sigma) for traj in smoothed.T]).T 122 | -------------------------------------------------------------------------------- /lib/utils/renderer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is 4 | # holder of all proprietary rights on this computer program. 5 | # You can only use this computer program if you have closed 6 | # a license agreement with MPG or you get the right to use the computer 7 | # program from someone who is authorized to grant you that right. 8 | # Any use of the computer program without a valid license is prohibited and 9 | # liable to prosecution. 10 | # 11 | # Copyright©2019 Max-Planck-Gesellschaft zur Förderung 12 | # der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute 13 | # for Intelligent Systems. All rights reserved. 
14 | # 15 | # Contact: ps-license@tuebingen.mpg.de 16 | 17 | import math 18 | import trimesh 19 | import pyrender 20 | import numpy as np 21 | from pyrender.constants import RenderFlags 22 | from lib.models.smpl import get_smpl_faces 23 | 24 | 25 | class WeakPerspectiveCamera(pyrender.Camera): 26 | def __init__(self, 27 | scale, 28 | translation, 29 | znear=pyrender.camera.DEFAULT_Z_NEAR, 30 | zfar=None, 31 | name=None): 32 | super(WeakPerspectiveCamera, self).__init__( 33 | znear=znear, 34 | zfar=zfar, 35 | name=name, 36 | ) 37 | self.scale = scale 38 | self.translation = translation 39 | 40 | def get_projection_matrix(self, width=None, height=None): 41 | P = np.eye(4) 42 | P[0, 0] = self.scale[0] 43 | P[1, 1] = self.scale[1] 44 | P[0, 3] = self.translation[0] * self.scale[0] 45 | P[1, 3] = -self.translation[1] * self.scale[1] 46 | P[2, 2] = -1 47 | return P 48 | 49 | 50 | class Renderer: 51 | def __init__(self, resolution=(224,224), orig_img=False, wireframe=False): 52 | self.resolution = resolution 53 | 54 | self.faces = get_smpl_faces() 55 | self.orig_img = orig_img 56 | self.wireframe = wireframe 57 | self.renderer = pyrender.OffscreenRenderer( 58 | viewport_width=self.resolution[0], 59 | viewport_height=self.resolution[1], 60 | point_size=1.0 61 | ) 62 | 63 | # set the scene 64 | self.scene = pyrender.Scene(bg_color=[0.0, 0.0, 0.0, 0.0], ambient_light=(0.3, 0.3, 0.3)) 65 | 66 | light = pyrender.PointLight(color=[1.0, 1.0, 1.0], intensity=1) 67 | 68 | light_pose = np.eye(4) 69 | light_pose[:3, 3] = [0, -1, 1] 70 | self.scene.add(light, pose=light_pose) 71 | 72 | light_pose[:3, 3] = [0, 1, 1] 73 | self.scene.add(light, pose=light_pose) 74 | 75 | light_pose[:3, 3] = [1, 1, 2] 76 | self.scene.add(light, pose=light_pose) 77 | 78 | def render(self, img, verts, cam, angle=None, axis=None, mesh_filename=None, color=[1.0, 1.0, 0.9]): 79 | 80 | mesh = trimesh.Trimesh(vertices=verts, faces=self.faces, process=False) 81 | 82 | Rx = trimesh.transformations.rotation_matrix(math.radians(180), [1, 0, 0]) 83 | mesh.apply_transform(Rx) 84 | 85 | if mesh_filename is not None: 86 | mesh.export(mesh_filename) 87 | 88 | if angle and axis: 89 | R = trimesh.transformations.rotation_matrix(math.radians(angle), axis) 90 | mesh.apply_transform(R) 91 | 92 | sx, sy, tx, ty = cam 93 | 94 | camera = WeakPerspectiveCamera( 95 | scale=[sx, sy], 96 | translation=[tx, ty], 97 | zfar=1000. 
98 | ) 99 | 100 | material = pyrender.MetallicRoughnessMaterial( 101 | metallicFactor=0.0, 102 | alphaMode='OPAQUE', 103 | baseColorFactor=(color[0], color[1], color[2], 1.0) 104 | ) 105 | 106 | mesh = pyrender.Mesh.from_trimesh(mesh, material=material) 107 | 108 | mesh_node = self.scene.add(mesh, 'mesh') 109 | 110 | camera_pose = np.eye(4) 111 | cam_node = self.scene.add(camera, pose=camera_pose) 112 | 113 | if self.wireframe: 114 | render_flags = RenderFlags.RGBA | RenderFlags.ALL_WIREFRAME 115 | else: 116 | render_flags = RenderFlags.RGBA 117 | 118 | rgb, _ = self.renderer.render(self.scene, flags=render_flags) 119 | valid_mask = (rgb[:, :, -1] > 0)[:, :, np.newaxis] 120 | output_img = rgb[:, :, :-1] * valid_mask + (1 - valid_mask) * img 121 | image = output_img.astype(np.uint8) 122 | 123 | self.scene.remove_node(mesh_node) 124 | self.scene.remove_node(cam_node) 125 | 126 | return image 127 | -------------------------------------------------------------------------------- /lib/data_utils/amass_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is 4 | # holder of all proprietary rights on this computer program. 5 | # You can only use this computer program if you have closed 6 | # a license agreement with MPG or you get the right to use the computer 7 | # program from someone who is authorized to grant you that right. 8 | # Any use of the computer program without a valid license is prohibited and 9 | # liable to prosecution. 10 | # 11 | # Copyright©2019 Max-Planck-Gesellschaft zur Förderung 12 | # der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute 13 | # for Intelligent Systems. All rights reserved. 
14 | # 15 | # Contact: ps-license@tuebingen.mpg.de 16 | 17 | import os 18 | import joblib 19 | import argparse 20 | import numpy as np 21 | import os.path as osp 22 | from tqdm import tqdm 23 | 24 | from lib.core.config import VIBE_DB_DIR 25 | 26 | dict_keys = ['betas', 'dmpls', 'gender', 'mocap_framerate', 'poses', 'trans'] 27 | 28 | # extract SMPL joints from SMPL-H model 29 | joints_to_use = np.array([ 30 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 31 | 11, 12, 13, 14, 15, 16, 17, 18, 19, 32 | 20, 21, 22, 37 33 | ]) 34 | joints_to_use = np.arange(0,156).reshape((-1,3))[joints_to_use].reshape(-1) 35 | 36 | all_sequences = [ 37 | 'ACCAD', 38 | 'BioMotionLab_NTroje', 39 | 'CMU', 40 | 'EKUT', 41 | 'Eyes_Japan_Dataset', 42 | 'HumanEva', 43 | 'KIT', 44 | 'MPI_HDM05', 45 | 'MPI_Limits', 46 | 'MPI_mosh', 47 | 'SFU', 48 | 'SSM_synced', 49 | 'TCD_handMocap', 50 | 'TotalCapture', 51 | 'Transitions_mocap', 52 | ] 53 | 54 | def read_data(folder, sequences): 55 | # sequences = [osp.join(folder, x) for x in sorted(os.listdir(folder)) if osp.isdir(osp.join(folder, x))] 56 | 57 | if sequences == 'all': 58 | sequences = all_sequences 59 | 60 | db = { 61 | 'theta': [], 62 | 'vid_name': [], 63 | } 64 | 65 | for seq_name in sequences: 66 | print(f'Reading {seq_name} sequence...') 67 | seq_folder = osp.join(folder, seq_name) 68 | 69 | thetas, vid_names = read_single_sequence(seq_folder, seq_name) 70 | seq_name_list = np.array([seq_name]*thetas.shape[0]) 71 | print(seq_name, 'number of videos', thetas.shape[0]) 72 | db['theta'].append(thetas) 73 | db['vid_name'].append(vid_names) 74 | 75 | db['theta'] = np.concatenate(db['theta'], axis=0) 76 | db['vid_name'] = np.concatenate(db['vid_name'], axis=0) 77 | 78 | return db 79 | 80 | 81 | 82 | def read_single_sequence(folder, seq_name): 83 | subjects = os.listdir(folder) 84 | 85 | thetas = [] 86 | vid_names = [] 87 | 88 | for subject in tqdm(subjects): 89 | actions = [x for x in os.listdir(osp.join(folder, subject)) if x.endswith('.npz')] 90 | 91 | for action in actions: 92 | fname = osp.join(folder, subject, action) 93 | 94 | if fname.endswith('shape.npz'): 95 | continue 96 | 97 | data = np.load(fname) 98 | 99 | pose = data['poses'][:, joints_to_use] 100 | 101 | if pose.shape[0] < 60: 102 | continue 103 | 104 | shape = np.repeat(data['betas'][:10][np.newaxis], pose.shape[0], axis=0) 105 | theta = np.concatenate([pose,shape], axis=1) 106 | vid_name = np.array([f'{seq_name}_{subject}_{action[:-4]}']*pose.shape[0]) 107 | 108 | vid_names.append(vid_name) 109 | thetas.append(theta) 110 | 111 | return np.concatenate(thetas, axis=0), np.concatenate(vid_names, axis=0) 112 | 113 | 114 | def read_seq_data(folder, nsubjects, fps): 115 | subjects = os.listdir(folder) 116 | sequences = {} 117 | 118 | assert nsubjects < len(subjects), 'nsubjects should be less than len(subjects)' 119 | 120 | for subject in subjects[:nsubjects]: 121 | actions = os.listdir(osp.join(folder, subject)) 122 | 123 | for action in actions: 124 | data = np.load(osp.join(folder, subject, action)) 125 | mocap_framerate = int(data['mocap_framerate']) 126 | sampling_freq = mocap_framerate // fps 127 | sequences[(subject, action)] = data['poses'][0::sampling_freq, joints_to_use] 128 | 129 | train_set = {} 130 | test_set = {} 131 | 132 | for i, (k,v) in enumerate(sequences.items()): 133 | if i < len(sequences.keys()) - len(sequences.keys()) // 4: 134 | train_set[k] = v 135 | else: 136 | test_set[k] = v 137 | 138 | return train_set, test_set 139 | 140 | if __name__ == '__main__': 141 | parser = 
argparse.ArgumentParser()
142 |     parser.add_argument('--dir', type=str, help='dataset directory', default='data/amass')
143 |     args = parser.parse_args()
144 | 
145 |     db = read_data(args.dir, sequences=all_sequences)
146 |     db_file = osp.join(VIBE_DB_DIR, 'amass_db.pt')
147 |     print(f'Saving AMASS dataset to {db_file}')
148 |     joblib.dump(db, db_file)
149 | 
--------------------------------------------------------------------------------
/lib/dataset/dataset_2d.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is
4 | # holder of all proprietary rights on this computer program.
5 | # You can only use this computer program if you have closed
6 | # a license agreement with MPG or you get the right to use the computer
7 | # program from someone who is authorized to grant you that right.
8 | # Any use of the computer program without a valid license is prohibited and
9 | # liable to prosecution.
10 | #
11 | # Copyright©2019 Max-Planck-Gesellschaft zur Förderung
12 | # der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute
13 | # for Intelligent Systems. All rights reserved.
14 | #
15 | # Contact: ps-license@tuebingen.mpg.de
16 | 
17 | import os
18 | import torch
19 | import random
20 | import logging
21 | import numpy as np
22 | import os.path as osp
23 | import joblib
24 | 
25 | from torch.utils.data import Dataset
26 | 
27 | from lib.core.config import VIBE_DB_DIR
28 | from lib.data_utils.kp_utils import convert_kps
29 | from lib.data_utils.img_utils import normalize_2d_kp, transfrom_keypoints, split_into_chunks
30 | 
31 | logger = logging.getLogger(__name__)
32 | 
33 | class Dataset2D(Dataset):
34 |     def __init__(self, seqlen, overlap=0.,
35 |                  folder=None, dataset_name=None, debug=False):
36 | 
37 |         self.folder = folder
38 |         self.dataset_name = dataset_name
39 |         self.seqlen = seqlen
40 |         self.stride = int(seqlen * (1-overlap))
41 |         self.debug = debug
42 |         self.db = self.load_db()
43 |         self.vid_indices = split_into_chunks(self.db['vid_name'], self.seqlen, self.stride)
44 | 
45 | 
46 |     def __len__(self):
47 |         return len(self.vid_indices)
48 | 
49 |     def __getitem__(self, index):
50 |         return self.get_single_item(index)
51 | 
52 |     def load_db(self):
53 |         split = 'train'
54 | 
55 |         db_file = osp.join(VIBE_DB_DIR, f'{self.dataset_name}_{split}_db.pt')
56 | 
57 |         if osp.isfile(db_file):
58 |             db = joblib.load(db_file)
59 |         else:
60 |             raise ValueError(f'{db_file} does not exist')
61 | 
62 |         print(f'Loaded {self.dataset_name} dataset from {db_file}')
63 |         return db
64 | 
65 |     def get_single_item(self, index):
66 |         start_index, end_index = self.vid_indices[index]
67 | 
68 |         kp_2d = self.db['joints2D'][start_index:end_index+1]
69 |         if self.dataset_name != 'posetrack':
70 |             kp_2d = convert_kps(kp_2d, src=self.dataset_name, dst='spin')
71 |         kp_2d_tensor = np.ones((self.seqlen, 49, 3), dtype=np.float16)
72 | 
73 |         bbox = self.db['bbox'][start_index:end_index+1]
74 | 
75 |         input = torch.from_numpy(self.db['features'][start_index:end_index+1]).float()
76 | 
77 | 
78 |         for idx in range(self.seqlen):
79 |             # crop image and transform 2d keypoints
80 |             kp_2d[idx,:,:2], trans = transfrom_keypoints(
81 |                 kp_2d=kp_2d[idx,:,:2],
82 |                 center_x=bbox[idx,0],
83 |                 center_y=bbox[idx,1],
84 |                 width=bbox[idx,2],
85 |                 height=bbox[idx,3],
86 |                 patch_width=224,
87 |                 patch_height=224,
88 |                 do_augment=False,
89 |             )
90 | 
91 |             kp_2d[idx,:,:2] = normalize_2d_kp(kp_2d[idx,:,:2], 224)
92 |             kp_2d_tensor[idx] 
= kp_2d[idx] 93 | 94 | vid_name = self.db['vid_name'][start_index:end_index+1] 95 | frame_id = self.db['img_name'][start_index:end_index+1].astype(str) 96 | instance_id = np.array([v+f for v,f in zip(vid_name, frame_id)]) 97 | 98 | target = { 99 | 'features': input, 100 | 'kp_2d': torch.from_numpy(kp_2d_tensor).float(), # 2D keypoints transformed according to bbox cropping 101 | # 'instance_id': instance_id, 102 | } 103 | 104 | if self.debug: 105 | from lib.data_utils.img_utils import get_single_image_crop 106 | 107 | vid_name = self.db['vid_name'][start_index] 108 | 109 | if self.dataset_name == 'pennaction': 110 | vid_folder = "frames" 111 | vid_name = vid_name.split('/')[-1].split('.')[0] 112 | img_id = "img_name" 113 | elif self.dataset_name == 'posetrack': 114 | vid_folder = osp.join('images', vid_name.split('/')[-2]) 115 | vid_name = vid_name.split('/')[-1].split('.')[0] 116 | img_id = "img_name" 117 | else: 118 | vid_name = '_'.join(vid_name.split('_')[:-1]) 119 | vid_folder = 'imageFiles' 120 | img_id= 'frame_id' 121 | f = osp.join(self.folder, vid_folder, vid_name) 122 | video_file_list = [osp.join(f, x) for x in sorted(os.listdir(f)) if x.endswith('.jpg')] 123 | frame_idxs = self.db[img_id][start_index:end_index + 1] 124 | if self.dataset_name == 'pennaction' or self.dataset_name == 'posetrack': 125 | video = frame_idxs 126 | else: 127 | video = [video_file_list[i] for i in frame_idxs] 128 | 129 | video = torch.cat( 130 | [get_single_image_crop(image, bbox).unsqueeze(0) for image, bbox in zip(video, bbox)], dim=0 131 | ) 132 | 133 | target['video'] = video 134 | 135 | return target 136 | 137 | 138 | -------------------------------------------------------------------------------- /lib/utils/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is 4 | # holder of all proprietary rights on this computer program. 5 | # You can only use this computer program if you have closed 6 | # a license agreement with MPG or you get the right to use the computer 7 | # program from someone who is authorized to grant you that right. 8 | # Any use of the computer program without a valid license is prohibited and 9 | # liable to prosecution. 10 | # 11 | # Copyright©2019 Max-Planck-Gesellschaft zur Förderung 12 | # der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute 13 | # for Intelligent Systems. All rights reserved. 
14 | # 15 | # Contact: ps-license@tuebingen.mpg.de 16 | 17 | import os 18 | import yaml 19 | import time 20 | import torch 21 | import shutil 22 | import logging 23 | import operator 24 | from tqdm import tqdm 25 | from os import path as osp 26 | from functools import reduce 27 | from typing import List, Union 28 | 29 | 30 | def move_dict_to_device(dict, device, tensor2float=False): 31 | for k,v in dict.items(): 32 | if isinstance(v, torch.Tensor): 33 | if tensor2float: 34 | dict[k] = v.float().to(device) 35 | else: 36 | dict[k] = v.to(device) 37 | 38 | 39 | def get_from_dict(dict, keys): 40 | return reduce(operator.getitem, keys, dict) 41 | 42 | 43 | def tqdm_enumerate(iter): 44 | i = 0 45 | for y in tqdm(iter): 46 | yield i, y 47 | i += 1 48 | 49 | 50 | def iterdict(d): 51 | for k,v in d.items(): 52 | if isinstance(v, dict): 53 | d[k] = dict(v) 54 | iterdict(v) 55 | return d 56 | 57 | 58 | def accuracy(output, target): 59 | _, pred = output.topk(1) 60 | pred = pred.view(-1) 61 | 62 | correct = pred.eq(target).sum() 63 | 64 | return correct.item(), target.size(0) - correct.item() 65 | 66 | 67 | def lr_decay(optimizer, step, lr, decay_step, gamma): 68 | lr = lr * gamma ** (step/decay_step) 69 | for param_group in optimizer.param_groups: 70 | param_group['lr'] = lr 71 | return lr 72 | 73 | 74 | def step_decay(optimizer, step, lr, decay_step, gamma): 75 | lr = lr * gamma ** (step / decay_step) 76 | for param_group in optimizer.param_groups: 77 | param_group['lr'] = lr 78 | return lr 79 | 80 | 81 | def read_yaml(filename): 82 | return yaml.load(open(filename, 'r')) 83 | 84 | 85 | def write_yaml(filename, object): 86 | with open(filename, 'w') as f: 87 | yaml.dump(object, f) 88 | 89 | 90 | def save_dict_to_yaml(obj, filename, mode='w'): 91 | with open(filename, mode) as f: 92 | yaml.dump(obj, f, default_flow_style=False) 93 | 94 | 95 | def save_to_file(obj, filename, mode='w'): 96 | with open(filename, mode) as f: 97 | f.write(obj) 98 | 99 | 100 | def concatenate_dicts(dict_list, dim=0): 101 | rdict = dict.fromkeys(dict_list[0].keys()) 102 | for k in rdict.keys(): 103 | rdict[k] = torch.cat([d[k] for d in dict_list], dim=dim) 104 | return rdict 105 | 106 | 107 | def bool_to_string(x: Union[List[bool],bool]) -> Union[List[str],str]: 108 | """ 109 | boolean to string conversion 110 | :param x: list or bool to be converted 111 | :return: string converted thing 112 | """ 113 | if isinstance(x, bool): 114 | return [str(x)] 115 | for i, j in enumerate(x): 116 | x[i]=str(j) 117 | return x 118 | 119 | 120 | def checkpoint2model(checkpoint, key='gen_state_dict'): 121 | state_dict = checkpoint[key] 122 | print(f'Performance of loaded model on 3DPW is {checkpoint["performance"]:.2f}mm') 123 | # del state_dict['regressor.mean_theta'] 124 | return state_dict 125 | 126 | 127 | def get_optimizer(model, optim_type, lr, weight_decay, momentum): 128 | if optim_type in ['sgd', 'SGD']: 129 | opt = torch.optim.SGD(lr=lr, params=model.parameters(), momentum=momentum) 130 | elif optim_type in ['Adam', 'adam', 'ADAM']: 131 | opt = torch.optim.Adam(lr=lr, params=model.parameters(), weight_decay=weight_decay) 132 | else: 133 | raise ModuleNotFoundError 134 | return opt 135 | 136 | 137 | def create_logger(logdir, phase='train'): 138 | os.makedirs(logdir, exist_ok=True) 139 | 140 | log_file = osp.join(logdir, f'{phase}_log.txt') 141 | 142 | head = '%(asctime)-15s %(message)s' 143 | logging.basicConfig(filename=log_file, 144 | format=head) 145 | logger = logging.getLogger() 146 | logger.setLevel(logging.INFO) 147 | 
console = logging.StreamHandler() 148 | logging.getLogger('').addHandler(console) 149 | 150 | return logger 151 | 152 | 153 | class AverageMeter(object): 154 | def __init__(self): 155 | self.val = 0 156 | self.avg = 0 157 | self.sum = 0 158 | self.count = 0 159 | 160 | def update(self, val, n=1): 161 | self.val = val 162 | self.sum += val * n 163 | self.count += n 164 | self.avg = self.sum / self.count 165 | 166 | 167 | def prepare_output_dir(cfg, cfg_file): 168 | 169 | # ==== create logdir 170 | logtime = time.strftime('%d-%m-%Y_%H-%M-%S') 171 | logdir = f'{logtime}_{cfg.EXP_NAME}' 172 | 173 | logdir = osp.join(cfg.OUTPUT_DIR, logdir) 174 | os.makedirs(logdir, exist_ok=True) 175 | shutil.copy(src=cfg_file, dst=osp.join(cfg.OUTPUT_DIR, 'config.yaml')) 176 | 177 | cfg.LOGDIR = logdir 178 | 179 | # save config 180 | save_dict_to_yaml(cfg, osp.join(cfg.LOGDIR, 'config.yaml')) 181 | 182 | return cfg 183 | -------------------------------------------------------------------------------- /lib/core/evaluate.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is 4 | # holder of all proprietary rights on this computer program. 5 | # You can only use this computer program if you have closed 6 | # a license agreement with MPG or you get the right to use the computer 7 | # program from someone who is authorized to grant you that right. 8 | # Any use of the computer program without a valid license is prohibited and 9 | # liable to prosecution. 10 | # 11 | # Copyright©2019 Max-Planck-Gesellschaft zur Förderung 12 | # der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute 13 | # for Intelligent Systems. All rights reserved. 
14 | #
15 | # Contact: ps-license@tuebingen.mpg.de
16 | 
17 | import time
18 | import torch
19 | import shutil
20 | import logging
21 | import numpy as np
22 | import os.path as osp
23 | from progress.bar import Bar
24 | 
25 | from lib.core.config import VIBE_DATA_DIR
26 | from lib.utils.utils import move_dict_to_device, AverageMeter
27 | 
28 | from lib.utils.eval_utils import (
29 |     compute_accel,
30 |     compute_error_accel,
31 |     compute_error_verts,
32 |     batch_compute_similarity_transform_torch,
33 | )
34 | 
35 | logger = logging.getLogger(__name__)
36 | 
37 | class Evaluator():
38 |     def __init__(
39 |             self,
40 |             test_loader,
41 |             model,
42 |             device=None,
43 |     ):
44 |         self.test_loader = test_loader
45 |         self.model = model
46 |         self.device = device
47 | 
48 |         self.evaluation_accumulators = dict.fromkeys(['pred_j3d', 'target_j3d', 'target_theta', 'pred_verts'])
49 | 
50 |         if self.device is None:
51 |             self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
52 | 
53 |     def validate(self):
54 |         self.model.eval()
55 | 
56 |         start = time.time()
57 | 
58 |         summary_string = ''
59 | 
60 |         bar = Bar('Validation', fill='#', max=len(self.test_loader))
61 | 
62 |         if self.evaluation_accumulators is not None:
63 |             for k,v in self.evaluation_accumulators.items():
64 |                 self.evaluation_accumulators[k] = []
65 | 
66 |         J_regressor = torch.from_numpy(np.load(osp.join(VIBE_DATA_DIR, 'J_regressor_h36m.npy'))).float()
67 | 
68 |         for i, target in enumerate(self.test_loader):
69 | 
70 |             # video = video.to(self.device)
71 |             move_dict_to_device(target, self.device)
72 | 
73 |             # <=============
74 |             with torch.no_grad():
75 |                 inp = target['features']
76 | 
77 |                 preds = self.model(inp, J_regressor=J_regressor)
78 | 
79 |                 # convert to 14 keypoint format for evaluation
80 |                 # if self.use_spin:
81 |                 n_kp = preds[-1]['kp_3d'].shape[-2]
82 |                 pred_j3d = preds[-1]['kp_3d'].view(-1, n_kp, 3).cpu().numpy()
83 |                 target_j3d = target['kp_3d'].view(-1, n_kp, 3).cpu().numpy()
84 |                 pred_verts = preds[-1]['verts'].view(-1, 6890, 3).cpu().numpy()
85 |                 target_theta = target['theta'].view(-1, 85).cpu().numpy()
86 | 
87 | 
88 |                 self.evaluation_accumulators['pred_verts'].append(pred_verts)
89 |                 self.evaluation_accumulators['target_theta'].append(target_theta)
90 | 
91 |                 self.evaluation_accumulators['pred_j3d'].append(pred_j3d)
92 |                 self.evaluation_accumulators['target_j3d'].append(target_j3d)
93 |             # =============>
94 | 
95 |             batch_time, start = time.time() - start, time.time()  # per-batch wall time, then reset the timer
96 | 
97 |             summary_string = f'({i + 1}/{len(self.test_loader)}) | batch: {batch_time * 1000.0:.4}ms | ' \
98 |                              f'Total: {bar.elapsed_td} | ETA: {bar.eta_td:}'
99 | 
100 |             bar.suffix = summary_string
101 |             bar.next()
102 | 
103 |         bar.finish()
104 | 
105 |         logger.info(summary_string)
106 | 
107 |     def evaluate(self):
108 | 
109 |         for k, v in self.evaluation_accumulators.items():
110 |             self.evaluation_accumulators[k] = np.vstack(v)
111 | 
112 |         pred_j3ds = self.evaluation_accumulators['pred_j3d']
113 |         target_j3ds = self.evaluation_accumulators['target_j3d']
114 | 
115 |         pred_j3ds = torch.from_numpy(pred_j3ds).float()
116 |         target_j3ds = torch.from_numpy(target_j3ds).float()
117 | 
118 |         print(f'Evaluating on {pred_j3ds.shape[0]} poses...')
119 |         pred_pelvis = (pred_j3ds[:,[2],:] + pred_j3ds[:,[3],:]) / 2.0
120 |         target_pelvis = (target_j3ds[:,[2],:] + target_j3ds[:,[3],:]) / 2.0
121 | 
122 | 
123 |         pred_j3ds -= pred_pelvis
124 |         target_j3ds -= target_pelvis
125 | 
126 |         # Absolute error (MPJPE)
127 |         errors = torch.sqrt(((pred_j3ds - target_j3ds) ** 2).sum(dim=-1)).mean(dim=-1).cpu().numpy()
128 |         S1_hat = batch_compute_similarity_transform_torch(pred_j3ds, target_j3ds)
129 |         errors_pa = torch.sqrt(((S1_hat - target_j3ds) ** 2).sum(dim=-1)).mean(dim=-1).cpu().numpy()
130 |         pred_verts = self.evaluation_accumulators['pred_verts']
131 |         target_theta = self.evaluation_accumulators['target_theta']
132 | 
133 |         m2mm = 1000
134 | 
135 |         pve = np.mean(compute_error_verts(target_theta=target_theta, pred_verts=pred_verts)) * m2mm
136 |         accel = np.mean(compute_accel(pred_j3ds)) * m2mm
137 |         accel_err = np.mean(compute_error_accel(joints_pred=pred_j3ds, joints_gt=target_j3ds)) * m2mm
138 |         mpjpe = np.mean(errors) * m2mm
139 |         pa_mpjpe = np.mean(errors_pa) * m2mm
140 | 
141 |         eval_dict = {
142 |             'mpjpe': mpjpe,
143 |             'pa-mpjpe': pa_mpjpe,
144 |             'pve': pve,
145 |             'accel': accel,
146 |             'accel_err': accel_err
147 |         }
148 | 
149 |         log_str = ' '.join([f'{k.upper()}: {v:.4f},' for k, v in eval_dict.items()])
150 |         print(log_str)
151 | 
152 |     def run(self):
153 |         self.validate()
154 |         self.evaluate()
--------------------------------------------------------------------------------
/doc/demo.md:
--------------------------------------------------------------------------------
1 | # Demo
2 | 
3 | ## Flags
4 | 
5 | - `--vid_file (str)`: Path to the input video file or a YouTube link. If you provide a YouTube link, it will be downloaded
6 | to a temporary folder and then processed.
7 | 
8 | - `--output_folder (str)`: Path to the folder where the VIBE predictions and output renderings are stored.
9 | 
10 | - `--tracking_method (str), default=bbox`: Defines the tracking method used to compute bboxes and tracklets of people in the input video.
11 | Available options are `bbox` or `pose`. `bbox` tracking is available [here](https://github.com/mkocabas/multi-person-tracker)
12 | as a standalone python package. For `pose` tracking, you need to install
13 | [STAF](https://github.com/soulslicer/openpose/tree/staf), an extension of OpenPose to
14 | multi-person pose tracking, introduced in [1](#references).
15 | 
16 | - `--detector (str), default=yolo`: Defines the type of detector used by the `bbox` tracking method if enabled. Available options are
17 | `maskrcnn` and `yolo`. `maskrcnn` is more accurate but slower than `yolo`. Refer to the [speed comparison](demo.md#runtime-performance) for further information.
18 | 
19 | - `--yolo_img_size (int), default=416`: Input image size of the YOLO detector.
20 | 
21 | - `--tracker_batch_size (int), default=12`: Batch size of the bbox tracker. If you get a memory error, reduce it.
22 | 
23 | - `--staf_dir (str)`: Path to the folder where the STAF pose tracker is installed. This path should point to the main directory of STAF.
24 | 
25 | - `--vibe_batch_size (int), default=450`: Batch size of the VIBE model.
26 | 
27 | - `--display`: Enable this flag if you want to visualize the output of tracking and pose & shape estimation interactively.
28 | 
29 | - `--run_smplify`: Enable this flag if you want to refine the results of VIBE using the Temporal SMPLify algorithm.
30 | For this option, you have to set `--tracking_method` to `pose`.
31 | 
32 | - `--no_render`: This flag disables the final rendering of VIBE results. Useful if you only want to get the VIBE predictions.
33 | 
34 | - `--wireframe`: Enable this if you would like to render wireframe meshes in the final rendering.
35 | 
36 | - `--sideview`: Render the output meshes from an alternate viewpoint. The default alternate viewpoint is -90 degrees about the y-axis.
37 | Note that this option doubles the rendering time.
38 | 
39 | - `--save_obj`: Save output meshes as .obj files.
40 | 
41 | ## Examples
42 | - Run VIBE on a video file using the bbox tracker and visualize the results with wireframe meshes:
43 | ```bash
44 | python demo.py --vid_file sample_video.mp4 --output_folder output/ --tracking_method bbox --detector maskrcnn --display --wireframe
45 | ```
46 | 
47 | - Run VIBE using the pose tracker and run Temporal SMPLify to further refine the predictions:
48 | ```bash
49 | python demo.py --vid_file sample_video.mp4 --output_folder output/ --tracking_method pose --display --run_smplify
50 | ```
51 | 
52 | - Change the default batch sizes to avoid possible memory errors:
53 | ```bash
54 | python demo.py --vid_file sample_video.mp4 --output_folder output/ --tracker_batch_size 2 --vibe_batch_size 64
55 | ```
56 | 
57 | ## Output Format
58 | 
59 | If the demo finishes successfully, it creates a file named `vibe_output.pkl` in the `--output_folder`.
60 | We can inspect the contents of this file as follows:
61 | 
62 | ```python
63 | >>> import joblib # you may use native pickle here as well
64 | 
65 | >>> output = joblib.load('output/group_dance/vibe_output.pkl')
66 | 
67 | >>> print(output.keys())
68 | 
69 | dict_keys([1, 2, 3, 4]) # these are the track ids for each subject appearing in the video
70 | 
71 | >>> for k,v in output[1].items(): print(k,v.shape)
72 | 
73 | pred_cam (n_frames, 3) # weak perspective camera parameters in cropped image space (s,tx,ty)
74 | orig_cam (n_frames, 4) # weak perspective camera parameters in original image space (sx,sy,tx,ty)
75 | verts (n_frames, 6890, 3) # SMPL mesh vertices
76 | pose (n_frames, 72) # SMPL pose parameters
77 | betas (n_frames, 10) # SMPL body shape parameters
78 | joints3d (n_frames, 49, 3) # SMPL 3D joints
79 | joints2d (n_frames, 21, 3) # 2D keypoint detections by STAF if pose tracking is enabled, otherwise None
80 | bboxes (n_frames, 4) # bbox detections (cx,cy,w,h)
81 | frame_ids (n_frames,) # frame ids in which the subject with tracking id #1 appears
82 | 
83 | ```
84 | You can find the names & order of the 3D joints [here](https://github.com/mkocabas/VIBE/blob/master/lib/data_utils/kp_utils.py#L212) and of the 2D joints [here](https://github.com/mkocabas/VIBE/blob/master/lib/data_utils/kp_utils.py#L187). A short sketch of aligning these per-track arrays with absolute frame indices is given after the runtime table below.
85 | 
86 | ## Runtime Performance
87 | Here is the breakdown of runtime speed per step, namely tracking and VIBE. These results are obtained by running VIBE
88 | on a [video](https://www.youtube.com/watch?v=Opry3F6aB1I) containing 5 people.
89 | 
90 | ```bash
91 | python demo.py --vid_file https://www.youtube.com/watch?v=Opry3F6aB1I --output_folder output/ --vibe_batch_size 32 --no_render
92 | ```
93 | 
94 | | Tracker | GPU | Tracking Time (ms/img) | Tracking FPS | VIBE Time (ms/img) | VIBE FPS | Total FPS |
95 | |-----------------|:---------:|:----------------------:|:------------:|:------------------:|:--------:|:---------:|
96 | | STAF-pose | RTX2080Ti | 23.2 | 43 | 16.1 | 61 | 21 |
97 | | MaskRCNN-bbox | RTX2080Ti | 68.0 | 15 | 16.1 | 61 | 11 |
98 | | YOLOv3-416-bbox | RTX2080Ti | 12.7 | 79 | 16.1 | 61 | 29 |
99 | | YOLOv3-608-bbox | RTX2080Ti | 22.2 | 45 | 16.1 | 61 | 23 |
100 | 
101 | **Note**: The table above does not include the time spent rendering the final output.
102 | We use pyrender with GPU acceleration, and rendering runs at only 2-3 FPS. Please let us know if you know of a faster alternative.
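## Aligning Predictions with Video Frames

Since a subject usually appears only in a subset of frames, `frame_ids` is what ties each per-track array back to the source video. Below is a minimal sketch of doing that alignment; the file path and track id follow the inspection example above, and only the dictionary keys documented in the output format are assumed:

```python
import joblib
import numpy as np

output = joblib.load('output/group_dance/vibe_output.pkl')  # path from the example above
track = output[1]  # predictions for the subject with track id 1

# joints3d[i] belongs to video frame frame_ids[i], not to frame i itself
frame_to_joints3d = {int(f): j for f, j in zip(track['frame_ids'], track['joints3d'])}

# e.g. count in how many frames of the full video this subject is visible
n_total = int(track['frame_ids'].max()) + 1
visible = np.zeros(n_total, dtype=bool)
visible[track['frame_ids']] = True
print(f"track 1 visible in {visible.sum()}/{n_total} frames")
```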
103 | 104 | ## References 105 | [1] Pose tracker is from [STAF implementation](https://github.com/soulslicer/openpose/tree/staf) 106 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is 4 | # holder of all proprietary rights on this computer program. 5 | # You can only use this computer program if you have closed 6 | # a license agreement with MPG or you get the right to use the computer 7 | # program from someone who is authorized to grant you that right. 8 | # Any use of the computer program without a valid license is prohibited and 9 | # liable to prosecution. 10 | # 11 | # Copyright©2019 Max-Planck-Gesellschaft zur Förderung 12 | # der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute 13 | # for Intelligent Systems. All rights reserved. 14 | # 15 | # Contact: ps-license@tuebingen.mpg.de 16 | 17 | import os 18 | os.environ['PYOPENGL_PLATFORM'] = 'egl' 19 | 20 | import torch 21 | import pprint 22 | import random 23 | import numpy as np 24 | import torch.backends.cudnn as cudnn 25 | from torch.utils.tensorboard import SummaryWriter 26 | 27 | from lib.core.loss import VIBELoss 28 | from lib.core.trainer import Trainer 29 | from lib.core.config import parse_args 30 | from lib.utils.utils import prepare_output_dir 31 | from lib.models import VIBE, MotionDiscriminator 32 | from lib.dataset.loaders import get_data_loaders 33 | from lib.utils.utils import create_logger, get_optimizer 34 | 35 | 36 | def main(cfg): 37 | if cfg.SEED_VALUE >= 0: 38 | print(f'Seed value for the experiment {cfg.SEED_VALUE}') 39 | os.environ['PYTHONHASHSEED'] = str(cfg.SEED_VALUE) 40 | random.seed(cfg.SEED_VALUE) 41 | torch.manual_seed(cfg.SEED_VALUE) 42 | np.random.seed(cfg.SEED_VALUE) 43 | 44 | logger = create_logger(cfg.LOGDIR, phase='train') 45 | 46 | logger.info(f'GPU name -> {torch.cuda.get_device_name()}') 47 | logger.info(f'GPU feat -> {torch.cuda.get_device_properties("cuda")}') 48 | 49 | logger.info(pprint.pformat(cfg)) 50 | 51 | # cudnn related setting 52 | cudnn.benchmark = cfg.CUDNN.BENCHMARK 53 | torch.backends.cudnn.deterministic = cfg.CUDNN.DETERMINISTIC 54 | torch.backends.cudnn.enabled = cfg.CUDNN.ENABLED 55 | 56 | writer = SummaryWriter(log_dir=cfg.LOGDIR) 57 | writer.add_text('config', pprint.pformat(cfg), 0) 58 | 59 | # ========= Dataloaders ========= # 60 | data_loaders = get_data_loaders(cfg) 61 | 62 | # ========= Compile Loss ========= # 63 | loss = VIBELoss( 64 | e_loss_weight=cfg.LOSS.KP_2D_W, 65 | e_3d_loss_weight=cfg.LOSS.KP_3D_W, 66 | e_pose_loss_weight=cfg.LOSS.POSE_W, 67 | e_shape_loss_weight=cfg.LOSS.SHAPE_W, 68 | d_motion_loss_weight=cfg.LOSS.D_MOTION_LOSS_W, 69 | ) 70 | 71 | # ========= Initialize networks, optimizers and lr_schedulers ========= # 72 | generator = VIBE( 73 | n_layers=cfg.MODEL.TGRU.NUM_LAYERS, 74 | batch_size=cfg.TRAIN.BATCH_SIZE, 75 | seqlen=cfg.DATASET.SEQLEN, 76 | hidden_size=cfg.MODEL.TGRU.HIDDEN_SIZE, 77 | pretrained=cfg.TRAIN.PRETRAINED_REGRESSOR, 78 | add_linear=cfg.MODEL.TGRU.ADD_LINEAR, 79 | bidirectional=cfg.MODEL.TGRU.BIDIRECTIONAL, 80 | use_residual=cfg.MODEL.TGRU.RESIDUAL, 81 | ).to(cfg.DEVICE) 82 | 83 | if cfg.TRAIN.PRETRAINED != '' and os.path.isfile(cfg.TRAIN.PRETRAINED): 84 | checkpoint = torch.load(cfg.TRAIN.PRETRAINED) 85 | best_performance = checkpoint['performance'] 86 | 
generator.load_state_dict(checkpoint['gen_state_dict']) 87 | print(f'==> Loaded pretrained model from {cfg.TRAIN.PRETRAINED}...') 88 | print(f'Performance on 3DPW test set {best_performance}') 89 | else: 90 | print(f'{cfg.TRAIN.PRETRAINED} is not a pretrained model!!!!') 91 | 92 | gen_optimizer = get_optimizer( 93 | model=generator, 94 | optim_type=cfg.TRAIN.GEN_OPTIM, 95 | lr=cfg.TRAIN.GEN_LR, 96 | weight_decay=cfg.TRAIN.GEN_WD, 97 | momentum=cfg.TRAIN.GEN_MOMENTUM, 98 | ) 99 | 100 | motion_discriminator = MotionDiscriminator( 101 | rnn_size=cfg.TRAIN.MOT_DISCR.HIDDEN_SIZE, 102 | input_size=69, 103 | num_layers=cfg.TRAIN.MOT_DISCR.NUM_LAYERS, 104 | output_size=1, 105 | feature_pool=cfg.TRAIN.MOT_DISCR.FEATURE_POOL, 106 | attention_size=None if cfg.TRAIN.MOT_DISCR.FEATURE_POOL !='attention' else cfg.TRAIN.MOT_DISCR.ATT.SIZE, 107 | attention_layers=None if cfg.TRAIN.MOT_DISCR.FEATURE_POOL !='attention' else cfg.TRAIN.MOT_DISCR.ATT.LAYERS, 108 | attention_dropout=None if cfg.TRAIN.MOT_DISCR.FEATURE_POOL !='attention' else cfg.TRAIN.MOT_DISCR.ATT.DROPOUT 109 | ).to(cfg.DEVICE) 110 | 111 | dis_motion_optimizer = get_optimizer( 112 | model=motion_discriminator, 113 | optim_type=cfg.TRAIN.MOT_DISCR.OPTIM, 114 | lr=cfg.TRAIN.MOT_DISCR.LR, 115 | weight_decay=cfg.TRAIN.MOT_DISCR.WD, 116 | momentum=cfg.TRAIN.MOT_DISCR.MOMENTUM 117 | ) 118 | 119 | motion_lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau( 120 | dis_motion_optimizer, 121 | mode='min', 122 | factor=0.1, 123 | patience=cfg.TRAIN.LR_PATIENCE, 124 | verbose=True, 125 | ) 126 | 127 | lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau( 128 | gen_optimizer, 129 | mode='min', 130 | factor=0.1, 131 | patience=cfg.TRAIN.LR_PATIENCE, 132 | verbose=True, 133 | ) 134 | 135 | # ========= Start Training ========= # 136 | Trainer( 137 | data_loaders=data_loaders, 138 | generator=generator, 139 | motion_discriminator=motion_discriminator, 140 | criterion=loss, 141 | dis_motion_optimizer=dis_motion_optimizer, 142 | dis_motion_update_steps=cfg.TRAIN.MOT_DISCR.UPDATE_STEPS, 143 | gen_optimizer=gen_optimizer, 144 | start_epoch=cfg.TRAIN.START_EPOCH, 145 | end_epoch=cfg.TRAIN.END_EPOCH, 146 | device=cfg.DEVICE, 147 | writer=writer, 148 | debug=cfg.DEBUG, 149 | logdir=cfg.LOGDIR, 150 | lr_scheduler=lr_scheduler, 151 | motion_lr_scheduler=motion_lr_scheduler, 152 | resume=cfg.TRAIN.RESUME, 153 | num_iters_per_epoch=cfg.TRAIN.NUM_ITERS_PER_EPOCH, 154 | debug_freq=cfg.DEBUG_FREQ, 155 | ).fit() 156 | 157 | 158 | if __name__ == '__main__': 159 | cfg, cfg_file = parse_args() 160 | cfg = prepare_output_dir(cfg, cfg_file) 161 | 162 | main(cfg) 163 | -------------------------------------------------------------------------------- /lib/models/vibe.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is 4 | # holder of all proprietary rights on this computer program. 5 | # You can only use this computer program if you have closed 6 | # a license agreement with MPG or you get the right to use the computer 7 | # program from someone who is authorized to grant you that right. 8 | # Any use of the computer program without a valid license is prohibited and 9 | # liable to prosecution. 10 | # 11 | # Copyright©2019 Max-Planck-Gesellschaft zur Förderung 12 | # der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute 13 | # for Intelligent Systems. All rights reserved. 
14 | # 15 | # Contact: ps-license@tuebingen.mpg.de 16 | 17 | import os 18 | import torch 19 | import os.path as osp 20 | import torch.nn as nn 21 | import torch.nn.functional as F 22 | 23 | from lib.core.config import VIBE_DATA_DIR 24 | from lib.models.spin import Regressor, hmr 25 | 26 | 27 | class TemporalEncoder(nn.Module): 28 | def __init__( 29 | self, 30 | n_layers=1, 31 | hidden_size=2048, 32 | add_linear=False, 33 | bidirectional=False, 34 | use_residual=True 35 | ): 36 | super(TemporalEncoder, self).__init__() 37 | 38 | self.gru = nn.GRU( 39 | input_size=2048, 40 | hidden_size=hidden_size, 41 | bidirectional=bidirectional, 42 | num_layers=n_layers 43 | ) 44 | 45 | self.linear = None 46 | if bidirectional: 47 | self.linear = nn.Linear(hidden_size*2, 2048) 48 | elif add_linear: 49 | self.linear = nn.Linear(hidden_size, 2048) 50 | self.use_residual = use_residual 51 | 52 | def forward(self, x): 53 | n,t,f = x.shape 54 | x = x.permute(1,0,2) # NTF -> TNF 55 | y, _ = self.gru(x) 56 | if self.linear: 57 | y = F.relu(y) 58 | y = self.linear(y.view(-1, y.size(-1))) 59 | y = y.view(t,n,f) 60 | if self.use_residual and y.shape[-1] == 2048: 61 | y = y + x 62 | y = y.permute(1,0,2) # TNF -> NTF 63 | return y 64 | 65 | 66 | class VIBE(nn.Module): 67 | def __init__( 68 | self, 69 | seqlen, 70 | batch_size=64, 71 | n_layers=1, 72 | hidden_size=2048, 73 | add_linear=False, 74 | bidirectional=False, 75 | use_residual=True, 76 | pretrained=osp.join(VIBE_DATA_DIR, 'spin_model_checkpoint.pth.tar'), 77 | ): 78 | 79 | super(VIBE, self).__init__() 80 | 81 | self.seqlen = seqlen 82 | self.batch_size = batch_size 83 | 84 | self.encoder = TemporalEncoder( 85 | n_layers=n_layers, 86 | hidden_size=hidden_size, 87 | bidirectional=bidirectional, 88 | add_linear=add_linear, 89 | use_residual=use_residual, 90 | ) 91 | 92 | # regressor can predict cam, pose and shape params in an iterative way 93 | self.regressor = Regressor() 94 | 95 | if pretrained and os.path.isfile(pretrained): 96 | pretrained_dict = torch.load(pretrained)['model'] 97 | 98 | self.regressor.load_state_dict(pretrained_dict, strict=False) 99 | print(f'=> loaded pretrained model from \'{pretrained}\'') 100 | 101 | 102 | def forward(self, input, J_regressor=None): 103 | # input size NTF 104 | batch_size, seqlen = input.shape[:2] 105 | 106 | feature = self.encoder(input) 107 | feature = feature.reshape(-1, feature.size(-1)) 108 | 109 | smpl_output = self.regressor(feature, J_regressor=J_regressor) 110 | for s in smpl_output: 111 | s['theta'] = s['theta'].reshape(batch_size, seqlen, -1) 112 | s['verts'] = s['verts'].reshape(batch_size, seqlen, -1, 3) 113 | s['kp_2d'] = s['kp_2d'].reshape(batch_size, seqlen, -1, 2) 114 | s['kp_3d'] = s['kp_3d'].reshape(batch_size, seqlen, -1, 3) 115 | s['rotmat'] = s['rotmat'].reshape(batch_size, seqlen, -1, 3, 3) 116 | 117 | return smpl_output 118 | 119 | 120 | class VIBE_Demo(nn.Module): 121 | def __init__( 122 | self, 123 | seqlen, 124 | batch_size=64, 125 | n_layers=1, 126 | hidden_size=2048, 127 | add_linear=False, 128 | bidirectional=False, 129 | use_residual=True, 130 | pretrained=osp.join(VIBE_DATA_DIR, 'spin_model_checkpoint.pth.tar'), 131 | ): 132 | 133 | super(VIBE_Demo, self).__init__() 134 | 135 | self.seqlen = seqlen 136 | self.batch_size = batch_size 137 | 138 | self.encoder = TemporalEncoder( 139 | n_layers=n_layers, 140 | hidden_size=hidden_size, 141 | bidirectional=bidirectional, 142 | add_linear=add_linear, 143 | use_residual=use_residual, 144 | ) 145 | 146 | self.hmr = hmr() 147 | checkpoint = 
torch.load(pretrained) 148 | self.hmr.load_state_dict(checkpoint['model'], strict=False) 149 | 150 | # regressor can predict cam, pose and shape params in an iterative way 151 | self.regressor = Regressor() 152 | 153 | if pretrained and os.path.isfile(pretrained): 154 | pretrained_dict = torch.load(pretrained)['model'] 155 | 156 | self.regressor.load_state_dict(pretrained_dict, strict=False) 157 | print(f'=> loaded pretrained model from \'{pretrained}\'') 158 | 159 | 160 | def forward(self, input, J_regressor=None): 161 | # input size NTF 162 | batch_size, seqlen, nc, h, w = input.shape 163 | 164 | feature = self.hmr.feature_extractor(input.reshape(-1, nc, h, w)) 165 | 166 | feature = feature.reshape(batch_size, seqlen, -1) 167 | feature = self.encoder(feature) 168 | feature = feature.reshape(-1, feature.size(-1)) 169 | 170 | smpl_output = self.regressor(feature, J_regressor=J_regressor) 171 | 172 | for s in smpl_output: 173 | s['theta'] = s['theta'].reshape(batch_size, seqlen, -1) 174 | s['verts'] = s['verts'].reshape(batch_size, seqlen, -1, 3) 175 | s['kp_2d'] = s['kp_2d'].reshape(batch_size, seqlen, -1, 2) 176 | s['kp_3d'] = s['kp_3d'].reshape(batch_size, seqlen, -1, 3) 177 | s['rotmat'] = s['rotmat'].reshape(batch_size, seqlen, -1, 3, 3) 178 | 179 | return smpl_output 180 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | License 2 | 3 | Software Copyright License for non-commercial scientific research purposes 4 | Please read carefully the following terms and conditions and any accompanying documentation before you download 5 | and/or use the VIBE model, data and software, (the "Model & Software"), including 3D meshes, software, and scripts. 6 | By downloading and/or using the Model & Software (including downloading, cloning, installing, and any other use 7 | of this github repository), you acknowledge that you have read these terms and conditions, understand them, and 8 | agree to be bound by them. If you do not agree with these terms and conditions, you must not download and/or use 9 | the Model & Software. Any infringement of the terms of this agreement will automatically terminate your rights 10 | under this License 11 | 12 | Ownership / Licensees 13 | The Software and the associated materials has been developed at the 14 | 15 | Max Planck Institute for Intelligent Systems (hereinafter "MPI"). 16 | 17 | Any copyright or patent right is owned by and proprietary material of the 18 | 19 | Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (hereinafter “MPG”; MPI and MPG hereinafter 20 | collectively “Max-Planck”) 21 | 22 | hereinafter the “Licensor”. 23 | 24 | This software includes the SMPL Body Model. By downloading this software, you are agreeing to be bound by the terms of the SMPL Model License 25 | 26 | https://smpl.is.tue.mpg.de/modellicense 27 | 28 | which is necessary to create SMPL body models. 
29 | 30 | SMPL bodies that are generated with VIBE can be distributed freely under the SMPL Body License 31 | 32 | https://smpl.is.tue.mpg.de/bodylicense 33 | 34 | License Grant 35 | Licensor grants you (Licensee) personally a single-user, non-exclusive, non-transferable, free of charge right: 36 | 37 | To install the Model & Software on computers owned, leased or otherwise controlled by you and/or your organization; 38 | To use the Model & Software for the sole purpose of performing non-commercial scientific research, non-commercial 39 | education, or non-commercial artistic projects; 40 | Any other use, in particular any use for commercial purposes, is prohibited. This includes, without limitation, 41 | incorporation in a commercial product, use in a commercial service, or production of other artifacts for 42 | commercial purposes. The Model & Software may not be reproduced, modified and/or made available in any form to 43 | any third party without Max-Planck’s prior written permission. 44 | 45 | The Model & Software may not be used for pornographic purposes or to generate pornographic material whether 46 | commercial or not. This license also prohibits the use of the Model & Software to train methods/algorithms/neural 47 | networks/etc. for commercial use of any kind. By downloading the Model & Software, 48 | you agree not to reverse engineer it. 49 | 50 | No Distribution 51 | The Model & Software and the license herein granted shall not be copied, shared, distributed, re-sold, offered 52 | for re-sale, transferred or sub-licensed in whole or in part except that you may make one copy for archive 53 | purposes only. 54 | 55 | Disclaimer of Representations and Warranties 56 | You expressly acknowledge and agree that the Model & Software results from basic research, is provided “AS IS”, 57 | may contain errors, and that any use of the Model & Software is at your sole risk. LICENSOR MAKES NO REPRESENTATIONS 58 | OR WARRANTIES OF ANY KIND CONCERNING THE MODEL & SOFTWARE, NEITHER EXPRESS NOR IMPLIED, AND THE ABSENCE OF ANY 59 | LEGAL OR ACTUAL DEFECTS, WHETHER DISCOVERABLE OR NOT. Specifically, and not to limit the foregoing, licensor 60 | makes no representations or warranties (i) regarding the merchantability or fitness for a particular purpose of 61 | the Model & Software, (ii) that the use of the Model & Software will not infringe any patents, copyrights or other 62 | intellectual property rights of a third party, and (iii) that the use of the Model & Software will not cause any 63 | damage of any kind to you or a third party. 64 | 65 | Limitation of Liability 66 | Because this Model & Software License Agreement qualifies as a donation, according to Section 521 of the German 67 | Civil Code (Bürgerliches Gesetzbuch – BGB) Licensor as a donor is liable for intent and gross negligence only. 68 | If the Licensor fraudulently conceals a legal or material defect, they are obliged to compensate the Licensee 69 | for the resulting damage. 70 | 71 | Licensor shall be liable for loss of data only up to the amount of typical recovery costs which would have 72 | arisen had proper and regular data backup measures been taken. For the avoidance of doubt Licensor shall be 73 | liable in accordance with the German Product Liability Act in the event of product liability. The foregoing 74 | applies also to Licensor’s legal representatives or assistants in performance. Any further liability shall be excluded. 
75 | Patent claims generated through the usage of the Model & Software cannot be directed towards the copyright holders. 76 | The Model & Software is provided in the state of development the licensor defines. If modified or extended by 77 | Licensee, the Licensor makes no claims about the fitness of the Model & Software and is not responsible 78 | for any problems such modifications cause. 79 | 80 | No Maintenance Services 81 | You understand and agree that Licensor is under no obligation to provide either maintenance services, 82 | update services, notices of latent defects, or corrections of defects with regard to the Model & Software. 83 | Licensor nevertheless reserves the right to update, modify, or discontinue the Model & Software at any time. 84 | 85 | Defects of the Model & Software must be notified in writing to the Licensor with a comprehensible description 86 | of the error symptoms. The notification of the defect should enable the reproduction of the error. 87 | The Licensee is encouraged to communicate any use, results, modification or publication. 88 | 89 | Publications using the Model & Software 90 | You acknowledge that the Model & Software is a valuable scientific resource and agree to appropriately reference 91 | the following paper in any publication making use of the Model & Software. 92 | 93 | Citation: 94 | 95 | @inproceedings{VIBE:CVPR:2020, 96 | title = {{VIBE}: Video Inference for Human Body Pose and Shape Estimation}, 97 | author = {Kocabas, Muhammed and Athanasiou, Nikos and Black, Michael J.}, 98 | booktitle = {Computer Vision and Pattern Recognition (CVPR)}, 99 | month = jun, 100 | year = {2020}, 101 | month_numeric = {6} 102 | } 103 | 104 | Commercial licensing opportunities 105 | For commercial uses of the Software, please send email to ps-license@tue.mpg.de 106 | 107 | This Agreement shall be governed by the laws of the Federal Republic of Germany except for the UN Sales Convention. 108 | -------------------------------------------------------------------------------- /lib/data_utils/posetrack_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is 4 | # holder of all proprietary rights on this computer program. 5 | # You can only use this computer program if you have closed 6 | # a license agreement with MPG or you get the right to use the computer 7 | # program from someone who is authorized to grant you that right. 8 | # Any use of the computer program without a valid license is prohibited and 9 | # liable to prosecution. 10 | # 11 | # Copyright©2019 Max-Planck-Gesellschaft zur Förderung 12 | # der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute 13 | # for Intelligent Systems. All rights reserved. 
14 | # 15 | # Contact: ps-license@tuebingen.mpg.de 16 | 17 | import sys 18 | sys.path.append('.') 19 | 20 | import glob 21 | import joblib 22 | import argparse 23 | import numpy as np 24 | import json 25 | import os.path as osp 26 | 27 | from lib.models import spin 28 | from lib.core.config import VIBE_DB_DIR 29 | from lib.utils.utils import tqdm_enumerate 30 | from lib.data_utils.feature_extractor import extract_features 31 | from lib.data_utils.kp_utils import get_posetrack_original_kp_names, convert_kps 32 | 33 | def read_data(folder, set): 34 | dataset = { 35 | 'img_name' : [] , 36 | 'joints2D': [], 37 | 'bbox': [], 38 | 'vid_name': [], 39 | 'features': [], 40 | } 41 | 42 | model = spin.get_pretrained_hmr() 43 | 44 | file_names = glob.glob(osp.join(folder, 'posetrack_data/annotations/', f'{set}/*.json')) 45 | file_names = sorted(file_names) 46 | nn_corrupted = 0 47 | tot_frames = 0 48 | min_frame_number = 8 49 | 50 | for fid,fname in tqdm_enumerate(file_names): 51 | if fname == osp.join(folder, 'annotations/train/021133_mpii_train.json'): 52 | continue 53 | 54 | with open(fname, 'r') as entry: 55 | anns = json.load(entry) 56 | # num_frames = anns['images'][0]['nframes'] 57 | anns['images'] = [item for item in anns['images'] if item['is_labeled'] ] 58 | num_frames = len(anns['images']) 59 | frame2imgname = dict() 60 | for el in anns['images']: 61 | frame2imgname[el['frame_id']] = el['file_name'] 62 | 63 | num_people = -1 64 | for x in anns['annotations']: 65 | if num_people < x['track_id']: 66 | num_people = x['track_id'] 67 | num_people += 1 68 | posetrack_joints = get_posetrack_original_kp_names() 69 | idxs = [anns['categories'][0]['keypoints'].index(h) for h in posetrack_joints if h in anns['categories'][0]['keypoints']] 70 | for x in anns['annotations']: 71 | kps = np.array(x['keypoints']).reshape((17,3)) 72 | kps = kps[idxs,:] 73 | x['keypoints'] = list(kps.flatten()) 74 | 75 | tot_frames += num_people * num_frames 76 | for p_id in range(num_people): 77 | 78 | annot_pid = [(item['keypoints'], item['bbox'], item['image_id']) 79 | for item in anns['annotations'] 80 | if item['track_id'] == p_id and not(np.count_nonzero(item['keypoints']) == 0) ] 81 | 82 | if len(annot_pid) < min_frame_number: 83 | nn_corrupted += len(annot_pid) 84 | continue 85 | 86 | bbox = np.zeros((len(annot_pid),4)) 87 | # perm_idxs = get_perm_idxs('posetrack', 'common') 88 | kp_2d = np.zeros((len(annot_pid), len(annot_pid[0][0])//3 ,3)) 89 | img_paths = np.zeros((len(annot_pid))) 90 | 91 | for i, (key2djnts, bbox_p, image_id) in enumerate(annot_pid): 92 | 93 | if (bbox_p[2]==0 or bbox_p[3]==0) : 94 | nn_corrupted +=1 95 | continue 96 | 97 | img_paths[i] = image_id 98 | key2djnts[2::3] = len(key2djnts[2::3])*[1] 99 | 100 | kp_2d[i,:] = np.array(key2djnts).reshape(int(len(key2djnts)/3),3) # [perm_idxs, :] 101 | for kp_loc in kp_2d[i,:]: 102 | if kp_loc[0] == 0 and kp_loc[1] == 0: 103 | kp_loc[2] = 0 104 | 105 | 106 | x_tl = bbox_p[0] 107 | y_tl = bbox_p[1] 108 | w = bbox_p[2] 109 | h = bbox_p[3] 110 | bbox_p[0] = x_tl + w / 2 111 | bbox_p[1] = y_tl + h / 2 112 | # 113 | 114 | w = h = np.where(w / h > 1, w, h) 115 | w = h = h * 0.8 116 | bbox_p[2] = w 117 | bbox_p[3] = h 118 | bbox[i, :] = bbox_p 119 | 120 | img_paths = list(img_paths) 121 | img_paths = [osp.join(folder, frame2imgname[item]) if item != 0 else 0 for item in img_paths ] 122 | 123 | bbx_idxs = [] 124 | for bbx_id, bbx in enumerate(bbox): 125 | if np.count_nonzero(bbx) == 0: 126 | bbx_idxs += [bbx_id] 127 | 128 | kp_2d = np.delete(kp_2d, bbx_idxs, 0) 
129 | img_paths = np.delete(np.array(img_paths), bbx_idxs, 0) 130 | bbox = np.delete(bbox, np.where(~bbox.any(axis=1))[0], axis=0) 131 | 132 | # Convert to common 2d keypoint format 133 | if bbox.size == 0 or bbox.shape[0] < min_frame_number: 134 | nn_corrupted += 1 135 | continue 136 | 137 | kp_2d = convert_kps(kp_2d, src='posetrack', dst='spin') 138 | 139 | dataset['vid_name'].append(np.array([f'{fname}_{p_id}']*img_paths.shape[0])) 140 | dataset['img_name'].append(np.array(img_paths)) 141 | dataset['joints2D'].append(kp_2d) 142 | dataset['bbox'].append(np.array(bbox)) 143 | 144 | # compute_features 145 | features = extract_features( 146 | model, 147 | np.array(img_paths), 148 | bbox, 149 | kp_2d=kp_2d, 150 | dataset='spin', 151 | debug=False, 152 | ) 153 | 154 | assert kp_2d.shape[0] == img_paths.shape[0] == bbox.shape[0] 155 | 156 | dataset['features'].append(features) 157 | 158 | 159 | print(nn_corrupted, tot_frames) 160 | for k in dataset.keys(): 161 | dataset[k] = np.array(dataset[k]) 162 | 163 | for k in dataset.keys(): 164 | dataset[k] = np.concatenate(dataset[k]) 165 | 166 | for k,v in dataset.items(): 167 | print(k, v.shape) 168 | 169 | return dataset 170 | 171 | 172 | if __name__ == '__main__': 173 | parser = argparse.ArgumentParser() 174 | parser.add_argument('--dir', type=str, help='dataset directory', default='data/posetrack') 175 | args = parser.parse_args() 176 | 177 | dataset_train = read_data(args.dir, 'train') 178 | joblib.dump(dataset_train, osp.join(VIBE_DB_DIR, 'posetrack_train_db.pt')) 179 | -------------------------------------------------------------------------------- /lib/data_utils/threedpw_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is 4 | # holder of all proprietary rights on this computer program. 5 | # You can only use this computer program if you have closed 6 | # a license agreement with MPG or you get the right to use the computer 7 | # program from someone who is authorized to grant you that right. 8 | # Any use of the computer program without a valid license is prohibited and 9 | # liable to prosecution. 10 | # 11 | # Copyright©2019 Max-Planck-Gesellschaft zur Förderung 12 | # der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute 13 | # for Intelligent Systems. All rights reserved. 
14 | # 15 | # Contact: ps-license@tuebingen.mpg.de 16 | 17 | import sys 18 | sys.path.append('.') 19 | 20 | import os 21 | import cv2 22 | import torch 23 | import joblib 24 | import argparse 25 | import numpy as np 26 | import pickle as pkl 27 | import os.path as osp 28 | from tqdm import tqdm 29 | 30 | from lib.models import spin 31 | from lib.data_utils.kp_utils import * 32 | from lib.core.config import VIBE_DB_DIR, VIBE_DATA_DIR 33 | from lib.utils.smooth_bbox import get_smooth_bbox_params 34 | from lib.models.smpl import SMPL, SMPL_MODEL_DIR, H36M_TO_J14 35 | from lib.data_utils.feature_extractor import extract_features 36 | from lib.utils.geometry import batch_rodrigues, rotation_matrix_to_angle_axis 37 | 38 | NUM_JOINTS = 24 39 | VIS_THRESH = 0.3 40 | MIN_KP = 6 41 | 42 | def read_data(folder, set, debug=False): 43 | 44 | dataset = { 45 | 'vid_name': [], 46 | 'frame_id': [], 47 | 'joints3D': [], 48 | 'joints2D': [], 49 | 'shape': [], 50 | 'pose': [], 51 | 'bbox': [], 52 | 'img_name': [], 53 | 'features': [], 54 | 'valid': [], 55 | } 56 | 57 | model = spin.get_pretrained_hmr() 58 | 59 | sequences = [x.split('.')[0] for x in os.listdir(osp.join(folder, 'sequenceFiles', set))] 60 | 61 | J_regressor = None 62 | 63 | smpl = SMPL(SMPL_MODEL_DIR, batch_size=1, create_transl=False) 64 | if set == 'test' or set == 'validation': 65 | J_regressor = torch.from_numpy(np.load(osp.join(VIBE_DATA_DIR, 'J_regressor_h36m.npy'))).float() 66 | 67 | for i, seq in tqdm(enumerate(sequences)): 68 | 69 | data_file = osp.join(folder, 'sequenceFiles', set, seq + '.pkl') 70 | 71 | data = pkl.load(open(data_file, 'rb'), encoding='latin1') 72 | 73 | img_dir = osp.join(folder, 'imageFiles', seq) 74 | 75 | num_people = len(data['poses']) 76 | num_frames = len(data['img_frame_ids']) 77 | assert (data['poses2d'][0].shape[0] == num_frames) 78 | 79 | for p_id in range(num_people): 80 | pose = torch.from_numpy(data['poses'][p_id]).float() 81 | shape = torch.from_numpy(data['betas'][p_id][:10]).float().repeat(pose.size(0), 1) 82 | trans = torch.from_numpy(data['trans'][p_id]).float() 83 | j2d = data['poses2d'][p_id].transpose(0,2,1) 84 | cam_pose = data['cam_poses'] 85 | campose_valid = data['campose_valid'][p_id] 86 | 87 | # ======== Align the mesh params ======== # 88 | rot = pose[:, :3] 89 | rot_mat = batch_rodrigues(rot) 90 | 91 | Rc = torch.from_numpy(cam_pose[:, :3, :3]).float() 92 | Rs = torch.bmm(Rc, rot_mat.reshape(-1, 3, 3)) 93 | rot = rotation_matrix_to_angle_axis(Rs) 94 | pose[:, :3] = rot 95 | # ======== Align the mesh params ======== # 96 | 97 | output = smpl(betas=shape, body_pose=pose[:,3:], global_orient=pose[:,:3], transl=trans) 98 | # verts = output.vertices 99 | j3d = output.joints 100 | 101 | if J_regressor is not None: 102 | vertices = output.vertices 103 | J_regressor_batch = J_regressor[None, :].expand(vertices.shape[0], -1, -1).to(vertices.device) 104 | j3d = torch.matmul(J_regressor_batch, vertices) 105 | j3d = j3d[:, H36M_TO_J14, :] 106 | 107 | img_paths = [] 108 | for i_frame in range(num_frames): 109 | img_path = os.path.join(img_dir + '/image_{:05d}.jpg'.format(i_frame)) 110 | img_paths.append(img_path) 111 | 112 | bbox_params, time_pt1, time_pt2 = get_smooth_bbox_params(j2d, vis_thresh=VIS_THRESH, sigma=8) 113 | 114 | # process bbox_params 115 | c_x = bbox_params[:,0] 116 | c_y = bbox_params[:,1] 117 | scale = bbox_params[:,2] 118 | w = h = 150. 
/ scale 119 | w = h = h * 1.1 120 | bbox = np.vstack([c_x,c_y,w,h]).T 121 | 122 | # process keypoints 123 | j2d[:, :, 2] = j2d[:, :, 2] > 0.3 # set the visibility flags 124 | # Convert to common 2d keypoint format 125 | perm_idxs = get_perm_idxs('3dpw', 'common') 126 | perm_idxs += [0, 0] # no neck, top head 127 | j2d = j2d[:, perm_idxs] 128 | j2d[:, 12:, 2] = 0.0 129 | 130 | # print('j2d', j2d[time_pt1:time_pt2].shape) 131 | # print('campose', campose_valid[time_pt1:time_pt2].shape) 132 | 133 | img_paths_array = np.array(img_paths)[time_pt1:time_pt2] 134 | dataset['vid_name'].append(np.array([f'{seq}_{p_id}']*num_frames)[time_pt1:time_pt2]) 135 | dataset['frame_id'].append(np.arange(0, num_frames)[time_pt1:time_pt2]) 136 | dataset['img_name'].append(img_paths_array) 137 | dataset['joints3D'].append(j3d.numpy()[time_pt1:time_pt2]) 138 | dataset['joints2D'].append(j2d[time_pt1:time_pt2]) 139 | dataset['shape'].append(shape.numpy()[time_pt1:time_pt2]) 140 | dataset['pose'].append(pose.numpy()[time_pt1:time_pt2]) 141 | dataset['bbox'].append(bbox) 142 | dataset['valid'].append(campose_valid[time_pt1:time_pt2]) 143 | 144 | features = extract_features(model, img_paths_array, bbox, 145 | kp_2d=j2d[time_pt1:time_pt2], debug=debug, dataset='3dpw', scale=1.2) 146 | dataset['features'].append(features) 147 | 148 | for k in dataset.keys(): 149 | dataset[k] = np.concatenate(dataset[k]) 150 | print(k, dataset[k].shape) 151 | 152 | # Filter out keypoints 153 | indices_to_use = np.where((dataset['joints2D'][:, :, 2] > VIS_THRESH).sum(-1) > MIN_KP)[0] 154 | for k in dataset.keys(): 155 | dataset[k] = dataset[k][indices_to_use] 156 | 157 | return dataset 158 | 159 | 160 | if __name__ == '__main__': 161 | parser = argparse.ArgumentParser() 162 | parser.add_argument('--dir', type=str, help='dataset directory', default='data/3dpw') 163 | args = parser.parse_args() 164 | 165 | debug = False 166 | 167 | dataset = read_data(args.dir, 'validation', debug=debug) 168 | joblib.dump(dataset, osp.join(VIBE_DB_DIR, '3dpw_val_db.pt')) 169 | 170 | dataset = read_data(args.dir, 'test', debug=debug) 171 | joblib.dump(dataset, osp.join(VIBE_DB_DIR, '3dpw_test_db.pt')) 172 | -------------------------------------------------------------------------------- /lib/dataset/dataset_3d.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is 4 | # holder of all proprietary rights on this computer program. 5 | # You can only use this computer program if you have closed 6 | # a license agreement with MPG or you get the right to use the computer 7 | # program from someone who is authorized to grant you that right. 8 | # Any use of the computer program without a valid license is prohibited and 9 | # liable to prosecution. 10 | # 11 | # Copyright©2019 Max-Planck-Gesellschaft zur Förderung 12 | # der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute 13 | # for Intelligent Systems. All rights reserved. 
14 | # 15 | # Contact: ps-license@tuebingen.mpg.de 16 | 17 | import os 18 | import torch 19 | import random 20 | import logging 21 | import numpy as np 22 | import os.path as osp 23 | import joblib 24 | 25 | from torch.utils.data import Dataset 26 | from lib.core.config import VIBE_DB_DIR 27 | from lib.data_utils.kp_utils import convert_kps 28 | from lib.data_utils.img_utils import normalize_2d_kp, transfrom_keypoints, split_into_chunks 29 | 30 | logger = logging.getLogger(__name__) 31 | 32 | class Dataset3D(Dataset): 33 | def __init__(self, set, seqlen, overlap=0., folder=None, dataset_name=None, debug=False): 34 | 35 | self.folder = folder 36 | self.set = set 37 | self.dataset_name = dataset_name 38 | self.seqlen = seqlen 39 | self.stride = int(seqlen * (1-overlap)) 40 | self.debug = debug 41 | self.db = self.load_db() 42 | self.vid_indices = split_into_chunks(self.db['vid_name'], self.seqlen, self.stride) 43 | 44 | def __len__(self): 45 | return len(self.vid_indices) 46 | 47 | def __getitem__(self, index): 48 | return self.get_single_item(index) 49 | 50 | def load_db(self): 51 | db_file = osp.join(VIBE_DB_DIR, f'{self.dataset_name}_{self.set}_db.pt') 52 | 53 | if osp.isfile(db_file): 54 | db = joblib.load(db_file) 55 | else: 56 | raise ValueError(f'{db_file} do not exists') 57 | 58 | print(f'Loaded {self.dataset_name} dataset from {db_file}') 59 | return db 60 | 61 | def get_single_item(self, index): 62 | start_index, end_index = self.vid_indices[index] 63 | 64 | is_train = self.set == 'train' 65 | 66 | if self.dataset_name == '3dpw': 67 | kp_2d = convert_kps(self.db['joints2D'][start_index:end_index + 1], src='common', dst='spin') 68 | kp_3d = self.db['joints3D'][start_index:end_index + 1] 69 | elif self.dataset_name == 'mpii3d': 70 | kp_2d = self.db['joints2D'][start_index:end_index + 1] 71 | if is_train: 72 | kp_3d = self.db['joints3D'][start_index:end_index + 1] 73 | else: 74 | kp_3d = convert_kps(self.db['joints3D'][start_index:end_index + 1], src='spin', dst='common') 75 | elif self.dataset_name == 'h36m': 76 | kp_2d = self.db['joints2D'][start_index:end_index + 1] 77 | if is_train: 78 | kp_3d = self.db['joints3D'][start_index:end_index + 1] 79 | else: 80 | kp_3d = convert_kps(self.db['joints3D'][start_index:end_index + 1], src='spin', dst='common') 81 | 82 | kp_2d_tensor = np.ones((self.seqlen, 49, 3), dtype=np.float16) 83 | nj = 14 if not is_train else 49 84 | kp_3d_tensor = np.zeros((self.seqlen, nj, 3), dtype=np.float16) 85 | 86 | 87 | if self.dataset_name == '3dpw': 88 | pose = self.db['pose'][start_index:end_index+1] 89 | shape = self.db['shape'][start_index:end_index+1] 90 | w_smpl = torch.ones(self.seqlen).float() 91 | w_3d = torch.ones(self.seqlen).float() 92 | elif self.dataset_name == 'h36m': 93 | if not is_train: 94 | pose = np.zeros((kp_2d.shape[0], 72)) 95 | shape = np.zeros((kp_2d.shape[0], 10)) 96 | w_smpl = torch.zeros(self.seqlen).float() 97 | w_3d = torch.ones(self.seqlen).float() 98 | else: 99 | pose = self.db['pose'][start_index:end_index + 1] 100 | shape = self.db['shape'][start_index:end_index + 1] 101 | w_smpl = torch.ones(self.seqlen).float() 102 | w_3d = torch.ones(self.seqlen).float() 103 | elif self.dataset_name == 'mpii3d': 104 | pose = np.zeros((kp_2d.shape[0], 72)) 105 | shape = np.zeros((kp_2d.shape[0], 10)) 106 | w_smpl = torch.zeros(self.seqlen).float() 107 | w_3d = torch.ones(self.seqlen).float() 108 | 109 | bbox = self.db['bbox'][start_index:end_index + 1] 110 | input = torch.from_numpy(self.db['features'][start_index:end_index+1]).float() 111 | 
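# Note for the block below: each 85-D theta vector packs [camera (3) | SMPL pose (72) | SMPL shape (10)];
# the camera entries are initialized to the fixed weak-perspective template (1., 0., 0.) for every frame,
# as can be seen where theta is concatenated a few lines further down.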
112 | theta_tensor = np.zeros((self.seqlen, 85), dtype=np.float16) 113 | 114 | for idx in range(self.seqlen): 115 | # crop image and transform 2d keypoints 116 | kp_2d[idx,:,:2], trans = transfrom_keypoints( 117 | kp_2d=kp_2d[idx,:,:2], 118 | center_x=bbox[idx,0], 119 | center_y=bbox[idx,1], 120 | width=bbox[idx,2], 121 | height=bbox[idx,3], 122 | patch_width=224, 123 | patch_height=224, 124 | do_augment=False, 125 | ) 126 | 127 | kp_2d[idx,:,:2] = normalize_2d_kp(kp_2d[idx,:,:2], 224) 128 | 129 | # theta shape (85,) 130 | theta = np.concatenate((np.array([1., 0., 0.]), pose[idx], shape[idx]), axis=0) 131 | 132 | kp_2d_tensor[idx] = kp_2d[idx] 133 | theta_tensor[idx] = theta 134 | kp_3d_tensor[idx] = kp_3d[idx] 135 | 136 | target = { 137 | 'features': input, 138 | 'theta': torch.from_numpy(theta_tensor).float(), # camera, pose and shape 139 | 'kp_2d': torch.from_numpy(kp_2d_tensor).float(), # 2D keypoints transformed according to bbox cropping 140 | 'kp_3d': torch.from_numpy(kp_3d_tensor).float(), # 3D keypoints 141 | 'w_smpl': w_smpl, 142 | 'w_3d': w_3d, 143 | } 144 | 145 | if self.dataset_name == 'mpii3d' and not is_train: 146 | target['valid'] = self.db['valid_i'][start_index:end_index+1] 147 | 148 | if self.dataset_name == '3dpw' and not is_train: 149 | vn = self.db['vid_name'][start_index:end_index + 1] 150 | fi = self.db['frame_id'][start_index:end_index + 1] 151 | target['instance_id'] = [f'{v}/{f}'for v,f in zip(vn,fi)] 152 | 153 | 154 | 155 | # if self.dataset_name == '3dpw' and not self.is_train: 156 | # target['imgname'] = self.db['img_name'][start_index:end_index+1].tolist() 157 | # target['imgname'] = np.array(target['imgname']) 158 | # print(target['imgname'].dtype) 159 | # target['center'] = self.db['bbox'][start_index:end_index+1, :2] 160 | # target['valid'] = torch.from_numpy(self.db['valid'][start_index:end_index+1]) 161 | 162 | if self.debug: 163 | from lib.data_utils.img_utils import get_single_image_crop 164 | 165 | if self.dataset_name == 'mpii3d': 166 | video = self.db['img_name'][start_index:end_index+1] 167 | # print(video) 168 | elif self.dataset_name == 'h36m': 169 | video = self.db['img_name'][start_index:end_index + 1] 170 | else: 171 | vid_name = self.db['vid_name'][start_index] 172 | vid_name = '_'.join(vid_name.split('_')[:-1]) 173 | f = osp.join(self.folder, 'imageFiles', vid_name) 174 | video_file_list = [osp.join(f, x) for x in sorted(os.listdir(f)) if x.endswith('.jpg')] 175 | frame_idxs = self.db['frame_id'][start_index:end_index + 1] 176 | # print(f, frame_idxs) 177 | video = [video_file_list[i] for i in frame_idxs] 178 | 179 | video = torch.cat( 180 | [get_single_image_crop(image, bbox).unsqueeze(0) for image, bbox in zip(video, bbox)], dim=0 181 | ) 182 | 183 | target['video'] = video 184 | 185 | return target 186 | 187 | 188 | 189 | 190 | 191 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # VIBE: Video Inference for Human Body Pose and Shape Estimation [CVPR-2020] 2 | [![report](https://img.shields.io/badge/arxiv-report-red)](https://arxiv.org/abs/1912.05656) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1dFfwxZ52MN86FA6uFNypMEdFShd2euQA) 
[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/vibe-video-inference-for-human-body-pose-and/3d-human-pose-estimation-on-3dpw)](https://paperswithcode.com/sota/3d-human-pose-estimation-on-3dpw?p=vibe-video-inference-for-human-body-pose-and) 3 | 4 |
5 | 6 | 7 |
8 | 9 | Check our YouTube videos below for more details. 10 | 11 | | Paper Video | Qualitative Results | 12 | |------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------| 13 | | [![PaperVideo](https://img.youtube.com/vi/rIr-nX63dUA/0.jpg)](https://www.youtube.com/watch?v=rIr-nX63dUA) | [![QualitativeResults](https://img.youtube.com/vi/fW0sIZfQcIs/0.jpg)](https://www.youtube.com/watch?v=fW0sIZfQcIs) | 14 | 15 | 17 | 18 | > [**VIBE: Video Inference for Human Body Pose and Shape Estimation**](https://arxiv.org/abs/1912.05656), 19 | > [Muhammed Kocabas](https://ps.is.tuebingen.mpg.de/person/mkocabas), [Nikos Athanasiou](https://ps.is.tuebingen.mpg.de/person/nathanasiou), 20 | [Michael J. Black](https://ps.is.tuebingen.mpg.de/person/black), 21 | > *IEEE Computer Vision and Pattern Recognition, 2020* 22 | 23 | ## Features 24 | 25 | _**V**ideo **I**nference for **B**ody Pose and Shape **E**stimation_ (VIBE) is a video pose and shape estimation method. 26 | It predicts the parameters of the SMPL body model for each frame of an input video (see the loading snippet below for how to inspect these per-frame outputs). Please refer to our [arXiv report](https://arxiv.org/abs/1912.05656) for further details. 27 | 28 | This implementation: 29 | 30 | - has the demo and training code for VIBE implemented purely in PyTorch, 31 | - can work on arbitrary videos with multiple people, 32 | - supports both CPU and GPU inference (though GPU is much faster), 33 | - is fast, up to 30 FPS on an RTX 2080Ti (see [this table](doc/demo.md#runtime-performance)), 34 | - achieves SOTA results on the 3DPW and MPI-INF-3DHP datasets, 35 | - includes a Temporal SMPLify implementation, 36 | - includes the training code and detailed instructions on how to train it from scratch, 37 | - can create FBX/glTF output to be used with major 3D graphics software. 38 | 39 |
40 | 41 | 42 |
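As referenced in the feature list above, VIBE's per-frame predictions can be inspected directly from the demo output. A minimal sketch, assuming the demo has been run on `sample_video.mp4` and that the output dictionary uses the keys `pose` and `betas` (exact key names may differ between versions):

```python
import joblib

# The demo saves one entry per tracked person id
output = joblib.load('output/sample_video/vibe_output.pkl')

for person_id, pred in output.items():
    # Assumed layout: 'pose' is (num_frames, 72) axis-angle SMPL pose,
    # 'betas' is (num_frames, 10) SMPL shape parameters
    print(person_id, pred['pose'].shape, pred['betas'].shape)
```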
43 | 44 | ## Updates 45 | 46 | - 06/10/2020: Added OneEuroFilter smoothing support. 47 | - 14/09/2020: FBX/glTF conversion script is released. 48 | 49 | ## Getting Started 50 | VIBE has been implemented and tested on Ubuntu 18.04 with Python >= 3.7. It supports both GPU and CPU inference. 51 | If you don't have a suitable device, try running our Colab demo. 52 | 53 | Clone the repo: 54 | ```bash 55 | git clone https://github.com/mkocabas/VIBE.git 56 | ``` 57 | 58 | Install the requirements using `virtualenv` or `conda`: 59 | ```bash 60 | # pip 61 | source scripts/install_pip.sh 62 | 63 | # conda 64 | source scripts/install_conda.sh 65 | ``` 66 | 67 | ## Running the Demo 68 | 69 | We have prepared demo code to run VIBE on arbitrary videos. 70 | First, you need to download the required data (i.e., our trained model and the SMPL model parameters). To do this, you can simply run: 71 | 72 | ```bash 73 | source scripts/prepare_data.sh 74 | ``` 75 | 76 | Then, running the demo is as simple as: 77 | 78 | ```bash 79 | # Run on a local video 80 | python demo.py --vid_file sample_video.mp4 --output_folder output/ --display 81 | 82 | # Run on a YouTube video 83 | python demo.py --vid_file https://www.youtube.com/watch?v=wPZP8Bwxplo --output_folder output/ --display 84 | ``` 85 | 86 | Refer to [`doc/demo.md`](doc/demo.md) for more details about the demo code. 87 | 88 | Sample demo output with the `--sideview` flag: 89 | 90 |
91 | 92 |
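The sample `--sideview` output referenced above can be reproduced by adding that flag to the demo commands shown earlier; a minimal example (see [`doc/demo.md`](doc/demo.md) for the full set of rendering options):

```bash
# Render the estimated body mesh together with a rotated side view
python demo.py --vid_file sample_video.mp4 --output_folder output/ --sideview
```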
93 | 94 | ### FBX and glTF output (New Feature!) 95 | We provide a script to convert VIBE output to standalone FBX/glTF files to be used in 3D graphics tools such as 96 | Blender, Unity, etc. Follow the steps below to run the conversion script. 97 | 98 | - Download the FBX files for the SMPL body model: 99 | - Go to the [SMPL website](https://smpl.is.tue.mpg.de/) and create an account. 100 | - Download the Unity-compatible FBX file through the [link](https://psfiles.is.tuebingen.mpg.de/downloads/smpl/SMPL_unity_v-1-0-0-zip). 101 | - Unzip the contents and place them in `data/SMPL_unity_v.1.0.0`. 102 | - Install the Blender Python API. 103 | - Note that we tested our script with Blender v2.8.0 and v2.8.3. 104 | - Run the command below to convert VIBE output to FBX: 105 | ``` 106 | python lib/utils/fbx_output.py \ 107 | --input output/sample_video/vibe_output.pkl \ 108 | --output output/sample_video/fbx_output.fbx \ # specify the file extension as *.glb for glTF 109 | --fps_source 30 \ 110 | --fps_target 30 \ 111 | --gender <male or female> \ 112 | --person_id <tracklet id> 113 | 114 | ``` 115 | 116 | ## Google Colab 117 | If you do not have a suitable environment to run this project, then you could give Google Colab a try. 118 | It allows you to run the project in the cloud, free of charge. You may try our Colab demo using the notebook we have prepared: 119 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1dFfwxZ52MN86FA6uFNypMEdFShd2euQA) 120 | 121 | 122 | ## Training 123 | Run the commands below to start training: 124 | 125 | ```shell script 126 | source scripts/prepare_training_data.sh 127 | python train.py --cfg configs/config.yaml 128 | ``` 129 | 130 | Note that the training datasets should be downloaded and prepared before running the data processing script. 131 | Please see [`doc/train.md`](doc/train.md) for details on how to prepare them. 132 | 133 | ## Evaluation 134 | 135 | Here we compare VIBE with recent state-of-the-art methods on 3D pose estimation datasets. The evaluation metric is 136 | Procrustes-Aligned Mean Per Joint Position Error (PA-MPJPE) in mm. 137 | 138 | | Models | 3DPW ↓ | MPI-INF-3DHP ↓ | H36M ↓ | 139 | |----------------|:----:|:------------:|:----:| 140 | | SPIN | 59.2 | 67.5 | **41.1** | 141 | | Temporal HMR | 76.7 | 89.8 | 56.8 | 142 | | VIBE | 56.5 | **63.4** | 41.5 | 143 | 144 | See [`doc/eval.md`](doc/eval.md) to reproduce the results in this table or to 145 | evaluate a pretrained model. 146 | 147 | **Correction**: Due to a mistake in dataset preprocessing, the results of VIBE trained with 3DPW in Table 1 of the original paper are not correct. 148 | Moreover, even though training with 3DPW yields better quantitative performance, it does not give good 149 | qualitative results. The arXiv version will be updated with the corrected results. 150 | 151 | ## Citation 152 | 153 | ```bibtex 154 | @inproceedings{kocabas2019vibe, 155 | title={VIBE: Video Inference for Human Body Pose and Shape Estimation}, 156 | author={Kocabas, Muhammed and Athanasiou, Nikos and Black, Michael J.}, 157 | booktitle = {The IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, 158 | month = {June}, 159 | year = {2020} 160 | } 161 | ``` 162 | 163 | ## License 164 | This code is available for **non-commercial scientific research purposes** as defined in the [LICENSE file](LICENSE). By downloading and using this code you agree to the terms in the [LICENSE](LICENSE). Third-party datasets and software are subject to their respective licenses.
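As a pointer for the evaluation metrics in the table above: `lib/utils/eval_utils.py` (reproduced below) exposes `compute_errors`, which returns both the pelvis-aligned MPJPE and the Procrustes-aligned PA-MPJPE over the 14 common joints. A minimal sketch with random arrays standing in for real joint predictions (inputs are assumed to be in meters):

```python
import numpy as np
from lib.utils.eval_utils import compute_errors

gt3ds = np.random.randn(100, 14, 3)   # ground-truth joints: N x 14 x 3
preds = np.random.randn(100, 14, 3)   # predicted joints:    N x 14 x 3

errors, errors_pa = compute_errors(gt3ds, preds)  # per-frame errors in meters
print(f'MPJPE: {np.mean(errors) * 1000:.2f} mm, PA-MPJPE: {np.mean(errors_pa) * 1000:.2f} mm')
```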
165 | 166 | 167 | ## References 168 | We indicate if a function or script is borrowed externally inside each file. Here are some great resources we 169 | benefited from: 170 | 171 | - Pretrained HMR and some functions are borrowed from [SPIN](https://github.com/nkolot/SPIN). 172 | - The SMPL model and layers are from the [SMPL-X model](https://github.com/vchoutas/smplx). 173 | - Some functions are borrowed from [Temporal HMR](https://github.com/akanazawa/human_dynamics). 174 | - Some functions are borrowed from [HMR-pytorch](https://github.com/MandyMo/pytorch_HMR). 175 | - Some functions are borrowed from [Kornia](https://github.com/kornia/kornia). 176 | - The pose tracker is from [STAF](https://github.com/soulslicer/openpose/tree/staf). 177 | 178 | -------------------------------------------------------------------------------- /lib/utils/eval_utils.py: -------------------------------------------------------------------------------- 1 | # Some functions are borrowed from https://github.com/akanazawa/human_dynamics/blob/master/src/evaluation/eval_util.py 2 | # Adhere to their license when using these functions 3 | 4 | import torch 5 | import numpy as np 6 | 7 | 8 | def compute_accel(joints): 9 | """ 10 | Computes acceleration of 3D joints. 11 | Args: 12 | joints (Nx25x3). 13 | Returns: 14 | Accelerations (N-2). 15 | """ 16 | velocities = joints[1:] - joints[:-1] 17 | acceleration = velocities[1:] - velocities[:-1] 18 | acceleration_normed = np.linalg.norm(acceleration, axis=2) 19 | return np.mean(acceleration_normed, axis=1) 20 | 21 | 22 | def compute_error_accel(joints_gt, joints_pred, vis=None): 23 | """ 24 | Computes acceleration error: 25 | 1/(n-2) \sum_{i=1}^{n-1} || x_{i-1} - 2 x_i + x_{i+1} ||_2 26 | Note that for each frame that is not visible, three entries in the 27 | acceleration error should be zeroed out. 28 | Args: 29 | joints_gt (Nx14x3). 30 | joints_pred (Nx14x3). 31 | vis (N). 32 | Returns: 33 | error_accel (N-2). 34 | """ 35 | # (N-2)x14x3 36 | accel_gt = joints_gt[:-2] - 2 * joints_gt[1:-1] + joints_gt[2:] 37 | accel_pred = joints_pred[:-2] - 2 * joints_pred[1:-1] + joints_pred[2:] 38 | 39 | normed = np.linalg.norm(accel_pred - accel_gt, axis=2) 40 | 41 | if vis is None: 42 | new_vis = np.ones(len(normed), dtype=bool) 43 | else: 44 | invis = np.logical_not(vis) 45 | invis1 = np.roll(invis, -1) 46 | invis2 = np.roll(invis, -2) 47 | new_invis = np.logical_or(invis, np.logical_or(invis1, invis2))[:-2] 48 | new_vis = np.logical_not(new_invis) 49 | 50 | return np.mean(normed[new_vis], axis=1) 51 | 52 | 53 | def compute_error_verts(pred_verts, target_verts=None, target_theta=None): 54 | """ 55 | Computes mean per-vertex position error (PVE) over the 6890 SMPL surface vertices. 56 | Args: 57 | pred_verts (Nx6890x3). 58 | target_verts (Nx6890x3), or target_theta (Nx85) from which target vertices are regressed. 59 | Returns: 60 | error_verts (N).
61 | """ 62 | 63 | if target_verts is None: 64 | from lib.models.smpl import SMPL_MODEL_DIR 65 | from lib.models.smpl import SMPL 66 | device = 'cpu' 67 | smpl = SMPL( 68 | SMPL_MODEL_DIR, 69 | batch_size=1, # target_theta.shape[0], 70 | ).to(device) 71 | 72 | betas = torch.from_numpy(target_theta[:,75:]).to(device) 73 | pose = torch.from_numpy(target_theta[:,3:75]).to(device) 74 | 75 | target_verts = [] 76 | b_ = torch.split(betas, 5000) 77 | p_ = torch.split(pose, 5000) 78 | 79 | for b,p in zip(b_,p_): 80 | output = smpl(betas=b, body_pose=p[:, 3:], global_orient=p[:, :3], pose2rot=True) 81 | target_verts.append(output.vertices.detach().cpu().numpy()) 82 | 83 | target_verts = np.concatenate(target_verts, axis=0) 84 | 85 | assert len(pred_verts) == len(target_verts) 86 | error_per_vert = np.sqrt(np.sum((target_verts - pred_verts) ** 2, axis=2)) 87 | return np.mean(error_per_vert, axis=1) 88 | 89 | 90 | def compute_similarity_transform(S1, S2): 91 | ''' 92 | Computes a similarity transform (sR, t) that takes 93 | a set of 3D points S1 (3 x N) closest to a set of 3D points S2, 94 | where R is an 3x3 rotation matrix, t 3x1 translation, s scale. 95 | i.e. solves the orthogonal Procrutes problem. 96 | ''' 97 | transposed = False 98 | if S1.shape[0] != 3 and S1.shape[0] != 2: 99 | S1 = S1.T 100 | S2 = S2.T 101 | transposed = True 102 | assert(S2.shape[1] == S1.shape[1]) 103 | 104 | # 1. Remove mean. 105 | mu1 = S1.mean(axis=1, keepdims=True) 106 | mu2 = S2.mean(axis=1, keepdims=True) 107 | X1 = S1 - mu1 108 | X2 = S2 - mu2 109 | 110 | # 2. Compute variance of X1 used for scale. 111 | var1 = np.sum(X1**2) 112 | 113 | # 3. The outer product of X1 and X2. 114 | K = X1.dot(X2.T) 115 | 116 | # 4. Solution that Maximizes trace(R'K) is R=U*V', where U, V are 117 | # singular vectors of K. 118 | U, s, Vh = np.linalg.svd(K) 119 | V = Vh.T 120 | # Construct Z that fixes the orientation of R to get det(R)=1. 121 | Z = np.eye(U.shape[0]) 122 | Z[-1, -1] *= np.sign(np.linalg.det(U.dot(V.T))) 123 | # Construct R. 124 | R = V.dot(Z.dot(U.T)) 125 | 126 | # 5. Recover scale. 127 | scale = np.trace(R.dot(K)) / var1 128 | 129 | # 6. Recover translation. 130 | t = mu2 - scale*(R.dot(mu1)) 131 | 132 | # 7. Error: 133 | S1_hat = scale*R.dot(S1) + t 134 | 135 | if transposed: 136 | S1_hat = S1_hat.T 137 | 138 | return S1_hat 139 | 140 | 141 | def compute_similarity_transform_torch(S1, S2): 142 | ''' 143 | Computes a similarity transform (sR, t) that takes 144 | a set of 3D points S1 (3 x N) closest to a set of 3D points S2, 145 | where R is an 3x3 rotation matrix, t 3x1 translation, s scale. 146 | i.e. solves the orthogonal Procrutes problem. 147 | ''' 148 | transposed = False 149 | if S1.shape[0] != 3 and S1.shape[0] != 2: 150 | S1 = S1.T 151 | S2 = S2.T 152 | transposed = True 153 | assert (S2.shape[1] == S1.shape[1]) 154 | 155 | # 1. Remove mean. 156 | mu1 = S1.mean(axis=1, keepdims=True) 157 | mu2 = S2.mean(axis=1, keepdims=True) 158 | X1 = S1 - mu1 159 | X2 = S2 - mu2 160 | 161 | # print('X1', X1.shape) 162 | 163 | # 2. Compute variance of X1 used for scale. 164 | var1 = torch.sum(X1 ** 2) 165 | 166 | # print('var', var1.shape) 167 | 168 | # 3. The outer product of X1 and X2. 169 | K = X1.mm(X2.T) 170 | 171 | # 4. Solution that Maximizes trace(R'K) is R=U*V', where U, V are 172 | # singular vectors of K. 173 | U, s, V = torch.svd(K) 174 | # V = Vh.T 175 | # Construct Z that fixes the orientation of R to get det(R)=1. 
176 | Z = torch.eye(U.shape[0], device=S1.device) 177 | Z[-1, -1] *= torch.sign(torch.det(U @ V.T)) 178 | # Construct R. 179 | R = V.mm(Z.mm(U.T)) 180 | 181 | # print('R', X1.shape) 182 | 183 | # 5. Recover scale. 184 | scale = torch.trace(R.mm(K)) / var1 185 | # print(R.shape, mu1.shape) 186 | # 6. Recover translation. 187 | t = mu2 - scale * (R.mm(mu1)) 188 | # print(t.shape) 189 | 190 | # 7. Error: 191 | S1_hat = scale * R.mm(S1) + t 192 | 193 | if transposed: 194 | S1_hat = S1_hat.T 195 | 196 | return S1_hat 197 | 198 | 199 | def batch_compute_similarity_transform_torch(S1, S2): 200 | ''' 201 | Computes a similarity transform (sR, t) that maps 202 | a set of 3D points S1 (3 x N) as closely as possible onto a set of 3D points S2, 203 | where R is a 3x3 rotation matrix, t a 3x1 translation, and s a scale factor, 204 | i.e. it solves the orthogonal Procrustes problem. 205 | ''' 206 | transposed = False 207 | if S1.shape[0] != 3 and S1.shape[0] != 2: 208 | S1 = S1.permute(0,2,1) 209 | S2 = S2.permute(0,2,1) 210 | transposed = True 211 | assert(S2.shape[1] == S1.shape[1]) 212 | 213 | # 1. Remove mean. 214 | mu1 = S1.mean(axis=-1, keepdims=True) 215 | mu2 = S2.mean(axis=-1, keepdims=True) 216 | 217 | X1 = S1 - mu1 218 | X2 = S2 - mu2 219 | 220 | # 2. Compute variance of X1 used for scale. 221 | var1 = torch.sum(X1**2, dim=1).sum(dim=1) 222 | 223 | # 3. The outer product of X1 and X2. 224 | K = X1.bmm(X2.permute(0,2,1)) 225 | 226 | # 4. The solution that maximizes trace(R'K) is R=U*V', where U, V are 227 | # singular vectors of K. 228 | U, s, V = torch.svd(K) 229 | 230 | # Construct Z that fixes the orientation of R to get det(R)=1. 231 | Z = torch.eye(U.shape[1], device=S1.device).unsqueeze(0) 232 | Z = Z.repeat(U.shape[0],1,1) 233 | Z[:,-1, -1] *= torch.sign(torch.det(U.bmm(V.permute(0,2,1)))) 234 | 235 | # Construct R. 236 | R = V.bmm(Z.bmm(U.permute(0,2,1))) 237 | 238 | # 5. Recover scale. 239 | scale = torch.cat([torch.trace(x).unsqueeze(0) for x in R.bmm(K)]) / var1 240 | 241 | # 6. Recover translation. 242 | t = mu2 - (scale.unsqueeze(-1).unsqueeze(-1) * (R.bmm(mu1))) 243 | 244 | # 7. Error: 245 | S1_hat = scale.unsqueeze(-1).unsqueeze(-1) * R.bmm(S1) + t 246 | 247 | if transposed: 248 | S1_hat = S1_hat.permute(0,2,1) 249 | 250 | return S1_hat 251 | 252 | 253 | def align_by_pelvis(joints): 254 | """ 255 | Assumes joints is 14 x 3 in LSP order. 256 | Then hips are: [3, 2] 257 | Takes the midpoint of these points, then subtracts it. 258 | """ 259 | 260 | left_id = 2 261 | right_id = 3 262 | 263 | pelvis = (joints[left_id, :] + joints[right_id, :]) / 2.0 264 | return joints - np.expand_dims(pelvis, axis=0) 265 | 266 | 267 | def compute_errors(gt3ds, preds): 268 | """ 269 | Gets MPJPE after pelvis alignment + MPJPE after Procrustes. 270 | Evaluates on the 14 common joints. 271 | Inputs: 272 | - gt3ds: N x 14 x 3 273 | - preds: N x 14 x 3 274 | """ 275 | errors, errors_pa = [], [] 276 | for i, (gt3d, pred) in enumerate(zip(gt3ds, preds)): 277 | gt3d = gt3d.reshape(-1, 3) 278 | # Root align. 279 | gt3d = align_by_pelvis(gt3d) 280 | pred3d = align_by_pelvis(pred) 281 | 282 | joint_error = np.sqrt(np.sum((gt3d - pred3d)**2, axis=1)) 283 | errors.append(np.mean(joint_error)) 284 | 285 | # Get PA error.
286 | pred3d_sym = compute_similarity_transform(pred3d, gt3d) 287 | pa_error = np.sqrt(np.sum((gt3d - pred3d_sym)**2, axis=1)) 288 | errors_pa.append(np.mean(pa_error)) 289 | 290 | return errors, errors_pa 291 | -------------------------------------------------------------------------------- /lib/smplify/prior.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is 4 | # holder of all proprietary rights on this computer program. 5 | # You can only use this computer program if you have closed 6 | # a license agreement with MPG or you get the right to use the computer 7 | # program from someone who is authorized to grant you that right. 8 | # Any use of the computer program without a valid license is prohibited and 9 | # liable to prosecution. 10 | # 11 | # Copyright©2019 Max-Planck-Gesellschaft zur Förderung 12 | # der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute 13 | # for Intelligent Systems. All rights reserved. 14 | # 15 | # Contact: ps-license@tuebingen.mpg.de 16 | 17 | from __future__ import absolute_import 18 | from __future__ import print_function 19 | from __future__ import division 20 | 21 | import sys 22 | import os 23 | 24 | import time 25 | import pickle 26 | 27 | import numpy as np 28 | 29 | import torch 30 | import torch.nn as nn 31 | 32 | DEFAULT_DTYPE = torch.float32 33 | 34 | 35 | def create_prior(prior_type, **kwargs): 36 | if prior_type == 'gmm': 37 | prior = MaxMixturePrior(**kwargs) 38 | elif prior_type == 'l2': 39 | return L2Prior(**kwargs) 40 | elif prior_type == 'angle': 41 | return SMPLifyAnglePrior(**kwargs) 42 | elif prior_type == 'none' or prior_type is None: 43 | # Don't use any pose prior 44 | def no_prior(*args, **kwargs): 45 | return 0.0 46 | prior = no_prior 47 | else: 48 | raise ValueError('Prior {}'.format(prior_type) + ' is not implemented') 49 | return prior 50 | 51 | 52 | class SMPLifyAnglePrior(nn.Module): 53 | def __init__(self, dtype=torch.float32, **kwargs): 54 | super(SMPLifyAnglePrior, self).__init__() 55 | 56 | # Indices for the rotation angle of 57 | # 55: left elbow, 90deg bend at -np.pi/2 58 | # 58: right elbow, 90deg bend at np.pi/2 59 | # 12: left knee, 90deg bend at np.pi/2 60 | # 15: right knee, 90deg bend at np.pi/2 61 | angle_prior_idxs = np.array([55, 58, 12, 15], dtype=np.int64) 62 | angle_prior_idxs = torch.tensor(angle_prior_idxs, dtype=torch.long) 63 | self.register_buffer('angle_prior_idxs', angle_prior_idxs) 64 | 65 | angle_prior_signs = np.array([1, -1, -1, -1], 66 | dtype=np.float32 if dtype == torch.float32 67 | else np.float64) 68 | angle_prior_signs = torch.tensor(angle_prior_signs, 69 | dtype=dtype) 70 | self.register_buffer('angle_prior_signs', angle_prior_signs) 71 | 72 | def forward(self, pose, with_global_pose=False): 73 | ''' Returns the angle prior loss for the given pose 74 | 75 | Args: 76 | pose: (Bx[23 + 1] * 3) torch tensor with the axis-angle 77 | representation of the rotations of the joints of the SMPL model. 78 | Kwargs: 79 | with_global_pose: Whether the pose vector also contains the global 80 | orientation of the SMPL model. If not then the indices must be 81 | corrected. 82 | Returns: 83 | A size (B) tensor containing the angle prior loss for each element 84 | in the batch.
85 | ''' 86 | angle_prior_idxs = self.angle_prior_idxs - (not with_global_pose) * 3 87 | return torch.exp(pose[:, angle_prior_idxs] * 88 | self.angle_prior_signs).pow(2) 89 | 90 | 91 | class L2Prior(nn.Module): 92 | def __init__(self, dtype=DEFAULT_DTYPE, reduction='sum', **kwargs): 93 | super(L2Prior, self).__init__() 94 | 95 | def forward(self, module_input, *args): 96 | return torch.sum(module_input.pow(2)) 97 | 98 | 99 | class MaxMixturePrior(nn.Module): 100 | 101 | def __init__(self, prior_folder='prior', 102 | num_gaussians=6, dtype=DEFAULT_DTYPE, epsilon=1e-16, 103 | use_merged=True, 104 | **kwargs): 105 | super(MaxMixturePrior, self).__init__() 106 | 107 | if dtype == DEFAULT_DTYPE: 108 | np_dtype = np.float32 109 | elif dtype == torch.float64: 110 | np_dtype = np.float64 111 | else: 112 | print('Unknown float type {}, exiting!'.format(dtype)) 113 | sys.exit(-1) 114 | 115 | self.num_gaussians = num_gaussians 116 | self.epsilon = epsilon 117 | self.use_merged = use_merged 118 | gmm_fn = 'gmm_{:02d}.pkl'.format(num_gaussians) 119 | 120 | full_gmm_fn = os.path.join(prior_folder, gmm_fn) 121 | if not os.path.exists(full_gmm_fn): 122 | print('The path to the mixture prior "{}"'.format(full_gmm_fn) + 123 | ' does not exist, exiting!') 124 | sys.exit(-1) 125 | 126 | with open(full_gmm_fn, 'rb') as f: 127 | gmm = pickle.load(f, encoding='latin1') 128 | 129 | if type(gmm) == dict: 130 | means = gmm['means'].astype(np_dtype) 131 | covs = gmm['covars'].astype(np_dtype) 132 | weights = gmm['weights'].astype(np_dtype) 133 | elif 'sklearn.mixture.gmm.GMM' in str(type(gmm)): 134 | means = gmm.means_.astype(np_dtype) 135 | covs = gmm.covars_.astype(np_dtype) 136 | weights = gmm.weights_.astype(np_dtype) 137 | else: 138 | print('Unknown type for the prior: {}, exiting!'.format(type(gmm))) 139 | sys.exit(-1) 140 | 141 | self.register_buffer('means', torch.tensor(means, dtype=dtype)) 142 | 143 | self.register_buffer('covs', torch.tensor(covs, dtype=dtype)) 144 | 145 | precisions = [np.linalg.inv(cov) for cov in covs] 146 | precisions = np.stack(precisions).astype(np_dtype) 147 | 148 | self.register_buffer('precisions', 149 | torch.tensor(precisions, dtype=dtype)) 150 | 151 | # The constant term: 152 | sqrdets = np.array([(np.sqrt(np.linalg.det(c))) 153 | for c in gmm['covars']]) 154 | const = (2 * np.pi)**(69 / 2.) 
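# ((2 * pi)^(69 / 2) is the Gaussian normalization constant for the 69-D body pose,
# i.e. 23 body joints x 3 axis-angle components; it enters the NLL weights computed next.)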
155 | 156 | nll_weights = np.asarray(gmm['weights'] / (const * 157 | (sqrdets / sqrdets.min()))) 158 | nll_weights = torch.tensor(nll_weights, dtype=dtype).unsqueeze(dim=0) 159 | self.register_buffer('nll_weights', nll_weights) 160 | 161 | weights = torch.tensor(gmm['weights'], dtype=dtype).unsqueeze(dim=0) 162 | self.register_buffer('weights', weights) 163 | 164 | self.register_buffer('pi_term', 165 | torch.log(torch.tensor(2 * np.pi, dtype=dtype))) 166 | 167 | cov_dets = [np.log(np.linalg.det(cov.astype(np_dtype)) + epsilon) 168 | for cov in covs] 169 | self.register_buffer('cov_dets', 170 | torch.tensor(cov_dets, dtype=dtype)) 171 | 172 | # The dimensionality of the random variable 173 | self.random_var_dim = self.means.shape[1] 174 | 175 | def get_mean(self): 176 | ''' Returns the mean of the mixture ''' 177 | mean_pose = torch.matmul(self.weights, self.means) 178 | return mean_pose 179 | 180 | def merged_log_likelihood(self, pose, betas): 181 | diff_from_mean = pose.unsqueeze(dim=1) - self.means 182 | 183 | prec_diff_prod = torch.einsum('mij,bmj->bmi', 184 | [self.precisions, diff_from_mean]) 185 | diff_prec_quadratic = (prec_diff_prod * diff_from_mean).sum(dim=-1) 186 | 187 | curr_loglikelihood = 0.5 * diff_prec_quadratic - \ 188 | torch.log(self.nll_weights) 189 | # curr_loglikelihood = 0.5 * (self.cov_dets.unsqueeze(dim=0) + 190 | # self.random_var_dim * self.pi_term + 191 | # diff_prec_quadratic 192 | # ) - torch.log(self.weights) 193 | 194 | min_likelihood, _ = torch.min(curr_loglikelihood, dim=1) 195 | return min_likelihood 196 | 197 | def log_likelihood(self, pose, betas, *args, **kwargs): 198 | ''' Create graph operation for negative log-likelihood calculation 199 | ''' 200 | likelihoods = [] 201 | 202 | for idx in range(self.num_gaussians): 203 | mean = self.means[idx] 204 | prec = self.precisions[idx] 205 | cov = self.covs[idx] 206 | diff_from_mean = pose - mean 207 | 208 | curr_loglikelihood = torch.einsum('bj,ji->bi', 209 | [diff_from_mean, prec]) 210 | curr_loglikelihood = torch.einsum('bi,bi->b', 211 | [curr_loglikelihood, 212 | diff_from_mean]) 213 | cov_term = torch.log(torch.det(cov) + self.epsilon) 214 | curr_loglikelihood += 0.5 * (cov_term + 215 | self.random_var_dim * 216 | self.pi_term) 217 | likelihoods.append(curr_loglikelihood) 218 | 219 | log_likelihoods = torch.stack(likelihoods, dim=1) 220 | min_idx = torch.argmin(log_likelihoods, dim=1) 221 | weight_component = self.nll_weights[:, min_idx] 222 | weight_component = -torch.log(weight_component) 223 | 224 | return weight_component + log_likelihoods[:, min_idx] 225 | 226 | def forward(self, pose, betas): 227 | if self.use_merged: 228 | return self.merged_log_likelihood(pose, betas) 229 | else: 230 | return self.log_likelihood(pose, betas) 231 | -------------------------------------------------------------------------------- /lib/smplify/losses.py: -------------------------------------------------------------------------------- 1 | # This script is the extended version of https://github.com/nkolot/SPIN/blob/master/smplify/losses.py to deal with 2 | # sequences inputs. 
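# Background for the robust term defined below: gmof() implements the Geman-McClure
# penalty rho(x) = (sigma^2 * x^2) / (sigma^2 + x^2), which behaves like a squared error
# for residuals much smaller than sigma but saturates at sigma^2, so outlier 2D joint
# detections cannot dominate the fitting objective.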
3 | 4 | import torch 5 | from lib.models.spin import perspective_projection 6 | from lib.models.smpl import JOINT_IDS 7 | 8 | 9 | def gmof(x, sigma): 10 | """ 11 | Geman-McClure error function 12 | """ 13 | x_squared = x ** 2 14 | sigma_squared = sigma ** 2 15 | return (sigma_squared * x_squared) / (sigma_squared + x_squared) 16 | 17 | 18 | def angle_prior(pose): 19 | """ 20 | Angle prior that penalizes unnatural bending of the knees and elbows 21 | """ 22 | # We subtract 3 because pose does not include the global rotation of the model 23 | return torch.exp( 24 | pose[:, [55 - 3, 58 - 3, 12 - 3, 15 - 3]] * torch.tensor([1., -1., -1, -1.], device=pose.device)) ** 2 25 | 26 | 27 | def body_fitting_loss(body_pose, betas, model_joints, camera_t, camera_center, 28 | joints_2d, joints_conf, pose_prior, 29 | focal_length=5000, sigma=100, pose_prior_weight=4.78, 30 | shape_prior_weight=5, angle_prior_weight=15.2, 31 | output='sum'): 32 | """ 33 | Loss function for body fitting 34 | """ 35 | # pose_prior_weight = 1. 36 | # shape_prior_weight = 1. 37 | # angle_prior_weight = 1. 38 | # sigma = 10. 39 | 40 | batch_size = body_pose.shape[0] 41 | rotation = torch.eye(3, device=body_pose.device).unsqueeze(0).expand(batch_size, -1, -1) 42 | projected_joints = perspective_projection(model_joints, rotation, camera_t, 43 | focal_length, camera_center) 44 | 45 | # Weighted robust reprojection error 46 | reprojection_error = gmof(projected_joints - joints_2d, sigma) 47 | reprojection_loss = (joints_conf ** 2) * reprojection_error.sum(dim=-1) 48 | 49 | # Pose prior loss 50 | pose_prior_loss = (pose_prior_weight ** 2) * pose_prior(body_pose, betas) 51 | 52 | # Angle prior for knees and elbows 53 | angle_prior_loss = (angle_prior_weight ** 2) * angle_prior(body_pose).sum(dim=-1) 54 | 55 | # Regularizer to prevent betas from taking large values 56 | shape_prior_loss = (shape_prior_weight ** 2) * (betas ** 2).sum(dim=-1) 57 | 58 | total_loss = reprojection_loss.sum(dim=-1) + pose_prior_loss + angle_prior_loss + shape_prior_loss 59 | print(f'joints: {reprojection_loss[0].sum().item():.2f}, ' 60 | f'pose_prior: {pose_prior_loss[0].item():.2f}, ' 61 | f'angle_prior: {angle_prior_loss[0].item():.2f}, ' 62 | f'shape_prior: {shape_prior_loss[0].item():.2f}') 63 | 64 | if output == 'sum': 65 | return total_loss.sum() 66 | elif output == 'reprojection': 67 | return reprojection_loss 68 | 69 | 70 | def camera_fitting_loss(model_joints, camera_t, camera_t_est, camera_center, joints_2d, joints_conf, 71 | focal_length=5000, depth_loss_weight=100): 72 | """ 73 | Loss function for camera optimization. 
74 | """ 75 | 76 | # Project model joints 77 | batch_size = model_joints.shape[0] 78 | rotation = torch.eye(3, device=model_joints.device).unsqueeze(0).expand(batch_size, -1, -1) 79 | projected_joints = perspective_projection(model_joints, rotation, camera_t, 80 | focal_length, camera_center) 81 | 82 | op_joints = ['OP RHip', 'OP LHip', 'OP RShoulder', 'OP LShoulder'] 83 | op_joints_ind = [JOINT_IDS[joint] for joint in op_joints] 84 | gt_joints = ['Right Hip', 'Left Hip', 'Right Shoulder', 'Left Shoulder'] 85 | gt_joints_ind = [JOINT_IDS[joint] for joint in gt_joints] 86 | reprojection_error_op = (joints_2d[:, op_joints_ind] - 87 | projected_joints[:, op_joints_ind]) ** 2 88 | reprojection_error_gt = (joints_2d[:, gt_joints_ind] - 89 | projected_joints[:, gt_joints_ind]) ** 2 90 | 91 | # Check if for each example in the batch all 4 OpenPose detections are valid, otherwise use the GT detections 92 | # OpenPose joints are more reliable for this task, so we prefer to use them if possible 93 | is_valid = (joints_conf[:, op_joints_ind].min(dim=-1)[0][:, None, None] > 0).float() 94 | reprojection_loss = (is_valid * reprojection_error_op + (1 - is_valid) * reprojection_error_gt).sum(dim=(1, 2)) 95 | 96 | # Loss that penalizes deviation from depth estimate 97 | depth_loss = (depth_loss_weight ** 2) * (camera_t[:, 2] - camera_t_est[:, 2]) ** 2 98 | 99 | total_loss = reprojection_loss + depth_loss 100 | return total_loss.sum() 101 | 102 | 103 | def temporal_body_fitting_loss(body_pose, betas, model_joints, camera_t, camera_center, 104 | joints_2d, joints_conf, pose_prior, 105 | focal_length=5000, sigma=100, pose_prior_weight=4.78, 106 | shape_prior_weight=5, angle_prior_weight=15.2, 107 | smooth_2d_weight=0.01, smooth_3d_weight=1.0, 108 | output='sum'): 109 | """ 110 | Loss function for body fitting 111 | """ 112 | # pose_prior_weight = 1. 113 | # shape_prior_weight = 1. 114 | # angle_prior_weight = 1. 115 | # sigma = 10. 116 | 117 | batch_size = body_pose.shape[0] 118 | rotation = torch.eye(3, device=body_pose.device).unsqueeze(0).expand(batch_size, -1, -1) 119 | projected_joints = perspective_projection(model_joints, rotation, camera_t, 120 | focal_length, camera_center) 121 | 122 | # Weighted robust reprojection error 123 | reprojection_error = gmof(projected_joints - joints_2d, sigma) 124 | reprojection_loss = (joints_conf ** 2) * reprojection_error.sum(dim=-1) 125 | 126 | # Pose prior loss 127 | pose_prior_loss = (pose_prior_weight ** 2) * pose_prior(body_pose, betas) 128 | 129 | # Angle prior for knees and elbows 130 | angle_prior_loss = (angle_prior_weight ** 2) * angle_prior(body_pose).sum(dim=-1) 131 | 132 | # Regularizer to prevent betas from taking large values 133 | shape_prior_loss = (shape_prior_weight ** 2) * (betas ** 2).sum(dim=-1) 134 | 135 | total_loss = reprojection_loss.sum(dim=-1) + pose_prior_loss + angle_prior_loss + shape_prior_loss 136 | 137 | # Smooth 2d joint loss 138 | joint_conf_diff = joints_conf[1:] 139 | joints_2d_diff = projected_joints[1:] - projected_joints[:-1] 140 | smooth_j2d_loss = (joint_conf_diff ** 2) * joints_2d_diff.abs().sum(dim=-1) 141 | smooth_j2d_loss = torch.cat( 142 | [torch.zeros(1, smooth_j2d_loss.shape[1], device=body_pose.device), smooth_j2d_loss] 143 | ).sum(dim=-1) 144 | smooth_j2d_loss = (smooth_2d_weight ** 2) * smooth_j2d_loss 145 | 146 | # Smooth 3d joint loss 147 | joints_3d_diff = model_joints[1:] - model_joints[:-1] 148 | # joints_3d_diff = joints_3d_diff * 100. 
149 |     smooth_j3d_loss = (joint_conf_diff ** 2) * joints_3d_diff.abs().sum(dim=-1)
150 |     smooth_j3d_loss = torch.cat(
151 |         [torch.zeros(1, smooth_j3d_loss.shape[1], device=body_pose.device), smooth_j3d_loss]
152 |     ).sum(dim=-1)
153 |     smooth_j3d_loss = (smooth_3d_weight ** 2) * smooth_j3d_loss
154 |
155 |     total_loss += smooth_j2d_loss + smooth_j3d_loss
156 |
157 |     # print(f'joints: {reprojection_loss[0].sum().item():.2f}, '
158 |     #       f'pose_prior: {pose_prior_loss[0].item():.2f}, '
159 |     #       f'angle_prior: {angle_prior_loss[0].item():.2f}, '
160 |     #       f'shape_prior: {shape_prior_loss[0].item():.2f}, '
161 |     #       f'smooth_j2d: {smooth_j2d_loss.sum().item()}, '
162 |     #       f'smooth_j3d: {smooth_j3d_loss.sum().item()}')
163 |
164 |     if output == 'sum':
165 |         return total_loss.sum()
166 |     elif output == 'reprojection':
167 |         return reprojection_loss
168 |
169 |
170 | def temporal_camera_fitting_loss(model_joints, camera_t, camera_t_est, camera_center, joints_2d, joints_conf,
171 |                                  focal_length=5000, depth_loss_weight=100):
172 |     """
173 |     Loss function for camera optimization.
174 |     """
175 |
176 |     # Project model joints
177 |     batch_size = model_joints.shape[0]
178 |     rotation = torch.eye(3, device=model_joints.device).unsqueeze(0).expand(batch_size, -1, -1)
179 |     projected_joints = perspective_projection(model_joints, rotation, camera_t,
180 |                                               focal_length, camera_center)
181 |
182 |     op_joints = ['OP RHip', 'OP LHip', 'OP RShoulder', 'OP LShoulder']
183 |     op_joints_ind = [JOINT_IDS[joint] for joint in op_joints]
184 |     # gt_joints = ['Right Hip', 'Left Hip', 'Right Shoulder', 'Left Shoulder']
185 |     # gt_joints_ind = [constants.JOINT_IDS[joint] for joint in gt_joints]
186 |     reprojection_error_op = (joints_2d[:, op_joints_ind] -
187 |                              projected_joints[:, op_joints_ind]) ** 2
188 |     # reprojection_error_gt = (joints_2d[:, gt_joints_ind] -
189 |     #                          projected_joints[:, gt_joints_ind]) ** 2
190 |
191 |     # Only the 4 OpenPose torso joints are used here; examples where any of them is invalid
192 |     # are masked out (the GT fallback used in camera_fitting_loss above is disabled)
193 |     is_valid = (joints_conf[:, op_joints_ind].min(dim=-1)[0][:, None, None] > 0).float()
194 |     reprojection_loss = (is_valid * reprojection_error_op).sum(dim=(1, 2))
195 |
196 |     # Loss that penalizes deviation from depth estimate
197 |     depth_loss = (depth_loss_weight ** 2) * (camera_t[:, 2] - camera_t_est[:, 2]) ** 2
198 |
199 |     total_loss = reprojection_loss + depth_loss
200 |     return total_loss.sum()
201 |
--------------------------------------------------------------------------------
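Editor's note: the two smoothness terms in temporal_body_fitting_loss are plain first-difference penalties over the frame dimension; a padded zero row keeps one loss value per frame. A standalone sketch of the pattern (illustrative shapes, not the repo's API):

```python
import torch

T, J = 8, 49                   # frames, joints (shapes are illustrative)
joints = torch.randn(T, J, 3)  # per-frame 3D joints
conf = torch.rand(T, J)        # per-joint confidences

diff = joints[1:] - joints[:-1]                        # (T-1, J, 3) frame-to-frame motion
per_joint = (conf[1:] ** 2) * diff.abs().sum(dim=-1)   # weight by confidence of the later frame
smooth_loss = torch.cat([torch.zeros(1, J), per_joint]).sum(dim=-1)  # pad frame 0, reduce joints
print(smooth_loss.shape)       # torch.Size([8]) -> one smoothness value per frame
```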
/lib/core/loss.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is
4 | # holder of all proprietary rights on this computer program.
5 | # You can only use this computer program if you have closed
6 | # a license agreement with MPG or you get the right to use the computer
7 | # program from someone who is authorized to grant you that right.
8 | # Any use of the computer program without a valid license is prohibited and
9 | # liable to prosecution.
10 | #
11 | # Copyright©2019 Max-Planck-Gesellschaft zur Förderung
12 | # der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute
13 | # for Intelligent Systems. All rights reserved.
14 | #
15 | # Contact: ps-license@tuebingen.mpg.de
16 |
17 | import torch
18 | import torch.nn as nn
19 |
20 | from lib.utils.geometry import batch_rodrigues
21 |
22 | class VIBELoss(nn.Module):
23 |     def __init__(
24 |             self,
25 |             e_loss_weight=60.,
26 |             e_3d_loss_weight=30.,
27 |             e_pose_loss_weight=1.,
28 |             e_shape_loss_weight=0.001,
29 |             d_motion_loss_weight=1.,
30 |             device='cuda',
31 |     ):
32 |         super(VIBELoss, self).__init__()
33 |         self.e_loss_weight = e_loss_weight
34 |         self.e_3d_loss_weight = e_3d_loss_weight
35 |         self.e_pose_loss_weight = e_pose_loss_weight
36 |         self.e_shape_loss_weight = e_shape_loss_weight
37 |         self.d_motion_loss_weight = d_motion_loss_weight
38 |
39 |         self.device = device
40 |         self.criterion_shape = nn.L1Loss().to(self.device)
41 |         self.criterion_keypoints = nn.MSELoss(reduction='none').to(self.device)
42 |         self.criterion_regr = nn.MSELoss().to(self.device)
43 |
44 |         self.enc_loss = batch_encoder_disc_l2_loss
45 |         self.dec_loss = batch_adv_disc_l2_loss
46 |
47 |     def forward(
48 |             self,
49 |             generator_outputs,
50 |             data_2d,
51 |             data_3d,
52 |             data_body_mosh=None,
53 |             data_motion_mosh=None,
54 |             body_discriminator=None,
55 |             motion_discriminator=None,
56 |     ):
57 |         # merge the batch and time dimensions: (N, T, ...) -> (N*T, ...)
58 |         reduce = lambda x: x.reshape((x.shape[0] * x.shape[1],) + x.shape[2:])
59 |         # flatten weight vectors into 1D masks
60 |         flatten = lambda x: x.reshape(-1)
61 |         # accumulate the predicted thetas from every IEF (iterative error feedback) step
62 |         accumulate_thetas = lambda x: torch.cat([output['theta'] for output in x], 0)
63 |
64 |         if data_2d:
65 |             sample_2d_count = data_2d['kp_2d'].shape[0]
66 |             real_2d = torch.cat((data_2d['kp_2d'], data_3d['kp_2d']), 0)
67 |         else:
68 |             sample_2d_count = 0
69 |             real_2d = data_3d['kp_2d']
70 |
71 |         real_2d = reduce(real_2d)
72 |
73 |         real_3d = reduce(data_3d['kp_3d'])
74 |         data_3d_theta = reduce(data_3d['theta'])
75 |
76 |         w_3d = data_3d['w_3d'].type(torch.bool)
77 |         w_smpl = data_3d['w_smpl'].type(torch.bool)
78 |
79 |         total_predict_thetas = accumulate_thetas(generator_outputs)
80 |
81 |         preds = generator_outputs[-1]
82 |
83 |         pred_j3d = preds['kp_3d'][sample_2d_count:]
84 |         pred_theta = preds['theta'][sample_2d_count:]
85 |
86 |         theta_size = pred_theta.shape[:2]
87 |
88 |         pred_theta = reduce(pred_theta)
89 |         pred_j2d = reduce(preds['kp_2d'])
90 |         pred_j3d = reduce(pred_j3d)
91 |
92 |         w_3d = flatten(w_3d)
93 |         w_smpl = flatten(w_smpl)
94 |
95 |         pred_theta = pred_theta[w_smpl]
96 |         pred_j3d = pred_j3d[w_3d]
97 |         data_3d_theta = data_3d_theta[w_smpl]
98 |         real_3d = real_3d[w_3d]
99 |
100 |         # <======== Generator Loss
101 |         loss_kp_2d = self.keypoint_loss(pred_j2d, real_2d, openpose_weight=1., gt_weight=1.) * self.e_loss_weight
102 |
103 |         loss_kp_3d = self.keypoint_3d_loss(pred_j3d, real_3d)
104 |         loss_kp_3d = loss_kp_3d * self.e_3d_loss_weight
105 |
106 |         real_shape, pred_shape = data_3d_theta[:, 75:], pred_theta[:, 75:]
107 |         real_pose, pred_pose = data_3d_theta[:, 3:75], pred_theta[:, 3:75]
108 |
109 |         loss_dict = {
110 |             'loss_kp_2d': loss_kp_2d,
111 |             'loss_kp_3d': loss_kp_3d,
112 |         }
113 |         if pred_theta.shape[0] > 0:
114 |             loss_pose, loss_shape = self.smpl_losses(pred_pose, pred_shape, real_pose, real_shape)
115 |             loss_shape = loss_shape * self.e_shape_loss_weight
116 |             loss_pose = loss_pose * self.e_pose_loss_weight
117 |             loss_dict['loss_shape'] = loss_shape
118 |             loss_dict['loss_pose'] = loss_pose
119 |
120 |         gen_loss = torch.stack(list(loss_dict.values())).sum()
121 |
122 |         # <======== Motion Discriminator Loss
123 |         end_idx = 75
124 |         start_idx = 6
125 |         pred_motion = total_predict_thetas
126 |         e_motion_disc_loss = self.enc_loss(motion_discriminator(pred_motion[:, :, start_idx:end_idx]))
127 |         e_motion_disc_loss = e_motion_disc_loss * self.d_motion_loss_weight
128 |
129 |         fake_motion = pred_motion.detach()
130 |         real_motion = data_motion_mosh['theta']
131 |         fake_disc_value = motion_discriminator(fake_motion[:, :, start_idx:end_idx])
132 |         real_disc_value = motion_discriminator(real_motion[:, :, start_idx:end_idx])
133 |         d_motion_disc_real, d_motion_disc_fake, d_motion_disc_loss = self.dec_loss(real_disc_value, fake_disc_value)
134 |
135 |         d_motion_disc_real = d_motion_disc_real * self.d_motion_loss_weight
136 |         d_motion_disc_fake = d_motion_disc_fake * self.d_motion_loss_weight
137 |         d_motion_disc_loss = d_motion_disc_loss * self.d_motion_loss_weight
138 |
139 |         loss_dict['e_m_disc_loss'] = e_motion_disc_loss
140 |         loss_dict['d_m_disc_real'] = d_motion_disc_real
141 |         loss_dict['d_m_disc_fake'] = d_motion_disc_fake
142 |         loss_dict['d_m_disc_loss'] = d_motion_disc_loss
143 |
144 |         gen_loss = gen_loss + e_motion_disc_loss
145 |         motion_dis_loss = d_motion_disc_loss
146 |
147 |         return gen_loss, motion_dis_loss, loss_dict
148 |
149 |     def keypoint_loss(self, pred_keypoints_2d, gt_keypoints_2d, openpose_weight, gt_weight):
150 |         """
151 |         Compute 2D reprojection loss on the keypoints.
152 |         The loss is weighted by the confidence.
153 |         The available keypoints are different for each dataset.
154 |         """
155 |         conf = gt_keypoints_2d[:, :, -1].unsqueeze(-1).clone()
156 |         conf[:, :25] *= openpose_weight
157 |         conf[:, 25:] *= gt_weight
158 |         loss = (conf * self.criterion_keypoints(pred_keypoints_2d, gt_keypoints_2d[:, :, :-1])).mean()
159 |         return loss
160 |
161 |     def keypoint_3d_loss(self, pred_keypoints_3d, gt_keypoints_3d):
162 |         """
163 |         Compute 3D keypoint loss for the examples for which 3D keypoint annotations are available.
164 |         The loss is weighted by the confidence.
165 | """ 166 | pred_keypoints_3d = pred_keypoints_3d[:, 25:39, :] 167 | gt_keypoints_3d = gt_keypoints_3d[:, 25:39, :] 168 | 169 | # conf = gt_keypoints_3d[:, :, -1].unsqueeze(-1).clone() 170 | # gt_keypoints_3d = gt_keypoints_3d[:, :, :-1].clone() 171 | # gt_keypoints_3d = gt_keypoints_3d 172 | # conf = conf 173 | pred_keypoints_3d = pred_keypoints_3d 174 | if len(gt_keypoints_3d) > 0: 175 | gt_pelvis = (gt_keypoints_3d[:, 2,:] + gt_keypoints_3d[:, 3,:]) / 2 176 | gt_keypoints_3d = gt_keypoints_3d - gt_pelvis[:, None, :] 177 | pred_pelvis = (pred_keypoints_3d[:, 2,:] + pred_keypoints_3d[:, 3,:]) / 2 178 | pred_keypoints_3d = pred_keypoints_3d - pred_pelvis[:, None, :] 179 | # print(conf.shape, pred_keypoints_3d.shape, gt_keypoints_3d.shape) 180 | # return (conf * self.criterion_keypoints(pred_keypoints_3d, gt_keypoints_3d)).mean() 181 | return self.criterion_keypoints(pred_keypoints_3d, gt_keypoints_3d).mean() 182 | else: 183 | return torch.FloatTensor(1).fill_(0.).to(self.device) 184 | 185 | def smpl_losses(self, pred_rotmat, pred_betas, gt_pose, gt_betas): 186 | pred_rotmat_valid = batch_rodrigues(pred_rotmat.reshape(-1,3)).reshape(-1, 24, 3, 3) 187 | gt_rotmat_valid = batch_rodrigues(gt_pose.reshape(-1,3)).reshape(-1, 24, 3, 3) 188 | pred_betas_valid = pred_betas 189 | gt_betas_valid = gt_betas 190 | if len(pred_rotmat_valid) > 0: 191 | loss_regr_pose = self.criterion_regr(pred_rotmat_valid, gt_rotmat_valid) 192 | loss_regr_betas = self.criterion_regr(pred_betas_valid, gt_betas_valid) 193 | else: 194 | loss_regr_pose = torch.FloatTensor(1).fill_(0.).to(self.device) 195 | loss_regr_betas = torch.FloatTensor(1).fill_(0.).to(self.device) 196 | return loss_regr_pose, loss_regr_betas 197 | 198 | 199 | def batch_encoder_disc_l2_loss(disc_value): 200 | ''' 201 | Inputs: 202 | disc_value: N x 25 203 | ''' 204 | k = disc_value.shape[0] 205 | return torch.sum((disc_value - 1.0) ** 2) * 1.0 / k 206 | 207 | 208 | def batch_adv_disc_l2_loss(real_disc_value, fake_disc_value): 209 | ''' 210 | Inputs: 211 | disc_value: N x 25 212 | ''' 213 | ka = real_disc_value.shape[0] 214 | kb = fake_disc_value.shape[0] 215 | lb, la = torch.sum(fake_disc_value ** 2) / kb, torch.sum((real_disc_value - 1) ** 2) / ka 216 | return la, lb, la + lb 217 | 218 | 219 | def batch_encoder_disc_wasserstein_loss(disc_value): 220 | ''' 221 | Inputs: 222 | disc_value: N x 25 223 | ''' 224 | k = disc_value.shape[0] 225 | return -1 * disc_value.sum() / k 226 | 227 | 228 | def batch_adv_disc_wasserstein_loss(real_disc_value, fake_disc_value): 229 | ''' 230 | Inputs: 231 | disc_value: N x 25 232 | ''' 233 | 234 | ka = real_disc_value.shape[0] 235 | kb = fake_disc_value.shape[0] 236 | 237 | la = -1 * real_disc_value.sum() / ka 238 | lb = fake_disc_value.sum() / kb 239 | return la, lb, la + lb 240 | 241 | 242 | def batch_smooth_pose_loss(pred_theta): 243 | pose = pred_theta[:,:,3:75] 244 | pose_diff = pose[:,1:,:] - pose[:,:-1,:] 245 | return torch.mean(pose_diff).abs() 246 | 247 | 248 | def batch_smooth_shape_loss(pred_theta): 249 | shape = pred_theta[:, :, 75:] 250 | shape_diff = shape[:, 1:, :] - shape[:, :-1, :] 251 | return torch.mean(shape_diff).abs() 252 | -------------------------------------------------------------------------------- /lib/utils/demo_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is 4 | # holder of all proprietary rights on this computer program. 
/lib/utils/demo_utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is
4 | # holder of all proprietary rights on this computer program.
5 | # You can only use this computer program if you have closed
6 | # a license agreement with MPG or you get the right to use the computer
7 | # program from someone who is authorized to grant you that right.
8 | # Any use of the computer program without a valid license is prohibited and
9 | # liable to prosecution.
10 | #
11 | # Copyright©2019 Max-Planck-Gesellschaft zur Förderung
12 | # der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute
13 | # for Intelligent Systems. All rights reserved.
14 | #
15 | # Contact: ps-license@tuebingen.mpg.de
16 |
17 | import os
18 | import cv2
19 | import time
20 | import json
21 | import torch
22 | import subprocess
23 | import numpy as np
24 | import os.path as osp
25 | from pytube import YouTube
26 | from collections import OrderedDict
27 |
28 | from lib.utils.smooth_bbox import get_smooth_bbox_params, get_all_bbox_params
29 | from lib.data_utils.img_utils import get_single_image_crop_demo
30 | from lib.utils.geometry import rotation_matrix_to_angle_axis
31 | from lib.smplify.temporal_smplify import TemporalSMPLify
32 |
33 |
34 | def preprocess_video(video, joints2d, bboxes, frames, scale=1.0, crop_size=224):
35 |     """
36 |     Read the video, then normalize and crop it according to the bounding box.
37 |     If bounding box annotations are available, use them to crop the frames.
38 |     If no bounding box is specified but openpose detections are available, use them to get the bounding box.
39 |
40 |     :param video (ndarray): input video
41 |     :param joints2d (ndarray, NxJx3): openpose detections
42 |     :param bboxes (ndarray, Nx5): bbox detections
43 |     :param scale (float): bbox crop scaling factor
44 |     :param crop_size (int): crop width and height
45 |     :return: cropped video, cropped and normalized video, modified bboxes, modified joints2d, trimmed frame ids
46 |     """
47 |
48 |     if joints2d is not None:
49 |         bboxes, time_pt1, time_pt2 = get_all_bbox_params(joints2d, vis_thresh=0.3)
50 |         bboxes[:, 2:] = 150. / bboxes[:, 2:]
51 |         bboxes = np.stack([bboxes[:, 0], bboxes[:, 1], bboxes[:, 2], bboxes[:, 2]]).T
52 |
53 |         video = video[time_pt1:time_pt2]
54 |         joints2d = joints2d[time_pt1:time_pt2]
55 |         frames = frames[time_pt1:time_pt2]
56 |
57 |     shape = video.shape
58 |
59 |     temp_video = np.zeros((shape[0], crop_size, crop_size, shape[-1]))
60 |     norm_video = torch.zeros(shape[0], shape[-1], crop_size, crop_size)
61 |
62 |     for idx in range(video.shape[0]):
63 |
64 |         img = video[idx]
65 |         bbox = bboxes[idx]
66 |
67 |         j2d = joints2d[idx] if joints2d is not None else None
68 |
69 |         norm_img, raw_img, kp_2d = get_single_image_crop_demo(
70 |             img,
71 |             bbox,
72 |             kp_2d=j2d,
73 |             scale=scale,
74 |             crop_size=crop_size)
75 |
76 |         if joints2d is not None:
77 |             joints2d[idx] = kp_2d
78 |
79 |         temp_video[idx] = raw_img
80 |         norm_video[idx] = norm_img
81 |
82 |     temp_video = temp_video.astype(np.uint8)
83 |
84 |     return temp_video, norm_video, bboxes, joints2d, frames
85 |
86 |
87 | def download_youtube_clip(url, download_folder):
88 |     return YouTube(url).streams.first().download(output_path=download_folder)
89 |
90 |
91 | def smplify_runner(
92 |         pred_rotmat,
93 |         pred_betas,
94 |         pred_cam,
95 |         j2d,
96 |         device,
97 |         batch_size,
98 |         lr=1.0,
99 |         opt_steps=1,
100 |         use_lbfgs=True,
101 |         pose2aa=True
102 | ):
103 |     smplify = TemporalSMPLify(
104 |         step_size=lr,
105 |         batch_size=batch_size,
106 |         num_iters=opt_steps,
107 |         focal_length=5000.,
108 |         use_lbfgs=use_lbfgs,
109 |         device=device,
110 |         # max_iter=10,
111 |     )
112 |     # Convert predicted rotation matrices to axis-angle
113 |     if pose2aa:
114 |         pred_pose = rotation_matrix_to_angle_axis(pred_rotmat.detach()).reshape(batch_size, -1)
115 |     else:
116 |         pred_pose = pred_rotmat
117 |
118 |     # Convert the weak-perspective camera (s, tx, ty) to a full translation for SMPLify: t_z = 2 * focal / (crop_size * s)
119 |     pred_cam_t = torch.stack([
120 |         pred_cam[:, 1], pred_cam[:, 2],
121 |         2 * 5000 / (224 * pred_cam[:, 0] + 1e-9)
122 |     ], dim=-1)
123 |
124 |     gt_keypoints_2d_orig = j2d
125 |     # Before running SMPLify, compute the reprojection error of the network prediction
126 |     opt_joint_loss = smplify.get_fitting_loss(
127 |         pred_pose.detach(), pred_betas.detach(),
128 |         pred_cam_t.detach(),
129 |         0.5 * 224 * torch.ones(batch_size, 2, device=device),
130 |         gt_keypoints_2d_orig).mean(dim=-1)
131 |
132 |     best_prediction_id = torch.argmin(opt_joint_loss).item()
133 |     pred_betas = pred_betas[best_prediction_id].unsqueeze(0)
134 |     # pred_betas = pred_betas[best_prediction_id:best_prediction_id+2]  # .unsqueeze(0)
135 |     # top5_best_idxs = torch.topk(opt_joint_loss, 5, largest=False)[1]
136 |     # breakpoint()
137 |
138 |     start = time.time()
139 |     # Run SMPLify optimization initialized from the network prediction
140 |     # new_opt_vertices, new_opt_joints, \
141 |     # new_opt_pose, new_opt_betas, \
142 |     # new_opt_cam_t, \
143 |     output, new_opt_joint_loss = smplify(
144 |         pred_pose.detach(), pred_betas.detach(),
145 |         pred_cam_t.detach(),
146 |         0.5 * 224 * torch.ones(batch_size, 2, device=device),
147 |         gt_keypoints_2d_orig,
148 |     )
149 |     new_opt_joint_loss = new_opt_joint_loss.mean(dim=-1)
150 |     # smplify_time = time.time() - start
151 |     # print(f'Smplify time: {smplify_time}')
152 |     # Update only the examples where the new loss is lower than the current one
153 |     update = (new_opt_joint_loss < opt_joint_loss)
154 |
155 |     new_opt_vertices = output['verts']
156 |     new_opt_cam_t = output['theta'][:, :3]
157 |     new_opt_pose = output['theta'][:, 3:75]
158 |     new_opt_betas = output['theta'][:, 75:]
159 |     new_opt_joints3d = output['kp_3d']
160 |
161 |     return_val = [
162 |         update, new_opt_vertices.cpu(), new_opt_cam_t.cpu(),
163 |         new_opt_pose.cpu(), new_opt_betas.cpu(), new_opt_joints3d.cpu(),
164 |         new_opt_joint_loss, opt_joint_loss,
165 |     ]
166 |
167 |     return return_val
168 |
169 |
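Editor's note: the pred_cam_t construction above inverts the weak-perspective camera used throughout this repo: a scale s on a crop of size 224 with focal length 5000 implies a depth t_z = 2 * f / (crop_size * s). A standalone round-trip sketch (constants as used here, values illustrative):

```python
import torch

focal, crop = 5000., 224.
pred_cam = torch.tensor([[0.9, 0.05, -0.02]])  # (s, tx, ty) weak-perspective params

# depth implied by the weak-perspective scale
tz = 2 * focal / (crop * pred_cam[:, 0] + 1e-9)
cam_t = torch.stack([pred_cam[:, 1], pred_cam[:, 2], tz], dim=-1)
print(cam_t)  # tensor([[ 0.0500, -0.0200, 49.6032]])

# inverse mapping, as done at the end of TemporalSMPLify.__call__
s = 2 * focal / (crop * cam_t[:, 2] + 1e-9)
print(s)      # tensor([0.9000]) -> recovers the original scale
```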
170 | def trim_videos(filename, start_time, end_time, output_filename):
171 |     command = ['ffmpeg',
172 |                '-i', filename,
173 |                '-ss', str(start_time),
174 |                '-t', str(end_time - start_time),
175 |                '-c:v', 'libx264', '-c:a', 'copy',
176 |                '-threads', '1',
177 |                '-loglevel', 'panic',
178 |                output_filename]
179 |     # arguments are passed as a list, so no extra shell quoting is needed
180 |     subprocess.call(command)
181 |
182 |
183 | def video_to_images(vid_file, img_folder=None, return_info=False):
184 |     if img_folder is None:
185 |         img_folder = osp.join('/tmp', osp.basename(vid_file).replace('.', '_'))
186 |
187 |     os.makedirs(img_folder, exist_ok=True)
188 |
189 |     command = ['ffmpeg',
190 |                '-i', vid_file,
191 |                '-f', 'image2',
192 |                '-v', 'error',
193 |                f'{img_folder}/%06d.png']
194 |     print(f'Running \"{" ".join(command)}\"')
195 |     subprocess.call(command)
196 |
197 |     print(f'Images saved to \"{img_folder}\"')
198 |
199 |     img_shape = cv2.imread(osp.join(img_folder, '000001.png')).shape
200 |
201 |     if return_info:
202 |         return img_folder, len(os.listdir(img_folder)), img_shape
203 |     else:
204 |         return img_folder
205 |
206 |
207 | def download_url(url, outdir):
208 |     print(f'Downloading files from {url}')
209 |     cmd = ['wget', '-c', url, '-P', outdir]
210 |     subprocess.call(cmd)
211 |
212 |
213 | def download_ckpt(outdir='data/vibe_data', use_3dpw=False):
214 |     os.makedirs(outdir, exist_ok=True)
215 |
216 |     if use_3dpw:
217 |         ckpt_file = 'data/vibe_data/vibe_model_w_3dpw.pth.tar'
218 |         url = 'https://www.dropbox.com/s/41ozgqorcp095ja/vibe_model_w_3dpw.pth.tar'
219 |         if not os.path.isfile(ckpt_file):
220 |             download_url(url=url, outdir=outdir)
221 |     else:
222 |         ckpt_file = 'data/vibe_data/vibe_model_wo_3dpw.pth.tar'
223 |         url = 'https://www.dropbox.com/s/amj2p8bmf6g56k6/vibe_model_wo_3dpw.pth.tar'
224 |         if not os.path.isfile(ckpt_file):
225 |             download_url(url=url, outdir=outdir)
226 |
227 |     return ckpt_file
228 |
229 |
230 | def images_to_video(img_folder, output_vid_file):
231 |     os.makedirs(img_folder, exist_ok=True)
232 |
233 |     command = [
234 |         'ffmpeg', '-y', '-threads', '16', '-i', f'{img_folder}/%06d.png', '-profile:v', 'baseline',
235 |         '-level', '3.0', '-c:v', 'libx264', '-pix_fmt', 'yuv420p', '-an', '-v', 'error', output_vid_file,
236 |     ]
237 |
238 |     print(f'Running \"{" ".join(command)}\"')
239 |     subprocess.call(command)
240 |
241 |
242 | def convert_crop_cam_to_orig_img(cam, bbox, img_width, img_height):
243 |     '''
244 |     Convert predicted camera from cropped image coordinates
245 |     to original image coordinates
246 |     :param cam (ndarray, shape=(N,3)): weak perspective camera in cropped img coordinates
247 |     :param bbox (ndarray, shape=(N,4)): bbox coordinates (c_x, c_y, h, w); only the first three are used
248 |     :param img_width (int): original image width
249 |     :param img_height (int): original image height
250 |     :return: orig_cam (ndarray, shape=(N,4)): weak perspective camera (sx, sy, tx, ty) in original img coordinates
251 |     '''
252 |     cx, cy, h = bbox[:, 0], bbox[:, 1], bbox[:, 2]
253 |     hw, hh = img_width / 2., img_height / 2.
254 |     sx = cam[:, 0] * (1. / (img_width / h))
255 |     sy = cam[:, 0] * (1. / (img_height / h))
256 |     tx = ((cx - hw) / hw / sx) + cam[:, 1]
257 |     ty = ((cy - hh) / hh / sy) + cam[:, 2]
258 |     orig_cam = np.stack([sx, sy, tx, ty]).T
259 |     return orig_cam
260 |
261 |
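Editor's note: a small numerical check of convert_crop_cam_to_orig_img: for a square crop centered in the image, the converted camera scales down with the bbox size and keeps the subject centered. Values below are illustrative only:

```python
import numpy as np

def convert_crop_cam_to_orig_img(cam, bbox, img_width, img_height):
    # same math as above: rescale the weak-perspective scale by bbox/image size,
    # then shift the translation to the bbox center in normalized image coords
    cx, cy, h = bbox[:, 0], bbox[:, 1], bbox[:, 2]
    hw, hh = img_width / 2., img_height / 2.
    sx = cam[:, 0] * (1. / (img_width / h))
    sy = cam[:, 0] * (1. / (img_height / h))
    tx = ((cx - hw) / hw / sx) + cam[:, 1]
    ty = ((cy - hh) / hh / sy) + cam[:, 2]
    return np.stack([sx, sy, tx, ty]).T

cam = np.array([[1.0, 0.0, 0.0]])            # unit scale, centered in the crop
bbox = np.array([[960., 540., 270., 270.]])  # crop centered in a 1920x1080 frame
print(convert_crop_cam_to_orig_img(cam, bbox, 1920, 1080))
# [[0.140625 0.25     0.       0.      ]] -> scaled down, still centered
```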
262 | def prepare_rendering_results(vibe_results, nframes):
263 |     frame_results = [{} for _ in range(nframes)]
264 |     for person_id, person_data in vibe_results.items():
265 |         for idx, frame_id in enumerate(person_data['frame_ids']):
266 |             frame_results[frame_id][person_id] = {
267 |                 'verts': person_data['verts'][idx],
268 |                 'cam': person_data['orig_cam'][idx],
269 |             }
270 |
271 |     # naive depth ordering based on the scale of the weak perspective camera
272 |     for frame_id, frame_data in enumerate(frame_results):
273 |         # sort based on the y-scale of the cam in original image coords
274 |         sort_idx = np.argsort([v['cam'][1] for k, v in frame_data.items()])
275 |         frame_results[frame_id] = OrderedDict(
276 |             {list(frame_data.keys())[i]: frame_data[list(frame_data.keys())[i]] for i in sort_idx}
277 |         )
278 |
279 |     return frame_results
280 |
--------------------------------------------------------------------------------
/lib/smplify/temporal_smplify.py:
--------------------------------------------------------------------------------
1 | # This script is the extended version of https://github.com/nkolot/SPIN/blob/master/smplify/smplify.py, adapted to deal with
2 | # sequence inputs.
3 |
4 | import os
5 | import torch
6 |
7 | from lib.core.config import VIBE_DATA_DIR
8 | from lib.models.smpl import SMPL, JOINT_IDS, SMPL_MODEL_DIR
9 | from lib.smplify.losses import temporal_camera_fitting_loss, temporal_body_fitting_loss
10 |
11 | # For the GMM prior, we use the GMM implementation of SMPLify-X
12 | # https://github.com/vchoutas/smplify-x/blob/master/smplifyx/prior.py
13 | from .prior import MaxMixturePrior
14 |
15 | def arrange_betas(pose, betas):
16 |     batch_size = pose.shape[0]  # total number of frames; betas holds one shape vector per video
17 |     num_video = betas.shape[0]
18 |
19 |     video_size = batch_size // num_video
20 |     betas_ext = torch.zeros(batch_size, betas.shape[-1], device=betas.device)
21 |     for i in range(num_video):
22 |         betas_ext[i*video_size:(i+1)*video_size] = betas[i]
23 |
24 |     return betas_ext
25 |
26 | class TemporalSMPLify():
27 |     """Implementation of single-stage SMPLify, extended to video sequences."""
28 |
29 |     def __init__(self,
30 |                  step_size=1e-2,
31 |                  batch_size=66,
32 |                  num_iters=100,
33 |                  focal_length=5000,
34 |                  use_lbfgs=True,
35 |                  device=torch.device('cuda'),
36 |                  max_iter=20):
37 |
38 |         # Store options
39 |         self.device = device
40 |         self.focal_length = focal_length
41 |         self.step_size = step_size
42 |         self.max_iter = max_iter
43 |         # Ignore the following joints for the fitting process
44 |         ign_joints = ['OP Neck', 'OP RHip', 'OP LHip', 'Right Hip', 'Left Hip']
45 |         self.ign_joints = [JOINT_IDS[i] for i in ign_joints]
46 |         self.num_iters = num_iters
47 |
48 |         # GMM pose prior
49 |         self.pose_prior = MaxMixturePrior(prior_folder=VIBE_DATA_DIR,
50 |                                           num_gaussians=8,
51 |                                           dtype=torch.float32).to(device)
52 |         self.use_lbfgs = use_lbfgs
53 |         # Load SMPL model
54 |         self.smpl = SMPL(SMPL_MODEL_DIR,
55 |                          batch_size=batch_size,
56 |                          create_transl=False).to(self.device)
57 |
58 |     def __call__(self, init_pose, init_betas, init_cam_t, camera_center, keypoints_2d):
59 |         """Perform body fitting.
60 |         Input:
61 |             init_pose: SMPL pose estimate
62 |             init_betas: SMPL betas estimate
63 |             init_cam_t: Camera translation estimate
64 |             camera_center: Camera center location
65 |             keypoints_2d: Keypoints used for the optimization
66 |         Returns:
67 |             vertices: Vertices of optimized shape
68 |             joints: 3D joints of optimized shape
69 |             pose: SMPL pose parameters of optimized shape
70 |             betas: SMPL beta parameters of optimized shape
71 |             camera_translation: Camera translation
72 |             reprojection_loss: Final joint reprojection loss
73 |         """
74 |
75 |         # Make camera translation a learnable parameter
76 |         camera_translation = init_cam_t.clone()
77 |
78 |         # Get joint confidence
79 |         joints_2d = keypoints_2d[:, :, :2]
80 |         joints_conf = keypoints_2d[:, :, -1]
81 |
82 |         # Split SMPL pose to body pose and global orientation
83 |         body_pose = init_pose[:, 3:].detach().clone()
84 |         global_orient = init_pose[:, :3].detach().clone()
85 |         betas = init_betas.detach().clone()
86 |
87 |         # Step 1: Optimize camera translation and body orientation
88 |         # (body pose and betas stay fixed during this stage)
89 |         body_pose.requires_grad = False
90 |         betas.requires_grad = False
91 |         global_orient.requires_grad = True
92 |         camera_translation.requires_grad = True
93 |
94 |         camera_opt_params = [global_orient, camera_translation]
95 |
96 |         if self.use_lbfgs:
97 |             camera_optimizer = torch.optim.LBFGS(camera_opt_params, max_iter=self.max_iter,
98 |                                                  lr=self.step_size, line_search_fn='strong_wolfe')
99 |             for i in range(self.num_iters):
100 |                 def closure():
101 |                     camera_optimizer.zero_grad()
102 |                     betas_ext = arrange_betas(body_pose, betas)
103 |                     smpl_output = self.smpl(global_orient=global_orient,
104 |                                             body_pose=body_pose,
105 |                                             betas=betas_ext)
106 |                     model_joints = smpl_output.joints
107 |
108 |
109 |                     loss = temporal_camera_fitting_loss(model_joints, camera_translation,
110 |                                                         init_cam_t, camera_center,
111 |                                                         joints_2d, joints_conf, focal_length=self.focal_length)
112 |                     loss.backward()
113 |                     return loss
114 |
115 |                 camera_optimizer.step(closure)
116 |         else:
117 |             camera_optimizer = torch.optim.Adam(camera_opt_params, lr=self.step_size, betas=(0.9, 0.999))
118 |
119 |             for i in range(self.num_iters):
120 |                 betas_ext = arrange_betas(body_pose, betas)
121 |                 smpl_output = self.smpl(global_orient=global_orient,
122 |                                         body_pose=body_pose,
123 |                                         betas=betas_ext)
124 |                 model_joints = smpl_output.joints
125 |                 loss = temporal_camera_fitting_loss(model_joints, camera_translation,
126 |                                                     init_cam_t, camera_center,
127 |                                                     joints_2d, joints_conf, focal_length=self.focal_length)
128 |                 camera_optimizer.zero_grad()
129 |                 loss.backward()
130 |                 camera_optimizer.step()
131 |
132 |         # Fix camera translation after optimizing camera
133 |         camera_translation.requires_grad = False
134 |
135 |         # Step 2: Optimize body joints
136 |         # Optimize the body pose, betas, and global orientation (camera translation stays fixed)
137 |         body_pose.requires_grad = True
138 |         betas.requires_grad = True
139 |         global_orient.requires_grad = True
140 |         camera_translation.requires_grad = False
141 |         body_opt_params = [body_pose, betas, global_orient]
142 |
143 |         # For joints ignored during fitting, set the confidence to 0
144 |         joints_conf[:, self.ign_joints] = 0.
145 |
146 |         if self.use_lbfgs:
147 |             body_optimizer = torch.optim.LBFGS(body_opt_params, max_iter=self.max_iter,
148 |                                                lr=self.step_size, line_search_fn='strong_wolfe')
149 |             for i in range(self.num_iters):
150 |                 def closure():
151 |                     body_optimizer.zero_grad()
152 |                     betas_ext = arrange_betas(body_pose, betas)
153 |                     smpl_output = self.smpl(global_orient=global_orient,
154 |                                             body_pose=body_pose,
155 |                                             betas=betas_ext)
156 |                     model_joints = smpl_output.joints
157 |
158 |                     loss = temporal_body_fitting_loss(body_pose, betas, model_joints, camera_translation, camera_center,
159 |                                                       joints_2d, joints_conf, self.pose_prior,
160 |                                                       focal_length=self.focal_length)
161 |                     loss.backward()
162 |                     return loss
163 |
164 |                 body_optimizer.step(closure)
165 |         else:
166 |             body_optimizer = torch.optim.Adam(body_opt_params, lr=self.step_size, betas=(0.9, 0.999))
167 |
168 |             for i in range(self.num_iters):
169 |                 betas_ext = arrange_betas(body_pose, betas)
170 |                 smpl_output = self.smpl(global_orient=global_orient,
171 |                                         body_pose=body_pose,
172 |                                         betas=betas_ext)
173 |                 model_joints = smpl_output.joints
174 |                 loss = temporal_body_fitting_loss(body_pose, betas, model_joints, camera_translation, camera_center,
175 |                                                   joints_2d, joints_conf, self.pose_prior,
176 |                                                   focal_length=self.focal_length)
177 |                 body_optimizer.zero_grad()
178 |                 loss.backward()
179 |                 body_optimizer.step()
180 |                 # scheduler.step(epoch=i)
181 |
182 |         # Get final loss value
183 |
184 |         with torch.no_grad():
185 |             betas_ext = arrange_betas(body_pose, betas)
186 |             smpl_output = self.smpl(global_orient=global_orient,
187 |                                     body_pose=body_pose,
188 |                                     betas=betas_ext)
189 |             model_joints = smpl_output.joints
190 |             reprojection_loss = temporal_body_fitting_loss(body_pose, betas, model_joints, camera_translation,
191 |                                                            camera_center,
192 |                                                            joints_2d, joints_conf, self.pose_prior,
193 |                                                            focal_length=self.focal_length,
194 |                                                            output='reprojection')
195 |
196 |         vertices = smpl_output.vertices.detach()
197 |         joints = smpl_output.joints.detach()
198 |         pose = torch.cat([global_orient, body_pose], dim=-1).detach()
199 |         betas = betas.detach()
200 |
201 |         # Back to weak perspective camera
202 |         camera_translation = torch.stack([
203 |             2 * 5000. / (224 * camera_translation[:, 2] + 1e-9),
204 |             camera_translation[:, 0], camera_translation[:, 1]
205 |         ], dim=-1)
206 |
207 |         betas = betas.repeat(pose.shape[0], 1)
208 |         output = {
209 |             'theta': torch.cat([camera_translation, pose, betas], dim=1),
210 |             'verts': vertices,
211 |             'kp_3d': joints,
212 |         }
213 |
214 |         return output, reprojection_loss
215 |         # return vertices, joints, pose, betas, camera_translation, reprojection_loss
216 |
217 |     def get_fitting_loss(self, pose, betas, cam_t, camera_center, keypoints_2d):
218 |         """Given body and camera parameters, compute reprojection loss value.
219 |         Input:
220 |             pose: SMPL pose parameters
221 |             betas: SMPL beta parameters
222 |             cam_t: Camera translation
223 |             camera_center: Camera center location
224 |             keypoints_2d: Keypoints used for the optimization
225 |         Returns:
226 |             reprojection_loss: Final joint reprojection loss
227 |         """
228 |
229 |         batch_size = pose.shape[0]
230 |
231 |         # Get joint confidence
232 |         joints_2d = keypoints_2d[:, :, :2]
233 |         joints_conf = keypoints_2d[:, :, -1]
234 |         # For joints ignored during fitting, set the confidence to 0
235 |         joints_conf[:, self.ign_joints] = 0.
236 |
237 |         # Split SMPL pose to body pose and global orientation
238 |         body_pose = pose[:, 3:]
239 |         global_orient = pose[:, :3]
240 |
241 |         with torch.no_grad():
242 |             smpl_output = self.smpl(global_orient=global_orient,
243 |                                     body_pose=body_pose,
244 |                                     betas=betas, return_full_pose=True)
245 |             model_joints = smpl_output.joints
246 |             reprojection_loss = temporal_body_fitting_loss(body_pose, betas, model_joints, cam_t, camera_center,
247 |                                                            joints_2d, joints_conf, self.pose_prior,
248 |                                                            focal_length=self.focal_length,
249 |                                                            output='reprojection')
250 |
251 |         return reprojection_loss
252 |
--------------------------------------------------------------------------------
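Editor's note: arrange_betas (top of temporal_smplify.py) tiles one beta vector per video across all of that video's frames, assuming the total frame count is an exact multiple of the number of videos. A standalone sketch with illustrative shapes:

```python
import torch

def arrange_betas(pose, betas):
    # expand per-video betas (num_video, 10) to per-frame betas (batch_size, 10)
    batch_size = pose.shape[0]
    num_video = betas.shape[0]
    video_size = batch_size // num_video
    betas_ext = torch.zeros(batch_size, betas.shape[-1], device=betas.device)
    for i in range(num_video):
        betas_ext[i * video_size:(i + 1) * video_size] = betas[i]
    return betas_ext

pose = torch.zeros(6, 72)                     # 6 frames of SMPL pose (2 videos x 3 frames)
betas = torch.tensor([[1.] * 10, [2.] * 10])  # one shape vector per video
print(arrange_betas(pose, betas)[:, 0])       # tensor([1., 1., 1., 2., 2., 2.])
```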