├── LICENSE ├── README.md ├── figures ├── gallery │ ├── gen_00.gif │ ├── gen_01.gif │ ├── gen_02.gif │ ├── gen_03.gif │ ├── gen_04.gif │ ├── gen_05.gif │ ├── gen_06.gif │ ├── gen_07.gif │ └── gen_08.gif └── gallery_t2m │ ├── gen_00.gif │ ├── gen_01.gif │ ├── gen_02.gif │ ├── gen_03.gif │ ├── gen_04.gif │ ├── gen_05.gif │ ├── gen_06.gif │ └── gen_07.gif └── text2motion ├── README.md ├── datasets ├── __init__.py ├── dataloader.py ├── dataset.py ├── evaluator.py └── evaluator_models.py ├── install.md ├── models ├── __init__.py ├── gaussian_diffusion.py └── transformer.py ├── options ├── base_options.py ├── evaluate_options.py └── train_options.py ├── requirements.txt ├── tools ├── evaluation.py ├── train.py └── visualization.py ├── trainers ├── __init__.py └── ddpm_trainer.py └── utils ├── __init__.py ├── get_opt.py ├── metrics.py ├── motion_process.py ├── paramUtil.py ├── plot_script.py ├── quaternion.py ├── skeleton.py ├── utils.py └── word_vectorizer.py /LICENSE: -------------------------------------------------------------------------------- 1 | S-Lab License 1.0 2 | 3 | Copyright 2022 S-Lab 4 | 5 | Redistribution and use for non-commercial purpose in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 8 | 9 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 10 | 11 | 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 12 | 13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 14 | 15 | 4. In the event that redistribution and/or use for commercial purpose in source or binary forms, with or without modification is required, please contact the contributor(s) of the work. 16 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | 3 |

MotionDiffuse: Text-Driven Human Motion Generation with Diffusion Model

4 | 5 |
6 | Mingyuan Zhang^1*  7 | Zhongang Cai^1,2*  8 | Liang Pan^1  9 | Fangzhou Hong^1  10 | Xinying Guo^1  11 | Lei Yang^2  12 | Ziwei Liu^1+ 13 |
14 |
15 | ^1 S-Lab, Nanyang Technological University  16 | ^2 SenseTime Research  17 |
18 |
19 | *equal contribution  20 | +corresponding author 21 |
22 | (teaser gallery) play the guitar • walk sadly • walk happily • check time 36 |
37 | 38 | This repository contains the official implementation of _MotionDiffuse: Text-Driven Human Motion Generation with Diffusion Model_. 39 | 40 | --- 41 | 42 |

43 | [Project Page] • 44 | [arXiv] • 45 | [Video] • 46 | [Colab Demo] • 47 | [Hugging Face Demo] 48 |

49 | 50 |
51 | 52 | 53 | ## Updates 54 | 55 | [10/2022] Add a [🤗Hugging Face Demo](https://huggingface.co/spaces/mingyuan/MotionDiffuse) for text-driven motion generation! 56 | 57 | [10/2022] Add a [Colab Demo](https://colab.research.google.com/drive/1Dp6VsZp2ozKuu9ccMmsDjyij_vXfCYb3?usp=sharing) for text-driven motion generation! [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1Dp6VsZp2ozKuu9ccMmsDjyij_vXfCYb3?usp=sharing) 58 | 59 | [10/2022] Code release for text-driven motion generation! 60 | 61 | [8/2022] [Paper](https://arxiv.org/abs/2208.15001) uploaded to arXiv. [![arXiv](https://img.shields.io/badge/arXiv-2208.15001-b31b1b.svg)](https://arxiv.org/abs/2208.15001) 62 | 63 | ## Text-driven Motion Generation 64 | 65 | You may refer to [this file](text2motion/README.md) for detailed introduction. 66 | 67 | ## Citation 68 | 69 | If you find our work useful for your research, please consider citing the paper: 70 | 71 | ``` 72 | @article{zhang2022motiondiffuse, 73 | title={MotionDiffuse: Text-Driven Human Motion Generation with Diffusion Model}, 74 | author={Zhang, Mingyuan and Cai, Zhongang and Pan, Liang and Hong, Fangzhou and Guo, Xinying and Yang, Lei and Liu, Ziwei}, 75 | journal={arXiv preprint arXiv:2208.15001}, 76 | year={2022} 77 | } 78 | ``` 79 | 80 | ## Acknowledgements 81 | 82 | This study is supported by NTU NAP, MOE AcRF Tier 2 (T2EP20221-0033), and under the RIE2020 Industry Alignment Fund – Industry Collaboration Projects (IAF-ICP) Funding Initiative, as well as cash and in-kind contribution from the industry partner(s). 83 | -------------------------------------------------------------------------------- /figures/gallery/gen_00.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mingyuan-zhang/MotionDiffuse/aba848edd22133919ca96b67f4908399c67685b1/figures/gallery/gen_00.gif -------------------------------------------------------------------------------- /figures/gallery/gen_01.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mingyuan-zhang/MotionDiffuse/aba848edd22133919ca96b67f4908399c67685b1/figures/gallery/gen_01.gif -------------------------------------------------------------------------------- /figures/gallery/gen_02.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mingyuan-zhang/MotionDiffuse/aba848edd22133919ca96b67f4908399c67685b1/figures/gallery/gen_02.gif -------------------------------------------------------------------------------- /figures/gallery/gen_03.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mingyuan-zhang/MotionDiffuse/aba848edd22133919ca96b67f4908399c67685b1/figures/gallery/gen_03.gif -------------------------------------------------------------------------------- /figures/gallery/gen_04.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mingyuan-zhang/MotionDiffuse/aba848edd22133919ca96b67f4908399c67685b1/figures/gallery/gen_04.gif -------------------------------------------------------------------------------- /figures/gallery/gen_05.gif: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mingyuan-zhang/MotionDiffuse/aba848edd22133919ca96b67f4908399c67685b1/figures/gallery/gen_05.gif -------------------------------------------------------------------------------- /figures/gallery/gen_06.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mingyuan-zhang/MotionDiffuse/aba848edd22133919ca96b67f4908399c67685b1/figures/gallery/gen_06.gif -------------------------------------------------------------------------------- /figures/gallery/gen_07.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mingyuan-zhang/MotionDiffuse/aba848edd22133919ca96b67f4908399c67685b1/figures/gallery/gen_07.gif -------------------------------------------------------------------------------- /figures/gallery/gen_08.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mingyuan-zhang/MotionDiffuse/aba848edd22133919ca96b67f4908399c67685b1/figures/gallery/gen_08.gif -------------------------------------------------------------------------------- /figures/gallery_t2m/gen_00.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mingyuan-zhang/MotionDiffuse/aba848edd22133919ca96b67f4908399c67685b1/figures/gallery_t2m/gen_00.gif -------------------------------------------------------------------------------- /figures/gallery_t2m/gen_01.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mingyuan-zhang/MotionDiffuse/aba848edd22133919ca96b67f4908399c67685b1/figures/gallery_t2m/gen_01.gif -------------------------------------------------------------------------------- /figures/gallery_t2m/gen_02.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mingyuan-zhang/MotionDiffuse/aba848edd22133919ca96b67f4908399c67685b1/figures/gallery_t2m/gen_02.gif -------------------------------------------------------------------------------- /figures/gallery_t2m/gen_03.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mingyuan-zhang/MotionDiffuse/aba848edd22133919ca96b67f4908399c67685b1/figures/gallery_t2m/gen_03.gif -------------------------------------------------------------------------------- /figures/gallery_t2m/gen_04.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mingyuan-zhang/MotionDiffuse/aba848edd22133919ca96b67f4908399c67685b1/figures/gallery_t2m/gen_04.gif -------------------------------------------------------------------------------- /figures/gallery_t2m/gen_05.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mingyuan-zhang/MotionDiffuse/aba848edd22133919ca96b67f4908399c67685b1/figures/gallery_t2m/gen_05.gif -------------------------------------------------------------------------------- /figures/gallery_t2m/gen_06.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mingyuan-zhang/MotionDiffuse/aba848edd22133919ca96b67f4908399c67685b1/figures/gallery_t2m/gen_06.gif -------------------------------------------------------------------------------- /figures/gallery_t2m/gen_07.gif: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/mingyuan-zhang/MotionDiffuse/aba848edd22133919ca96b67f4908399c67685b1/figures/gallery_t2m/gen_07.gif -------------------------------------------------------------------------------- /text2motion/README.md: -------------------------------------------------------------------------------- 1 | # Text-driven Motion Generation 2 | 3 | 4 | 5 | - [Installation](#installation) 6 | - [Training](#prepare-environment) 7 | - [Acknowledgement](#acknowledgement) 8 | 9 | 10 | 11 | ## Installation 12 | 13 | Please refer to [install.md](install.md) for detailed installation. 14 | 15 | ## Training 16 | 17 | Due to the requirement of a large batchsize, we highly recommend you to use DDP training. A slurm-based script is as below: 18 | 19 | ```shell 20 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 21 | srun -p ${PARTITION} -n8 --gres=gpu:8 -u \ 22 | python -u tools/train.py \ 23 | --name kit_baseline_ddp_8gpu_8layers_1000 \ 24 | --batch_size 128 \ 25 | --times 200 \ 26 | --num_epochs 50 \ 27 | --dataset_name kit \ 28 | --distributed 29 | ``` 30 | 31 | Besides, you can train the model on multi-GPUs with DataParallel: 32 | 33 | ```shell 34 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 35 | python -u tools/train.py \ 36 | --name kit_baseline_dp_2gpu_8layers_1000 \ 37 | --batch_size 128 \ 38 | --times 50 \ 39 | --num_epochs 50 \ 40 | --dataset_name kit \ 41 | --num_layers 8 \ 42 | --diffusion_steps 1000 \ 43 | --data_parallel \ 44 | --gpu_id 0 1 45 | ``` 46 | 47 | Otherwise, you can run the training code on a single GPU like: 48 | 49 | ```shell 50 | PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ 51 | python -u tools/train.py \ 52 | --name kit_baseline_1gpu_8layers_1000 \ 53 | --batch_size 128 \ 54 | --times 25 \ 55 | --num_epochs 50 \ 56 | --dataset_name kit 57 | ``` 58 | 59 | Here, `times` means the duplication times of the original dataset. To retain the number of iterations, you can set `times` to 25 for 1 GPU, 50 for 2 GPUs, 100 for 4 GPUs, and 200 for 8 GPUs. 60 | 61 | ## Evaluation 62 | 63 | ```shell 64 | # GPU_ID indicates which gpu you want to use 65 | python -u tools/evaluation.py checkpoints/kit/kit_motiondiffuse/opt.txt GPU_ID 66 | # Or you can omit this option and use cpu for evaluation 67 | python -u tools/evaluation.py checkpoints/kit/kit_motiondiffuse/opt.txt 68 | ``` 69 | 70 | ## Visualization 71 | 72 | You can visualize human motion with the given language description and the expected motion length. We also provide a [Colab Demo](https://colab.research.google.com/drive/1Dp6VsZp2ozKuu9ccMmsDjyij_vXfCYb3?usp=sharing) and a [Hugging Face Demo](https://huggingface.co/spaces/mingyuan/MotionDiffuse) for your convenience. 73 | 74 | ```shell 75 | # Currently we only support visualization of models trained on the HumanML3D dataset. 76 | # Motion length can not be larger than 196, which is the maximum length during training 77 | # You can omit `gpu_id` to run visualization on your CPU 78 | # Optionally, you can store the xyz coordinates of each joint to `npy_path`. The shape of motion data is (T, 22, 3), where T denotes the motion length, 22 is the number of joints. 79 | 80 | python -u tools/visualization.py \ 81 | --opt_path checkpoints/t2m/t2m_motiondiffuse/opt.txt \ 82 | --text "a person is jumping" \ 83 | --motion_length 60 \ 84 | --result_path "test_sample.gif" \ 85 | --npy_path "test_sample.npy" \ 86 | --gpu_id 0 87 | ``` 88 | 89 | Here are some visualization examples. 
The motion lengths are shown in the titles of the animations. 90 | (example animations omitted) 104 |
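If you stored the joint positions with `--npy_path` as in the command above, you can load and inspect the saved array with a minimal sketch like the following (the `test_sample.npy` name comes from that command; only standard NumPy calls are used):

```python
import numpy as np

# Load the xyz joint coordinates written by tools/visualization.py via --npy_path.
motion = np.load("test_sample.npy")

# Expected shape: (T, 22, 3), where T is the number of frames (at most 196 for
# HumanML3D models), 22 is the number of joints, and 3 the xyz coordinates.
print(motion.shape)

# Example: the trajectory of joint 0 (the root/pelvis in the HumanML3D joint
# ordering) as a (T, 3) array of positions over time.
root_trajectory = motion[:, 0, :]
print(root_trajectory[:5])
```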
105 | 106 | **Note:** You may install `matplotlib==3.3.1` to support visualization here. 107 | 108 | ## Acknowledgement 109 | 110 | This code is developed on top of [Generating Diverse and Natural 3D Human Motions from Text](https://github.com/EricGuo5513/text-to-motion) -------------------------------------------------------------------------------- /text2motion/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .dataset import Text2MotionDataset 2 | from .evaluator import ( 3 | EvaluationDataset, 4 | get_dataset_motion_loader, 5 | get_motion_loader, 6 | EvaluatorModelWrapper) 7 | from .dataloader import build_dataloader 8 | 9 | __all__ = [ 10 | 'Text2MotionDataset', 'EvaluationDataset', 'build_dataloader', 11 | 'get_dataset_motion_loader', 'get_motion_loader'] -------------------------------------------------------------------------------- /text2motion/datasets/dataloader.py: -------------------------------------------------------------------------------- 1 | import platform 2 | import random 3 | from functools import partial 4 | from typing import Optional, Union 5 | 6 | import numpy as np 7 | from mmcv.runner import get_dist_info 8 | from mmcv.utils import Registry, build_from_cfg 9 | from torch.utils.data import DataLoader 10 | from torch.utils.data.dataset import Dataset 11 | 12 | import torch 13 | from torch.utils.data import DistributedSampler as _DistributedSampler 14 | 15 | 16 | class DistributedSampler(_DistributedSampler): 17 | 18 | def __init__(self, 19 | dataset, 20 | num_replicas=None, 21 | rank=None, 22 | shuffle=True, 23 | round_up=True): 24 | super().__init__(dataset, num_replicas=num_replicas, rank=rank) 25 | self.shuffle = shuffle 26 | self.round_up = round_up 27 | if self.round_up: 28 | self.total_size = self.num_samples * self.num_replicas 29 | else: 30 | self.total_size = len(self.dataset) 31 | 32 | def __iter__(self): 33 | # deterministically shuffle based on epoch 34 | if self.shuffle: 35 | g = torch.Generator() 36 | g.manual_seed(self.epoch) 37 | indices = torch.randperm(len(self.dataset), generator=g).tolist() 38 | else: 39 | indices = torch.arange(len(self.dataset)).tolist() 40 | 41 | # add extra samples to make it evenly divisible 42 | if self.round_up: 43 | indices = ( 44 | indices * 45 | int(self.total_size / len(indices) + 1))[:self.total_size] 46 | assert len(indices) == self.total_size 47 | 48 | # subsample 49 | indices = indices[self.rank:self.total_size:self.num_replicas] 50 | if self.round_up: 51 | assert len(indices) == self.num_samples 52 | 53 | return iter(indices) 54 | 55 | 56 | def build_dataloader(dataset: Dataset, 57 | samples_per_gpu: int, 58 | workers_per_gpu: int, 59 | num_gpus: Optional[int] = 1, 60 | dist: Optional[bool] = True, 61 | shuffle: Optional[bool] = True, 62 | round_up: Optional[bool] = True, 63 | seed: Optional[Union[int, None]] = None, 64 | persistent_workers: Optional[bool] = True, 65 | **kwargs): 66 | """Build PyTorch DataLoader. 67 | 68 | In distributed training, each GPU/process has a dataloader. 69 | In non-distributed training, there is only one dataloader for all GPUs. 70 | 71 | Args: 72 | dataset (:obj:`Dataset`): A PyTorch dataset. 73 | samples_per_gpu (int): Number of training samples on each GPU, i.e., 74 | batch size of each GPU. 75 | workers_per_gpu (int): How many subprocesses to use for data loading 76 | for each GPU. 77 | num_gpus (int, optional): Number of GPUs. Only used in non-distributed 78 | training. 
79 | dist (bool, optional): Distributed training/test or not. Default: True. 80 | shuffle (bool, optional): Whether to shuffle the data at every epoch. 81 | Default: True. 82 | round_up (bool, optional): Whether to round up the length of dataset by 83 | adding extra samples to make it evenly divisible. Default: True. 84 | persistent_workers (bool): If True, the data loader will not shutdown 85 | the worker processes after a dataset has been consumed once. 86 | This allows to maintain the workers Dataset instances alive. 87 | The argument also has effect in PyTorch>=1.7.0. 88 | Default: True 89 | kwargs: any keyword argument to be used to initialize DataLoader 90 | 91 | Returns: 92 | DataLoader: A PyTorch dataloader. 93 | """ 94 | rank, world_size = get_dist_info() 95 | if dist: 96 | sampler = DistributedSampler( 97 | dataset, world_size, rank, shuffle=shuffle, round_up=round_up) 98 | shuffle = False 99 | batch_size = samples_per_gpu 100 | num_workers = workers_per_gpu 101 | else: 102 | sampler = None 103 | batch_size = num_gpus * samples_per_gpu 104 | num_workers = num_gpus * workers_per_gpu 105 | 106 | init_fn = partial( 107 | worker_init_fn, num_workers=num_workers, rank=rank, 108 | seed=seed) if seed is not None else None 109 | 110 | data_loader = DataLoader( 111 | dataset, 112 | batch_size=batch_size, 113 | sampler=sampler, 114 | num_workers=num_workers, 115 | pin_memory=False, 116 | shuffle=shuffle, 117 | worker_init_fn=init_fn, 118 | persistent_workers=persistent_workers, 119 | **kwargs) 120 | 121 | return data_loader 122 | 123 | 124 | def worker_init_fn(worker_id: int, num_workers: int, rank: int, seed: int): 125 | """Init random seed for each worker.""" 126 | # The seed of each worker equals to 127 | # num_worker * rank + worker_id + user_seed 128 | worker_seed = num_workers * rank + worker_id + seed 129 | np.random.seed(worker_seed) 130 | random.seed(worker_seed) 131 | -------------------------------------------------------------------------------- /text2motion/datasets/dataset.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils import data 3 | import numpy as np 4 | import os 5 | from os.path import join as pjoin 6 | import random 7 | import codecs as cs 8 | from tqdm import tqdm 9 | 10 | 11 | class Text2MotionDataset(data.Dataset): 12 | """Dataset for Text2Motion generation task. 
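Args:
    opt: options object carrying dataset paths and hyper-parameters
        (dataset_name, motion_dir, text_dir, joints_num, max_motion_length,
        max_text_len, feat_bias, meta_dir, is_train).
    mean, std: per-dimension statistics used for Z-normalization of the motion
        features; when opt.is_train, std is rescaled by opt.feat_bias for the
        root and foot-contact channels and both arrays are saved to opt.meta_dir.
    split_file: text file listing one motion id per line.
    times: duplication factor for the dataset; __len__() returns
        real_len() * times.
    w_vectorizer: word vectorizer, used only in eval_mode to build word
        embeddings and POS one-hot vectors.
    eval_mode: if True, __getitem__ additionally returns word embeddings,
        POS one-hots and the sentence length.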
13 | 14 | """ 15 | def __init__(self, opt, mean, std, split_file, times=1, w_vectorizer=None, eval_mode=False): 16 | self.opt = opt 17 | self.max_length = 20 18 | self.times = times 19 | self.w_vectorizer = w_vectorizer 20 | self.eval_mode = eval_mode 21 | min_motion_len = 40 if self.opt.dataset_name =='t2m' else 24 22 | 23 | joints_num = opt.joints_num 24 | 25 | data_dict = {} 26 | id_list = [] 27 | with cs.open(split_file, 'r') as f: 28 | for line in f.readlines(): 29 | id_list.append(line.strip()) 30 | 31 | new_name_list = [] 32 | length_list = [] 33 | for name in tqdm(id_list): 34 | try: 35 | motion = np.load(pjoin(opt.motion_dir, name + '.npy')) 36 | if (len(motion)) < min_motion_len or (len(motion) >= 200): 37 | continue 38 | text_data = [] 39 | flag = False 40 | with cs.open(pjoin(opt.text_dir, name + '.txt')) as f: 41 | for line in f.readlines(): 42 | text_dict = {} 43 | line_split = line.strip().split('#') 44 | caption = line_split[0] 45 | tokens = line_split[1].split(' ') 46 | f_tag = float(line_split[2]) 47 | to_tag = float(line_split[3]) 48 | f_tag = 0.0 if np.isnan(f_tag) else f_tag 49 | to_tag = 0.0 if np.isnan(to_tag) else to_tag 50 | 51 | text_dict['caption'] = caption 52 | text_dict['tokens'] = tokens 53 | if f_tag == 0.0 and to_tag == 0.0: 54 | flag = True 55 | text_data.append(text_dict) 56 | else: 57 | n_motion = motion[int(f_tag*20) : int(to_tag*20)] 58 | if (len(n_motion)) < min_motion_len or (len(n_motion) >= 200): 59 | continue 60 | new_name = random.choice('ABCDEFGHIJKLMNOPQRSTUVW') + '_' + name 61 | while new_name in data_dict: 62 | new_name = random.choice('ABCDEFGHIJKLMNOPQRSTUVW') + '_' + name 63 | data_dict[new_name] = {'motion': n_motion, 64 | 'length': len(n_motion), 65 | 'text':[text_dict]} 66 | new_name_list.append(new_name) 67 | length_list.append(len(n_motion)) 68 | 69 | if flag: 70 | data_dict[name] = {'motion': motion, 71 | 'length': len(motion), 72 | 'text':text_data} 73 | new_name_list.append(name) 74 | length_list.append(len(motion)) 75 | except: 76 | # Some motion may not exist in KIT dataset 77 | pass 78 | 79 | 80 | name_list, length_list = zip(*sorted(zip(new_name_list, length_list), key=lambda x: x[1])) 81 | 82 | if opt.is_train: 83 | # root_rot_velocity (B, seq_len, 1) 84 | std[0:1] = std[0:1] / opt.feat_bias 85 | # root_linear_velocity (B, seq_len, 2) 86 | std[1:3] = std[1:3] / opt.feat_bias 87 | # root_y (B, seq_len, 1) 88 | std[3:4] = std[3:4] / opt.feat_bias 89 | # ric_data (B, seq_len, (joint_num - 1)*3) 90 | std[4: 4 + (joints_num - 1) * 3] = std[4: 4 + (joints_num - 1) * 3] / 1.0 91 | # rot_data (B, seq_len, (joint_num - 1)*6) 92 | std[4 + (joints_num - 1) * 3: 4 + (joints_num - 1) * 9] = std[4 + (joints_num - 1) * 3: 4 + ( 93 | joints_num - 1) * 9] / 1.0 94 | # local_velocity (B, seq_len, joint_num*3) 95 | std[4 + (joints_num - 1) * 9: 4 + (joints_num - 1) * 9 + joints_num * 3] = std[ 96 | 4 + (joints_num - 1) * 9: 4 + ( 97 | joints_num - 1) * 9 + joints_num * 3] / 1.0 98 | # foot contact (B, seq_len, 4) 99 | std[4 + (joints_num - 1) * 9 + joints_num * 3:] = std[ 100 | 4 + (joints_num - 1) * 9 + joints_num * 3:] / opt.feat_bias 101 | 102 | assert 4 + (joints_num - 1) * 9 + joints_num * 3 + 4 == mean.shape[-1] 103 | np.save(pjoin(opt.meta_dir, 'mean.npy'), mean) 104 | np.save(pjoin(opt.meta_dir, 'std.npy'), std) 105 | 106 | self.mean = mean 107 | self.std = std 108 | self.length_arr = np.array(length_list) 109 | self.data_dict = data_dict 110 | self.name_list = name_list 111 | 112 | def inv_transform(self, data): 113 | return data * 
self.std + self.mean 114 | 115 | def real_len(self): 116 | return len(self.data_dict) 117 | 118 | def __len__(self): 119 | return self.real_len() * self.times 120 | 121 | def __getitem__(self, item): 122 | idx = item % self.real_len() 123 | data = self.data_dict[self.name_list[idx]] 124 | motion, m_length, text_list = data['motion'], data['length'], data['text'] 125 | # Randomly select a caption 126 | text_data = random.choice(text_list) 127 | caption = text_data['caption'] 128 | 129 | max_motion_length = self.opt.max_motion_length 130 | if m_length >= self.opt.max_motion_length: 131 | idx = random.randint(0, len(motion) - max_motion_length) 132 | motion = motion[idx: idx + max_motion_length] 133 | else: 134 | padding_len = max_motion_length - m_length 135 | D = motion.shape[1] 136 | padding_zeros = np.zeros((padding_len, D)) 137 | motion = np.concatenate((motion, padding_zeros), axis=0) 138 | 139 | assert len(motion) == max_motion_length 140 | "Z Normalization" 141 | motion = (motion - self.mean) / self.std 142 | 143 | if self.eval_mode: 144 | tokens = text_data['tokens'] 145 | if len(tokens) < self.opt.max_text_len: 146 | # pad with "unk" 147 | tokens = ['sos/OTHER'] + tokens + ['eos/OTHER'] 148 | sent_len = len(tokens) 149 | tokens = tokens + ['unk/OTHER'] * (self.opt.max_text_len + 2 - sent_len) 150 | else: 151 | # crop 152 | tokens = tokens[:self.opt.max_text_len] 153 | tokens = ['sos/OTHER'] + tokens + ['eos/OTHER'] 154 | sent_len = len(tokens) 155 | pos_one_hots = [] 156 | word_embeddings = [] 157 | for token in tokens: 158 | word_emb, pos_oh = self.w_vectorizer[token] 159 | pos_one_hots.append(pos_oh[None, :]) 160 | word_embeddings.append(word_emb[None, :]) 161 | pos_one_hots = np.concatenate(pos_one_hots, axis=0) 162 | word_embeddings = np.concatenate(word_embeddings, axis=0) 163 | return word_embeddings, pos_one_hots, caption, sent_len, motion, m_length 164 | return caption, motion, m_length 165 | -------------------------------------------------------------------------------- /text2motion/datasets/evaluator.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from utils.word_vectorizer import WordVectorizer, POS_enumerator 3 | from utils.get_opt import get_opt 4 | from models import MotionTransformer 5 | from torch.utils.data import Dataset, DataLoader 6 | from os.path import join as pjoin 7 | from tqdm import tqdm 8 | import numpy as np 9 | from .evaluator_models import * 10 | import os 11 | import codecs as cs 12 | import random 13 | from torch.utils.data._utils.collate import default_collate 14 | 15 | 16 | class EvaluationDataset(Dataset): 17 | 18 | def __init__(self, opt, trainer, dataset, w_vectorizer, mm_num_samples, mm_num_repeats): 19 | assert mm_num_samples < len(dataset) 20 | print(opt.model_dir) 21 | 22 | dataloader = DataLoader(dataset, batch_size=1, num_workers=1, shuffle=True) 23 | epoch, it = trainer.load(pjoin(opt.model_dir, opt.which_epoch + '.tar')) 24 | 25 | generated_motion = [] 26 | min_mov_length = 10 if opt.dataset_name == 't2m' else 6 27 | 28 | trainer.eval_mode() 29 | trainer.to(opt.device) 30 | 31 | # Pre-process all target captions 32 | mm_generated_motions = [] 33 | mm_idxs = np.random.choice(len(dataset), mm_num_samples, replace=False) 34 | mm_idxs = np.sort(mm_idxs) 35 | all_caption = [] 36 | all_m_lens = [] 37 | all_data = [] 38 | with torch.no_grad(): 39 | for i, data in tqdm(enumerate(dataloader)): 40 | word_emb, pos_ohot, caption, cap_lens, motions, m_lens, tokens = data 41 | 
all_data.append(data) 42 | tokens = tokens[0].split('_') 43 | mm_num_now = len(mm_generated_motions) 44 | is_mm = True if ((mm_num_now < mm_num_samples) and (i == mm_idxs[mm_num_now])) else False 45 | repeat_times = mm_num_repeats if is_mm else 1 46 | m_lens = max(m_lens // opt.unit_length * opt.unit_length, min_mov_length * opt.unit_length) 47 | m_lens = min(m_lens, opt.max_motion_length) 48 | if isinstance(m_lens, int): 49 | m_lens = torch.LongTensor([m_lens]).to(opt.device) 50 | else: 51 | m_lens = m_lens.to(opt.device) 52 | for t in range(repeat_times): 53 | all_m_lens.append(m_lens) 54 | all_caption.extend(caption) 55 | if is_mm: 56 | mm_generated_motions.append(0) 57 | all_m_lens = torch.stack(all_m_lens) 58 | 59 | # Generate all sequences 60 | with torch.no_grad(): 61 | all_pred_motions = trainer.generate(all_caption, all_m_lens, opt.dim_pose) 62 | 63 | cur_idx = 0 64 | mm_generated_motions = [] 65 | with torch.no_grad(): 66 | for i, data_dummy in tqdm(enumerate(dataloader)): 67 | data = all_data[i] 68 | word_emb, pos_ohot, caption, cap_lens, motions, m_lens, tokens = data 69 | tokens = tokens[0].split('_') 70 | mm_num_now = len(mm_generated_motions) 71 | is_mm = True if ((mm_num_now < mm_num_samples) and (i == mm_idxs[mm_num_now])) else False 72 | repeat_times = mm_num_repeats if is_mm else 1 73 | mm_motions = [] 74 | m_lens = max(m_lens // opt.unit_length * opt.unit_length, min_mov_length * opt.unit_length) 75 | m_lens = min(m_lens, opt.max_motion_length) 76 | if isinstance(m_lens, int): 77 | m_lens = torch.LongTensor([m_lens]).to(opt.device) 78 | else: 79 | m_lens = m_lens.to(opt.device) 80 | for t in range(repeat_times): 81 | m_len = m_lens[0].item() 82 | pred_motions = all_pred_motions[cur_idx][:m_lens[0].item()] 83 | assert pred_motions.shape[0] == m_lens[0].item() 84 | cur_idx += 1 85 | if t == 0: 86 | sub_dict = {'motion': pred_motions.cpu().numpy(), 87 | 'length': pred_motions.shape[0], 88 | 'caption': caption[0], 89 | 'cap_len': cap_lens[0].item(), 90 | 'tokens': tokens} 91 | generated_motion.append(sub_dict) 92 | 93 | if is_mm: 94 | mm_motions.append({ 95 | 'motion': pred_motions.cpu().numpy(), 96 | 'length': m_lens[0].item() 97 | }) 98 | if is_mm: 99 | mm_generated_motions.append({'caption': caption[0], 100 | 'tokens': tokens, 101 | 'cap_len': cap_lens[0].item(), 102 | 'mm_motions': mm_motions}) 103 | self.generated_motion = generated_motion 104 | self.mm_generated_motion = mm_generated_motions 105 | self.opt = opt 106 | self.w_vectorizer = w_vectorizer 107 | 108 | 109 | def __len__(self): 110 | return len(self.generated_motion) 111 | 112 | 113 | def __getitem__(self, item): 114 | data = self.generated_motion[item] 115 | motion, m_length, caption, tokens = data['motion'], data['length'], data['caption'], data['tokens'] 116 | sent_len = data['cap_len'] 117 | pos_one_hots = [] 118 | word_embeddings = [] 119 | for token in tokens: 120 | word_emb, pos_oh = self.w_vectorizer[token] 121 | pos_one_hots.append(pos_oh[None, :]) 122 | word_embeddings.append(word_emb[None, :]) 123 | pos_one_hots = np.concatenate(pos_one_hots, axis=0) 124 | word_embeddings = np.concatenate(word_embeddings, axis=0) 125 | 126 | if m_length < self.opt.max_motion_length: 127 | motion = np.concatenate([motion, 128 | np.zeros((self.opt.max_motion_length - m_length, motion.shape[1])) 129 | ], axis=0) 130 | return word_embeddings, pos_one_hots, caption, sent_len, motion, m_length, '_'.join(tokens) 131 | 132 | 133 | def collate_fn(batch): 134 | batch.sort(key=lambda x: x[3], reverse=True) 135 | return 
default_collate(batch) 136 | 137 | 138 | '''For use of training text motion matching model, and evaluations''' 139 | class Text2MotionDatasetV2(Dataset): 140 | def __init__(self, opt, mean, std, split_file, w_vectorizer): 141 | self.opt = opt 142 | self.w_vectorizer = w_vectorizer 143 | self.max_length = 20 144 | self.pointer = 0 145 | self.max_motion_length = opt.max_motion_length 146 | min_motion_len = 40 if self.opt.dataset_name =='t2m' else 24 147 | 148 | data_dict = {} 149 | id_list = [] 150 | with cs.open(split_file, 'r') as f: 151 | for line in f.readlines(): 152 | id_list.append(line.strip()) 153 | 154 | new_name_list = [] 155 | length_list = [] 156 | for name in tqdm(id_list): 157 | try: 158 | motion = np.load(pjoin(opt.motion_dir, name + '.npy')) 159 | if (len(motion)) < min_motion_len or (len(motion) >= 200): 160 | continue 161 | text_data = [] 162 | flag = False 163 | with cs.open(pjoin(opt.text_dir, name + '.txt')) as f: 164 | for line in f.readlines(): 165 | text_dict = {} 166 | line_split = line.strip().split('#') 167 | caption = line_split[0] 168 | tokens = line_split[1].split(' ') 169 | f_tag = float(line_split[2]) 170 | to_tag = float(line_split[3]) 171 | f_tag = 0.0 if np.isnan(f_tag) else f_tag 172 | to_tag = 0.0 if np.isnan(to_tag) else to_tag 173 | 174 | text_dict['caption'] = caption 175 | text_dict['tokens'] = tokens 176 | if f_tag == 0.0 and to_tag == 0.0: 177 | flag = True 178 | text_data.append(text_dict) 179 | else: 180 | try: 181 | n_motion = motion[int(f_tag*20) : int(to_tag*20)] 182 | if (len(n_motion)) < min_motion_len or (len(n_motion) >= 200): 183 | continue 184 | new_name = random.choice('ABCDEFGHIJKLMNOPQRSTUVW') + '_' + name 185 | while new_name in data_dict: 186 | new_name = random.choice('ABCDEFGHIJKLMNOPQRSTUVW') + '_' + name 187 | data_dict[new_name] = {'motion': n_motion, 188 | 'length': len(n_motion), 189 | 'text':[text_dict]} 190 | new_name_list.append(new_name) 191 | length_list.append(len(n_motion)) 192 | except: 193 | print(line_split) 194 | print(line_split[2], line_split[3], f_tag, to_tag, name) 195 | # break 196 | 197 | if flag: 198 | data_dict[name] = {'motion': motion, 199 | 'length': len(motion), 200 | 'text': text_data} 201 | new_name_list.append(name) 202 | length_list.append(len(motion)) 203 | except: 204 | pass 205 | 206 | name_list, length_list = zip(*sorted(zip(new_name_list, length_list), key=lambda x: x[1])) 207 | 208 | self.mean = mean 209 | self.std = std 210 | self.length_arr = np.array(length_list) 211 | self.data_dict = data_dict 212 | self.name_list = name_list 213 | self.reset_max_len(self.max_length) 214 | 215 | def reset_max_len(self, length): 216 | assert length <= self.max_motion_length 217 | self.pointer = np.searchsorted(self.length_arr, length) 218 | print("Pointer Pointing at %d"%self.pointer) 219 | self.max_length = length 220 | 221 | def inv_transform(self, data): 222 | return data * self.std + self.mean 223 | 224 | def __len__(self): 225 | return len(self.data_dict) - self.pointer 226 | 227 | def __getitem__(self, item): 228 | idx = self.pointer + item 229 | data = self.data_dict[self.name_list[idx]] 230 | motion, m_length, text_list = data['motion'], data['length'], data['text'] 231 | # Randomly select a caption 232 | text_data = random.choice(text_list) 233 | caption, tokens = text_data['caption'], text_data['tokens'] 234 | 235 | if len(tokens) < self.opt.max_text_len: 236 | # pad with "unk" 237 | tokens = ['sos/OTHER'] + tokens + ['eos/OTHER'] 238 | sent_len = len(tokens) 239 | tokens = tokens + ['unk/OTHER'] 
* (self.opt.max_text_len + 2 - sent_len) 240 | else: 241 | # crop 242 | tokens = tokens[:self.opt.max_text_len] 243 | tokens = ['sos/OTHER'] + tokens + ['eos/OTHER'] 244 | sent_len = len(tokens) 245 | pos_one_hots = [] 246 | word_embeddings = [] 247 | for token in tokens: 248 | word_emb, pos_oh = self.w_vectorizer[token] 249 | pos_one_hots.append(pos_oh[None, :]) 250 | word_embeddings.append(word_emb[None, :]) 251 | pos_one_hots = np.concatenate(pos_one_hots, axis=0) 252 | word_embeddings = np.concatenate(word_embeddings, axis=0) 253 | 254 | # Crop the motions in to times of 4, and introduce small variations 255 | if self.opt.unit_length < 10: 256 | coin2 = np.random.choice(['single', 'single', 'double']) 257 | else: 258 | coin2 = 'single' 259 | 260 | if coin2 == 'double': 261 | m_length = (m_length // self.opt.unit_length - 1) * self.opt.unit_length 262 | elif coin2 == 'single': 263 | m_length = (m_length // self.opt.unit_length) * self.opt.unit_length 264 | idx = random.randint(0, len(motion) - m_length) 265 | motion = motion[idx:idx+m_length] 266 | 267 | "Z Normalization" 268 | motion = (motion - self.mean) / self.std 269 | 270 | if m_length < self.max_motion_length: 271 | motion = np.concatenate([motion, 272 | np.zeros((self.max_motion_length - m_length, motion.shape[1])) 273 | ], axis=0) 274 | return word_embeddings, pos_one_hots, caption, sent_len, motion, m_length, '_'.join(tokens) 275 | 276 | 277 | def get_dataset_motion_loader(opt_path, batch_size, device): 278 | opt = get_opt(opt_path, device) 279 | 280 | # Configurations of T2M dataset and KIT dataset is almost the same 281 | if opt.dataset_name == 't2m' or opt.dataset_name == 'kit': 282 | print('Loading dataset %s ...' % opt.dataset_name) 283 | 284 | mean = np.load(pjoin(opt.meta_dir, 'mean.npy')) 285 | std = np.load(pjoin(opt.meta_dir, 'std.npy')) 286 | 287 | w_vectorizer = WordVectorizer('./data/glove', 'our_vab') 288 | split_file = pjoin(opt.data_root, 'test.txt') 289 | dataset = Text2MotionDatasetV2(opt, mean, std, split_file, w_vectorizer) 290 | dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=4, drop_last=True, 291 | collate_fn=collate_fn, shuffle=True) 292 | else: 293 | raise KeyError('Dataset not Recognized !!') 294 | 295 | print('Ground Truth Dataset Loading Completed!!!') 296 | return dataloader, dataset 297 | 298 | 299 | class MMGeneratedDataset(Dataset): 300 | def __init__(self, opt, motion_dataset, w_vectorizer): 301 | self.opt = opt 302 | self.dataset = motion_dataset.mm_generated_motion 303 | self.w_vectorizer = w_vectorizer 304 | 305 | def __len__(self): 306 | return len(self.dataset) 307 | 308 | def __getitem__(self, item): 309 | data = self.dataset[item] 310 | mm_motions = data['mm_motions'] 311 | m_lens = [] 312 | motions = [] 313 | for mm_motion in mm_motions: 314 | m_lens.append(mm_motion['length']) 315 | motion = mm_motion['motion'] 316 | if len(motion) < self.opt.max_motion_length: 317 | motion = np.concatenate([motion, 318 | np.zeros((self.opt.max_motion_length - len(motion), motion.shape[1])) 319 | ], axis=0) 320 | motion = motion[None, :] 321 | motions.append(motion) 322 | m_lens = np.array(m_lens, dtype=np.int) 323 | motions = np.concatenate(motions, axis=0) 324 | sort_indx = np.argsort(m_lens)[::-1].copy() 325 | # print(m_lens) 326 | # print(sort_indx) 327 | # print(m_lens[sort_indx]) 328 | m_lens = m_lens[sort_indx] 329 | motions = motions[sort_indx] 330 | return motions, m_lens 331 | 332 | 333 | 334 | def get_motion_loader(opt, batch_size, trainer, ground_truth_dataset, 
mm_num_samples, mm_num_repeats): 335 | 336 | # Currently the configurations of two datasets are almost the same 337 | if opt.dataset_name == 't2m' or opt.dataset_name == 'kit': 338 | w_vectorizer = WordVectorizer('./data/glove', 'our_vab') 339 | else: 340 | raise KeyError('Dataset not recognized!!') 341 | print('Generating %s ...' % opt.name) 342 | 343 | dataset = EvaluationDataset(opt, trainer, ground_truth_dataset, w_vectorizer, mm_num_samples, mm_num_repeats) 344 | mm_dataset = MMGeneratedDataset(opt, dataset, w_vectorizer) 345 | 346 | motion_loader = DataLoader(dataset, batch_size=batch_size, collate_fn=collate_fn, drop_last=True, num_workers=4) 347 | mm_motion_loader = DataLoader(mm_dataset, batch_size=1, num_workers=1) 348 | 349 | print('Generated Dataset Loading Completed!!!') 350 | 351 | return motion_loader, mm_motion_loader 352 | 353 | 354 | def build_models(opt): 355 | movement_enc = MovementConvEncoder(opt.dim_pose-4, opt.dim_movement_enc_hidden, opt.dim_movement_latent) 356 | text_enc = TextEncoderBiGRUCo(word_size=opt.dim_word, 357 | pos_size=opt.dim_pos_ohot, 358 | hidden_size=opt.dim_text_hidden, 359 | output_size=opt.dim_coemb_hidden, 360 | device=opt.device) 361 | 362 | motion_enc = MotionEncoderBiGRUCo(input_size=opt.dim_movement_latent, 363 | hidden_size=opt.dim_motion_hidden, 364 | output_size=opt.dim_coemb_hidden, 365 | device=opt.device) 366 | 367 | checkpoint = torch.load(pjoin('data/pretrained_models', opt.dataset_name, 'text_mot_match', 'model', 'finest.tar'), 368 | map_location=opt.device) 369 | movement_enc.load_state_dict(checkpoint['movement_encoder']) 370 | text_enc.load_state_dict(checkpoint['text_encoder']) 371 | motion_enc.load_state_dict(checkpoint['motion_encoder']) 372 | print('Loading Evaluation Model Wrapper (Epoch %d) Completed!!' 
% (checkpoint['epoch'])) 373 | return text_enc, motion_enc, movement_enc 374 | 375 | 376 | class EvaluatorModelWrapper(object): 377 | 378 | def __init__(self, opt): 379 | 380 | if opt.dataset_name == 't2m': 381 | opt.dim_pose = 263 382 | elif opt.dataset_name == 'kit': 383 | opt.dim_pose = 251 384 | else: 385 | raise KeyError('Dataset not Recognized!!!') 386 | 387 | opt.dim_word = 300 388 | opt.max_motion_length = 196 389 | opt.dim_pos_ohot = len(POS_enumerator) 390 | opt.dim_motion_hidden = 1024 391 | opt.max_text_len = 20 392 | opt.dim_text_hidden = 512 393 | opt.dim_coemb_hidden = 512 394 | 395 | self.text_encoder, self.motion_encoder, self.movement_encoder = build_models(opt) 396 | self.opt = opt 397 | self.device = opt.device 398 | 399 | self.text_encoder.to(opt.device) 400 | self.motion_encoder.to(opt.device) 401 | self.movement_encoder.to(opt.device) 402 | 403 | self.text_encoder.eval() 404 | self.motion_encoder.eval() 405 | self.movement_encoder.eval() 406 | 407 | # Please note that the results does not following the order of inputs 408 | def get_co_embeddings(self, word_embs, pos_ohot, cap_lens, motions, m_lens): 409 | with torch.no_grad(): 410 | word_embs = word_embs.detach().to(self.device).float() 411 | pos_ohot = pos_ohot.detach().to(self.device).float() 412 | motions = motions.detach().to(self.device).float() 413 | 414 | align_idx = np.argsort(m_lens.data.tolist())[::-1].copy() 415 | motions = motions[align_idx] 416 | m_lens = m_lens[align_idx] 417 | 418 | '''Movement Encoding''' 419 | movements = self.movement_encoder(motions[..., :-4]).detach() 420 | m_lens = m_lens // self.opt.unit_length 421 | motion_embedding = self.motion_encoder(movements, m_lens) 422 | 423 | '''Text Encoding''' 424 | text_embedding = self.text_encoder(word_embs, pos_ohot, cap_lens) 425 | text_embedding = text_embedding[align_idx] 426 | return text_embedding, motion_embedding 427 | 428 | # Please note that the results does not following the order of inputs 429 | def get_motion_embeddings(self, motions, m_lens): 430 | with torch.no_grad(): 431 | motions = motions.detach().to(self.device).float() 432 | 433 | align_idx = np.argsort(m_lens.data.tolist())[::-1].copy() 434 | motions = motions[align_idx] 435 | m_lens = m_lens[align_idx] 436 | 437 | '''Movement Encoding''' 438 | movements = self.movement_encoder(motions[..., :-4]).detach() 439 | m_lens = m_lens // self.opt.unit_length 440 | motion_embedding = self.motion_encoder(movements, m_lens) 441 | return motion_embedding 442 | -------------------------------------------------------------------------------- /text2motion/datasets/evaluator_models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | import time 5 | import math 6 | from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence 7 | # from networks.layers import * 8 | import torch.nn.functional as F 9 | 10 | 11 | class ContrastiveLoss(torch.nn.Module): 12 | """ 13 | Contrastive loss function. 
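Given two batches of embeddings and a binary label, returns the batch mean of
(1 - label) * d^2 + label * clamp(margin - d, 0)^2, where d is the pairwise
Euclidean distance between the two embeddings (see forward below).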
14 | Based on: http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf 15 | """ 16 | def __init__(self, margin=3.0): 17 | super(ContrastiveLoss, self).__init__() 18 | self.margin = margin 19 | 20 | def forward(self, output1, output2, label): 21 | euclidean_distance = F.pairwise_distance(output1, output2, keepdim=True) 22 | loss_contrastive = torch.mean((1-label) * torch.pow(euclidean_distance, 2) + 23 | (label) * torch.pow(torch.clamp(self.margin - euclidean_distance, min=0.0), 2)) 24 | return loss_contrastive 25 | 26 | 27 | def init_weight(m): 28 | if isinstance(m, nn.Conv1d) or isinstance(m, nn.Linear) or isinstance(m, nn.ConvTranspose1d): 29 | nn.init.xavier_normal_(m.weight) 30 | # m.bias.data.fill_(0.01) 31 | if m.bias is not None: 32 | nn.init.constant_(m.bias, 0) 33 | 34 | 35 | def reparameterize(mu, logvar): 36 | s_var = logvar.mul(0.5).exp_() 37 | eps = s_var.data.new(s_var.size()).normal_() 38 | return eps.mul(s_var).add_(mu) 39 | 40 | 41 | # batch_size, dimension and position 42 | # output: (batch_size, dim) 43 | def positional_encoding(batch_size, dim, pos): 44 | assert batch_size == pos.shape[0] 45 | positions_enc = np.array([ 46 | [pos[j] / np.power(10000, (i-i%2)/dim) for i in range(dim)] 47 | for j in range(batch_size) 48 | ], dtype=np.float32) 49 | positions_enc[:, 0::2] = np.sin(positions_enc[:, 0::2]) 50 | positions_enc[:, 1::2] = np.cos(positions_enc[:, 1::2]) 51 | return torch.from_numpy(positions_enc).float() 52 | 53 | 54 | def get_padding_mask(batch_size, seq_len, cap_lens): 55 | cap_lens = cap_lens.data.tolist() 56 | mask_2d = torch.ones((batch_size, seq_len, seq_len), dtype=torch.float32) 57 | for i, cap_len in enumerate(cap_lens): 58 | mask_2d[i, :, :cap_len] = 0 59 | return mask_2d.bool(), 1 - mask_2d[:, :, 0].clone() 60 | 61 | 62 | class PositionalEncoding(nn.Module): 63 | 64 | def __init__(self, d_model, max_len=300): 65 | super(PositionalEncoding, self).__init__() 66 | 67 | pe = torch.zeros(max_len, d_model) 68 | position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) 69 | div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) 70 | pe[:, 0::2] = torch.sin(position * div_term) 71 | pe[:, 1::2] = torch.cos(position * div_term) 72 | # pe = pe.unsqueeze(0).transpose(0, 1) 73 | self.register_buffer('pe', pe) 74 | 75 | def forward(self, pos): 76 | return self.pe[pos] 77 | 78 | 79 | class MovementConvEncoder(nn.Module): 80 | def __init__(self, input_size, hidden_size, output_size): 81 | super(MovementConvEncoder, self).__init__() 82 | self.main = nn.Sequential( 83 | nn.Conv1d(input_size, hidden_size, 4, 2, 1), 84 | nn.Dropout(0.2, inplace=True), 85 | nn.LeakyReLU(0.2, inplace=True), 86 | nn.Conv1d(hidden_size, output_size, 4, 2, 1), 87 | nn.Dropout(0.2, inplace=True), 88 | nn.LeakyReLU(0.2, inplace=True), 89 | ) 90 | self.out_net = nn.Linear(output_size, output_size) 91 | self.main.apply(init_weight) 92 | self.out_net.apply(init_weight) 93 | 94 | def forward(self, inputs): 95 | inputs = inputs.permute(0, 2, 1) 96 | outputs = self.main(inputs).permute(0, 2, 1) 97 | # print(outputs.shape) 98 | return self.out_net(outputs) 99 | 100 | 101 | class MovementConvDecoder(nn.Module): 102 | def __init__(self, input_size, hidden_size, output_size): 103 | super(MovementConvDecoder, self).__init__() 104 | self.main = nn.Sequential( 105 | nn.ConvTranspose1d(input_size, hidden_size, 4, 2, 1), 106 | # nn.Dropout(0.2, inplace=True), 107 | nn.LeakyReLU(0.2, inplace=True), 108 | nn.ConvTranspose1d(hidden_size, output_size, 4, 2, 
1), 109 | # nn.Dropout(0.2, inplace=True), 110 | nn.LeakyReLU(0.2, inplace=True), 111 | ) 112 | self.out_net = nn.Linear(output_size, output_size) 113 | 114 | self.main.apply(init_weight) 115 | self.out_net.apply(init_weight) 116 | 117 | def forward(self, inputs): 118 | inputs = inputs.permute(0, 2, 1) 119 | outputs = self.main(inputs).permute(0, 2, 1) 120 | return self.out_net(outputs) 121 | 122 | 123 | class TextVAEDecoder(nn.Module): 124 | def __init__(self, text_size, input_size, output_size, hidden_size, n_layers): 125 | super(TextVAEDecoder, self).__init__() 126 | self.input_size = input_size 127 | self.output_size = output_size 128 | self.hidden_size = hidden_size 129 | self.n_layers = n_layers 130 | self.emb = nn.Sequential( 131 | nn.Linear(input_size, hidden_size), 132 | nn.LayerNorm(hidden_size), 133 | nn.LeakyReLU(0.2, inplace=True)) 134 | 135 | self.z2init = nn.Linear(text_size, hidden_size * n_layers) 136 | self.gru = nn.ModuleList([nn.GRUCell(hidden_size, hidden_size) for i in range(self.n_layers)]) 137 | self.positional_encoder = PositionalEncoding(hidden_size) 138 | 139 | 140 | self.output = nn.Sequential( 141 | nn.Linear(hidden_size, hidden_size), 142 | nn.LayerNorm(hidden_size), 143 | nn.LeakyReLU(0.2, inplace=True), 144 | nn.Linear(hidden_size, output_size) 145 | ) 146 | 147 | # 148 | # self.output = nn.Sequential( 149 | # nn.Linear(hidden_size, hidden_size), 150 | # nn.LayerNorm(hidden_size), 151 | # nn.LeakyReLU(0.2, inplace=True), 152 | # nn.Linear(hidden_size, output_size-4) 153 | # ) 154 | 155 | # self.contact_net = nn.Sequential( 156 | # nn.Linear(output_size-4, 64), 157 | # nn.LayerNorm(64), 158 | # nn.LeakyReLU(0.2, inplace=True), 159 | # nn.Linear(64, 4) 160 | # ) 161 | 162 | self.output.apply(init_weight) 163 | self.emb.apply(init_weight) 164 | self.z2init.apply(init_weight) 165 | # self.contact_net.apply(init_weight) 166 | 167 | def get_init_hidden(self, latent): 168 | hidden = self.z2init(latent) 169 | hidden = torch.split(hidden, self.hidden_size, dim=-1) 170 | return list(hidden) 171 | 172 | def forward(self, inputs, last_pred, hidden, p): 173 | h_in = self.emb(inputs) 174 | pos_enc = self.positional_encoder(p).to(inputs.device).detach() 175 | h_in = h_in + pos_enc 176 | for i in range(self.n_layers): 177 | # print(h_in.shape) 178 | hidden[i] = self.gru[i](h_in, hidden[i]) 179 | h_in = hidden[i] 180 | pose_pred = self.output(h_in) 181 | # pose_pred = self.output(h_in) + last_pred.detach() 182 | # contact = self.contact_net(pose_pred) 183 | # return torch.cat([pose_pred, contact], dim=-1), hidden 184 | return pose_pred, hidden 185 | 186 | 187 | class TextDecoder(nn.Module): 188 | def __init__(self, text_size, input_size, output_size, hidden_size, n_layers): 189 | super(TextDecoder, self).__init__() 190 | self.input_size = input_size 191 | self.output_size = output_size 192 | self.hidden_size = hidden_size 193 | self.n_layers = n_layers 194 | self.emb = nn.Sequential( 195 | nn.Linear(input_size, hidden_size), 196 | nn.LayerNorm(hidden_size), 197 | nn.LeakyReLU(0.2, inplace=True)) 198 | 199 | self.gru = nn.ModuleList([nn.GRUCell(hidden_size, hidden_size) for i in range(self.n_layers)]) 200 | self.z2init = nn.Linear(text_size, hidden_size * n_layers) 201 | self.positional_encoder = PositionalEncoding(hidden_size) 202 | 203 | self.mu_net = nn.Linear(hidden_size, output_size) 204 | self.logvar_net = nn.Linear(hidden_size, output_size) 205 | 206 | self.emb.apply(init_weight) 207 | self.z2init.apply(init_weight) 208 | self.mu_net.apply(init_weight) 209 | 
self.logvar_net.apply(init_weight) 210 | 211 | def get_init_hidden(self, latent): 212 | 213 | hidden = self.z2init(latent) 214 | hidden = torch.split(hidden, self.hidden_size, dim=-1) 215 | 216 | return list(hidden) 217 | 218 | def forward(self, inputs, hidden, p): 219 | # print(inputs.shape) 220 | x_in = self.emb(inputs) 221 | pos_enc = self.positional_encoder(p).to(inputs.device).detach() 222 | x_in = x_in + pos_enc 223 | 224 | for i in range(self.n_layers): 225 | hidden[i] = self.gru[i](x_in, hidden[i]) 226 | h_in = hidden[i] 227 | mu = self.mu_net(h_in) 228 | logvar = self.logvar_net(h_in) 229 | z = reparameterize(mu, logvar) 230 | return z, mu, logvar, hidden 231 | 232 | class AttLayer(nn.Module): 233 | def __init__(self, query_dim, key_dim, value_dim): 234 | super(AttLayer, self).__init__() 235 | self.W_q = nn.Linear(query_dim, value_dim) 236 | self.W_k = nn.Linear(key_dim, value_dim, bias=False) 237 | self.W_v = nn.Linear(key_dim, value_dim) 238 | 239 | self.softmax = nn.Softmax(dim=1) 240 | self.dim = value_dim 241 | 242 | self.W_q.apply(init_weight) 243 | self.W_k.apply(init_weight) 244 | self.W_v.apply(init_weight) 245 | 246 | def forward(self, query, key_mat): 247 | ''' 248 | query (batch, query_dim) 249 | key (batch, seq_len, key_dim) 250 | ''' 251 | # print(query.shape) 252 | query_vec = self.W_q(query).unsqueeze(-1) # (batch, value_dim, 1) 253 | val_set = self.W_v(key_mat) # (batch, seq_len, value_dim) 254 | key_set = self.W_k(key_mat) # (batch, seq_len, value_dim) 255 | 256 | weights = torch.matmul(key_set, query_vec) / np.sqrt(self.dim) 257 | 258 | co_weights = self.softmax(weights) # (batch, seq_len, 1) 259 | values = val_set * co_weights # (batch, seq_len, value_dim) 260 | pred = values.sum(dim=1) # (batch, value_dim) 261 | return pred, co_weights 262 | 263 | def short_cut(self, querys, keys): 264 | return self.W_q(querys), self.W_k(keys) 265 | 266 | 267 | class TextEncoderBiGRU(nn.Module): 268 | def __init__(self, word_size, pos_size, hidden_size, device): 269 | super(TextEncoderBiGRU, self).__init__() 270 | self.device = device 271 | 272 | self.pos_emb = nn.Linear(pos_size, word_size) 273 | self.input_emb = nn.Linear(word_size, hidden_size) 274 | self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True, bidirectional=True) 275 | # self.linear2 = nn.Linear(hidden_size, output_size) 276 | 277 | self.input_emb.apply(init_weight) 278 | self.pos_emb.apply(init_weight) 279 | # self.linear2.apply(init_weight) 280 | # self.batch_size = batch_size 281 | self.hidden_size = hidden_size 282 | self.hidden = nn.Parameter(torch.randn((2, 1, self.hidden_size), requires_grad=True)) 283 | 284 | # input(batch_size, seq_len, dim) 285 | def forward(self, word_embs, pos_onehot, cap_lens): 286 | num_samples = word_embs.shape[0] 287 | 288 | pos_embs = self.pos_emb(pos_onehot) 289 | inputs = word_embs + pos_embs 290 | input_embs = self.input_emb(inputs) 291 | hidden = self.hidden.repeat(1, num_samples, 1) 292 | 293 | cap_lens = cap_lens.data.tolist() 294 | emb = pack_padded_sequence(input_embs, cap_lens, batch_first=True) 295 | 296 | gru_seq, gru_last = self.gru(emb, hidden) 297 | 298 | gru_last = torch.cat([gru_last[0], gru_last[1]], dim=-1) 299 | gru_seq = pad_packed_sequence(gru_seq, batch_first=True)[0] 300 | forward_seq = gru_seq[..., :self.hidden_size] 301 | backward_seq = gru_seq[..., self.hidden_size:].clone() 302 | 303 | # Concate the forward and backward word embeddings 304 | for i, length in enumerate(cap_lens): 305 | backward_seq[i:i+1, :length] = torch.flip(backward_seq[i:i+1, 
:length].clone(), dims=[1]) 306 | gru_seq = torch.cat([forward_seq, backward_seq], dim=-1) 307 | 308 | return gru_seq, gru_last 309 | 310 | 311 | class TextEncoderBiGRUCo(nn.Module): 312 | def __init__(self, word_size, pos_size, hidden_size, output_size, device): 313 | super(TextEncoderBiGRUCo, self).__init__() 314 | self.device = device 315 | 316 | self.pos_emb = nn.Linear(pos_size, word_size) 317 | self.input_emb = nn.Linear(word_size, hidden_size) 318 | self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True, bidirectional=True) 319 | self.output_net = nn.Sequential( 320 | nn.Linear(hidden_size * 2, hidden_size), 321 | nn.LayerNorm(hidden_size), 322 | nn.LeakyReLU(0.2, inplace=True), 323 | nn.Linear(hidden_size, output_size) 324 | ) 325 | 326 | self.input_emb.apply(init_weight) 327 | self.pos_emb.apply(init_weight) 328 | self.output_net.apply(init_weight) 329 | # self.linear2.apply(init_weight) 330 | # self.batch_size = batch_size 331 | self.hidden_size = hidden_size 332 | self.hidden = nn.Parameter(torch.randn((2, 1, self.hidden_size), requires_grad=True)) 333 | 334 | # input(batch_size, seq_len, dim) 335 | def forward(self, word_embs, pos_onehot, cap_lens): 336 | num_samples = word_embs.shape[0] 337 | 338 | pos_embs = self.pos_emb(pos_onehot) 339 | inputs = word_embs + pos_embs 340 | input_embs = self.input_emb(inputs) 341 | hidden = self.hidden.repeat(1, num_samples, 1) 342 | 343 | cap_lens = cap_lens.data.tolist() 344 | emb = pack_padded_sequence(input_embs, cap_lens, batch_first=True) 345 | 346 | gru_seq, gru_last = self.gru(emb, hidden) 347 | 348 | gru_last = torch.cat([gru_last[0], gru_last[1]], dim=-1) 349 | 350 | return self.output_net(gru_last) 351 | 352 | 353 | class MotionEncoderBiGRUCo(nn.Module): 354 | def __init__(self, input_size, hidden_size, output_size, device): 355 | super(MotionEncoderBiGRUCo, self).__init__() 356 | self.device = device 357 | 358 | self.input_emb = nn.Linear(input_size, hidden_size) 359 | self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True, bidirectional=True) 360 | self.output_net = nn.Sequential( 361 | nn.Linear(hidden_size*2, hidden_size), 362 | nn.LayerNorm(hidden_size), 363 | nn.LeakyReLU(0.2, inplace=True), 364 | nn.Linear(hidden_size, output_size) 365 | ) 366 | 367 | self.input_emb.apply(init_weight) 368 | self.output_net.apply(init_weight) 369 | self.hidden_size = hidden_size 370 | self.hidden = nn.Parameter(torch.randn((2, 1, self.hidden_size), requires_grad=True)) 371 | 372 | # input(batch_size, seq_len, dim) 373 | def forward(self, inputs, m_lens): 374 | num_samples = inputs.shape[0] 375 | 376 | input_embs = self.input_emb(inputs) 377 | hidden = self.hidden.repeat(1, num_samples, 1) 378 | 379 | cap_lens = m_lens.data.tolist() 380 | emb = pack_padded_sequence(input_embs, cap_lens, batch_first=True) 381 | 382 | gru_seq, gru_last = self.gru(emb, hidden) 383 | 384 | gru_last = torch.cat([gru_last[0], gru_last[1]], dim=-1) 385 | 386 | return self.output_net(gru_last) 387 | 388 | 389 | class MotionLenEstimatorBiGRU(nn.Module): 390 | def __init__(self, word_size, pos_size, hidden_size, output_size): 391 | super(MotionLenEstimatorBiGRU, self).__init__() 392 | 393 | self.pos_emb = nn.Linear(pos_size, word_size) 394 | self.input_emb = nn.Linear(word_size, hidden_size) 395 | self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True, bidirectional=True) 396 | nd = 512 397 | self.output = nn.Sequential( 398 | nn.Linear(hidden_size*2, nd), 399 | nn.LayerNorm(nd), 400 | nn.LeakyReLU(0.2, inplace=True), 401 | 402 | nn.Linear(nd, nd // 
2), 403 | nn.LayerNorm(nd // 2), 404 | nn.LeakyReLU(0.2, inplace=True), 405 | 406 | nn.Linear(nd // 2, nd // 4), 407 | nn.LayerNorm(nd // 4), 408 | nn.LeakyReLU(0.2, inplace=True), 409 | 410 | nn.Linear(nd // 4, output_size) 411 | ) 412 | # self.linear2 = nn.Linear(hidden_size, output_size) 413 | 414 | self.input_emb.apply(init_weight) 415 | self.pos_emb.apply(init_weight) 416 | self.output.apply(init_weight) 417 | # self.linear2.apply(init_weight) 418 | # self.batch_size = batch_size 419 | self.hidden_size = hidden_size 420 | self.hidden = nn.Parameter(torch.randn((2, 1, self.hidden_size), requires_grad=True)) 421 | 422 | # input(batch_size, seq_len, dim) 423 | def forward(self, word_embs, pos_onehot, cap_lens): 424 | num_samples = word_embs.shape[0] 425 | 426 | pos_embs = self.pos_emb(pos_onehot) 427 | inputs = word_embs + pos_embs 428 | input_embs = self.input_emb(inputs) 429 | hidden = self.hidden.repeat(1, num_samples, 1) 430 | 431 | cap_lens = cap_lens.data.tolist() 432 | emb = pack_padded_sequence(input_embs, cap_lens, batch_first=True) 433 | 434 | gru_seq, gru_last = self.gru(emb, hidden) 435 | 436 | gru_last = torch.cat([gru_last[0], gru_last[1]], dim=-1) 437 | 438 | return self.output(gru_last) 439 | -------------------------------------------------------------------------------- /text2motion/install.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | 4 | 5 | - [Requirements](#requirements) 6 | - [Prepare environment](#prepare-environment) 7 | - [Data Preparation](#data-preparation) 8 | 9 | 10 | 11 | ## Requirements 12 | 13 | - Linux 14 | - Python 3.7+ 15 | - PyTorch 1.6.0, 1.7.0, 1.7.1, 1.8.0, 1.8.1, 1.9.0 or 1.9.1. 16 | - CUDA 9.2+ 17 | - GCC 5+ 18 | - [MMCV](https://github.com/open-mmlab/mmcv) (Please install mmcv-full>=1.3.17,<1.6.0 for GPU) 19 | 20 | ## Prepare environment 21 | 22 | a. Create a conda virtual environment and activate it. 23 | 24 | ```shell 25 | conda create -n motiondiffuse python=3.7 -y 26 | conda activate motiondiffuse 27 | ``` 28 | 29 | b. Install PyTorch and torchvision following the [official instructions](https://pytorch.org/). 30 | ```shell 31 | conda install pytorch={torch_version} torchvision cudatoolkit={cu_version} -c pytorch 32 | ``` 33 | 34 | E.g., install PyTorch 1.7.1 & CUDA 10.1. 35 | ```shell 36 | conda install pytorch=1.7.1 torchvision cudatoolkit=10.1 -c pytorch 37 | ``` 38 | 39 | **Important:** Make sure that your compilation CUDA version and runtime CUDA version match. 40 | 41 | c. Build mmcv-full 42 | 43 | - mmcv-full 44 | 45 | We recommend installing the pre-built package as below. 46 | 47 | For CPU: 48 | ```shell 49 | pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cpu/{torch_version}/index.html 50 | ``` 51 | Please replace `{torch_version}` in the URL with your desired version. 52 | 53 | For GPU: 54 | ```shell 55 | pip install "mmcv-full>=1.3.17,<=1.5.3" -f https://download.openmmlab.com/mmcv/dist/{cu_version}/{torch_version}/index.html 56 | ``` 57 | Please replace `{cu_version}` and `{torch_version}` in the URL with your desired versions. 58 | 59 | For example, to install mmcv-full with CUDA 10.1 and PyTorch 1.7.1, use the following command: 60 | ```shell 61 | pip install "mmcv-full>=1.3.17,<=1.5.3" -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.7.1/index.html 62 | ``` 63 | 64 | See [here](https://mmcv.readthedocs.io/en/latest/get_started/installation.html) for different versions of MMCV compatible with different PyTorch and CUDA versions. 
65 | For more version download link, refer to [openmmlab-download](https://download.openmmlab.com/mmcv/dist/index.html). 66 | 67 | 68 | d. Install other requirements 69 | 70 | ```shell 71 | pip install -r requirements.txt 72 | ``` 73 | 74 | ## Data Preparation 75 | 76 | a. Download datasets 77 | 78 | For both the HumanML3D dataset and the KIT-ML dataset, you could find the details as well as download link [[here]](https://github.com/EricGuo5513/HumanML3D). 79 | 80 | b. Download pretrained weights for evaluation 81 | 82 | We use the same evaluation protocol as [this repo](https://github.com/EricGuo5513/text-to-motion). You should download pretrained weights of the contrastive models in [t2m](https://drive.google.com/file/d/1DSaKqWX2HlwBtVH5l7DdW96jeYUIXsOP/view) and [kit](https://drive.google.com/file/d/1tX79xk0fflp07EZ660Xz1RAFE33iEyJR/view) for calculating FID and precisions. To dynamically estimate the length of the target motion, `length_est_bigru` and [Glove data](https://drive.google.com/drive/folders/1qxHtwffhfI4qMwptNW6KJEDuT6bduqO7?usp=sharing) are required. 83 | 84 | c. Download pretrained weights for **MotionDiffuse** 85 | 86 | The pretrained weights for our proposed MotionDiffuse can be downloaded from [here](https://drive.google.com/drive/folders/1qxHtwffhfI4qMwptNW6KJEDuT6bduqO7?usp=sharing) 87 | 88 | 89 | Download the above resources and arrange them in the following file structure: 90 | 91 | ```text 92 | MotionDiffuse 93 | └── text2motion 94 | ├── checkpoints 95 | │ ├── kit 96 | │ │ └── kit_motiondiffuse 97 | │ │ ├── meta 98 | │ │ │ ├── mean.npy 99 | │ │ │ └── std.npy 100 | │ │ ├── model 101 | │ │ │ └── latest.tar 102 | │ │ └── opt.txt 103 | │ └── t2m 104 | │ └── t2m_motiondiffuse 105 | │ ├── meta 106 | │ │ ├── mean.npy 107 | │ │ └── std.npy 108 | │ ├── model 109 | │ │ └── latest.tar 110 | │ └── opt.txt 111 | └── data 112 | ├── glove 113 | │ ├── our_vab_data.npy 114 | │ ├── our_vab_idx.pkl 115 | │ └── out_vab_words.pkl 116 | ├── pretrained_models 117 | │ ├── kit 118 | │ │ └── text_mot_match 119 | │ │ └── model 120 | │ │ └── finest.tar 121 | │ └── t2m 122 | │ │ ├── text_mot_match 123 | │ │ │ └── model 124 | │ │ │ └── finest.tar 125 | │ │ └── length_est_bigru 126 | │ │ └── model 127 | │ │ └── finest.tar 128 | ├── HumanML3D 129 | │ ├── new_joint_vecs 130 | │ │ └── ... 131 | │ ├── new_joints 132 | │ │ └── ... 133 | │ ├── texts 134 | │ │ └── ... 135 | │ ├── Mean.npy 136 | │ ├── Std.npy 137 | │ ├── test.txt 138 | │ ├── train_val.txt 139 | │ ├── train.txt 140 | │ └── val.txt 141 | └── KIT-ML 142 | ├── new_joint_vecs 143 | │ └── ... 144 | ├── new_joints 145 | │ └── ... 146 | ├── texts 147 | │ └── ... 
148 | ├── Mean.npy 149 | ├── Std.npy 150 | ├── test.txt 151 | ├── train_val.txt 152 | ├── train.txt 153 | └── val.txt 154 | ``` -------------------------------------------------------------------------------- /text2motion/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .transformer import MotionTransformer 2 | from .gaussian_diffusion import GaussianDiffusion 3 | 4 | __all__ = ['MotionTransformer', 'GaussianDiffusion'] -------------------------------------------------------------------------------- /text2motion/models/transformer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2021 S-Lab 3 | """ 4 | 5 | from cv2 import norm 6 | import torch 7 | import torch.nn.functional as F 8 | from torch import layer_norm, nn 9 | import numpy as np 10 | import clip 11 | 12 | import math 13 | 14 | 15 | def timestep_embedding(timesteps, dim, max_period=10000): 16 | """ 17 | Create sinusoidal timestep embeddings. 18 | :param timesteps: a 1-D Tensor of N indices, one per batch element. 19 | These may be fractional. 20 | :param dim: the dimension of the output. 21 | :param max_period: controls the minimum frequency of the embeddings. 22 | :return: an [N x dim] Tensor of positional embeddings. 23 | """ 24 | half = dim // 2 25 | freqs = torch.exp( 26 | -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half 27 | ).to(device=timesteps.device) 28 | args = timesteps[:, None].float() * freqs[None] 29 | embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1) 30 | if dim % 2: 31 | embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1) 32 | return embedding 33 | 34 | 35 | def set_requires_grad(nets, requires_grad=False): 36 | """Set requies_grad for all the networks. 37 | 38 | Args: 39 | nets (nn.Module | list[nn.Module]): A list of networks or a single 40 | network. 41 | requires_grad (bool): Whether the networks require gradients or not 42 | """ 43 | if not isinstance(nets, list): 44 | nets = [nets] 45 | for net in nets: 46 | if net is not None: 47 | for param in net.parameters(): 48 | param.requires_grad = requires_grad 49 | 50 | 51 | def zero_module(module): 52 | """ 53 | Zero out the parameters of a module and return it. 
54 | """ 55 | for p in module.parameters(): 56 | p.detach().zero_() 57 | return module 58 | 59 | 60 | class StylizationBlock(nn.Module): 61 | 62 | def __init__(self, latent_dim, time_embed_dim, dropout): 63 | super().__init__() 64 | self.emb_layers = nn.Sequential( 65 | nn.SiLU(), 66 | nn.Linear(time_embed_dim, 2 * latent_dim), 67 | ) 68 | self.norm = nn.LayerNorm(latent_dim) 69 | self.out_layers = nn.Sequential( 70 | nn.SiLU(), 71 | nn.Dropout(p=dropout), 72 | zero_module(nn.Linear(latent_dim, latent_dim)), 73 | ) 74 | 75 | def forward(self, h, emb): 76 | """ 77 | h: B, T, D 78 | emb: B, D 79 | """ 80 | # B, 1, 2D 81 | emb_out = self.emb_layers(emb).unsqueeze(1) 82 | # scale: B, 1, D / shift: B, 1, D 83 | scale, shift = torch.chunk(emb_out, 2, dim=2) 84 | h = self.norm(h) * (1 + scale) + shift 85 | h = self.out_layers(h) 86 | return h 87 | 88 | 89 | class LinearTemporalSelfAttention(nn.Module): 90 | 91 | def __init__(self, seq_len, latent_dim, num_head, dropout, time_embed_dim): 92 | super().__init__() 93 | self.num_head = num_head 94 | self.norm = nn.LayerNorm(latent_dim) 95 | self.query = nn.Linear(latent_dim, latent_dim) 96 | self.key = nn.Linear(latent_dim, latent_dim) 97 | self.value = nn.Linear(latent_dim, latent_dim) 98 | self.dropout = nn.Dropout(dropout) 99 | self.proj_out = StylizationBlock(latent_dim, time_embed_dim, dropout) 100 | 101 | def forward(self, x, emb, src_mask): 102 | """ 103 | x: B, T, D 104 | """ 105 | B, T, D = x.shape 106 | H = self.num_head 107 | # B, T, D 108 | query = self.query(self.norm(x)) 109 | # B, T, D 110 | key = (self.key(self.norm(x)) + (1 - src_mask) * -1000000) 111 | query = F.softmax(query.view(B, T, H, -1), dim=-1) 112 | key = F.softmax(key.view(B, T, H, -1), dim=1) 113 | # B, T, H, HD 114 | value = (self.value(self.norm(x)) * src_mask).view(B, T, H, -1) 115 | # B, H, HD, HD 116 | attention = torch.einsum('bnhd,bnhl->bhdl', key, value) 117 | y = torch.einsum('bnhd,bhdl->bnhl', query, attention).reshape(B, T, D) 118 | y = x + self.proj_out(y, emb) 119 | return y 120 | 121 | 122 | class LinearTemporalCrossAttention(nn.Module): 123 | 124 | def __init__(self, seq_len, latent_dim, text_latent_dim, num_head, dropout, time_embed_dim): 125 | super().__init__() 126 | self.num_head = num_head 127 | self.norm = nn.LayerNorm(latent_dim) 128 | self.text_norm = nn.LayerNorm(text_latent_dim) 129 | self.query = nn.Linear(latent_dim, latent_dim) 130 | self.key = nn.Linear(text_latent_dim, latent_dim) 131 | self.value = nn.Linear(text_latent_dim, latent_dim) 132 | self.dropout = nn.Dropout(dropout) 133 | self.proj_out = StylizationBlock(latent_dim, time_embed_dim, dropout) 134 | 135 | def forward(self, x, xf, emb): 136 | """ 137 | x: B, T, D 138 | xf: B, N, L 139 | """ 140 | B, T, D = x.shape 141 | N = xf.shape[1] 142 | H = self.num_head 143 | # B, T, D 144 | query = self.query(self.norm(x)) 145 | # B, N, D 146 | key = self.key(self.text_norm(xf)) 147 | query = F.softmax(query.view(B, T, H, -1), dim=-1) 148 | key = F.softmax(key.view(B, N, H, -1), dim=1) 149 | # B, N, H, HD 150 | value = self.value(self.text_norm(xf)).view(B, N, H, -1) 151 | # B, H, HD, HD 152 | attention = torch.einsum('bnhd,bnhl->bhdl', key, value) 153 | y = torch.einsum('bnhd,bhdl->bnhl', query, attention).reshape(B, T, D) 154 | y = x + self.proj_out(y, emb) 155 | return y 156 | 157 | class FFN(nn.Module): 158 | 159 | def __init__(self, latent_dim, ffn_dim, dropout, time_embed_dim): 160 | super().__init__() 161 | self.linear1 = nn.Linear(latent_dim, ffn_dim) 162 | self.linear2 = 
zero_module(nn.Linear(ffn_dim, latent_dim)) 163 | self.activation = nn.GELU() 164 | self.dropout = nn.Dropout(dropout) 165 | self.proj_out = StylizationBlock(latent_dim, time_embed_dim, dropout) 166 | 167 | def forward(self, x, emb): 168 | y = self.linear2(self.dropout(self.activation(self.linear1(x)))) 169 | y = x + self.proj_out(y, emb) 170 | return y 171 | 172 | 173 | class LinearTemporalDiffusionTransformerDecoderLayer(nn.Module): 174 | 175 | def __init__(self, 176 | seq_len=60, 177 | latent_dim=32, 178 | text_latent_dim=512, 179 | time_embed_dim=128, 180 | ffn_dim=256, 181 | num_head=4, 182 | dropout=0.1): 183 | super().__init__() 184 | self.sa_block = LinearTemporalSelfAttention( 185 | seq_len, latent_dim, num_head, dropout, time_embed_dim) 186 | self.ca_block = LinearTemporalCrossAttention( 187 | seq_len, latent_dim, text_latent_dim, num_head, dropout, time_embed_dim) 188 | self.ffn = FFN(latent_dim, ffn_dim, dropout, time_embed_dim) 189 | 190 | def forward(self, x, xf, emb, src_mask): 191 | x = self.sa_block(x, emb, src_mask) 192 | x = self.ca_block(x, xf, emb) 193 | x = self.ffn(x, emb) 194 | return x 195 | 196 | class TemporalSelfAttention(nn.Module): 197 | 198 | def __init__(self, seq_len, latent_dim, num_head, dropout, time_embed_dim): 199 | super().__init__() 200 | self.num_head = num_head 201 | self.norm = nn.LayerNorm(latent_dim) 202 | self.query = nn.Linear(latent_dim, latent_dim) 203 | self.key = nn.Linear(latent_dim, latent_dim) 204 | self.value = nn.Linear(latent_dim, latent_dim) 205 | self.dropout = nn.Dropout(dropout) 206 | self.proj_out = StylizationBlock(latent_dim, time_embed_dim, dropout) 207 | 208 | def forward(self, x, emb, src_mask): 209 | """ 210 | x: B, T, D 211 | """ 212 | B, T, D = x.shape 213 | H = self.num_head 214 | # B, T, 1, D 215 | query = self.query(self.norm(x)).unsqueeze(2) 216 | # B, 1, T, D 217 | key = self.key(self.norm(x)).unsqueeze(1) 218 | query = query.view(B, T, H, -1) 219 | key = key.view(B, T, H, -1) 220 | # B, T, T, H 221 | attention = torch.einsum('bnhd,bmhd->bnmh', query, key) / math.sqrt(D // H) 222 | attention = attention + (1 - src_mask.unsqueeze(-1)) * -100000 223 | weight = self.dropout(F.softmax(attention, dim=2)) 224 | value = self.value(self.norm(x)).view(B, T, H, -1) 225 | y = torch.einsum('bnmh,bmhd->bnhd', weight, value).reshape(B, T, D) 226 | y = x + self.proj_out(y, emb) 227 | return y 228 | 229 | class TemporalCrossAttention(nn.Module): 230 | 231 | def __init__(self, seq_len, latent_dim, text_latent_dim, num_head, dropout, time_embed_dim): 232 | super().__init__() 233 | self.num_head = num_head 234 | self.norm = nn.LayerNorm(latent_dim) 235 | self.text_norm = nn.LayerNorm(text_latent_dim) 236 | self.query = nn.Linear(latent_dim, latent_dim) 237 | self.key = nn.Linear(text_latent_dim, latent_dim) 238 | self.value = nn.Linear(text_latent_dim, latent_dim) 239 | self.dropout = nn.Dropout(dropout) 240 | self.proj_out = StylizationBlock(latent_dim, time_embed_dim, dropout) 241 | 242 | def forward(self, x, xf, emb): 243 | """ 244 | x: B, T, D 245 | xf: B, N, L 246 | """ 247 | B, T, D = x.shape 248 | N = xf.shape[1] 249 | H = self.num_head 250 | # B, T, 1, D 251 | query = self.query(self.norm(x)).unsqueeze(2) 252 | # B, 1, N, D 253 | key = self.key(self.text_norm(xf)).unsqueeze(1) 254 | query = query.view(B, T, H, -1) 255 | key = key.view(B, N, H, -1) 256 | # B, T, N, H 257 | attention = torch.einsum('bnhd,bmhd->bnmh', query, key) / math.sqrt(D // H) 258 | weight = self.dropout(F.softmax(attention, dim=2)) 259 | value = 
self.value(self.text_norm(xf)).view(B, N, H, -1) 260 | y = torch.einsum('bnmh,bmhd->bnhd', weight, value).reshape(B, T, D) 261 | y = x + self.proj_out(y, emb) 262 | return y 263 | 264 | class TemporalDiffusionTransformerDecoderLayer(nn.Module): 265 | 266 | def __init__(self, 267 | seq_len=60, 268 | latent_dim=32, 269 | text_latent_dim=512, 270 | time_embed_dim=128, 271 | ffn_dim=256, 272 | num_head=4, 273 | dropout=0.1): 274 | super().__init__() 275 | self.sa_block = TemporalSelfAttention( 276 | seq_len, latent_dim, num_head, dropout, time_embed_dim) 277 | self.ca_block = TemporalCrossAttention( 278 | seq_len, latent_dim, text_latent_dim, num_head, dropout, time_embed_dim) 279 | self.ffn = FFN(latent_dim, ffn_dim, dropout, time_embed_dim) 280 | 281 | def forward(self, x, xf, emb, src_mask): 282 | x = self.sa_block(x, emb, src_mask) 283 | x = self.ca_block(x, xf, emb) 284 | x = self.ffn(x, emb) 285 | return x 286 | 287 | 288 | class MotionTransformer(nn.Module): 289 | def __init__(self, 290 | input_feats, 291 | num_frames=240, 292 | latent_dim=512, 293 | ff_size=1024, 294 | num_layers=8, 295 | num_heads=8, 296 | dropout=0, 297 | activation="gelu", 298 | num_text_layers=4, 299 | text_latent_dim=256, 300 | text_ff_size=2048, 301 | text_num_heads=4, 302 | no_clip=False, 303 | no_eff=False, 304 | **kargs): 305 | super().__init__() 306 | 307 | self.num_frames = num_frames 308 | self.latent_dim = latent_dim 309 | self.ff_size = ff_size 310 | self.num_layers = num_layers 311 | self.num_heads = num_heads 312 | self.dropout = dropout 313 | self.activation = activation 314 | self.input_feats = input_feats 315 | self.time_embed_dim = latent_dim * 4 316 | self.sequence_embedding = nn.Parameter(torch.randn(num_frames, latent_dim)) 317 | 318 | # Text Transformer 319 | self.clip, _ = clip.load('ViT-B/32', "cpu") 320 | if no_clip: 321 | self.clip.initialize_parameters() 322 | else: 323 | set_requires_grad(self.clip, False) 324 | if text_latent_dim != 512: 325 | self.text_pre_proj = nn.Linear(512, text_latent_dim) 326 | else: 327 | self.text_pre_proj = nn.Identity() 328 | textTransEncoderLayer = nn.TransformerEncoderLayer( 329 | d_model=text_latent_dim, 330 | nhead=text_num_heads, 331 | dim_feedforward=text_ff_size, 332 | dropout=dropout, 333 | activation=activation) 334 | self.textTransEncoder = nn.TransformerEncoder( 335 | textTransEncoderLayer, 336 | num_layers=num_text_layers) 337 | self.text_ln = nn.LayerNorm(text_latent_dim) 338 | self.text_proj = nn.Sequential( 339 | nn.Linear(text_latent_dim, self.time_embed_dim) 340 | ) 341 | 342 | # Input Embedding 343 | self.joint_embed = nn.Linear(self.input_feats, self.latent_dim) 344 | 345 | self.time_embed = nn.Sequential( 346 | nn.Linear(self.latent_dim, self.time_embed_dim), 347 | nn.SiLU(), 348 | nn.Linear(self.time_embed_dim, self.time_embed_dim), 349 | ) 350 | self.temporal_decoder_blocks = nn.ModuleList() 351 | for i in range(num_layers): 352 | if no_eff: 353 | self.temporal_decoder_blocks.append( 354 | TemporalDiffusionTransformerDecoderLayer( 355 | seq_len=num_frames, 356 | latent_dim=latent_dim, 357 | text_latent_dim=text_latent_dim, 358 | time_embed_dim=self.time_embed_dim, 359 | ffn_dim=ff_size, 360 | num_head=num_heads, 361 | dropout=dropout 362 | ) 363 | ) 364 | else: 365 | self.temporal_decoder_blocks.append( 366 | LinearTemporalDiffusionTransformerDecoderLayer( 367 | seq_len=num_frames, 368 | latent_dim=latent_dim, 369 | text_latent_dim=text_latent_dim, 370 | time_embed_dim=self.time_embed_dim, 371 | ffn_dim=ff_size, 372 | num_head=num_heads, 
373 | dropout=dropout 374 | ) 375 | ) 376 | 377 | # Output Module 378 | self.out = zero_module(nn.Linear(self.latent_dim, self.input_feats)) 379 | 380 | def encode_text(self, text, device): 381 | with torch.no_grad(): 382 | text = clip.tokenize(text, truncate=True).to(device) 383 | x = self.clip.token_embedding(text).type(self.clip.dtype) # [batch_size, n_ctx, d_model] 384 | 385 | x = x + self.clip.positional_embedding.type(self.clip.dtype) 386 | x = x.permute(1, 0, 2) # NLD -> LND 387 | x = self.clip.transformer(x) 388 | x = self.clip.ln_final(x).type(self.clip.dtype) 389 | 390 | # T, B, D 391 | x = self.text_pre_proj(x) 392 | xf_out = self.textTransEncoder(x) 393 | xf_out = self.text_ln(xf_out) 394 | xf_proj = self.text_proj(xf_out[text.argmax(dim=-1), torch.arange(xf_out.shape[1])]) 395 | # B, T, D 396 | xf_out = xf_out.permute(1, 0, 2) 397 | return xf_proj, xf_out 398 | 399 | def generate_src_mask(self, T, length): 400 | B = len(length) 401 | src_mask = torch.ones(B, T) 402 | for i in range(B): 403 | for j in range(length[i], T): 404 | src_mask[i, j] = 0 405 | return src_mask 406 | 407 | def forward(self, x, timesteps, length=None, text=None, xf_proj=None, xf_out=None): 408 | """ 409 | x: B, T, D 410 | """ 411 | B, T = x.shape[0], x.shape[1] 412 | if text is not None and len(text) != B: 413 | index = x.device.index 414 | text = text[index * B: index * B + B] 415 | if xf_proj is None or xf_out is None: 416 | xf_proj, xf_out = self.encode_text(text, x.device) 417 | 418 | emb = self.time_embed(timestep_embedding(timesteps, self.latent_dim)) + xf_proj 419 | 420 | # B, T, latent_dim 421 | h = self.joint_embed(x) 422 | h = h + self.sequence_embedding.unsqueeze(0)[:, :T, :] 423 | 424 | src_mask = self.generate_src_mask(T, length).to(x.device).unsqueeze(-1) 425 | for module in self.temporal_decoder_blocks: 426 | h = module(h, xf_out, emb, src_mask) 427 | 428 | output = self.out(h).view(B, T, -1).contiguous() 429 | return output 430 | -------------------------------------------------------------------------------- /text2motion/options/base_options.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import torch 4 | from mmcv.runner import init_dist, get_dist_info 5 | import torch.distributed as dist 6 | 7 | 8 | class BaseOptions(): 9 | def __init__(self): 10 | self.parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 11 | self.initialized = False 12 | 13 | def initialize(self): 14 | self.parser.add_argument('--name', type=str, default="test", help='Name of this trial') 15 | self.parser.add_argument('--decomp_name', type=str, default="Decomp_SP001_SM001_H512", help='Name of autoencoder model') 16 | 17 | self.parser.add_argument("--gpu_id", type=int, nargs='+', default=(-1), help='GPU id') 18 | self.parser.add_argument("--distributed", action="store_true", help='Whether to use DDP training') 19 | self.parser.add_argument("--data_parallel", action="store_true", help="Whether to use DP training") 20 | 21 | self.parser.add_argument('--dataset_name', type=str, default='t2m', help='Dataset Name') 22 | self.parser.add_argument('--checkpoints_dir', type=str, default='./checkpoints', help='models are saved here') 23 | 24 | self.parser.add_argument("--unit_length", type=int, default=4, help="Motions are cropped to the maximum times of unit_length") 25 | self.parser.add_argument("--max_text_len", type=int, default=20, help="Maximum length of text description") 26 | 27 | 
self.parser.add_argument('--text_enc_mod', type=str, default='bigru') 28 | self.parser.add_argument('--estimator_mod', type=str, default='bigru') 29 | 30 | self.parser.add_argument('--dim_text_hidden', type=int, default=512, help='Dimension of hidden unit in text encoder') 31 | self.parser.add_argument('--dim_att_vec', type=int, default=512, help='Dimension of attention vector') 32 | self.parser.add_argument('--dim_z', type=int, default=128, help='Dimension of latent Gaussian vector') 33 | 34 | self.parser.add_argument('--n_layers_pri', type=int, default=1, help='Number of layers in prior network') 35 | self.parser.add_argument('--n_layers_pos', type=int, default=1, help='Number of layers in posterior network') 36 | self.parser.add_argument('--n_layers_dec', type=int, default=1, help='Number of layers in generator') 37 | 38 | self.parser.add_argument('--dim_pri_hidden', type=int, default=1024, help='Dimension of hidden unit in prior network') 39 | self.parser.add_argument('--dim_pos_hidden', type=int, default=1024, help='Dimension of hidden unit in posterior network') 40 | self.parser.add_argument('--dim_dec_hidden', type=int, default=1024, help='Dimension of hidden unit in generator') 41 | 42 | self.parser.add_argument('--dim_movement_enc_hidden', type=int, default=512, 43 | help='Dimension of hidden in AutoEncoder(encoder)') 44 | self.parser.add_argument('--dim_movement_dec_hidden', type=int, default=512, 45 | help='Dimension of hidden in AutoEncoder(decoder)') 46 | self.parser.add_argument('--dim_movement_latent', type=int, default=512, help='Dimension of motion snippet') 47 | 48 | self.initialized = True 49 | 50 | 51 | 52 | def parse(self): 53 | if not self.initialized: 54 | self.initialize() 55 | 56 | self.opt = self.parser.parse_args() 57 | 58 | self.opt.is_train = self.is_train 59 | 60 | args = vars(self.opt) 61 | if args["distributed"]: 62 | init_dist('slurm') 63 | rank, world_size = get_dist_info() 64 | if args["distributed"]: 65 | self.opt.gpu_id = range(world_size) 66 | elif self.opt.gpu_id != (-1): 67 | if len(self.opt.gpu_id) == 1: 68 | torch.cuda.set_device(self.opt.gpu_id[0]) 69 | else: 70 | assert args["data_parallel"] == False 71 | 72 | if rank == 0: 73 | print('------------ Options -------------') 74 | for k, v in sorted(args.items()): 75 | print('%s: %s' % (str(k), str(v))) 76 | print('-------------- End ----------------') 77 | if self.is_train: 78 | # save to the disk 79 | expr_dir = os.path.join(self.opt.checkpoints_dir, self.opt.dataset_name, self.opt.name) 80 | if not os.path.exists(expr_dir): 81 | os.makedirs(expr_dir) 82 | file_name = os.path.join(expr_dir, 'opt.txt') 83 | with open(file_name, 'wt') as opt_file: 84 | opt_file.write('------------ Options -------------\n') 85 | for k, v in sorted(args.items()): 86 | opt_file.write('%s: %s\n' % (str(k), str(v))) 87 | opt_file.write('-------------- End ----------------\n') 88 | if world_size > 1: 89 | dist.barrier() 90 | return self.opt 91 | -------------------------------------------------------------------------------- /text2motion/options/evaluate_options.py: -------------------------------------------------------------------------------- 1 | from options.base_options import BaseOptions 2 | 3 | 4 | class TestOptions(BaseOptions): 5 | def initialize(self): 6 | BaseOptions.initialize(self) 7 | self.parser.add_argument('--batch_size', type=int, default=1, help='Batch size') 8 | self.parser.add_argument('--start_mov_len', type=int, default=10) 9 | self.parser.add_argument('--est_length', action="store_true", 
help="Whether to use sampled motion length") 10 | self.parser.add_argument('--num_layers', type=int, default=8, help='num_layers of transformer') 11 | self.parser.add_argument('--latent_dim', type=int, default=512, help='latent_dim of transformer') 12 | self.parser.add_argument('--diffusion_steps', type=int, default=1000, help='diffusion_steps of transformer') 13 | self.parser.add_argument('--no_clip', action='store_true', help='whether use clip pretrain') 14 | self.parser.add_argument('--no_eff', action='store_true', help='whether use efficient attention') 15 | 16 | 17 | self.parser.add_argument('--repeat_times', type=int, default=3, help="Number of generation rounds for each text description") 18 | self.parser.add_argument('--split_file', type=str, default='test.txt') 19 | self.parser.add_argument('--text', type=str, default="", help='Text description for motion generation') 20 | self.parser.add_argument('--motion_length', type=int, default=0, help='Number of framese for motion generation') 21 | self.parser.add_argument('--text_file', type=str, default="", help='Path of text description for motion generation') 22 | self.parser.add_argument('--which_epoch', type=str, default="latest", help='Checkpoint that will be used') 23 | self.parser.add_argument('--result_path', type=str, default="./eval_results/", help='Path to save generation results') 24 | self.parser.add_argument('--num_results', type=int, default=40, help='Number of descriptions that will be used') 25 | self.parser.add_argument('--ext', type=str, default='default', help='Save file path extension') 26 | 27 | self.is_train = False 28 | -------------------------------------------------------------------------------- /text2motion/options/train_options.py: -------------------------------------------------------------------------------- 1 | from options.base_options import BaseOptions 2 | import argparse 3 | 4 | class TrainCompOptions(BaseOptions): 5 | def initialize(self): 6 | BaseOptions.initialize(self) 7 | self.parser.add_argument('--num_layers', type=int, default=8, help='num_layers of transformer') 8 | self.parser.add_argument('--latent_dim', type=int, default=512, help='latent_dim of transformer') 9 | self.parser.add_argument('--diffusion_steps', type=int, default=1000, help='diffusion_steps of transformer') 10 | self.parser.add_argument('--no_clip', action='store_true', help='whether use clip pretrain') 11 | self.parser.add_argument('--no_eff', action='store_true', help='whether use efficient attention') 12 | 13 | self.parser.add_argument('--num_epochs', type=int, default=50, help='Number of epochs') 14 | self.parser.add_argument('--lr', type=float, default=2e-4, help='Learning rate') 15 | self.parser.add_argument('--batch_size', type=int, default=32, help='Batch size per GPU') 16 | self.parser.add_argument('--times', type=int, default=1, help='times of dataset') 17 | 18 | self.parser.add_argument('--feat_bias', type=float, default=25, help='Scales for global motion features and foot contact') 19 | 20 | self.parser.add_argument('--is_continue', action="store_true", help='Is this trail continued from previous trail?') 21 | 22 | self.parser.add_argument('--log_every', type=int, default=50, help='Frequency of printing training progress (by iteration)') 23 | self.parser.add_argument('--save_every_e', type=int, default=5, help='Frequency of saving models (by epoch)') 24 | self.parser.add_argument('--eval_every_e', type=int, default=5, help='Frequency of animation results (by epoch)') 25 | self.parser.add_argument('--save_latest', 
type=int, default=500, help='Frequency of saving models (by iteration)') 26 | self.is_train = True 27 | -------------------------------------------------------------------------------- /text2motion/requirements.txt: -------------------------------------------------------------------------------- 1 | tqdm 2 | opencv-python 3 | scipy 4 | matplotlib==3.3.1 5 | spacy 6 | git+https://github.com/openai/CLIP.git -------------------------------------------------------------------------------- /text2motion/tools/evaluation.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | import numpy as np 3 | import torch 4 | from datasets import get_dataset_motion_loader, get_motion_loader 5 | from models import MotionTransformer 6 | from utils.get_opt import get_opt 7 | from utils.metrics import * 8 | from datasets import EvaluatorModelWrapper 9 | from collections import OrderedDict 10 | from utils.plot_script import * 11 | from utils import paramUtil 12 | from utils.utils import * 13 | from trainers import DDPMTrainer 14 | 15 | from os.path import join as pjoin 16 | import sys 17 | 18 | 19 | def build_models(opt, dim_pose): 20 | encoder = MotionTransformer( 21 | input_feats=dim_pose, 22 | num_frames=opt.max_motion_length, 23 | num_layers=opt.num_layers, 24 | latent_dim=opt.latent_dim, 25 | no_clip=opt.no_clip, 26 | no_eff=opt.no_eff) 27 | return encoder 28 | 29 | 30 | torch.multiprocessing.set_sharing_strategy('file_system') 31 | 32 | 33 | def evaluate_matching_score(motion_loaders, file): 34 | match_score_dict = OrderedDict({}) 35 | R_precision_dict = OrderedDict({}) 36 | activation_dict = OrderedDict({}) 37 | # print(motion_loaders.keys()) 38 | print('========== Evaluating Matching Score ==========') 39 | for motion_loader_name, motion_loader in motion_loaders.items(): 40 | all_motion_embeddings = [] 41 | score_list = [] 42 | all_size = 0 43 | matching_score_sum = 0 44 | top_k_count = 0 45 | # print(motion_loader_name) 46 | with torch.no_grad(): 47 | for idx, batch in enumerate(motion_loader): 48 | word_embeddings, pos_one_hots, _, sent_lens, motions, m_lens, _ = batch 49 | text_embeddings, motion_embeddings = eval_wrapper.get_co_embeddings( 50 | word_embs=word_embeddings, 51 | pos_ohot=pos_one_hots, 52 | cap_lens=sent_lens, 53 | motions=motions, 54 | m_lens=m_lens 55 | ) 56 | dist_mat = euclidean_distance_matrix(text_embeddings.cpu().numpy(), 57 | motion_embeddings.cpu().numpy()) 58 | matching_score_sum += dist_mat.trace() 59 | 60 | argsmax = np.argsort(dist_mat, axis=1) 61 | top_k_mat = calculate_top_k(argsmax, top_k=3) 62 | top_k_count += top_k_mat.sum(axis=0) 63 | 64 | all_size += text_embeddings.shape[0] 65 | 66 | all_motion_embeddings.append(motion_embeddings.cpu().numpy()) 67 | 68 | all_motion_embeddings = np.concatenate(all_motion_embeddings, axis=0) 69 | matching_score = matching_score_sum / all_size 70 | R_precision = top_k_count / all_size 71 | match_score_dict[motion_loader_name] = matching_score 72 | R_precision_dict[motion_loader_name] = R_precision 73 | activation_dict[motion_loader_name] = all_motion_embeddings 74 | 75 | print(f'---> [{motion_loader_name}] Matching Score: {matching_score:.4f}') 76 | print(f'---> [{motion_loader_name}] Matching Score: {matching_score:.4f}', file=file, flush=True) 77 | 78 | line = f'---> [{motion_loader_name}] R_precision: ' 79 | for i in range(len(R_precision)): 80 | line += '(top %d): %.4f ' % (i+1, R_precision[i]) 81 | print(line) 82 | print(line, file=file, flush=True) 83 | 84 | return 
match_score_dict, R_precision_dict, activation_dict 85 | 86 | 87 | def evaluate_fid(groundtruth_loader, activation_dict, file): 88 | eval_dict = OrderedDict({}) 89 | gt_motion_embeddings = [] 90 | print('========== Evaluating FID ==========') 91 | with torch.no_grad(): 92 | for idx, batch in enumerate(groundtruth_loader): 93 | _, _, _, sent_lens, motions, m_lens, _ = batch 94 | motion_embeddings = eval_wrapper.get_motion_embeddings( 95 | motions=motions, 96 | m_lens=m_lens 97 | ) 98 | gt_motion_embeddings.append(motion_embeddings.cpu().numpy()) 99 | gt_motion_embeddings = np.concatenate(gt_motion_embeddings, axis=0) 100 | gt_mu, gt_cov = calculate_activation_statistics(gt_motion_embeddings) 101 | 102 | # print(gt_mu) 103 | for model_name, motion_embeddings in activation_dict.items(): 104 | mu, cov = calculate_activation_statistics(motion_embeddings) 105 | # print(mu) 106 | fid = calculate_frechet_distance(gt_mu, gt_cov, mu, cov) 107 | print(f'---> [{model_name}] FID: {fid:.4f}') 108 | print(f'---> [{model_name}] FID: {fid:.4f}', file=file, flush=True) 109 | eval_dict[model_name] = fid 110 | return eval_dict 111 | 112 | 113 | def evaluate_diversity(activation_dict, file): 114 | eval_dict = OrderedDict({}) 115 | print('========== Evaluating Diversity ==========') 116 | for model_name, motion_embeddings in activation_dict.items(): 117 | diversity = calculate_diversity(motion_embeddings, diversity_times) 118 | eval_dict[model_name] = diversity 119 | print(f'---> [{model_name}] Diversity: {diversity:.4f}') 120 | print(f'---> [{model_name}] Diversity: {diversity:.4f}', file=file, flush=True) 121 | return eval_dict 122 | 123 | 124 | def evaluate_multimodality(mm_motion_loaders, file): 125 | eval_dict = OrderedDict({}) 126 | print('========== Evaluating MultiModality ==========') 127 | for model_name, mm_motion_loader in mm_motion_loaders.items(): 128 | mm_motion_embeddings = [] 129 | with torch.no_grad(): 130 | for idx, batch in enumerate(mm_motion_loader): 131 | # (1, mm_replications, dim_pos) 132 | motions, m_lens = batch 133 | motion_embedings = eval_wrapper.get_motion_embeddings(motions[0], m_lens[0]) 134 | mm_motion_embeddings.append(motion_embedings.unsqueeze(0)) 135 | if len(mm_motion_embeddings) == 0: 136 | multimodality = 0 137 | else: 138 | mm_motion_embeddings = torch.cat(mm_motion_embeddings, dim=0).cpu().numpy() 139 | multimodality = calculate_multimodality(mm_motion_embeddings, mm_num_times) 140 | print(f'---> [{model_name}] Multimodality: {multimodality:.4f}') 141 | print(f'---> [{model_name}] Multimodality: {multimodality:.4f}', file=file, flush=True) 142 | eval_dict[model_name] = multimodality 143 | return eval_dict 144 | 145 | 146 | def get_metric_statistics(values): 147 | mean = np.mean(values, axis=0) 148 | std = np.std(values, axis=0) 149 | conf_interval = 1.96 * std / np.sqrt(replication_times) 150 | return mean, conf_interval 151 | 152 | 153 | def evaluation(log_file): 154 | with open(log_file, 'w') as f: 155 | all_metrics = OrderedDict({'Matching Score': OrderedDict({}), 156 | 'R_precision': OrderedDict({}), 157 | 'FID': OrderedDict({}), 158 | 'Diversity': OrderedDict({}), 159 | 'MultiModality': OrderedDict({})}) 160 | for replication in range(replication_times): 161 | motion_loaders = {} 162 | mm_motion_loaders = {} 163 | motion_loaders['ground truth'] = gt_loader 164 | for motion_loader_name, motion_loader_getter in eval_motion_loaders.items(): 165 | motion_loader, mm_motion_loader = motion_loader_getter() 166 | motion_loaders[motion_loader_name] = motion_loader 167 | 
mm_motion_loaders[motion_loader_name] = mm_motion_loader 168 | 169 | print(f'==================== Replication {replication} ====================') 170 | print(f'==================== Replication {replication} ====================', file=f, flush=True) 171 | print(f'Time: {datetime.now()}') 172 | print(f'Time: {datetime.now()}', file=f, flush=True) 173 | mat_score_dict, R_precision_dict, acti_dict = evaluate_matching_score(motion_loaders, f) 174 | 175 | print(f'Time: {datetime.now()}') 176 | print(f'Time: {datetime.now()}', file=f, flush=True) 177 | fid_score_dict = evaluate_fid(gt_loader, acti_dict, f) 178 | 179 | print(f'Time: {datetime.now()}') 180 | print(f'Time: {datetime.now()}', file=f, flush=True) 181 | div_score_dict = evaluate_diversity(acti_dict, f) 182 | 183 | print(f'Time: {datetime.now()}') 184 | print(f'Time: {datetime.now()}', file=f, flush=True) 185 | mm_score_dict = evaluate_multimodality(mm_motion_loaders, f) 186 | 187 | print(f'!!! DONE !!!') 188 | print(f'!!! DONE !!!', file=f, flush=True) 189 | 190 | for key, item in mat_score_dict.items(): 191 | if key not in all_metrics['Matching Score']: 192 | all_metrics['Matching Score'][key] = [item] 193 | else: 194 | all_metrics['Matching Score'][key] += [item] 195 | 196 | for key, item in R_precision_dict.items(): 197 | if key not in all_metrics['R_precision']: 198 | all_metrics['R_precision'][key] = [item] 199 | else: 200 | all_metrics['R_precision'][key] += [item] 201 | 202 | for key, item in fid_score_dict.items(): 203 | if key not in all_metrics['FID']: 204 | all_metrics['FID'][key] = [item] 205 | else: 206 | all_metrics['FID'][key] += [item] 207 | 208 | for key, item in div_score_dict.items(): 209 | if key not in all_metrics['Diversity']: 210 | all_metrics['Diversity'][key] = [item] 211 | else: 212 | all_metrics['Diversity'][key] += [item] 213 | 214 | for key, item in mm_score_dict.items(): 215 | if key not in all_metrics['MultiModality']: 216 | all_metrics['MultiModality'][key] = [item] 217 | else: 218 | all_metrics['MultiModality'][key] += [item] 219 | 220 | 221 | # print(all_metrics['Diversity']) 222 | for metric_name, metric_dict in all_metrics.items(): 223 | print('========== %s Summary ==========' % metric_name) 224 | print('========== %s Summary ==========' % metric_name, file=f, flush=True) 225 | 226 | for model_name, values in metric_dict.items(): 227 | # print(metric_name, model_name) 228 | mean, conf_interval = get_metric_statistics(np.array(values)) 229 | # print(mean, mean.dtype) 230 | if isinstance(mean, np.float64) or isinstance(mean, np.float32): 231 | print(f'---> [{model_name}] Mean: {mean:.4f} CInterval: {conf_interval:.4f}') 232 | print(f'---> [{model_name}] Mean: {mean:.4f} CInterval: {conf_interval:.4f}', file=f, flush=True) 233 | elif isinstance(mean, np.ndarray): 234 | line = f'---> [{model_name}]' 235 | for i in range(len(mean)): 236 | line += '(top %d) Mean: %.4f CInt: %.4f;' % (i+1, mean[i], conf_interval[i]) 237 | print(line) 238 | print(line, file=f, flush=True) 239 | 240 | 241 | if __name__ == '__main__': 242 | mm_num_samples = 100 243 | mm_num_repeats = 30 244 | mm_num_times = 10 245 | 246 | diversity_times = 300 247 | replication_times = 1 248 | batch_size = 32 249 | opt_path = sys.argv[1] 250 | dataset_opt_path = opt_path 251 | 252 | try: 253 | device_id = int(sys.argv[2]) 254 | except: 255 | device_id = 0 256 | device = torch.device('cuda:%d' % device_id if torch.cuda.is_available() else 'cpu') 257 | torch.cuda.set_device(device_id) 258 | 259 | gt_loader, gt_dataset = 
get_dataset_motion_loader(dataset_opt_path, batch_size, device) 260 | wrapper_opt = get_opt(dataset_opt_path, device) 261 | eval_wrapper = EvaluatorModelWrapper(wrapper_opt) 262 | 263 | opt = get_opt(opt_path, device) 264 | encoder = build_models(opt, opt.dim_pose) 265 | trainer = DDPMTrainer(opt, encoder) 266 | eval_motion_loaders = { 267 | 'text2motion': lambda: get_motion_loader( 268 | opt, 269 | batch_size, 270 | trainer, 271 | gt_dataset, 272 | mm_num_samples, 273 | mm_num_repeats 274 | ) 275 | } 276 | 277 | log_file = './t2m_evaluation.log' 278 | evaluation(log_file) 279 | -------------------------------------------------------------------------------- /text2motion/tools/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | from os.path import join as pjoin 3 | 4 | import utils.paramUtil as paramUtil 5 | from options.train_options import TrainCompOptions 6 | from utils.plot_script import * 7 | 8 | from models import MotionTransformer 9 | from trainers import DDPMTrainer 10 | from datasets import Text2MotionDataset 11 | 12 | from mmcv.runner import get_dist_info, init_dist 13 | from mmcv.parallel import MMDistributedDataParallel, MMDataParallel 14 | import torch 15 | import torch.distributed as dist 16 | 17 | 18 | def build_models(opt, dim_pose): 19 | encoder = MotionTransformer( 20 | input_feats=dim_pose, 21 | num_frames=opt.max_motion_length, 22 | num_layers=opt.num_layers, 23 | latent_dim=opt.latent_dim, 24 | no_clip=opt.no_clip, 25 | no_eff=opt.no_eff) 26 | return encoder 27 | 28 | 29 | if __name__ == '__main__': 30 | parser = TrainCompOptions() 31 | opt = parser.parse() 32 | rank, world_size = get_dist_info() 33 | 34 | opt.device = torch.device("cuda") 35 | torch.autograd.set_detect_anomaly(True) 36 | 37 | opt.save_root = pjoin(opt.checkpoints_dir, opt.dataset_name, opt.name) 38 | opt.model_dir = pjoin(opt.save_root, 'model') 39 | opt.meta_dir = pjoin(opt.save_root, 'meta') 40 | 41 | if rank == 0: 42 | os.makedirs(opt.model_dir, exist_ok=True) 43 | os.makedirs(opt.meta_dir, exist_ok=True) 44 | if world_size > 1: 45 | dist.barrier() 46 | 47 | if opt.dataset_name == 't2m': 48 | opt.data_root = './data/HumanML3D' 49 | opt.motion_dir = pjoin(opt.data_root, 'new_joint_vecs') 50 | opt.text_dir = pjoin(opt.data_root, 'texts') 51 | opt.joints_num = 22 52 | radius = 4 53 | fps = 20 54 | opt.max_motion_length = 196 55 | dim_pose = 263 56 | kinematic_chain = paramUtil.t2m_kinematic_chain 57 | elif opt.dataset_name == 'kit': 58 | opt.data_root = './data/KIT-ML' 59 | opt.motion_dir = pjoin(opt.data_root, 'new_joint_vecs') 60 | opt.text_dir = pjoin(opt.data_root, 'texts') 61 | opt.joints_num = 21 62 | radius = 240 * 8 63 | fps = 12.5 64 | dim_pose = 251 65 | opt.max_motion_length = 196 66 | kinematic_chain = paramUtil.kit_kinematic_chain 67 | 68 | else: 69 | raise KeyError('Dataset Does Not Exist') 70 | 71 | dim_word = 300 72 | mean = np.load(pjoin(opt.data_root, 'Mean.npy')) 73 | std = np.load(pjoin(opt.data_root, 'Std.npy')) 74 | 75 | train_split_file = pjoin(opt.data_root, 'train.txt') 76 | 77 | encoder = build_models(opt, dim_pose) 78 | if world_size > 1: 79 | encoder = MMDistributedDataParallel( 80 | encoder.cuda(), 81 | device_ids=[torch.cuda.current_device()], 82 | broadcast_buffers=False, 83 | find_unused_parameters=True) 84 | elif opt.data_parallel: 85 | encoder = MMDataParallel( 86 | encoder.cuda(opt.gpu_id[0]), device_ids=opt.gpu_id) 87 | else: 88 | encoder = encoder.cuda() 89 | 90 | trainer = DDPMTrainer(opt, encoder) 91 | 
train_dataset = Text2MotionDataset(opt, mean, std, train_split_file, opt.times) 92 | trainer.train(train_dataset) 93 | -------------------------------------------------------------------------------- /text2motion/tools/visualization.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import numpy as np 4 | import argparse 5 | from os.path import join as pjoin 6 | 7 | import utils.paramUtil as paramUtil 8 | from torch.utils.data import DataLoader 9 | from utils.plot_script import * 10 | from utils.get_opt import get_opt 11 | from datasets.evaluator_models import MotionLenEstimatorBiGRU 12 | 13 | from trainers import DDPMTrainer 14 | from models import MotionTransformer 15 | from utils.word_vectorizer import WordVectorizer, POS_enumerator 16 | from utils.utils import * 17 | from utils.motion_process import recover_from_ric 18 | 19 | 20 | def plot_t2m(data, result_path, npy_path, caption): 21 | joint = recover_from_ric(torch.from_numpy(data).float(), opt.joints_num).numpy() 22 | joint = motion_temporal_filter(joint, sigma=1) 23 | plot_3d_motion(result_path, paramUtil.t2m_kinematic_chain, joint, title=caption, fps=20) 24 | if npy_path != "": 25 | np.save(npy_path, joint) 26 | 27 | 28 | def build_models(opt): 29 | encoder = MotionTransformer( 30 | input_feats=opt.dim_pose, 31 | num_frames=opt.max_motion_length, 32 | num_layers=opt.num_layers, 33 | latent_dim=opt.latent_dim, 34 | no_clip=opt.no_clip, 35 | no_eff=opt.no_eff) 36 | return encoder 37 | 38 | 39 | if __name__ == '__main__': 40 | parser = argparse.ArgumentParser() 41 | parser.add_argument('--opt_path', type=str, help='Opt path') 42 | parser.add_argument('--text', type=str, default="", help='Text description for motion generation') 43 | parser.add_argument('--motion_length', type=int, default=60, help='Number of frames for motion generation') 44 | parser.add_argument('--result_path', type=str, default="test_sample.gif", help='Path to save generation result') 45 | parser.add_argument('--npy_path', type=str, default="", help='Path to save 3D keypoints sequence') 46 | parser.add_argument('--gpu_id', type=int, default=-1, help="which gpu to use") 47 | args = parser.parse_args() 48 | 49 | device = torch.device('cuda:%d' % args.gpu_id if args.gpu_id != -1 else 'cpu') 50 | opt = get_opt(args.opt_path, device) 51 | opt.do_denoise = True 52 | 53 | assert opt.dataset_name == "t2m" 54 | assert args.motion_length <= 196 55 | opt.data_root = './dataset/HumanML3D' 56 | opt.motion_dir = pjoin(opt.data_root, 'new_joint_vecs') 57 | opt.text_dir = pjoin(opt.data_root, 'texts') 58 | opt.joints_num = 22 59 | opt.dim_pose = 263 60 | dim_word = 300 61 | dim_pos_ohot = len(POS_enumerator) 62 | num_classes = 200 // opt.unit_length 63 | 64 | mean = np.load(pjoin(opt.meta_dir, 'mean.npy')) 65 | std = np.load(pjoin(opt.meta_dir, 'std.npy')) 66 | 67 | encoder = build_models(opt).to(device) 68 | trainer = DDPMTrainer(opt, encoder) 69 | trainer.load(pjoin(opt.model_dir, opt.which_epoch + '.tar')) 70 | 71 | trainer.eval_mode() 72 | trainer.to(opt.device) 73 | 74 | result_dict = {} 75 | with torch.no_grad(): 76 | if args.motion_length != -1: 77 | caption = [args.text] 78 | m_lens = torch.LongTensor([args.motion_length]).to(device) 79 | pred_motions = trainer.generate(caption, m_lens, opt.dim_pose) 80 | motion = pred_motions[0].cpu().numpy() 81 | motion = motion * std + mean 82 | title = args.text + " #%d" % motion.shape[0] 83 | plot_t2m(motion, args.result_path, args.npy_path, title) 84 | 
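A minimal sketch of how the visualization script above might be invoked, assuming the pretrained HumanML3D checkpoint has been arranged under `checkpoints/t2m/t2m_motiondiffuse` as described in install.md; the flag names come from the argparse definitions in `tools/visualization.py`, while the text prompt and output file names are illustrative placeholders:

```shell
# Sketch only: generate one motion clip from a text prompt.
# Paths assume the checkpoint layout from install.md; prompt and
# output names are placeholders.
cd text2motion
python -u tools/visualization.py \
    --opt_path checkpoints/t2m/t2m_motiondiffuse/opt.txt \
    --text "a person is walking happily" \
    --motion_length 120 \
    --result_path "walk_happily.gif" \
    --npy_path "walk_happily.npy" \
    --gpu_id 0
```

Per the assertions and defaults in the script, only the `t2m` checkpoint is supported here, `--motion_length` must not exceed 196 frames, `--gpu_id -1` falls back to CPU, and `--npy_path` is optional (when given, the recovered 3D joint sequence is also saved as a `.npy` file).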
-------------------------------------------------------------------------------- /text2motion/trainers/__init__.py: -------------------------------------------------------------------------------- 1 | from .ddpm_trainer import DDPMTrainer 2 | 3 | 4 | __all__ = ['DDPMTrainer'] -------------------------------------------------------------------------------- /text2motion/trainers/ddpm_trainer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import random 4 | import time 5 | from models.transformer import MotionTransformer 6 | from torch.utils.data import DataLoader 7 | import torch.optim as optim 8 | from torch.nn.utils import clip_grad_norm_ 9 | from collections import OrderedDict 10 | from utils.utils import print_current_loss 11 | from os.path import join as pjoin 12 | import codecs as cs 13 | import torch.distributed as dist 14 | 15 | 16 | from mmcv.runner import get_dist_info 17 | from models.gaussian_diffusion import ( 18 | GaussianDiffusion, 19 | get_named_beta_schedule, 20 | create_named_schedule_sampler, 21 | ModelMeanType, 22 | ModelVarType, 23 | LossType 24 | ) 25 | 26 | from datasets import build_dataloader 27 | 28 | 29 | class DDPMTrainer(object): 30 | 31 | def __init__(self, args, encoder): 32 | self.opt = args 33 | self.device = args.device 34 | self.encoder = encoder 35 | self.diffusion_steps = args.diffusion_steps 36 | sampler = 'uniform' 37 | beta_scheduler = 'linear' 38 | betas = get_named_beta_schedule(beta_scheduler, self.diffusion_steps) 39 | self.diffusion = GaussianDiffusion( 40 | betas=betas, 41 | model_mean_type=ModelMeanType.EPSILON, 42 | model_var_type=ModelVarType.FIXED_SMALL, 43 | loss_type=LossType.MSE 44 | ) 45 | self.sampler = create_named_schedule_sampler(sampler, self.diffusion) 46 | self.sampler_name = sampler 47 | 48 | if args.is_train: 49 | self.mse_criterion = torch.nn.MSELoss(reduction='none') 50 | self.to(self.device) 51 | 52 | @staticmethod 53 | def zero_grad(opt_list): 54 | for opt in opt_list: 55 | opt.zero_grad() 56 | 57 | @staticmethod 58 | def clip_norm(network_list): 59 | for network in network_list: 60 | clip_grad_norm_(network.parameters(), 0.5) 61 | 62 | @staticmethod 63 | def step(opt_list): 64 | for opt in opt_list: 65 | opt.step() 66 | 67 | def forward(self, batch_data, eval_mode=False): 68 | caption, motions, m_lens = batch_data 69 | motions = motions.detach().to(self.device).float() 70 | 71 | self.caption = caption 72 | self.motions = motions 73 | x_start = motions 74 | B, T = x_start.shape[:2] 75 | cur_len = torch.LongTensor([min(T, m_len) for m_len in m_lens]).to(self.device) 76 | t, _ = self.sampler.sample(B, x_start.device) 77 | output = self.diffusion.training_losses( 78 | model=self.encoder, 79 | x_start=x_start, 80 | t=t, 81 | model_kwargs={"text": caption, "length": cur_len} 82 | ) 83 | 84 | self.real_noise = output['target'] 85 | self.fake_noise = output['pred'] 86 | try: 87 | self.src_mask = self.encoder.module.generate_src_mask(T, cur_len).to(x_start.device) 88 | except: 89 | self.src_mask = self.encoder.generate_src_mask(T, cur_len).to(x_start.device) 90 | 91 | def generate_batch(self, caption, m_lens, dim_pose): 92 | xf_proj, xf_out = self.encoder.encode_text(caption, self.device) 93 | 94 | B = len(caption) 95 | T = min(m_lens.max(), self.encoder.num_frames) 96 | output = self.diffusion.p_sample_loop( 97 | self.encoder, 98 | (B, T, dim_pose), 99 | clip_denoised=False, 100 | progress=True, 101 | model_kwargs={ 102 | 'xf_proj': 
xf_proj, 103 | 'xf_out': xf_out, 104 | 'length': m_lens 105 | }) 106 | return output 107 | 108 | def generate(self, caption, m_lens, dim_pose, batch_size=1024): 109 | N = len(caption) 110 | cur_idx = 0 111 | self.encoder.eval() 112 | all_output = [] 113 | while cur_idx < N: 114 | if cur_idx + batch_size >= N: 115 | batch_caption = caption[cur_idx:] 116 | batch_m_lens = m_lens[cur_idx:] 117 | else: 118 | batch_caption = caption[cur_idx: cur_idx + batch_size] 119 | batch_m_lens = m_lens[cur_idx: cur_idx + batch_size] 120 | output = self.generate_batch(batch_caption, batch_m_lens, dim_pose) 121 | B = output.shape[0] 122 | 123 | for i in range(B): 124 | all_output.append(output[i]) 125 | cur_idx += batch_size 126 | return all_output 127 | 128 | def backward_G(self): 129 | loss_mot_rec = self.mse_criterion(self.fake_noise, self.real_noise).mean(dim=-1) 130 | loss_mot_rec = (loss_mot_rec * self.src_mask).sum() / self.src_mask.sum() 131 | self.loss_mot_rec = loss_mot_rec 132 | loss_logs = OrderedDict({}) 133 | loss_logs['loss_mot_rec'] = self.loss_mot_rec.item() 134 | return loss_logs 135 | 136 | def update(self): 137 | self.zero_grad([self.opt_encoder]) 138 | loss_logs = self.backward_G() 139 | self.loss_mot_rec.backward() 140 | self.clip_norm([self.encoder]) 141 | self.step([self.opt_encoder]) 142 | 143 | return loss_logs 144 | 145 | def to(self, device): 146 | if self.opt.is_train: 147 | self.mse_criterion.to(device) 148 | self.encoder = self.encoder.to(device) 149 | 150 | def train_mode(self): 151 | self.encoder.train() 152 | 153 | def eval_mode(self): 154 | self.encoder.eval() 155 | 156 | def save(self, file_name, ep, total_it): 157 | state = { 158 | 'opt_encoder': self.opt_encoder.state_dict(), 159 | 'ep': ep, 160 | 'total_it': total_it 161 | } 162 | try: 163 | state['encoder'] = self.encoder.module.state_dict() 164 | except: 165 | state['encoder'] = self.encoder.state_dict() 166 | torch.save(state, file_name) 167 | return 168 | 169 | def load(self, model_dir): 170 | checkpoint = torch.load(model_dir, map_location=self.device) 171 | if self.opt.is_train: 172 | self.opt_encoder.load_state_dict(checkpoint['opt_encoder']) 173 | self.encoder.load_state_dict(checkpoint['encoder'], strict=True) 174 | return checkpoint['ep'], checkpoint.get('total_it', 0) 175 | 176 | def train(self, train_dataset): 177 | rank, world_size = get_dist_info() 178 | self.to(self.device) 179 | self.opt_encoder = optim.Adam(self.encoder.parameters(), lr=self.opt.lr) 180 | it = 0 181 | cur_epoch = 0 182 | if self.opt.is_continue: 183 | model_dir = pjoin(self.opt.model_dir, 'latest.tar') 184 | cur_epoch, it = self.load(model_dir) 185 | 186 | start_time = time.time() 187 | 188 | train_loader = build_dataloader( 189 | train_dataset, 190 | samples_per_gpu=self.opt.batch_size, 191 | drop_last=True, 192 | workers_per_gpu=4, 193 | shuffle=True, 194 | dist=self.opt.distributed, 195 | num_gpus=len(self.opt.gpu_id)) 196 | 197 | logs = OrderedDict() 198 | for epoch in range(cur_epoch, self.opt.num_epochs): 199 | self.train_mode() 200 | for i, batch_data in enumerate(train_loader): 201 | self.forward(batch_data) 202 | log_dict = self.update() 203 | for k, v in log_dict.items(): 204 | if k not in logs: 205 | logs[k] = v 206 | else: 207 | logs[k] += v 208 | it += 1 209 | if it % self.opt.log_every == 0 and rank == 0: 210 | mean_loss = OrderedDict({}) 211 | for tag, value in logs.items(): 212 | mean_loss[tag] = value / self.opt.log_every 213 | logs = OrderedDict() 214 | print_current_loss(start_time, it, mean_loss, epoch, inner_iter=i) 
215 | 216 | if it % self.opt.save_latest == 0 and rank == 0: 217 | self.save(pjoin(self.opt.model_dir, 'latest.tar'), epoch, it) 218 | 219 | if rank == 0: 220 | self.save(pjoin(self.opt.model_dir, 'latest.tar'), epoch, it) 221 | 222 | if epoch % self.opt.save_every_e == 0 and rank == 0: 223 | self.save(pjoin(self.opt.model_dir, 'ckpt_e%03d.tar'%(epoch)), 224 | epoch, total_it=it) 225 | -------------------------------------------------------------------------------- /text2motion/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mingyuan-zhang/MotionDiffuse/aba848edd22133919ca96b67f4908399c67685b1/text2motion/utils/__init__.py -------------------------------------------------------------------------------- /text2motion/utils/get_opt.py: -------------------------------------------------------------------------------- 1 | import os 2 | from argparse import Namespace 3 | import re 4 | from os.path import join as pjoin 5 | from utils.word_vectorizer import POS_enumerator 6 | 7 | 8 | def is_float(numStr): 9 | flag = False 10 | numStr = str(numStr).strip().lstrip('-').lstrip('+') 11 | try: 12 | reg = re.compile(r'^[-+]?[0-9]+\.[0-9]+$') 13 | res = reg.match(str(numStr)) 14 | if res: 15 | flag = True 16 | except Exception as ex: 17 | print("is_float() - error: " + str(ex)) 18 | return flag 19 | 20 | 21 | def is_number(numStr): 22 | flag = False 23 | numStr = str(numStr).strip().lstrip('-').lstrip('+') 24 | if str(numStr).isdigit(): 25 | flag = True 26 | return flag 27 | 28 | 29 | def get_opt(opt_path, device): 30 | opt = Namespace() 31 | opt_dict = vars(opt) 32 | 33 | skip = ('-------------- End ----------------', 34 | '------------ Options -------------', 35 | '\n') 36 | print('Reading', opt_path) 37 | with open(opt_path) as f: 38 | for line in f: 39 | if line.strip() not in skip: 40 | # print(line.strip()) 41 | key, value = line.strip().split(': ') 42 | if value in ('True', 'False'): 43 | opt_dict[key] = True if value == 'True' else False 44 | elif is_float(value): 45 | opt_dict[key] = float(value) 46 | elif is_number(value): 47 | opt_dict[key] = int(value) 48 | else: 49 | opt_dict[key] = str(value) 50 | 51 | opt_dict['which_epoch'] = 'latest' 52 | if 'num_layers' not in opt_dict: 53 | opt_dict['num_layers'] = 8 54 | if 'latent_dim' not in opt_dict: 55 | opt_dict['latent_dim'] = 512 56 | if 'diffusion_steps' not in opt_dict: 57 | opt_dict['diffusion_steps'] = 1000 58 | if 'no_clip' not in opt_dict: 59 | opt_dict['no_clip'] = False 60 | if 'no_eff' not in opt_dict: 61 | opt_dict['no_eff'] = False 62 | 63 | opt.save_root = pjoin(opt.checkpoints_dir, opt.dataset_name, opt.name) 64 | opt.model_dir = pjoin(opt.save_root, 'model') 65 | opt.meta_dir = pjoin(opt.save_root, 'meta') 66 | 67 | if opt.dataset_name == 't2m': 68 | opt.data_root = './data/HumanML3D' 69 | opt.motion_dir = pjoin(opt.data_root, 'new_joint_vecs') 70 | opt.text_dir = pjoin(opt.data_root, 'texts') 71 | opt.joints_num = 22 72 | opt.dim_pose = 263 73 | opt.max_motion_length = 196 74 | elif opt.dataset_name == 'kit': 75 | opt.data_root = './data/KIT-ML' 76 | opt.motion_dir = pjoin(opt.data_root, 'new_joint_vecs') 77 | opt.text_dir = pjoin(opt.data_root, 'texts') 78 | opt.joints_num = 21 79 | opt.dim_pose = 251 80 | opt.max_motion_length = 196 81 | else: 82 | raise KeyError('Dataset not recognized') 83 | 84 | opt.dim_word = 300 85 | opt.num_classes = 200 // opt.unit_length 86 | opt.dim_pos_ohot = len(POS_enumerator) 87 | opt.is_train = False 88 | 
opt.is_continue = False 89 | opt.device = device 90 | 91 | return opt -------------------------------------------------------------------------------- /text2motion/utils/metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy import linalg 3 | 4 | 5 | # (X - X_train)*(X - X_train) = -2X*X_train + X*X + X_train*X_train 6 | def euclidean_distance_matrix(matrix1, matrix2): 7 | """ 8 | Params: 9 | -- matrix1: N1 x D 10 | -- matrix2: N2 x D 11 | Returns: 12 | -- dist: N1 x N2 13 | dist[i, j] == distance(matrix1[i], matrix2[j]) 14 | """ 15 | assert matrix1.shape[1] == matrix2.shape[1] 16 | d1 = -2 * np.dot(matrix1, matrix2.T) # shape (num_test, num_train) 17 | d2 = np.sum(np.square(matrix1), axis=1, keepdims=True) # shape (num_test, 1) 18 | d3 = np.sum(np.square(matrix2), axis=1) # shape (num_train, ) 19 | dists = np.sqrt(d1 + d2 + d3) # broadcasting 20 | return dists 21 | 22 | def calculate_top_k(mat, top_k): 23 | size = mat.shape[0] 24 | gt_mat = np.expand_dims(np.arange(size), 1).repeat(size, 1) 25 | bool_mat = (mat == gt_mat) 26 | correct_vec = False 27 | top_k_list = [] 28 | for i in range(top_k): 29 | # print(correct_vec, bool_mat[:, i]) 30 | correct_vec = (correct_vec | bool_mat[:, i]) 31 | # print(correct_vec) 32 | top_k_list.append(correct_vec[:, None]) 33 | top_k_mat = np.concatenate(top_k_list, axis=1) 34 | return top_k_mat 35 | 36 | 37 | def calculate_R_precision(embedding1, embedding2, top_k, sum_all=False): 38 | dist_mat = euclidean_distance_matrix(embedding1, embedding2) 39 | argmax = np.argsort(dist_mat, axis=1) 40 | top_k_mat = calculate_top_k(argmax, top_k) 41 | if sum_all: 42 | return top_k_mat.sum(axis=0) 43 | else: 44 | return top_k_mat 45 | 46 | 47 | def calculate_matching_score(embedding1, embedding2, sum_all=False): 48 | assert len(embedding1.shape) == 2 49 | assert embedding1.shape[0] == embedding2.shape[0] 50 | assert embedding1.shape[1] == embedding2.shape[1] 51 | 52 | dist = linalg.norm(embedding1 - embedding2, axis=1) 53 | if sum_all: 54 | return dist.sum(axis=0) 55 | else: 56 | return dist 57 | 58 | 59 | 60 | def calculate_activation_statistics(activations): 61 | """ 62 | Params: 63 | -- activation: num_samples x dim_feat 64 | Returns: 65 | -- mu: dim_feat 66 | -- sigma: dim_feat x dim_feat 67 | """ 68 | mu = np.mean(activations, axis=0) 69 | cov = np.cov(activations, rowvar=False) 70 | return mu, cov 71 | 72 | 73 | def calculate_diversity(activation, diversity_times): 74 | assert len(activation.shape) == 2 75 | assert activation.shape[0] > diversity_times 76 | num_samples = activation.shape[0] 77 | 78 | first_indices = np.random.choice(num_samples, diversity_times, replace=False) 79 | second_indices = np.random.choice(num_samples, diversity_times, replace=False) 80 | dist = linalg.norm(activation[first_indices] - activation[second_indices], axis=1) 81 | return dist.mean() 82 | 83 | 84 | def calculate_multimodality(activation, multimodality_times): 85 | assert len(activation.shape) == 3 86 | assert activation.shape[1] > multimodality_times 87 | num_per_sent = activation.shape[1] 88 | 89 | first_dices = np.random.choice(num_per_sent, multimodality_times, replace=False) 90 | second_dices = np.random.choice(num_per_sent, multimodality_times, replace=False) 91 | dist = linalg.norm(activation[:, first_dices] - activation[:, second_dices], axis=2) 92 | return dist.mean() 93 | 94 | 95 | def calculate_frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6): 96 | """Numpy implementation of the Frechet 
Distance. 97 | The Frechet distance between two multivariate Gaussians X_1 ~ N(mu_1, C_1) 98 | and X_2 ~ N(mu_2, C_2) is 99 | d^2 = ||mu_1 - mu_2||^2 + Tr(C_1 + C_2 - 2*sqrt(C_1*C_2)). 100 | Stable version by Dougal J. Sutherland. 101 | Params: 102 | -- mu1 : Numpy array containing the activations of a layer of the 103 | inception net (like returned by the function 'get_predictions') 104 | for generated samples. 105 | -- mu2 : The sample mean over activations, precalculated on an 106 | representative data set. 107 | -- sigma1: The covariance matrix over activations for generated samples. 108 | -- sigma2: The covariance matrix over activations, precalculated on an 109 | representative data set. 110 | Returns: 111 | -- : The Frechet Distance. 112 | """ 113 | 114 | mu1 = np.atleast_1d(mu1) 115 | mu2 = np.atleast_1d(mu2) 116 | 117 | sigma1 = np.atleast_2d(sigma1) 118 | sigma2 = np.atleast_2d(sigma2) 119 | 120 | assert mu1.shape == mu2.shape, \ 121 | 'Training and test mean vectors have different lengths' 122 | assert sigma1.shape == sigma2.shape, \ 123 | 'Training and test covariances have different dimensions' 124 | 125 | diff = mu1 - mu2 126 | 127 | # Product might be almost singular 128 | covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False) 129 | if not np.isfinite(covmean).all(): 130 | msg = ('fid calculation produces singular product; ' 131 | 'adding %s to diagonal of cov estimates') % eps 132 | print(msg) 133 | offset = np.eye(sigma1.shape[0]) * eps 134 | covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset)) 135 | 136 | # Numerical error might give slight imaginary component 137 | if np.iscomplexobj(covmean): 138 | if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-3): 139 | m = np.max(np.abs(covmean.imag)) 140 | raise ValueError('Imaginary component {}'.format(m)) 141 | covmean = covmean.real 142 | 143 | tr_covmean = np.trace(covmean) 144 | 145 | return (diff.dot(diff) + np.trace(sigma1) + 146 | np.trace(sigma2) - 2 * tr_covmean) -------------------------------------------------------------------------------- /text2motion/utils/motion_process.py: -------------------------------------------------------------------------------- 1 | from os.path import join as pjoin 2 | 3 | import numpy as np 4 | import os 5 | from utils.quaternion import * 6 | from utils.skeleton import Skeleton 7 | from utils.paramUtil import * 8 | 9 | import torch 10 | from tqdm import tqdm 11 | 12 | # positions (batch, joint_num, 3) 13 | def uniform_skeleton(positions, target_offset): 14 | src_skel = Skeleton(n_raw_offsets, kinematic_chain, 'cpu') 15 | src_offset = src_skel.get_offsets_joints(torch.from_numpy(positions[0])) 16 | src_offset = src_offset.numpy() 17 | tgt_offset = target_offset.numpy() 18 | # print(src_offset) 19 | # print(tgt_offset) 20 | '''Calculate Scale Ratio as the ratio of legs''' 21 | src_leg_len = np.abs(src_offset[l_idx1]).max() + np.abs(src_offset[l_idx2]).max() 22 | tgt_leg_len = np.abs(tgt_offset[l_idx1]).max() + np.abs(tgt_offset[l_idx2]).max() 23 | 24 | scale_rt = tgt_leg_len / src_leg_len 25 | # print(scale_rt) 26 | src_root_pos = positions[:, 0] 27 | tgt_root_pos = src_root_pos * scale_rt 28 | 29 | '''Inverse Kinematics''' 30 | quat_params = src_skel.inverse_kinematics_np(positions, face_joint_indx) 31 | # print(quat_params.shape) 32 | 33 | '''Forward Kinematics''' 34 | src_skel.set_offset(target_offset) 35 | new_joints = src_skel.forward_kinematics_np(quat_params, tgt_root_pos) 36 | return new_joints 37 | 38 | 39 | def extract_features(positions, feet_thre, 
n_raw_offsets, kinematic_chain, face_joint_indx, fid_r, fid_l): 40 | global_positions = positions.copy() 41 | """ Get Foot Contacts """ 42 | 43 | def foot_detect(positions, thres): 44 | velfactor, heightfactor = np.array([thres, thres]), np.array([3.0, 2.0]) 45 | 46 | feet_l_x = (positions[1:, fid_l, 0] - positions[:-1, fid_l, 0]) ** 2 47 | feet_l_y = (positions[1:, fid_l, 1] - positions[:-1, fid_l, 1]) ** 2 48 | feet_l_z = (positions[1:, fid_l, 2] - positions[:-1, fid_l, 2]) ** 2 49 | # feet_l_h = positions[:-1,fid_l,1] 50 | # feet_l = (((feet_l_x + feet_l_y + feet_l_z) < velfactor) & (feet_l_h < heightfactor)).astype(np.float) 51 | feet_l = ((feet_l_x + feet_l_y + feet_l_z) < velfactor).astype(np.float) 52 | 53 | feet_r_x = (positions[1:, fid_r, 0] - positions[:-1, fid_r, 0]) ** 2 54 | feet_r_y = (positions[1:, fid_r, 1] - positions[:-1, fid_r, 1]) ** 2 55 | feet_r_z = (positions[1:, fid_r, 2] - positions[:-1, fid_r, 2]) ** 2 56 | # feet_r_h = positions[:-1,fid_r,1] 57 | # feet_r = (((feet_r_x + feet_r_y + feet_r_z) < velfactor) & (feet_r_h < heightfactor)).astype(np.float) 58 | feet_r = (((feet_r_x + feet_r_y + feet_r_z) < velfactor)).astype(np.float) 59 | return feet_l, feet_r 60 | 61 | # 62 | feet_l, feet_r = foot_detect(positions, feet_thre) 63 | # feet_l, feet_r = foot_detect(positions, 0.002) 64 | 65 | '''Quaternion and Cartesian representation''' 66 | r_rot = None 67 | 68 | def get_rifke(positions): 69 | '''Local pose''' 70 | positions[..., 0] -= positions[:, 0:1, 0] 71 | positions[..., 2] -= positions[:, 0:1, 2] 72 | '''All pose face Z+''' 73 | positions = qrot_np(np.repeat(r_rot[:, None], positions.shape[1], axis=1), positions) 74 | return positions 75 | 76 | def get_quaternion(positions): 77 | skel = Skeleton(n_raw_offsets, kinematic_chain, "cpu") 78 | # (seq_len, joints_num, 4) 79 | quat_params = skel.inverse_kinematics_np(positions, face_joint_indx, smooth_forward=False) 80 | 81 | '''Fix Quaternion Discontinuity''' 82 | quat_params = qfix(quat_params) 83 | # (seq_len, 4) 84 | r_rot = quat_params[:, 0].copy() 85 | # print(r_rot[0]) 86 | '''Root Linear Velocity''' 87 | # (seq_len - 1, 3) 88 | velocity = (positions[1:, 0] - positions[:-1, 0]).copy() 89 | # print(r_rot.shape, velocity.shape) 90 | velocity = qrot_np(r_rot[1:], velocity) 91 | '''Root Angular Velocity''' 92 | # (seq_len - 1, 4) 93 | r_velocity = qmul_np(r_rot[1:], qinv_np(r_rot[:-1])) 94 | quat_params[1:, 0] = r_velocity 95 | # (seq_len, joints_num, 4) 96 | return quat_params, r_velocity, velocity, r_rot 97 | 98 | def get_cont6d_params(positions): 99 | skel = Skeleton(n_raw_offsets, kinematic_chain, "cpu") 100 | # (seq_len, joints_num, 4) 101 | quat_params = skel.inverse_kinematics_np(positions, face_joint_indx, smooth_forward=True) 102 | 103 | '''Quaternion to continuous 6D''' 104 | cont_6d_params = quaternion_to_cont6d_np(quat_params) 105 | # (seq_len, 4) 106 | r_rot = quat_params[:, 0].copy() 107 | # print(r_rot[0]) 108 | '''Root Linear Velocity''' 109 | # (seq_len - 1, 3) 110 | velocity = (positions[1:, 0] - positions[:-1, 0]).copy() 111 | # print(r_rot.shape, velocity.shape) 112 | velocity = qrot_np(r_rot[1:], velocity) 113 | '''Root Angular Velocity''' 114 | # (seq_len - 1, 4) 115 | r_velocity = qmul_np(r_rot[1:], qinv_np(r_rot[:-1])) 116 | # (seq_len, joints_num, 4) 117 | return cont_6d_params, r_velocity, velocity, r_rot 118 | 119 | cont_6d_params, r_velocity, velocity, r_rot = get_cont6d_params(positions) 120 | positions = get_rifke(positions) 121 | 122 | # trejec = np.cumsum(np.concatenate([np.array([[0, 
0, 0]]), velocity], axis=0), axis=0) 123 | # r_rotations, r_pos = recover_ric_glo_np(r_velocity, velocity[:, [0, 2]]) 124 | 125 | # plt.plot(positions_b[:, 0, 0], positions_b[:, 0, 2], marker='*') 126 | # plt.plot(ground_positions[:, 0, 0], ground_positions[:, 0, 2], marker='o', color='r') 127 | # plt.plot(trejec[:, 0], trejec[:, 2], marker='^', color='g') 128 | # plt.plot(r_pos[:, 0], r_pos[:, 2], marker='s', color='y') 129 | # plt.xlabel('x') 130 | # plt.ylabel('z') 131 | # plt.axis('equal') 132 | # plt.show() 133 | 134 | '''Root height''' 135 | root_y = positions[:, 0, 1:2] 136 | 137 | '''Root rotation and linear velocity''' 138 | # (seq_len-1, 1) rotation velocity along y-axis 139 | # (seq_len-1, 2) linear velovity on xz plane 140 | r_velocity = np.arcsin(r_velocity[:, 2:3]) 141 | l_velocity = velocity[:, [0, 2]] 142 | # print(r_velocity.shape, l_velocity.shape, root_y.shape) 143 | root_data = np.concatenate([r_velocity, l_velocity, root_y[:-1]], axis=-1) 144 | 145 | '''Get Joint Rotation Representation''' 146 | # (seq_len, (joints_num-1) *6) quaternion for skeleton joints 147 | rot_data = cont_6d_params[:, 1:].reshape(len(cont_6d_params), -1) 148 | 149 | '''Get Joint Rotation Invariant Position Represention''' 150 | # (seq_len, (joints_num-1)*3) local joint position 151 | ric_data = positions[:, 1:].reshape(len(positions), -1) 152 | 153 | '''Get Joint Velocity Representation''' 154 | # (seq_len-1, joints_num*3) 155 | local_vel = qrot_np(np.repeat(r_rot[:-1, None], global_positions.shape[1], axis=1), 156 | global_positions[1:] - global_positions[:-1]) 157 | local_vel = local_vel.reshape(len(local_vel), -1) 158 | 159 | data = root_data 160 | data = np.concatenate([data, ric_data[:-1]], axis=-1) 161 | data = np.concatenate([data, rot_data[:-1]], axis=-1) 162 | # print(data.shape, local_vel.shape) 163 | data = np.concatenate([data, local_vel], axis=-1) 164 | data = np.concatenate([data, feet_l, feet_r], axis=-1) 165 | 166 | return data 167 | 168 | 169 | def process_file(positions, feet_thre): 170 | # (seq_len, joints_num, 3) 171 | # '''Down Sample''' 172 | # positions = positions[::ds_num] 173 | 174 | '''Uniform Skeleton''' 175 | positions = uniform_skeleton(positions, tgt_offsets) 176 | 177 | '''Put on Floor''' 178 | floor_height = positions.min(axis=0).min(axis=0)[1] 179 | positions[:, :, 1] -= floor_height 180 | # print(floor_height) 181 | 182 | # plot_3d_motion("./positions_1.mp4", kinematic_chain, positions, 'title', fps=20) 183 | 184 | '''XZ at origin''' 185 | root_pos_init = positions[0] 186 | root_pose_init_xz = root_pos_init[0] * np.array([1, 0, 1]) 187 | positions = positions - root_pose_init_xz 188 | 189 | # '''Move the first pose to origin ''' 190 | # root_pos_init = positions[0] 191 | # positions = positions - root_pos_init[0] 192 | 193 | '''All initially face Z+''' 194 | r_hip, l_hip, sdr_r, sdr_l = face_joint_indx 195 | across1 = root_pos_init[r_hip] - root_pos_init[l_hip] 196 | across2 = root_pos_init[sdr_r] - root_pos_init[sdr_l] 197 | across = across1 + across2 198 | across = across / np.sqrt((across ** 2).sum(axis=-1))[..., np.newaxis] 199 | 200 | # forward (3,), rotate around y-axis 201 | forward_init = np.cross(np.array([[0, 1, 0]]), across, axis=-1) 202 | # forward (3,) 203 | forward_init = forward_init / np.sqrt((forward_init ** 2).sum(axis=-1))[..., np.newaxis] 204 | 205 | # print(forward_init) 206 | 207 | target = np.array([[0, 0, 1]]) 208 | root_quat_init = qbetween_np(forward_init, target) 209 | root_quat_init = np.ones(positions.shape[:-1] + (4,)) * 
root_quat_init 210 | 211 | positions_b = positions.copy() 212 | 213 | positions = qrot_np(root_quat_init, positions) 214 | 215 | # plot_3d_motion("./positions_2.mp4", kinematic_chain, positions, 'title', fps=20) 216 | 217 | '''New ground truth positions''' 218 | global_positions = positions.copy() 219 | 220 | # plt.plot(positions_b[:, 0, 0], positions_b[:, 0, 2], marker='*') 221 | # plt.plot(positions[:, 0, 0], positions[:, 0, 2], marker='o', color='r') 222 | # plt.xlabel('x') 223 | # plt.ylabel('z') 224 | # plt.axis('equal') 225 | # plt.show() 226 | 227 | """ Get Foot Contacts """ 228 | 229 | def foot_detect(positions, thres): 230 | velfactor, heightfactor = np.array([thres, thres]), np.array([3.0, 2.0]) 231 | 232 | feet_l_x = (positions[1:, fid_l, 0] - positions[:-1, fid_l, 0]) ** 2 233 | feet_l_y = (positions[1:, fid_l, 1] - positions[:-1, fid_l, 1]) ** 2 234 | feet_l_z = (positions[1:, fid_l, 2] - positions[:-1, fid_l, 2]) ** 2 235 | # feet_l_h = positions[:-1,fid_l,1] 236 | # feet_l = (((feet_l_x + feet_l_y + feet_l_z) < velfactor) & (feet_l_h < heightfactor)).astype(np.float) 237 | feet_l = ((feet_l_x + feet_l_y + feet_l_z) < velfactor).astype(np.float) 238 | 239 | feet_r_x = (positions[1:, fid_r, 0] - positions[:-1, fid_r, 0]) ** 2 240 | feet_r_y = (positions[1:, fid_r, 1] - positions[:-1, fid_r, 1]) ** 2 241 | feet_r_z = (positions[1:, fid_r, 2] - positions[:-1, fid_r, 2]) ** 2 242 | # feet_r_h = positions[:-1,fid_r,1] 243 | # feet_r = (((feet_r_x + feet_r_y + feet_r_z) < velfactor) & (feet_r_h < heightfactor)).astype(np.float) 244 | feet_r = (((feet_r_x + feet_r_y + feet_r_z) < velfactor)).astype(np.float) 245 | return feet_l, feet_r 246 | # 247 | feet_l, feet_r = foot_detect(positions, feet_thre) 248 | # feet_l, feet_r = foot_detect(positions, 0.002) 249 | 250 | '''Quaternion and Cartesian representation''' 251 | r_rot = None 252 | 253 | def get_rifke(positions): 254 | '''Local pose''' 255 | positions[..., 0] -= positions[:, 0:1, 0] 256 | positions[..., 2] -= positions[:, 0:1, 2] 257 | '''All pose face Z+''' 258 | positions = qrot_np(np.repeat(r_rot[:, None], positions.shape[1], axis=1), positions) 259 | return positions 260 | 261 | def get_quaternion(positions): 262 | skel = Skeleton(n_raw_offsets, kinematic_chain, "cpu") 263 | # (seq_len, joints_num, 4) 264 | quat_params = skel.inverse_kinematics_np(positions, face_joint_indx, smooth_forward=False) 265 | 266 | '''Fix Quaternion Discontinuity''' 267 | quat_params = qfix(quat_params) 268 | # (seq_len, 4) 269 | r_rot = quat_params[:, 0].copy() 270 | # print(r_rot[0]) 271 | '''Root Linear Velocity''' 272 | # (seq_len - 1, 3) 273 | velocity = (positions[1:, 0] - positions[:-1, 0]).copy() 274 | # print(r_rot.shape, velocity.shape) 275 | velocity = qrot_np(r_rot[1:], velocity) 276 | '''Root Angular Velocity''' 277 | # (seq_len - 1, 4) 278 | r_velocity = qmul_np(r_rot[1:], qinv_np(r_rot[:-1])) 279 | quat_params[1:, 0] = r_velocity 280 | # (seq_len, joints_num, 4) 281 | return quat_params, r_velocity, velocity, r_rot 282 | 283 | def get_cont6d_params(positions): 284 | skel = Skeleton(n_raw_offsets, kinematic_chain, "cpu") 285 | # (seq_len, joints_num, 4) 286 | quat_params = skel.inverse_kinematics_np(positions, face_joint_indx, smooth_forward=True) 287 | 288 | '''Quaternion to continuous 6D''' 289 | cont_6d_params = quaternion_to_cont6d_np(quat_params) 290 | # (seq_len, 4) 291 | r_rot = quat_params[:, 0].copy() 292 | # print(r_rot[0]) 293 | '''Root Linear Velocity''' 294 | # (seq_len - 1, 3) 295 | velocity = (positions[1:, 0] - 
positions[:-1, 0]).copy() 296 | # print(r_rot.shape, velocity.shape) 297 | velocity = qrot_np(r_rot[1:], velocity) 298 | '''Root Angular Velocity''' 299 | # (seq_len - 1, 4) 300 | r_velocity = qmul_np(r_rot[1:], qinv_np(r_rot[:-1])) 301 | # (seq_len, joints_num, 4) 302 | return cont_6d_params, r_velocity, velocity, r_rot 303 | 304 | cont_6d_params, r_velocity, velocity, r_rot = get_cont6d_params(positions) 305 | positions = get_rifke(positions) 306 | 307 | # trejec = np.cumsum(np.concatenate([np.array([[0, 0, 0]]), velocity], axis=0), axis=0) 308 | # r_rotations, r_pos = recover_ric_glo_np(r_velocity, velocity[:, [0, 2]]) 309 | 310 | # plt.plot(positions_b[:, 0, 0], positions_b[:, 0, 2], marker='*') 311 | # plt.plot(ground_positions[:, 0, 0], ground_positions[:, 0, 2], marker='o', color='r') 312 | # plt.plot(trejec[:, 0], trejec[:, 2], marker='^', color='g') 313 | # plt.plot(r_pos[:, 0], r_pos[:, 2], marker='s', color='y') 314 | # plt.xlabel('x') 315 | # plt.ylabel('z') 316 | # plt.axis('equal') 317 | # plt.show() 318 | 319 | '''Root height''' 320 | root_y = positions[:, 0, 1:2] 321 | 322 | '''Root rotation and linear velocity''' 323 | # (seq_len-1, 1) rotation velocity along y-axis 324 | # (seq_len-1, 2) linear velovity on xz plane 325 | r_velocity = np.arcsin(r_velocity[:, 2:3]) 326 | l_velocity = velocity[:, [0, 2]] 327 | # print(r_velocity.shape, l_velocity.shape, root_y.shape) 328 | root_data = np.concatenate([r_velocity, l_velocity, root_y[:-1]], axis=-1) 329 | 330 | '''Get Joint Rotation Representation''' 331 | # (seq_len, (joints_num-1) *6) quaternion for skeleton joints 332 | rot_data = cont_6d_params[:, 1:].reshape(len(cont_6d_params), -1) 333 | 334 | '''Get Joint Rotation Invariant Position Represention''' 335 | # (seq_len, (joints_num-1)*3) local joint position 336 | ric_data = positions[:, 1:].reshape(len(positions), -1) 337 | 338 | '''Get Joint Velocity Representation''' 339 | # (seq_len-1, joints_num*3) 340 | local_vel = qrot_np(np.repeat(r_rot[:-1, None], global_positions.shape[1], axis=1), 341 | global_positions[1:] - global_positions[:-1]) 342 | local_vel = local_vel.reshape(len(local_vel), -1) 343 | 344 | data = root_data 345 | data = np.concatenate([data, ric_data[:-1]], axis=-1) 346 | data = np.concatenate([data, rot_data[:-1]], axis=-1) 347 | # print(data.shape, local_vel.shape) 348 | data = np.concatenate([data, local_vel], axis=-1) 349 | data = np.concatenate([data, feet_l, feet_r], axis=-1) 350 | 351 | return data, global_positions, positions, l_velocity 352 | 353 | 354 | # Recover global angle and positions for rotation data 355 | # root_rot_velocity (B, seq_len, 1) 356 | # root_linear_velocity (B, seq_len, 2) 357 | # root_y (B, seq_len, 1) 358 | # ric_data (B, seq_len, (joint_num - 1)*3) 359 | # rot_data (B, seq_len, (joint_num - 1)*6) 360 | # local_velocity (B, seq_len, joint_num*3) 361 | # foot contact (B, seq_len, 4) 362 | def recover_root_rot_pos(data): 363 | rot_vel = data[..., 0] 364 | r_rot_ang = torch.zeros_like(rot_vel).to(data.device) 365 | '''Get Y-axis rotation from rotation velocity''' 366 | r_rot_ang[..., 1:] = rot_vel[..., :-1] 367 | r_rot_ang = torch.cumsum(r_rot_ang, dim=-1) 368 | 369 | r_rot_quat = torch.zeros(data.shape[:-1] + (4,)).to(data.device) 370 | r_rot_quat[..., 0] = torch.cos(r_rot_ang) 371 | r_rot_quat[..., 2] = torch.sin(r_rot_ang) 372 | 373 | r_pos = torch.zeros(data.shape[:-1] + (3,)).to(data.device) 374 | r_pos[..., 1:, [0, 2]] = data[..., :-1, 1:3] 375 | '''Add Y-axis rotation to root position''' 376 | r_pos = 
qrot(qinv(r_rot_quat), r_pos) 377 | 378 | r_pos = torch.cumsum(r_pos, dim=-2) 379 | 380 | r_pos[..., 1] = data[..., 3] 381 | return r_rot_quat, r_pos 382 | 383 | 384 | def recover_from_rot(data, joints_num, skeleton): 385 | r_rot_quat, r_pos = recover_root_rot_pos(data) 386 | 387 | r_rot_cont6d = quaternion_to_cont6d(r_rot_quat) 388 | 389 | start_indx = 1 + 2 + 1 + (joints_num - 1) * 3 390 | end_indx = start_indx + (joints_num - 1) * 6 391 | cont6d_params = data[..., start_indx:end_indx] 392 | # print(r_rot_cont6d.shape, cont6d_params.shape, r_pos.shape) 393 | cont6d_params = torch.cat([r_rot_cont6d, cont6d_params], dim=-1) 394 | cont6d_params = cont6d_params.view(-1, joints_num, 6) 395 | 396 | positions = skeleton.forward_kinematics_cont6d(cont6d_params, r_pos) 397 | 398 | return positions 399 | 400 | 401 | def recover_from_ric(data, joints_num): 402 | r_rot_quat, r_pos = recover_root_rot_pos(data) 403 | positions = data[..., 4:(joints_num - 1) * 3 + 4] 404 | positions = positions.view(positions.shape[:-1] + (-1, 3)) 405 | 406 | '''Add Y-axis rotation to local joints''' 407 | positions = qrot(qinv(r_rot_quat[..., None, :]).expand(positions.shape[:-1] + (4,)), positions) 408 | 409 | '''Add root XZ to joints''' 410 | positions[..., 0] += r_pos[..., 0:1] 411 | positions[..., 2] += r_pos[..., 2:3] 412 | 413 | '''Concate root and joints''' 414 | positions = torch.cat([r_pos.unsqueeze(-2), positions], dim=-2) 415 | 416 | return positions 417 | ''' 418 | For Text2Motion Dataset 419 | ''' 420 | ''' 421 | if __name__ == "__main__": 422 | example_id = "000021" 423 | # Lower legs 424 | l_idx1, l_idx2 = 5, 8 425 | # Right/Left foot 426 | fid_r, fid_l = [8, 11], [7, 10] 427 | # Face direction, r_hip, l_hip, sdr_r, sdr_l 428 | face_joint_indx = [2, 1, 17, 16] 429 | # l_hip, r_hip 430 | r_hip, l_hip = 2, 1 431 | joints_num = 22 432 | # ds_num = 8 433 | data_dir = '../dataset/pose_data_raw/joints/' 434 | save_dir1 = '../dataset/pose_data_raw/new_joints/' 435 | save_dir2 = '../dataset/pose_data_raw/new_joint_vecs/' 436 | 437 | n_raw_offsets = torch.from_numpy(t2m_raw_offsets) 438 | kinematic_chain = t2m_kinematic_chain 439 | 440 | # Get offsets of target skeleton 441 | example_data = np.load(os.path.join(data_dir, example_id + '.npy')) 442 | example_data = example_data.reshape(len(example_data), -1, 3) 443 | example_data = torch.from_numpy(example_data) 444 | tgt_skel = Skeleton(n_raw_offsets, kinematic_chain, 'cpu') 445 | # (joints_num, 3) 446 | tgt_offsets = tgt_skel.get_offsets_joints(example_data[0]) 447 | # print(tgt_offsets) 448 | 449 | source_list = os.listdir(data_dir) 450 | frame_num = 0 451 | for source_file in tqdm(source_list): 452 | source_data = np.load(os.path.join(data_dir, source_file))[:, :joints_num] 453 | try: 454 | data, ground_positions, positions, l_velocity = process_file(source_data, 0.002) 455 | rec_ric_data = recover_from_ric(torch.from_numpy(data).unsqueeze(0).float(), joints_num) 456 | np.save(pjoin(save_dir1, source_file), rec_ric_data.squeeze().numpy()) 457 | np.save(pjoin(save_dir2, source_file), data) 458 | frame_num += data.shape[0] 459 | except Exception as e: 460 | print(source_file) 461 | print(e) 462 | 463 | print('Total clips: %d, Frames: %d, Duration: %fm' % 464 | (len(source_list), frame_num, frame_num / 20 / 60)) 465 | ''' 466 | 467 | if __name__ == "__main__": 468 | example_id = "03950_gt" 469 | # Lower legs 470 | l_idx1, l_idx2 = 17, 18 471 | # Right/Left foot 472 | fid_r, fid_l = [14, 15], [19, 20] 473 | # Face direction, r_hip, l_hip, sdr_r, sdr_l 474 | 
face_joint_indx = [11, 16, 5, 8] 475 | # l_hip, r_hip 476 | r_hip, l_hip = 11, 16 477 | joints_num = 21 478 | # ds_num = 8 479 | data_dir = '../dataset/kit_mocap_dataset/joints/' 480 | save_dir1 = '../dataset/kit_mocap_dataset/new_joints/' 481 | save_dir2 = '../dataset/kit_mocap_dataset/new_joint_vecs/' 482 | 483 | n_raw_offsets = torch.from_numpy(kit_raw_offsets) 484 | kinematic_chain = kit_kinematic_chain 485 | 486 | '''Get offsets of target skeleton''' 487 | example_data = np.load(os.path.join(data_dir, example_id + '.npy')) 488 | example_data = example_data.reshape(len(example_data), -1, 3) 489 | example_data = torch.from_numpy(example_data) 490 | tgt_skel = Skeleton(n_raw_offsets, kinematic_chain, 'cpu') 491 | # (joints_num, 3) 492 | tgt_offsets = tgt_skel.get_offsets_joints(example_data[0]) 493 | # print(tgt_offsets) 494 | 495 | source_list = os.listdir(data_dir) 496 | frame_num = 0 497 | '''Read source data''' 498 | for source_file in tqdm(source_list): 499 | source_data = np.load(os.path.join(data_dir, source_file))[:, :joints_num] 500 | try: 501 | name = ''.join(source_file[:-7].split('_')) + '.npy' 502 | data, ground_positions, positions, l_velocity = process_file(source_data, 0.05) 503 | rec_ric_data = recover_from_ric(torch.from_numpy(data).unsqueeze(0).float(), joints_num) 504 | if np.isnan(rec_ric_data.numpy()).any(): 505 | print(source_file) 506 | continue 507 | np.save(pjoin(save_dir1, name), rec_ric_data.squeeze().numpy()) 508 | np.save(pjoin(save_dir2, name), data) 509 | frame_num += data.shape[0] 510 | except Exception as e: 511 | print(source_file) 512 | print(e) 513 | 514 | print('Total clips: %d, Frames: %d, Duration: %fm' % 515 | (len(source_list), frame_num, frame_num / 12.5 / 60)) -------------------------------------------------------------------------------- /text2motion/utils/paramUtil.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | # Define a kinematic tree for the skeletal struture 4 | kit_kinematic_chain = [[0, 11, 12, 13, 14, 15], [0, 16, 17, 18, 19, 20], [0, 1, 2, 3, 4], [3, 5, 6, 7], [3, 8, 9, 10]] 5 | 6 | kit_raw_offsets = np.array( 7 | [ 8 | [0, 0, 0], 9 | [0, 1, 0], 10 | [0, 1, 0], 11 | [0, 1, 0], 12 | [0, 1, 0], 13 | [1, 0, 0], 14 | [0, -1, 0], 15 | [0, -1, 0], 16 | [-1, 0, 0], 17 | [0, -1, 0], 18 | [0, -1, 0], 19 | [1, 0, 0], 20 | [0, -1, 0], 21 | [0, -1, 0], 22 | [0, 0, 1], 23 | [0, 0, 1], 24 | [-1, 0, 0], 25 | [0, -1, 0], 26 | [0, -1, 0], 27 | [0, 0, 1], 28 | [0, 0, 1] 29 | ] 30 | ) 31 | 32 | t2m_raw_offsets = np.array([[0,0,0], 33 | [1,0,0], 34 | [-1,0,0], 35 | [0,1,0], 36 | [0,-1,0], 37 | [0,-1,0], 38 | [0,1,0], 39 | [0,-1,0], 40 | [0,-1,0], 41 | [0,1,0], 42 | [0,0,1], 43 | [0,0,1], 44 | [0,1,0], 45 | [1,0,0], 46 | [-1,0,0], 47 | [0,0,1], 48 | [0,-1,0], 49 | [0,-1,0], 50 | [0,-1,0], 51 | [0,-1,0], 52 | [0,-1,0], 53 | [0,-1,0]]) 54 | 55 | t2m_kinematic_chain = [[0, 2, 5, 8, 11], [0, 1, 4, 7, 10], [0, 3, 6, 9, 12, 15], [9, 14, 17, 19, 21], [9, 13, 16, 18, 20]] 56 | t2m_left_hand_chain = [[20, 22, 23, 24], [20, 34, 35, 36], [20, 25, 26, 27], [20, 31, 32, 33], [20, 28, 29, 30]] 57 | t2m_right_hand_chain = [[21, 43, 44, 45], [21, 46, 47, 48], [21, 40, 41, 42], [21, 37, 38, 39], [21, 49, 50, 51]] 58 | 59 | 60 | kit_tgt_skel_id = '03950' 61 | 62 | t2m_tgt_skel_id = '000021' 63 | 64 | -------------------------------------------------------------------------------- /text2motion/utils/plot_script.py: -------------------------------------------------------------------------------- 1 | import 
math 2 | import numpy as np 3 | import matplotlib 4 | import matplotlib.pyplot as plt 5 | from mpl_toolkits.mplot3d import Axes3D 6 | from matplotlib.animation import FuncAnimation, FFMpegFileWriter 7 | from mpl_toolkits.mplot3d.art3d import Poly3DCollection 8 | import mpl_toolkits.mplot3d.axes3d as p3 9 | # import cv2 10 | 11 | 12 | def list_cut_average(ll, intervals): 13 | if intervals == 1: 14 | return ll 15 | 16 | bins = math.ceil(len(ll) * 1.0 / intervals) 17 | ll_new = [] 18 | for i in range(bins): 19 | l_low = intervals * i 20 | l_high = l_low + intervals 21 | l_high = l_high if l_high < len(ll) else len(ll) 22 | ll_new.append(np.mean(ll[l_low:l_high])) 23 | return ll_new 24 | 25 | 26 | def plot_3d_motion(save_path, kinematic_tree, joints, title, figsize=(10, 10), fps=120, radius=4): 27 | matplotlib.use('Agg') 28 | 29 | title_sp = title.split(' ') 30 | if len(title_sp) > 20: 31 | title = '\n'.join([' '.join(title_sp[:10]), ' '.join(title_sp[10:20]), ' '.join(title_sp[20:])]) 32 | elif len(title_sp) > 10: 33 | title = '\n'.join([' '.join(title_sp[:10]), ' '.join(title_sp[10:])]) 34 | 35 | def init(): 36 | ax.set_xlim3d([-radius / 4, radius / 4]) 37 | ax.set_ylim3d([0, radius / 2]) 38 | ax.set_zlim3d([0, radius / 2]) 39 | # print(title) 40 | fig.suptitle(title, fontsize=20) 41 | ax.grid(b=False) 42 | 43 | def plot_xzPlane(minx, maxx, miny, minz, maxz): 44 | ## Plot a plane XZ 45 | verts = [ 46 | [minx, miny, minz], 47 | [minx, miny, maxz], 48 | [maxx, miny, maxz], 49 | [maxx, miny, minz] 50 | ] 51 | xz_plane = Poly3DCollection([verts]) 52 | xz_plane.set_facecolor((0.5, 0.5, 0.5, 0.5)) 53 | ax.add_collection3d(xz_plane) 54 | 55 | # return ax 56 | 57 | # (seq_len, joints_num, 3) 58 | data = joints.copy().reshape(len(joints), -1, 3) 59 | fig = plt.figure(figsize=figsize) 60 | ax = p3.Axes3D(fig) 61 | init() 62 | MINS = data.min(axis=0).min(axis=0) 63 | MAXS = data.max(axis=0).max(axis=0) 64 | colors = ['red', 'blue', 'black', 'red', 'blue', 65 | 'darkblue', 'darkblue', 'darkblue', 'darkblue', 'darkblue', 66 | 'darkred', 'darkred', 'darkred', 'darkred', 'darkred'] 67 | frame_number = data.shape[0] 68 | # print(data.shape) 69 | 70 | height_offset = MINS[1] 71 | data[:, :, 1] -= height_offset 72 | trajec = data[:, 0, [0, 2]] 73 | 74 | data[..., 0] -= data[:, 0:1, 0] 75 | data[..., 2] -= data[:, 0:1, 2] 76 | 77 | # print(trajec.shape) 78 | 79 | def update(index): 80 | # print(index) 81 | ax.lines = [] 82 | ax.collections = [] 83 | ax.view_init(elev=120, azim=-90) 84 | ax.dist = 7.5 85 | # ax = 86 | plot_xzPlane(MINS[0] - trajec[index, 0], MAXS[0] - trajec[index, 0], 0, MINS[2] - trajec[index, 1], 87 | MAXS[2] - trajec[index, 1]) 88 | # ax.scatter(data[index, :22, 0], data[index, :22, 1], data[index, :22, 2], color='black', s=3) 89 | 90 | if index > 1: 91 | ax.plot3D(trajec[:index, 0] - trajec[index, 0], np.zeros_like(trajec[:index, 0]), 92 | trajec[:index, 1] - trajec[index, 1], linewidth=1.0, 93 | color='blue') 94 | # ax = plot_xzPlane(ax, MINS[0], MAXS[0], 0, MINS[2], MAXS[2]) 95 | 96 | for i, (chain, color) in enumerate(zip(kinematic_tree, colors)): 97 | # print(color) 98 | if i < 5: 99 | linewidth = 4.0 100 | else: 101 | linewidth = 2.0 102 | ax.plot3D(data[index, chain, 0], data[index, chain, 1], data[index, chain, 2], linewidth=linewidth, 103 | color=color) 104 | # print(trajec[:index, 0].shape) 105 | 106 | plt.axis('off') 107 | ax.set_xticklabels([]) 108 | ax.set_yticklabels([]) 109 | ax.set_zticklabels([]) 110 | 111 | ani = FuncAnimation(fig, update, frames=frame_number, 
interval=1000 / fps, repeat=False) 112 | 113 | # writer = FFMpegFileWriter(fps=fps) 114 | ani.save(save_path, fps=fps) 115 | plt.close() 116 | -------------------------------------------------------------------------------- /text2motion/utils/quaternion.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | import torch 9 | import numpy as np 10 | 11 | _EPS4 = np.finfo(float).eps * 4.0 12 | 13 | _FLOAT_EPS = np.finfo(np.float).eps 14 | 15 | # PyTorch-backed implementations 16 | def qinv(q): 17 | assert q.shape[-1] == 4, 'q must be a tensor of shape (*, 4)' 18 | mask = torch.ones_like(q) 19 | mask[..., 1:] = -mask[..., 1:] 20 | return q * mask 21 | 22 | 23 | def qinv_np(q): 24 | assert q.shape[-1] == 4, 'q must be a tensor of shape (*, 4)' 25 | return qinv(torch.from_numpy(q).float()).numpy() 26 | 27 | 28 | def qnormalize(q): 29 | assert q.shape[-1] == 4, 'q must be a tensor of shape (*, 4)' 30 | return q / torch.norm(q, dim=-1, keepdim=True) 31 | 32 | 33 | def qmul(q, r): 34 | """ 35 | Multiply quaternion(s) q with quaternion(s) r. 36 | Expects two equally-sized tensors of shape (*, 4), where * denotes any number of dimensions. 37 | Returns q*r as a tensor of shape (*, 4). 38 | """ 39 | assert q.shape[-1] == 4 40 | assert r.shape[-1] == 4 41 | 42 | original_shape = q.shape 43 | 44 | # Compute outer product 45 | terms = torch.bmm(r.view(-1, 4, 1), q.view(-1, 1, 4)) 46 | 47 | w = terms[:, 0, 0] - terms[:, 1, 1] - terms[:, 2, 2] - terms[:, 3, 3] 48 | x = terms[:, 0, 1] + terms[:, 1, 0] - terms[:, 2, 3] + terms[:, 3, 2] 49 | y = terms[:, 0, 2] + terms[:, 1, 3] + terms[:, 2, 0] - terms[:, 3, 1] 50 | z = terms[:, 0, 3] - terms[:, 1, 2] + terms[:, 2, 1] + terms[:, 3, 0] 51 | return torch.stack((w, x, y, z), dim=1).view(original_shape) 52 | 53 | 54 | def qrot(q, v): 55 | """ 56 | Rotate vector(s) v about the rotation described by quaternion(s) q. 57 | Expects a tensor of shape (*, 4) for q and a tensor of shape (*, 3) for v, 58 | where * denotes any number of dimensions. 59 | Returns a tensor of shape (*, 3). 60 | """ 61 | assert q.shape[-1] == 4 62 | assert v.shape[-1] == 3 63 | assert q.shape[:-1] == v.shape[:-1] 64 | 65 | original_shape = list(v.shape) 66 | # print(q.shape) 67 | q = q.contiguous().view(-1, 4) 68 | v = v.contiguous().view(-1, 3) 69 | 70 | qvec = q[:, 1:] 71 | uv = torch.cross(qvec, v, dim=1) 72 | uuv = torch.cross(qvec, uv, dim=1) 73 | return (v + 2 * (q[:, :1] * uv + uuv)).view(original_shape) 74 | 75 | 76 | def qeuler(q, order, epsilon=0, deg=True): 77 | """ 78 | Convert quaternion(s) q to Euler angles. 79 | Expects a tensor of shape (*, 4), where * denotes any number of dimensions. 80 | Returns a tensor of shape (*, 3). 
81 | """ 82 | assert q.shape[-1] == 4 83 | 84 | original_shape = list(q.shape) 85 | original_shape[-1] = 3 86 | q = q.view(-1, 4) 87 | 88 | q0 = q[:, 0] 89 | q1 = q[:, 1] 90 | q2 = q[:, 2] 91 | q3 = q[:, 3] 92 | 93 | if order == 'xyz': 94 | x = torch.atan2(2 * (q0 * q1 - q2 * q3), 1 - 2 * (q1 * q1 + q2 * q2)) 95 | y = torch.asin(torch.clamp(2 * (q1 * q3 + q0 * q2), -1 + epsilon, 1 - epsilon)) 96 | z = torch.atan2(2 * (q0 * q3 - q1 * q2), 1 - 2 * (q2 * q2 + q3 * q3)) 97 | elif order == 'yzx': 98 | x = torch.atan2(2 * (q0 * q1 - q2 * q3), 1 - 2 * (q1 * q1 + q3 * q3)) 99 | y = torch.atan2(2 * (q0 * q2 - q1 * q3), 1 - 2 * (q2 * q2 + q3 * q3)) 100 | z = torch.asin(torch.clamp(2 * (q1 * q2 + q0 * q3), -1 + epsilon, 1 - epsilon)) 101 | elif order == 'zxy': 102 | x = torch.asin(torch.clamp(2 * (q0 * q1 + q2 * q3), -1 + epsilon, 1 - epsilon)) 103 | y = torch.atan2(2 * (q0 * q2 - q1 * q3), 1 - 2 * (q1 * q1 + q2 * q2)) 104 | z = torch.atan2(2 * (q0 * q3 - q1 * q2), 1 - 2 * (q1 * q1 + q3 * q3)) 105 | elif order == 'xzy': 106 | x = torch.atan2(2 * (q0 * q1 + q2 * q3), 1 - 2 * (q1 * q1 + q3 * q3)) 107 | y = torch.atan2(2 * (q0 * q2 + q1 * q3), 1 - 2 * (q2 * q2 + q3 * q3)) 108 | z = torch.asin(torch.clamp(2 * (q0 * q3 - q1 * q2), -1 + epsilon, 1 - epsilon)) 109 | elif order == 'yxz': 110 | x = torch.asin(torch.clamp(2 * (q0 * q1 - q2 * q3), -1 + epsilon, 1 - epsilon)) 111 | y = torch.atan2(2 * (q1 * q3 + q0 * q2), 1 - 2 * (q1 * q1 + q2 * q2)) 112 | z = torch.atan2(2 * (q1 * q2 + q0 * q3), 1 - 2 * (q1 * q1 + q3 * q3)) 113 | elif order == 'zyx': 114 | x = torch.atan2(2 * (q0 * q1 + q2 * q3), 1 - 2 * (q1 * q1 + q2 * q2)) 115 | y = torch.asin(torch.clamp(2 * (q0 * q2 - q1 * q3), -1 + epsilon, 1 - epsilon)) 116 | z = torch.atan2(2 * (q0 * q3 + q1 * q2), 1 - 2 * (q2 * q2 + q3 * q3)) 117 | else: 118 | raise 119 | 120 | if deg: 121 | return torch.stack((x, y, z), dim=1).view(original_shape) * 180 / np.pi 122 | else: 123 | return torch.stack((x, y, z), dim=1).view(original_shape) 124 | 125 | 126 | # Numpy-backed implementations 127 | 128 | def qmul_np(q, r): 129 | q = torch.from_numpy(q).contiguous().float() 130 | r = torch.from_numpy(r).contiguous().float() 131 | return qmul(q, r).numpy() 132 | 133 | 134 | def qrot_np(q, v): 135 | q = torch.from_numpy(q).contiguous().float() 136 | v = torch.from_numpy(v).contiguous().float() 137 | return qrot(q, v).numpy() 138 | 139 | 140 | def qeuler_np(q, order, epsilon=0, use_gpu=False): 141 | if use_gpu: 142 | q = torch.from_numpy(q).cuda().float() 143 | return qeuler(q, order, epsilon).cpu().numpy() 144 | else: 145 | q = torch.from_numpy(q).contiguous().float() 146 | return qeuler(q, order, epsilon).numpy() 147 | 148 | 149 | def qfix(q): 150 | """ 151 | Enforce quaternion continuity across the time dimension by selecting 152 | the representation (q or -q) with minimal distance (or, equivalently, maximal dot product) 153 | between two consecutive frames. 154 | 155 | Expects a tensor of shape (L, J, 4), where L is the sequence length and J is the number of joints. 156 | Returns a tensor of the same shape. 157 | """ 158 | assert len(q.shape) == 3 159 | assert q.shape[-1] == 4 160 | 161 | result = q.copy() 162 | dot_products = np.sum(q[1:] * q[:-1], axis=2) 163 | mask = dot_products < 0 164 | mask = (np.cumsum(mask, axis=0) % 2).astype(bool) 165 | result[1:][mask] *= -1 166 | return result 167 | 168 | 169 | def euler2quat(e, order, deg=True): 170 | """ 171 | Convert Euler angles to quaternions. 
172 | """ 173 | assert e.shape[-1] == 3 174 | 175 | original_shape = list(e.shape) 176 | original_shape[-1] = 4 177 | 178 | e = e.view(-1, 3) 179 | 180 | ## if euler angles in degrees 181 | if deg: 182 | e = e * np.pi / 180. 183 | 184 | x = e[:, 0] 185 | y = e[:, 1] 186 | z = e[:, 2] 187 | 188 | rx = torch.stack((torch.cos(x / 2), torch.sin(x / 2), torch.zeros_like(x), torch.zeros_like(x)), dim=1) 189 | ry = torch.stack((torch.cos(y / 2), torch.zeros_like(y), torch.sin(y / 2), torch.zeros_like(y)), dim=1) 190 | rz = torch.stack((torch.cos(z / 2), torch.zeros_like(z), torch.zeros_like(z), torch.sin(z / 2)), dim=1) 191 | 192 | result = None 193 | for coord in order: 194 | if coord == 'x': 195 | r = rx 196 | elif coord == 'y': 197 | r = ry 198 | elif coord == 'z': 199 | r = rz 200 | else: 201 | raise 202 | if result is None: 203 | result = r 204 | else: 205 | result = qmul(result, r) 206 | 207 | # Reverse antipodal representation to have a non-negative "w" 208 | if order in ['xyz', 'yzx', 'zxy']: 209 | result *= -1 210 | 211 | return result.view(original_shape) 212 | 213 | 214 | def expmap_to_quaternion(e): 215 | """ 216 | Convert axis-angle rotations (aka exponential maps) to quaternions. 217 | Stable formula from "Practical Parameterization of Rotations Using the Exponential Map". 218 | Expects a tensor of shape (*, 3), where * denotes any number of dimensions. 219 | Returns a tensor of shape (*, 4). 220 | """ 221 | assert e.shape[-1] == 3 222 | 223 | original_shape = list(e.shape) 224 | original_shape[-1] = 4 225 | e = e.reshape(-1, 3) 226 | 227 | theta = np.linalg.norm(e, axis=1).reshape(-1, 1) 228 | w = np.cos(0.5 * theta).reshape(-1, 1) 229 | xyz = 0.5 * np.sinc(0.5 * theta / np.pi) * e 230 | return np.concatenate((w, xyz), axis=1).reshape(original_shape) 231 | 232 | 233 | def euler_to_quaternion(e, order): 234 | """ 235 | Convert Euler angles to quaternions. 236 | """ 237 | assert e.shape[-1] == 3 238 | 239 | original_shape = list(e.shape) 240 | original_shape[-1] = 4 241 | 242 | e = e.reshape(-1, 3) 243 | 244 | x = e[:, 0] 245 | y = e[:, 1] 246 | z = e[:, 2] 247 | 248 | rx = np.stack((np.cos(x / 2), np.sin(x / 2), np.zeros_like(x), np.zeros_like(x)), axis=1) 249 | ry = np.stack((np.cos(y / 2), np.zeros_like(y), np.sin(y / 2), np.zeros_like(y)), axis=1) 250 | rz = np.stack((np.cos(z / 2), np.zeros_like(z), np.zeros_like(z), np.sin(z / 2)), axis=1) 251 | 252 | result = None 253 | for coord in order: 254 | if coord == 'x': 255 | r = rx 256 | elif coord == 'y': 257 | r = ry 258 | elif coord == 'z': 259 | r = rz 260 | else: 261 | raise 262 | if result is None: 263 | result = r 264 | else: 265 | result = qmul_np(result, r) 266 | 267 | # Reverse antipodal representation to have a non-negative "w" 268 | if order in ['xyz', 'yzx', 'zxy']: 269 | result *= -1 270 | 271 | return result.reshape(original_shape) 272 | 273 | 274 | def quaternion_to_matrix(quaternions): 275 | """ 276 | Convert rotations given as quaternions to rotation matrices. 277 | Args: 278 | quaternions: quaternions with real part first, 279 | as tensor of shape (..., 4). 280 | Returns: 281 | Rotation matrices as tensor of shape (..., 3, 3). 
282 | """ 283 | r, i, j, k = torch.unbind(quaternions, -1) 284 | two_s = 2.0 / (quaternions * quaternions).sum(-1) 285 | 286 | o = torch.stack( 287 | ( 288 | 1 - two_s * (j * j + k * k), 289 | two_s * (i * j - k * r), 290 | two_s * (i * k + j * r), 291 | two_s * (i * j + k * r), 292 | 1 - two_s * (i * i + k * k), 293 | two_s * (j * k - i * r), 294 | two_s * (i * k - j * r), 295 | two_s * (j * k + i * r), 296 | 1 - two_s * (i * i + j * j), 297 | ), 298 | -1, 299 | ) 300 | return o.reshape(quaternions.shape[:-1] + (3, 3)) 301 | 302 | 303 | def quaternion_to_matrix_np(quaternions): 304 | q = torch.from_numpy(quaternions).contiguous().float() 305 | return quaternion_to_matrix(q).numpy() 306 | 307 | 308 | def quaternion_to_cont6d_np(quaternions): 309 | rotation_mat = quaternion_to_matrix_np(quaternions) 310 | cont_6d = np.concatenate([rotation_mat[..., 0], rotation_mat[..., 1]], axis=-1) 311 | return cont_6d 312 | 313 | 314 | def quaternion_to_cont6d(quaternions): 315 | rotation_mat = quaternion_to_matrix(quaternions) 316 | cont_6d = torch.cat([rotation_mat[..., 0], rotation_mat[..., 1]], dim=-1) 317 | return cont_6d 318 | 319 | 320 | def cont6d_to_matrix(cont6d): 321 | assert cont6d.shape[-1] == 6, "The last dimension must be 6" 322 | x_raw = cont6d[..., 0:3] 323 | y_raw = cont6d[..., 3:6] 324 | 325 | x = x_raw / torch.norm(x_raw, dim=-1, keepdim=True) 326 | z = torch.cross(x, y_raw, dim=-1) 327 | z = z / torch.norm(z, dim=-1, keepdim=True) 328 | 329 | y = torch.cross(z, x, dim=-1) 330 | 331 | x = x[..., None] 332 | y = y[..., None] 333 | z = z[..., None] 334 | 335 | mat = torch.cat([x, y, z], dim=-1) 336 | return mat 337 | 338 | 339 | def cont6d_to_matrix_np(cont6d): 340 | q = torch.from_numpy(cont6d).contiguous().float() 341 | return cont6d_to_matrix(q).numpy() 342 | 343 | 344 | def qpow(q0, t, dtype=torch.float): 345 | ''' q0 : tensor of quaternions 346 | t: tensor of powers 347 | ''' 348 | q0 = qnormalize(q0) 349 | theta0 = torch.acos(q0[..., 0]) 350 | 351 | ## if theta0 is close to zero, add epsilon to avoid NaNs 352 | mask = (theta0 <= 10e-10) * (theta0 >= -10e-10) 353 | theta0 = (1 - mask) * theta0 + mask * 10e-10 354 | v0 = q0[..., 1:] / torch.sin(theta0).view(-1, 1) 355 | 356 | if isinstance(t, torch.Tensor): 357 | q = torch.zeros(t.shape + q0.shape) 358 | theta = t.view(-1, 1) * theta0.view(1, -1) 359 | else: ## if t is a number 360 | q = torch.zeros(q0.shape) 361 | theta = t * theta0 362 | 363 | q[..., 0] = torch.cos(theta) 364 | q[..., 1:] = v0 * torch.sin(theta).unsqueeze(-1) 365 | 366 | return q.to(dtype) 367 | 368 | 369 | def qslerp(q0, q1, t): 370 | ''' 371 | q0: starting quaternion 372 | q1: ending quaternion 373 | t: array of points along the way 374 | 375 | Returns: 376 | Tensor of Slerps: t.shape + q0.shape 377 | ''' 378 | 379 | q0 = qnormalize(q0) 380 | q1 = qnormalize(q1) 381 | q_ = qpow(qmul(q1, qinv(q0)), t) 382 | 383 | return qmul(q_, 384 | q0.contiguous().view(torch.Size([1] * len(t.shape)) + q0.shape).expand(t.shape + q0.shape).contiguous()) 385 | 386 | 387 | def qbetween(v0, v1): 388 | ''' 389 | find the quaternion used to rotate v0 to v1 390 | ''' 391 | assert v0.shape[-1] == 3, 'v0 must be of the shape (*, 3)' 392 | assert v1.shape[-1] == 3, 'v1 must be of the shape (*, 3)' 393 | 394 | v = torch.cross(v0, v1) 395 | w = torch.sqrt((v0 ** 2).sum(dim=-1, keepdim=True) * (v1 ** 2).sum(dim=-1, keepdim=True)) + (v0 * v1).sum(dim=-1, 396 | keepdim=True) 397 | return qnormalize(torch.cat([w, v], dim=-1)) 398 | 399 | 400 | def qbetween_np(v0, v1): 401 | ''' 402 | find the 
quaternion used to rotate v0 to v1 403 | ''' 404 | assert v0.shape[-1] == 3, 'v0 must be of the shape (*, 3)' 405 | assert v1.shape[-1] == 3, 'v1 must be of the shape (*, 3)' 406 | 407 | v0 = torch.from_numpy(v0).float() 408 | v1 = torch.from_numpy(v1).float() 409 | return qbetween(v0, v1).numpy() 410 | 411 | 412 | def lerp(p0, p1, t): 413 | if not isinstance(t, torch.Tensor): 414 | t = torch.Tensor([t]) 415 | 416 | new_shape = t.shape + p0.shape 417 | new_view_t = t.shape + torch.Size([1] * len(p0.shape)) 418 | new_view_p = torch.Size([1] * len(t.shape)) + p0.shape 419 | p0 = p0.view(new_view_p).expand(new_shape) 420 | p1 = p1.view(new_view_p).expand(new_shape) 421 | t = t.view(new_view_t).expand(new_shape) 422 | 423 | return p0 + t * (p1 - p0) 424 | -------------------------------------------------------------------------------- /text2motion/utils/skeleton.py: -------------------------------------------------------------------------------- 1 | from utils.quaternion import * 2 | import scipy.ndimage.filters as filters 3 | 4 | class Skeleton(object): 5 | def __init__(self, offset, kinematic_tree, device): 6 | self.device = device 7 | self._raw_offset_np = offset.numpy() 8 | self._raw_offset = offset.clone().detach().to(device).float() 9 | self._kinematic_tree = kinematic_tree 10 | self._offset = None 11 | self._parents = [0] * len(self._raw_offset) 12 | self._parents[0] = -1 13 | for chain in self._kinematic_tree: 14 | for j in range(1, len(chain)): 15 | self._parents[chain[j]] = chain[j-1] 16 | 17 | def njoints(self): 18 | return len(self._raw_offset) 19 | 20 | def offset(self): 21 | return self._offset 22 | 23 | def set_offset(self, offsets): 24 | self._offset = offsets.clone().detach().to(self.device).float() 25 | 26 | def kinematic_tree(self): 27 | return self._kinematic_tree 28 | 29 | def parents(self): 30 | return self._parents 31 | 32 | # joints (batch_size, joints_num, 3) 33 | def get_offsets_joints_batch(self, joints): 34 | assert len(joints.shape) == 3 35 | _offsets = self._raw_offset.expand(joints.shape[0], -1, -1).clone() 36 | for i in range(1, self._raw_offset.shape[0]): 37 | _offsets[:, i] = torch.norm(joints[:, i] - joints[:, self._parents[i]], p=2, dim=1)[:, None] * _offsets[:, i] 38 | 39 | self._offset = _offsets.detach() 40 | return _offsets 41 | 42 | # joints (joints_num, 3) 43 | def get_offsets_joints(self, joints): 44 | assert len(joints.shape) == 2 45 | _offsets = self._raw_offset.clone() 46 | for i in range(1, self._raw_offset.shape[0]): 47 | # print(joints.shape) 48 | _offsets[i] = torch.norm(joints[i] - joints[self._parents[i]], p=2, dim=0) * _offsets[i] 49 | 50 | self._offset = _offsets.detach() 51 | return _offsets 52 | 53 | # face_joint_idx should follow the order of right hip, left hip, right shoulder, left shoulder 54 | # joints (batch_size, joints_num, 3) 55 | def inverse_kinematics_np(self, joints, face_joint_idx, smooth_forward=False): 56 | assert len(face_joint_idx) == 4 57 | '''Get Forward Direction''' 58 | l_hip, r_hip, sdr_r, sdr_l = face_joint_idx 59 | across1 = joints[:, r_hip] - joints[:, l_hip] 60 | across2 = joints[:, sdr_r] - joints[:, sdr_l] 61 | across = across1 + across2 62 | across = across / np.sqrt((across**2).sum(axis=-1))[:, np.newaxis] 63 | # print(across1.shape, across2.shape) 64 | 65 | # forward (batch_size, 3) 66 | forward = np.cross(np.array([[0, 1, 0]]), across, axis=-1) 67 | if smooth_forward: 68 | forward = filters.gaussian_filter1d(forward, 20, axis=0, mode='nearest') 69 | # forward (batch_size, 3) 70 | forward = forward / 
np.sqrt((forward**2).sum(axis=-1))[..., np.newaxis] 71 | 72 | '''Get Root Rotation''' 73 | target = np.array([[0,0,1]]).repeat(len(forward), axis=0) 74 | root_quat = qbetween_np(forward, target) 75 | 76 | '''Inverse Kinematics''' 77 | # quat_params (batch_size, joints_num, 4) 78 | # print(joints.shape[:-1]) 79 | quat_params = np.zeros(joints.shape[:-1] + (4,)) 80 | # print(quat_params.shape) 81 | root_quat[0] = np.array([[1.0, 0.0, 0.0, 0.0]]) 82 | quat_params[:, 0] = root_quat 83 | # quat_params[0, 0] = np.array([[1.0, 0.0, 0.0, 0.0]]) 84 | for chain in self._kinematic_tree: 85 | R = root_quat 86 | for j in range(len(chain) - 1): 87 | # (batch, 3) 88 | u = self._raw_offset_np[chain[j+1]][np.newaxis,...].repeat(len(joints), axis=0) 89 | # print(u.shape) 90 | # (batch, 3) 91 | v = joints[:, chain[j+1]] - joints[:, chain[j]] 92 | v = v / np.sqrt((v**2).sum(axis=-1))[:, np.newaxis] 93 | # print(u.shape, v.shape) 94 | rot_u_v = qbetween_np(u, v) 95 | 96 | R_loc = qmul_np(qinv_np(R), rot_u_v) 97 | 98 | quat_params[:,chain[j + 1], :] = R_loc 99 | R = qmul_np(R, R_loc) 100 | 101 | return quat_params 102 | 103 | # Be sure root joint is at the beginning of kinematic chains 104 | def forward_kinematics(self, quat_params, root_pos, skel_joints=None, do_root_R=True): 105 | # quat_params (batch_size, joints_num, 4) 106 | # joints (batch_size, joints_num, 3) 107 | # root_pos (batch_size, 3) 108 | if skel_joints is not None: 109 | offsets = self.get_offsets_joints_batch(skel_joints) 110 | if len(self._offset.shape) == 2: 111 | offsets = self._offset.expand(quat_params.shape[0], -1, -1) 112 | joints = torch.zeros(quat_params.shape[:-1] + (3,)).to(self.device) 113 | joints[:, 0] = root_pos 114 | for chain in self._kinematic_tree: 115 | if do_root_R: 116 | R = quat_params[:, 0] 117 | else: 118 | R = torch.tensor([[1.0, 0.0, 0.0, 0.0]]).expand(len(quat_params), -1).detach().to(self.device) 119 | for i in range(1, len(chain)): 120 | R = qmul(R, quat_params[:, chain[i]]) 121 | offset_vec = offsets[:, chain[i]] 122 | joints[:, chain[i]] = qrot(R, offset_vec) + joints[:, chain[i-1]] 123 | return joints 124 | 125 | # Be sure root joint is at the beginning of kinematic chains 126 | def forward_kinematics_np(self, quat_params, root_pos, skel_joints=None, do_root_R=True): 127 | # quat_params (batch_size, joints_num, 4) 128 | # joints (batch_size, joints_num, 3) 129 | # root_pos (batch_size, 3) 130 | if skel_joints is not None: 131 | skel_joints = torch.from_numpy(skel_joints) 132 | offsets = self.get_offsets_joints_batch(skel_joints) 133 | if len(self._offset.shape) == 2: 134 | offsets = self._offset.expand(quat_params.shape[0], -1, -1) 135 | offsets = offsets.numpy() 136 | joints = np.zeros(quat_params.shape[:-1] + (3,)) 137 | joints[:, 0] = root_pos 138 | for chain in self._kinematic_tree: 139 | if do_root_R: 140 | R = quat_params[:, 0] 141 | else: 142 | R = np.array([[1.0, 0.0, 0.0, 0.0]]).repeat(len(quat_params), axis=0) 143 | for i in range(1, len(chain)): 144 | R = qmul_np(R, quat_params[:, chain[i]]) 145 | offset_vec = offsets[:, chain[i]] 146 | joints[:, chain[i]] = qrot_np(R, offset_vec) + joints[:, chain[i - 1]] 147 | return joints 148 | 149 | def forward_kinematics_cont6d_np(self, cont6d_params, root_pos, skel_joints=None, do_root_R=True): 150 | # cont6d_params (batch_size, joints_num, 6) 151 | # joints (batch_size, joints_num, 3) 152 | # root_pos (batch_size, 3) 153 | if skel_joints is not None: 154 | skel_joints = torch.from_numpy(skel_joints) 155 | offsets = self.get_offsets_joints_batch(skel_joints) 
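        # If per-sample skeleton joints were given, batched offsets were just computed above;
        # otherwise the cached 2D offset table is broadcast across the batch below, and the
        # offsets are moved to numpy for this numpy-based forward-kinematics path.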
156 | if len(self._offset.shape) == 2: 157 | offsets = self._offset.expand(cont6d_params.shape[0], -1, -1) 158 | offsets = offsets.numpy() 159 | joints = np.zeros(cont6d_params.shape[:-1] + (3,)) 160 | joints[:, 0] = root_pos 161 | for chain in self._kinematic_tree: 162 | if do_root_R: 163 | matR = cont6d_to_matrix_np(cont6d_params[:, 0]) 164 | else: 165 | matR = np.eye(3)[np.newaxis, :].repeat(len(cont6d_params), axis=0) 166 | for i in range(1, len(chain)): 167 | matR = np.matmul(matR, cont6d_to_matrix_np(cont6d_params[:, chain[i]])) 168 | offset_vec = offsets[:, chain[i]][..., np.newaxis] 169 | # print(matR.shape, offset_vec.shape) 170 | joints[:, chain[i]] = np.matmul(matR, offset_vec).squeeze(-1) + joints[:, chain[i-1]] 171 | return joints 172 | 173 | def forward_kinematics_cont6d(self, cont6d_params, root_pos, skel_joints=None, do_root_R=True): 174 | # cont6d_params (batch_size, joints_num, 6) 175 | # joints (batch_size, joints_num, 3) 176 | # root_pos (batch_size, 3) 177 | if skel_joints is not None: 178 | # skel_joints = torch.from_numpy(skel_joints) 179 | offsets = self.get_offsets_joints_batch(skel_joints) 180 | if len(self._offset.shape) == 2: 181 | offsets = self._offset.expand(cont6d_params.shape[0], -1, -1) 182 | joints = torch.zeros(cont6d_params.shape[:-1] + (3,)).to(cont6d_params.device) 183 | joints[..., 0, :] = root_pos 184 | for chain in self._kinematic_tree: 185 | if do_root_R: 186 | matR = cont6d_to_matrix(cont6d_params[:, 0]) 187 | else: 188 | matR = torch.eye(3).expand((len(cont6d_params), -1, -1)).detach().to(cont6d_params.device) 189 | for i in range(1, len(chain)): 190 | matR = torch.matmul(matR, cont6d_to_matrix(cont6d_params[:, chain[i]])) 191 | offset_vec = offsets[:, chain[i]].unsqueeze(-1) 192 | # print(matR.shape, offset_vec.shape) 193 | joints[:, chain[i]] = torch.matmul(matR, offset_vec).squeeze(-1) + joints[:, chain[i-1]] 194 | return joints 195 | 196 | 197 | 198 | 199 | 200 | -------------------------------------------------------------------------------- /text2motion/utils/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | # import cv2 4 | from PIL import Image 5 | from utils import paramUtil 6 | import math 7 | import time 8 | import matplotlib.pyplot as plt 9 | from scipy.ndimage import gaussian_filter 10 | 11 | 12 | def mkdir(path): 13 | if not os.path.exists(path): 14 | os.makedirs(path) 15 | 16 | COLORS = [[255, 0, 0], [255, 85, 0], [255, 170, 0], [255, 255, 0], [170, 255, 0], [85, 255, 0], [0, 255, 0], 17 | [0, 255, 85], [0, 255, 170], [0, 255, 255], [0, 170, 255], [0, 85, 255], [0, 0, 255], [85, 0, 255], 18 | [170, 0, 255], [255, 0, 255], [255, 0, 170], [255, 0, 85]] 19 | 20 | MISSING_VALUE = -1 21 | 22 | def save_image(image_numpy, image_path): 23 | img_pil = Image.fromarray(image_numpy) 24 | img_pil.save(image_path) 25 | 26 | 27 | def save_logfile(log_loss, save_path): 28 | with open(save_path, 'wt') as f: 29 | for k, v in log_loss.items(): 30 | w_line = k 31 | for digit in v: 32 | w_line += ' %.3f' % digit 33 | f.write(w_line + '\n') 34 | 35 | 36 | def print_current_loss(start_time, niter_state, losses, epoch=None, inner_iter=None): 37 | 38 | def as_minutes(s): 39 | m = math.floor(s / 60) 40 | s -= m * 60 41 | return '%dm %ds' % (m, s) 42 | 43 | def time_since(since, percent): 44 | now = time.time() 45 | s = now - since 46 | es = s / percent 47 | rs = es - s 48 | return '%s (- %s)' % (as_minutes(s), as_minutes(rs)) 49 | 50 | if epoch is not None: 51 | 
print('epoch: %3d niter: %6d inner_iter: %4d' % (epoch, niter_state, inner_iter), end=" ") 52 | 53 | now = time.time() 54 | message = '%s'%(as_minutes(now - start_time)) 55 | 56 | for k, v in losses.items(): 57 | message += ' %s: %.4f ' % (k, v) 58 | print(message) 59 | 60 | 61 | def compose_gif_img_list(img_list, fp_out, duration): 62 | img, *imgs = [Image.fromarray(np.array(image)) for image in img_list] 63 | img.save(fp=fp_out, format='GIF', append_images=imgs, optimize=False, 64 | save_all=True, loop=0, duration=duration) 65 | 66 | 67 | def save_images(visuals, image_path): 68 | if not os.path.exists(image_path): 69 | os.makedirs(image_path) 70 | 71 | for i, (label, img_numpy) in enumerate(visuals.items()): 72 | img_name = '%d_%s.jpg' % (i, label) 73 | save_path = os.path.join(image_path, img_name) 74 | save_image(img_numpy, save_path) 75 | 76 | 77 | def save_images_test(visuals, image_path, from_name, to_name): 78 | if not os.path.exists(image_path): 79 | os.makedirs(image_path) 80 | 81 | for i, (label, img_numpy) in enumerate(visuals.items()): 82 | img_name = "%s_%s_%s" % (from_name, to_name, label) 83 | save_path = os.path.join(image_path, img_name) 84 | save_image(img_numpy, save_path) 85 | 86 | 87 | def compose_and_save_img(img_list, save_dir, img_name, col=4, row=1, img_size=(256, 200)): 88 | # print(col, row) 89 | compose_img = compose_image(img_list, col, row, img_size) 90 | if not os.path.exists(save_dir): 91 | os.makedirs(save_dir) 92 | img_path = os.path.join(save_dir, img_name) 93 | # print(img_path) 94 | compose_img.save(img_path) 95 | 96 | 97 | def compose_image(img_list, col, row, img_size): 98 | to_image = Image.new('RGB', (col * img_size[0], row * img_size[1])) 99 | for y in range(0, row): 100 | for x in range(0, col): 101 | from_img = Image.fromarray(img_list[y * col + x]) 102 | # print((x * img_size[0], y*img_size[1], 103 | # (x + 1) * img_size[0], (y + 1) * img_size[1])) 104 | paste_area = (x * img_size[0], y*img_size[1], 105 | (x + 1) * img_size[0], (y + 1) * img_size[1]) 106 | to_image.paste(from_img, paste_area) 107 | # to_image[y*img_size[1]:(y + 1) * img_size[1], x * img_size[0] :(x + 1) * img_size[0]] = from_img 108 | return to_image 109 | 110 | 111 | def list_cut_average(ll, intervals): 112 | if intervals == 1: 113 | return ll 114 | 115 | bins = math.ceil(len(ll) * 1.0 / intervals) 116 | ll_new = [] 117 | for i in range(bins): 118 | l_low = intervals * i 119 | l_high = l_low + intervals 120 | l_high = l_high if l_high < len(ll) else len(ll) 121 | ll_new.append(np.mean(ll[l_low:l_high])) 122 | return ll_new 123 | 124 | 125 | def motion_temporal_filter(motion, sigma=1): 126 | motion = motion.reshape(motion.shape[0], -1) 127 | # print(motion.shape) 128 | for i in range(motion.shape[1]): 129 | motion[:, i] = gaussian_filter(motion[:, i], sigma=sigma, mode="nearest") 130 | return motion.reshape(motion.shape[0], -1, 3) 131 | 132 | -------------------------------------------------------------------------------- /text2motion/utils/word_vectorizer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 3 | from os.path import join as pjoin 4 | 5 | POS_enumerator = { 6 | 'VERB': 0, 7 | 'NOUN': 1, 8 | 'DET': 2, 9 | 'ADP': 3, 10 | 'NUM': 4, 11 | 'AUX': 5, 12 | 'PRON': 6, 13 | 'ADJ': 7, 14 | 'ADV': 8, 15 | 'Loc_VIP': 9, 16 | 'Body_VIP': 10, 17 | 'Obj_VIP': 11, 18 | 'Act_VIP': 12, 19 | 'Desc_VIP': 13, 20 | 'OTHER': 14, 21 | } 22 | 23 | Loc_list = ('left', 'right', 'clockwise', 'counterclockwise', 
'anticlockwise', 'forward', 'back', 'backward', 24 | 'up', 'down', 'straight', 'curve') 25 | 26 | Body_list = ('arm', 'chin', 'foot', 'feet', 'face', 'hand', 'mouth', 'leg', 'waist', 'eye', 'knee', 'shoulder', 'thigh') 27 | 28 | Obj_List = ('stair', 'dumbbell', 'chair', 'window', 'floor', 'car', 'ball', 'handrail', 'baseball', 'basketball') 29 | 30 | Act_list = ('walk', 'run', 'swing', 'pick', 'bring', 'kick', 'put', 'squat', 'throw', 'hop', 'dance', 'jump', 'turn', 31 | 'stumble', 'dance', 'stop', 'sit', 'lift', 'lower', 'raise', 'wash', 'stand', 'kneel', 'stroll', 32 | 'rub', 'bend', 'balance', 'flap', 'jog', 'shuffle', 'lean', 'rotate', 'spin', 'spread', 'climb') 33 | 34 | Desc_list = ('slowly', 'carefully', 'fast', 'careful', 'slow', 'quickly', 'happy', 'angry', 'sad', 'happily', 35 | 'angrily', 'sadly') 36 | 37 | VIP_dict = { 38 | 'Loc_VIP': Loc_list, 39 | 'Body_VIP': Body_list, 40 | 'Obj_VIP': Obj_List, 41 | 'Act_VIP': Act_list, 42 | 'Desc_VIP': Desc_list, 43 | } 44 | 45 | 46 | class WordVectorizer(object): 47 | def __init__(self, meta_root, prefix): 48 | vectors = np.load(pjoin(meta_root, '%s_data.npy'%prefix)) 49 | words = pickle.load(open(pjoin(meta_root, '%s_words.pkl'%prefix), 'rb')) 50 | word2idx = pickle.load(open(pjoin(meta_root, '%s_idx.pkl'%prefix), 'rb')) 51 | self.word2vec = {w: vectors[word2idx[w]] for w in words} 52 | 53 | def _get_pos_ohot(self, pos): 54 | pos_vec = np.zeros(len(POS_enumerator)) 55 | if pos in POS_enumerator: 56 | pos_vec[POS_enumerator[pos]] = 1 57 | else: 58 | pos_vec[POS_enumerator['OTHER']] = 1 59 | return pos_vec 60 | 61 | def __len__(self): 62 | return len(self.word2vec) 63 | 64 | def __getitem__(self, item): 65 | word, pos = item.split('/') 66 | if word in self.word2vec: 67 | word_vec = self.word2vec[word] 68 | vip_pos = None 69 | for key, values in VIP_dict.items(): 70 | if word in values: 71 | vip_pos = key 72 | break 73 | if vip_pos is not None: 74 | pos_vec = self._get_pos_ohot(vip_pos) 75 | else: 76 | pos_vec = self._get_pos_ohot(pos) 77 | else: 78 | word_vec = self.word2vec['unk'] 79 | pos_vec = self._get_pos_ohot('OTHER') 80 | return word_vec, pos_vec --------------------------------------------------------------------------------
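As a closing orientation for the utilities dumped above, here is a minimal sketch (not part of the repository) of how the pieces fit together at inference time: `recover_from_ric` from utils/motion_process.py turns a HumanML3D-style 263-dim feature sequence back into 22 joint positions, and `plot_3d_motion` from utils/plot_script.py renders them along `t2m_kinematic_chain`. The file name, the prompt text, and the assumption that the features are already de-normalized are illustrative only.

    import numpy as np
    import torch

    from utils.motion_process import recover_from_ric
    from utils.paramUtil import t2m_kinematic_chain
    from utils.plot_script import plot_3d_motion

    # Hypothetical input: one generated HumanML3D-style sequence of shape (seq_len, 263),
    # e.g. an element of the list returned by DDPMTrainer.generate(), already de-normalized.
    features = np.load('sample_motion.npy')

    # (seq_len, 263) -> (seq_len, 22, 3) global joint positions.
    joints = recover_from_ric(torch.from_numpy(features).float(), 22).numpy()

    # Render the skeleton animation; HumanML3D motions run at 20 fps.
    plot_3d_motion('sample_motion.mp4', t2m_kinematic_chain, joints,
                   title='a person walks forward', fps=20)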