├── LICENSE ├── README.md ├── data_processing ├── add_noisy_data.sh ├── alt_prosody.py ├── create_vector.py ├── prepare_data.py ├── silence.wav ├── tools.py ├── tools.pyc └── white_noise.wav ├── evaluation ├── README.md ├── calc_distance.py ├── calc_errors.py ├── calc_jerk.py ├── hellinger.py ├── hellinger_one2one.py ├── joints.txt └── plot_results.py ├── example_scripts ├── README.md ├── baseline_test.sh ├── baseline_train_n_test.sh ├── config.txt ├── proposed_test.sh └── proposed_train_n_test.sh ├── helpers ├── README.md ├── apply_filters.py ├── convert_original.py ├── filters │ ├── __pycache__ │ │ ├── ma_filter.cpython-35.pyc │ │ └── one_euro_filter.cpython-35.pyc │ ├── ma_filter.py │ └── one_euro_filter.py └── remove_velocity.py ├── hierarchy.txt ├── motion_repr_learning ├── README.md └── ae │ ├── DAE.py │ ├── decode.py │ ├── encode_dataset.py │ ├── learn_dataset_encoding.py │ ├── train.py │ └── utils │ ├── __init__.py │ ├── data.py │ ├── flags.py │ └── utils.py ├── predict.py ├── requirements.txt ├── train.py └── visuals └── SpeechReprMotion.png /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Aud2Repr2Pose: Analyzing input and output representations for speech-driven gesture generation 2 | [Taras Kucherenko](https://svito-zar.github.io/), [Dai Hasegawa](https://hasegawadai.info/), [Gustav Eje Henter](https://people.kth.se/~ghe/), Naoshi Kaneko, [Hedvig Kjellström](http://www.csc.kth.se/~hedvig/) 3 | 4 | ![ImageOfIdea](visuals/SpeechReprMotion.png?raw=true "Idea") 5 | 6 | This repository contains a Keras- and TensorFlow-based implementation of speech-driven gesture generation by a neural network, which was published at the International Conference on Intelligent Virtual Agents (IVA'19); the extension was published in the International Journal of Human-Computer Interaction in 2021. 7 | 8 | The [project website](https://svito-zar.github.io/audio2gestures/) contains all the information about this project, including a [video](https://youtu.be/Iv7UBe92zrw) explanation of the method and the [paper](https://www.researchgate.net/publication/331645229_Analyzing_Input_and_Output_Representations_for_Speech-Driven_Gesture_Generation). 9 | 10 | ## Demo on another dataset 11 | 12 | This model has been applied to an English dataset. 13 | 14 | The [demo video](https://youtu.be/tQLVyTVtsSU) as well as the [code](https://github.com/Svito-zar/speech-driven-hand-gesture-generation-demo) to run the pre-trained model are online. 15 | 16 | ## Requirements 17 | 18 | - Python 3 19 | 20 | 21 | ## Initial setup 22 | 23 | ### install packages 24 | ```sh 25 | 26 | # if you have a GPU 27 | pip install tensorflow-gpu==1.15.2 28 | 29 | # if you don't have a GPU 30 | pip install tensorflow==1.15.2 31 | 32 | pip install -r requirements.txt 33 | ``` 34 | 35 | ### install ffmpeg 36 | ```sh 37 | # macos 38 | brew install ffmpeg 39 | ``` 40 | 41 | ``` 42 | # ubuntu 43 | sudo add-apt-repository ppa:jonathonf/ffmpeg-4 44 | sudo apt-get update 45 | sudo apt-get install ffmpeg 46 | ``` 47 | 48 | 49 |   50 | ____________________________________________________________________________________________________________ 51 |   52 | 53 | # How to use this repository? 54 | 55 | # 0. Notation 56 | 57 | We write all the parameters which need to be specified by a user in CAPS LOCK. 58 | 59 | ## 1. Download raw data 60 | 61 | - Clone this repository 62 | - Download the dataset from `https://www.dropbox.com/sh/j419kp4m8hkt9nd/AAC_pIcS1b_WFBqUp5ofBG1Ia?dl=0` 63 | - Create a directory named `dataset` and put the two directories `motion/` and `speech/` under `dataset/` 64 | 65 | ## 2. Split dataset 66 | 67 | - Put the folder with the dataset (`dataset/`) in the root directory of this repo, next to the `data_processing` directory containing the script `prepare_data.py` 68 | - Run the following command 69 | 70 | ```sh 71 | python data_processing/prepare_data.py DATA_DIR 72 | # DATA_DIR = directory to save data such as 'data/' 73 | ``` 74 | 75 | Note: DATA_DIR is not the directory where the raw data is stored (the folder with the raw data, "dataset", has to be stored in the root folder of this repo). DATA_DIR is the directory where the processed data should be saved. After this step you don't need to have "dataset" in the root folder any more. 76 | You should use the same DATA_DIR in all the following scripts.
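For reference, here is a rough sketch of the expected layout after steps 1 and 2. File names follow the `audio<id>.wav` / `gesture<id>.bvh` convention used by `prepare_data.py`; the exact ids and the `data/` name are only examples:

```
<repo root>
├── dataset/          # raw data downloaded in step 1
│   ├── speech/       # audio20.wav, audio21.wav, ...
│   └── motion/       # gesture20.bvh, gesture21.bvh, ...
└── data/             # example DATA_DIR with the processed data created in step 2
    ├── train/
    ├── test/
    └── dev/
```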
77 | 78 | After this command: 79 | - `train/` `test/` `dev/` are created under `DATA_DIR/` 80 | - in `inputs/` inside each directory, audio(id).wav files are stored 81 | - in `labels/` inside each directory, gesture(id).bvh files are stored 82 | - under `DATA_DIR/`, three csv files `gg-train.csv` `gg-test.csv` `gg-dev.csv` are created; these files contain the paths to the actual data 83 | 84 | 85 | ## 3. Convert the dataset into vectors 86 | 87 | ```sh 88 | python data_processing/create_vector.py DATA_DIR N_CONTEXT 89 | # N_CONTEXT = length of the context window; in our experiments it was set to '60' 90 | # (this means 30 steps backwards and 30 steps forwards) 91 | ``` 92 | 93 | Note: if you change the N_CONTEXT value, you need to update it in the `train.py` script as well. 94 | 95 | (You are likely to get a warning like this: "WARNING:root:frame length (5513) is greater than FFT size (512), frame will be truncated. Increase NFFT to avoid.") 96 | 97 | As a result of running this script: 98 | - numpy binary files `X_train.npy`, `Y_train.npy` (the vectorized dataset) are created under `DATA_DIR` 99 | - under `DATA_DIR/test_inputs/`, test audios, such as `X_test_audio1168.npy`, are created 100 | - when N_CONTEXT = 60, the audio vector's shape is (num of timesteps, 61, 26) 101 | - the gesture vector's shape is (num of timesteps, 384) 102 | - 384 = 64 joints × (x, y, z positions + x, y, z velocities) 103 | 104 | ## If you don't want to customize anything, you can skip reading about steps 4-7 and just use the already prepared scripts in the folder `example_scripts` 105 |   106 | 107 | ## 4. Learn motion representation by AutoEncoder 108 | 109 | Create a directory to save training checkpoints, such as `chkpt/`, and use it as the CHKPT_DIR parameter. 110 | #### Learn dataset encoding 111 | ```sh 112 | python motion_repr_learning/ae/learn_dataset_encoding.py DATA_DIR -chkpt_dir=CHKPT_DIR -layer1_width=DIM 113 | ``` 114 | 115 | The optimal dimensionality (DIM) in our experiments was 325. 116 | 117 | #### Encode dataset 118 | Create the `DATA_DIR/DIM` directory 119 | ```sh 120 | python motion_repr_learning/ae/encode_dataset.py DATA_DIR -chkpt_dir=CHKPT_DIR -restore=True -pretrain=False -layer1_width=DIM 121 | ``` 122 | 123 | More information can be found in the folder `motion_repr_learning`. 124 | 125 | 126 | ## 5. Learn speech-driven gesture generation model 127 | 128 | ```sh 129 | python train.py MODEL_NAME EPOCHS DATA_DIR N_INPUT ENCODE DIM 130 | # MODEL_NAME = hdf5 file name, such as 'model_500ep_posvel_60.hdf5' 131 | # EPOCHS = how many epochs we want to train the model for (recommended: 100) 132 | # DATA_DIR = directory with the data (should be the same as above) 133 | # N_INPUT = how many dimensions the speech data has (default: 26) 134 | # ENCODE = whether we train on the encoded gestures (the proposed model) or on the gestures as they are (the baseline model) 135 | # DIM = how many dimensions the encoding has (ignored if you don't encode) 136 | ``` 137 | 138 | ## 6. 
Predict gesture 139 | 140 | ```sh 141 | python predict.py MODEL_NAME INPUT_SPEECH_FILE OUTPUT_GESTURE_FILE 142 | ``` 143 | 144 | ```sh 145 | # Usage example 146 | python predict.py model.hdf5 data/test_inputs/X_test_audio1168.npy data/test_inputs/predict_1168_20fps.txt 147 | ``` 148 | 149 | ```sh 150 | # If you trained on encoded gestures (the proposed model), you need to decode the predictions 151 | python motion_repr_learning/ae/decode.py DATA_DIR ENCODED_PREDICTION_FILE DECODED_GESTURE_FILE -restore=True -pretrain=False -layer1_width=DIM -chkpt_dir=CHKPT_DIR -batch_size=8 152 | ``` 153 | 154 | 155 | Note: This can be used in a for loop over all the test sequences. Complete examples are provided in the 156 | `example_scripts` folder of this repository, and a minimal sketch of such a loop is given in the appendix at the end of this README. 157 | 158 | ```sh 159 | # The network produces both coordinates and velocities, 160 | # so we need to remove the velocities 161 | python helpers/remove_velocity.py -g PATH_TO_GESTURES 162 | ``` 163 | 164 | ## 7. Quantitative evaluation 165 | Use the scripts in the `evaluation` folder of this repository. 166 | 167 | Examples are provided in the `example_scripts` folder of this repository. 168 | 169 | ## 8. Qualitative evaluation 170 | Use the [animation server](https://secret-meadow-14164.herokuapp.com/coordinates.html) 171 | 172 |   173 | 174 | ## Citation 175 | If you use this code in your research, please cite the paper: 176 | ``` 177 | @article{kucherenko2021moving, 178 | title={Moving fast and slow: Analysis of representations and post-processing in speech-driven automatic gesture generation}, 179 | author={Kucherenko, Taras and Hasegawa, Dai and Kaneko, Naoshi and Henter, Gustav Eje and Kjellstr{\"o}m, Hedvig}, 180 | journal={International Journal of Human–Computer Interaction}, 181 | doi={10.1080/10447318.2021.1883883}, 182 | year={2021} 183 | } 184 | ``` 185 | 186 | ## Contact 187 | If you encounter any problems/bugs/issues, please contact me on Github or by emailing me at tarask@kth.se. I prefer questions and bug reports on Github, as that provides visibility to others who might be encountering the same issues or have the same questions.
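## Appendix: looping prediction over the test set

Below is a minimal sketch of how step 6 can be wrapped in a loop over all test sequences when using the proposed (encoded) model. It is only a sketch: the model file `model.hdf5`, the output folder `results/`, the checkpoint directory `chkpt/`, `DATA_DIR=data` and `DIM=325` are assumptions to be adapted to your setup, and whether `remove_velocity.py -g` accepts a directory should be checked against `helpers/README.md`. The tested versions of this loop live in `example_scripts/`.

```sh
# Sketch only: predict and decode every test sequence (assumed paths, see above)
mkdir -p results

for f in data/test_inputs/X_test_*.npy; do
    name=$(basename "$f" .npy)            # e.g. X_test_audio1168
    python predict.py model.hdf5 "$f" "results/${name}_encoded.txt"
    python motion_repr_learning/ae/decode.py data "results/${name}_encoded.txt" \
        "results/${name}.txt" -restore=True -pretrain=False \
        -layer1_width=325 -chkpt_dir=chkpt -batch_size=8
done

# The network outputs both coordinates and velocities;
# strip the velocities from the decoded gestures (see helpers/README.md)
python helpers/remove_velocity.py -g results
```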
188 | -------------------------------------------------------------------------------- /data_processing/add_noisy_data.sh: -------------------------------------------------------------------------------- 1 | # This script will add noise to the audio file in order to augment the dataset 2 | # It is used in the script "prepare_data.py" 3 | 4 | data=$4 5 | for i in `seq ${2} ${3}`; 6 | do 7 | echo "${i}" 8 | if [ -e ${data}/${1}/inputs/audio${i}.wav ] 9 | then 10 | sox ${data}/${1}/inputs/audio${i}.wav -p synth whitenoise vol 0.01 | sox -m ${data}/${1}/inputs/audio${i}.wav - ${data}/${1}/inputs/naudio${i}.wav 11 | echo "naudio generated in ${1} for id ${i}" 12 | else 13 | echo "could not generate noisy audio, because original audio at ${data} was not found" 14 | fi 15 | done 16 | -------------------------------------------------------------------------------- /data_processing/alt_prosody.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Alternative calculation of prosodic features 3 | """ 4 | Created on Tue Jan 15 18:45:34 2019 5 | 6 | @author: kaneko.naoshi 7 | """ 8 | 9 | import numpy as np 10 | import parselmouth as pm 11 | 12 | 13 | def compute_prosody(audio_filename, time_step=0.05): 14 | audio = pm.Sound(audio_filename) 15 | 16 | # Extract pitch and intensity 17 | pitch = audio.to_pitch(time_step=time_step) 18 | intensity = audio.to_intensity(time_step=time_step) 19 | 20 | # Evenly spaced time steps 21 | times = np.arange(0, audio.get_total_duration() - time_step, time_step) 22 | 23 | # Compute prosodic features at each time step 24 | pitch_values = np.nan_to_num( 25 | np.asarray([pitch.get_value_at_time(t) for t in times])) 26 | intensity_values = np.nan_to_num( 27 | np.asarray([intensity.get_value(t) for t in times])) 28 | 29 | intensity_values = np.clip( 30 | intensity_values, np.finfo(intensity_values.dtype).eps, None) 31 | 32 | # Normalize features [Chiu '11] 33 | pitch_norm = np.clip(np.log(pitch_values + 1) - 4, 0, None) 34 | intensity_norm = np.clip(np.log(intensity_values) - 3, 0, None) 35 | 36 | return pitch_norm, intensity_norm -------------------------------------------------------------------------------- /data_processing/create_vector.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script does preprocessing of the dataset specified in DATA_DIR 3 | and stores it in the same folder as .npy files 4 | It should be used before training, as described in the README.md 5 | 6 | @author: Taras Kucherenko 7 | """ 8 | 9 | import os 10 | import sys 11 | 12 | import pyquaternion as pyq 13 | 14 | from tools import * 15 | 16 | N_OUTPUT = 384 # Number of gesture features (position) 17 | WINDOW_LENGTH = 50 # in miliseconds 18 | FEATURES = "MFCC" 19 | 20 | if FEATURES == "MFCC": 21 | N_INPUT = 26 # Number of MFCC features 22 | if FEATURES == "Pros": 23 | N_INPUT = 4 # Number of prosodic features 24 | if FEATURES == "MFCC+Pros": 25 | N_INPUT = 30 # Total number of features 26 | if FEATURES == "Spectro": 27 | N_INPUT = 64 # Number of spectrogram features 28 | if FEATURES == "Spectro+Pros": 29 | N_INPUT = 68 # Total number of eatures 30 | if FEATURES == "MFCC+Spectro": 31 | N_INPUT = 90 # Total number of eatures 32 | if FEATURES == "MFCC+Spectro+Pros": 33 | N_INPUT = 94 # Total number of eatures 34 | 35 | 36 | def pad_sequence(input_vectors): 37 | """ 38 | Pad array of features in order to be able to take context at each time-frame 39 | We pad N_CONTEXT / 2 frames before 
and after the signal by the features of the silence 40 | Args: 41 | input_vectors: feature vectors for an audio 42 | 43 | Returns: 44 | new_input_vectors: padded feature vectors 45 | """ 46 | 47 | if FEATURES == "MFCC": 48 | 49 | # Pad sequence not with zeros but with MFCC of the silence 50 | 51 | silence_vectors = calculate_mfcc("data_processing/silence.wav") 52 | mfcc_empty_vector = silence_vectors[0] 53 | 54 | empty_vectors = np.array([mfcc_empty_vector] * int(N_CONTEXT / 2)) 55 | 56 | if FEATURES == "Pros": 57 | 58 | # Pad sequence with zeros 59 | 60 | prosodic_empty_vector =[0, 0, 0, 0] 61 | 62 | empty_vectors = np.array([prosodic_empty_vector] * int(N_CONTEXT / 2)) 63 | 64 | if FEATURES == "MFCC+Pros": 65 | 66 | silence_vectors = calculate_mfcc("data_processing/silence.wav") 67 | mfcc_empty_vector = silence_vectors[0] 68 | 69 | prosodic_empty_vector = [0, 0, 0, 0] 70 | 71 | combined_empty_vector = np.concatenate((mfcc_empty_vector, prosodic_empty_vector)) 72 | 73 | empty_vectors = np.array([combined_empty_vector] * int(N_CONTEXT / 2)) 74 | 75 | if FEATURES == "Spectro": 76 | 77 | silence_spectro = calculate_spectrogram("data_processing/silence.wav") 78 | spectro_empty_vector = silence_spectro[0] 79 | 80 | empty_vectors = np.array([spectro_empty_vector] * int(N_CONTEXT / 2)) 81 | 82 | if FEATURES == "Spectro+Pros": 83 | 84 | silence_spectro = calculate_spectrogram("data_processing/silence.wav") 85 | spectro_empty_vector = silence_spectro[0] 86 | 87 | prosodic_empty_vector = [0, 0, 0, 0] 88 | 89 | combined_empty_vector = np.concatenate((spectro_empty_vector, prosodic_empty_vector)) 90 | 91 | empty_vectors = np.array([combined_empty_vector] * int(N_CONTEXT / 2)) 92 | 93 | if FEATURES == "MFCC+Spectro": 94 | 95 | silence_spectro = calculate_spectrogram("data_processing/silence.wav") 96 | spectro_empty_vector = silence_spectro[0] 97 | 98 | silence_vectors = calculate_mfcc("data_processing/silence.wav") 99 | mfcc_empty_vector = silence_vectors[0] 100 | 101 | combined_empty_vector = np.concatenate((mfcc_empty_vector, spectro_empty_vector,)) 102 | 103 | empty_vectors = np.array([combined_empty_vector] * int(N_CONTEXT / 2)) 104 | 105 | if FEATURES == "MFCC+Spectro+Pros": 106 | 107 | silence_spectro = calculate_spectrogram("data_processing/silence.wav") 108 | spectro_empty_vector = silence_spectro[0] 109 | 110 | silence_vectors = calculate_mfcc("data_processing/silence.wav") 111 | mfcc_empty_vector = silence_vectors[0] 112 | 113 | prosodic_empty_vector = [0, 0, 0, 0] 114 | 115 | combined_empty_vector = np.concatenate((mfcc_empty_vector, spectro_empty_vector, prosodic_empty_vector)) 116 | 117 | empty_vectors = np.array([combined_empty_vector] * int(N_CONTEXT / 2)) 118 | 119 | # append N_CONTEXT/2 "empty" mfcc vectors to past 120 | new_input_vectors = np.append(empty_vectors, input_vectors, axis=0) 121 | # append N_CONTEXT/2 "empty" mfcc vectors to future 122 | new_input_vectors = np.append(new_input_vectors, empty_vectors, axis=0) 123 | 124 | return new_input_vectors 125 | 126 | def create_vectors(audio_filename, gesture_filename, nodes): 127 | """ 128 | Extract features from a given pair of audio and motion files 129 | Args: 130 | audio_filename: file name for an audio file (.wav) 131 | gesture_filename: file name for a motion file (.bvh) 132 | nodes: an array of markers for the motion 133 | 134 | Returns: 135 | input_with_context : speech features 136 | output_with_context : motion features 137 | """ 138 | # Step 1: Vactorizing speech, with features of N_INPUT dimension, time steps of 0.01s 
139 | # and window length with 0.025s => results in an array of 100 x N_INPUT 140 | 141 | if FEATURES == "MFCC": 142 | 143 | input_vectors = calculate_mfcc(audio_filename) 144 | 145 | if FEATURES == "Pros": 146 | 147 | input_vectors = extract_prosodic_features(audio_filename) 148 | 149 | if FEATURES == "MFCC+Pros": 150 | 151 | mfcc_vectors = calculate_mfcc(audio_filename) 152 | 153 | pros_vectors = extract_prosodic_features(audio_filename) 154 | 155 | mfcc_vectors, pros_vectors = shorten(mfcc_vectors, pros_vectors) 156 | 157 | input_vectors = np.concatenate((mfcc_vectors, pros_vectors), axis=1) 158 | 159 | if FEATURES =="Spectro": 160 | 161 | input_vectors = calculate_spectrogram(audio_filename) 162 | 163 | if FEATURES == "Spectro+Pros": 164 | spectr_vectors = calculate_spectrogram(audio_filename) 165 | 166 | pros_vectors = extract_prosodic_features(audio_filename) 167 | 168 | spectr_vectors, pros_vectors = shorten(spectr_vectors, pros_vectors) 169 | 170 | input_vectors = np.concatenate((spectr_vectors, pros_vectors), axis=1) 171 | 172 | if FEATURES == "MFCC+Spectro": 173 | 174 | spectr_vectors = calculate_spectrogram(audio_filename) 175 | 176 | mfcc_vectors = calculate_mfcc(audio_filename) 177 | 178 | spectr_vectors, mfcc_vectors = shorten(spectr_vectors, mfcc_vectors) 179 | 180 | input_vectors = np.concatenate((mfcc_vectors,spectr_vectors), axis=1) 181 | 182 | if FEATURES == "MFCC+Spectro+Pros": 183 | 184 | spectr_vectors = calculate_spectrogram(audio_filename) 185 | 186 | mfcc_vectors = calculate_mfcc(audio_filename) 187 | 188 | pros_vectors = extract_prosodic_features(audio_filename) 189 | 190 | spectr_vectors, mfcc_vectors, pros_vectors = shorten3(spectr_vectors, mfcc_vectors, pros_vectors) 191 | 192 | input_vectors = np.concatenate((mfcc_vectors,spectr_vectors, pros_vectors), axis=1) 193 | 194 | # Step 2: Read motions 195 | 196 | motion_format = "bvh" 197 | 198 | if motion_format == "npz": 199 | ges_str = np.load(gesture_filename) 200 | output_vectors = ges_str['clips'] 201 | 202 | # Subsample motion (from 60 fsp to 20 fsp) 203 | output_vectors = output_vectors[0::3] 204 | 205 | 206 | elif motion_format == "bvh": 207 | f = open(gesture_filename, 'r') 208 | org = f.readlines() 209 | frametime = org[310].split() 210 | 211 | del org[0:311] 212 | 213 | bvh_len = len(org) 214 | 215 | for idx, line in enumerate(org): 216 | org[idx] = [float(x) for x in line.split()] 217 | 218 | for i in range(0, bvh_len): 219 | for j in range(0, int(306 / 3)): 220 | st = j * 3 221 | del org[i][st:st + 3] 222 | 223 | # if data is 100fps, cut it to 20 fps (every fifth line) 224 | # if data is approx 24fps, cut it to 20 fps (del every sixth line) 225 | if float(frametime[2]) == 0.0416667: 226 | del org[::6] 227 | elif float(frametime[2]) == 0.010000: 228 | org = org[::5] 229 | else: 230 | print("smth wrong with fps of " + gesture_filename) 231 | 232 | output_vectors = rot_vec_to_abs_pos_vec(org, nodes) 233 | 234 | f.close() 235 | 236 | # Step 3: Align vector length 237 | input_vectors, output_vectors = shorten(input_vectors, output_vectors) 238 | 239 | # Step 4: Retrieve N_CONTEXT each time, stride one by one 240 | input_with_context = np.array([]) 241 | output_with_context = np.array([]) 242 | 243 | strides = len(input_vectors) 244 | 245 | input_vectors = pad_sequence(input_vectors) 246 | 247 | for i in range(strides): 248 | stride = i + int(N_CONTEXT/2) 249 | if i == 0: 250 | input_with_context = input_vectors[stride - int(N_CONTEXT/2) : stride + int(N_CONTEXT/2) + 1].reshape(1, N_CONTEXT+1, N_INPUT) 251 | 
output_with_context = output_vectors[i].reshape(1, N_OUTPUT) 252 | else: 253 | input_with_context = np.append(input_with_context, input_vectors[stride - int(N_CONTEXT/2) : stride + int(N_CONTEXT/2) + 1].reshape(1, N_CONTEXT+1, N_INPUT), axis=0) 254 | output_with_context = np.append(output_with_context, output_vectors[i].reshape(1, N_OUTPUT), axis=0) 255 | 256 | return input_with_context, output_with_context 257 | 258 | 259 | def create_hierarchy_nodes(hierarchy): 260 | """ 261 | Create hierarchy nodes: an array of markers used in the motion capture 262 | Args: 263 | hierarchy: bvh file read in a structure 264 | 265 | Returns: 266 | nodes: array of markers to be used in motion processing 267 | 268 | """ 269 | joint_offsets = [] 270 | joint_names = [] 271 | 272 | for idx, line in enumerate(hierarchy): 273 | hierarchy[idx] = hierarchy[idx].split() 274 | if not len(hierarchy[idx]) == 0: 275 | line_type = hierarchy[idx][0] 276 | if line_type == 'OFFSET': 277 | offset = np.array([float(hierarchy[idx][1]), float(hierarchy[idx][2]), float(hierarchy[idx][3])]) 278 | joint_offsets.append(offset) 279 | elif line_type == 'ROOT' or line_type == 'JOINT': 280 | joint_names.append(hierarchy[idx][1]) 281 | elif line_type == 'End': 282 | joint_names.append('End Site') 283 | 284 | nodes = [] 285 | for idx, name in enumerate(joint_names): 286 | if idx == 0: 287 | parent = None 288 | elif idx in [6, 30]: #spine1->shoulders 289 | parent = 2 290 | elif idx in [14, 18, 22, 26]: #lefthand->leftfingers 291 | parent = 9 292 | elif idx in [38, 42, 46, 50]: #righthand->rightfingers 293 | parent = 33 294 | elif idx in [54, 59]: #hip->legs 295 | parent = 0 296 | else: 297 | parent = idx - 1 298 | 299 | if name == 'End Site': 300 | children = None 301 | elif idx == 0: #hips 302 | children = [1, 54, 59] 303 | elif idx == 2: #spine1 304 | children = [3, 6, 30] 305 | elif idx == 9: #lefthand 306 | children = [10, 14, 18, 22, 26] 307 | elif idx == 33: #righthand 308 | children = [34, 38, 42, 46, 50] 309 | else: 310 | children = [idx + 1] 311 | 312 | node = dict([('name', name), ('parent', parent), ('children', children), ('offset', joint_offsets[idx]), ('rel_degs', None), ('abs_qt', None), ('rel_pos', None), ('abs_pos', None)]) 313 | if idx == 0: 314 | node['rel_pos'] = node['abs_pos'] = [float(0), float(60), float(0)] 315 | node['abs_qt'] = pyq.Quaternion() 316 | nodes.append(node) 317 | 318 | return nodes 319 | 320 | 321 | def rot_vec_to_abs_pos_vec(frames, nodes): 322 | """ 323 | Transform vectors of the human motion from the joint angles to the absolute positions 324 | Args: 325 | frames: human motion in the join angles space 326 | nodes: set of markers used in motion caption 327 | 328 | Returns: 329 | output_vectors : 3d coordinates of this human motion 330 | """ 331 | output_lines = [] 332 | 333 | for frame in frames: 334 | node_idx = 0 335 | for i in range(51): #changed from 51 336 | stepi = i*3 337 | z_deg = float(frame[stepi]) 338 | x_deg = float(frame[stepi+1]) 339 | y_deg = float(frame[stepi+2]) 340 | 341 | if nodes[node_idx]['name'] == 'End Site': 342 | node_idx = node_idx + 1 343 | nodes[node_idx]['rel_degs'] = [z_deg, x_deg, y_deg] 344 | current_node = nodes[node_idx] 345 | 346 | node_idx = node_idx + 1 347 | 348 | for start_node in nodes: 349 | abs_pos = np.array([0, 60, 0]) 350 | current_node = start_node 351 | if start_node['children'] is not None: #= if not start_node['name'] = 'end site' 352 | for child_idx in start_node['children']: 353 | child_node = nodes[child_idx] 354 | 355 | child_offset = 
np.array(child_node['offset']) 356 | qz = pyq.Quaternion(axis=[0, 0, 1], degrees=start_node['rel_degs'][0]) 357 | qx = pyq.Quaternion(axis=[1, 0, 0], degrees=start_node['rel_degs'][1]) 358 | qy = pyq.Quaternion(axis=[0, 1, 0], degrees=start_node['rel_degs'][2]) 359 | qrot = qz * qx * qy 360 | offset_rotated = qrot.rotate(child_offset) 361 | child_node['rel_pos']= start_node['abs_qt'].rotate(offset_rotated) 362 | 363 | child_node['abs_qt'] = start_node['abs_qt'] * qrot 364 | 365 | while current_node['parent'] is not None: 366 | 367 | abs_pos = abs_pos + current_node['rel_pos'] 368 | current_node = nodes[current_node['parent']] 369 | start_node['abs_pos'] = abs_pos 370 | 371 | line = [] 372 | for node in nodes: 373 | line.append(node['abs_pos']) 374 | output_lines.append(line) 375 | 376 | output_vels = [] 377 | for idx, line in enumerate(output_lines): 378 | vel_line = [] 379 | for jn, joint_pos in enumerate(line): 380 | if idx == 0: 381 | vels = np.array([0.0, 0.0, 0.0]) 382 | else: 383 | vels = np.array([joint_pos[0] - output_lines[idx-1][jn][0], joint_pos[1] - output_lines[idx-1][jn][1], joint_pos[2] - output_lines[idx-1][jn][2]]) 384 | vel_line.append(vels) 385 | output_vels.append(vel_line) 386 | 387 | out = [] 388 | for idx, line in enumerate(output_vels): 389 | ln = [] 390 | for jn, joint_vel in enumerate(line): 391 | ln.append(output_lines[idx][jn]) 392 | ln.append(joint_vel) 393 | out.append(ln) 394 | 395 | output_array = np.asarray(out) 396 | output_vectors = np.empty([len(output_array), N_OUTPUT]) 397 | for idx, line in enumerate(output_array): 398 | output_vectors[idx] = line.flatten() 399 | return output_vectors 400 | 401 | 402 | def create(name, nodes): 403 | """ 404 | Create a dataset 405 | Args: 406 | name: dataset: 'train' or 'test' or 'dev 407 | nodes: markers used in motion caption 408 | 409 | Returns: 410 | nothing: saves numpy arrays of the features and labels as .npy files 411 | 412 | """ 413 | DATA_FILE = pd.read_csv(DATA_DIR + '/gg-' + str(name) + '.csv') 414 | X = np.array([]) 415 | Y = np.array([]) 416 | 417 | for i in range(len(DATA_FILE)): 418 | input_vectors, output_vectors = create_vectors(DATA_FILE['wav_filename'][i], DATA_FILE['bvh_filename'][i], nodes) 419 | 420 | if len(X) == 0: 421 | X = input_vectors 422 | Y = output_vectors 423 | else: 424 | X = np.concatenate((X, input_vectors), axis=0) 425 | Y = np.concatenate((Y, output_vectors), axis=0) 426 | 427 | if i%3==0: 428 | print("^^^^^^^^^^^^^^^^^^") 429 | print('{:.2f}% of processing for {:.8} dataset is done'.format(100.0 * (i+1) / len(DATA_FILE), str(name))) 430 | print("Current dataset sizes are:") 431 | print(X.shape) 432 | print(Y.shape) 433 | 434 | x_file_name = DATA_DIR + '/X_' + str(name) + '.npy' 435 | y_file_name = DATA_DIR + '/Y_' + str(name) + '.npy' 436 | np.save(x_file_name, X) 437 | np.save(y_file_name, Y) 438 | 439 | 440 | def create_test_sequences(nodes, dataset): 441 | """ 442 | Create test sequences 443 | Args: 444 | nodes: markers used in motion caption 445 | dataset: dataset name ('train', 'test' or 'dev') 446 | 447 | Returns: 448 | nothing, saves dataset into .npy file 449 | 450 | """ 451 | DATA_FILE = pd.read_csv(DATA_DIR + '/gg-'+dataset+'.csv') 452 | 453 | for i in range(len(DATA_FILE)): 454 | input_vectors, output_vectors = create_vectors(DATA_FILE['wav_filename'][i], DATA_FILE['bvh_filename'][i], nodes) 455 | 456 | array = DATA_FILE['wav_filename'][i].split("/") 457 | name = array[len(array)-1].split(".")[0] 458 | 459 | X = input_vectors 460 | 461 | if not os.path.isdir(DATA_DIR + 
'/'+dataset+'_inputs'): 462 | os.makedirs(DATA_DIR + '/'+dataset+'_inputs') 463 | 464 | x_file_name = DATA_DIR + '/'+dataset+'_inputs/X_test_' + name + '.npy' 465 | 466 | np.save(x_file_name, X) 467 | 468 | 469 | if __name__ == "__main__": 470 | 471 | # Check if script get enough parameters 472 | if len(sys.argv) < 3: 473 | raise ValueError('Not enough paramters! \nUsage : python ' + sys.argv[0].split("/")[-1] + ' DATA_DIR N_CONTEXT') 474 | 475 | # Check if the dataset exists 476 | if not os.path.exists(sys.argv[1]): 477 | raise ValueError( 478 | 'Path to the dataset ({}) does not exist!\nPlease, provide correct DATA_DIR as a script parameter' 479 | ''.format(sys.argv[1])) 480 | 481 | DATA_DIR = sys.argv[1] 482 | N_CONTEXT = int(sys.argv[2]) 483 | f = open('hierarchy.txt', 'r') 484 | hierarchy = f.readlines() 485 | f.close() 486 | nodes = create_hierarchy_nodes(hierarchy) 487 | 488 | create_test_sequences(nodes, 'test') 489 | create('test', nodes) 490 | create('dev', nodes) 491 | create('train', nodes) 492 | -------------------------------------------------------------------------------- /data_processing/prepare_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script is used to split the dataset into train, test and dev 3 | More info on its usage is given in the READ.me file 4 | 5 | @author: Taras Kucherenko 6 | """ 7 | 8 | import sys 9 | import os 10 | import shutil 11 | import pandas 12 | from os import path 13 | 14 | sys.path.insert(1, os.path.join(sys.path[0], '..')) 15 | 16 | NUM_OF_TEST = 90 17 | FIRST_DATA_ID = 20 18 | LAST_DATA_ID = 1182 19 | 20 | AUGMENT = True 21 | 22 | 23 | def _split_and_format_data(data_dir): 24 | 25 | if not os.path.isdir(data_dir): 26 | os.makedirs(data_dir) 27 | _download_datasets(data_dir) 28 | 29 | 30 | def _download_datasets(data_dir): 31 | 32 | _create_dir(data_dir) 33 | 34 | # prepare training data (including validation data) 35 | for i in range (FIRST_DATA_ID, LAST_DATA_ID - NUM_OF_TEST): 36 | filename = "audio" + str(i) + ".wav" 37 | original_file_path = path.join("dataset/speech/" + filename) 38 | if os.path.exists(original_file_path): 39 | target_file_path = path.join(data_dir + "train/inputs/" + filename) 40 | print(target_file_path) 41 | shutil.copy(original_file_path, target_file_path) 42 | else: 43 | print(original_file_path + " does not exist") 44 | filename = "gesture" + str(i) + ".bvh" 45 | original_file_path = path.join("dataset/motion/" + filename) 46 | if os.path.exists(original_file_path): 47 | target_file_path = path.join(data_dir + "train/labels/" + filename) 48 | print(target_file_path) 49 | shutil.copy(original_file_path, target_file_path) 50 | else: 51 | print(original_file_path + " does not exist") 52 | 53 | # prepare test data 54 | for i in range(LAST_DATA_ID - NUM_OF_TEST, LAST_DATA_ID + 1,2): 55 | filename = "audio" + str(i) + ".wav" 56 | original_file_path = path.join("dataset/speech/" + filename) 57 | if os.path.exists(original_file_path): 58 | target_file_path = path.join(data_dir + "test/inputs/" + filename) 59 | print(target_file_path) 60 | shutil.copy(original_file_path, target_file_path) 61 | else: 62 | print(original_file_path + " does not exist") 63 | filename = "gesture" + str(i) + ".bvh" 64 | original_file_path = path.join("dataset/motion/" + filename) 65 | if os.path.exists(original_file_path): 66 | target_file_path = path.join(data_dir + "test/labels/" + filename) 67 | print(target_file_path) 68 | shutil.copy(original_file_path, target_file_path) 69 | else: 
70 | print(original_file_path + " does not exist") 71 | 72 | # prepare dev data (does not affect results of training at all) 73 | for i in range(LAST_DATA_ID - NUM_OF_TEST + 1, LAST_DATA_ID + 1, 2): 74 | filename = "audio" + str(i) + ".wav" 75 | original_file_path = path.join("dataset/speech/" + filename) 76 | if os.path.exists(original_file_path): 77 | target_file_path = path.join(data_dir + "dev/inputs/" + filename) 78 | print(target_file_path) 79 | shutil.copy(original_file_path, target_file_path) 80 | else: 81 | print(original_file_path + " does not exist") 82 | filename = "gesture" + str(i) + ".bvh" 83 | original_file_path = path.join("dataset/motion/" + filename) 84 | if os.path.exists(original_file_path): 85 | target_file_path = path.join(data_dir + "dev/labels/" + filename) 86 | print(target_file_path) 87 | shutil.copy(original_file_path, target_file_path) 88 | else: 89 | print(original_file_path + " does not exist") 90 | 91 | # data augmentation 92 | if AUGMENT: 93 | os.system('./data_processing/add_noisy_data.sh {0} {1} {2} {3}'.format("train", FIRST_DATA_ID, LAST_DATA_ID-NUM_OF_TEST, data_dir)) 94 | 95 | extracted_dir = path.join(data_dir) 96 | 97 | dev_files, train_files, test_files = _format_datasets(extracted_dir) 98 | 99 | dev_files.to_csv(path.join(extracted_dir, "gg-dev.csv"), index=False) 100 | train_files.to_csv(path.join(extracted_dir, "gg-train.csv"), index=False) 101 | test_files.to_csv(path.join(extracted_dir, "gg-test.csv"), index=False) 102 | 103 | 104 | def _create_dir(data_dir): 105 | 106 | dir_names = ["train", "test", "dev"] 107 | sub_dir_names = ["inputs", "labels"] 108 | 109 | # create ../data_dir/[train, test, dev]/[inputs, labels] 110 | for dir_name in dir_names: 111 | dir_path = path.join(data_dir, dir_name) 112 | print(dir_path) 113 | if not os.path.isdir(dir_path): 114 | os.makedirs(dir_path) # ../data/train 115 | 116 | for sub_dir_name in sub_dir_names: 117 | dir_path = path.join(data_dir, dir_name, sub_dir_name) 118 | print(dir_path) 119 | if not os.path.isdir(dir_path): 120 | os.makedirs(dir_path) 121 | 122 | 123 | def _format_datasets(extracted_dir): 124 | train_files = _files_to_pandas_dataframe(extracted_dir, "train", range(FIRST_DATA_ID, LAST_DATA_ID - NUM_OF_TEST)) 125 | test_files = _files_to_pandas_dataframe(extracted_dir, "test", range(LAST_DATA_ID - NUM_OF_TEST, LAST_DATA_ID + 1, 2)) 126 | dev_files = _files_to_pandas_dataframe(extracted_dir, "dev", range(LAST_DATA_ID - NUM_OF_TEST+1, LAST_DATA_ID + 1,2)) 127 | 128 | return dev_files, train_files, test_files 129 | 130 | 131 | def _files_to_pandas_dataframe(extracted_dir, set_name, idx_range): 132 | files = [] 133 | for idx in idx_range: 134 | # original files 135 | try: 136 | input_file = path.abspath(path.join(extracted_dir, set_name, "inputs", "audio" + str(idx) + ".wav")) 137 | except OSError: 138 | continue 139 | try: 140 | label_file = path.abspath(path.join(extracted_dir, set_name, "labels", "gesture" + str(idx) + ".bvh")) 141 | except OSError: 142 | continue 143 | try: 144 | wav_size = path.getsize(input_file) 145 | except OSError: 146 | continue 147 | 148 | files.append((input_file, wav_size, label_file)) 149 | 150 | # noisy files 151 | try: 152 | noisy_input_file = path.abspath(path.join(extracted_dir, set_name, "inputs", "naudio" + str(idx) + ".wav")) 153 | except OSError: 154 | continue 155 | try: 156 | noisy_wav_size = path.getsize(noisy_input_file) 157 | except OSError: 158 | continue 159 | print(str(idx)) 160 | 161 | files.append((noisy_input_file, noisy_wav_size, label_file)) 
162 | 163 | return pandas.DataFrame(data=files, columns=["wav_filename", "wav_filesize", "bvh_filename"]) 164 | 165 | 166 | if __name__ == "__main__": 167 | _split_and_format_data(sys.argv[1]) 168 | 169 | -------------------------------------------------------------------------------- /data_processing/silence.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/genea-workshop/Speech_driven_gesture_generation_with_autoencoder/59e8ebdb0f6e87b0e81268046c99a4d6c9bf62a8/data_processing/silence.wav -------------------------------------------------------------------------------- /data_processing/tools.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script contains supporting function for the data processing. 3 | It is used in several other scripts: 4 | for calculation of speech features, aligning sequences and generating bvh files 5 | """ 6 | 7 | import ctypes 8 | 9 | import librosa 10 | import librosa.display 11 | import matplotlib.pyplot as plt 12 | import numpy as np 13 | import pandas as pd 14 | # Acoustic signal processing 15 | import scipy.io.wavfile as wav 16 | from pydub import AudioSegment 17 | from python_speech_features import mfcc 18 | import scipy 19 | 20 | from alt_prosody import compute_prosody 21 | 22 | MFCC_INPUTS=26 # How many features we will store for each MFCC vector 23 | WINDOW_LENGTH = 0.1 24 | 25 | 26 | def create_bvh(filename, prediction, frame_time): 27 | """ 28 | Create BVH File 29 | Args: 30 | filename: file, in which motion in bvh format should be written 31 | prediction: motion sequences, to be written into file 32 | frame_time: frame rate of the motion 33 | Returns: 34 | nothing, writes motion to the file 35 | """ 36 | with open('hformat.txt', 'r') as ftemp: 37 | hformat = ftemp.readlines() 38 | 39 | with open(filename, 'w') as fo: 40 | prediction = np.squeeze(prediction) 41 | print("output vector shape: " + str(prediction.shape)) 42 | offset = [0, 60, 0] 43 | offset_line = "\tOFFSET " + " ".join("{:.6f}".format(x) for x in offset) + '\n' 44 | fo.write("HIERARCHY\n") 45 | fo.write("ROOT Hips\n") 46 | fo.write("{\n") 47 | fo.write(offset_line) 48 | fo.writelines(hformat) 49 | fo.write("MOTION\n") 50 | fo.write("Frames: " + str(len(prediction)) + '\n') 51 | fo.write("Frame Time: " + frame_time + "\n") 52 | for row in prediction: 53 | row[0:3] = 0 54 | legs = np.zeros(24) 55 | row = np.concatenate((row, legs)) 56 | label_line = " ".join("{:.6f}".format(x) for x in row) + " " 57 | fo.write(label_line + '\n') 58 | print("bvh generated") 59 | 60 | def shorten(arr1, arr2): 61 | min_len = min(len(arr1), len(arr2)) 62 | 63 | arr1 = arr1[:min_len] 64 | arr2 = arr2[:min_len] 65 | 66 | return arr1, arr2 67 | 68 | def shorten3(arr1, arr2, arr3): 69 | min_len = min(len(arr1), len(arr2), len(arr3)) 70 | 71 | arr1 = arr1[:min_len] 72 | arr2 = arr2[:min_len] 73 | arr3 = arr3[:min_len] 74 | 75 | return arr1, arr2, arr3 76 | 77 | 78 | def average(arr, n): 79 | """ Replace every "n" values by their average 80 | Args: 81 | arr: input array 82 | n: number of elements to average on 83 | Returns: 84 | resulting array 85 | """ 86 | end = n * int(len(arr)/n) 87 | return np.mean(arr[:end].reshape(-1, n), 1) 88 | 89 | 90 | def calculate_mfcc(audio_filename): 91 | """ 92 | Calculate MFCC features for the audio in a given file 93 | Args: 94 | audio_filename: file name of the audio 95 | 96 | Returns: 97 | feature_vectors: MFCC feature vector for the given audio file 98 | """ 99 
| fs, audio = wav.read(audio_filename) 100 | 101 | # Make stereo audio being mono 102 | if len(audio.shape) == 2: 103 | audio = (audio[:, 0] + audio[:, 1]) / 2 104 | 105 | # Calculate MFCC feature with the window frame it was designed for 106 | input_vectors = mfcc(audio, winlen=0.02, winstep=0.01, samplerate=fs, numcep=MFCC_INPUTS) 107 | 108 | input_vectors = [average(input_vectors[:, i], 5) for i in range(MFCC_INPUTS)] 109 | 110 | feature_vectors = np.transpose(input_vectors) 111 | 112 | return feature_vectors 113 | 114 | def get_energy_level(sound, win_len): 115 | """ Calculate energy signal of an audio object 116 | Args: 117 | sound: AudioSegment object with the audio signal 118 | win_len: length of the window for the energy calculations 119 | Returns: 120 | energy: the energy of the signal 121 | """ 122 | 123 | loudness = list([]) 124 | 125 | length = len(sound) - win_len 126 | 127 | # Split signal into short chunks and get energy of each of them 128 | for i in range(0, length, win_len): 129 | current_segment = sound[i:i + win_len] 130 | loudness.append(current_segment.rms) 131 | 132 | # Append the last segment, which was not considered 133 | loudness.append(0) 134 | 135 | energy = np.array(loudness) 136 | 137 | return energy 138 | 139 | 140 | def derivative(x, f): 141 | """ Calculate numerical derivative (by FDM) of a 1d array 142 | Args: 143 | x: input space x 144 | f: Function of x 145 | Returns: 146 | der: numerical derivative of f wrt x 147 | """ 148 | 149 | x = 1000 * x # from seconds to milliseconds 150 | 151 | # Normalization: 152 | dx = (x[1] - x[0]) 153 | 154 | cf = np.convolve(f, [1, -1]) / dx 155 | 156 | # Remove unstable values 157 | der = cf[:-1].copy() 158 | der[0] = 0 159 | 160 | return der 161 | 162 | 163 | def calculate_pitch(audio_filename): 164 | """ Calculate F0 contour of a given speech file 165 | Args: 166 | audio_filename: address of a speech file 167 | Returns: 168 | F0 contour in a log scale and flag indicating weather F0 existed 169 | """ 170 | 171 | fs, audio = wav.read(audio_filename) 172 | 173 | # Make stereo audio being mono 174 | if len(audio.shape) == 2: 175 | audio =( (audio[:, 0] + audio[:, 1]) / 2 ).astype(ctypes.c_int16) 176 | 177 | plot = False 178 | 179 | WINDOW_LENGTH = 5 180 | pm_times, pm, f0_times, f0, corr = pyreaper.reaper(audio, fs=fs, minf0=80, maxf0=250) 181 | 182 | # Remove unstable values 183 | f0 = f0[1:-1].copy() 184 | 185 | # Get an indication if F0 exists 186 | f0[f0 == -1] = np.nan 187 | F0_exists = 1 - np.isnan(f0).astype(int) 188 | 189 | # Interpolate pitch values 190 | ts = pd.Series(f0, index=range(f0.shape[0])) 191 | ts = ts.interpolate(method='linear', downcast='infer')\ 192 | 193 | f0 = ts.values 194 | 195 | nans = np.isnan(f0).tolist() 196 | 197 | # Extrapolate at the beginning 198 | if False in nans: 199 | first_value = nans.index(False) 200 | first_nans = nans[0:first_value] 201 | for time in range(len(first_nans)): 202 | f0[time] = f0[first_value] 203 | 204 | # Extrapolate at the end 205 | if True in nans[first_value:]: 206 | last_value = nans[first_value:].index(True) 207 | last_nans = nans[last_value:] 208 | for time in range(len(last_nans)): 209 | f0[-time] = f0[last_value] 210 | 211 | if plot: 212 | 213 | plt.plot(f0, linewidth=3, label="F0") 214 | plt.title("F0 results") 215 | plt.show() 216 | 217 | # Convert to the log scale 218 | F0_contour = np.log2(f0+1) 219 | return F0_contour, F0_exists 220 | 221 | 222 | def extract_prosodic_features(audio_filename): 223 | """ 224 | Extract all 5 prosodic features 225 | Args: 
226 | audio_filename: file name for the audio to be used 227 | Returns: 228 | pros_feature: energy, energy_der, pitch, pitch_der, pitch_ind 229 | """ 230 | 231 | WINDOW_LENGTH = 5 232 | 233 | # Read audio from file 234 | sound = AudioSegment.from_file(audio_filename, format="wav") 235 | 236 | # Alternative prosodic features 237 | pitch, energy = compute_prosody(audio_filename, WINDOW_LENGTH / 1000) 238 | 239 | duration = len(sound) / 1000 240 | t = np.arange(0, duration, WINDOW_LENGTH / 1000) 241 | 242 | energy_der = derivative(t, energy) 243 | pitch_der = derivative(t, pitch) 244 | 245 | # Average everything in order to match the frequency 246 | energy = average(energy, 10) 247 | energy_der = average(energy_der, 10) 248 | pitch = average(pitch, 10) 249 | pitch_der = average(pitch_der, 10) 250 | 251 | # Cut them to the same size 252 | min_size = min(len(energy), len(energy_der), len(pitch_der), len(pitch_der)) 253 | energy = energy[:min_size] 254 | energy_der = energy_der[:min_size] 255 | pitch = pitch[:min_size] 256 | pitch_der = pitch_der[:min_size] 257 | 258 | # Stack them all together 259 | pros_feature = np.stack((energy, energy_der, pitch, pitch_der))#, pitch_ind)) 260 | 261 | # And reshape 262 | pros_feature = np.transpose(pros_feature) 263 | 264 | return pros_feature 265 | 266 | 267 | def calculate_spectrogram(audio_filename): 268 | """ Calculate spectrogram for the audio file 269 | Args: 270 | audio_filename: audio file name 271 | Returns: 272 | log spectrogram values 273 | """ 274 | 275 | DIM = int(64) 276 | 277 | audio, sample_rate = librosa.load(audio_filename) 278 | 279 | # Make stereo audio being mono 280 | if len(audio.shape) == 2: 281 | audio = (audio[:, 0] + audio[:, 1]) / 2 282 | 283 | spectr = librosa.feature.melspectrogram(audio, sr=sample_rate, #window = scipy.signal.hanning, 284 | hop_length = int(WINDOW_LENGTH* sample_rate / 2), 285 | fmax=7500, fmin=100, n_mels=DIM) 286 | 287 | # Shift into the log scale 288 | eps = 1e-10 289 | log_spectr = np.log(abs(spectr)+eps) 290 | 291 | return np.transpose(log_spectr) 292 | -------------------------------------------------------------------------------- /data_processing/tools.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/genea-workshop/Speech_driven_gesture_generation_with_autoencoder/59e8ebdb0f6e87b0e81268046c99a4d6c9bf62a8/data_processing/tools.pyc -------------------------------------------------------------------------------- /data_processing/white_noise.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/genea-workshop/Speech_driven_gesture_generation_with_autoencoder/59e8ebdb0f6e87b0e81268046c99a4d6c9bf62a8/data_processing/white_noise.wav -------------------------------------------------------------------------------- /evaluation/README.md: -------------------------------------------------------------------------------- 1 | # How to use the evaluation script 2 | 3 | This directory provides the scripts for quantitative evaluation of our gesture generation framework. We support the following measures: 4 | - Average Position Error (APE) 5 | - Mean Absolute Error (MAE) 6 | - Average Jerk (AJ) 7 | - Average Acceleration (AA) 8 | - Histogram of Moving Distance (HMD, for velocity/acceleration) 9 | 10 | ## Data preparation 11 | 1. Use `../helpers/remove_velocity.py` to delete velocities from predicted data. 12 | 2. Use `../helpers/convert_original.py` to create original data. 
13 | 14 | This produces gesture files containing `(x, y, z) x 64 joints = 192` whitespace-separated values on each line. 15 | 16 | 3. (optional) Use `../helpers/apply_filters.py` to smooth the predicted data. 17 | 18 | ## Directory organization 19 | 20 | We assume original/predicted gesture data are stored as follows: 21 | 22 | ``` 23 | -- evaluation/ 24 | |-- calc_distance.py 25 | |-- calc_errors.py 26 | |-- calc_jerk.py 27 | |-- joints.txt 28 | |-- data/ 29 | |-- original/ 30 | |-- gesture1093.txt, gesture1095.txt, ... 31 | |-- predicted/ 32 | |-- your_prediction_dir/ 33 | |-- gesture1093.txt, gesture1095.txt, ... 34 | ``` 35 | 36 | **Important Note: You have to store the gesture files of the same indices in the `original` and `predicted` directories. 37 | If you have gestures 1093, 1095, ... in the `original` directory, but gestures 1094, 1096, ... in `predicted`, you will get wrong results.** 38 | 39 | ## Run 40 | 41 | `calc_errors.py`, `calc_jerk.py`, and `calc_distance.py` support different quantitative measures, described below. 42 | 43 | The `--gesture` or `-g` option specifies the predicted directory under `data/predicted`. If you store the predicted gesture files in `data/predicted/your_prediction_dir/`, use `-g your_prediction_dir`. 44 | 45 | ### APE/MAE 46 | 47 | Average Position Error (APE) and Mean Absolute Error (MAE) indicate the prediction errors against the original gestures. 48 | 49 | To calculate APE/MAE, you can use `calc_errors.py`. 50 | You can select the metric to compute with the `--metric` or `-m` option (default: ape). 51 | 52 | ```sh 53 | # Compute APE 54 | python calc_errors.py -g your_prediction_dir -m ape 55 | 56 | # Compute MAE 57 | python calc_errors.py -g your_prediction_dir -m mae 58 | ``` 59 | 60 | ### AJ/AA 61 | 62 | Average Jerk (AJ) and Average Acceleration (AA) represent the characteristics of gesture motion. 63 | 64 | To calculate AJ/AA, you can use `calc_jerk.py`. 65 | You can select the measure to compute with the `--measure` or `-m` option (default: jerk). 66 | 67 | ```sh 68 | # Compute AJ 69 | python calc_jerk.py -g your_prediction_dir -m jerk 70 | 71 | # Compute AA 72 | python calc_jerk.py -g your_prediction_dir -m acceleration 73 | ``` 74 | 75 | Note: `calc_jerk.py` computes AJ/AA for both original and predicted gestures. The AJ/AA of the original gestures will be stored in `result/original` by default. The AJ/AA of the predicted gestures will be stored in `result/your_prediction_dir`. 76 | 77 | ### HMD 78 | 79 | Histogram of Moving Distance (HMD) shows the velocity/acceleration distribution of gesture motion. 80 | 81 | To calculate HMD, you can use `calc_distance.py`. 82 | You can select the measure to compute with the `--measure` or `-m` option (default: velocity). 83 | In addition, this script supports histogram visualization. To enable visualization, use the `--visualize` or `-v` option. 84 | 85 | ```sh 86 | # Compute velocity histogram 87 | python calc_distance.py -g your_prediction_dir -m velocity -w 0.05 # You can change the bin width of the histogram 88 | 89 | # Compute acceleration histogram 90 | python calc_distance.py -g your_prediction_dir -m acceleration -w 0.05 91 | ``` 92 | 93 | Note: `calc_distance.py` computes HMD for both original and predicted gestures. The HMD of the original gestures will be stored in `result/original` by default. 94 | 95 | ### Calculate evaluation measures for specific joints 96 | You can use the `-s` option with all evaluation scripts to select specific joints, e.g. 
`-s Head LeftLeg RightLeg` 97 | Here is a table for the joint names: 98 | 99 | | Joint to Calculate | Corresponding Name | 100 | | --- | --- | 101 | | Head | Head | 102 | | Neck | Neck | 103 | | Left Shoulder | LeftArm | 104 | | Left Elobow | LeftForeArm | 105 | | Left Wrist | LeftHand | 106 | | Right Shoulder | RightArm | 107 | | Right Elobow | RightForeArm | 108 | | Right Wrist | RightHand | 109 | | Left Hip | LeftUpLeg | 110 | | Left Knee | LeftLeg | 111 | | Left Ankle | LeftFoot | 112 | | Right Hip | RightUpLeg | 113 | | Right Knee | RightLeg | 114 | | Right Ankle | RightFoot | 115 | 116 | When you calculate the velocity histogram for both elbows, use 117 | ```sh 118 | python calc_distance.py -g your_prediction_dir -m velocity -w 0.05 -s LeftForeArm RightForeArm 119 | ``` 120 | -------------------------------------------------------------------------------- /evaluation/calc_distance.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Calculating statistics over the produced and ground truth gestures 4 | 5 | @author: kaneko.naoshi 6 | """ 7 | 8 | import argparse 9 | import glob 10 | import os 11 | import warnings 12 | 13 | import matplotlib.pyplot as plt 14 | import numpy as np 15 | 16 | 17 | def read_joint_names(filename): 18 | """Read motion capture's body joint names from file 19 | 20 | Args: 21 | filename: file name to read 22 | 23 | Returns: 24 | joint_names: list of joint names 25 | """ 26 | 27 | with open(filename, 'r') as f: 28 | org = f.read() 29 | joint_names = org.split(',') 30 | 31 | return joint_names 32 | 33 | 34 | def compute_velocity(data, dim=3): 35 | """Compute velocity between adjacent frames 36 | 37 | Args: 38 | data: array containing joint positions of gesture 39 | dim: gesture dimensionality 40 | 41 | Returns: 42 | vel_norms: velocities of each joint between each adjacent frame 43 | """ 44 | 45 | # First derivative of position is velocity 46 | vels = np.diff(data, n=1, axis=0) 47 | 48 | num_vels = vels.shape[0] 49 | num_joints = vels.shape[1] // dim 50 | 51 | vel_norms = np.zeros((num_vels, num_joints)) 52 | 53 | for i in range(num_vels): 54 | for j in range(num_joints): 55 | x1 = j * dim + 0 56 | x2 = j * dim + dim 57 | vel_norms[i, j] = np.linalg.norm(vels[i, x1:x2]) 58 | 59 | return vel_norms 60 | 61 | 62 | def compute_acceleration(data, dim=3): 63 | """Compute acceleration between adjacent frames 64 | 65 | Args: 66 | data: array containing joint positions of gesture 67 | dim: gesture dimensionality 68 | 69 | Returns: 70 | acc_norms: accelerations of each joint between each adjacent frame 71 | """ 72 | 73 | # Second derivative of position is acceleration 74 | accs = np.diff(data, n=2, axis=0) 75 | 76 | num_accs = accs.shape[0] 77 | num_joints = accs.shape[1] // dim 78 | 79 | acc_norms = np.zeros((num_accs, num_joints)) 80 | 81 | for i in range(num_accs): 82 | for j in range(num_joints): 83 | x1 = j * dim + 0 84 | x2 = j * dim + dim 85 | acc_norms[i, j] = np.linalg.norm(accs[i, x1:x2]) 86 | 87 | return acc_norms 88 | 89 | 90 | def save_result(lines, out_dir, width, measure): 91 | """Write computed histogram to CSV 92 | 93 | Args: 94 | lines: list of strings to be written 95 | out_dir: output directory 96 | width: bin width of the histogram 97 | measure: used measure for histogram calculation 98 | """ 99 | 100 | # Make output directory 101 | if not os.path.exists(out_dir): 102 | os.makedirs(out_dir) 103 | 104 | hist_type = measure[:3] # 'vel' or 'acc' 105 | filename = 
'hmd_{}_{}.csv'.format(hist_type, width) 106 | outname = os.path.join(out_dir, filename) 107 | 108 | with open(outname, 'w') as out_file: 109 | out_file.writelines(lines) 110 | 111 | print('More detailed result was writen to the file: ' + outname) 112 | print('') 113 | 114 | 115 | def main(): 116 | measures = { 117 | 'velocity': compute_velocity, 118 | 'acceleration': compute_acceleration, 119 | } 120 | 121 | parser = argparse.ArgumentParser( 122 | description='Calculate histograms of moving distances') 123 | parser.add_argument('--original', '-o', default='data/original', 124 | help='Original gesture directory') 125 | parser.add_argument('--predicted', '-p', default='data/predicted', 126 | help='Predicted gesture directory') 127 | parser.add_argument('--joints', '-j', default='joints.txt', 128 | help='Joint name file') 129 | parser.add_argument('--gesture', '-g', required=True, 130 | help='Directory storing predicted txt files') 131 | parser.add_argument('--width', '-w', type=float, default=0.05, 132 | help='Bin width of the histogram') 133 | parser.add_argument('--measure', '-m', default='velocity', 134 | help='Measure to calculate (velocity or acceleration)') 135 | parser.add_argument('--select', '-s', nargs='+', 136 | help='Joint subset to compute (if omitted, use all)') 137 | parser.add_argument('--visualize', '-v', action='store_true', 138 | help='Visualize histograms') 139 | parser.add_argument('--out', default='result', 140 | help='Directory to output the result') 141 | args = parser.parse_args() 142 | 143 | predicted_dir = os.path.join(args.predicted, args.gesture) 144 | 145 | original_files = sorted(glob.glob(os.path.join(args.original, '*.txt'))) 146 | 147 | predicted_files = sorted(glob.glob(os.path.join(predicted_dir, '*.txt'))) 148 | 149 | # Check number of files 150 | if len(original_files) != len(predicted_files): 151 | warnings.warn('Inconsistent number of files : {} vs {}' 152 | ''.format(len(original_files), len(predicted_files)), 153 | RuntimeWarning) 154 | 155 | # Check if error measure was correct 156 | if args.measure not in measures: 157 | raise ValueError('Unknown measure: \'{}\'. Choose from {}' 158 | ''.format(args.measure, list(measures.keys()))) 159 | 160 | joint_names = read_joint_names(args.joints) 161 | 162 | if args.select is not None: 163 | selected_joints = [] 164 | for s in args.select: 165 | try: 166 | index = joint_names.index(s) 167 | except ValueError: 168 | print('Ignore invalid joint: {}'.format(s)) 169 | else: 170 | selected_joints.append(index) 171 | selected_joints.sort() 172 | 173 | if len(selected_joints) == 0: 174 | selected_joints = range(len(joint_names)) 175 | print('No valid joints are selected. 
Use all joints') 176 | else: 177 | # Use all joints 178 | selected_joints = range(len(joint_names)) 179 | 180 | joint_names = [joint_names[s] for s in selected_joints] 181 | original_out_lines = [','.join([''] + joint_names + ['Total']) + '\n'] 182 | predicted_out_lines = [','.join([''] + joint_names + ['Total']) + '\n'] 183 | 184 | original_distances = [] 185 | predicted_distances = [] 186 | for original_file, predicted_file in zip(original_files, predicted_files): 187 | original = np.loadtxt(original_file) 188 | predicted = np.loadtxt(predicted_file) 189 | 190 | original_distance = measures[args.measure]( 191 | original)[:, selected_joints] 192 | predicted_distance = measures[args.measure]( 193 | predicted)[:, selected_joints] 194 | 195 | original_distances.append(original_distance) 196 | predicted_distances.append(predicted_distance) 197 | 198 | original_distances = np.concatenate(original_distances) 199 | predicted_distances = np.concatenate(predicted_distances) 200 | 201 | # Compute histogram for each joint 202 | bins = np.arange(0, 1+args.width, args.width) 203 | num_joints = original_distances.shape[1] 204 | original_hists = [] 205 | predicted_hists = [] 206 | for i in range(num_joints): 207 | original_hist, _ = np.histogram(original_distances[:, i], bins=bins) 208 | predicted_hist, _ = np.histogram(predicted_distances[:, i], bins=bins) 209 | 210 | original_hists.append(original_hist) 211 | predicted_hists.append(predicted_hist) 212 | 213 | # Sum over all joints 214 | original_total = np.sum(original_hists, axis=0) 215 | predicted_total = np.sum(predicted_hists, axis=0) 216 | 217 | # Append total number of bin counts to the last 218 | original_hists = np.stack(original_hists + [original_total], axis=1) 219 | predicted_hists = np.stack(predicted_hists + [predicted_total], axis=1) 220 | 221 | num_bins = bins.size - 1 222 | for i in range(num_bins): 223 | original_line = str(bins[i]) 224 | predicted_line = str(bins[i]) 225 | for j in range(num_joints + 1): 226 | original_line += ',' + str(original_hists[i, j]) 227 | predicted_line += ',' + str(predicted_hists[i, j]) 228 | original_line += '\n' 229 | predicted_line += '\n' 230 | 231 | original_out_lines.append(original_line) 232 | predicted_out_lines.append(predicted_line) 233 | 234 | original_out_dir = os.path.join(args.out, 'original') 235 | predicted_out_dir = os.path.join(args.out, args.gesture) 236 | 237 | if args.visualize: 238 | plt.plot(bins[:-1], original_total, label='Original') 239 | plt.plot(bins[:-1], predicted_total, label=args.gesture) 240 | plt.legend() 241 | plt.xlabel('Velocity (cm/s)') 242 | plt.ylabel('Bin counts') 243 | plt.title('Histograms of Moving Distance ({})'.format(args.measure)) 244 | plt.tight_layout() 245 | plt.show() 246 | 247 | save_result(original_out_lines, original_out_dir, 248 | args.width, args.measure) 249 | save_result(predicted_out_lines, predicted_out_dir, 250 | args.width, args.measure) 251 | 252 | print('HMD ({}):'.format(args.measure)) 253 | print('bins: {}'.format(bins)) 254 | print('original: {}'.format(original_total)) 255 | print('predicted: {}'.format(predicted_total)) 256 | 257 | 258 | if __name__ == '__main__': 259 | main() 260 | -------------------------------------------------------------------------------- /evaluation/calc_errors.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Calculating average point error 4 | 5 | @author: kaneko.naoshi 6 | """ 7 | 8 | import argparse 9 | import glob 10 | import os 11 | 
12 | import numpy as np 13 | from sklearn.metrics import mean_absolute_error 14 | 15 | 16 | def read_joint_names(filename): 17 | """Read motion capture's body joint names from file 18 | 19 | Args: 20 | filename: file name to read 21 | 22 | Returns: 23 | joint_names: list of joint names 24 | """ 25 | 26 | with open(filename, 'r') as f: 27 | org = f.read() 28 | joint_names = org.split(',') 29 | 30 | return joint_names 31 | 32 | 33 | def remove_velocity(data, dim=3): 34 | """Remove velocity values from raw prediction data 35 | 36 | Args: 37 | data: array containing both position and velocity values 38 | dim: gesture dimensionality 39 | 40 | Returns: 41 | np.ndarray: array containing only position values 42 | """ 43 | 44 | starts = np.arange(0, data.shape[1], dim * 2) 45 | stops = np.arange(dim, data.shape[1], dim * 2) 46 | return np.hstack([data[:, i:j] for i, j in zip(starts, stops)]) 47 | 48 | 49 | def MAE(original, predicted, dim=3): 50 | """Compute Mean Absolute Error (MAE) 51 | 52 | Args: 53 | original: array containing joint positions of original gesture 54 | predicted: array containing joint positions of predicted gesture 55 | dim: gesture dimensionality 56 | 57 | Returns: 58 | mae: MAE between original and predicted for each joint 59 | """ 60 | 61 | num_frames = predicted.shape[0] 62 | 63 | diffs = mean_absolute_error(original[:num_frames], predicted, 64 | multioutput='raw_values') 65 | 66 | num_joints = predicted.shape[1] // dim 67 | mae = np.empty(num_joints) 68 | 69 | for i in range(num_joints): 70 | x1 = i * dim + 0 71 | x2 = i * dim + dim 72 | mae[i] = np.mean(diffs[x1:x2]) 73 | 74 | return mae 75 | 76 | 77 | def APE(original, predicted, dim=3): 78 | """Compute Average Position Error (APE) 79 | 80 | Args: 81 | original: array containing joint positions of original gesture 82 | predicted: array containing joint positions of predicted gesture 83 | dim: gesture dimensionality 84 | 85 | Returns: 86 | np.ndarray: APE between original and predicted for each joint 87 | """ 88 | 89 | num_frames = predicted.shape[0] 90 | num_joints = predicted.shape[1] // dim 91 | 92 | diffs = np.zeros((num_frames, num_joints)) 93 | 94 | for i in range(num_frames): 95 | for j in range(num_joints): 96 | x1 = j * dim + 0 97 | x2 = j * dim + dim 98 | diffs[i, j] = np.linalg.norm( 99 | original[i, x1:x2] - predicted[i, x1:x2]) 100 | 101 | return np.mean(diffs, axis=0) 102 | 103 | 104 | def main(): 105 | metrics = { 106 | 'mae': MAE, 107 | 'ape': APE, 108 | } 109 | 110 | parser = argparse.ArgumentParser( 111 | description='Calculate prediction errors') 112 | parser.add_argument('--original', '-o', default='data/original', 113 | help='Original gesture directory') 114 | parser.add_argument('--predicted', '-p', default='data/predicted', 115 | help='Predicted gesture directory') 116 | parser.add_argument('--joints', '-j', default='joints.txt', 117 | help='Joint name file') 118 | parser.add_argument('--gesture', '-g', required=True, 119 | help='Directory storing predicted txt files') 120 | parser.add_argument('--metric', '-m', default='ape', 121 | help='Error metric (ape or mae)') 122 | parser.add_argument('--select', '-s', nargs='+', 123 | help='Joint subset to compute (if omitted, use all)') 124 | parser.add_argument('--out', default='result', 125 | help='Directory to output the result') 126 | args = parser.parse_args() 127 | 128 | predicted_dir = os.path.join(args.predicted, args.gesture) 129 | 130 | original_files = sorted(glob.glob(os.path.join(args.original, '*.txt'))) 131 | predicted_files = 
sorted(glob.glob(os.path.join(predicted_dir, '*.txt'))) 132 | 133 | # Check number of files 134 | if len(original_files) != len(predicted_files): 135 | raise ValueError('Inconsistent number of files : {} vs {}' 136 | ''.format(len(original_files), len(predicted_files))) 137 | 138 | # Check if error metric was correct 139 | if args.metric not in metrics: 140 | raise ValueError('Unknown metric: \'{}\'. Choose from {}' 141 | ''.format(args.metric, list(metrics.keys()))) 142 | 143 | joint_names = read_joint_names(args.joints) 144 | 145 | if args.select is not None: 146 | selected_joints = [] 147 | for s in args.select: 148 | try: 149 | index = joint_names.index(s) 150 | except ValueError: 151 | print('Ignore invalid joint: {}'.format(s)) 152 | else: 153 | selected_joints.append(index) 154 | selected_joints.sort() 155 | 156 | if len(selected_joints) == 0: 157 | selected_joints = range(len(joint_names)) 158 | print('No valid joints are selected. Use all joints') 159 | else: 160 | # Use all joints 161 | selected_joints = range(len(joint_names)) 162 | 163 | joint_names = [joint_names[s] for s in selected_joints] 164 | out_lines = [','.join(['file'] + joint_names) + '\n'] 165 | 166 | errors = [] 167 | for original_file, predicted_file in zip(original_files, predicted_files): 168 | original = np.loadtxt(original_file) 169 | predicted = np.loadtxt(predicted_file) 170 | 171 | if original.shape[0] != predicted.shape[0]: 172 | # Cut them to the same length 173 | length = min(original.shape[0], predicted.shape[0]) 174 | original = original[:length] 175 | predicted = predicted[:length] 176 | 177 | if predicted.shape[1] == 192 * 2: 178 | print(predicted.shape) 179 | print("Removing the velocity") 180 | # Remove the velocity 181 | predicted = remove_velocity(predicted) 182 | 183 | error = metrics[args.metric](original, predicted)[selected_joints] 184 | errors.append(error) 185 | 186 | basename = os.path.basename(predicted_file) 187 | line = basename 188 | for e in error: 189 | line += ',' + str(e) 190 | line += '\n' 191 | 192 | out_lines.append(line) 193 | 194 | average_line = 'Average' 195 | avgs = np.mean(errors, axis=0) 196 | for a in avgs: 197 | average_line += ',' + str(a) 198 | 199 | out_lines.append(average_line) 200 | 201 | out_dir = os.path.join(args.out, args.gesture) 202 | 203 | # Make output directory 204 | if not os.path.exists(out_dir): 205 | os.makedirs(out_dir) 206 | 207 | outname = os.path.join(out_dir, '{}.csv'.format(args.metric)) 208 | with open(outname, 'w') as out_file: 209 | out_file.writelines(out_lines) 210 | 211 | print('More detailed result was writen to the file: ' + outname) 212 | print('') 213 | 214 | print('{}: {:.2f}'.format(args.metric.upper(), np.mean(errors))) 215 | 216 | 217 | if __name__ == '__main__': 218 | main() 219 | -------------------------------------------------------------------------------- /evaluation/calc_jerk.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Calculating average jerk over the produced and ground truth gestures 4 | 5 | @author: kaneko.naoshi 6 | """ 7 | 8 | import argparse 9 | import glob 10 | import os 11 | import warnings 12 | 13 | import numpy as np 14 | 15 | 16 | def read_joint_names(filename): 17 | """Read motion capture's body joint names from file 18 | 19 | Args: 20 | filename: file name to read 21 | 22 | Returns: 23 | joint_names: list of joint names 24 | """ 25 | 26 | with open(filename, 'r') as f: 27 | org = f.read() 28 | joint_names = org.split(',') 29 | 30 | 
return joint_names 31 | 32 | 33 | def compute_jerks(data, dim=3): 34 | """Compute jerk between adjacent frames 35 | 36 | Args: 37 | data: array containing joint positions of gesture 38 | dim: gesture dimensionality 39 | 40 | Returns: 41 | np.ndarray: jerks of each joint averaged over all frames 42 | """ 43 | 44 | # Third derivative of position is jerk 45 | jerks = np.diff(data, n=3, axis=0) 46 | 47 | num_jerks = jerks.shape[0] 48 | num_joints = jerks.shape[1] // dim 49 | 50 | jerk_norms = np.zeros((num_jerks, num_joints)) 51 | 52 | for i in range(num_jerks): 53 | for j in range(num_joints): 54 | x1 = j * dim + 0 55 | x2 = j * dim + dim 56 | jerk_norms[i, j] = np.linalg.norm(jerks[i, x1:x2]) 57 | 58 | return np.mean(jerk_norms, axis=0) 59 | 60 | 61 | def compute_acceleration(data, dim=3): 62 | """Compute acceleration between adjacent frames 63 | 64 | Args: 65 | data: array containing joint positions of gesture 66 | dim: gesture dimensionality 67 | 68 | Returns: 69 | np.ndarray: accelerations of each joint averaged over all frames 70 | """ 71 | 72 | # Second derivative of position is acceleration 73 | accs = np.diff(data, n=2, axis=0) 74 | 75 | num_accs = accs.shape[0] 76 | num_joints = accs.shape[1] // dim 77 | 78 | acc_norms = np.zeros((num_accs, num_joints)) 79 | 80 | for i in range(num_accs): 81 | for j in range(num_joints): 82 | x1 = j * dim + 0 83 | x2 = j * dim + dim 84 | acc_norms[i, j] = np.linalg.norm(accs[i, x1:x2]) 85 | 86 | return np.mean(acc_norms, axis=0) 87 | 88 | 89 | def save_result(lines, out_dir, measure): 90 | """Write computed measure to CSV 91 | 92 | Args: 93 | lines: list of strings to be written 94 | out_dir: output directory 95 | measure: used measure 96 | """ 97 | 98 | # Make output directory 99 | if not os.path.exists(out_dir): 100 | os.makedirs(out_dir) 101 | 102 | if measure == "jerk": 103 | outname = os.path.join(out_dir, 'aj.csv') 104 | elif measure == "acceleration": 105 | outname = os.path.join(out_dir, 'aa.csv') 106 | 107 | with open(outname, 'w') as out_file: 108 | out_file.writelines(lines) 109 | 110 | print('More detailed result was writen to the file: ' + outname) 111 | print('') 112 | 113 | 114 | def main(): 115 | measures = { 116 | 'jerk': compute_jerks, 117 | 'acceleration': compute_acceleration, 118 | } 119 | 120 | parser = argparse.ArgumentParser( 121 | description='Calculate prediction errors') 122 | parser.add_argument('--original', '-o', default='data/original', 123 | help='Original gesture directory') 124 | parser.add_argument('--predicted', '-p', default='data/predicted', 125 | help='Predicted gesture directory') 126 | parser.add_argument('--joints', '-j', default='joints.txt', 127 | help='Joint name file') 128 | parser.add_argument('--gesture', '-g', required=True, 129 | help='Directory storing predicted txt files') 130 | parser.add_argument('--measure', '-m', default='jerk', 131 | help='Measure to calculate (jerk or acceleration)') 132 | parser.add_argument('--select', '-s', nargs='+', 133 | help='Joint subset to compute (if omitted, use all)') 134 | parser.add_argument('--out', default='result', 135 | help='Directory to output the result') 136 | args = parser.parse_args() 137 | 138 | predicted_dir = os.path.join(args.predicted, args.gesture) 139 | 140 | original_files = sorted(glob.glob(os.path.join(args.original, '*.txt'))) 141 | predicted_files = sorted(glob.glob(os.path.join(predicted_dir, '*.txt'))) 142 | 143 | # Check number of files 144 | if len(original_files) != len(predicted_files): 145 | warnings.warn('Inconsistent number of files 
: {} vs {}' 146 | ''.format(len(original_files), len(predicted_files)), 147 | RuntimeWarning) 148 | 149 | # Check if error measure was correct 150 | if args.measure not in measures: 151 | raise ValueError('Unknown measure: \'{}\'. Choose from {}' 152 | ''.format(args.measure, list(measures.keys()))) 153 | 154 | joint_names = read_joint_names(args.joints) 155 | 156 | if args.select is not None: 157 | selected_joints = [] 158 | for s in args.select: 159 | try: 160 | index = joint_names.index(s) 161 | except ValueError: 162 | print('Ignore invalid joint: {}'.format(s)) 163 | else: 164 | selected_joints.append(index) 165 | selected_joints.sort() 166 | 167 | if len(selected_joints) == 0: 168 | selected_joints = range(len(joint_names)) 169 | print('No valid joints are selected. Use all joints') 170 | else: 171 | # Use all joints 172 | selected_joints = range(len(joint_names)) 173 | 174 | joint_names = [joint_names[s] for s in selected_joints] 175 | original_out_lines = [','.join(['file'] + joint_names) + '\n'] 176 | predicted_out_lines = [','.join(['file'] + joint_names) + '\n'] 177 | 178 | original_values = [] 179 | predicted_values = [] 180 | for original_file, predicted_file in zip(original_files, predicted_files): 181 | original = np.loadtxt(original_file) 182 | predicted = np.loadtxt(predicted_file) 183 | 184 | if original.shape[0] != predicted.shape[0]: 185 | # Cut them to the same length 186 | length = min(original.shape[0], predicted.shape[0]) 187 | original = original[:length] 188 | predicted = predicted[:length] 189 | 190 | original_value = measures[args.measure](original)[selected_joints] 191 | predicted_value = measures[args.measure](predicted)[selected_joints] 192 | 193 | original_values.append(original_value) 194 | predicted_values.append(predicted_value) 195 | 196 | basename = os.path.basename(original_file) 197 | original_line = basename 198 | predicted_line = basename 199 | for ov, pv in zip(original_value, predicted_value): 200 | original_line += ',' + str(ov) 201 | predicted_line += ',' + str(pv) 202 | original_line += '\n' 203 | predicted_line += '\n' 204 | 205 | original_out_lines.append(original_line) 206 | predicted_out_lines.append(predicted_line) 207 | 208 | original_average_line = 'Average' 209 | predicted_average_line = 'Average' 210 | original_avgs = np.mean(original_values, axis=0) 211 | predicted_avgs = np.mean(predicted_values, axis=0) 212 | for oa, pa in zip(original_avgs, predicted_avgs): 213 | original_average_line += ',' + str(oa) 214 | predicted_average_line += ',' + str(pa) 215 | 216 | original_out_lines.append(original_average_line) 217 | predicted_out_lines.append(predicted_average_line) 218 | 219 | original_out_dir = os.path.join(args.out, 'original') 220 | predicted_out_dir = os.path.join(args.out, args.gesture) 221 | 222 | save_result(original_out_lines, original_out_dir, args.measure) 223 | save_result(predicted_out_lines, predicted_out_dir, args.measure) 224 | 225 | if args.measure == 'jerk': 226 | print('AJ:') 227 | elif args.measure == 'acceleration': 228 | print('AA:') 229 | print('original: {:.2f}'.format(np.mean(original_values))) 230 | print('predicted: {:.2f}'.format(np.mean(predicted_values))) 231 | 232 | 233 | if __name__ == '__main__': 234 | main() 235 | -------------------------------------------------------------------------------- /evaluation/hellinger.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Mar 2 11:30:40 2020 4 | 5 | @author: kaneko.naoshi 
6 | """ 7 | 8 | import argparse 9 | import glob 10 | import os 11 | import re 12 | 13 | import matplotlib 14 | matplotlib.use('Agg') 15 | import matplotlib.pyplot as plt 16 | from matplotlib.ticker import MaxNLocator 17 | from matplotlib.patches import Rectangle 18 | import numpy as np 19 | import pandas as pd 20 | import seaborn as sns 21 | 22 | 23 | def read_joint_names(filename): 24 | """Read motion capture's body joint names from file 25 | 26 | Args: 27 | filename: file name to read 28 | 29 | Returns: 30 | joint_names: list of joint names 31 | """ 32 | 33 | with open(filename, 'r') as f: 34 | org = f.read() 35 | joint_names = org.split(',') 36 | 37 | return joint_names 38 | 39 | 40 | def normalize(hist): 41 | return hist / np.sum(hist) 42 | 43 | 44 | def hellinger(hist1, hist2): 45 | """Compute Hellinger distance between two histograms 46 | 47 | Args: 48 | hist1: first histogram 49 | hist2: second histogram of the same size as hist1 50 | 51 | Returns: 52 | float: Hellinger distance between hist1 and hist2 53 | """ 54 | 55 | return np.sqrt(1.0 - np.sum(np.sqrt(normalize(hist1) * normalize(hist2)))) 56 | 57 | 58 | # https://stackoverflow.com/questions/4836710/does-python-have-a-built-in-function-for-string-natural-sort # NOQA 59 | def natural_sort_key(s, _nsre=re.compile('([0-9]+)')): 60 | return [int(text) if text.isdigit() else text.lower() 61 | for text in _nsre.split(s)] 62 | 63 | 64 | def natural_sort(l, key=natural_sort_key): 65 | return sorted(l, key=key) 66 | 67 | 68 | def main(): 69 | parser = argparse.ArgumentParser( 70 | description='Calculate histograms of moving distances') 71 | parser.add_argument('--original', default='data/original', 72 | help='Original gesture directory') 73 | parser.add_argument('--predicted', '-p', default='data/predicted', 74 | help='Predicted gesture directory') 75 | parser.add_argument('--file', '-f', default='hmd_vel_0.05.csv', 76 | help='File name to load') 77 | parser.add_argument('--joints', '-j', default='joints.txt', 78 | help='Joint name file') 79 | parser.add_argument('--select', '-s', nargs='+', 80 | help='Joint subset to compute (if omitted, use all)') 81 | parser.add_argument('--visualize', '-v', action='store_true', 82 | help='Visualize histograms') 83 | parser.add_argument('--out', '-o', default='results', 84 | help='Directory to output the result') 85 | args = parser.parse_args() 86 | 87 | joint_names = read_joint_names(args.joints) 88 | 89 | if args.select is not None: 90 | selected_joints = [] 91 | for s in args.select: 92 | if not s in joint_names: 93 | print('Ignore invalid joint: {}'.format(s)) 94 | else: 95 | selected_joints.append(s) 96 | 97 | if not selected_joints: 98 | selected_joints = ['Total'] 99 | print('No valid joints are selected. Use all joints') 100 | else: 101 | # Use all joints 102 | selected_joints = ['Total'] 103 | 104 | def get_directories(directory): 105 | return sorted(filter(lambda x: os.path.isdir(x), glob.glob(directory))) 106 | 107 | # Read original gesture's distribution 108 | original_file = os.path.join(args.original, args.file) 109 | original = pd.read_csv(original_file, index_col=0) 110 | original_hist = np.array(original[selected_joints]).sum(axis=1) 111 | 112 | # List of predicted gesture direcotires 113 | predicted_dirs = get_directories(os.path.join(args.predicted, '*')) 114 | 115 | results = {os.path.basename(d): None for d in predicted_dirs} 116 | 117 | # Iterate over the list of direcotires 118 | for predicted_dir in predicted_dirs: 119 | # Does this directory have a target file? 
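        # If it does not, the except branch below averages the histograms found in integer-named subdirectories (e.g. result/MFCC/1, /2, /3 from repeated training runs, as read by plot_results.py); each subdirectory is expected to contain the same CSV file produced by calc_distance.py.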
120 | try: 121 | predicted_file = os.path.join(predicted_dir, args.file) 122 | predicted = pd.read_csv(predicted_file, index_col=0) 123 | except FileNotFoundError: 124 | # Are there any subdirectories which have integer names? 125 | sub_dirs = sorted( 126 | filter(lambda x: os.path.basename(x).isdecimal(), 127 | get_directories(os.path.join(predicted_dir, '*')))) 128 | 129 | # If no, raise an exception 130 | if not sub_dirs: 131 | raise FileNotFoundError( 132 | 'There is neither ' + args.file 133 | + ' nor subdirectories in ' + predicted_dir) 134 | 135 | predicted = None 136 | for sub_dir in sub_dirs: 137 | predicted_file = os.path.join(sub_dir, args.file) 138 | tmp = pd.read_csv(predicted_file, index_col=0) 139 | 140 | if predicted is None: 141 | predicted = tmp 142 | else: 143 | predicted = predicted + tmp 144 | 145 | predicted = predicted / float(len(sub_dirs)) 146 | 147 | # Get histograms 148 | predicted_hist = np.array(predicted[selected_joints]).sum(axis=1) 149 | 150 | assert len(original_hist) == len(predicted_hist) 151 | 152 | # Hellinger distance between two histograms 153 | dist = hellinger(original_hist, predicted_hist) 154 | 155 | # Store results 156 | key = os.path.basename(predicted_dir) 157 | results[key] = {'dist': dist, 'hist': predicted_hist} 158 | 159 | # Print and save results 160 | keys = natural_sort(results.keys()) 161 | 162 | result_str = ['Hellinger distances:'] 163 | for key in keys: 164 | result_str.append('\t{}: {}'.format(key, results[key]['dist'])) 165 | 166 | result_str = '\n'.join(result_str) 167 | 168 | print(result_str) 169 | print('') 170 | 171 | # Make output directory 172 | out = os.path.join(args.out, os.path.basename(args.predicted), 173 | '+'.join(selected_joints)) 174 | if not os.path.isdir(out): 175 | os.makedirs(out) 176 | 177 | with open(os.path.join(out, 'distances.txt'), 'w') as f: 178 | f.write(result_str) 179 | 180 | if args.visualize: 181 | # Set color and style 182 | mpl_default = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', 183 | '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', 184 | '#bcbd22', '#17becf'] 185 | sns.set(context='poster', palette=sns.color_palette(mpl_default), font_scale=1.05) 186 | sns.set_style('white', {'legend.frameon':True}) 187 | 188 | # Velocities are computed in 20fps: make them into cm/s 189 | index = original.index * 20 190 | bins = [format(i, '.2f') for i in list(index)] 191 | 192 | # Plot speed in a range of [0, 15] 193 | bins = bins[:-4] 194 | original_hist = original_hist[:-4] 195 | 196 | fig = plt.figure(figsize=(8, 5)) 197 | ax = fig.add_subplot(111) 198 | 199 | # Convert frequency to percentage 200 | gt_handle, = ax.plot(bins, normalize(original_hist) * 100, color='C4') 201 | 202 | # Awesome way to create a tabular-style legend 203 | # https://stackoverflow.com/questions/25830780/tabular-legend-layout-for-matplotlib 204 | # Create a blank rectangle 205 | blank = Rectangle((0, 0), 1, 1, fc="w", fill=False, edgecolor='none', linewidth=0) 206 | 207 | # Correspond to each columns of the tabular 208 | legend_handles = [blank, gt_handle] 209 | legend_names = ['Name', 'Ground Truth'] 210 | legend_dists = ['Hell. 
Dist.', '0'.center(16)] 211 | 212 | colors = ['C1', 'C3', 'C0', 'C2'] if len(keys) <= 4 else \ 213 | ['C1', 'C0', 'C6', 'C7', 'C8', 'C9', 'C5', 'C2', 'C3'] 214 | 215 | assert len(keys) <= len(colors) 216 | 217 | for color, key in zip(colors, keys): 218 | predicted_hist = results[key]['hist'][:-4] 219 | label = key.split('-')[1].replace('_smooth', '*') 220 | 221 | #if 'Aud2Pose' in label: 222 | # label += ' [18]' 223 | 224 | handle, = ax.plot(bins, normalize(predicted_hist) * 100, color=color) 225 | 226 | legend_handles.append(handle) 227 | legend_names.append(label) 228 | legend_dists.append('{:.3f}'.format(results[key]['dist']).center(12)) 229 | 230 | # Legend will have a tabular of (rows x 3) 231 | rows = len(legend_handles) 232 | empty_label = [''] 233 | 234 | legend_handles = legend_handles + [blank] * (rows * 2) 235 | legend_labels = np.concatenate([empty_label * rows, legend_names, legend_dists]) 236 | 237 | ax.legend(legend_handles, legend_labels, 238 | ncol=3, handletextpad=0.5, columnspacing=-2.15, 239 | labelspacing=0.35) 240 | ax.set_xlabel('Speed (cm/s)') 241 | ax.set_ylabel('Frequency (%)') 242 | ax.set_xticks(np.arange(16)) 243 | ax.tick_params(pad=6) 244 | ax.yaxis.set_major_locator( 245 | MaxNLocator(nbins='auto', steps=[1, 2, 2.5, 5, 10], integer=True)) 246 | 247 | plt.subplots_adjust(left=0.09, right=0.98, top=0.98, bottom=0.12) 248 | plt.savefig(os.path.join(out, 'speed_histogram.pdf')) 249 | plt.show() 250 | 251 | print('Results were writen in ' + out) 252 | print('') 253 | 254 | 255 | if __name__ == '__main__': 256 | main() 257 | -------------------------------------------------------------------------------- /evaluation/hellinger_one2one.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Sep 30 16:58:35 2020 4 | 5 | @author: kaneko.naoshi 6 | """ 7 | 8 | import argparse 9 | import glob 10 | import os 11 | import re 12 | 13 | import matplotlib 14 | matplotlib.use('Agg') 15 | import matplotlib.pyplot as plt 16 | from matplotlib.ticker import FixedLocator, MaxNLocator 17 | from matplotlib.patches import Rectangle 18 | import numpy as np 19 | import pandas as pd 20 | import seaborn as sns 21 | 22 | 23 | def read_joint_names(filename): 24 | """Read motion capture's body joint names from file 25 | 26 | Args: 27 | filename: file name to read 28 | 29 | Returns: 30 | joint_names: list of joint names 31 | """ 32 | 33 | with open(filename, 'r') as f: 34 | org = f.read() 35 | joint_names = org.split(',') 36 | 37 | return joint_names 38 | 39 | 40 | def compute_speed(data, dim=3): 41 | """Compute speed between adjacent frames 42 | 43 | Args: 44 | data: array containing joint positions of gesture 45 | dim: gesture dimensionality 46 | 47 | Returns: 48 | speeds: velocities of each joint between each adjacent frame 49 | """ 50 | 51 | # First derivative of position is velocity 52 | vels = np.diff(data, n=1, axis=0) 53 | 54 | num_vels = vels.shape[0] 55 | num_joints = vels.shape[1] // dim 56 | 57 | speeds = np.zeros((num_vels, num_joints)) 58 | 59 | for i in range(num_vels): 60 | for j in range(num_joints): 61 | x1 = j * dim + 0 62 | x2 = j * dim + dim 63 | speeds[i, j] = np.linalg.norm(vels[i, x1:x2]) 64 | 65 | return speeds 66 | 67 | 68 | def normalize(hist): 69 | return hist / np.sum(hist) 70 | 71 | 72 | def hellinger(hist1, hist2): 73 | """Compute Hellinger distance between two histograms 74 | 75 | Args: 76 | hist1: first histogram 77 | hist2: second histogram of the same size as hist1 78 | 79 | 
Returns: 80 | float: Hellinger distance between hist1 and hist2 81 | """ 82 | 83 | return np.sqrt(1.0 - np.sum(np.sqrt(normalize(hist1) * normalize(hist2)))) 84 | 85 | 86 | # https://stackoverflow.com/questions/4836710/does-python-have-a-built-in-function-for-string-natural-sort # NOQA 87 | def natural_sort_key(s, _nsre=re.compile('([0-9]+)')): 88 | return [int(text) if text.isdigit() else text.lower() 89 | for text in _nsre.split(s)] 90 | 91 | 92 | def natural_sort(l, key=natural_sort_key): 93 | return sorted(l, key=key) 94 | 95 | 96 | # https://stackoverflow.com/questions/11686720/is-there-a-numpy-builtin-to-reject-outliers-from-a-list # NOQA 97 | def reject_outliers(data, m=5.189): 98 | d = np.abs(data - np.median(data)) 99 | mdev = np.median(d) 100 | s = d / mdev if mdev else 0. 101 | return data[s < m] 102 | 103 | 104 | def main(): 105 | parser = argparse.ArgumentParser( 106 | description='Compute Hellinger distances between predicted ' 107 | 'and ground truth gestures in a one-to-one manner') 108 | parser.add_argument('--original', '-o', default='data/original', 109 | help='Original gesture directory') 110 | parser.add_argument('--predicted', '-p', default='data/predicted', 111 | help='Predicted gesture directory') 112 | parser.add_argument('--width', '-w', type=float, default=0.05, 113 | help='Bin width of the histogram (default: 0.05)') 114 | parser.add_argument('--joints', '-j', default='joints.txt', 115 | help='Joint name file') 116 | parser.add_argument('--select', '-s', nargs='+', 117 | help='Joint subset to compute (if omitted, use all)') 118 | parser.add_argument('--visualize', '-v', action='store_true', 119 | help='Visualize histograms') 120 | parser.add_argument('--match_yticks', '-m', action='store_true', 121 | help='Match y-ticks over all the sequences in visualization') 122 | parser.add_argument('--out', default='results', 123 | help='Directory to output the result') 124 | args = parser.parse_args() 125 | 126 | joint_names = read_joint_names(args.joints) 127 | 128 | if args.select is not None: 129 | selected_joints = [] 130 | for s in args.select: 131 | try: 132 | index = joint_names.index(s) 133 | except ValueError: 134 | print('Ignore invalid joint: {}'.format(s)) 135 | else: 136 | selected_joints.append(index) 137 | selected_joints.sort() 138 | 139 | if len(selected_joints) == 0: 140 | selected_joints = range(len(joint_names)) 141 | print('No valid joints are selected. 
Use all joints') 142 | else: 143 | # Use all joints 144 | selected_joints = range(len(joint_names)) 145 | 146 | def get_directories(directory): 147 | return sorted(filter(lambda x: os.path.isdir(x), glob.glob(directory))) 148 | 149 | # Define histogram bins 150 | bins = np.arange(0, 1 + args.width, args.width) 151 | 152 | # Find original gesture data 153 | original_files = natural_sort( 154 | glob.glob(os.path.join(args.original, '*.txt'))) 155 | 156 | if args.match_yticks: 157 | max_freqs = [] 158 | 159 | # Compute speed histogram for original gestures 160 | original_hists = [] 161 | for original_file in original_files: 162 | original = np.loadtxt(original_file) 163 | 164 | # Compute speed histogram 165 | original_speed = compute_speed(original)[:, selected_joints] 166 | original_hist, _ = np.histogram(original_speed, bins=bins) 167 | 168 | original_hists.append(original_hist) 169 | 170 | if args.match_yticks: 171 | max_freqs.append(normalize(original_hist).max().item()) 172 | 173 | # List of predicted gesture direcotires 174 | predicted_dirs = get_directories(os.path.join(args.predicted, '*')) 175 | 176 | if len(predicted_dirs) == 0: 177 | raise ValueError('No gesture directories are found in ' 178 | + args.predicted) 179 | 180 | results = {os.path.basename(d): None for d in predicted_dirs} 181 | 182 | assert 'original' not in results.keys() 183 | 184 | # Store original gesture histograms 185 | original_key = 'original' 186 | results[original_key] = dict() 187 | for i, original_hist in enumerate(original_hists): 188 | file_key = os.path.basename(original_files[i]) 189 | results[original_key][file_key] = {'hist': original_hist} 190 | 191 | # Iterate over the list of direcotires 192 | overall_dists = dict() 193 | for predicted_dir in predicted_dirs: 194 | predicted_files = natural_sort( 195 | glob.glob(os.path.join(predicted_dir, '*.txt'))) 196 | 197 | # Check if the predicted gesture files are consistent with the original files 198 | if [os.path.basename(p) for p in predicted_files] != [os.path.basename(o) for o in original_files]: 199 | raise ValueError('Gesture files located in ' + predicted_dir + ' are inconsistent with ' 200 | 'original gesture files located in ' + args.original) 201 | 202 | dir_key = os.path.basename(predicted_dir) 203 | results[dir_key] = dict() 204 | 205 | # Compute speed histogram for predicted gestures 206 | predicted_hists = [] 207 | for predicted_file in predicted_files: 208 | predicted = np.loadtxt(predicted_file) 209 | 210 | # Compute speed histogram 211 | predicted_speed = compute_speed(predicted)[:, selected_joints] 212 | predicted_hist, _ = np.histogram(predicted_speed, bins=bins) 213 | 214 | predicted_hists.append(predicted_hist) 215 | 216 | if args.match_yticks: 217 | max_freqs.append(normalize(predicted_hist).max().item()) 218 | 219 | assert len(original_hists) == len(predicted_hists) 220 | 221 | # Compute Hellinger distance in a one-to-one manner 222 | for i, (original_hist, predicted_hist) in enumerate(zip(original_hists, predicted_hists)): 223 | assert len(original_hist) == len(predicted_hist) 224 | 225 | # Hellinger distance between two histograms 226 | dist = hellinger(original_hist, predicted_hist) 227 | 228 | # Store results 229 | file_key = os.path.basename(predicted_files[i]) 230 | results[dir_key][file_key] = {'dist': dist, 'hist': predicted_hist} 231 | 232 | # Print the overall Hellinger distance (Note: this is not one-to-one) 233 | overall_dist = hellinger(np.sum(original_hists, axis=0), 234 | np.sum(predicted_hists, axis=0)) 235 | 
overall_dists[dir_key] = overall_dist 236 | 237 | # Create a dataframe to save 238 | dir_keys = natural_sort(results.keys()) 239 | dir_keys.remove('original') 240 | file_keys = natural_sort(results['original'].keys()) 241 | 242 | save_dict = {d_k: [results[d_k][f_k]['dist'] for f_k in file_keys] for d_k in dir_keys} 243 | df = pd.DataFrame(save_dict, index=file_keys) 244 | 245 | # Add mean and std values 246 | mean = df.mean() 247 | std = df.std() 248 | df.loc['mean'] = mean 249 | df.loc['std'] = std 250 | 251 | # Make an output directory 252 | if selected_joints == range(len(joint_names)): 253 | selected_joint_names = ['Total'] 254 | else: 255 | selected_joint_names = [joint_names[s] for s in selected_joints] 256 | out = os.path.join(args.out, os.path.basename(args.predicted), 257 | '+'.join(selected_joint_names)) 258 | if not os.path.isdir(out): 259 | os.makedirs(out) 260 | 261 | # Save the results to a CSV file 262 | df.to_csv(os.path.join(out, 'hellinger_distances.csv')) 263 | 264 | # Print and save the overall distances 265 | overall_str = ['Overall Hellinger distances:'] 266 | print('Overall Hellinger distances:') 267 | for dir_key in dir_keys: 268 | overall_str.append('{}: {}'.format(dir_key, overall_dists[dir_key])) 269 | print('{: <20}'.format(dir_key), 270 | '\t{:.3f}'.format(overall_dists[dir_key])) 271 | print('') 272 | 273 | overall_str = '\n'.join(overall_str) 274 | 275 | with open(os.path.join(out, 'overall_distances.txt'), 'w') as f: 276 | f.write(overall_str) 277 | 278 | if args.visualize: 279 | # Set color and style 280 | mpl_default = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', 281 | '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', 282 | '#bcbd22', '#17becf'] 283 | sns.set(context='poster', palette=sns.color_palette(mpl_default), font_scale=1.05) 284 | sns.set_style('white', {'legend.frameon': True}) 285 | 286 | # Velocities are computed in 20fps: make them into cm/s 287 | plot_bins = [format(b, '.2f') for b in bins[:-1] * 20] 288 | 289 | # Plot speed in a range of [0, 15] 290 | plot_bins = plot_bins[:-4] 291 | 292 | # Make an output directory 293 | vis_out = os.path.join(out, 'histograms') 294 | if not os.path.isdir(vis_out): 295 | os.makedirs(vis_out) 296 | 297 | if args.match_yticks: 298 | max_percentage = int(reject_outliers(np.array(max_freqs)).max().item() * 100) 299 | 300 | tick_interval = 5 if max_percentage // 5 < 9 else 10 # Avoid too many ticks 301 | ticks = list(range(0, max_percentage, tick_interval)) 302 | 303 | for file_key in file_keys: 304 | # Plot in a range of [0, 15] 305 | original_hist = results['original'][file_key]['hist'][:-4] 306 | 307 | fig = plt.figure(figsize=(8, 5)) 308 | ax = fig.add_subplot(111) 309 | 310 | # Convert frequency to percentage 311 | gt_handle, = ax.plot(plot_bins, normalize(original_hist) * 100, color='C4') 312 | 313 | # Awesome way to create a tabular-style legend 314 | # https://stackoverflow.com/questions/25830780/tabular-legend-layout-for-matplotlib 315 | # Create a blank rectangle 316 | blank = Rectangle((0, 0), 1, 1, fc="w", fill=False, edgecolor='none', linewidth=0) 317 | 318 | # Correspond to each columns of the tabular 319 | legend_handles = [blank, gt_handle] 320 | legend_names = ['Name', 'Ground Truth'] 321 | legend_dists = ['Hell. 
Dist.', '0'.center(16)] 322 | 323 | colors = ['C1', 'C3', 'C0', 'C2'] if len(dir_keys) <= 4 else \ 324 | ['C1', 'C0', 'C6', 'C7', 'C8', 'C9', 'C5', 'C2', 'C3'] 325 | 326 | assert len(dir_keys) <= len(colors) 327 | 328 | for color, dir_key in zip(colors, dir_keys): 329 | predicted_hist = results[dir_key][file_key]['hist'][:-4] 330 | label = dir_key.split('-')[1].replace('_smooth', '*') 331 | 332 | # if 'Aud2Pose' in label: 333 | # label += ' [18]' 334 | 335 | handle, = ax.plot(plot_bins, normalize(predicted_hist) * 100, color=color) 336 | 337 | legend_handles.append(handle) 338 | legend_names.append(label) 339 | legend_dists.append('{:.3f}'.format(results[dir_key][file_key]['dist']).center(12)) 340 | 341 | # Legend will have a tabular of (rows x 3) 342 | rows = len(legend_handles) 343 | empty_label = [''] 344 | 345 | legend_handles = legend_handles + [blank] * (rows * 2) 346 | legend_labels = np.concatenate([empty_label * rows, legend_names, legend_dists]) 347 | 348 | ax.legend(legend_handles, legend_labels, 349 | ncol=3, handletextpad=0.5, columnspacing=-2.15, 350 | labelspacing=0.35) 351 | ax.set_xlabel('Speed (cm/s)') 352 | ax.set_ylabel('Frequency (%)') 353 | ax.set_xticks(np.arange(16)) 354 | ax.tick_params(pad=6) 355 | 356 | if args.match_yticks: 357 | ax.set_ylim(0, max_percentage) 358 | ax.yaxis.set_major_locator(FixedLocator(ticks)) 359 | else: 360 | ax.yaxis.set_major_locator( 361 | MaxNLocator(nbins='auto', steps=[1, 2, 2.5, 5, 10], integer=True)) 362 | 363 | plt.subplots_adjust(left=0.09, right=0.98, top=0.98, bottom=0.12) 364 | plt.savefig(os.path.join(vis_out, os.path.splitext(file_key)[0] + '_speed_histogram.pdf')) 365 | plt.show() 366 | 367 | plt.clf() 368 | plt.close() 369 | 370 | print('Results were writen in ' + out) 371 | print('') 372 | 373 | 374 | if __name__ == '__main__': 375 | main() 376 | -------------------------------------------------------------------------------- /evaluation/joints.txt: -------------------------------------------------------------------------------- 1 | Hips,Spine,Spine1,Neck,Head,Site1,LeftShoulder,LeftArm,LeftForeArm,LeftHand,LeftHandThumb1,LeftHandThumb2,LeftHandThumb3,Site2,LeftHandIndex1,LeftHandIndex2,LeftHandIndex3,Site3,LeftHandMiddle1,LeftHandMiddle2,LeftHandMiddle3,Site4,LeftHandRing1,LeftHandRing2,LeftHandRing3,Site5,LeftHandPinky1,LeftHandPinky2,LeftHandPinky3,Site6,RightShoulder,RightArm,RightForeArm,RightHand,RightHandThumb1,RightHandThumb2,RightHandThumb3,Site7,RightHandIndex1,RightHandIndex2,RightHandIndex3,Site8,RightHandMiddle1,RightHandMiddle2,RightHandMiddle3,Site9,RightHandRing1,RightHandRing2,RightHandRing3,Site10,RightHandPinky1,RightHandPinky2,RightHandPinky3,Site11,LeftUpLeg,LeftLeg,LeftFoot,LeftToeBase,Site12,RightUpLeg,RightLeg,RightFoot,RightToeBase,Site13 -------------------------------------------------------------------------------- /evaluation/plot_results.py: -------------------------------------------------------------------------------- 1 | """ 2 | Plots the experimental results after calculating motion statistics 3 | Expects that calc_distance was run before this script 4 | 5 | @author: Taras Kucherenko 6 | """ 7 | 8 | import matplotlib.pyplot as plt 9 | import csv 10 | import numpy as np 11 | 12 | def read_joint_names(filename): 13 | with open(filename, 'r') as f: 14 | org = f.read() 15 | joint_names = org.split(',') 16 | 17 | return joint_names 18 | 19 | def read_csv(filename): 20 | 21 | joint_names = read_joint_names("joints.txt") 22 | 23 | r_shoulder_index = joint_names.index("RightShoulder") + 1 24 | 
l_shoulder_index = joint_names.index("LeftShoulder") + 1 25 | 26 | r_hand_index = joint_names.index("RightHand") + 1 27 | l_hand_index = joint_names.index("LeftHand") + 1 28 | 29 | r_forearm_index = joint_names.index("RightForeArm") + 1 30 | l_forearm_index = joint_names.index("LeftForeArm") + 1 31 | 32 | x=[] 33 | y=[] 34 | total_sum = 0 35 | with open(filename, 'r') as csvfile: 36 | reader = csv.reader(csvfile, delimiter=',') 37 | next(reader, None) # skip the headers 38 | for row in reader: 39 | x.append(float(row[0]) * 20) # Scale the velocity 40 | next_val = float(row[r_hand_index]) + float(row[l_hand_index]) # float(row[-1]) #l_hand_index]) # 41 | y.append(next_val*100) 42 | total_sum+=next_val 43 | 44 | # Crop on 15 45 | if float(row[0]) * 20 >= 15: 46 | break 47 | 48 | return np.array(x), np.array(y) / total_sum 49 | 50 | def barplot_annotate_brackets(num1, num2, data, center, height, yerr=None, dh=.05, barh=.05, fs=None, maxasterix=None): 51 | """ 52 | Annotate barplot with p-values. 53 | 54 | :param num1: number of left bar to put bracket over 55 | :param num2: number of right bar to put bracket over 56 | :param data: string to write or number for generating asterixes 57 | :param center: centers of all bars (like plt.bar() input) 58 | :param height: heights of all bars (like plt.bar() input) 59 | :param yerr: yerrs of all bars (like plt.bar() input) 60 | :param dh: height offset over bar / bar + yerr in axes coordinates (0 to 1) 61 | :param barh: bar height in axes coordinates (0 to 1) 62 | :param fs: font size 63 | :param maxasterix: maximum number of asterixes to write (for very small p-values) 64 | """ 65 | 66 | text = data 67 | 68 | lx, ly = center[num1], height[num1] 69 | rx, ry = center[num2], height[num2] 70 | 71 | if yerr: 72 | ly += yerr[num1] 73 | ry += yerr[num2] 74 | 75 | ax_y0, ax_y1 = plt.gca().get_ylim() 76 | dh *= (ax_y1 - ax_y0) 77 | barh *= (ax_y1 - ax_y0) 78 | 79 | y = max(ly, ry) + dh 80 | 81 | barx = [lx, lx, rx, rx] 82 | bary = [y, y+barh, y+barh, y] 83 | mid = ((lx+rx)/2, y+barh) 84 | 85 | plt.plot(barx, bary, c='black') 86 | 87 | kwargs = dict(ha='center', va='bottom') 88 | if fs is not None: 89 | kwargs['fontsize'] = fs 90 | 91 | #plt.text(*mid, text, **kwargs) 92 | 93 | 94 | def get_average(feature_name): 95 | 96 | feature_filename = 'result/'+feature_name+'/1/hmd_' + type + '_0.05.csv' 97 | _, feature_1 = read_csv(feature_filename) 98 | feature_filename = 'result/'+feature_name+'/2/hmd_' + type + '_0.05.csv' 99 | _, feature_2 = read_csv(feature_filename) 100 | feature_filename = 'result/'+feature_name+'/3/hmd_' + type + '_0.05.csv' 101 | _, feature_3 = read_csv(feature_filename) 102 | # average 103 | feature = np.mean(np.array([feature_1, feature_2, feature_3]), axis=0) 104 | 105 | return feature 106 | 107 | 108 | plt.rcParams.update({'font.size': 36}) 109 | 110 | 111 | type = "vel" 112 | 113 | original_filename = 'result/original/hmd_'+type+'_0.05.csv' 114 | 115 | x,original = read_csv(original_filename) 116 | 117 | mfcc = get_average('MFCC') 118 | 119 | baseline = get_average('MFCC_Bas') 120 | 121 | spectr = get_average('Spectr') 122 | 123 | pros = get_average('Pros') 124 | 125 | spectr_pros = get_average('Spectr_Pros') 126 | 127 | mfcc_pros = get_average('MFCC_Pros') 128 | 129 | 130 | """baseline = [4.160, 4.940, 4.319] 131 | encoder = np.array([4.798, 4.830, 4.151]) 132 | x = np.arange(3) 133 | 134 | errorB = [0.93, 1, 1.43] 135 | errorE = [0.89, 0.98, 1.43] 136 | 137 | plt.bar(x, baseline, yerr=errorB, label='Baseline' ,width = 0.25, hatch='/') 138 
| plt.bar(x+0.25, encoder, label = 'Proposed' ,width = 0.25) 139 | 140 | special_x = np.array([0, 0.25, 0.5, 0.75]) 141 | 142 | barplot_annotate_brackets(0, 1, "p < 0.002", special_x, encoder) 143 | barplot_annotate_brackets(1, 2, "p = 0.32", special_x+0.75, encoder) 144 | barplot_annotate_brackets(1, 2, "p = 0.13", special_x+1.75, encoder) 145 | 146 | plt.xticks(np.arange(3),('Naturalness', 'Time-consistency', 'Semantic-consistency')) 147 | 148 | plt.legend(bbox_to_anchor=(0.2, 0.99), ncol=2) 149 | 150 | plt.ylim(top=6)""" 151 | 152 | 153 | 154 | 155 | 156 | #plt.plot(x,original, label='Ground Truth',linewidth=7.0)#,width = 0.25) 157 | plt.plot(x,original,linewidth=7.0, label='Ground Truth', color='Purple') 158 | plt.plot(x,spectr , label='Proposed (Spectral)',linewidth=7.0) 159 | plt.plot(x,pros , label='Proposed (Prosodic)',linewidth=7.0, color='C2') 160 | 161 | 162 | #plt.plot(x,mfcc_pros , label='MFCC+Pros',linewidth=7.0, color='Pink') 163 | #plt.plot(x,spectr_pros , label='Spectrogram+Pros',linewidth=7.0, color='C3') 164 | 165 | plt.plot(x,mfcc , label='Proposed (MFCC)',linewidth=7.0, color='C1') 166 | 167 | plt.plot(x,baseline , label='Baseline (MFCC)',linewidth=7.0, color='Blue') 168 | 169 | plt.xlabel("Velocity (cm/s)") 170 | plt.ylabel('Frequency (%)') 171 | #plt.title('Average Velocity Histogram') 172 | 173 | 174 | 175 | plt.xticks(np.arange(16))#, ('Tom', 'Dick', 'Harry', 'Sally', 'Sue')) 176 | 177 | 178 | leg = plt.legend() 179 | 180 | 181 | 182 | plt.show() 183 | -------------------------------------------------------------------------------- /example_scripts/README.md: -------------------------------------------------------------------------------- 1 | # Example Scripts 2 | 3 | This directory contains the scripts used in our experiments for training and testing different Neural Networks (NN): 4 | 1. Training and testing the baseline gesture generation NN (baseline_train_n_test.sh) 5 | 2. Training and testing the autoencoder-based gesture generation NN (proposed_train_n_test.sh) 6 | 7 | Note: prior to using these scripts a user needs to 8 | a) download and preprocess the dataset, as described in the root folder 9 | b) adjust the parameters in the `config.txt` file 10 | 11 | ### Baseline model 12 | 13 | Use `baseline_train_n_test.sh` to train and test the baseline speech-driven gesture generation neural network 14 | ```sh 15 | ./baseline_train_n_test.sh 16 | ``` 17 | The resulting model will be stored in the following file: `folder`BasedModel.hdf5 18 | The numerical evaluation will be written to the file `../results.txt` 19 | 20 | Note: `baseline_test.sh` is used in `baseline_train_n_test.sh` for testing. 21 | 22 | 23 | ### Proposed model 24 | 25 | Use `proposed_train_n_test.sh` to train and test the proposed autoencoder-based speech-driven gesture generation neural network 26 | ```sh 27 | ./proposed_train_n_test.sh 28 | ``` 29 | The resulting model will be stored in the following file: `folder`Based`enc_dim`DimModel.hdf5 30 | The numerical evaluation will be written to the file `../results.txt` 31 | 32 | Note: `proposed_test.sh` is used in `proposed_train_n_test.sh` for testing.
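If the model is already trained, a single test sequence can also be predicted by hand, mirroring what `baseline_test.sh` does inside its loop. The following is a minimal sketch only: it assumes you run it from the repository root after `source example_scripts/config.txt`, and the sequence number 1094 and all paths are placeholders that must match your own `config.txt`.

```sh
# Predict gestures for one test sequence with an already trained baseline model
mkdir -p example_scripts/gestures
CUDA_VISIBLE_DEVICES=$gpu python predict.py \
    example_scripts/models/${folder}BasedModel.hdf5 \
    $data_dir/test_inputs/X_test_audio1094.npy \
    example_scripts/gestures/gesture1094.txt

# Strip the velocity channels from the prediction before evaluation
python helpers/remove_velocity.py -g example_scripts/gestures
```

As in `baseline_test.sh`, the velocity-free gestures end up in a `no_vel` subfolder, from where they can be moved to `evaluation/data/predicted/` for the numerical evaluation described in the evaluation README.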
33 | -------------------------------------------------------------------------------- /example_scripts/baseline_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # This script is used in "baseline_train_n_test.sh" to evaluate the baseline model 4 | # You can use it by itself if the model is already trained 5 | # Several aspects need to be customized 6 | 7 | # Read parameters 8 | source config.txt 9 | 10 | model=example_scripts/models/${folder}"BasedModel" 11 | 12 | # Create a folder to store produced gesture sequences 13 | mkdir -p gestures 14 | 15 | # Remove previous results 16 | cd .. 17 | rm evaluation/data/predicted/$speech_features/* 18 | 19 | # Make predictions for all the test sequences 20 | # (replace 1094 by 1093 for the dev sequences) 21 | for seq in `seq 1094 2 1182`; 22 | do 23 | echo 24 | echo 'Predicting sequence' $seq 25 | CUDA_VISIBLE_DEVICES=$gpu python predict.py $model.hdf5 $data_dir/test_inputs/X_test_audio${seq}.npy normal_prediction$seq.txt 26 | mv normal_prediction$seq.txt example_scripts/gestures/gesture${seq}.txt 27 | done 28 | 29 | echo 'Removing the velocities ...' 30 | python helpers/remove_velocity.py -g example_scripts/gestures 31 | cd example_scripts/gestures 32 | 33 | # Remove gestures with velocities 34 | rm *.txt 35 | 36 | # Move gestures without velocities to the corresponding folder 37 | mkdir -p ../../evaluation/data/predicted/$speech_features/ 38 | mv no_vel/*.txt ../../evaluation/data/predicted/$speech_features/ 39 | cd ../../evaluation 40 | 41 | # In order for the evaluation to be correct, ONLY ground truth motion 3D coords in txt format for the 42 | # same sequences as used in the script above (1094, 1096,...) have to be in evaluation/data/original 43 | # If evaluation/data/original contains all the sequences (1093,1094...) the results will be wrong 44 | # See the "evaluation" folder for info on how to transform the true gestures from bvh to txt format 45 | 46 | echo 'Evaluating ...' 47 | echo "Evaluating "${model}" ..."
>> ../results.txt 48 | python calc_errors.py -g $speech_features -m ape >> ../results.txt 49 | python calc_errors.py -g $speech_features -m mae >> ../results.txt 50 | python calc_jerk.py -g $speech_features -m acceleration >> ../results.txt 51 | python calc_jerk.py -g $speech_features >> ../results.txt 52 | -------------------------------------------------------------------------------- /example_scripts/baseline_train_n_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # This script can be used to train a speech-gesture neural network 4 | # You might need to customize it using config.txt file 5 | 6 | # (Optional) Activate your virtual env 7 | source activate CondaEnvPy3Tf 8 | 9 | # Read the parameters for the scripts 10 | source config.txt 11 | 12 | model=${folder}"BasedModel" 13 | 14 | echo "Training "${model}" on the ${folder} folder" 15 | START=$(date +%s) 16 | 17 | # Train baseline model 18 | CUDA_VISIBLE_DEVICES=$gpu python ../train.py models/$model.hdf5 100 $data_dir $numb_in_features False 19 | 20 | Tr_FINISH=$(date +%s) 21 | 22 | # Evaluate the model 23 | echo "Testing "${model}" model" >> ../results.txt 24 | ./baseline_test.sh 25 | 26 | # Compress and save the results 27 | archive=${model}Results.tar 28 | echo "Compressing the results:" 29 | tar -czvf $archive ../evaluation/data/predicted/$speech_features/*.txt 30 | echo "The results were compressed into example_scripts/"$archive 31 | 32 | END=$(date +%s) 33 | DIFF=$(( $END - $START )) 34 | echo "The whole cicle took $[DIFF/60] minutes" 35 | 36 | DIFF=$(( $Tr_FINISH - $START )) 37 | echo "Learning speech-motion mapping took $[DIFF/60] minutes" 38 | 39 | echo "The model was saved in "example_scripts/models/${model}".hdf5" 40 | -------------------------------------------------------------------------------- /example_scripts/config.txt: -------------------------------------------------------------------------------- 1 | gpu=0 # which GPU to use to run the model 2 | folder=TheLAtest # which folder with the data to use 3 | data_dir=/home/taras/Documents/Datasets/SpeechToMotion/Japanese/$folder 4 | speech_features=MFCC # which speech features to use 5 | numb_in_features=26 # how many speech features there are 6 | dim=325 # what is the dimensionality of representation 7 | -------------------------------------------------------------------------------- /example_scripts/proposed_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # This script is used in "proposed_train_n_test.sh" to evaluate the proposed model 4 | # You call use it by itself if the model is already trained 5 | # Several aspects needs to be customized at config.txt 6 | 7 | # Read the parameters for the scripts 8 | source config.txt 9 | 10 | model=example_scripts/models/${folder}"Based"${dim}"DimModel" 11 | 12 | # Create a folder to store produced gesture sequences 13 | mkdir -p gestures 14 | 15 | # Remove previous results 16 | cd .. 
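# (On the very first run the predicted-gestures directory may not exist yet; it is created further down with mkdir -p, so the rm below may simply report that there is nothing to remove.)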
17 | rm evaluation/data/predicted/$speech_features/*
18 |
19 | # Make predictions for all the test sequences
20 | # (replace 1094 by 1093 for the dev sequences)
21 | for seq in `seq 1094 2 1182`;
22 | do
23 | echo
24 | echo 'Predicting sequence' $seq
25 | # Step 1: Predict representation
26 | CUDA_VISIBLE_DEVICES=$gpu python predict.py $model.hdf5 $data_dir/test_inputs/X_test_audio${seq}.npy enc_${dim}_prediction$seq.txt
27 | mv enc_${dim}_prediction$seq.txt motion_repr_learning/ae/
28 | cd motion_repr_learning/ae/
29 | # Step 2: Decode representation into motion
30 | CUDA_VISIBLE_DEVICES=$gpu python decode.py $data_dir enc_${dim}_prediction${seq}.txt ../../example_scripts/gestures/gesture${seq}.txt -restore=True -pretrain=False -layer1_width=$dim -chkpt_dir='/home/taras/tmp/MoCap/'$dim -batch_size=8
31 | # Remove encoded prediction
32 | rm enc_${dim}_pred*
33 | cd ../..
34 | done
35 |
36 | echo 'Removing the velocities ...'
37 | python helpers/remove_velocity.py -g example_scripts/gestures
38 | cd example_scripts/gestures
39 |
40 | # Remove gestures that still contain velocities
41 | rm *.txt
42 |
43 | # Move gestures without velocities to the corresponding folder
44 | mkdir -p ../../evaluation/data/predicted/$speech_features/
45 | mv no_vel/*.txt ../../evaluation/data/predicted/$speech_features/
46 | cd ../../evaluation
47 |
48 | # For the evaluation to be correct, ONLY the ground-truth motion 3D coordinates in txt format for the
49 | # same sequences as used in the loop above (1094, 1096, ...) have to be in evaluation/data/original;
50 | # if evaluation/data/original contains all the sequences (1093, 1094, ...) the results will be wrong.
51 | # See the "evaluation" folder for info on how to transform the true gestures from bvh to txt format.
52 |
53 | echo 'Evaluating ...'
54 | echo "Evaluating "${model}" ..."
>> ../results.txt 55 | python calc_errors.py -g $speech_features -m ape >> ../results.txt 56 | python calc_errors.py -g $speech_features -m mae >> ../results.txt 57 | python calc_jerk.py -g $speech_features >> ../results.txt 58 | python calc_jerk.py -g $speech_features -m acceleration >> ../results.txt 59 | # Where to store the results can be customized 60 | -------------------------------------------------------------------------------- /example_scripts/proposed_train_n_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # This script contain both training and testing 4 | # of the autoencoder based gesture generation neural network 5 | # You might need to customize it using config.txt file 6 | 7 | # (Optional) Activate your virtual env 8 | source activate CondaEnvPy3Tf 9 | 10 | # Read the parameters for the scripts 11 | source config.txt 12 | 13 | model=${folder}"Based"${dim}"DimModel" 14 | 15 | echo "Training "${model} 16 | 17 | # Do timing 18 | START=$(date +%s) 19 | 20 | cd ../motion_repr_learning/ae/ 21 | 22 | # Create a folder for the encoded dataset 23 | mkdir -p $data_dir/325 24 | 25 | # Learn dataset encoding 26 | CUDA_VISIBLE_DEVICES=$gpu python learn_dataset_encoding.py $data_dir -chkpt_dir='/home/taras/tmp/MoCap/'$dim -layer1_width=$dim 27 | 28 | #Encode dataset 29 | echo "Encoding the dataset" 30 | CUDA_VISIBLE_DEVICES=$gpu python encode_dataset.py $data_dir -chkpt_dir='/home/taras/tmp/MoCap/'$dim -restore=True -pretrain=False -layer1_width=$dim 31 | 32 | # Copy input data 33 | Encoding=$(date +%s) 34 | 35 | cd ../../example_scripts 36 | 37 | Tr_START=$(date +%s) 38 | 39 | # Train model on the reprentation 40 | CUDA_VISIBLE_DEVICES=$gpu python ../train.py models/$model.hdf5 100 $data_dir $numb_in_features True $dim 41 | 42 | Tr_FINISH=$(date +%s) 43 | 44 | # Evaluate the model 45 | ./proposed_test.sh 46 | 47 | # Compress and save the results 48 | archive=${model}Results.tar 49 | echo "Compressing the results:" 50 | tar -czvf $archive ../evaluation/data/predicted/$speech_features/*.txt 51 | echo "The results were compressed into example_scripts/models/"$archive 52 | 53 | END=$(date +%s) 54 | DIFF=$(( $END - $START )) 55 | echo "The whole cicle took $[DIFF/60] minutes" 56 | 57 | DIFF=$(( $Encoding - $START )) 58 | echo "Learning repr. and encoding took $[DIFF/60] minutes" 59 | 60 | DIFF=$(( $Tr_FINISH - $Tr_START )) 61 | echo "Learning speech-motion mapping took $[DIFF/60] minutes" 62 | -------------------------------------------------------------------------------- /helpers/README.md: -------------------------------------------------------------------------------- 1 | # How to use the helper scripts 2 | 3 | This directory provides data handling scripts for our gesture generation framework. It provides the following functionality: 4 | - Velocity remover for predicted gestures 5 | 6 | (The neural network outputs coordinates and velocities to regularize training and we remove velocities as postprocessing) 7 | - Original gesture converter to create the ground truth 8 | 9 | (Converting the original motion for joint angles space in .bvh format to 3d coordinates in txt coordinates) 10 | - Temporal filters for motion smoothing 11 | 12 | (Can be applied as postprocessing. Were not used in the experiments from the paper) 13 | 14 | ## Data preparation 15 | 1. Run `../predict.py` to predict gestures from speech audio as described in the root folder. 16 | 2. Put the predicted gestures (e.g. 
`predict_1094.txt, ...`) into a directory, say, `your_prediction_dir/`. 17 | 18 | ### Velocity remover 19 | 20 | `remove_velocity.py` removes velocities from raw predicted gestures. This produces gesture files containing `(x, y, z) x 64 joints = 192` white space separated data for each line. 21 | **You have to remove the velocities before using the evaluation scripts or the animation server.** 22 | 23 | ```sh 24 | # Remove velocities 25 | python remove_velocity.py -g your_prediction_dir 26 | ``` 27 | The resulting files will be stored in the subfolder: `your_prediction_dir/no_vel` 28 | 29 | ### Original gesture converter 30 | 31 | `convert_original.py` converts `.bvh` files in the test set to ground truth body keypoint positions. **You need the ground truth for the quantitative evaluation.** 32 | 33 | ```sh 34 | # Convert test bvh to ground truth 35 | python convert_original.py 36 | ``` 37 | 38 | Note: `convert_original.py` assumes that the `.bvh` files are stored in `../data/test/labels/` by default. You can use `--data` or `-d` option to specify a different directory. You can specify the output directory by `--out` or `-o` option (default: `../evaluation/data/original/`). 39 | 40 | ### Temporal filters 41 | 42 | We support two types of temporal filters, 1€ filter and Simple Moving Average (SMA) filter, to smooth gesture motion. 43 | 44 | To apply filters, you can use `apply_filters.py`. 45 | You can change the averaging window size for SMA filter by `--window` or `-w` option (default: 5). 46 | 47 | ```sh 48 | # Apply temporal filters 49 | python apply_filters.py -g your_prediction_dir -w 5 50 | ``` 51 | 52 | Note: `apply_filters.py` produces three types of smoothed gestures (1€, SMA, and 1€ + SMA). The smoothed gestures will be stored in `euro/`, `sma/`, and `euro_sma/` subfolders of `your_prediction_dir/`. 53 | -------------------------------------------------------------------------------- /helpers/apply_filters.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Apply smoothing filters as postprocessing 4 | 5 | @author: kaneko.naoshi 6 | """ 7 | 8 | 9 | import argparse 10 | import glob 11 | import os 12 | 13 | import numpy as np 14 | 15 | from filters.ma_filter import simple_moving_average 16 | from filters.one_euro_filter import apply_one_euro 17 | 18 | 19 | def save_positions(out_dir, gesture_name, positions): 20 | """Save body keypoint positions into file 21 | 22 | Args: 23 | out_dir: output directory 24 | gesture_name: basename of the output file 25 | positions: keypoint positions to save 26 | """ 27 | 28 | filename = os.path.join(out_dir, gesture_name + '.txt') 29 | np.savetxt(filename, positions, fmt='%s') 30 | 31 | 32 | def main(): 33 | parser = argparse.ArgumentParser( 34 | description='Apply filters to the generated gestures') 35 | parser.add_argument('--gesture', '-g', required=True, 36 | help='Path to the gesture directory to filter') 37 | parser.add_argument('--window', '-w', type=int, default=5, 38 | help='Windows size for moving average (must be odd)') 39 | args = parser.parse_args() 40 | 41 | print('Apply temporal filters to the ' 42 | 'gestures in "{}"'.format(args.gesture)) 43 | print('') 44 | 45 | # List of gesture files 46 | txt_paths = sorted(glob.glob(os.path.join(args.gesture, '*.txt'))) 47 | 48 | # Check file existence 49 | if not txt_paths: 50 | raise ValueError('Could not find the gesture files in "{}". ' 51 | 'Please specify correct folder as --gesture flag.' 
52 | .format(args.gesture)) 53 | 54 | # Filter types 55 | types = { 56 | 'euro': 'euro', 57 | 'sma': 'sma{}'.format(args.window), 58 | 'euro_sma': 'euro_sma{}'.format(args.window)} 59 | 60 | # Make output directories 61 | euro_dir = os.path.join(args.gesture, types['euro']) 62 | sma_dir = os.path.join(args.gesture, types['sma']) 63 | euro_sma_dir = os.path.join(args.gesture, types['euro_sma']) 64 | for d in [euro_dir, sma_dir, euro_sma_dir]: 65 | if not os.path.isdir(d): 66 | os.makedirs(d) 67 | 68 | for txt_path in txt_paths: 69 | print('Process "{}"'.format(txt_path)) 70 | 71 | raw_pos = np.loadtxt(txt_path) 72 | 73 | # One Euro filter 74 | euro_pos = apply_one_euro(raw_pos) 75 | 76 | # Moving average filter 77 | sma_pos = simple_moving_average(raw_pos, args.window) 78 | 79 | # Combined 80 | euro_sma_pos = simple_moving_average(euro_pos, args.window) 81 | 82 | gesture_name, _ = os.path.splitext(os.path.basename(txt_path)) 83 | save_positions(euro_dir, gesture_name, euro_pos) 84 | save_positions(sma_dir, gesture_name, sma_pos) 85 | save_positions(euro_sma_dir, gesture_name, euro_sma_pos) 86 | 87 | print('') 88 | print('Results were written under "{}"'.format(args.gesture)) 89 | print('') 90 | 91 | 92 | if __name__ == '__main__': 93 | main() 94 | -------------------------------------------------------------------------------- /helpers/convert_original.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Convert ground truth gestures from joint angles in bvh format to the 3d coordinates in text format 4 | 5 | @author: kaneko.naoshi 6 | """ 7 | 8 | 9 | import argparse 10 | import glob 11 | import os 12 | 13 | import numpy as np 14 | import pyquaternion as pyq 15 | 16 | 17 | def create_hierarchy_nodes(filename): 18 | """Load bvh hierarchy nodes 19 | 20 | Args: 21 | filename: name of the hierarchy file 22 | 23 | Returns: 24 | nodes: bvh hierarchy nodes 25 | """ 26 | 27 | # Read BVH hierarchy 28 | with open(filename, 'r') as f: 29 | hierarchy = f.readlines() 30 | 31 | joint_offsets = [] 32 | joint_names = [] 33 | 34 | for idx, line in enumerate(hierarchy): 35 | hierarchy[idx] = hierarchy[idx].split() 36 | 37 | if not len(hierarchy[idx]) == 0: 38 | line_type = hierarchy[idx][0] 39 | if line_type == 'OFFSET': 40 | offset = np.array([float(hierarchy[idx][1]), 41 | float(hierarchy[idx][2]), 42 | float(hierarchy[idx][3])]) 43 | joint_offsets.append(offset) 44 | elif line_type == 'ROOT' or line_type == 'JOINT': 45 | joint_names.append(hierarchy[idx][1]) 46 | elif line_type == 'End': 47 | joint_names.append('End Site') 48 | 49 | nodes = [] 50 | for idx, name in enumerate(joint_names): 51 | if idx == 0: 52 | parent = None 53 | elif idx in [6, 30]: # spine1->shoulders 54 | parent = 2 55 | elif idx in [14, 18, 22, 26]: # lefthand->leftfingers 56 | parent = 9 57 | elif idx in [38, 42, 46, 50]: # righthand->rightfingers 58 | parent = 33 59 | elif idx in [54, 59]: # hip->legs 60 | parent = 0 61 | else: 62 | parent = idx - 1 63 | 64 | if name == 'End Site': 65 | children = None 66 | elif idx == 0: # hips 67 | children = [1, 54, 59] 68 | elif idx == 2: # spine1 69 | children = [3, 6, 30] 70 | elif idx == 9: # lefthand 71 | children = [10, 14, 18, 22, 26] 72 | elif idx == 33: # righthand 73 | children = [34, 38, 42, 46, 50] 74 | else: 75 | children = [idx + 1] 76 | 77 | node = dict([('name', name), ('parent', parent), 78 | ('children', children), ('offset', joint_offsets[idx]), 79 | ('rel_degs', None), ('abs_qt', None), 80 | ('rel_pos', 
None), ('abs_pos', None)]) 81 | if idx == 0: 82 | node['rel_pos'] = node['abs_pos'] = [float(0), float(60), float(0)] 83 | node['abs_qt'] = pyq.Quaternion() 84 | nodes.append(node) 85 | 86 | return nodes 87 | 88 | 89 | def load_bvh(filename): 90 | """Load bvh motion frames 91 | 92 | Args: 93 | filename: bvh filename 94 | 95 | Returns: 96 | frames: list of bvh frames 97 | """ 98 | 99 | with open(filename, 'r') as f: 100 | frames = f.readlines() 101 | frametime = frames[310].split()[2] 102 | 103 | del frames[0:311] 104 | bvh_len = len(frames) 105 | 106 | for idx, line in enumerate(frames): 107 | frames[idx] = [float(x) for x in line.split()] 108 | 109 | for i in range(0, bvh_len): 110 | for j in range(0, 306 // 3): 111 | st = j * 3 112 | del frames[i][st:st + 3] 113 | 114 | # If data is approx 24fps, cut it to 20 fps (del every sixth line) 115 | # If data is 100fps, cut it to 20 fps (take every fifth line) 116 | if float(frametime) == 0.0416667: 117 | del frames[::6] 118 | elif float(frametime) == 0.010000: 119 | frames = frames[::5] 120 | else: 121 | print('Unsupported fps {} in {}'.format(frametime, filename)) 122 | 123 | return frames 124 | 125 | 126 | def rotation_to_position(frames, nodes): 127 | """Convert bvh frames to body keypoint positions 128 | 129 | Args: 130 | frames: bvh frames 131 | nodes: bvh hierarchy nodes 132 | 133 | Returns: 134 | out_data: array containing body keypoint positions 135 | """ 136 | 137 | output_lines = [] 138 | 139 | for frame in frames: 140 | node_idx = 0 141 | for i in range(51): 142 | stepi = i * 3 143 | z_deg = float(frame[stepi]) 144 | x_deg = float(frame[stepi + 1]) 145 | y_deg = float(frame[stepi + 2]) 146 | 147 | if nodes[node_idx]['name'] == 'End Site': 148 | node_idx = node_idx + 1 149 | nodes[node_idx]['rel_degs'] = [z_deg, x_deg, y_deg] 150 | current_node = nodes[node_idx] 151 | 152 | node_idx = node_idx + 1 153 | 154 | for start_node in nodes: 155 | abs_pos = np.array([0, 60, 0]) 156 | current_node = start_node 157 | if start_node['children'] is not None: 158 | for child_idx in start_node['children']: 159 | child_node = nodes[child_idx] 160 | 161 | child_offset = np.array(child_node['offset']) 162 | 163 | qz = pyq.Quaternion(axis=[0, 0, 1], 164 | degrees=start_node['rel_degs'][0]) 165 | qx = pyq.Quaternion(axis=[1, 0, 0], 166 | degrees=start_node['rel_degs'][1]) 167 | qy = pyq.Quaternion(axis=[0, 1, 0], 168 | degrees=start_node['rel_degs'][2]) 169 | qrot = qz * qx * qy 170 | offset_rotated = qrot.rotate(child_offset) 171 | child_node['rel_pos'] = start_node['abs_qt'].rotate( 172 | offset_rotated) 173 | 174 | child_node['abs_qt'] = start_node['abs_qt'] * qrot 175 | 176 | while current_node['parent'] is not None: 177 | abs_pos = abs_pos + current_node['rel_pos'] 178 | current_node = nodes[current_node['parent']] 179 | start_node['abs_pos'] = abs_pos 180 | 181 | line = [] 182 | for node in nodes: 183 | line.append(node['abs_pos']) 184 | output_lines.append(line) 185 | 186 | output_array = np.asarray(output_lines) 187 | out_data = np.empty([len(output_array), 192]) 188 | for idx, line in enumerate(output_array): 189 | out_data[idx] = line.flatten() 190 | 191 | return out_data 192 | 193 | 194 | def main(): 195 | parser = argparse.ArgumentParser( 196 | description='Convert original motion data into joint positions') 197 | parser.add_argument('--data', '-d', default='../data/test/labels', 198 | help='Path to the original test motion data directory') 199 | parser.add_argument('--out', '-o', default='../evaluation/data/original', 200 | help='Directory 
to store the resultant position files') 201 | args = parser.parse_args() 202 | 203 | print('Convert original gestures to the ground truth') 204 | if args.data != parser.get_default('data'): 205 | print('Warning: non-default original gesture directory is given: ' 206 | + args.data) 207 | print('') 208 | 209 | # List of bvh files 210 | bvh_paths = sorted(glob.glob(os.path.join(args.data, '*.bvh'))) 211 | 212 | # Check file existence 213 | if not bvh_paths: 214 | raise ValueError( 215 | 'Could not find the ground truth bvh files in "{}". ' 216 | 'Please specify correct folder as --data flag.'.format(args.data)) 217 | 218 | # Read bvh hierarchy 219 | nodes = create_hierarchy_nodes('../hierarchy.txt') 220 | 221 | # Make output directories 222 | if not os.path.isdir(args.out): 223 | os.makedirs(args.out) 224 | 225 | for bvh_path in bvh_paths: 226 | print('Process "{}"'.format(bvh_path)) 227 | frames = load_bvh(bvh_path) 228 | 229 | out_data = rotation_to_position(frames, nodes) 230 | gesture_name, _ = os.path.splitext(os.path.basename(bvh_path)) 231 | out_path = os.path.join(args.out, gesture_name + '.txt') 232 | np.savetxt(out_path, out_data, fmt='%s') 233 | 234 | print('') 235 | print('Results were written in "{}"'.format(args.out)) 236 | print('') 237 | 238 | 239 | if __name__ == '__main__': 240 | main() 241 | -------------------------------------------------------------------------------- /helpers/filters/__pycache__/ma_filter.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/genea-workshop/Speech_driven_gesture_generation_with_autoencoder/59e8ebdb0f6e87b0e81268046c99a4d6c9bf62a8/helpers/filters/__pycache__/ma_filter.cpython-35.pyc -------------------------------------------------------------------------------- /helpers/filters/__pycache__/one_euro_filter.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/genea-workshop/Speech_driven_gesture_generation_with_autoencoder/59e8ebdb0f6e87b0e81268046c99a4d6c9bf62a8/helpers/filters/__pycache__/one_euro_filter.cpython-35.pyc -------------------------------------------------------------------------------- /helpers/filters/ma_filter.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def simple_moving_average(pos_array, winlen): 5 | """Apply simple moving average filter to a gesture 6 | 7 | Args: 8 | pos_array: body keypoint positions to filter 9 | winlen: averaging window size (must be odd) 10 | Returns: 11 | np.ndarray: filtered positions 12 | """ 13 | 14 | pos_columns = [] 15 | winlen_oneside = int((winlen - 1) / 2) 16 | for i in range(len(pos_array[0])): 17 | line = [] 18 | for j in range(len(pos_array)): 19 | line.append(pos_array[j][i]) 20 | pos_columns.append(line) 21 | 22 | res_list = [] 23 | for i, joint in enumerate(pos_columns): 24 | line = [] 25 | for j in range(len(pos_columns[i])): 26 | start_idx = j - winlen_oneside 27 | end_idx = j + winlen_oneside + 1 28 | if start_idx < 0: 29 | line.append(np.mean(pos_columns[i][:end_idx])) 30 | elif end_idx > len(pos_columns[i]): 31 | line.append(np.mean(pos_columns[i][start_idx:])) 32 | else: 33 | line.append(np.mean(pos_columns[i][start_idx:end_idx])) 34 | res_list.append(line) 35 | 36 | res_array = np.array(res_list) 37 | 38 | return res_array.transpose() 39 | -------------------------------------------------------------------------------- 
/helpers/filters/one_euro_filter.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # OneEuroFilter.py - 4 | # 5 | # Author: Nicolas Roussel (nicolas.roussel@inria.fr) 6 | 7 | import math 8 | import numpy as np 9 | 10 | 11 | # ---------------------------------------------------------------------------- 12 | 13 | class LowPassFilter(object): 14 | 15 | def __init__(self, alpha): 16 | self.__setAlpha(alpha) 17 | self.__y = self.__s = None 18 | 19 | def __setAlpha(self, alpha): 20 | alpha = float(alpha) 21 | if alpha <= 0 or alpha > 1.0: 22 | raise ValueError("alpha (%s) should be in (0.0, 1.0]" % alpha) 23 | self.__alpha = alpha 24 | 25 | def __call__(self, value, timestamp=None, alpha=None): 26 | if alpha is not None: 27 | self.__setAlpha(alpha) 28 | if self.__y is None: 29 | s = value 30 | else: 31 | s = self.__alpha * value + (1.0 - self.__alpha) * self.__s 32 | self.__y = value 33 | self.__s = s 34 | return s 35 | 36 | def lastValue(self): 37 | return self.__y 38 | 39 | # ---------------------------------------------------------------------------- 40 | 41 | 42 | class OneEuroFilter(object): 43 | 44 | def __init__(self, freq, mincutoff=1.0, beta=0.0, dcutoff=1.0): 45 | if freq <= 0: 46 | raise ValueError("freq should be >0") 47 | if mincutoff <= 0: 48 | raise ValueError("mincutoff should be >0") 49 | if dcutoff <= 0: 50 | raise ValueError("dcutoff should be >0") 51 | self.__freq = float(freq) 52 | self.__mincutoff = float(mincutoff) 53 | self.__beta = float(beta) 54 | self.__dcutoff = float(dcutoff) 55 | self.__x = LowPassFilter(self.__alpha(self.__mincutoff)) 56 | self.__dx = LowPassFilter(self.__alpha(self.__dcutoff)) 57 | self.__lasttime = None 58 | 59 | def __alpha(self, cutoff): 60 | te = 1.0 / self.__freq 61 | tau = 1.0 / (2 * math.pi * cutoff) 62 | return 1.0 / (1.0 + tau / te) 63 | 64 | def __call__(self, x, timestamp=None): 65 | # ---- update the sampling frequency based on timestamps 66 | if self.__lasttime and timestamp: 67 | self.__freq = 1.0 / (timestamp - self.__lasttime) 68 | self.__lasttime = timestamp 69 | # ---- estimate the current variation per second 70 | prev_x = self.__x.lastValue() 71 | dx = 0.0 if prev_x is None else (x - prev_x) * self.__freq # FIXME: 0.0 or value? 
# noqa 72 | edx = self.__dx(dx, timestamp, alpha=self.__alpha(self.__dcutoff)) 73 | # ---- use it to update the cutoff frequency 74 | cutoff = self.__mincutoff + self.__beta * math.fabs(edx) 75 | # ---- filter the given value 76 | return self.__x(x, timestamp, alpha=self.__alpha(cutoff)) 77 | 78 | # ---------------------------------------------------------------------------- 79 | 80 | 81 | def apply_one_euro(pos_array): 82 | """Apply one euro filter to a gesture 83 | 84 | Original implementation can be downloaded from 85 | http://cristal.univ-lille.fr/~casiez/1euro/ 86 | 87 | Args: 88 | pos_array: body keypoint positions to filter 89 | Returns: 90 | np.ndarray: filtered positions 91 | """ 92 | 93 | pos_along_timestep = pos_array.transpose() 94 | 95 | config = { 96 | 'freq': 20, # Hz 97 | 'mincutoff': 0.1, # Minimum cutoff frequency 98 | 'beta': 0.08, # Cutoff slope 99 | 'dcutoff': 1.0 # Cutoff frequency for derivate 100 | } 101 | 102 | oef = OneEuroFilter(**config) 103 | 104 | filtered_pos = [] 105 | for i, joint in enumerate(pos_along_timestep): 106 | joint_pos = [] 107 | for timestep, pos in enumerate(joint): 108 | if timestep > 0: 109 | timestep = timestep * 1.0 / config["freq"] 110 | filt_num = oef(pos, timestep) 111 | joint_pos.append(filt_num) 112 | filtered_pos.append(joint_pos) 113 | 114 | filtered_pos_array = np.array(filtered_pos) 115 | 116 | return filtered_pos_array.transpose() 117 | -------------------------------------------------------------------------------- /helpers/remove_velocity.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Remove velocity from the network output 4 | (it produces both coordinates and velocities while we need only velocities) 5 | 6 | @author: kaneko.naoshi 7 | """ 8 | 9 | import argparse 10 | import glob 11 | import os 12 | 13 | import numpy as np 14 | 15 | 16 | def save_positions(out_dir, gesture_name, positions): 17 | """Save body keypoint positions into file 18 | 19 | Args: 20 | out_dir: output directory 21 | gesture_name: basename of the output file 22 | positions: keypoint positions to save 23 | """ 24 | 25 | filename = os.path.join(out_dir, gesture_name + '.txt') 26 | np.savetxt(filename, positions, fmt='%s') 27 | 28 | 29 | def remove_velocity(data, dim=3): 30 | """Remove velocity values from raw prediction data 31 | 32 | Args: 33 | data: array containing both position and velocity values 34 | dim: gesture dimensionality 35 | 36 | Returns: 37 | np.ndarray: array containing only position values 38 | """ 39 | 40 | starts = np.arange(0, data.shape[1], dim * 2) 41 | stops = np.arange(dim, data.shape[1], dim * 2) 42 | return np.hstack([data[:, i:j] for i, j in zip(starts, stops)]) 43 | 44 | 45 | def main(): 46 | parser = argparse.ArgumentParser( 47 | description='Remove velocity values from the raw generated gestures') 48 | parser.add_argument('--gesture', '-g', required=True, 49 | help='Path to the raw gesture directory') 50 | args = parser.parse_args() 51 | 52 | print('Remove velocities from the ' 53 | 'gestures in "{}"'.format(args.gesture)) 54 | print('') 55 | 56 | # List of gesture files 57 | txt_paths = sorted(glob.glob(os.path.join(args.gesture, '*.txt'))) 58 | 59 | # Check file existence 60 | if not txt_paths: 61 | raise ValueError('Could not find the gesture files in "{}". ' 62 | 'Please specify correct folder as --gesture flag.' 
63 | .format(args.gesture)) 64 | 65 | # Make output directory 66 | out_dir = os.path.join(args.gesture, 'no_vel') 67 | if not os.path.isdir(out_dir): 68 | os.makedirs(out_dir) 69 | 70 | for txt_path in txt_paths: 71 | print('Process "{}"'.format(txt_path)) 72 | 73 | pos_vel = np.loadtxt(txt_path) 74 | 75 | # Remove velocity values 76 | only_pos = remove_velocity(pos_vel) 77 | 78 | gesture_name, _ = os.path.splitext(os.path.basename(txt_path)) 79 | save_positions(out_dir, gesture_name, only_pos) 80 | 81 | print('') 82 | print('Results were written in "{}"'.format(out_dir)) 83 | print('') 84 | 85 | 86 | if __name__ == '__main__': 87 | main() 88 | -------------------------------------------------------------------------------- /hierarchy.txt: -------------------------------------------------------------------------------- 1 | HIERARCHY 2 | ROOT Hips 3 | { 4 | OFFSET 0.000000 60.000000 0.000000 5 | CHANNELS 3 Zrotation Xrotation Yrotation 6 | JOINT Spine 7 | { 8 | OFFSET 0.000000 4.744019 0.000000 9 | CHANNELS 3 Zrotation Xrotation Yrotation 10 | JOINT Spine1 11 | { 12 | OFFSET 0.000000 11.747704 0.000000 13 | CHANNELS 3 Zrotation Xrotation Yrotation 14 | JOINT Neck 15 | { 16 | OFFSET 0.000000 11.699501 -1.063590 17 | CHANNELS 3 Zrotation Xrotation Yrotation 18 | JOINT Head 19 | { 20 | OFFSET 0.000000 8.099556 1.157080 21 | CHANNELS 3 Zrotation Xrotation Yrotation 22 | End Site 23 | { 24 | OFFSET 0.000000 7.463501 0.000000 25 | } 26 | } 27 | } 28 | JOINT LeftShoulder 29 | { 30 | OFFSET -2.323960 10.457596 0.333555 31 | CHANNELS 3 Zrotation Xrotation Yrotation 32 | JOINT LeftArm 33 | { 34 | OFFSET -7.102620 0.000000 0.000000 35 | CHANNELS 3 Zrotation Xrotation Yrotation 36 | JOINT LeftForeArm 37 | { 38 | OFFSET -15.301900 0.000000 0.000000 39 | CHANNELS 3 Zrotation Xrotation Yrotation 40 | JOINT LeftHand 41 | { 42 | OFFSET -17.165703 0.000000 0.000000 43 | CHANNELS 3 Zrotation Xrotation Yrotation 44 | JOINT LeftHandThumb1 45 | { 46 | OFFSET -1.446360 -0.867805 -2.892700 47 | CHANNELS 3 Zrotation Xrotation Yrotation 48 | JOINT LeftHandThumb2 49 | { 50 | OFFSET -1.735620 0.000000 0.000000 51 | CHANNELS 3 Zrotation Xrotation Yrotation 52 | JOINT LeftHandThumb3 53 | { 54 | OFFSET -1.735620 0.000000 0.000000 55 | CHANNELS 3 Zrotation Xrotation Yrotation 56 | End Site 57 | { 58 | OFFSET -1.673540 0.000000 0.000000 59 | } 60 | } 61 | } 62 | } 63 | JOINT LeftHandIndex1 64 | { 65 | OFFSET -7.345020 0.000000 -2.024890 66 | CHANNELS 3 Zrotation Xrotation Yrotation 67 | JOINT LeftHandIndex2 68 | { 69 | OFFSET -2.892700 0.000000 0.000000 70 | CHANNELS 3 Zrotation Xrotation Yrotation 71 | JOINT LeftHandIndex3 72 | { 73 | OFFSET -1.446362 0.000000 0.000000 74 | CHANNELS 3 Zrotation Xrotation Yrotation 75 | End Site 76 | { 77 | OFFSET -1.394619 0.000000 0.000000 78 | } 79 | } 80 | } 81 | } 82 | JOINT LeftHandMiddle1 83 | { 84 | OFFSET -7.345009 0.000000 -0.671109 85 | CHANNELS 3 Zrotation Xrotation Yrotation 86 | JOINT LeftHandMiddle2 87 | { 88 | OFFSET -3.181961 0.000000 0.000000 89 | CHANNELS 3 Zrotation Xrotation Yrotation 90 | JOINT LeftHandMiddle3 91 | { 92 | OFFSET -1.735611 0.000000 0.000000 93 | CHANNELS 3 Zrotation Xrotation Yrotation 94 | End Site 95 | { 96 | OFFSET -1.673538 0.000000 0.000000 97 | } 98 | } 99 | } 100 | } 101 | JOINT LeftHandRing1 102 | { 103 | OFFSET -5.666491 0.000000 0.671104 104 | CHANNELS 3 Zrotation Xrotation Yrotation 105 | JOINT LeftHandRing2 106 | { 107 | OFFSET -2.892691 0.000000 0.000000 108 | CHANNELS 3 Zrotation Xrotation Yrotation 109 | JOINT LeftHandRing3 110 | { 111 | 
OFFSET -1.446339 0.000000 0.000000 112 | CHANNELS 3 Zrotation Xrotation Yrotation 113 | End Site 114 | { 115 | OFFSET -1.394619 0.000000 0.000000 116 | } 117 | } 118 | } 119 | } 120 | JOINT LeftHandPinky1 121 | { 122 | OFFSET -3.987949 0.000000 2.024890 123 | CHANNELS 3 Zrotation Xrotation Yrotation 124 | JOINT LeftHandPinky2 125 | { 126 | OFFSET -2.314140 0.000000 0.000000 127 | CHANNELS 3 Zrotation Xrotation Yrotation 128 | JOINT LeftHandPinky3 129 | { 130 | OFFSET -1.157070 0.000000 0.000000 131 | CHANNELS 3 Zrotation Xrotation Yrotation 132 | End Site 133 | { 134 | OFFSET -1.115688 0.000000 0.000000 135 | } 136 | } 137 | } 138 | } 139 | } 140 | } 141 | } 142 | } 143 | JOINT RightShoulder 144 | { 145 | OFFSET 2.286459 10.457596 0.333558 146 | CHANNELS 3 Zrotation Xrotation Yrotation 147 | JOINT RightArm 148 | { 149 | OFFSET 7.102619 0.000000 0.000000 150 | CHANNELS 3 Zrotation Xrotation Yrotation 151 | JOINT RightForeArm 152 | { 153 | OFFSET 15.301899 0.000000 0.000000 154 | CHANNELS 3 Zrotation Xrotation Yrotation 155 | JOINT RightHand 156 | { 157 | OFFSET 17.165699 0.000000 0.000000 158 | CHANNELS 3 Zrotation Xrotation Yrotation 159 | JOINT RightHandThumb1 160 | { 161 | OFFSET 1.446362 -0.867805 -2.892700 162 | CHANNELS 3 Zrotation Xrotation Yrotation 163 | JOINT RightHandThumb2 164 | { 165 | OFFSET 1.735611 0.000000 0.000000 166 | CHANNELS 3 Zrotation Xrotation Yrotation 167 | JOINT RightHandThumb3 168 | { 169 | OFFSET 1.735619 0.000000 0.000000 170 | CHANNELS 3 Zrotation Xrotation Yrotation 171 | End Site 172 | { 173 | OFFSET 1.673542 0.000000 0.000000 174 | } 175 | } 176 | } 177 | } 178 | JOINT RightHandIndex1 179 | { 180 | OFFSET 7.345032 0.000000 -2.024890 181 | CHANNELS 3 Zrotation Xrotation Yrotation 182 | JOINT RightHandIndex2 183 | { 184 | OFFSET 2.892723 0.000000 0.000000 185 | CHANNELS 3 Zrotation Xrotation Yrotation 186 | JOINT RightHandIndex3 187 | { 188 | OFFSET 1.446350 0.000000 0.000000 189 | CHANNELS 3 Zrotation Xrotation Yrotation 190 | End Site 191 | { 192 | OFFSET 1.394623 0.000000 0.000000 193 | } 194 | } 195 | } 196 | } 197 | JOINT RightHandMiddle1 198 | { 199 | OFFSET 7.345032 0.000000 -0.671109 200 | CHANNELS 3 Zrotation Xrotation Yrotation 201 | JOINT RightHandMiddle2 202 | { 203 | OFFSET 3.181969 0.000000 0.000000 204 | CHANNELS 3 Zrotation Xrotation Yrotation 205 | JOINT RightHandMiddle3 206 | { 207 | OFFSET 1.735611 0.000000 0.000000 208 | CHANNELS 3 Zrotation Xrotation Yrotation 209 | End Site 210 | { 211 | OFFSET 1.673538 0.000000 0.000000 212 | } 213 | } 214 | } 215 | } 216 | JOINT RightHandRing1 217 | { 218 | OFFSET 5.666489 0.000000 0.671106 219 | CHANNELS 3 Zrotation Xrotation Yrotation 220 | JOINT RightHandRing2 221 | { 222 | OFFSET 2.892708 0.000000 0.000000 223 | CHANNELS 3 Zrotation Xrotation Yrotation 224 | JOINT RightHandRing3 225 | { 226 | OFFSET 1.446358 0.000000 0.000000 227 | CHANNELS 3 Zrotation Xrotation Yrotation 228 | End Site 229 | { 230 | OFFSET 1.394623 0.000000 0.000000 231 | } 232 | } 233 | } 234 | } 235 | JOINT RightHandPinky1 236 | { 237 | OFFSET 3.987961 0.000000 2.024890 238 | CHANNELS 3 Zrotation Xrotation Yrotation 239 | JOINT RightHandPinky2 240 | { 241 | OFFSET 2.314171 0.000000 0.000000 242 | CHANNELS 3 Zrotation Xrotation Yrotation 243 | JOINT RightHandPinky3 244 | { 245 | OFFSET 1.157082 0.000000 0.000000 246 | CHANNELS 3 Zrotation Xrotation Yrotation 247 | End Site 248 | { 249 | OFFSET 1.115692 0.000000 0.000000 250 | } 251 | } 252 | } 253 | } 254 | } 255 | } 256 | } 257 | } 258 | } 259 | } 260 | JOINT LeftUpLeg 261 | { 
262 | OFFSET -5.785400 0.000000 0.000000 263 | CHANNELS 3 Zrotation Xrotation Yrotation 264 | JOINT LeftLeg 265 | { 266 | OFFSET 0.000000 -30.002701 0.000000 267 | CHANNELS 3 Zrotation Xrotation Yrotation 268 | JOINT LeftFoot 269 | { 270 | OFFSET 0.000000 -22.702000 0.000000 271 | CHANNELS 3 Zrotation Xrotation Yrotation 272 | JOINT LeftToeBase 273 | { 274 | OFFSET 0.000000 -3.760510 -8.678090 275 | CHANNELS 3 Zrotation Xrotation Yrotation 276 | End Site 277 | { 278 | OFFSET 0.000000 0.000000 -2.314159 279 | } 280 | } 281 | } 282 | } 283 | } 284 | JOINT RightUpLeg 285 | { 286 | OFFSET 5.785400 0.000000 0.000000 287 | CHANNELS 3 Zrotation Xrotation Yrotation 288 | JOINT RightLeg 289 | { 290 | OFFSET 0.000000 -30.002701 0.000000 291 | CHANNELS 3 Zrotation Xrotation Yrotation 292 | JOINT RightFoot 293 | { 294 | OFFSET 0.000000 -22.702101 0.000000 295 | CHANNELS 3 Zrotation Xrotation Yrotation 296 | JOINT RightToeBase 297 | { 298 | OFFSET 0.000000 -3.760510 -8.678100 299 | CHANNELS 3 Zrotation Xrotation Yrotation 300 | End Site 301 | { 302 | OFFSET 0.000000 0.000000 -2.314159 303 | } 304 | } 305 | } 306 | } 307 | } 308 | } 309 | 310 | -------------------------------------------------------------------------------- /motion_repr_learning/README.md: -------------------------------------------------------------------------------- 1 | # Motion Representation Learning 2 | 3 | This is a folder for learning a compact and informative representation of the human motion sequence. 4 | 5 | ## The main idea 6 | The aim is to learn a better representation of the motion frames using an auto-encoding neural networks, such as Denoising Autoencoder or Variational Autoencoder. 7 | 8 | Encoding (MotionE) is a mapping from a sequence of the 3D positions of the human to a lower-dimensional representation, which will contain enough information to reconstruct original motion sequence, but will have less redundancy and hence will be better for the speech-to-motion mapping. 9 | Decoding (MotionD) is a mapping from the encoded vector back to the 3D motion sequence. 10 | 11 | Once a motion encoder MotionE and a motion decoder MotionD are learned, we train a novel encoder network SpeechE to map from speech to a corresponding low-dimensional motion representation (code for this mapping is given in the main folder of this repository). 12 | 13 | At test time, the speech encoder and the motion decoder networks are combined: SpeechE predicts motion representations based on a given speech signal and MotionD then decodes these representations to produce motion sequences. 14 | 15 | ## Data preparation 16 | 17 | 1. Follow the instruction on data preparation at the root folder of this repository. 18 | 2. Indicate the directory for the data at utils/flags.py as "data_dir" value. 19 | 3. Indicate the directory to the checkpoints (will be used to store the model) at utils/flags as "chkpt_dir" value. 20 | 21 | ## Run 22 | To run the default example execute the following command. 23 | 24 | ```bash 25 | # Learn dataset encoding 26 | python learn_dataset_encoding.py DATA_DIR motion -chkpt_dir=CHKPT_DIR -layer1_width=DIM 27 | 28 | #Encode dataset 29 | python encode_dataset.py DATA_DIR motion -chkpt_dir=CHKPT_DIR -restore=True -pretrain=False -layer1_width=DIM 30 | ``` 31 | 32 | Where DATA_DIR is a directory where the data is stored, CHKPT_DIR is a directory to store the model checkpoints and DIM is dimensionality of the representation. 
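Once the autoencoder is trained, an encoded motion file can be mapped back to 3D joint positions with `decode.py`. The call below is a sketch that mirrors the invocation used in `example_scripts/proposed_test.sh`; the file names are placeholders, and the flags must match the values used during training:

```bash
# Decode an encoded prediction back into a 3D motion sequence
python decode.py DATA_DIR ENCODED_FILE DECODED_FILE -restore=True -pretrain=False -layer1_width=DIM -chkpt_dir=CHKPT_DIR -batch_size=8
```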
33 | 34 | 35 | ## Customizing 36 | You can play around with the run options, including the neural net size and shape, dropout, learning rates, etc. in the file flags.py. 37 | -------------------------------------------------------------------------------- /motion_repr_learning/ae/DAE.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file contains an implementation of a particular type of AE, 3 | namely Denoising Autoendoder. 4 | 5 | To be used in the files learn_dataset_encoding and train.py 6 | 7 | Developed by Taras Kucherenko (tarask@kth.se) 8 | """ 9 | 10 | from __future__ import division 11 | from __future__ import print_function 12 | 13 | import tensorflow as tf 14 | import numpy as np 15 | 16 | from utils.utils import add_noise, loss_reconstruction 17 | from utils.flags import FLAGS 18 | 19 | 20 | class DAE: 21 | """ Denoising Autoendoder (DAE) 22 | 23 | More details about the network in the original paper: 24 | http://www.jmlr.org/papers/v11/vincent10a.html 25 | 26 | The user specifies the structure of this network 27 | by specifying number of inputs, the number of hidden 28 | units for each layer and the number of final outputs. 29 | All this information is set in the utils/flags.py file. 30 | 31 | The number of input neurons is defined as a frame_size*chunk_length, 32 | since it will take a time-window as an input 33 | 34 | """ 35 | 36 | def __init__(self, shape, sess, variance_coef, data_info): 37 | """DAE initializer 38 | 39 | Args: 40 | shape: list of ints specifying 41 | num input, hidden1 units,...hidden_n units, num outputs 42 | sess: tensorflow session object to use 43 | varience_coef: multiplicative factor for the variance of noise wrt the variance of data 44 | data_info: key information about the dataset 45 | """ 46 | 47 | self.__shape = shape # [input_dim,hidden1_dim,...,hidden_n_dim,output_dim] 48 | self.__variables = {} 49 | self.__sess = sess 50 | 51 | self.num_hidden_layers = np.size(shape) - 2 52 | 53 | self.batch_size = FLAGS.batch_size 54 | self.sequence_length = FLAGS.chunk_length 55 | 56 | self.scaling_factor = 1 57 | 58 | # maximal value and mean pose in the dataset (used for scaling it to interval [-1,1] and back) 59 | self.max_val = data_info.max_val 60 | self.mean_pose = data_info.mean_pose 61 | 62 | 63 | #################### Add the DATASETS to the GRAPH ############### 64 | 65 | #### 1 - TRAIN ### 66 | self._train_data_initializer = tf.placeholder(dtype=tf.float32, 67 | shape=data_info.train_shape) 68 | self._train_data = tf.Variable(self._train_data_initializer, 69 | trainable=False, collections=[], name='Train_data') 70 | train_epochs = FLAGS.training_epochs + FLAGS.pretraining_epochs * FLAGS.num_hidden_layers 71 | train_frames = tf.train.slice_input_producer([self._train_data], num_epochs=train_epochs) 72 | self._train_batch = tf.train.shuffle_batch(train_frames, 73 | batch_size=FLAGS.batch_size, capacity=5000, 74 | min_after_dequeue=1000, name='Train_batch') 75 | 76 | #### 2 - VALIDATE, can be used as TEST ### 77 | # When optimizing - this dataset stores as a validation dataset, 78 | # when testing - this dataset stores a test dataset 79 | self._valid_data_initializer = tf.placeholder(dtype=tf.float32, 80 | shape=data_info.eval_shape) 81 | self._valid_data = tf.Variable(self._valid_data_initializer, 82 | trainable=False, collections=[], name='Valid_data') 83 | valid_frames = tf.train.slice_input_producer([self._valid_data], 84 | num_epochs=FLAGS.training_epochs) 85 | self._valid_batch = 
tf.train.shuffle_batch(valid_frames, 86 | batch_size=FLAGS.batch_size, capacity=5000, 87 | min_after_dequeue=1000, name='Valid_batch') 88 | 89 | if FLAGS.weight_decay is not None: 90 | print('\nWe apply weight decay') 91 | 92 | ### Specify tensorflow setup ### 93 | with sess.graph.as_default(): 94 | 95 | ############## SETUP VARIABLES ###################### 96 | 97 | with tf.variable_scope("AE_Variables"): 98 | 99 | for i in range(self.num_hidden_layers + 1): # go over layers 100 | 101 | # create variables for matrices and biases for each layer 102 | self._create_variables(i, FLAGS.weight_decay) 103 | 104 | ############## DEFINE THE NETWORK ################## 105 | 106 | ''' 1 - Setup network for TRAINing ''' 107 | # Input noisy data and reconstruct the original one 108 | # as in Denoising AutoEncoder 109 | self._input_ = add_noise(self._train_batch, variance_coef, data_info.data_sigma) 110 | self._target_ = self._train_batch 111 | 112 | # Define output and loss for the training data 113 | self._output, _, _ = self.construct_graph(self._input_, FLAGS.dropout) 114 | self._reconstruction_loss = loss_reconstruction(self._output, 115 | self._target_, self.max_val) 116 | tf.add_to_collection('losses', self._reconstruction_loss) # add weight decay loses 117 | self._loss = tf.add_n(tf.get_collection('losses'), name='total_loss') 118 | 119 | ''' 2 - Setup network for TESTing ''' 120 | self._valid_input_ = self._valid_batch 121 | self._valid_target_ = self._valid_batch 122 | 123 | # Define output (no dropout) 124 | self._valid_output, self._encode, self._decode = \ 125 | self.construct_graph(self._valid_input_, 1) 126 | 127 | # Define loss 128 | self._valid_loss = loss_reconstruction(self._valid_output, 129 | self._valid_target_, self.max_val) 130 | @property 131 | def session(self): 132 | """ Interface for the session""" 133 | return self.__sess 134 | 135 | @property 136 | def shape(self): 137 | """ Interface for the shape""" 138 | return self.__shape 139 | 140 | # Make more comfortable interface to the network weights 141 | 142 | def _w(self, n, suffix=""): 143 | return self["matrix"+str(n)+suffix] 144 | 145 | def _b(self, n, suffix=""): 146 | return self["bias"+str(n)+suffix] 147 | 148 | @staticmethod 149 | def _feedforward(x, w, b): 150 | """ 151 | Traditional feedforward layer: multiply on weight matrix, add bias vector 152 | and apply activation function 153 | 154 | Args: 155 | x: input ( usually - batch of vectors) 156 | w: matrix to be multiplied on 157 | b: bias to be added 158 | 159 | Returns: 160 | y: result of applying this feedforward layer 161 | """ 162 | 163 | y = tf.tanh(tf.nn.bias_add(tf.matmul(x, w), b)) 164 | return y 165 | 166 | def construct_graph(self, input_seq_pl, dropout): 167 | 168 | """ Construct a TensorFlow graph for the AutoEncoding network 169 | 170 | Args: 171 | input_seq_pl: tf placeholder for input data: size [batch_size, sequence_length * DoF] 172 | dropout: how much of the input neurons will be activated, value in range [0,1] 173 | Returns: 174 | output: output tensor: result of running input placeholder through the network 175 | middle_layer: tensor which is encoding input placeholder into a representation 176 | decoding: tensor which is decoding a representation back into the input vector 177 | """ 178 | 179 | network_input = input_seq_pl 180 | 181 | curr_layer = tf.reshape(network_input, [self.batch_size, 182 | FLAGS.chunk_length * FLAGS.frame_size]) 183 | 184 | numb_layers = self.num_hidden_layers + 1 185 | 186 | with tf.name_scope("Joint_run"): 187 | 188 | # 
Pass through the network 189 | for i in range(numb_layers): 190 | 191 | if i == FLAGS.middle_layer: 192 | # Save middle layer 193 | with tf.name_scope('middle_layer'): 194 | middle_layer = tf.identity(curr_layer) 195 | 196 | with tf.name_scope('hidden'+str(i)): 197 | 198 | # First - Apply Dropout 199 | curr_layer = tf.nn.dropout(curr_layer, dropout) 200 | 201 | w = self._w(i + 1) 202 | b = self._b(i + 1) 203 | 204 | curr_layer = self._feedforward(curr_layer, w, b) 205 | 206 | output = curr_layer 207 | 208 | # Now create a decoding network 209 | 210 | with tf.name_scope("Decoding"): 211 | 212 | layer = self._representation = tf.placeholder\ 213 | (dtype=tf.float32, shape=middle_layer.get_shape().as_list(), name="Respres.") 214 | 215 | for i in range(FLAGS.middle_layer, numb_layers): 216 | 217 | with tf.name_scope('hidden' + str(i)): 218 | 219 | # First - Apply Dropout 220 | layer = tf.nn.dropout(layer, dropout) 221 | 222 | w = self._w(i + 1) 223 | b = self._b(i + 1) 224 | 225 | layer = self._feedforward(layer, w, b) 226 | 227 | decoding = layer 228 | 229 | return output, middle_layer, decoding 230 | 231 | def __getitem__(self, item): 232 | """Get AutoEncoder tf variable 233 | 234 | Returns the specified variable created by this object. 235 | Names are weights#, biases#, biases#_out, weights#_fixed, 236 | biases#_fixed. 237 | 238 | Args: 239 | item: string, variables internal name 240 | Returns: 241 | Tensorflow variable 242 | """ 243 | return self.__variables[item] 244 | 245 | def __setitem__(self, key, value): 246 | """Store a TensorFlow variable 247 | 248 | NOTE: Don't call this explicitly. It should 249 | be used only internally when setting up 250 | variables. 251 | 252 | Args: 253 | key: string, name of variable 254 | value: tensorflow variable 255 | """ 256 | self.__variables[key] = value 257 | 258 | def _create_variables(self, i, wd): 259 | """Helper to create an initialized Variable with weight decay. 260 | Note that the Variable is initialized with a truncated normal distribution. 261 | A weight decay is added only if 'wd' is specified. 262 | If 'wd' is None, weight decay is not added for this Variable. 263 | 264 | This function was taken from the web 265 | 266 | Args: 267 | i: number of hidden layer 268 | wd: add L2Loss weight decay multiplied by this float. 
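Note: in this implementation the weights are actually drawn from a uniform distribution in [-a, a] with a = 2 * sqrt(6 / (fan_in + fan_out)) (a scaled Glorot-style range); see the initialization code below.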
269 | Returns:
270 | Nothing
271 | """
272 |
273 | # Initialize Train weights
274 | w_shape = (self.__shape[i], self.__shape[i + 1])
275 | a = tf.multiply(2.0, tf.sqrt(6.0 / (w_shape[0] + w_shape[1])))
276 | name_w = "matrix"+str(i + 1)
277 | self[name_w] = tf.get_variable("Variables/"+name_w,
278 | initializer=tf.random_uniform(w_shape, -1 * a, a))
279 |
280 | # Add weight to the loss function for weight decay
281 | if wd is not None:
282 | weight_decay = tf.multiply(tf.nn.l2_loss(self[name_w]), wd, name='wgt_'+str(i)+'_loss')
283 | tf.add_to_collection('losses', weight_decay)
284 |
285 | # Add the histogram summary
286 | tf.summary.histogram(name_w, self[name_w])
287 |
288 | # Initialize Train biases
289 | name_b = "bias"+str(i + 1)
290 | b_shape = (self.__shape[i + 1],)
291 | self[name_b] = tf.get_variable("Variables/"+name_b, initializer=tf.zeros(b_shape))
292 |
293 | if i < self.num_hidden_layers:
294 | # Hidden layer pretrained weights
295 | # which are used after pretraining before fine-tuning
296 | self[name_w + "_pretr"] = tf.get_variable(name="Var/" + name_w + "_pretr", initializer=
297 | tf.random_uniform(w_shape, -1 * a, a),
298 | trainable=False)
299 | # Hidden layer pretrained biases
300 | self[name_b + "_pretr"] = tf.get_variable("Var/"+name_b+"_pretr", trainable=False,
301 | initializer=tf.zeros(b_shape))
302 |
303 | # Pretraining output training biases
304 | name_b_out = "bias" + str(i+1) + "_out"
305 | b_shape = (self.__shape[i],)
306 | b_init = tf.zeros(b_shape)
307 | self[name_b_out] = tf.get_variable(name="Var/"+name_b_out, initializer=b_init,
308 | trainable=True)
309 |
310 | def run_less_layers(self, input_pl, n, is_target=False):
311 | """Return the result of the net after n layers or n-1 layers (if is_target is true)
312 | This function will be used for the layer-wise pretraining of the AE
313 | Args:
314 | input_pl: TensorFlow placeholder of AE inputs
315 | n: int specifying pretrain step
316 | is_target: bool specifying if the required tensor
317 | should be the target tensor,
318 | meaning if we should run n layers or n-1 (if is_target)
319 | Returns:
320 | Tensor giving pretraining net result or pretraining target
321 | """
322 | assert n > 0
323 | assert n <= self.num_hidden_layers
324 |
325 | last_output = input_pl
326 |
327 | for i in range(n - 1):
328 | w = self._w(i + 1, "_pretr")
329 | b = self._b(i + 1, "_pretr")
330 |
331 | last_output = self._feedforward(last_output, w, b)
332 |
333 | if is_target:
334 | return last_output
335 |
336 | last_output = self._feedforward(last_output, self._w(n), self._b(n))
337 |
338 | out = self._feedforward(last_output, self._w(n), self["bias" + str(n) + "_out"])
339 |
340 | return out
341 | -------------------------------------------------------------------------------- /motion_repr_learning/ae/decode.py: --------------------------------------------------------------------------------
1 | """
2 | This file contains a usage script that decodes encoded motion representations back into 3D motion.
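Usage example: python decode.py DATA_DIR ENCODED_FILE OUTPUT_FILE
(DATA_DIR is the dataset folder, ENCODED_FILE an encoded prediction, and OUTPUT_FILE where the decoded 3D motion is written; the -chkpt_dir and -layer1_width flags must match training, as in example_scripts/proposed_test.sh.)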
3 | Developed by Taras Kucherenko (tarask@kth.se) 4 | """ 5 | 6 | import train as tr 7 | import utils.data as dt 8 | import utils.flags as fl 9 | from learn_dataset_encoding import create_nn, prepare_motion_data 10 | 11 | import numpy as np 12 | 13 | import sys 14 | 15 | DATA_DIR = sys.argv[1] 16 | TEST_FILE = sys.argv[2] 17 | OUTPUT_FILE = sys.argv[3] 18 | 19 | if __name__ == '__main__': 20 | 21 | # Get the data 22 | Y_train_normalized, Y_train, Y_test_normalized, Y_test, Y_dev_normalized, max_val, mean_pose = prepare_motion_data(DATA_DIR) 23 | 24 | # Train the network 25 | nn = create_nn(Y_train_normalized, Y_dev_normalized, max_val, mean_pose, restoring=True) 26 | 27 | # Read the encoding 28 | encoding = np.loadtxt(TEST_FILE) 29 | 30 | print(encoding.shape) 31 | 32 | # Decode it 33 | decoding = tr.decode(nn, encoding) 34 | 35 | print(decoding.shape) 36 | 37 | np.savetxt(OUTPUT_FILE, decoding, delimiter = ' ') 38 | 39 | # Close Tf session 40 | nn.session.close() 41 | -------------------------------------------------------------------------------- /motion_repr_learning/ae/encode_dataset.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file contains a script for encoding motion dataset. 3 | 4 | Usage example: python encode_dataset.py data_dir 5 | 6 | Developed by Taras Kucherenko (tarask@kth.se) 7 | """ 8 | 9 | import sys 10 | import numpy as np 11 | 12 | import train as tr 13 | import utils.flags as fl 14 | from learn_dataset_encoding import create_nn, prepare_motion_data, check_params, os 15 | 16 | if __name__ == '__main__': 17 | 18 | # Check the parameters 19 | check_params() 20 | 21 | DATA_DIR = sys.argv[1] 22 | 23 | # Additional check 24 | if not os.path.exists(DATA_DIR+"/"+str(fl.FLAGS.layer1_width)): 25 | raise ValueError( 26 | 'Path to the dataset encoding ({}) does not exist!\nPlease, create a folder {} in the DATA_DIR directory' 27 | ''.format(DATA_DIR+"/"+str(fl.FLAGS.layer1_width), str(fl.FLAGS.layer1_width))) 28 | 29 | # Get the data 30 | train_normalized_data, train_data, test_normalized_data, test_data, dev_normalized_data, \ 31 | max_val, mean_pose = prepare_motion_data(DATA_DIR) 32 | 33 | # Restore the network 34 | nn = create_nn(train_normalized_data, dev_normalized_data, max_val, mean_pose, restoring=True) 35 | 36 | debug = 0 37 | 38 | # For debug - shorten the dataset 39 | if debug: 40 | train_normalized_data = train_normalized_data[:12000] 41 | 42 | """ Encode the train data """ 43 | 44 | # Encode it 45 | encoded_train_data = tr.encode(nn, train_normalized_data) 46 | 47 | # And save into file 48 | np.save(DATA_DIR+"/"+str(fl.FLAGS.layer1_width)+"/Y_train_encoded.npy", encoded_train_data) 49 | 50 | if debug: 51 | print(train_normalized_data.shape) 52 | print(encoded_train_data.shape) 53 | 54 | # Decode train 55 | decoded = tr.decode(nn, encoded_train_data) 56 | print(decoded.shape) 57 | 58 | # Reshape back to the frames 59 | decoded = np.reshape(decoded, (-1, fl.FLAGS.frame_size)) 60 | 61 | # And calculate an error 62 | 63 | size = min(train_normalized_data.shape[0], decoded.shape[0]) 64 | error = decoded[:size] - train_data[:size] 65 | rmse = np.sqrt(np.mean(error**2)) 66 | 67 | print("AE Train Error is ", rmse) 68 | 69 | """ Encode the test data """ 70 | 71 | # Encode it 72 | encoded_test_data = tr.encode(nn, test_normalized_data) 73 | 74 | # And save into files 75 | np.save(DATA_DIR+"/"+str(fl.FLAGS.layer1_width)+"/Y_test_encoded.npy", encoded_test_data) 76 | 77 | if debug: 78 | # Decode test 79 | decoded = 
tr.decode(nn, encoded_test_data) 80 | 81 | # Reshape back to the frames 82 | decoded = np.reshape(decoded, (-1, fl.FLAGS.frame_size)) 83 | 84 | size = min(test_normalized_data.shape[0], decoded.shape[0]) 85 | error = decoded[:size] - test_data[:size] 86 | rmse = np.sqrt(np.mean(error**2)) 87 | 88 | print("AE Test Error is ", rmse) 89 | 90 | """ Encode the dev data """ 91 | 92 | # Encode it 93 | encoded_dev_data = tr.encode(nn, dev_normalized_data) 94 | 95 | # And save into files 96 | np.save(DATA_DIR+"/"+str(fl.FLAGS.layer1_width)+"/Y_dev_encoded.npy", encoded_dev_data) 97 | -------------------------------------------------------------------------------- /motion_repr_learning/ae/learn_dataset_encoding.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file contains a script for learning encoding-decoding network 3 | on our dataset. 4 | 5 | Usage example: python learn_dataset_encoding.py data_dir 6 | 7 | Developed by Taras Kucherenko (tarask@kth.se) 8 | """ 9 | 10 | import sys 11 | import numpy as np 12 | import os 13 | 14 | import train as tr 15 | from utils.utils import prepare_motion_data, DataSet, DataSets, fl 16 | 17 | def create_nn(train_data, dev_data, max_val, mean_pose, restoring): 18 | """ 19 | Train or restore a neural network 20 | Args: 21 | train_data: training dataset normalized to the values [-1,1] 22 | dev_data: dev dataset normalized to the values [-1,1] 23 | max_val: maximal values in the dataset 24 | mean_pose: mean pose of the dataset 25 | restoring: weather we are going to just restore already trained model 26 | Returns: 27 | nn: neural network, which is ready to use 28 | """ 29 | 30 | # Create DataSet object 31 | 32 | data = DataSets() 33 | 34 | data.train = DataSet(train_data, fl.FLAGS.batch_size) 35 | data.test = DataSet(dev_data, fl.FLAGS.batch_size) 36 | 37 | # Assign variance 38 | data.train.sigma = np.std(train_data, axis=(0, 1)) 39 | 40 | # Create information about the dataset 41 | data_info = tr.DataInfo(data.train.sigma, data.train._sequences.shape, 42 | data.test._sequences.shape, max_val, mean_pose) 43 | 44 | # Set "restore" flag 45 | fl.FLAGS.restore = restoring 46 | 47 | # Train the network 48 | nn = tr.learning(data, data_info, just_restore=restoring) 49 | 50 | return nn 51 | 52 | def check_params(): 53 | 54 | # Check if script get enough parameters 55 | if len(sys.argv)<2: 56 | raise ValueError('Not enough paramters! 
\nUsage : python '+sys.argv[0].split("/")[-1]+' DATA_DIR') 57 | 58 | # Check if the dataset exists 59 | if not os.path.exists(sys.argv[1]): 60 | raise ValueError('Path to the dataset ({}) does not exist!\nPlease, provide correct DATA_DIR as a script parameter' 61 | ''.format(sys.argv[1])) 62 | 63 | # Check if the flags were set properly 64 | 65 | if not os.path.exists(fl.FLAGS.chkpt_dir): 66 | raise ValueError('Path to the checkpoints ({}) does not exit!\nChange the "chkpt_dir" flag in utils/flags.py' 67 | ''.format(fl.FLAGS.chkpt_dir)) 68 | 69 | if __name__ == '__main__': 70 | 71 | # Check parameters 72 | check_params() 73 | 74 | # Get the data 75 | DATA_DIR = sys.argv[1] 76 | train_normalized_data, train_data, test_normalized_data, test_data, dev_normalized_data, \ 77 | max_val, mean_pose = prepare_motion_data(DATA_DIR) 78 | 79 | # Train an AE network 80 | nn = create_nn(train_normalized_data, dev_normalized_data, max_val, mean_pose, restoring=False) -------------------------------------------------------------------------------- /motion_repr_learning/ae/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/genea-workshop/Speech_driven_gesture_generation_with_autoencoder/59e8ebdb0f6e87b0e81268046c99a4d6c9bf62a8/motion_repr_learning/ae/utils/__init__.py -------------------------------------------------------------------------------- /motion_repr_learning/ae/utils/data.py: -------------------------------------------------------------------------------- 1 | """Functions for downloading,reading and preprocessing CMU data.""" 2 | 3 | import sys 4 | import os 5 | 6 | #sys.path.append('/home/taras/Desktop/Work/Code/Git/MotionCleaning/BVH_format/parser') 7 | #from reader import MyReader 8 | 9 | import matplotlib.pyplot as plt 10 | from mpl_toolkits.mplot3d import Axes3D 11 | import numpy as np 12 | from six.moves import xrange 13 | 14 | import utils.flags as fl 15 | 16 | class DataSet(object): 17 | ''' 18 | A class for storing a dataset and all important information, 19 | which might be needed during training, 20 | such as batch size amount of epochs completed and so on. 
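next_batch() returns consecutive slices of batch_size sequences and reshuffles the whole dataset once an epoch is completed.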
21 | ''' 22 | 23 | 24 | def __init__(self, sequences, batch_size): 25 | self._batch_size = batch_size 26 | self._sequences = sequences # all the sequnces in the dataset 27 | self._num_sequences = sequences.shape[0] 28 | self._epochs_completed = 0 29 | self._index_in_epoch = 0 30 | 31 | @property 32 | def sequences(self): 33 | return self._sequences 34 | 35 | @property 36 | def num_sequences(self): 37 | return self._num_sequences 38 | 39 | @property 40 | def epochs_completed(self): 41 | return self._epochs_completed 42 | 43 | def next_batch(self): 44 | """Return the next batch of sequences from this data set.""" 45 | batch_numb = self._index_in_epoch 46 | self._index_in_epoch += self._batch_size 47 | if self._index_in_epoch > self._num_chunks: 48 | # Finished epoch 49 | self._epochs_completed += 1 50 | # Shuffle the data 51 | perm = np.arange(self._num_sequences) 52 | np.random.shuffle(perm) 53 | self._sequences = self._sequences[perm] 54 | # Start next epoch 55 | batch_numb = 0 56 | self._index_in_epoch = self._batch_size 57 | return self._sequences[batch_numb:batch_numb + self._batch_size:1, :] 58 | 59 | 60 | class DataSets(object): 61 | ''' 62 | A class for storing Train and Eval datasets and all related information, 63 | ''' 64 | pass 65 | 66 | 67 | def read_bvh_file(fileName, test=False): 68 | """ 69 | Reads a file from CMU MoCap dataset in BVH format 70 | 71 | Returns: 72 | sequence [sequence_length,frame_size] - local chanells transformed to the hips-centered coordinates 73 | hips [frame_size] - coordinates of the hips 74 | 75 | """ 76 | 77 | # Read the data 78 | reader = MyReader(fileName); 79 | reader.read(); 80 | sequence = np.array(reader.points) 81 | 82 | # Translate to the hips-center coordinate system 83 | hips = sequence[:,:,0] 84 | sequence = sequence - hips[:,:,np.newaxis] 85 | 86 | # This is a visualization for debug 87 | '''fig = plt.figure() 88 | ax = fig.add_subplot(111, projection='3d') 89 | treshhold = 22 # to show legs in a different color 90 | # use 10 to color only the spine, 16 - spine and right hand, 22 - spine and both arms, 27 - all except left leg, 32 - all 91 | time_step = 10 92 | ax.scatter(sequence[time_step ][2][0:treshhold],sequence[time_step ][0][0:treshhold], sequence[time_step ][1][0:treshhold], 93 | c='r', marker='o') 94 | ax.scatter(sequence[time_step ][2][treshhold:], sequence[time_step ][0][treshhold:], sequence[time_step ][1][treshhold:], 95 | c='g', marker='o')''' 96 | plt.show() 97 | 98 | # Transpose the last 2 dimensions 99 | sequence = np.transpose(sequence, axes = (0,2,1)) 100 | 101 | #Flaten all the coords into one vector [T,3,m] -> [T,3m] 102 | return np.reshape(sequence,(sequence.shape[0],sequence.shape[1]*sequence.shape[2])),hips 103 | 104 | def read_a_folder(curr_dir): 105 | chunk_length = fl.FLAGS.chunk_length 106 | stride = fl.FLAGS.chunking_stride 107 | 108 | data = np.array([]) 109 | 110 | for filename in os.listdir(curr_dir): 111 | curr_sequence,_ = read_bvh_file(curr_dir + '/' + filename) 112 | 113 | # Split sequence into chunks 114 | curr_chunks = np.array([curr_sequence[i:i + chunk_length, :] for i in 115 | xrange(0, len(curr_sequence) - chunk_length, stride)]) 116 | 117 | if curr_chunks.shape[0] > 0: 118 | # Concatanate curr chunks to all of them 119 | data = np.vstack([data, curr_chunks]) if data.size else np.array(curr_chunks) 120 | 121 | print(data.shape) 122 | 123 | data = np.array(data) 124 | 125 | return data 126 | 127 | def read_unlabeled_data(train_dir, evaluate): 128 | """ 129 | Reads all 3 datasets from CMU MoCap 
127 | def read_unlabeled_data(train_dir, evaluate): 128 | """ 129 | Reads all 3 datasets from the CMU MoCap dataset in BVH format 130 | 131 | Args: 132 | train_dir - address to the train, dev and eval datasets 133 | evaluate - flag : whether we want to evaluate a network or we just optimize parameters 134 | Returns: 135 | datasets - object of class DataSets, containing Train and Eval datasets 136 | max_val - maximal value in the raw data ( for post-processing) 137 | mean_pose - mean pose in the raw data ( for post-processing) 138 | """ 139 | 140 | data_sets = DataSets() 141 | 142 | # Get constants from the file 143 | data_dir = fl.FLAGS.data_dir 144 | chunk_length = fl.FLAGS.chunk_length 145 | stride = fl.FLAGS.chunking_stride 146 | 147 | if stride > chunk_length: 148 | print( 149 | 'ERROR! \nYou have a stride bigger than the length of the chunks. ' 150 | 'Please, change those values in flags.py, so that you don\'t ignore the data') 151 | exit(0) 152 | 153 | # ######### Get TRAIN data ########### 154 | print('\nReading train data from the following folder ... ', data_dir + '/train/labels') 155 | 156 | train_data = read_a_folder(data_dir + '/train/labels') 157 | 158 | [amount_of_train_strings, seq_length, DoF] = train_data.shape 159 | print('\n' + str(amount_of_train_strings) + ' sequences with length ' + str( 160 | seq_length) + ' will be used for training') 161 | 162 | # ######### Get TEST data ########### 163 | 164 | if evaluate: 165 | print('\nReading test data from the following folder : ', data_dir + '/eval/labels') 166 | test_data = read_a_folder(data_dir + '/eval/labels') 167 | else: 168 | print('\nReading test data from the following folder : ', data_dir + '/dev/labels') 169 | test_data = read_a_folder(data_dir + '/dev/labels') 170 | 171 | [amount_of_test_strings, seq_length, DoF] = test_data.shape 172 | print('\n' + str(amount_of_test_strings) + ' sequences with length ' 173 | + str(seq_length) + ' will be used for testing') 174 | 175 | # Do mean normalization : subtract mean pose 176 | mean_pose = train_data.mean(axis=(0, 1)) 177 | train_data = train_data - mean_pose[np.newaxis, np.newaxis, :] 178 | test_data = test_data - mean_pose[np.newaxis, np.newaxis, :] 179 | 180 | # Scales all values in the input_data to be between -1 and 1 181 | eps = 1e-8 182 | max_train = np.amax(np.absolute(train_data), axis=(0, 1)) 183 | max_test = np.amax(np.absolute(test_data), axis=(0, 1)) 184 | max_val = np.maximum(max_train, max_test) 185 | train_data = np.divide(train_data, max_val[np.newaxis, np.newaxis, :] + eps) 186 | test_data = np.divide(test_data, max_val[np.newaxis, np.newaxis, :] + eps) 187 | 188 | # Check the data range 189 | max_ = test_data.max() 190 | min_ = test_data.min() 191 | 192 | print("Maximum value in the normalized test dataset : " + str(max_)) 193 | print("Minimum value in the normalized test dataset : " + str(min_)) 194 | 195 | print('\nTrain data shape: ', train_data.shape) 196 | 197 | data_sets.train = DataSet(train_data, fl.FLAGS.batch_size) 198 | data_sets.test = DataSet(test_data, fl.FLAGS.batch_size) 199 | 200 | # Assign variance 201 | data_sets.train.sigma = np.std(train_data, axis=(0, 1)) 202 | 203 | # Check if we have enough data 204 | if data_sets.train._num_sequences < data_sets.train._batch_size:
205 | print('ERROR: We do not have enough data! ' 206 | 'Reduce batch_size or increase the number of subfolders you use.') 207 | exit(1) 208 | 209 | return data_sets, max_val, mean_pose 210 | 211 | 212 | def read_dataset_and_write_in_binary(evaluate): 213 | """ 214 | Reads 3 datasets: "Train","Dev" and "Eval" from the CMU MoCap dataset in BVH format 215 | and writes them in the binary format. 216 | Will get the address of the folder with the data from flags.py 217 | Args: 218 | evaluate - flag: whether we evaluate the system or we optimize parameters 219 | Returns: 220 | will write binary files in the same folder as the original data 221 | """ 222 | 223 | # Get the data 224 | data, max_val, mean_pose = read_unlabeled_data(fl.FLAGS.data_dir, False) # read_all_the_data() 225 | 226 | # Write all important information into binary files 227 | 228 | # Datasets themselves 229 | train_file = open(fl.FLAGS.data_dir + '/train.binary', 'wb') 230 | data.train._sequences.tofile(train_file) 231 | train_file.close() 232 | 233 | eval_file = open(fl.FLAGS.data_dir + '/eval.binary', 'wb') 234 | data.test._sequences.tofile(eval_file) 235 | eval_file.close() 236 | 237 | # Dataset properties 238 | 239 | sigma_file = open(fl.FLAGS.data_dir + '/variance.binary', 'wb') 240 | data.train.sigma.tofile(sigma_file) 241 | sigma_file.close() 242 | 243 | max_val_file = open(fl.FLAGS.data_dir + '/maximums.binary', 'wb') 244 | max_val.tofile(max_val_file) 245 | max_val_file.close() 246 | 247 | mean_file = open(fl.FLAGS.data_dir + '/mean.binary', 'wb') 248 | mean_pose.tofile(mean_file) 249 | mean_file.close() 250 | 251 | print('All the binary files for the dataset were saved in the folder ', fl.FLAGS.data_dir) 252 | 253 | 254 | def read_binary_dataset(dataset_name): 255 | filename = fl.FLAGS.data_dir + '/' + dataset_name + '.binary' 256 | dataset = np.fromfile(filename) 257 | amount_of_frames = int(dataset.shape[0] /(fl.FLAGS.chunk_length * fl.FLAGS.frame_size)) 258 | # Clip array so that it divides exactly into the inputs we want (frame_size * chunk_length) 259 | dataset = dataset[0:amount_of_frames * fl.FLAGS.chunk_length * fl.FLAGS.frame_size] 260 | # Reshape 261 | dataset = dataset.reshape(amount_of_frames, fl.FLAGS.chunk_length, fl.FLAGS.frame_size) 262 | return dataset 263 |
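# Note (annotation, not part of the original file): the .binary files above are raw
# float64 dumps written with numpy's tofile(), so they carry no shape or dtype metadata.
# read_binary_dataset() therefore relies on FLAGS.chunk_length and FLAGS.frame_size to
# restore the [num_chunks, chunk_length, frame_size] layout; if those flags differ
# between writing and reading, the data will be reshaped incorrectly.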
264 | 265 | def read_3_datasets_from_binary(): 266 | """ 267 | Reads train and test datasets and their properties from binary file format 268 | 269 | Will take them from the corresponding file in the folder, which is defined by FLAGS.data_dir 270 | 271 | Returns: 272 | datasets - object of class DataSets, containing Train and Eval datasets 273 | max_val - maximal value in the raw data ( for post-processing) 274 | mean_pose - mean pose in the raw data ( for post-processing) 275 | 276 | """ 277 | data_sets = DataSets() 278 | 279 | # ######### Get TRAIN data ########### 280 | 281 | train_data = read_binary_dataset('train') 282 | [amount_of_train_strings, seq_length, DoF] = train_data.shape 283 | print('\n' + str(amount_of_train_strings) + ' sequences with length ' + str(fl.FLAGS.chunk_length) 284 | + ' frames in each will be used for training') 285 | 286 | # Merge all the time-frames together 287 | train_data = np.reshape(train_data, [amount_of_train_strings, seq_length * DoF]) 288 | 289 | # ######### Get TEST data ########### 290 | 291 | test_data = read_binary_dataset('eval') 292 | [amount_of_test_strings, seq_length, DoF] = test_data.shape 293 | print(str(amount_of_test_strings) + ' sequences will be used for testing') 294 | 295 | # Merge all the time-frames together 296 | test_data = np.reshape(test_data, [amount_of_test_strings, seq_length * DoF]) 297 | 298 | # Shuffle the data 299 | perm = np.arange(amount_of_train_strings) 300 | np.random.shuffle(perm) 301 | train_data = train_data[perm] 302 | 303 | data_sets.train = DataSet(train_data, fl.FLAGS.batch_size) 304 | data_sets.test = DataSet(test_data, fl.FLAGS.batch_size) 305 | 306 | # Assign variance 307 | data_sets.train.sigma = np.std(train_data, axis=(0, 1)) 308 | 309 | # Read maximal value and mean pose before normalization 310 | max_val = np.fromfile(fl.FLAGS.data_dir + '/maximums.binary') 311 | mean_pose = np.fromfile(fl.FLAGS.data_dir + '/mean.binary') 312 | 313 | # Check if we have enough data 314 | if data_sets.train._num_sequences < data_sets.train._batch_size: 315 | print('ERROR: We do not have enough data! ' 316 | 'Reduce batch_size or increase the number of subfolders you use.') 317 | exit(1) 318 | 319 | return data_sets, max_val, mean_pose 320 | 321 | 322 | def write_test_seq_in_binary(input_file_name, output_file_name): 323 | """ Read test sequence in BVH format and 324 | write it into the binary file 325 | 326 | Args: 327 | input_file_name: the name of the input file 328 | output_file_name: the name of the output file 329 | Returns: 330 | nothing 331 | """ 332 | test_file = open(output_file_name, 'wb') 333 | test_seq,_ = read_bvh_file(input_file_name) 334 | test_seq.tofile(test_file) 335 | test_file.close() 336 | print("The test sequence was read from", input_file_name, " and written to", output_file_name) 337 | 338 | 339 | def read_test_seq_from_binary(binary_file_name): 340 | """ Read test sequence from the binary file 341 | 342 | Args: 343 | binary_file_name: the name of the input binary file 344 | Returns: 345 | read_seq: test sequence 346 | """ 347 | # Read the sequence 348 | read_seq = np.fromfile(binary_file_name) 349 | # Reshape 350 | read_seq = read_seq.reshape(-1, fl.FLAGS.frame_size) 351 | amount_of_frames = int(read_seq.shape[0] / (fl.FLAGS.chunk_length)) 352 | if amount_of_frames > 0: 353 | # Clip array so that it divides exactly into the inputs we want (frame_size * chunk_length) 354 | read_seq = read_seq[0:amount_of_frames * fl.FLAGS.chunk_length] 355 | 356 | # Reshape 357 | read_seq = read_seq.reshape(-1, fl.FLAGS.frame_size * fl.FLAGS.chunk_length) #?
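# (Annotation, not part of the original file: after this reshape each row of read_seq packs
# chunk_length consecutive frames into one flat vector of length frame_size * chunk_length,
# matching the chunked input layout used for training; the clipping above drops trailing
# frames that do not fill a complete chunk.)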
358 | 359 | return read_seq 360 | 361 | 362 | def visualize(mocap_seq, test=False): 363 | all_3d_coords = mocap_seq.reshape(-1, 3, int(fl.FLAGS.frame_size/3)) # Concatanate all coords into one vector 364 | 365 | # For debug - Visualize the skeleton 366 | fig = plt.figure() 367 | ax = fig.add_subplot(111, projection='3d') 368 | 369 | start_frame = 40 370 | treshhold_0 = 14 371 | treshhold_1 = 20 372 | treshhold_2 = 27 373 | coef = 100 374 | for step in range(start_frame, start_frame + 30, 10): 375 | 376 | # Visualize a 3D point cloud 377 | ax.scatter3D(all_3d_coords[step][0][:treshhold_0], 378 | np.add(all_3d_coords[step][1][:treshhold_0], (step - start_frame) * coef), 379 | all_3d_coords[step][2][:treshhold_0], c='c', marker='o') 380 | ax.scatter3D(all_3d_coords[step][0][treshhold_0:treshhold_1], 381 | np.add(all_3d_coords[step][1][treshhold_0:treshhold_1], 382 | (step - start_frame) * coef), 383 | all_3d_coords[step][2][treshhold_0:treshhold_1], c='r', marker='o') 384 | ax.scatter3D(all_3d_coords[step][0][treshhold_1:treshhold_2], 385 | np.add(all_3d_coords[step][1][treshhold_1:treshhold_2], 386 | (step - start_frame) * coef), 387 | all_3d_coords[step][2][treshhold_1:treshhold_2], c='y', marker='o') 388 | ax.scatter3D(all_3d_coords[step][0][treshhold_2:], 389 | np.add(all_3d_coords[step][1][treshhold_2:], (step - start_frame) * coef), 390 | all_3d_coords[step][2][treshhold_2:], c='b', marker='o') 391 | 392 | # Find which points are present 393 | 394 | key_point_arm = [] 395 | for point in list([0, 1, 2, 7, 8, 9]): 396 | if all_3d_coords[step][0][point] != 0 and all_3d_coords[step][0][point + 1] != 0: 397 | if all_3d_coords[step][1][point] != 0 and all_3d_coords[step][1][point + 1] != 0: 398 | if all_3d_coords[step][2][point] != 0 and all_3d_coords[step][2][point + 1] != 0: 399 | key_point_arm.append(point) 400 | 401 | key_point_arm = np.array(key_point_arm) 402 | 403 | key_point_leg = [] 404 | for point in list([27, 34]): # 28, 35 405 | if all_3d_coords[step][0][point] != 0 and all_3d_coords[step][0][point + 1] != 0: 406 | if all_3d_coords[step][1][point] != 0 and all_3d_coords[step][1][point + 1] != 0: 407 | if all_3d_coords[step][2][point] != 0 and all_3d_coords[step][2][point + 1] != 0: 408 | key_point_leg.append(point) 409 | key_point_leg = np.array(key_point_leg) 410 | 411 | # Add lines in between 412 | 413 | for point in key_point_arm: 414 | xline = all_3d_coords[step][0][point:point + 2] 415 | yline = np.add(all_3d_coords[step][1][point:point + 2], (step - start_frame) * coef) 416 | zline = all_3d_coords[step][2][point:point + 2] 417 | ax.plot(xline, yline, zline, c='c') 418 | for point in key_point_leg: 419 | xline = all_3d_coords[step][0][point:point + 3:2] 420 | yline = np.add(all_3d_coords[step][1][point:point + 3:2], (step - start_frame) * coef) 421 | zline = all_3d_coords[step][2][point:point + 3:2] 422 | ax.plot(xline, yline, zline, c='b') 423 | 424 | plt.show() 425 | 426 | 427 | if __name__ == '__main__': 428 | 429 | # Do some testing 430 | 431 | Test = False 432 | 433 | if Test: 434 | input_file_name = '/home/taras/Documents/Datasets/SpeechToMotion/Japanese/TheLAtest/dataset/motion/gesture22.bvh' 435 | output_file_name = fl.FLAGS.data_dir + '/talking2.csv' 436 | 437 | test_file = open(output_file_name, 'wb') 438 | test_seq, _ = read_bvh_file(input_file_name) 439 | 440 | visualize(test_seq, test=False) 441 | 442 | # Save the data into a file 443 | with open(output_file_name, 'w') as fp: 444 | np.savetxt(fp, test_seq, delimiter=",") 445 | 446 | print("The test sequence was 
read from", input_file_name, " and written to", output_file_name) 447 | 448 | write_test_seq_in_binary('/home/taras/Documents/Datasets/SpeechToMotion/Japanese/TheLAtest/dataset/motion/gesture1093.bvh', 449 | fl.FLAGS.data_dir + '/test_1.binary') 450 | write_test_seq_in_binary('/home/taras/Documents/Datasets/SpeechToMotion/Japanese/TheLAtest/dataset/motion/gesture1097.bvh', 451 | fl.FLAGS.data_dir + '/test_2.binary') 452 | 453 | else: 454 | read_dataset_and_write_in_binary(True) 455 | -------------------------------------------------------------------------------- /motion_repr_learning/ae/utils/flags.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contrains all the flags for the motion representation learning repository 3 | """ 4 | from __future__ import division 5 | import os 6 | from os.path import join as pjoin 7 | 8 | import tensorflow as tf 9 | 10 | # Modify this function to set your home directory for this repo 11 | def home_out(path): 12 | return pjoin(os.environ['HOME'], 'tmp', 'MoCap', path) 13 | 14 | flags = tf.app.flags 15 | FLAGS = flags.FLAGS 16 | 17 | """ Fine-tuning Parameters """ 18 | 19 | # Flags about the sequence processing 20 | 21 | flags.DEFINE_integer('chunk_length', 1, 'Length of the chunks, for the data processing.') 22 | 23 | # Flags about training 24 | flags.DEFINE_float('learning_rate', 0.0001, 25 | 'learning rate for training .') 26 | flags.DEFINE_float('pretraining_learning_rate', 0.001 , 27 | 'learning rate for training .') 28 | 29 | flags.DEFINE_float('variance_of_noise', 0.05, 'Coefficient for the gaussian noise ' 30 | 'added to every point in input during the training') 31 | 32 | flags.DEFINE_boolean('pretrain', False,' Whether we pretrain the model in a layerwise way') 33 | flags.DEFINE_boolean('restore', False,' Whether we restore the model from the checkpoint') 34 | 35 | flags.DEFINE_boolean('evaluate', False, ' Whether we are evaluating the system') 36 | 37 | flags.DEFINE_float('dropout', 0.9, 'Probability to keep the neuron on') 38 | 39 | flags.DEFINE_integer('batch_size', 128, 40 | 'Size of the mini batch') 41 | 42 | flags.DEFINE_integer('training_epochs', 20, 43 | "Number of training epochs for pretraining layers") 44 | flags.DEFINE_integer('pretraining_epochs', 5, 45 | "Number of training epochs for pretraining layers") 46 | 47 | flags.DEFINE_float('weight_decay', 0.5, ' Whether we apply weight decay') 48 | 49 | flags.DEFINE_boolean('early_stopping', True, ' Whether we do early stopping') 50 | flags.DEFINE_float('delta_for_early_stopping', 0.5, 'How much worst the results must get in order' 51 | ' for training to be terminated.' 
54 | # Network Architecture Specific Flags 55 | flags.DEFINE_integer('frame_size', 384, 'Dimensionality of the input for a single frame') 56 | 57 | flags.DEFINE_integer("num_hidden_layers", 1, "Number of hidden layers") 58 | flags.DEFINE_integer("middle_layer", 1, "Index of the middle (bottleneck) hidden layer") 59 | 60 | flags.DEFINE_integer('layer1_width', 312, 'Number of units in the first hidden layer') 61 | flags.DEFINE_integer('layer2_width', 248, 'Number of units in the second hidden layer') 62 | flags.DEFINE_integer('layer3_width', 312, 'Number of units in the third hidden layer') 63 | 64 | # Constants 65 | 66 | flags.DEFINE_integer('seed', 123456, 'Random seed') 67 | 68 | flags.DEFINE_string('summary_dir', home_out('summaries_exp'), 69 | 'Directory to put the summary data') 70 | 71 | flags.DEFINE_string('chkpt_dir', home_out('chkpts_exp'), 72 | 'Directory to put the model checkpoints') 73 | 74 | flags.DEFINE_string('results_file', home_out('results.txt'), 75 | 'File to put the experimental results') 76 | -------------------------------------------------------------------------------- /motion_repr_learning/ae/utils/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file contains helper functions for the training and testing of the AE 3 | """ 4 | 5 | 6 | import numpy as np 7 | import tensorflow as tf 8 | 9 | 10 | import utils.flags as fl 11 | 12 | """ Dataset class""" 13 | 14 | class DataSet(object): 15 | ''' 16 | A class for storing a dataset and all important information, 17 | which might be needed during training, 18 | such as batch size, number of epochs completed and so on. 19 | ''' 20 | 21 | 22 | def __init__(self, sequences, batch_size): 23 | self._batch_size = batch_size 24 | self._sequences = sequences # all the sequences in the dataset 25 | self._num_sequences = sequences.shape[0] 26 | self._epochs_completed = 0 27 | self._index_in_epoch = 0 28 | 29 | # Make interface to the protected variables 30 | @property 31 | def sequences(self): 32 | return self._sequences 33 | 34 | @property 35 | def num_sequences(self): 36 | return self._num_sequences 37 | 38 | class DataSets(object): 39 | ''' 40 | A class for storing Train and Eval datasets and all related information, 41 | ''' 42 | pass 43 | 44 | def read_test_seq_from_binary(binary_file_name): 45 | """ Read test sequence from the binary file 46 | Args: 47 | binary_file_name: the name of the input binary file 48 | Returns: 49 | read_seq: test sequence 50 | """ 51 | # Read the sequence 52 | read_seq = np.fromfile(binary_file_name) 53 | # Reshape 54 | read_seq = read_seq.reshape(-1, fl.FLAGS.frame_size) 55 | amount_of_frames = int(read_seq.shape[0] / (fl.FLAGS.chunk_length)) 56 | if amount_of_frames > 0: 57 | # Clip array so that it divides exactly into the inputs we want (frame_size * chunk_length) 58 | read_seq = read_seq[0:amount_of_frames * fl.FLAGS.chunk_length] 59 | 60 | # Reshape 61 | read_seq = read_seq.reshape(-1, fl.FLAGS.frame_size * fl.FLAGS.chunk_length) #?
62 | 63 | return read_seq 64 | 65 | def add_noise(x, variance_multiplier, sigma): 66 | """ 67 | Add Gaussian noise to the data 68 | Args: 69 | x - input vector 70 | variance_multiplier - coefficient to multiply the variance of the noise by 71 | sigma - variance of the dataset 72 | Returns: 73 | x - output vector, noisy data 74 | """ 75 | eps = 1e-15 76 | noise = tf.random_normal(x.shape, 0.0, stddev=np.multiply(sigma, variance_multiplier) + eps) 77 | x = x + noise 78 | return x 79 | 80 | def loss_reconstruction(output, target, max_vals, pretrain=False): 81 | """ Reconstruction error: the square of the RMSE (i.e. the mean squared error) 82 | 83 | Args: 84 | output: tensor of net output 85 | target: tensor we are trying to reconstruct 86 | max_vals: array of absolute maximal values in the dataset, 87 | is used for scaling an error to the original space 88 | pretrain: whether we are using it during the pretraining phase 89 | Returns: 90 | Scalar tensor of mean squared Euclidean distance 91 | """ 92 | with tf.name_scope("reconstruction_loss"): 93 | net_output_tf = tf.convert_to_tensor(tf.cast(output, tf.float32), name='input') 94 | target_tf = tf.convert_to_tensor(tf.cast(target, tf.float32), name='target') 95 | 96 | # Euclidean distance between net_output_tf, target_tf 97 | error = tf.subtract(net_output_tf, target_tf) 98 | 99 | if not pretrain: 100 | # Convert it back from the [-1,1] to original values 101 | error_scaled = tf.multiply(error, max_vals[np.newaxis, :] + 1e-15) 102 | else: 103 | error_scaled = error 104 | 105 | squared_error = tf.reduce_mean(tf.square(error_scaled, name="square"), name="averaging") 106 | return squared_error 107 | 108 | def convert_back_to_3d_coords(sequence, max_val, mean_pose): 109 | ''' 110 | Convert back from the normalized values between -1 and 1 to original 3d coordinates 111 | and unroll them into the sequence 112 | 113 | Args: 114 | sequence: sequence of the normalized values 115 | max_val: maximal value in the dataset 116 | mean_pose: mean value in the dataset 117 | 118 | Return: 119 | 3d coordinates corresponding to the batch 120 | ''' 121 | 122 | # Convert it back from the [-1,1] to original values 123 | reconstructed = np.multiply(sequence, max_val[np.newaxis, :] + 1e-15) 124 | 125 | # Add the mean pose back 126 | reconstructed = reconstructed + mean_pose[np.newaxis, :] 127 | 128 | # Unroll batches into the sequence 129 | reconstructed = reconstructed.reshape(-1, reconstructed.shape[-1]) 130 | 131 | return reconstructed 132 | 133 | def reshape_dataset(dataset): 134 | """ 135 | Changing the shape of the dataset array to correspond to the frame dimensionality 136 | 137 | Args: 138 | dataset: an array of the dataset 139 | Return: 140 | dataset_final: array of the dataset in a proper shape 141 | """ 142 | 143 | amount_of_train_chunks = int(dataset.shape[0] / fl.FLAGS.chunk_length) 144 | dataset_shorten = dataset[:amount_of_train_chunks * fl.FLAGS.chunk_length, :fl.FLAGS.frame_size] 145 | dataset_chunks = np.reshape(dataset_shorten, (-1, fl.FLAGS.chunk_length * fl.FLAGS.frame_size)) 146 | 147 | # Merge all the time-frames together 148 | dataset_final = np.reshape(dataset_chunks, [amount_of_train_chunks, 149 | fl.FLAGS.chunk_length * fl.FLAGS.frame_size]) 150 | 151 | return dataset_final 152 |
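# Worked example (annotation, not part of the original file): with frame_size = 384 and
# chunk_length = 3, reshape_dataset() turns an array of shape [T, 384] into [T // 3, 1152]:
# frames are grouped into non-overlapping chunks of 3 and each chunk is flattened into a
# single input vector; trailing frames that do not fill a complete chunk are dropped.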
153 | def prepare_motion_data(data_dir): 154 | """ 155 | Read and preprocess the motion dataset 156 | 157 | Args: 158 | data_dir: a directory with the dataset 159 | Return: 160 | Y_train: an array of the training dataset 161 | Y_train_normalized: training dataset normalized to the values [-1,1] 162 | Y_test: an array of the test dataset 163 | Y_test_normalized: test dataset normalized to the values [-1,1] 164 | Y_dev_normalized: dev dataset normalized to the values [-1,1] 165 | max_val: maximal values in the dataset 166 | mean_pose: mean pose of the dataset 167 | """ 168 | 169 | # Get the data 170 | 171 | Y_train = np.load(data_dir + '/Y_train.npy') 172 | Y_dev = np.load(data_dir + '/Y_dev.npy') 173 | Y_test = np.load(data_dir + '/Y_test.npy') 174 | 175 | # Normalize dataset 176 | max_val = np.amax(np.absolute(Y_train), axis=(0)) 177 | mean_pose = Y_train.mean(axis=(0)) 178 | 179 | Y_train_centered = Y_train - mean_pose[np.newaxis, :] 180 | Y_dev_centered = Y_dev - mean_pose[np.newaxis, :] 181 | Y_test_centered = Y_test - mean_pose[np.newaxis, :] 182 | 183 | # Scales all values in the input_data to be between -1 and 1 184 | eps = 1e-8 185 | Y_train_normalized = np.divide(Y_train_centered, max_val[np.newaxis, :] + eps) 186 | Y_dev_normalized = np.divide(Y_dev_centered, max_val[np.newaxis, :] + eps) 187 | Y_test_normalized = np.divide(Y_test_centered, max_val[np.newaxis, :] + eps) 188 | 189 | # Reshape to accommodate multiple frames at each input 190 | 191 | if fl.FLAGS.chunk_length > 1: 192 | Y_train_normalized = reshape_dataset(Y_train_normalized) 193 | Y_dev_normalized = reshape_dataset(Y_dev_normalized) 194 | Y_test_normalized = reshape_dataset(Y_test_normalized) 195 | 196 | # Pad max values and the mean pose, if needed 197 | if fl.FLAGS.chunk_length > 1: 198 | max_val = np.tile(max_val, fl.FLAGS.chunk_length) 199 | mean_pose = np.tile(mean_pose, fl.FLAGS.chunk_length) 200 | 201 | 202 | # Some tests for flags 203 | if fl.FLAGS.restore and fl.FLAGS.pretrain: 204 | print('ERROR! You cannot restore and pretrain at the same time!' 205 | ' Please, choose one of these options') 206 | exit(1) 207 | 208 | if fl.FLAGS.middle_layer > fl.FLAGS.num_hidden_layers: 209 | print('ERROR! Middle layer cannot be larger than the number of hidden layers!' 210 | ' Please, update flags') 211 | exit(1) 212 | 213 | return Y_train_normalized, Y_train, Y_test_normalized, Y_test,\ 214 | Y_dev_normalized, max_val, mean_pose 215 | -------------------------------------------------------------------------------- /predict.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script generates gesture output based on the speech input. 3 | The gestures will be written to a text file: 4 | 3d coordinates together with the velocities. 5 | """ 6 | 7 | import sys 8 | from keras.models import load_model 9 | import numpy as np 10 | 11 | 12 | def predict(model_name, input_file, output_file): 13 | """ Predict human gesture based on the speech 14 | 15 | Args: 16 | model_name: name of the Keras model to be used 17 | input_file: file name of the audio input 18 | output_file: file name for the gesture output 19 | 20 | Returns: 21 | 22 | """ 23 | model = load_model(model_name) 24 | X = np.load(input_file) 25 | 26 | predicted = np.array(model.predict(X)) 27 | print(predicted.shape) 28 | np.savetxt(output_file, predicted) 29 | 30 | 31 | if __name__ == "__main__": 32 | 33 | # Check if the script got enough parameters 34 | if len(sys.argv) < 4: 35 | raise ValueError('Not enough parameters! 
\nUsage : python ' + sys.argv[0].split("/")[-1] + 36 | ' MODEL_NAME INPUT_FILE OUTPUT_FILE') 37 | 38 | predict(sys.argv[1], sys.argv[2], sys.argv[3]) 39 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | h5py==2.10.0 2 | Keras==2.3.1 3 | Keras-Applications==1.0.8 4 | Keras-Preprocessing==1.1.0 5 | librosa==0.7.1 6 | matplotlib==3.1.1 7 | numpy==1.17.2 8 | pandas==0.25.2 9 | praat-parselmouth==0.3.3 10 | pydub==0.23.1 11 | pyquaternion==0.9.5 12 | pysptk==0.1.17 13 | python-speech-features==0.6 14 | scikit-learn==0.21.3 15 | scipy==1.3.1 16 | seaborn==0.7.1 -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is the main script for the training. 3 | It contains the speech-motion neural network implemented in Keras. 4 | This script should be used to train the model, as described in README.md 5 | """ 6 | 7 | import sys 8 | import numpy as np 9 | from sklearn.model_selection import train_test_split 10 | 11 | from keras.models import Sequential 12 | from keras.layers import Dense, Activation, Dropout 13 | from keras.layers.recurrent import SimpleRNN, LSTM, GRU 14 | from keras.optimizers import SGD, Adam 15 | from keras.layers.wrappers import TimeDistributed, Bidirectional 16 | from keras.layers.normalization import BatchNormalization 17 | 18 | import matplotlib 19 | matplotlib.use('Agg') 20 | from matplotlib import pyplot 21 | 22 | # Check if the script got enough parameters 23 | if len(sys.argv) < 6: 24 | raise ValueError( 25 | 'Not enough parameters! \nUsage : python train.py MODEL_NAME EPOCHS DATA_DIR N_INPUT ENCODE (DIM)') 26 | ENCODED = sys.argv[5].lower() == 'true' 27 | 28 | if ENCODED: 29 | if len(sys.argv) < 7: 30 | raise ValueError( 31 | 'Not enough parameters! 
\nUsage : python train.py MODEL_NAME EPOCHS DATA_DIR N_INPUT ENCODE DIM') 32 | else: 33 | N_OUTPUT = int(sys.argv[6]) # Representation dimensionality 34 | else: 35 | N_OUTPUT = 192 * 2 # Number of Gesture Feature (position + velocity) 36 | 37 | 38 | EPOCHS = int(sys.argv[2]) 39 | DATA_DIR = sys.argv[3] 40 | N_INPUT = int(sys.argv[4]) # Number of input features 41 | 42 | BATCH_SIZE = 2056 43 | N_HIDDEN = 256 44 | 45 | N_CONTEXT = 60 + 1 # The number of frames in the context 46 | 47 | 48 | def train(model_file): 49 | """ 50 | Train a neural network to take speech as input and produce gesture as an output 51 | 52 | Args: 53 | model_file: file to store the model 54 | 55 | Returns: 56 | 57 | """ 58 | 59 | # Get the data 60 | X = np.load(DATA_DIR + '/X_train.npy') 61 | 62 | if ENCODED: 63 | 64 | # If we learn speech-representation mapping we use encoded motion as output 65 | Y = np.load(DATA_DIR + '/' + str(N_OUTPUT)+ '/Y_train_encoded.npy') 66 | 67 | # Correct the sizes 68 | train_size = min(X.shape[0], Y.shape[0]) 69 | X = X[:train_size] 70 | Y = Y[:train_size] 71 | 72 | else: 73 | Y = np.load(DATA_DIR + '/Y_train.npy') 74 | 75 | N_train = int(len(X)*0.9) 76 | N_validation = len(X) - N_train 77 | 78 | # Split on training and validation 79 | X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, test_size=N_validation) 80 | 81 | # Define Keras model 82 | 83 | model = Sequential() 84 | model.add(TimeDistributed(Dense(N_HIDDEN), input_shape=(N_CONTEXT, N_INPUT))) 85 | model.add(BatchNormalization()) 86 | model.add(Activation('relu')) 87 | model.add(Dropout(0.1)) 88 | 89 | model.add(TimeDistributed(Dense(N_HIDDEN))) 90 | model.add(BatchNormalization()) 91 | model.add(Activation('relu')) 92 | model.add(Dropout(0.1)) 93 | 94 | model.add(TimeDistributed(Dense(N_HIDDEN))) 95 | model.add(BatchNormalization()) 96 | model.add(Activation('relu')) 97 | model.add(Dropout(0.1)) 98 | 99 | model.add(GRU(N_HIDDEN, return_sequences=False)) 100 | model.add(BatchNormalization()) 101 | model.add(Activation('relu')) 102 | model.add(Dropout(0.1)) 103 | 104 | model.add(Dense(N_OUTPUT)) 105 | model.add(Activation('linear')) 106 | 107 | print(model.summary()) 108 | 109 | optimizer = Adam(lr=0.001, beta_1=0.9, beta_2=0.999) 110 | model.compile(loss='mean_squared_error', optimizer=optimizer) 111 | 112 | hist = model.fit(X_train, Y_train, batch_size=BATCH_SIZE, epochs=EPOCHS, validation_data=(X_validation, Y_validation)) 113 | 114 | model.save(model_file) 115 | 116 | # Save convergence results into an image 117 | pyplot.plot(hist.history['loss'], linewidth=3, label='train') 118 | pyplot.plot(hist.history['val_loss'], linewidth=3, label='valid') 119 | pyplot.grid() 120 | pyplot.legend() 121 | pyplot.xlabel('epoch') 122 | pyplot.ylabel('loss') 123 | pyplot.savefig(model_file.replace('hdf5', 'png')) 124 | 125 | 126 | if __name__ == "__main__": 127 | 128 | train(sys.argv[1]) 129 | -------------------------------------------------------------------------------- /visuals/SpeechReprMotion.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/genea-workshop/Speech_driven_gesture_generation_with_autoencoder/59e8ebdb0f6e87b0e81268046c99a4d6c9bf62a8/visuals/SpeechReprMotion.png --------------------------------------------------------------------------------