├── LICENSE ├── README.md ├── data_processing ├── add_noisy_data.sh ├── alt_prosody.py ├── create_vector.py ├── prepare_data.py ├── silence.wav ├── tools.py ├── tools.pyc └── white_noise.wav ├── evaluation ├── README.md ├── calc_distance.py ├── calc_errors.py ├── calc_jerk.py ├── hellinger.py ├── hellinger_one2one.py ├── joints.txt └── plot_results.py ├── example_scripts ├── README.md ├── baseline_test.sh ├── baseline_train_n_test.sh ├── config.txt ├── proposed_test.sh └── proposed_train_n_test.sh ├── helpers ├── README.md ├── apply_filters.py ├── convert_original.py ├── filters │ ├── __pycache__ │ │ ├── ma_filter.cpython-35.pyc │ │ └── one_euro_filter.cpython-35.pyc │ ├── ma_filter.py │ └── one_euro_filter.py └── remove_velocity.py ├── hierarchy.txt ├── motion_repr_learning ├── README.md └── ae │ ├── DAE.py │ ├── decode.py │ ├── encode_dataset.py │ ├── learn_dataset_encoding.py │ ├── train.py │ └── utils │ ├── __init__.py │ ├── data.py │ ├── flags.py │ └── utils.py ├── predict.py ├── requirements.txt ├── train.py └── visuals └── SpeechReprMotion.png /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Aud2Repr2Pose: Analyzing input and output representations for speech-driven gesture generation 2 | [Taras Kucherenko](https://svito-zar.github.io/), [Dai Hasegawa](https://hasegawadai.info/), [Gustav Eje Henter](https://people.kth.se/~ghe/), Naoshi Kaneko, [Hedvig Kjellström](http://www.csc.kth.se/~hedvig/) 3 | 4 | ![ImageOfIdea](visuals/SpeechReprMotion.png?raw=true "Idea") 5 | 6 | This repository contains a Keras- and TensorFlow-based implementation of speech-driven gesture generation by a neural network, which was published at the International Conference on Intelligent Virtual Agents (IVA'19); the extension was published in the International Journal of Human-Computer Interaction in 2021. 7 | 8 | The [project website](https://svito-zar.github.io/audio2gestures/) contains all the information about this project, including a [video](https://youtu.be/Iv7UBe92zrw) explanation of the method and the [paper](https://www.researchgate.net/publication/331645229_Analyzing_Input_and_Output_Representations_for_Speech-Driven_Gesture_Generation). 9 | 10 | ## Demo on another dataset 11 | 12 | This model has been applied to an English dataset. 13 | 14 | The [demo video](https://youtu.be/tQLVyTVtsSU) as well as the [code](https://github.com/Svito-zar/speech-driven-hand-gesture-generation-demo) to run the pre-trained model are online. 15 | 16 | ## Requirements 17 | 18 | - Python 3 19 | 20 | 21 | ## Initial setup 22 | 23 | ### install packages 24 | ```sh 25 | 26 | # if you have a GPU 27 | pip install tensorflow-gpu==1.15.2 28 | 29 | # if you don't have a GPU 30 | pip install tensorflow==1.15.2 31 | 32 | pip install -r requirements.txt 33 | ``` 34 | 35 | ### install ffmpeg 36 | ```sh 37 | # macos 38 | brew install ffmpeg 39 | ``` 40 | 41 | ``` 42 | # ubuntu 43 | sudo add-apt-repository ppa:jonathonf/ffmpeg-4 44 | sudo apt-get update 45 | sudo apt-get install ffmpeg 46 | ``` 47 | 48 | 49 |   50 | ____________________________________________________________________________________________________________ 51 |   52 | 53 | # How to use this repository? 54 | 55 | # 0. Notation 56 | 57 | We write all the parameters which need to be specified by a user in CAPS LOCK. 58 | 59 | ## 1. Download raw data 60 | 61 | - Clone this repository 62 | - Download the dataset from `https://www.dropbox.com/sh/j419kp4m8hkt9nd/AAC_pIcS1b_WFBqUp5ofBG1Ia?dl=0` 63 | - Create a directory named `dataset` and put the two directories `motion/` and `speech/` under `dataset/` 64 | 65 | ## 2. Split dataset 66 | 67 | - Put the folder with the dataset (`dataset/`) in the root directory of this repo, next to the `data_processing` directory containing the script `prepare_data.py` 68 | - Run the following command 69 | 70 | ```sh 71 | python data_processing/prepare_data.py DATA_DIR 72 | # DATA_DIR = directory to save data such as 'data/' 73 | ``` 74 | 75 | Note: DATA_DIR is not the directory where the raw data is stored (the folder with the raw data, "dataset", has to be stored in the root folder of this repo). DATA_DIR is the directory where the processed data should be saved. After this step you don't need to have "dataset" in the root folder any more. 76 | You should use the same DATA_DIR in all the following scripts.
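For reference, here is a rough sketch of the expected layout after steps 1 and 2. File names follow the `audio<id>.wav` / `gesture<id>.bvh` convention used by `prepare_data.py`; the exact ids and the `data/` name are only examples:

```
<repo root>
├── dataset/          # raw data downloaded in step 1
│   ├── speech/       # audio20.wav, audio21.wav, ...
│   └── motion/       # gesture20.bvh, gesture21.bvh, ...
└── data/             # example DATA_DIR with the processed data created in step 2
    ├── train/
    ├── test/
    └── dev/
```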
77 | 78 | After this command: 79 | - `train/` `test/` `dev/` are created under `DATA_DIR/` 80 | - in `inputs/` inside each directory, audio(id).wav files are stored 81 | - in `labels/` inside each directory, gesture(id).bvh files are stored 82 | - under `DATA_DIR/`, three csv files `gg-train.csv` `gg-test.csv` `gg-dev.csv` are created; these files contain the paths to the actual data 83 | 84 | 85 | ## 3. Convert the dataset into vectors 86 | 87 | ```sh 88 | python data_processing/create_vector.py DATA_DIR N_CONTEXT 89 | # N_CONTEXT = length of the context window; in our experiments it was set to '60' 90 | # (this means 30 steps backwards and 30 steps forwards) 91 | ``` 92 | 93 | Note: if you change the N_CONTEXT value, you need to update it in the `train.py` script as well. 94 | 95 | (You are likely to get a warning like this: "WARNING:root:frame length (5513) is greater than FFT size (512), frame will be truncated. Increase NFFT to avoid.") 96 | 97 | As a result of running this script: 98 | - numpy binary files `X_train.npy`, `Y_train.npy` (the vectorized dataset) are created under `DATA_DIR` 99 | - under `DATA_DIR/test_inputs/`, test audios, such as `X_test_audio1168.npy`, are created 100 | - when N_CONTEXT = 60, the audio vector's shape is (num of timesteps, 61, 26) 101 | - the gesture vector's shape is (num of timesteps, 384) 102 | - 384 = 64 joints × (x, y, z positions + x, y, z velocities) 103 | 104 | ## If you don't want to customize anything, you can skip reading about steps 4-7 and just use the already prepared scripts in the folder `example_scripts` 105 |   106 | 107 | ## 4. Learn motion representation by AutoEncoder 108 | 109 | Create a directory to save training checkpoints, such as `chkpt/`, and use it as the CHKPT_DIR parameter. 110 | #### Learn dataset encoding 111 | ```sh 112 | python motion_repr_learning/ae/learn_dataset_encoding.py DATA_DIR -chkpt_dir=CHKPT_DIR -layer1_width=DIM 113 | ``` 114 | 115 | The optimal dimensionality (DIM) in our experiments was 325. 116 | 117 | #### Encode dataset 118 | Create the `DATA_DIR/DIM` directory 119 | ```sh 120 | python motion_repr_learning/ae/encode_dataset.py DATA_DIR -chkpt_dir=CHKPT_DIR -restore=True -pretrain=False -layer1_width=DIM 121 | ``` 122 | 123 | More information can be found in the folder `motion_repr_learning`. 124 | 125 | 126 | ## 5. Learn speech-driven gesture generation model 127 | 128 | ```sh 129 | python train.py MODEL_NAME EPOCHS DATA_DIR N_INPUT ENCODE DIM 130 | # MODEL_NAME = hdf5 file name, such as 'model_500ep_posvel_60.hdf5' 131 | # EPOCHS = how many epochs we want to train the model for (recommended: 100) 132 | # DATA_DIR = directory with the data (should be the same as above) 133 | # N_INPUT = how many dimensions the speech data has (default: 26) 134 | # ENCODE = whether we train on the encoded gestures (the proposed model) or on the gestures as they are (the baseline model) 135 | # DIM = how many dimensions the encoding has (ignored if you don't encode) 136 | ``` 137 | 138 | ## 6. 
Predict gesture 139 | 140 | ```sh 141 | python predict.py MODEL_NAME INPUT_SPEECH_FILE OUTPUT_GESTURE_FILE 142 | ``` 143 | 144 | ```sh 145 | # Usage example 146 | python predict.py model.hdf5 data/test_inputs/X_test_audio1168.npy data/test_inputs/predict_1168_20fps.txt 147 | ``` 148 | 149 | ```sh 150 | # If you trained on encoded gestures (the proposed model), you need to decode the predictions 151 | python motion_repr_learning/ae/decode.py DATA_DIR ENCODED_PREDICTION_FILE DECODED_GESTURE_FILE -restore=True -pretrain=False -layer1_width=DIM -chkpt_dir=CHKPT_DIR -batch_size=8 152 | ``` 153 | 154 | 155 | Note: This can be used in a for loop over all the test sequences. Complete examples are provided in the 156 | `example_scripts` folder of this repository, and a minimal sketch of such a loop is given in the appendix at the end of this README. 157 | 158 | ```sh 159 | # The network produces both coordinates and velocities, 160 | # so we need to remove the velocities 161 | python helpers/remove_velocity.py -g PATH_TO_GESTURES 162 | ``` 163 | 164 | ## 7. Quantitative evaluation 165 | Use the scripts in the `evaluation` folder of this repository. 166 | 167 | Examples are provided in the `example_scripts` folder of this repository. 168 | 169 | ## 8. Qualitative evaluation 170 | Use the [animation server](https://secret-meadow-14164.herokuapp.com/coordinates.html) 171 | 172 |   173 | 174 | ## Citation 175 | If you use this code in your research, please cite the paper: 176 | ``` 177 | @article{kucherenko2021moving, 178 | title={Moving fast and slow: Analysis of representations and post-processing in speech-driven automatic gesture generation}, 179 | author={Kucherenko, Taras and Hasegawa, Dai and Kaneko, Naoshi and Henter, Gustav Eje and Kjellstr{\"o}m, Hedvig}, 180 | journal={International Journal of Human–Computer Interaction}, 181 | doi={10.1080/10447318.2021.1883883}, 182 | year={2021} 183 | } 184 | ``` 185 | 186 | ## Contact 187 | If you encounter any problems/bugs/issues, please contact me on Github or by emailing me at tarask@kth.se. I prefer questions and bug reports on Github, as that provides visibility to others who might be encountering the same issues or have the same questions.
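## Appendix: looping prediction over the test set

Below is a minimal sketch of how step 6 can be wrapped in a loop over all test sequences when using the proposed (encoded) model. It is only a sketch: the model file `model.hdf5`, the output folder `results/`, the checkpoint directory `chkpt/`, `DATA_DIR=data` and `DIM=325` are assumptions to be adapted to your setup, and whether `remove_velocity.py -g` accepts a directory should be checked against `helpers/README.md`. The tested versions of this loop live in `example_scripts/`.

```sh
# Sketch only: predict and decode every test sequence (assumed paths, see above)
mkdir -p results

for f in data/test_inputs/X_test_*.npy; do
    name=$(basename "$f" .npy)            # e.g. X_test_audio1168
    python predict.py model.hdf5 "$f" "results/${name}_encoded.txt"
    python motion_repr_learning/ae/decode.py data "results/${name}_encoded.txt" \
        "results/${name}.txt" -restore=True -pretrain=False \
        -layer1_width=325 -chkpt_dir=chkpt -batch_size=8
done

# The network outputs both coordinates and velocities;
# strip the velocities from the decoded gestures (see helpers/README.md)
python helpers/remove_velocity.py -g results
```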
188 | -------------------------------------------------------------------------------- /data_processing/add_noisy_data.sh: -------------------------------------------------------------------------------- 1 | # This script will add noise to the audio file in order to augment the dataset 2 | # It is used in the script "prepare_data.py" 3 | 4 | data=$4 5 | for i in `seq ${2} ${3}`; 6 | do 7 | echo "${i}" 8 | if [ -e ${data}/${1}/inputs/audio${i}.wav ] 9 | then 10 | sox ${data}/${1}/inputs/audio${i}.wav -p synth whitenoise vol 0.01 | sox -m ${data}/${1}/inputs/audio${i}.wav - ${data}/${1}/inputs/naudio${i}.wav 11 | echo "naudio generated in ${1} for id ${i}" 12 | else 13 | echo "could not generate noisy audio, because original audio at ${data} was not found" 14 | fi 15 | done 16 | -------------------------------------------------------------------------------- /data_processing/alt_prosody.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Alternative calculation of prosodic features 3 | """ 4 | Created on Tue Jan 15 18:45:34 2019 5 | 6 | @author: kaneko.naoshi 7 | """ 8 | 9 | import numpy as np 10 | import parselmouth as pm 11 | 12 | 13 | def compute_prosody(audio_filename, time_step=0.05): 14 | audio = pm.Sound(audio_filename) 15 | 16 | # Extract pitch and intensity 17 | pitch = audio.to_pitch(time_step=time_step) 18 | intensity = audio.to_intensity(time_step=time_step) 19 | 20 | # Evenly spaced time steps 21 | times = np.arange(0, audio.get_total_duration() - time_step, time_step) 22 | 23 | # Compute prosodic features at each time step 24 | pitch_values = np.nan_to_num( 25 | np.asarray([pitch.get_value_at_time(t) for t in times])) 26 | intensity_values = np.nan_to_num( 27 | np.asarray([intensity.get_value(t) for t in times])) 28 | 29 | intensity_values = np.clip( 30 | intensity_values, np.finfo(intensity_values.dtype).eps, None) 31 | 32 | # Normalize features [Chiu '11] 33 | pitch_norm = np.clip(np.log(pitch_values + 1) - 4, 0, None) 34 | intensity_norm = np.clip(np.log(intensity_values) - 3, 0, None) 35 | 36 | return pitch_norm, intensity_norm -------------------------------------------------------------------------------- /data_processing/create_vector.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script does preprocessing of the dataset specified in DATA_DIR 3 | and stores it in the same folder as .npy files 4 | It should be used before training, as described in the README.md 5 | 6 | @author: Taras Kucherenko 7 | """ 8 | 9 | import os 10 | import sys 11 | 12 | import pyquaternion as pyq 13 | 14 | from tools import * 15 | 16 | N_OUTPUT = 384 # Number of gesture features (position) 17 | WINDOW_LENGTH = 50 # in miliseconds 18 | FEATURES = "MFCC" 19 | 20 | if FEATURES == "MFCC": 21 | N_INPUT = 26 # Number of MFCC features 22 | if FEATURES == "Pros": 23 | N_INPUT = 4 # Number of prosodic features 24 | if FEATURES == "MFCC+Pros": 25 | N_INPUT = 30 # Total number of features 26 | if FEATURES == "Spectro": 27 | N_INPUT = 64 # Number of spectrogram features 28 | if FEATURES == "Spectro+Pros": 29 | N_INPUT = 68 # Total number of eatures 30 | if FEATURES == "MFCC+Spectro": 31 | N_INPUT = 90 # Total number of eatures 32 | if FEATURES == "MFCC+Spectro+Pros": 33 | N_INPUT = 94 # Total number of eatures 34 | 35 | 36 | def pad_sequence(input_vectors): 37 | """ 38 | Pad array of features in order to be able to take context at each time-frame 39 | We pad N_CONTEXT / 2 frames before 
and after the signal by the features of the silence 40 | Args: 41 | input_vectors: feature vectors for an audio 42 | 43 | Returns: 44 | new_input_vectors: padded feature vectors 45 | """ 46 | 47 | if FEATURES == "MFCC": 48 | 49 | # Pad sequence not with zeros but with MFCC of the silence 50 | 51 | silence_vectors = calculate_mfcc("data_processing/silence.wav") 52 | mfcc_empty_vector = silence_vectors[0] 53 | 54 | empty_vectors = np.array([mfcc_empty_vector] * int(N_CONTEXT / 2)) 55 | 56 | if FEATURES == "Pros": 57 | 58 | # Pad sequence with zeros 59 | 60 | prosodic_empty_vector =[0, 0, 0, 0] 61 | 62 | empty_vectors = np.array([prosodic_empty_vector] * int(N_CONTEXT / 2)) 63 | 64 | if FEATURES == "MFCC+Pros": 65 | 66 | silence_vectors = calculate_mfcc("data_processing/silence.wav") 67 | mfcc_empty_vector = silence_vectors[0] 68 | 69 | prosodic_empty_vector = [0, 0, 0, 0] 70 | 71 | combined_empty_vector = np.concatenate((mfcc_empty_vector, prosodic_empty_vector)) 72 | 73 | empty_vectors = np.array([combined_empty_vector] * int(N_CONTEXT / 2)) 74 | 75 | if FEATURES == "Spectro": 76 | 77 | silence_spectro = calculate_spectrogram("data_processing/silence.wav") 78 | spectro_empty_vector = silence_spectro[0] 79 | 80 | empty_vectors = np.array([spectro_empty_vector] * int(N_CONTEXT / 2)) 81 | 82 | if FEATURES == "Spectro+Pros": 83 | 84 | silence_spectro = calculate_spectrogram("data_processing/silence.wav") 85 | spectro_empty_vector = silence_spectro[0] 86 | 87 | prosodic_empty_vector = [0, 0, 0, 0] 88 | 89 | combined_empty_vector = np.concatenate((spectro_empty_vector, prosodic_empty_vector)) 90 | 91 | empty_vectors = np.array([combined_empty_vector] * int(N_CONTEXT / 2)) 92 | 93 | if FEATURES == "MFCC+Spectro": 94 | 95 | silence_spectro = calculate_spectrogram("data_processing/silence.wav") 96 | spectro_empty_vector = silence_spectro[0] 97 | 98 | silence_vectors = calculate_mfcc("data_processing/silence.wav") 99 | mfcc_empty_vector = silence_vectors[0] 100 | 101 | combined_empty_vector = np.concatenate((mfcc_empty_vector, spectro_empty_vector,)) 102 | 103 | empty_vectors = np.array([combined_empty_vector] * int(N_CONTEXT / 2)) 104 | 105 | if FEATURES == "MFCC+Spectro+Pros": 106 | 107 | silence_spectro = calculate_spectrogram("data_processing/silence.wav") 108 | spectro_empty_vector = silence_spectro[0] 109 | 110 | silence_vectors = calculate_mfcc("data_processing/silence.wav") 111 | mfcc_empty_vector = silence_vectors[0] 112 | 113 | prosodic_empty_vector = [0, 0, 0, 0] 114 | 115 | combined_empty_vector = np.concatenate((mfcc_empty_vector, spectro_empty_vector, prosodic_empty_vector)) 116 | 117 | empty_vectors = np.array([combined_empty_vector] * int(N_CONTEXT / 2)) 118 | 119 | # append N_CONTEXT/2 "empty" mfcc vectors to past 120 | new_input_vectors = np.append(empty_vectors, input_vectors, axis=0) 121 | # append N_CONTEXT/2 "empty" mfcc vectors to future 122 | new_input_vectors = np.append(new_input_vectors, empty_vectors, axis=0) 123 | 124 | return new_input_vectors 125 | 126 | def create_vectors(audio_filename, gesture_filename, nodes): 127 | """ 128 | Extract features from a given pair of audio and motion files 129 | Args: 130 | audio_filename: file name for an audio file (.wav) 131 | gesture_filename: file name for a motion file (.bvh) 132 | nodes: an array of markers for the motion 133 | 134 | Returns: 135 | input_with_context : speech features 136 | output_with_context : motion features 137 | """ 138 | # Step 1: Vactorizing speech, with features of N_INPUT dimension, time steps of 0.01s 
139 | # and window length with 0.025s => results in an array of 100 x N_INPUT 140 | 141 | if FEATURES == "MFCC": 142 | 143 | input_vectors = calculate_mfcc(audio_filename) 144 | 145 | if FEATURES == "Pros": 146 | 147 | input_vectors = extract_prosodic_features(audio_filename) 148 | 149 | if FEATURES == "MFCC+Pros": 150 | 151 | mfcc_vectors = calculate_mfcc(audio_filename) 152 | 153 | pros_vectors = extract_prosodic_features(audio_filename) 154 | 155 | mfcc_vectors, pros_vectors = shorten(mfcc_vectors, pros_vectors) 156 | 157 | input_vectors = np.concatenate((mfcc_vectors, pros_vectors), axis=1) 158 | 159 | if FEATURES =="Spectro": 160 | 161 | input_vectors = calculate_spectrogram(audio_filename) 162 | 163 | if FEATURES == "Spectro+Pros": 164 | spectr_vectors = calculate_spectrogram(audio_filename) 165 | 166 | pros_vectors = extract_prosodic_features(audio_filename) 167 | 168 | spectr_vectors, pros_vectors = shorten(spectr_vectors, pros_vectors) 169 | 170 | input_vectors = np.concatenate((spectr_vectors, pros_vectors), axis=1) 171 | 172 | if FEATURES == "MFCC+Spectro": 173 | 174 | spectr_vectors = calculate_spectrogram(audio_filename) 175 | 176 | mfcc_vectors = calculate_mfcc(audio_filename) 177 | 178 | spectr_vectors, mfcc_vectors = shorten(spectr_vectors, mfcc_vectors) 179 | 180 | input_vectors = np.concatenate((mfcc_vectors,spectr_vectors), axis=1) 181 | 182 | if FEATURES == "MFCC+Spectro+Pros": 183 | 184 | spectr_vectors = calculate_spectrogram(audio_filename) 185 | 186 | mfcc_vectors = calculate_mfcc(audio_filename) 187 | 188 | pros_vectors = extract_prosodic_features(audio_filename) 189 | 190 | spectr_vectors, mfcc_vectors, pros_vectors = shorten3(spectr_vectors, mfcc_vectors, pros_vectors) 191 | 192 | input_vectors = np.concatenate((mfcc_vectors,spectr_vectors, pros_vectors), axis=1) 193 | 194 | # Step 2: Read motions 195 | 196 | motion_format = "bvh" 197 | 198 | if motion_format == "npz": 199 | ges_str = np.load(gesture_filename) 200 | output_vectors = ges_str['clips'] 201 | 202 | # Subsample motion (from 60 fsp to 20 fsp) 203 | output_vectors = output_vectors[0::3] 204 | 205 | 206 | elif motion_format == "bvh": 207 | f = open(gesture_filename, 'r') 208 | org = f.readlines() 209 | frametime = org[310].split() 210 | 211 | del org[0:311] 212 | 213 | bvh_len = len(org) 214 | 215 | for idx, line in enumerate(org): 216 | org[idx] = [float(x) for x in line.split()] 217 | 218 | for i in range(0, bvh_len): 219 | for j in range(0, int(306 / 3)): 220 | st = j * 3 221 | del org[i][st:st + 3] 222 | 223 | # if data is 100fps, cut it to 20 fps (every fifth line) 224 | # if data is approx 24fps, cut it to 20 fps (del every sixth line) 225 | if float(frametime[2]) == 0.0416667: 226 | del org[::6] 227 | elif float(frametime[2]) == 0.010000: 228 | org = org[::5] 229 | else: 230 | print("smth wrong with fps of " + gesture_filename) 231 | 232 | output_vectors = rot_vec_to_abs_pos_vec(org, nodes) 233 | 234 | f.close() 235 | 236 | # Step 3: Align vector length 237 | input_vectors, output_vectors = shorten(input_vectors, output_vectors) 238 | 239 | # Step 4: Retrieve N_CONTEXT each time, stride one by one 240 | input_with_context = np.array([]) 241 | output_with_context = np.array([]) 242 | 243 | strides = len(input_vectors) 244 | 245 | input_vectors = pad_sequence(input_vectors) 246 | 247 | for i in range(strides): 248 | stride = i + int(N_CONTEXT/2) 249 | if i == 0: 250 | input_with_context = input_vectors[stride - int(N_CONTEXT/2) : stride + int(N_CONTEXT/2) + 1].reshape(1, N_CONTEXT+1, N_INPUT) 251 | 
output_with_context = output_vectors[i].reshape(1, N_OUTPUT) 252 | else: 253 | input_with_context = np.append(input_with_context, input_vectors[stride - int(N_CONTEXT/2) : stride + int(N_CONTEXT/2) + 1].reshape(1, N_CONTEXT+1, N_INPUT), axis=0) 254 | output_with_context = np.append(output_with_context, output_vectors[i].reshape(1, N_OUTPUT), axis=0) 255 | 256 | return input_with_context, output_with_context 257 | 258 | 259 | def create_hierarchy_nodes(hierarchy): 260 | """ 261 | Create hierarchy nodes: an array of markers used in the motion capture 262 | Args: 263 | hierarchy: bvh file read in a structure 264 | 265 | Returns: 266 | nodes: array of markers to be used in motion processing 267 | 268 | """ 269 | joint_offsets = [] 270 | joint_names = [] 271 | 272 | for idx, line in enumerate(hierarchy): 273 | hierarchy[idx] = hierarchy[idx].split() 274 | if not len(hierarchy[idx]) == 0: 275 | line_type = hierarchy[idx][0] 276 | if line_type == 'OFFSET': 277 | offset = np.array([float(hierarchy[idx][1]), float(hierarchy[idx][2]), float(hierarchy[idx][3])]) 278 | joint_offsets.append(offset) 279 | elif line_type == 'ROOT' or line_type == 'JOINT': 280 | joint_names.append(hierarchy[idx][1]) 281 | elif line_type == 'End': 282 | joint_names.append('End Site') 283 | 284 | nodes = [] 285 | for idx, name in enumerate(joint_names): 286 | if idx == 0: 287 | parent = None 288 | elif idx in [6, 30]: #spine1->shoulders 289 | parent = 2 290 | elif idx in [14, 18, 22, 26]: #lefthand->leftfingers 291 | parent = 9 292 | elif idx in [38, 42, 46, 50]: #righthand->rightfingers 293 | parent = 33 294 | elif idx in [54, 59]: #hip->legs 295 | parent = 0 296 | else: 297 | parent = idx - 1 298 | 299 | if name == 'End Site': 300 | children = None 301 | elif idx == 0: #hips 302 | children = [1, 54, 59] 303 | elif idx == 2: #spine1 304 | children = [3, 6, 30] 305 | elif idx == 9: #lefthand 306 | children = [10, 14, 18, 22, 26] 307 | elif idx == 33: #righthand 308 | children = [34, 38, 42, 46, 50] 309 | else: 310 | children = [idx + 1] 311 | 312 | node = dict([('name', name), ('parent', parent), ('children', children), ('offset', joint_offsets[idx]), ('rel_degs', None), ('abs_qt', None), ('rel_pos', None), ('abs_pos', None)]) 313 | if idx == 0: 314 | node['rel_pos'] = node['abs_pos'] = [float(0), float(60), float(0)] 315 | node['abs_qt'] = pyq.Quaternion() 316 | nodes.append(node) 317 | 318 | return nodes 319 | 320 | 321 | def rot_vec_to_abs_pos_vec(frames, nodes): 322 | """ 323 | Transform vectors of the human motion from the joint angles to the absolute positions 324 | Args: 325 | frames: human motion in the join angles space 326 | nodes: set of markers used in motion caption 327 | 328 | Returns: 329 | output_vectors : 3d coordinates of this human motion 330 | """ 331 | output_lines = [] 332 | 333 | for frame in frames: 334 | node_idx = 0 335 | for i in range(51): #changed from 51 336 | stepi = i*3 337 | z_deg = float(frame[stepi]) 338 | x_deg = float(frame[stepi+1]) 339 | y_deg = float(frame[stepi+2]) 340 | 341 | if nodes[node_idx]['name'] == 'End Site': 342 | node_idx = node_idx + 1 343 | nodes[node_idx]['rel_degs'] = [z_deg, x_deg, y_deg] 344 | current_node = nodes[node_idx] 345 | 346 | node_idx = node_idx + 1 347 | 348 | for start_node in nodes: 349 | abs_pos = np.array([0, 60, 0]) 350 | current_node = start_node 351 | if start_node['children'] is not None: #= if not start_node['name'] = 'end site' 352 | for child_idx in start_node['children']: 353 | child_node = nodes[child_idx] 354 | 355 | child_offset = 
np.array(child_node['offset']) 356 | qz = pyq.Quaternion(axis=[0, 0, 1], degrees=start_node['rel_degs'][0]) 357 | qx = pyq.Quaternion(axis=[1, 0, 0], degrees=start_node['rel_degs'][1]) 358 | qy = pyq.Quaternion(axis=[0, 1, 0], degrees=start_node['rel_degs'][2]) 359 | qrot = qz * qx * qy 360 | offset_rotated = qrot.rotate(child_offset) 361 | child_node['rel_pos']= start_node['abs_qt'].rotate(offset_rotated) 362 | 363 | child_node['abs_qt'] = start_node['abs_qt'] * qrot 364 | 365 | while current_node['parent'] is not None: 366 | 367 | abs_pos = abs_pos + current_node['rel_pos'] 368 | current_node = nodes[current_node['parent']] 369 | start_node['abs_pos'] = abs_pos 370 | 371 | line = [] 372 | for node in nodes: 373 | line.append(node['abs_pos']) 374 | output_lines.append(line) 375 | 376 | output_vels = [] 377 | for idx, line in enumerate(output_lines): 378 | vel_line = [] 379 | for jn, joint_pos in enumerate(line): 380 | if idx == 0: 381 | vels = np.array([0.0, 0.0, 0.0]) 382 | else: 383 | vels = np.array([joint_pos[0] - output_lines[idx-1][jn][0], joint_pos[1] - output_lines[idx-1][jn][1], joint_pos[2] - output_lines[idx-1][jn][2]]) 384 | vel_line.append(vels) 385 | output_vels.append(vel_line) 386 | 387 | out = [] 388 | for idx, line in enumerate(output_vels): 389 | ln = [] 390 | for jn, joint_vel in enumerate(line): 391 | ln.append(output_lines[idx][jn]) 392 | ln.append(joint_vel) 393 | out.append(ln) 394 | 395 | output_array = np.asarray(out) 396 | output_vectors = np.empty([len(output_array), N_OUTPUT]) 397 | for idx, line in enumerate(output_array): 398 | output_vectors[idx] = line.flatten() 399 | return output_vectors 400 | 401 | 402 | def create(name, nodes): 403 | """ 404 | Create a dataset 405 | Args: 406 | name: dataset: 'train' or 'test' or 'dev 407 | nodes: markers used in motion caption 408 | 409 | Returns: 410 | nothing: saves numpy arrays of the features and labels as .npy files 411 | 412 | """ 413 | DATA_FILE = pd.read_csv(DATA_DIR + '/gg-' + str(name) + '.csv') 414 | X = np.array([]) 415 | Y = np.array([]) 416 | 417 | for i in range(len(DATA_FILE)): 418 | input_vectors, output_vectors = create_vectors(DATA_FILE['wav_filename'][i], DATA_FILE['bvh_filename'][i], nodes) 419 | 420 | if len(X) == 0: 421 | X = input_vectors 422 | Y = output_vectors 423 | else: 424 | X = np.concatenate((X, input_vectors), axis=0) 425 | Y = np.concatenate((Y, output_vectors), axis=0) 426 | 427 | if i%3==0: 428 | print("^^^^^^^^^^^^^^^^^^") 429 | print('{:.2f}% of processing for {:.8} dataset is done'.format(100.0 * (i+1) / len(DATA_FILE), str(name))) 430 | print("Current dataset sizes are:") 431 | print(X.shape) 432 | print(Y.shape) 433 | 434 | x_file_name = DATA_DIR + '/X_' + str(name) + '.npy' 435 | y_file_name = DATA_DIR + '/Y_' + str(name) + '.npy' 436 | np.save(x_file_name, X) 437 | np.save(y_file_name, Y) 438 | 439 | 440 | def create_test_sequences(nodes, dataset): 441 | """ 442 | Create test sequences 443 | Args: 444 | nodes: markers used in motion caption 445 | dataset: dataset name ('train', 'test' or 'dev') 446 | 447 | Returns: 448 | nothing, saves dataset into .npy file 449 | 450 | """ 451 | DATA_FILE = pd.read_csv(DATA_DIR + '/gg-'+dataset+'.csv') 452 | 453 | for i in range(len(DATA_FILE)): 454 | input_vectors, output_vectors = create_vectors(DATA_FILE['wav_filename'][i], DATA_FILE['bvh_filename'][i], nodes) 455 | 456 | array = DATA_FILE['wav_filename'][i].split("/") 457 | name = array[len(array)-1].split(".")[0] 458 | 459 | X = input_vectors 460 | 461 | if not os.path.isdir(DATA_DIR + 
'/'+dataset+'_inputs'): 462 | os.makedirs(DATA_DIR + '/'+dataset+'_inputs') 463 | 464 | x_file_name = DATA_DIR + '/'+dataset+'_inputs/X_test_' + name + '.npy' 465 | 466 | np.save(x_file_name, X) 467 | 468 | 469 | if __name__ == "__main__": 470 | 471 | # Check if script get enough parameters 472 | if len(sys.argv) < 3: 473 | raise ValueError('Not enough paramters! \nUsage : python ' + sys.argv[0].split("/")[-1] + ' DATA_DIR N_CONTEXT') 474 | 475 | # Check if the dataset exists 476 | if not os.path.exists(sys.argv[1]): 477 | raise ValueError( 478 | 'Path to the dataset ({}) does not exist!\nPlease, provide correct DATA_DIR as a script parameter' 479 | ''.format(sys.argv[1])) 480 | 481 | DATA_DIR = sys.argv[1] 482 | N_CONTEXT = int(sys.argv[2]) 483 | f = open('hierarchy.txt', 'r') 484 | hierarchy = f.readlines() 485 | f.close() 486 | nodes = create_hierarchy_nodes(hierarchy) 487 | 488 | create_test_sequences(nodes, 'test') 489 | create('test', nodes) 490 | create('dev', nodes) 491 | create('train', nodes) 492 | -------------------------------------------------------------------------------- /data_processing/prepare_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script is used to split the dataset into train, test and dev 3 | More info on its usage is given in the READ.me file 4 | 5 | @author: Taras Kucherenko 6 | """ 7 | 8 | import sys 9 | import os 10 | import shutil 11 | import pandas 12 | from os import path 13 | 14 | sys.path.insert(1, os.path.join(sys.path[0], '..')) 15 | 16 | NUM_OF_TEST = 90 17 | FIRST_DATA_ID = 20 18 | LAST_DATA_ID = 1182 19 | 20 | AUGMENT = True 21 | 22 | 23 | def _split_and_format_data(data_dir): 24 | 25 | if not os.path.isdir(data_dir): 26 | os.makedirs(data_dir) 27 | _download_datasets(data_dir) 28 | 29 | 30 | def _download_datasets(data_dir): 31 | 32 | _create_dir(data_dir) 33 | 34 | # prepare training data (including validation data) 35 | for i in range (FIRST_DATA_ID, LAST_DATA_ID - NUM_OF_TEST): 36 | filename = "audio" + str(i) + ".wav" 37 | original_file_path = path.join("dataset/speech/" + filename) 38 | if os.path.exists(original_file_path): 39 | target_file_path = path.join(data_dir + "train/inputs/" + filename) 40 | print(target_file_path) 41 | shutil.copy(original_file_path, target_file_path) 42 | else: 43 | print(original_file_path + " does not exist") 44 | filename = "gesture" + str(i) + ".bvh" 45 | original_file_path = path.join("dataset/motion/" + filename) 46 | if os.path.exists(original_file_path): 47 | target_file_path = path.join(data_dir + "train/labels/" + filename) 48 | print(target_file_path) 49 | shutil.copy(original_file_path, target_file_path) 50 | else: 51 | print(original_file_path + " does not exist") 52 | 53 | # prepare test data 54 | for i in range(LAST_DATA_ID - NUM_OF_TEST, LAST_DATA_ID + 1,2): 55 | filename = "audio" + str(i) + ".wav" 56 | original_file_path = path.join("dataset/speech/" + filename) 57 | if os.path.exists(original_file_path): 58 | target_file_path = path.join(data_dir + "test/inputs/" + filename) 59 | print(target_file_path) 60 | shutil.copy(original_file_path, target_file_path) 61 | else: 62 | print(original_file_path + " does not exist") 63 | filename = "gesture" + str(i) + ".bvh" 64 | original_file_path = path.join("dataset/motion/" + filename) 65 | if os.path.exists(original_file_path): 66 | target_file_path = path.join(data_dir + "test/labels/" + filename) 67 | print(target_file_path) 68 | shutil.copy(original_file_path, target_file_path) 69 | else: 
70 | print(original_file_path + " does not exist") 71 | 72 | # prepare dev data (does not affect results of training at all) 73 | for i in range(LAST_DATA_ID - NUM_OF_TEST + 1, LAST_DATA_ID + 1, 2): 74 | filename = "audio" + str(i) + ".wav" 75 | original_file_path = path.join("dataset/speech/" + filename) 76 | if os.path.exists(original_file_path): 77 | target_file_path = path.join(data_dir + "dev/inputs/" + filename) 78 | print(target_file_path) 79 | shutil.copy(original_file_path, target_file_path) 80 | else: 81 | print(original_file_path + " does not exist") 82 | filename = "gesture" + str(i) + ".bvh" 83 | original_file_path = path.join("dataset/motion/" + filename) 84 | if os.path.exists(original_file_path): 85 | target_file_path = path.join(data_dir + "dev/labels/" + filename) 86 | print(target_file_path) 87 | shutil.copy(original_file_path, target_file_path) 88 | else: 89 | print(original_file_path + " does not exist") 90 | 91 | # data augmentation 92 | if AUGMENT: 93 | os.system('./data_processing/add_noisy_data.sh {0} {1} {2} {3}'.format("train", FIRST_DATA_ID, LAST_DATA_ID-NUM_OF_TEST, data_dir)) 94 | 95 | extracted_dir = path.join(data_dir) 96 | 97 | dev_files, train_files, test_files = _format_datasets(extracted_dir) 98 | 99 | dev_files.to_csv(path.join(extracted_dir, "gg-dev.csv"), index=False) 100 | train_files.to_csv(path.join(extracted_dir, "gg-train.csv"), index=False) 101 | test_files.to_csv(path.join(extracted_dir, "gg-test.csv"), index=False) 102 | 103 | 104 | def _create_dir(data_dir): 105 | 106 | dir_names = ["train", "test", "dev"] 107 | sub_dir_names = ["inputs", "labels"] 108 | 109 | # create ../data_dir/[train, test, dev]/[inputs, labels] 110 | for dir_name in dir_names: 111 | dir_path = path.join(data_dir, dir_name) 112 | print(dir_path) 113 | if not os.path.isdir(dir_path): 114 | os.makedirs(dir_path) # ../data/train 115 | 116 | for sub_dir_name in sub_dir_names: 117 | dir_path = path.join(data_dir, dir_name, sub_dir_name) 118 | print(dir_path) 119 | if not os.path.isdir(dir_path): 120 | os.makedirs(dir_path) 121 | 122 | 123 | def _format_datasets(extracted_dir): 124 | train_files = _files_to_pandas_dataframe(extracted_dir, "train", range(FIRST_DATA_ID, LAST_DATA_ID - NUM_OF_TEST)) 125 | test_files = _files_to_pandas_dataframe(extracted_dir, "test", range(LAST_DATA_ID - NUM_OF_TEST, LAST_DATA_ID + 1, 2)) 126 | dev_files = _files_to_pandas_dataframe(extracted_dir, "dev", range(LAST_DATA_ID - NUM_OF_TEST+1, LAST_DATA_ID + 1,2)) 127 | 128 | return dev_files, train_files, test_files 129 | 130 | 131 | def _files_to_pandas_dataframe(extracted_dir, set_name, idx_range): 132 | files = [] 133 | for idx in idx_range: 134 | # original files 135 | try: 136 | input_file = path.abspath(path.join(extracted_dir, set_name, "inputs", "audio" + str(idx) + ".wav")) 137 | except OSError: 138 | continue 139 | try: 140 | label_file = path.abspath(path.join(extracted_dir, set_name, "labels", "gesture" + str(idx) + ".bvh")) 141 | except OSError: 142 | continue 143 | try: 144 | wav_size = path.getsize(input_file) 145 | except OSError: 146 | continue 147 | 148 | files.append((input_file, wav_size, label_file)) 149 | 150 | # noisy files 151 | try: 152 | noisy_input_file = path.abspath(path.join(extracted_dir, set_name, "inputs", "naudio" + str(idx) + ".wav")) 153 | except OSError: 154 | continue 155 | try: 156 | noisy_wav_size = path.getsize(noisy_input_file) 157 | except OSError: 158 | continue 159 | print(str(idx)) 160 | 161 | files.append((noisy_input_file, noisy_wav_size, label_file)) 
162 | 163 | return pandas.DataFrame(data=files, columns=["wav_filename", "wav_filesize", "bvh_filename"]) 164 | 165 | 166 | if __name__ == "__main__": 167 | _split_and_format_data(sys.argv[1]) 168 | 169 | -------------------------------------------------------------------------------- /data_processing/silence.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/genea-workshop/Speech_driven_gesture_generation_with_autoencoder/59e8ebdb0f6e87b0e81268046c99a4d6c9bf62a8/data_processing/silence.wav -------------------------------------------------------------------------------- /data_processing/tools.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script contains supporting function for the data processing. 3 | It is used in several other scripts: 4 | for calculation of speech features, aligning sequences and generating bvh files 5 | """ 6 | 7 | import ctypes 8 | 9 | import librosa 10 | import librosa.display 11 | import matplotlib.pyplot as plt 12 | import numpy as np 13 | import pandas as pd 14 | # Acoustic signal processing 15 | import scipy.io.wavfile as wav 16 | from pydub import AudioSegment 17 | from python_speech_features import mfcc 18 | import scipy 19 | 20 | from alt_prosody import compute_prosody 21 | 22 | MFCC_INPUTS=26 # How many features we will store for each MFCC vector 23 | WINDOW_LENGTH = 0.1 24 | 25 | 26 | def create_bvh(filename, prediction, frame_time): 27 | """ 28 | Create BVH File 29 | Args: 30 | filename: file, in which motion in bvh format should be written 31 | prediction: motion sequences, to be written into file 32 | frame_time: frame rate of the motion 33 | Returns: 34 | nothing, writes motion to the file 35 | """ 36 | with open('hformat.txt', 'r') as ftemp: 37 | hformat = ftemp.readlines() 38 | 39 | with open(filename, 'w') as fo: 40 | prediction = np.squeeze(prediction) 41 | print("output vector shape: " + str(prediction.shape)) 42 | offset = [0, 60, 0] 43 | offset_line = "\tOFFSET " + " ".join("{:.6f}".format(x) for x in offset) + '\n' 44 | fo.write("HIERARCHY\n") 45 | fo.write("ROOT Hips\n") 46 | fo.write("{\n") 47 | fo.write(offset_line) 48 | fo.writelines(hformat) 49 | fo.write("MOTION\n") 50 | fo.write("Frames: " + str(len(prediction)) + '\n') 51 | fo.write("Frame Time: " + frame_time + "\n") 52 | for row in prediction: 53 | row[0:3] = 0 54 | legs = np.zeros(24) 55 | row = np.concatenate((row, legs)) 56 | label_line = " ".join("{:.6f}".format(x) for x in row) + " " 57 | fo.write(label_line + '\n') 58 | print("bvh generated") 59 | 60 | def shorten(arr1, arr2): 61 | min_len = min(len(arr1), len(arr2)) 62 | 63 | arr1 = arr1[:min_len] 64 | arr2 = arr2[:min_len] 65 | 66 | return arr1, arr2 67 | 68 | def shorten3(arr1, arr2, arr3): 69 | min_len = min(len(arr1), len(arr2), len(arr3)) 70 | 71 | arr1 = arr1[:min_len] 72 | arr2 = arr2[:min_len] 73 | arr3 = arr3[:min_len] 74 | 75 | return arr1, arr2, arr3 76 | 77 | 78 | def average(arr, n): 79 | """ Replace every "n" values by their average 80 | Args: 81 | arr: input array 82 | n: number of elements to average on 83 | Returns: 84 | resulting array 85 | """ 86 | end = n * int(len(arr)/n) 87 | return np.mean(arr[:end].reshape(-1, n), 1) 88 | 89 | 90 | def calculate_mfcc(audio_filename): 91 | """ 92 | Calculate MFCC features for the audio in a given file 93 | Args: 94 | audio_filename: file name of the audio 95 | 96 | Returns: 97 | feature_vectors: MFCC feature vector for the given audio file 98 | """ 99 
| fs, audio = wav.read(audio_filename) 100 | 101 | # Make stereo audio being mono 102 | if len(audio.shape) == 2: 103 | audio = (audio[:, 0] + audio[:, 1]) / 2 104 | 105 | # Calculate MFCC feature with the window frame it was designed for 106 | input_vectors = mfcc(audio, winlen=0.02, winstep=0.01, samplerate=fs, numcep=MFCC_INPUTS) 107 | 108 | input_vectors = [average(input_vectors[:, i], 5) for i in range(MFCC_INPUTS)] 109 | 110 | feature_vectors = np.transpose(input_vectors) 111 | 112 | return feature_vectors 113 | 114 | def get_energy_level(sound, win_len): 115 | """ Calculate energy signal of an audio object 116 | Args: 117 | sound: AudioSegment object with the audio signal 118 | win_len: length of the window for the energy calculations 119 | Returns: 120 | energy: the energy of the signal 121 | """ 122 | 123 | loudness = list([]) 124 | 125 | length = len(sound) - win_len 126 | 127 | # Split signal into short chunks and get energy of each of them 128 | for i in range(0, length, win_len): 129 | current_segment = sound[i:i + win_len] 130 | loudness.append(current_segment.rms) 131 | 132 | # Append the last segment, which was not considered 133 | loudness.append(0) 134 | 135 | energy = np.array(loudness) 136 | 137 | return energy 138 | 139 | 140 | def derivative(x, f): 141 | """ Calculate numerical derivative (by FDM) of a 1d array 142 | Args: 143 | x: input space x 144 | f: Function of x 145 | Returns: 146 | der: numerical derivative of f wrt x 147 | """ 148 | 149 | x = 1000 * x # from seconds to milliseconds 150 | 151 | # Normalization: 152 | dx = (x[1] - x[0]) 153 | 154 | cf = np.convolve(f, [1, -1]) / dx 155 | 156 | # Remove unstable values 157 | der = cf[:-1].copy() 158 | der[0] = 0 159 | 160 | return der 161 | 162 | 163 | def calculate_pitch(audio_filename): 164 | """ Calculate F0 contour of a given speech file 165 | Args: 166 | audio_filename: address of a speech file 167 | Returns: 168 | F0 contour in a log scale and flag indicating weather F0 existed 169 | """ 170 | 171 | fs, audio = wav.read(audio_filename) 172 | 173 | # Make stereo audio being mono 174 | if len(audio.shape) == 2: 175 | audio =( (audio[:, 0] + audio[:, 1]) / 2 ).astype(ctypes.c_int16) 176 | 177 | plot = False 178 | 179 | WINDOW_LENGTH = 5 180 | pm_times, pm, f0_times, f0, corr = pyreaper.reaper(audio, fs=fs, minf0=80, maxf0=250) 181 | 182 | # Remove unstable values 183 | f0 = f0[1:-1].copy() 184 | 185 | # Get an indication if F0 exists 186 | f0[f0 == -1] = np.nan 187 | F0_exists = 1 - np.isnan(f0).astype(int) 188 | 189 | # Interpolate pitch values 190 | ts = pd.Series(f0, index=range(f0.shape[0])) 191 | ts = ts.interpolate(method='linear', downcast='infer')\ 192 | 193 | f0 = ts.values 194 | 195 | nans = np.isnan(f0).tolist() 196 | 197 | # Extrapolate at the beginning 198 | if False in nans: 199 | first_value = nans.index(False) 200 | first_nans = nans[0:first_value] 201 | for time in range(len(first_nans)): 202 | f0[time] = f0[first_value] 203 | 204 | # Extrapolate at the end 205 | if True in nans[first_value:]: 206 | last_value = nans[first_value:].index(True) 207 | last_nans = nans[last_value:] 208 | for time in range(len(last_nans)): 209 | f0[-time] = f0[last_value] 210 | 211 | if plot: 212 | 213 | plt.plot(f0, linewidth=3, label="F0") 214 | plt.title("F0 results") 215 | plt.show() 216 | 217 | # Convert to the log scale 218 | F0_contour = np.log2(f0+1) 219 | return F0_contour, F0_exists 220 | 221 | 222 | def extract_prosodic_features(audio_filename): 223 | """ 224 | Extract all 5 prosodic features 225 | Args: 
226 | audio_filename: file name for the audio to be used 227 | Returns: 228 | pros_feature: energy, energy_der, pitch, pitch_der, pitch_ind 229 | """ 230 | 231 | WINDOW_LENGTH = 5 232 | 233 | # Read audio from file 234 | sound = AudioSegment.from_file(audio_filename, format="wav") 235 | 236 | # Alternative prosodic features 237 | pitch, energy = compute_prosody(audio_filename, WINDOW_LENGTH / 1000) 238 | 239 | duration = len(sound) / 1000 240 | t = np.arange(0, duration, WINDOW_LENGTH / 1000) 241 | 242 | energy_der = derivative(t, energy) 243 | pitch_der = derivative(t, pitch) 244 | 245 | # Average everything in order to match the frequency 246 | energy = average(energy, 10) 247 | energy_der = average(energy_der, 10) 248 | pitch = average(pitch, 10) 249 | pitch_der = average(pitch_der, 10) 250 | 251 | # Cut them to the same size 252 | min_size = min(len(energy), len(energy_der), len(pitch_der), len(pitch_der)) 253 | energy = energy[:min_size] 254 | energy_der = energy_der[:min_size] 255 | pitch = pitch[:min_size] 256 | pitch_der = pitch_der[:min_size] 257 | 258 | # Stack them all together 259 | pros_feature = np.stack((energy, energy_der, pitch, pitch_der))#, pitch_ind)) 260 | 261 | # And reshape 262 | pros_feature = np.transpose(pros_feature) 263 | 264 | return pros_feature 265 | 266 | 267 | def calculate_spectrogram(audio_filename): 268 | """ Calculate spectrogram for the audio file 269 | Args: 270 | audio_filename: audio file name 271 | Returns: 272 | log spectrogram values 273 | """ 274 | 275 | DIM = int(64) 276 | 277 | audio, sample_rate = librosa.load(audio_filename) 278 | 279 | # Make stereo audio being mono 280 | if len(audio.shape) == 2: 281 | audio = (audio[:, 0] + audio[:, 1]) / 2 282 | 283 | spectr = librosa.feature.melspectrogram(audio, sr=sample_rate, #window = scipy.signal.hanning, 284 | hop_length = int(WINDOW_LENGTH* sample_rate / 2), 285 | fmax=7500, fmin=100, n_mels=DIM) 286 | 287 | # Shift into the log scale 288 | eps = 1e-10 289 | log_spectr = np.log(abs(spectr)+eps) 290 | 291 | return np.transpose(log_spectr) 292 | -------------------------------------------------------------------------------- /data_processing/tools.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/genea-workshop/Speech_driven_gesture_generation_with_autoencoder/59e8ebdb0f6e87b0e81268046c99a4d6c9bf62a8/data_processing/tools.pyc -------------------------------------------------------------------------------- /data_processing/white_noise.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/genea-workshop/Speech_driven_gesture_generation_with_autoencoder/59e8ebdb0f6e87b0e81268046c99a4d6c9bf62a8/data_processing/white_noise.wav -------------------------------------------------------------------------------- /evaluation/README.md: -------------------------------------------------------------------------------- 1 | # How to use the evaluation script 2 | 3 | This directory provides the scripts for quantitative evaluation of our gesture generation framework. We support the following measures: 4 | - Average Position Error (APE) 5 | - Mean Absolute Error (MAE) 6 | - Average Jerk (AJ) 7 | - Average Acceleration (AA) 8 | - Histogram of Moving Distance (HMD, for velocity/acceleration) 9 | 10 | ## Data preparation 11 | 1. Use `../helpers/remove_velocity.py` to delete velocities from predicted data. 12 | 2. Use `../helpers/convert_original.py` to create original data. 
13 | 14 | This produces gesture files containing `(x, y, z) x 64 joints = 192` whitespace-separated values on each line. 15 | 16 | 3. (optional) Use `../helpers/apply_filters.py` to smooth the predicted data. 17 | 18 | ## Directory organization 19 | 20 | We assume original/predicted gesture data are stored as follows: 21 | 22 | ``` 23 | -- evaluation/ 24 | |-- calc_distance.py 25 | |-- calc_errors.py 26 | |-- calc_jerk.py 27 | |-- joints.txt 28 | |-- data/ 29 | |-- original/ 30 | |-- gesture1093.txt, gesture1095.txt, ... 31 | |-- predicted/ 32 | |-- your_prediction_dir/ 33 | |-- gesture1093.txt, gesture1095.txt, ... 34 | ``` 35 | 36 | **Important Note: You have to store the gesture files of the same indices in the `original` and `predicted` directories. 37 | If you have gestures 1093, 1095, ... in the `original` directory, but gestures 1094, 1096, ... in `predicted`, you will get wrong results.** 38 | 39 | ## Run 40 | 41 | `calc_errors.py`, `calc_jerk.py`, and `calc_distance.py` support different quantitative measures, described below. 42 | 43 | The `--gesture` or `-g` option specifies the predicted directory under `data/predicted`. If you store the predicted gesture files in `data/predicted/your_prediction_dir/`, use `-g your_prediction_dir`. 44 | 45 | ### APE/MAE 46 | 47 | Average Position Error (APE) and Mean Absolute Error (MAE) indicate the prediction errors against the original gestures. 48 | 49 | To calculate APE/MAE, you can use `calc_errors.py`. 50 | You can select the metric to compute with the `--metric` or `-m` option (default: ape). 51 | 52 | ```sh 53 | # Compute APE 54 | python calc_errors.py -g your_prediction_dir -m ape 55 | 56 | # Compute MAE 57 | python calc_errors.py -g your_prediction_dir -m mae 58 | ``` 59 | 60 | ### AJ/AA 61 | 62 | Average Jerk (AJ) and Average Acceleration (AA) represent the characteristics of gesture motion. 63 | 64 | To calculate AJ/AA, you can use `calc_jerk.py`. 65 | You can select the measure to compute with the `--measure` or `-m` option (default: jerk). 66 | 67 | ```sh 68 | # Compute AJ 69 | python calc_jerk.py -g your_prediction_dir -m jerk 70 | 71 | # Compute AA 72 | python calc_jerk.py -g your_prediction_dir -m acceleration 73 | ``` 74 | 75 | Note: `calc_jerk.py` computes AJ/AA for both original and predicted gestures. The AJ/AA of the original gestures will be stored in `result/original` by default. The AJ/AA of the predicted gestures will be stored in `result/your_prediction_dir`. 76 | 77 | ### HMD 78 | 79 | Histogram of Moving Distance (HMD) shows the velocity/acceleration distribution of gesture motion. 80 | 81 | To calculate HMD, you can use `calc_distance.py`. 82 | You can select the measure to compute with the `--measure` or `-m` option (default: velocity). 83 | In addition, this script supports histogram visualization. To enable visualization, use the `--visualize` or `-v` option. 84 | 85 | ```sh 86 | # Compute velocity histogram 87 | python calc_distance.py -g your_prediction_dir -m velocity -w 0.05 # You can change the bin width of the histogram 88 | 89 | # Compute acceleration histogram 90 | python calc_distance.py -g your_prediction_dir -m acceleration -w 0.05 91 | ``` 92 | 93 | Note: `calc_distance.py` computes HMD for both original and predicted gestures. The HMD of the original gestures will be stored in `result/original` by default. 94 | 95 | ### Calculate evaluation measures for specific joints 96 | You can use the `-s` option with all evaluation scripts to select specific joints, e.g. 
`-s Head LeftLeg RightLeg` 97 | Here is a table for the joint names: 98 | 99 | | Joint to Calculate | Corresponding Name | 100 | | --- | --- | 101 | | Head | Head | 102 | | Neck | Neck | 103 | | Left Shoulder | LeftArm | 104 | | Left Elobow | LeftForeArm | 105 | | Left Wrist | LeftHand | 106 | | Right Shoulder | RightArm | 107 | | Right Elobow | RightForeArm | 108 | | Right Wrist | RightHand | 109 | | Left Hip | LeftUpLeg | 110 | | Left Knee | LeftLeg | 111 | | Left Ankle | LeftFoot | 112 | | Right Hip | RightUpLeg | 113 | | Right Knee | RightLeg | 114 | | Right Ankle | RightFoot | 115 | 116 | When you calculate the velocity histogram for both elbows, use 117 | ```sh 118 | python calc_distance.py -g your_prediction_dir -m velocity -w 0.05 -s LeftForeArm RightForeArm 119 | ``` 120 | -------------------------------------------------------------------------------- /evaluation/calc_distance.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Calculating statistics over the produced and ground truth gestures 4 | 5 | @author: kaneko.naoshi 6 | """ 7 | 8 | import argparse 9 | import glob 10 | import os 11 | import warnings 12 | 13 | import matplotlib.pyplot as plt 14 | import numpy as np 15 | 16 | 17 | def read_joint_names(filename): 18 | """Read motion capture's body joint names from file 19 | 20 | Args: 21 | filename: file name to read 22 | 23 | Returns: 24 | joint_names: list of joint names 25 | """ 26 | 27 | with open(filename, 'r') as f: 28 | org = f.read() 29 | joint_names = org.split(',') 30 | 31 | return joint_names 32 | 33 | 34 | def compute_velocity(data, dim=3): 35 | """Compute velocity between adjacent frames 36 | 37 | Args: 38 | data: array containing joint positions of gesture 39 | dim: gesture dimensionality 40 | 41 | Returns: 42 | vel_norms: velocities of each joint between each adjacent frame 43 | """ 44 | 45 | # First derivative of position is velocity 46 | vels = np.diff(data, n=1, axis=0) 47 | 48 | num_vels = vels.shape[0] 49 | num_joints = vels.shape[1] // dim 50 | 51 | vel_norms = np.zeros((num_vels, num_joints)) 52 | 53 | for i in range(num_vels): 54 | for j in range(num_joints): 55 | x1 = j * dim + 0 56 | x2 = j * dim + dim 57 | vel_norms[i, j] = np.linalg.norm(vels[i, x1:x2]) 58 | 59 | return vel_norms 60 | 61 | 62 | def compute_acceleration(data, dim=3): 63 | """Compute acceleration between adjacent frames 64 | 65 | Args: 66 | data: array containing joint positions of gesture 67 | dim: gesture dimensionality 68 | 69 | Returns: 70 | acc_norms: accelerations of each joint between each adjacent frame 71 | """ 72 | 73 | # Second derivative of position is acceleration 74 | accs = np.diff(data, n=2, axis=0) 75 | 76 | num_accs = accs.shape[0] 77 | num_joints = accs.shape[1] // dim 78 | 79 | acc_norms = np.zeros((num_accs, num_joints)) 80 | 81 | for i in range(num_accs): 82 | for j in range(num_joints): 83 | x1 = j * dim + 0 84 | x2 = j * dim + dim 85 | acc_norms[i, j] = np.linalg.norm(accs[i, x1:x2]) 86 | 87 | return acc_norms 88 | 89 | 90 | def save_result(lines, out_dir, width, measure): 91 | """Write computed histogram to CSV 92 | 93 | Args: 94 | lines: list of strings to be written 95 | out_dir: output directory 96 | width: bin width of the histogram 97 | measure: used measure for histogram calculation 98 | """ 99 | 100 | # Make output directory 101 | if not os.path.exists(out_dir): 102 | os.makedirs(out_dir) 103 | 104 | hist_type = measure[:3] # 'vel' or 'acc' 105 | filename = 
'hmd_{}_{}.csv'.format(hist_type, width) 106 | outname = os.path.join(out_dir, filename) 107 | 108 | with open(outname, 'w') as out_file: 109 | out_file.writelines(lines) 110 | 111 | print('More detailed result was writen to the file: ' + outname) 112 | print('') 113 | 114 | 115 | def main(): 116 | measures = { 117 | 'velocity': compute_velocity, 118 | 'acceleration': compute_acceleration, 119 | } 120 | 121 | parser = argparse.ArgumentParser( 122 | description='Calculate histograms of moving distances') 123 | parser.add_argument('--original', '-o', default='data/original', 124 | help='Original gesture directory') 125 | parser.add_argument('--predicted', '-p', default='data/predicted', 126 | help='Predicted gesture directory') 127 | parser.add_argument('--joints', '-j', default='joints.txt', 128 | help='Joint name file') 129 | parser.add_argument('--gesture', '-g', required=True, 130 | help='Directory storing predicted txt files') 131 | parser.add_argument('--width', '-w', type=float, default=0.05, 132 | help='Bin width of the histogram') 133 | parser.add_argument('--measure', '-m', default='velocity', 134 | help='Measure to calculate (velocity or acceleration)') 135 | parser.add_argument('--select', '-s', nargs='+', 136 | help='Joint subset to compute (if omitted, use all)') 137 | parser.add_argument('--visualize', '-v', action='store_true', 138 | help='Visualize histograms') 139 | parser.add_argument('--out', default='result', 140 | help='Directory to output the result') 141 | args = parser.parse_args() 142 | 143 | predicted_dir = os.path.join(args.predicted, args.gesture) 144 | 145 | original_files = sorted(glob.glob(os.path.join(args.original, '*.txt'))) 146 | 147 | predicted_files = sorted(glob.glob(os.path.join(predicted_dir, '*.txt'))) 148 | 149 | # Check number of files 150 | if len(original_files) != len(predicted_files): 151 | warnings.warn('Inconsistent number of files : {} vs {}' 152 | ''.format(len(original_files), len(predicted_files)), 153 | RuntimeWarning) 154 | 155 | # Check if error measure was correct 156 | if args.measure not in measures: 157 | raise ValueError('Unknown measure: \'{}\'. Choose from {}' 158 | ''.format(args.measure, list(measures.keys()))) 159 | 160 | joint_names = read_joint_names(args.joints) 161 | 162 | if args.select is not None: 163 | selected_joints = [] 164 | for s in args.select: 165 | try: 166 | index = joint_names.index(s) 167 | except ValueError: 168 | print('Ignore invalid joint: {}'.format(s)) 169 | else: 170 | selected_joints.append(index) 171 | selected_joints.sort() 172 | 173 | if len(selected_joints) == 0: 174 | selected_joints = range(len(joint_names)) 175 | print('No valid joints are selected. 
Use all joints') 176 | else: 177 | # Use all joints 178 | selected_joints = range(len(joint_names)) 179 | 180 | joint_names = [joint_names[s] for s in selected_joints] 181 | original_out_lines = [','.join([''] + joint_names + ['Total']) + '\n'] 182 | predicted_out_lines = [','.join([''] + joint_names + ['Total']) + '\n'] 183 | 184 | original_distances = [] 185 | predicted_distances = [] 186 | for original_file, predicted_file in zip(original_files, predicted_files): 187 | original = np.loadtxt(original_file) 188 | predicted = np.loadtxt(predicted_file) 189 | 190 | original_distance = measures[args.measure]( 191 | original)[:, selected_joints] 192 | predicted_distance = measures[args.measure]( 193 | predicted)[:, selected_joints] 194 | 195 | original_distances.append(original_distance) 196 | predicted_distances.append(predicted_distance) 197 | 198 | original_distances = np.concatenate(original_distances) 199 | predicted_distances = np.concatenate(predicted_distances) 200 | 201 | # Compute histogram for each joint 202 | bins = np.arange(0, 1+args.width, args.width) 203 | num_joints = original_distances.shape[1] 204 | original_hists = [] 205 | predicted_hists = [] 206 | for i in range(num_joints): 207 | original_hist, _ = np.histogram(original_distances[:, i], bins=bins) 208 | predicted_hist, _ = np.histogram(predicted_distances[:, i], bins=bins) 209 | 210 | original_hists.append(original_hist) 211 | predicted_hists.append(predicted_hist) 212 | 213 | # Sum over all joints 214 | original_total = np.sum(original_hists, axis=0) 215 | predicted_total = np.sum(predicted_hists, axis=0) 216 | 217 | # Append total number of bin counts to the last 218 | original_hists = np.stack(original_hists + [original_total], axis=1) 219 | predicted_hists = np.stack(predicted_hists + [predicted_total], axis=1) 220 | 221 | num_bins = bins.size - 1 222 | for i in range(num_bins): 223 | original_line = str(bins[i]) 224 | predicted_line = str(bins[i]) 225 | for j in range(num_joints + 1): 226 | original_line += ',' + str(original_hists[i, j]) 227 | predicted_line += ',' + str(predicted_hists[i, j]) 228 | original_line += '\n' 229 | predicted_line += '\n' 230 | 231 | original_out_lines.append(original_line) 232 | predicted_out_lines.append(predicted_line) 233 | 234 | original_out_dir = os.path.join(args.out, 'original') 235 | predicted_out_dir = os.path.join(args.out, args.gesture) 236 | 237 | if args.visualize: 238 | plt.plot(bins[:-1], original_total, label='Original') 239 | plt.plot(bins[:-1], predicted_total, label=args.gesture) 240 | plt.legend() 241 | plt.xlabel('Velocity (cm/s)') 242 | plt.ylabel('Bin counts') 243 | plt.title('Histograms of Moving Distance ({})'.format(args.measure)) 244 | plt.tight_layout() 245 | plt.show() 246 | 247 | save_result(original_out_lines, original_out_dir, 248 | args.width, args.measure) 249 | save_result(predicted_out_lines, predicted_out_dir, 250 | args.width, args.measure) 251 | 252 | print('HMD ({}):'.format(args.measure)) 253 | print('bins: {}'.format(bins)) 254 | print('original: {}'.format(original_total)) 255 | print('predicted: {}'.format(predicted_total)) 256 | 257 | 258 | if __name__ == '__main__': 259 | main() 260 | -------------------------------------------------------------------------------- /evaluation/calc_errors.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Calculating average point error 4 | 5 | @author: kaneko.naoshi 6 | """ 7 | 8 | import argparse 9 | import glob 10 | import os 11 | 
12 | import numpy as np 13 | from sklearn.metrics import mean_absolute_error 14 | 15 | 16 | def read_joint_names(filename): 17 | """Read motion capture's body joint names from file 18 | 19 | Args: 20 | filename: file name to read 21 | 22 | Returns: 23 | joint_names: list of joint names 24 | """ 25 | 26 | with open(filename, 'r') as f: 27 | org = f.read() 28 | joint_names = org.split(',') 29 | 30 | return joint_names 31 | 32 | 33 | def remove_velocity(data, dim=3): 34 | """Remove velocity values from raw prediction data 35 | 36 | Args: 37 | data: array containing both position and velocity values 38 | dim: gesture dimensionality 39 | 40 | Returns: 41 | np.ndarray: array containing only position values 42 | """ 43 | 44 | starts = np.arange(0, data.shape[1], dim * 2) 45 | stops = np.arange(dim, data.shape[1], dim * 2) 46 | return np.hstack([data[:, i:j] for i, j in zip(starts, stops)]) 47 | 48 | 49 | def MAE(original, predicted, dim=3): 50 | """Compute Mean Absolute Error (MAE) 51 | 52 | Args: 53 | original: array containing joint positions of original gesture 54 | predicted: array containing joint positions of predicted gesture 55 | dim: gesture dimensionality 56 | 57 | Returns: 58 | mae: MAE between original and predicted for each joint 59 | """ 60 | 61 | num_frames = predicted.shape[0] 62 | 63 | diffs = mean_absolute_error(original[:num_frames], predicted, 64 | multioutput='raw_values') 65 | 66 | num_joints = predicted.shape[1] // dim 67 | mae = np.empty(num_joints) 68 | 69 | for i in range(num_joints): 70 | x1 = i * dim + 0 71 | x2 = i * dim + dim 72 | mae[i] = np.mean(diffs[x1:x2]) 73 | 74 | return mae 75 | 76 | 77 | def APE(original, predicted, dim=3): 78 | """Compute Average Position Error (APE) 79 | 80 | Args: 81 | original: array containing joint positions of original gesture 82 | predicted: array containing joint positions of predicted gesture 83 | dim: gesture dimensionality 84 | 85 | Returns: 86 | np.ndarray: APE between original and predicted for each joint 87 | """ 88 | 89 | num_frames = predicted.shape[0] 90 | num_joints = predicted.shape[1] // dim 91 | 92 | diffs = np.zeros((num_frames, num_joints)) 93 | 94 | for i in range(num_frames): 95 | for j in range(num_joints): 96 | x1 = j * dim + 0 97 | x2 = j * dim + dim 98 | diffs[i, j] = np.linalg.norm( 99 | original[i, x1:x2] - predicted[i, x1:x2]) 100 | 101 | return np.mean(diffs, axis=0) 102 | 103 | 104 | def main(): 105 | metrics = { 106 | 'mae': MAE, 107 | 'ape': APE, 108 | } 109 | 110 | parser = argparse.ArgumentParser( 111 | description='Calculate prediction errors') 112 | parser.add_argument('--original', '-o', default='data/original', 113 | help='Original gesture directory') 114 | parser.add_argument('--predicted', '-p', default='data/predicted', 115 | help='Predicted gesture directory') 116 | parser.add_argument('--joints', '-j', default='joints.txt', 117 | help='Joint name file') 118 | parser.add_argument('--gesture', '-g', required=True, 119 | help='Directory storing predicted txt files') 120 | parser.add_argument('--metric', '-m', default='ape', 121 | help='Error metric (ape or mae)') 122 | parser.add_argument('--select', '-s', nargs='+', 123 | help='Joint subset to compute (if omitted, use all)') 124 | parser.add_argument('--out', default='result', 125 | help='Directory to output the result') 126 | args = parser.parse_args() 127 | 128 | predicted_dir = os.path.join(args.predicted, args.gesture) 129 | 130 | original_files = sorted(glob.glob(os.path.join(args.original, '*.txt'))) 131 | predicted_files = 
sorted(glob.glob(os.path.join(predicted_dir, '*.txt'))) 132 | 133 | # Check number of files 134 | if len(original_files) != len(predicted_files): 135 | raise ValueError('Inconsistent number of files : {} vs {}' 136 | ''.format(len(original_files), len(predicted_files))) 137 | 138 | # Check if error metric was correct 139 | if args.metric not in metrics: 140 | raise ValueError('Unknown metric: \'{}\'. Choose from {}' 141 | ''.format(args.metric, list(metrics.keys()))) 142 | 143 | joint_names = read_joint_names(args.joints) 144 | 145 | if args.select is not None: 146 | selected_joints = [] 147 | for s in args.select: 148 | try: 149 | index = joint_names.index(s) 150 | except ValueError: 151 | print('Ignore invalid joint: {}'.format(s)) 152 | else: 153 | selected_joints.append(index) 154 | selected_joints.sort() 155 | 156 | if len(selected_joints) == 0: 157 | selected_joints = range(len(joint_names)) 158 | print('No valid joints are selected. Use all joints') 159 | else: 160 | # Use all joints 161 | selected_joints = range(len(joint_names)) 162 | 163 | joint_names = [joint_names[s] for s in selected_joints] 164 | out_lines = [','.join(['file'] + joint_names) + '\n'] 165 | 166 | errors = [] 167 | for original_file, predicted_file in zip(original_files, predicted_files): 168 | original = np.loadtxt(original_file) 169 | predicted = np.loadtxt(predicted_file) 170 | 171 | if original.shape[0] != predicted.shape[0]: 172 | # Cut them to the same length 173 | length = min(original.shape[0], predicted.shape[0]) 174 | original = original[:length] 175 | predicted = predicted[:length] 176 | 177 | if predicted.shape[1] == 192 * 2: 178 | print(predicted.shape) 179 | print("Removing the velocity") 180 | # Remove the velocity 181 | predicted = remove_velocity(predicted) 182 | 183 | error = metrics[args.metric](original, predicted)[selected_joints] 184 | errors.append(error) 185 | 186 | basename = os.path.basename(predicted_file) 187 | line = basename 188 | for e in error: 189 | line += ',' + str(e) 190 | line += '\n' 191 | 192 | out_lines.append(line) 193 | 194 | average_line = 'Average' 195 | avgs = np.mean(errors, axis=0) 196 | for a in avgs: 197 | average_line += ',' + str(a) 198 | 199 | out_lines.append(average_line) 200 | 201 | out_dir = os.path.join(args.out, args.gesture) 202 | 203 | # Make output directory 204 | if not os.path.exists(out_dir): 205 | os.makedirs(out_dir) 206 | 207 | outname = os.path.join(out_dir, '{}.csv'.format(args.metric)) 208 | with open(outname, 'w') as out_file: 209 | out_file.writelines(out_lines) 210 | 211 | print('More detailed result was writen to the file: ' + outname) 212 | print('') 213 | 214 | print('{}: {:.2f}'.format(args.metric.upper(), np.mean(errors))) 215 | 216 | 217 | if __name__ == '__main__': 218 | main() 219 | -------------------------------------------------------------------------------- /evaluation/calc_jerk.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Calculating average jerk over the produced and ground truth gestures 4 | 5 | @author: kaneko.naoshi 6 | """ 7 | 8 | import argparse 9 | import glob 10 | import os 11 | import warnings 12 | 13 | import numpy as np 14 | 15 | 16 | def read_joint_names(filename): 17 | """Read motion capture's body joint names from file 18 | 19 | Args: 20 | filename: file name to read 21 | 22 | Returns: 23 | joint_names: list of joint names 24 | """ 25 | 26 | with open(filename, 'r') as f: 27 | org = f.read() 28 | joint_names = org.split(',') 29 | 30 | 
return joint_names 31 | 32 | 33 | def compute_jerks(data, dim=3): 34 | """Compute jerk between adjacent frames 35 | 36 | Args: 37 | data: array containing joint positions of gesture 38 | dim: gesture dimensionality 39 | 40 | Returns: 41 | np.ndarray: jerks of each joint averaged over all frames 42 | """ 43 | 44 | # Third derivative of position is jerk 45 | jerks = np.diff(data, n=3, axis=0) 46 | 47 | num_jerks = jerks.shape[0] 48 | num_joints = jerks.shape[1] // dim 49 | 50 | jerk_norms = np.zeros((num_jerks, num_joints)) 51 | 52 | for i in range(num_jerks): 53 | for j in range(num_joints): 54 | x1 = j * dim + 0 55 | x2 = j * dim + dim 56 | jerk_norms[i, j] = np.linalg.norm(jerks[i, x1:x2]) 57 | 58 | return np.mean(jerk_norms, axis=0) 59 | 60 | 61 | def compute_acceleration(data, dim=3): 62 | """Compute acceleration between adjacent frames 63 | 64 | Args: 65 | data: array containing joint positions of gesture 66 | dim: gesture dimensionality 67 | 68 | Returns: 69 | np.ndarray: accelerations of each joint averaged over all frames 70 | """ 71 | 72 | # Second derivative of position is acceleration 73 | accs = np.diff(data, n=2, axis=0) 74 | 75 | num_accs = accs.shape[0] 76 | num_joints = accs.shape[1] // dim 77 | 78 | acc_norms = np.zeros((num_accs, num_joints)) 79 | 80 | for i in range(num_accs): 81 | for j in range(num_joints): 82 | x1 = j * dim + 0 83 | x2 = j * dim + dim 84 | acc_norms[i, j] = np.linalg.norm(accs[i, x1:x2]) 85 | 86 | return np.mean(acc_norms, axis=0) 87 | 88 | 89 | def save_result(lines, out_dir, measure): 90 | """Write computed measure to CSV 91 | 92 | Args: 93 | lines: list of strings to be written 94 | out_dir: output directory 95 | measure: used measure 96 | """ 97 | 98 | # Make output directory 99 | if not os.path.exists(out_dir): 100 | os.makedirs(out_dir) 101 | 102 | if measure == "jerk": 103 | outname = os.path.join(out_dir, 'aj.csv') 104 | elif measure == "acceleration": 105 | outname = os.path.join(out_dir, 'aa.csv') 106 | 107 | with open(outname, 'w') as out_file: 108 | out_file.writelines(lines) 109 | 110 | print('More detailed result was writen to the file: ' + outname) 111 | print('') 112 | 113 | 114 | def main(): 115 | measures = { 116 | 'jerk': compute_jerks, 117 | 'acceleration': compute_acceleration, 118 | } 119 | 120 | parser = argparse.ArgumentParser( 121 | description='Calculate prediction errors') 122 | parser.add_argument('--original', '-o', default='data/original', 123 | help='Original gesture directory') 124 | parser.add_argument('--predicted', '-p', default='data/predicted', 125 | help='Predicted gesture directory') 126 | parser.add_argument('--joints', '-j', default='joints.txt', 127 | help='Joint name file') 128 | parser.add_argument('--gesture', '-g', required=True, 129 | help='Directory storing predicted txt files') 130 | parser.add_argument('--measure', '-m', default='jerk', 131 | help='Measure to calculate (jerk or acceleration)') 132 | parser.add_argument('--select', '-s', nargs='+', 133 | help='Joint subset to compute (if omitted, use all)') 134 | parser.add_argument('--out', default='result', 135 | help='Directory to output the result') 136 | args = parser.parse_args() 137 | 138 | predicted_dir = os.path.join(args.predicted, args.gesture) 139 | 140 | original_files = sorted(glob.glob(os.path.join(args.original, '*.txt'))) 141 | predicted_files = sorted(glob.glob(os.path.join(predicted_dir, '*.txt'))) 142 | 143 | # Check number of files 144 | if len(original_files) != len(predicted_files): 145 | warnings.warn('Inconsistent number of files 
: {} vs {}' 146 | ''.format(len(original_files), len(predicted_files)), 147 | RuntimeWarning) 148 | 149 | # Check if error measure was correct 150 | if args.measure not in measures: 151 | raise ValueError('Unknown measure: \'{}\'. Choose from {}' 152 | ''.format(args.measure, list(measures.keys()))) 153 | 154 | joint_names = read_joint_names(args.joints) 155 | 156 | if args.select is not None: 157 | selected_joints = [] 158 | for s in args.select: 159 | try: 160 | index = joint_names.index(s) 161 | except ValueError: 162 | print('Ignore invalid joint: {}'.format(s)) 163 | else: 164 | selected_joints.append(index) 165 | selected_joints.sort() 166 | 167 | if len(selected_joints) == 0: 168 | selected_joints = range(len(joint_names)) 169 | print('No valid joints are selected. Use all joints') 170 | else: 171 | # Use all joints 172 | selected_joints = range(len(joint_names)) 173 | 174 | joint_names = [joint_names[s] for s in selected_joints] 175 | original_out_lines = [','.join(['file'] + joint_names) + '\n'] 176 | predicted_out_lines = [','.join(['file'] + joint_names) + '\n'] 177 | 178 | original_values = [] 179 | predicted_values = [] 180 | for original_file, predicted_file in zip(original_files, predicted_files): 181 | original = np.loadtxt(original_file) 182 | predicted = np.loadtxt(predicted_file) 183 | 184 | if original.shape[0] != predicted.shape[0]: 185 | # Cut them to the same length 186 | length = min(original.shape[0], predicted.shape[0]) 187 | original = original[:length] 188 | predicted = predicted[:length] 189 | 190 | original_value = measures[args.measure](original)[selected_joints] 191 | predicted_value = measures[args.measure](predicted)[selected_joints] 192 | 193 | original_values.append(original_value) 194 | predicted_values.append(predicted_value) 195 | 196 | basename = os.path.basename(original_file) 197 | original_line = basename 198 | predicted_line = basename 199 | for ov, pv in zip(original_value, predicted_value): 200 | original_line += ',' + str(ov) 201 | predicted_line += ',' + str(pv) 202 | original_line += '\n' 203 | predicted_line += '\n' 204 | 205 | original_out_lines.append(original_line) 206 | predicted_out_lines.append(predicted_line) 207 | 208 | original_average_line = 'Average' 209 | predicted_average_line = 'Average' 210 | original_avgs = np.mean(original_values, axis=0) 211 | predicted_avgs = np.mean(predicted_values, axis=0) 212 | for oa, pa in zip(original_avgs, predicted_avgs): 213 | original_average_line += ',' + str(oa) 214 | predicted_average_line += ',' + str(pa) 215 | 216 | original_out_lines.append(original_average_line) 217 | predicted_out_lines.append(predicted_average_line) 218 | 219 | original_out_dir = os.path.join(args.out, 'original') 220 | predicted_out_dir = os.path.join(args.out, args.gesture) 221 | 222 | save_result(original_out_lines, original_out_dir, args.measure) 223 | save_result(predicted_out_lines, predicted_out_dir, args.measure) 224 | 225 | if args.measure == 'jerk': 226 | print('AJ:') 227 | elif args.measure == 'acceleration': 228 | print('AA:') 229 | print('original: {:.2f}'.format(np.mean(original_values))) 230 | print('predicted: {:.2f}'.format(np.mean(predicted_values))) 231 | 232 | 233 | if __name__ == '__main__': 234 | main() 235 | -------------------------------------------------------------------------------- /evaluation/hellinger.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Mar 2 11:30:40 2020 4 | 5 | @author: kaneko.naoshi 
6 | """ 7 | 8 | import argparse 9 | import glob 10 | import os 11 | import re 12 | 13 | import matplotlib 14 | matplotlib.use('Agg') 15 | import matplotlib.pyplot as plt 16 | from matplotlib.ticker import MaxNLocator 17 | from matplotlib.patches import Rectangle 18 | import numpy as np 19 | import pandas as pd 20 | import seaborn as sns 21 | 22 | 23 | def read_joint_names(filename): 24 | """Read motion capture's body joint names from file 25 | 26 | Args: 27 | filename: file name to read 28 | 29 | Returns: 30 | joint_names: list of joint names 31 | """ 32 | 33 | with open(filename, 'r') as f: 34 | org = f.read() 35 | joint_names = org.split(',') 36 | 37 | return joint_names 38 | 39 | 40 | def normalize(hist): 41 | return hist / np.sum(hist) 42 | 43 | 44 | def hellinger(hist1, hist2): 45 | """Compute Hellinger distance between two histograms 46 | 47 | Args: 48 | hist1: first histogram 49 | hist2: second histogram of the same size as hist1 50 | 51 | Returns: 52 | float: Hellinger distance between hist1 and hist2 53 | """ 54 | 55 | return np.sqrt(1.0 - np.sum(np.sqrt(normalize(hist1) * normalize(hist2)))) 56 | 57 | 58 | # https://stackoverflow.com/questions/4836710/does-python-have-a-built-in-function-for-string-natural-sort # NOQA 59 | def natural_sort_key(s, _nsre=re.compile('([0-9]+)')): 60 | return [int(text) if text.isdigit() else text.lower() 61 | for text in _nsre.split(s)] 62 | 63 | 64 | def natural_sort(l, key=natural_sort_key): 65 | return sorted(l, key=key) 66 | 67 | 68 | def main(): 69 | parser = argparse.ArgumentParser( 70 | description='Calculate histograms of moving distances') 71 | parser.add_argument('--original', default='data/original', 72 | help='Original gesture directory') 73 | parser.add_argument('--predicted', '-p', default='data/predicted', 74 | help='Predicted gesture directory') 75 | parser.add_argument('--file', '-f', default='hmd_vel_0.05.csv', 76 | help='File name to load') 77 | parser.add_argument('--joints', '-j', default='joints.txt', 78 | help='Joint name file') 79 | parser.add_argument('--select', '-s', nargs='+', 80 | help='Joint subset to compute (if omitted, use all)') 81 | parser.add_argument('--visualize', '-v', action='store_true', 82 | help='Visualize histograms') 83 | parser.add_argument('--out', '-o', default='results', 84 | help='Directory to output the result') 85 | args = parser.parse_args() 86 | 87 | joint_names = read_joint_names(args.joints) 88 | 89 | if args.select is not None: 90 | selected_joints = [] 91 | for s in args.select: 92 | if not s in joint_names: 93 | print('Ignore invalid joint: {}'.format(s)) 94 | else: 95 | selected_joints.append(s) 96 | 97 | if not selected_joints: 98 | selected_joints = ['Total'] 99 | print('No valid joints are selected. Use all joints') 100 | else: 101 | # Use all joints 102 | selected_joints = ['Total'] 103 | 104 | def get_directories(directory): 105 | return sorted(filter(lambda x: os.path.isdir(x), glob.glob(directory))) 106 | 107 | # Read original gesture's distribution 108 | original_file = os.path.join(args.original, args.file) 109 | original = pd.read_csv(original_file, index_col=0) 110 | original_hist = np.array(original[selected_joints]).sum(axis=1) 111 | 112 | # List of predicted gesture direcotires 113 | predicted_dirs = get_directories(os.path.join(args.predicted, '*')) 114 | 115 | results = {os.path.basename(d): None for d in predicted_dirs} 116 | 117 | # Iterate over the list of direcotires 118 | for predicted_dir in predicted_dirs: 119 | # Does this directory have a target file? 
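        # If it does not, the except branch below averages the histograms found in integer-named subdirectories (e.g. result/MFCC/1, /2, /3 from repeated training runs, as read by plot_results.py); each subdirectory is expected to contain the same CSV file produced by calc_distance.py.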
120 | try: 121 | predicted_file = os.path.join(predicted_dir, args.file) 122 | predicted = pd.read_csv(predicted_file, index_col=0) 123 | except FileNotFoundError: 124 | # Are there any subdirectories which have integer names? 125 | sub_dirs = sorted( 126 | filter(lambda x: os.path.basename(x).isdecimal(), 127 | get_directories(os.path.join(predicted_dir, '*')))) 128 | 129 | # If no, raise an exception 130 | if not sub_dirs: 131 | raise FileNotFoundError( 132 | 'There is neither ' + args.file 133 | + ' nor subdirectories in ' + predicted_dir) 134 | 135 | predicted = None 136 | for sub_dir in sub_dirs: 137 | predicted_file = os.path.join(sub_dir, args.file) 138 | tmp = pd.read_csv(predicted_file, index_col=0) 139 | 140 | if predicted is None: 141 | predicted = tmp 142 | else: 143 | predicted = predicted + tmp 144 | 145 | predicted = predicted / float(len(sub_dirs)) 146 | 147 | # Get histograms 148 | predicted_hist = np.array(predicted[selected_joints]).sum(axis=1) 149 | 150 | assert len(original_hist) == len(predicted_hist) 151 | 152 | # Hellinger distance between two histograms 153 | dist = hellinger(original_hist, predicted_hist) 154 | 155 | # Store results 156 | key = os.path.basename(predicted_dir) 157 | results[key] = {'dist': dist, 'hist': predicted_hist} 158 | 159 | # Print and save results 160 | keys = natural_sort(results.keys()) 161 | 162 | result_str = ['Hellinger distances:'] 163 | for key in keys: 164 | result_str.append('\t{}: {}'.format(key, results[key]['dist'])) 165 | 166 | result_str = '\n'.join(result_str) 167 | 168 | print(result_str) 169 | print('') 170 | 171 | # Make output directory 172 | out = os.path.join(args.out, os.path.basename(args.predicted), 173 | '+'.join(selected_joints)) 174 | if not os.path.isdir(out): 175 | os.makedirs(out) 176 | 177 | with open(os.path.join(out, 'distances.txt'), 'w') as f: 178 | f.write(result_str) 179 | 180 | if args.visualize: 181 | # Set color and style 182 | mpl_default = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', 183 | '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', 184 | '#bcbd22', '#17becf'] 185 | sns.set(context='poster', palette=sns.color_palette(mpl_default), font_scale=1.05) 186 | sns.set_style('white', {'legend.frameon':True}) 187 | 188 | # Velocities are computed in 20fps: make them into cm/s 189 | index = original.index * 20 190 | bins = [format(i, '.2f') for i in list(index)] 191 | 192 | # Plot speed in a range of [0, 15] 193 | bins = bins[:-4] 194 | original_hist = original_hist[:-4] 195 | 196 | fig = plt.figure(figsize=(8, 5)) 197 | ax = fig.add_subplot(111) 198 | 199 | # Convert frequency to percentage 200 | gt_handle, = ax.plot(bins, normalize(original_hist) * 100, color='C4') 201 | 202 | # Awesome way to create a tabular-style legend 203 | # https://stackoverflow.com/questions/25830780/tabular-legend-layout-for-matplotlib 204 | # Create a blank rectangle 205 | blank = Rectangle((0, 0), 1, 1, fc="w", fill=False, edgecolor='none', linewidth=0) 206 | 207 | # Correspond to each columns of the tabular 208 | legend_handles = [blank, gt_handle] 209 | legend_names = ['Name', 'Ground Truth'] 210 | legend_dists = ['Hell. 
Dist.', '0'.center(16)] 211 | 212 | colors = ['C1', 'C3', 'C0', 'C2'] if len(keys) <= 4 else \ 213 | ['C1', 'C0', 'C6', 'C7', 'C8', 'C9', 'C5', 'C2', 'C3'] 214 | 215 | assert len(keys) <= len(colors) 216 | 217 | for color, key in zip(colors, keys): 218 | predicted_hist = results[key]['hist'][:-4] 219 | label = key.split('-')[1].replace('_smooth', '*') 220 | 221 | #if 'Aud2Pose' in label: 222 | # label += ' [18]' 223 | 224 | handle, = ax.plot(bins, normalize(predicted_hist) * 100, color=color) 225 | 226 | legend_handles.append(handle) 227 | legend_names.append(label) 228 | legend_dists.append('{:.3f}'.format(results[key]['dist']).center(12)) 229 | 230 | # Legend will have a tabular of (rows x 3) 231 | rows = len(legend_handles) 232 | empty_label = [''] 233 | 234 | legend_handles = legend_handles + [blank] * (rows * 2) 235 | legend_labels = np.concatenate([empty_label * rows, legend_names, legend_dists]) 236 | 237 | ax.legend(legend_handles, legend_labels, 238 | ncol=3, handletextpad=0.5, columnspacing=-2.15, 239 | labelspacing=0.35) 240 | ax.set_xlabel('Speed (cm/s)') 241 | ax.set_ylabel('Frequency (%)') 242 | ax.set_xticks(np.arange(16)) 243 | ax.tick_params(pad=6) 244 | ax.yaxis.set_major_locator( 245 | MaxNLocator(nbins='auto', steps=[1, 2, 2.5, 5, 10], integer=True)) 246 | 247 | plt.subplots_adjust(left=0.09, right=0.98, top=0.98, bottom=0.12) 248 | plt.savefig(os.path.join(out, 'speed_histogram.pdf')) 249 | plt.show() 250 | 251 | print('Results were writen in ' + out) 252 | print('') 253 | 254 | 255 | if __name__ == '__main__': 256 | main() 257 | -------------------------------------------------------------------------------- /evaluation/hellinger_one2one.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Sep 30 16:58:35 2020 4 | 5 | @author: kaneko.naoshi 6 | """ 7 | 8 | import argparse 9 | import glob 10 | import os 11 | import re 12 | 13 | import matplotlib 14 | matplotlib.use('Agg') 15 | import matplotlib.pyplot as plt 16 | from matplotlib.ticker import FixedLocator, MaxNLocator 17 | from matplotlib.patches import Rectangle 18 | import numpy as np 19 | import pandas as pd 20 | import seaborn as sns 21 | 22 | 23 | def read_joint_names(filename): 24 | """Read motion capture's body joint names from file 25 | 26 | Args: 27 | filename: file name to read 28 | 29 | Returns: 30 | joint_names: list of joint names 31 | """ 32 | 33 | with open(filename, 'r') as f: 34 | org = f.read() 35 | joint_names = org.split(',') 36 | 37 | return joint_names 38 | 39 | 40 | def compute_speed(data, dim=3): 41 | """Compute speed between adjacent frames 42 | 43 | Args: 44 | data: array containing joint positions of gesture 45 | dim: gesture dimensionality 46 | 47 | Returns: 48 | speeds: velocities of each joint between each adjacent frame 49 | """ 50 | 51 | # First derivative of position is velocity 52 | vels = np.diff(data, n=1, axis=0) 53 | 54 | num_vels = vels.shape[0] 55 | num_joints = vels.shape[1] // dim 56 | 57 | speeds = np.zeros((num_vels, num_joints)) 58 | 59 | for i in range(num_vels): 60 | for j in range(num_joints): 61 | x1 = j * dim + 0 62 | x2 = j * dim + dim 63 | speeds[i, j] = np.linalg.norm(vels[i, x1:x2]) 64 | 65 | return speeds 66 | 67 | 68 | def normalize(hist): 69 | return hist / np.sum(hist) 70 | 71 | 72 | def hellinger(hist1, hist2): 73 | """Compute Hellinger distance between two histograms 74 | 75 | Args: 76 | hist1: first histogram 77 | hist2: second histogram of the same size as hist1 78 | 79 | 
Returns: 80 | float: Hellinger distance between hist1 and hist2 81 | """ 82 | 83 | return np.sqrt(1.0 - np.sum(np.sqrt(normalize(hist1) * normalize(hist2)))) 84 | 85 | 86 | # https://stackoverflow.com/questions/4836710/does-python-have-a-built-in-function-for-string-natural-sort # NOQA 87 | def natural_sort_key(s, _nsre=re.compile('([0-9]+)')): 88 | return [int(text) if text.isdigit() else text.lower() 89 | for text in _nsre.split(s)] 90 | 91 | 92 | def natural_sort(l, key=natural_sort_key): 93 | return sorted(l, key=key) 94 | 95 | 96 | # https://stackoverflow.com/questions/11686720/is-there-a-numpy-builtin-to-reject-outliers-from-a-list # NOQA 97 | def reject_outliers(data, m=5.189): 98 | d = np.abs(data - np.median(data)) 99 | mdev = np.median(d) 100 | s = d / mdev if mdev else 0. 101 | return data[s < m] 102 | 103 | 104 | def main(): 105 | parser = argparse.ArgumentParser( 106 | description='Compute Hellinger distances between predicted ' 107 | 'and ground truth gestures in a one-to-one manner') 108 | parser.add_argument('--original', '-o', default='data/original', 109 | help='Original gesture directory') 110 | parser.add_argument('--predicted', '-p', default='data/predicted', 111 | help='Predicted gesture directory') 112 | parser.add_argument('--width', '-w', type=float, default=0.05, 113 | help='Bin width of the histogram (default: 0.05)') 114 | parser.add_argument('--joints', '-j', default='joints.txt', 115 | help='Joint name file') 116 | parser.add_argument('--select', '-s', nargs='+', 117 | help='Joint subset to compute (if omitted, use all)') 118 | parser.add_argument('--visualize', '-v', action='store_true', 119 | help='Visualize histograms') 120 | parser.add_argument('--match_yticks', '-m', action='store_true', 121 | help='Match y-ticks over all the sequences in visualization') 122 | parser.add_argument('--out', default='results', 123 | help='Directory to output the result') 124 | args = parser.parse_args() 125 | 126 | joint_names = read_joint_names(args.joints) 127 | 128 | if args.select is not None: 129 | selected_joints = [] 130 | for s in args.select: 131 | try: 132 | index = joint_names.index(s) 133 | except ValueError: 134 | print('Ignore invalid joint: {}'.format(s)) 135 | else: 136 | selected_joints.append(index) 137 | selected_joints.sort() 138 | 139 | if len(selected_joints) == 0: 140 | selected_joints = range(len(joint_names)) 141 | print('No valid joints are selected. 
Use all joints') 142 | else: 143 | # Use all joints 144 | selected_joints = range(len(joint_names)) 145 | 146 | def get_directories(directory): 147 | return sorted(filter(lambda x: os.path.isdir(x), glob.glob(directory))) 148 | 149 | # Define histogram bins 150 | bins = np.arange(0, 1 + args.width, args.width) 151 | 152 | # Find original gesture data 153 | original_files = natural_sort( 154 | glob.glob(os.path.join(args.original, '*.txt'))) 155 | 156 | if args.match_yticks: 157 | max_freqs = [] 158 | 159 | # Compute speed histogram for original gestures 160 | original_hists = [] 161 | for original_file in original_files: 162 | original = np.loadtxt(original_file) 163 | 164 | # Compute speed histogram 165 | original_speed = compute_speed(original)[:, selected_joints] 166 | original_hist, _ = np.histogram(original_speed, bins=bins) 167 | 168 | original_hists.append(original_hist) 169 | 170 | if args.match_yticks: 171 | max_freqs.append(normalize(original_hist).max().item()) 172 | 173 | # List of predicted gesture direcotires 174 | predicted_dirs = get_directories(os.path.join(args.predicted, '*')) 175 | 176 | if len(predicted_dirs) == 0: 177 | raise ValueError('No gesture directories are found in ' 178 | + args.predicted) 179 | 180 | results = {os.path.basename(d): None for d in predicted_dirs} 181 | 182 | assert 'original' not in results.keys() 183 | 184 | # Store original gesture histograms 185 | original_key = 'original' 186 | results[original_key] = dict() 187 | for i, original_hist in enumerate(original_hists): 188 | file_key = os.path.basename(original_files[i]) 189 | results[original_key][file_key] = {'hist': original_hist} 190 | 191 | # Iterate over the list of direcotires 192 | overall_dists = dict() 193 | for predicted_dir in predicted_dirs: 194 | predicted_files = natural_sort( 195 | glob.glob(os.path.join(predicted_dir, '*.txt'))) 196 | 197 | # Check if the predicted gesture files are consistent with the original files 198 | if [os.path.basename(p) for p in predicted_files] != [os.path.basename(o) for o in original_files]: 199 | raise ValueError('Gesture files located in ' + predicted_dir + ' are inconsistent with ' 200 | 'original gesture files located in ' + args.original) 201 | 202 | dir_key = os.path.basename(predicted_dir) 203 | results[dir_key] = dict() 204 | 205 | # Compute speed histogram for predicted gestures 206 | predicted_hists = [] 207 | for predicted_file in predicted_files: 208 | predicted = np.loadtxt(predicted_file) 209 | 210 | # Compute speed histogram 211 | predicted_speed = compute_speed(predicted)[:, selected_joints] 212 | predicted_hist, _ = np.histogram(predicted_speed, bins=bins) 213 | 214 | predicted_hists.append(predicted_hist) 215 | 216 | if args.match_yticks: 217 | max_freqs.append(normalize(predicted_hist).max().item()) 218 | 219 | assert len(original_hists) == len(predicted_hists) 220 | 221 | # Compute Hellinger distance in a one-to-one manner 222 | for i, (original_hist, predicted_hist) in enumerate(zip(original_hists, predicted_hists)): 223 | assert len(original_hist) == len(predicted_hist) 224 | 225 | # Hellinger distance between two histograms 226 | dist = hellinger(original_hist, predicted_hist) 227 | 228 | # Store results 229 | file_key = os.path.basename(predicted_files[i]) 230 | results[dir_key][file_key] = {'dist': dist, 'hist': predicted_hist} 231 | 232 | # Print the overall Hellinger distance (Note: this is not one-to-one) 233 | overall_dist = hellinger(np.sum(original_hists, axis=0), 234 | np.sum(predicted_hists, axis=0)) 235 | 
overall_dists[dir_key] = overall_dist 236 | 237 | # Create a dataframe to save 238 | dir_keys = natural_sort(results.keys()) 239 | dir_keys.remove('original') 240 | file_keys = natural_sort(results['original'].keys()) 241 | 242 | save_dict = {d_k: [results[d_k][f_k]['dist'] for f_k in file_keys] for d_k in dir_keys} 243 | df = pd.DataFrame(save_dict, index=file_keys) 244 | 245 | # Add mean and std values 246 | mean = df.mean() 247 | std = df.std() 248 | df.loc['mean'] = mean 249 | df.loc['std'] = std 250 | 251 | # Make an output directory 252 | if selected_joints == range(len(joint_names)): 253 | selected_joint_names = ['Total'] 254 | else: 255 | selected_joint_names = [joint_names[s] for s in selected_joints] 256 | out = os.path.join(args.out, os.path.basename(args.predicted), 257 | '+'.join(selected_joint_names)) 258 | if not os.path.isdir(out): 259 | os.makedirs(out) 260 | 261 | # Save the results to a CSV file 262 | df.to_csv(os.path.join(out, 'hellinger_distances.csv')) 263 | 264 | # Print and save the overall distances 265 | overall_str = ['Overall Hellinger distances:'] 266 | print('Overall Hellinger distances:') 267 | for dir_key in dir_keys: 268 | overall_str.append('{}: {}'.format(dir_key, overall_dists[dir_key])) 269 | print('{: <20}'.format(dir_key), 270 | '\t{:.3f}'.format(overall_dists[dir_key])) 271 | print('') 272 | 273 | overall_str = '\n'.join(overall_str) 274 | 275 | with open(os.path.join(out, 'overall_distances.txt'), 'w') as f: 276 | f.write(overall_str) 277 | 278 | if args.visualize: 279 | # Set color and style 280 | mpl_default = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', 281 | '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', 282 | '#bcbd22', '#17becf'] 283 | sns.set(context='poster', palette=sns.color_palette(mpl_default), font_scale=1.05) 284 | sns.set_style('white', {'legend.frameon': True}) 285 | 286 | # Velocities are computed in 20fps: make them into cm/s 287 | plot_bins = [format(b, '.2f') for b in bins[:-1] * 20] 288 | 289 | # Plot speed in a range of [0, 15] 290 | plot_bins = plot_bins[:-4] 291 | 292 | # Make an output directory 293 | vis_out = os.path.join(out, 'histograms') 294 | if not os.path.isdir(vis_out): 295 | os.makedirs(vis_out) 296 | 297 | if args.match_yticks: 298 | max_percentage = int(reject_outliers(np.array(max_freqs)).max().item() * 100) 299 | 300 | tick_interval = 5 if max_percentage // 5 < 9 else 10 # Avoid too many ticks 301 | ticks = list(range(0, max_percentage, tick_interval)) 302 | 303 | for file_key in file_keys: 304 | # Plot in a range of [0, 15] 305 | original_hist = results['original'][file_key]['hist'][:-4] 306 | 307 | fig = plt.figure(figsize=(8, 5)) 308 | ax = fig.add_subplot(111) 309 | 310 | # Convert frequency to percentage 311 | gt_handle, = ax.plot(plot_bins, normalize(original_hist) * 100, color='C4') 312 | 313 | # Awesome way to create a tabular-style legend 314 | # https://stackoverflow.com/questions/25830780/tabular-legend-layout-for-matplotlib 315 | # Create a blank rectangle 316 | blank = Rectangle((0, 0), 1, 1, fc="w", fill=False, edgecolor='none', linewidth=0) 317 | 318 | # Correspond to each columns of the tabular 319 | legend_handles = [blank, gt_handle] 320 | legend_names = ['Name', 'Ground Truth'] 321 | legend_dists = ['Hell. 
Dist.', '0'.center(16)] 322 | 323 | colors = ['C1', 'C3', 'C0', 'C2'] if len(dir_keys) <= 4 else \ 324 | ['C1', 'C0', 'C6', 'C7', 'C8', 'C9', 'C5', 'C2', 'C3'] 325 | 326 | assert len(dir_keys) <= len(colors) 327 | 328 | for color, dir_key in zip(colors, dir_keys): 329 | predicted_hist = results[dir_key][file_key]['hist'][:-4] 330 | label = dir_key.split('-')[1].replace('_smooth', '*') 331 | 332 | # if 'Aud2Pose' in label: 333 | # label += ' [18]' 334 | 335 | handle, = ax.plot(plot_bins, normalize(predicted_hist) * 100, color=color) 336 | 337 | legend_handles.append(handle) 338 | legend_names.append(label) 339 | legend_dists.append('{:.3f}'.format(results[dir_key][file_key]['dist']).center(12)) 340 | 341 | # Legend will have a tabular of (rows x 3) 342 | rows = len(legend_handles) 343 | empty_label = [''] 344 | 345 | legend_handles = legend_handles + [blank] * (rows * 2) 346 | legend_labels = np.concatenate([empty_label * rows, legend_names, legend_dists]) 347 | 348 | ax.legend(legend_handles, legend_labels, 349 | ncol=3, handletextpad=0.5, columnspacing=-2.15, 350 | labelspacing=0.35) 351 | ax.set_xlabel('Speed (cm/s)') 352 | ax.set_ylabel('Frequency (%)') 353 | ax.set_xticks(np.arange(16)) 354 | ax.tick_params(pad=6) 355 | 356 | if args.match_yticks: 357 | ax.set_ylim(0, max_percentage) 358 | ax.yaxis.set_major_locator(FixedLocator(ticks)) 359 | else: 360 | ax.yaxis.set_major_locator( 361 | MaxNLocator(nbins='auto', steps=[1, 2, 2.5, 5, 10], integer=True)) 362 | 363 | plt.subplots_adjust(left=0.09, right=0.98, top=0.98, bottom=0.12) 364 | plt.savefig(os.path.join(vis_out, os.path.splitext(file_key)[0] + '_speed_histogram.pdf')) 365 | plt.show() 366 | 367 | plt.clf() 368 | plt.close() 369 | 370 | print('Results were writen in ' + out) 371 | print('') 372 | 373 | 374 | if __name__ == '__main__': 375 | main() 376 | -------------------------------------------------------------------------------- /evaluation/joints.txt: -------------------------------------------------------------------------------- 1 | Hips,Spine,Spine1,Neck,Head,Site1,LeftShoulder,LeftArm,LeftForeArm,LeftHand,LeftHandThumb1,LeftHandThumb2,LeftHandThumb3,Site2,LeftHandIndex1,LeftHandIndex2,LeftHandIndex3,Site3,LeftHandMiddle1,LeftHandMiddle2,LeftHandMiddle3,Site4,LeftHandRing1,LeftHandRing2,LeftHandRing3,Site5,LeftHandPinky1,LeftHandPinky2,LeftHandPinky3,Site6,RightShoulder,RightArm,RightForeArm,RightHand,RightHandThumb1,RightHandThumb2,RightHandThumb3,Site7,RightHandIndex1,RightHandIndex2,RightHandIndex3,Site8,RightHandMiddle1,RightHandMiddle2,RightHandMiddle3,Site9,RightHandRing1,RightHandRing2,RightHandRing3,Site10,RightHandPinky1,RightHandPinky2,RightHandPinky3,Site11,LeftUpLeg,LeftLeg,LeftFoot,LeftToeBase,Site12,RightUpLeg,RightLeg,RightFoot,RightToeBase,Site13 -------------------------------------------------------------------------------- /evaluation/plot_results.py: -------------------------------------------------------------------------------- 1 | """ 2 | Plots the experimental results after calculating motion statistics 3 | Expects that calc_distance was run before this script 4 | 5 | @author: Taras Kucherenko 6 | """ 7 | 8 | import matplotlib.pyplot as plt 9 | import csv 10 | import numpy as np 11 | 12 | def read_joint_names(filename): 13 | with open(filename, 'r') as f: 14 | org = f.read() 15 | joint_names = org.split(',') 16 | 17 | return joint_names 18 | 19 | def read_csv(filename): 20 | 21 | joint_names = read_joint_names("joints.txt") 22 | 23 | r_shoulder_index = joint_names.index("RightShoulder") + 1 24 | 
l_shoulder_index = joint_names.index("LeftShoulder") + 1 25 | 26 | r_hand_index = joint_names.index("RightHand") + 1 27 | l_hand_index = joint_names.index("LeftHand") + 1 28 | 29 | r_forearm_index = joint_names.index("RightForeArm") + 1 30 | l_forearm_index = joint_names.index("LeftForeArm") + 1 31 | 32 | x=[] 33 | y=[] 34 | total_sum = 0 35 | with open(filename, 'r') as csvfile: 36 | reader = csv.reader(csvfile, delimiter=',') 37 | next(reader, None) # skip the headers 38 | for row in reader: 39 | x.append(float(row[0]) * 20) # Scale the velocity 40 | next_val = float(row[r_hand_index]) + float(row[l_hand_index]) # float(row[-1]) #l_hand_index]) # 41 | y.append(next_val*100) 42 | total_sum+=next_val 43 | 44 | # Crop on 15 45 | if float(row[0]) * 20 >= 15: 46 | break 47 | 48 | return np.array(x), np.array(y) / total_sum 49 | 50 | def barplot_annotate_brackets(num1, num2, data, center, height, yerr=None, dh=.05, barh=.05, fs=None, maxasterix=None): 51 | """ 52 | Annotate barplot with p-values. 53 | 54 | :param num1: number of left bar to put bracket over 55 | :param num2: number of right bar to put bracket over 56 | :param data: string to write or number for generating asterixes 57 | :param center: centers of all bars (like plt.bar() input) 58 | :param height: heights of all bars (like plt.bar() input) 59 | :param yerr: yerrs of all bars (like plt.bar() input) 60 | :param dh: height offset over bar / bar + yerr in axes coordinates (0 to 1) 61 | :param barh: bar height in axes coordinates (0 to 1) 62 | :param fs: font size 63 | :param maxasterix: maximum number of asterixes to write (for very small p-values) 64 | """ 65 | 66 | text = data 67 | 68 | lx, ly = center[num1], height[num1] 69 | rx, ry = center[num2], height[num2] 70 | 71 | if yerr: 72 | ly += yerr[num1] 73 | ry += yerr[num2] 74 | 75 | ax_y0, ax_y1 = plt.gca().get_ylim() 76 | dh *= (ax_y1 - ax_y0) 77 | barh *= (ax_y1 - ax_y0) 78 | 79 | y = max(ly, ry) + dh 80 | 81 | barx = [lx, lx, rx, rx] 82 | bary = [y, y+barh, y+barh, y] 83 | mid = ((lx+rx)/2, y+barh) 84 | 85 | plt.plot(barx, bary, c='black') 86 | 87 | kwargs = dict(ha='center', va='bottom') 88 | if fs is not None: 89 | kwargs['fontsize'] = fs 90 | 91 | #plt.text(*mid, text, **kwargs) 92 | 93 | 94 | def get_average(feature_name): 95 | 96 | feature_filename = 'result/'+feature_name+'/1/hmd_' + type + '_0.05.csv' 97 | _, feature_1 = read_csv(feature_filename) 98 | feature_filename = 'result/'+feature_name+'/2/hmd_' + type + '_0.05.csv' 99 | _, feature_2 = read_csv(feature_filename) 100 | feature_filename = 'result/'+feature_name+'/3/hmd_' + type + '_0.05.csv' 101 | _, feature_3 = read_csv(feature_filename) 102 | # average 103 | feature = np.mean(np.array([feature_1, feature_2, feature_3]), axis=0) 104 | 105 | return feature 106 | 107 | 108 | plt.rcParams.update({'font.size': 36}) 109 | 110 | 111 | type = "vel" 112 | 113 | original_filename = 'result/original/hmd_'+type+'_0.05.csv' 114 | 115 | x,original = read_csv(original_filename) 116 | 117 | mfcc = get_average('MFCC') 118 | 119 | baseline = get_average('MFCC_Bas') 120 | 121 | spectr = get_average('Spectr') 122 | 123 | pros = get_average('Pros') 124 | 125 | spectr_pros = get_average('Spectr_Pros') 126 | 127 | mfcc_pros = get_average('MFCC_Pros') 128 | 129 | 130 | """baseline = [4.160, 4.940, 4.319] 131 | encoder = np.array([4.798, 4.830, 4.151]) 132 | x = np.arange(3) 133 | 134 | errorB = [0.93, 1, 1.43] 135 | errorE = [0.89, 0.98, 1.43] 136 | 137 | plt.bar(x, baseline, yerr=errorB, label='Baseline' ,width = 0.25, hatch='/') 138 
| plt.bar(x+0.25, encoder, label = 'Proposed' ,width = 0.25) 139 | 140 | special_x = np.array([0, 0.25, 0.5, 0.75]) 141 | 142 | barplot_annotate_brackets(0, 1, "p < 0.002", special_x, encoder) 143 | barplot_annotate_brackets(1, 2, "p = 0.32", special_x+0.75, encoder) 144 | barplot_annotate_brackets(1, 2, "p = 0.13", special_x+1.75, encoder) 145 | 146 | plt.xticks(np.arange(3),('Naturalness', 'Time-consistency', 'Semantic-consistency')) 147 | 148 | plt.legend(bbox_to_anchor=(0.2, 0.99), ncol=2) 149 | 150 | plt.ylim(top=6)""" 151 | 152 | 153 | 154 | 155 | 156 | #plt.plot(x,original, label='Ground Truth',linewidth=7.0)#,width = 0.25) 157 | plt.plot(x,original,linewidth=7.0, label='Ground Truth', color='Purple') 158 | plt.plot(x,spectr , label='Proposed (Spectral)',linewidth=7.0) 159 | plt.plot(x,pros , label='Proposed (Prosodic)',linewidth=7.0, color='C2') 160 | 161 | 162 | #plt.plot(x,mfcc_pros , label='MFCC+Pros',linewidth=7.0, color='Pink') 163 | #plt.plot(x,spectr_pros , label='Spectrogram+Pros',linewidth=7.0, color='C3') 164 | 165 | plt.plot(x,mfcc , label='Proposed (MFCC)',linewidth=7.0, color='C1') 166 | 167 | plt.plot(x,baseline , label='Baseline (MFCC)',linewidth=7.0, color='Blue') 168 | 169 | plt.xlabel("Velocity (cm/s)") 170 | plt.ylabel('Frequency (%)') 171 | #plt.title('Average Velocity Histogram') 172 | 173 | 174 | 175 | plt.xticks(np.arange(16))#, ('Tom', 'Dick', 'Harry', 'Sally', 'Sue')) 176 | 177 | 178 | leg = plt.legend() 179 | 180 | 181 | 182 | plt.show() 183 | -------------------------------------------------------------------------------- /example_scripts/README.md: -------------------------------------------------------------------------------- 1 | # Example Scripts 2 | 3 | This directory contains the scripts used in our experiments for training and testing different Neural Networks (NN): 4 | 1. Training and testing the baseline gesture generation NN (baseline_train_n_test.sh) 5 | 2. Training and testing the autoencoder-based gesture generation NN (proposed_train_n_test.sh) 6 | 7 | Note: prior to using these scripts a user needs to 8 | a) download and preprocess the dataset, as described in the root folder 9 | b) adjust the parameters in the `config.txt` file 10 | 11 | ### Baseline model 12 | 13 | Use `baseline_train_n_test.sh` to train and test the baseline speech-driven gesture generation neural network 14 | ```sh 15 | ./baseline_train_n_test.sh 16 | ``` 17 | The resulting model will be stored in the following file: `folder`BasedModel.hdf5 18 | The numerical evaluation will be written to the file `../results.txt` 19 | 20 | Note: `baseline_test.sh` is used in `baseline_train_n_test.sh` for testing. 21 | 22 | 23 | ### Proposed model 24 | 25 | Use `proposed_train_n_test.sh` to train and test the proposed autoencoder-based speech-driven gesture generation neural network 26 | ```sh 27 | ./proposed_train_n_test.sh 28 | ``` 29 | The resulting model will be stored in the following file: `folder`Based`enc_dim`DimModel.hdf5 30 | The numerical evaluation will be written to the file `../results.txt` 31 | 32 | Note: `proposed_test.sh` is used in `proposed_train_n_test.sh` for testing.
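If the model is already trained, a single test sequence can also be predicted by hand, mirroring what `baseline_test.sh` does inside its loop. The following is a minimal sketch only: it assumes you run it from the repository root after `source example_scripts/config.txt`, and the sequence number 1094 and all paths are placeholders that must match your own `config.txt`.

```sh
# Predict gestures for one test sequence with an already trained baseline model
mkdir -p example_scripts/gestures
CUDA_VISIBLE_DEVICES=$gpu python predict.py \
    example_scripts/models/${folder}BasedModel.hdf5 \
    $data_dir/test_inputs/X_test_audio1094.npy \
    example_scripts/gestures/gesture1094.txt

# Strip the velocity channels from the prediction before evaluation
python helpers/remove_velocity.py -g example_scripts/gestures
```

As in `baseline_test.sh`, the velocity-free gestures end up in a `no_vel` subfolder, from where they can be moved to `evaluation/data/predicted/` for the numerical evaluation described in the evaluation README.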
33 | -------------------------------------------------------------------------------- /example_scripts/baseline_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # This script is used in "baseline_train_n_test.sh" to evaluate the baseline model 4 | # You can use it by itself if the model is already trained 5 | # Several aspects need to be customized 6 | 7 | # Read parameters 8 | source config.txt 9 | 10 | model=example_scripts/models/${folder}"BasedModel" 11 | 12 | # Create a folder to store produced gesture sequences 13 | mkdir -p gestures 14 | 15 | # Remove previous results 16 | cd .. 17 | rm evaluation/data/predicted/$speech_features/* 18 | 19 | # Make predictions for all the test sequences 20 | # (replace 1094 by 1093 for the dev sequences) 21 | for seq in `seq 1094 2 1182`; 22 | do 23 | echo 24 | echo 'Predicting sequence' $seq 25 | CUDA_VISIBLE_DEVICES=$gpu python predict.py $model.hdf5 $data_dir/test_inputs/X_test_audio${seq}.npy normal_prediction$seq.txt 26 | mv normal_prediction$seq.txt example_scripts/gestures/gesture${seq}.txt 27 | done 28 | 29 | echo 'Removing the velocities ...' 30 | python helpers/remove_velocity.py -g example_scripts/gestures 31 | cd example_scripts/gestures 32 | 33 | # Remove gestures with velocities 34 | rm *.txt 35 | 36 | # Move gestures without velocities to the corresponding folder 37 | mkdir -p ../../evaluation/data/predicted/$speech_features/ 38 | mv no_vel/*.txt ../../evaluation/data/predicted/$speech_features/ 39 | cd ../../evaluation 40 | 41 | # In order for the evaluation to be correct, ONLY ground truth motion 3D coords in txt format for the 42 | # same sequences as used in the script above (1094, 1096,...) have to be in evaluation/data/original 43 | # If evaluation/data/original contains all the sequences (1093,1094...) the results will be wrong 44 | # See the "evaluation" folder for info on how to transform the true gestures from bvh to txt format 45 | 46 | echo 'Evaluating ...' 47 | echo "Evaluating "${model}" ..."
>> ../results.txt 48 | python calc_errors.py -g $speech_features -m ape >> ../results.txt 49 | python calc_errors.py -g $speech_features -m mae >> ../results.txt 50 | python calc_jerk.py -g $speech_features -m acceleration >> ../results.txt 51 | python calc_jerk.py -g $speech_features >> ../results.txt 52 | -------------------------------------------------------------------------------- /example_scripts/baseline_train_n_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # This script can be used to train a speech-gesture neural network 4 | # You might need to customize it using config.txt file 5 | 6 | # (Optional) Activate your virtual env 7 | source activate CondaEnvPy3Tf 8 | 9 | # Read the parameters for the scripts 10 | source config.txt 11 | 12 | model=${folder}"BasedModel" 13 | 14 | echo "Training "${model}" on the ${folder} folder" 15 | START=$(date +%s) 16 | 17 | # Train baseline model 18 | CUDA_VISIBLE_DEVICES=$gpu python ../train.py models/$model.hdf5 100 $data_dir $numb_in_features False 19 | 20 | Tr_FINISH=$(date +%s) 21 | 22 | # Evaluate the model 23 | echo "Testing "${model}" model" >> ../results.txt 24 | ./baseline_test.sh 25 | 26 | # Compress and save the results 27 | archive=${model}Results.tar 28 | echo "Compressing the results:" 29 | tar -czvf $archive ../evaluation/data/predicted/$speech_features/*.txt 30 | echo "The results were compressed into example_scripts/"$archive 31 | 32 | END=$(date +%s) 33 | DIFF=$(( $END - $START )) 34 | echo "The whole cicle took $[DIFF/60] minutes" 35 | 36 | DIFF=$(( $Tr_FINISH - $START )) 37 | echo "Learning speech-motion mapping took $[DIFF/60] minutes" 38 | 39 | echo "The model was saved in "example_scripts/models/${model}".hdf5" 40 | -------------------------------------------------------------------------------- /example_scripts/config.txt: -------------------------------------------------------------------------------- 1 | gpu=0 # which GPU to use to run the model 2 | folder=TheLAtest # which folder with the data to use 3 | data_dir=/home/taras/Documents/Datasets/SpeechToMotion/Japanese/$folder 4 | speech_features=MFCC # which speech features to use 5 | numb_in_features=26 # how many speech features there are 6 | dim=325 # what is the dimensionality of representation 7 | -------------------------------------------------------------------------------- /example_scripts/proposed_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # This script is used in "proposed_train_n_test.sh" to evaluate the proposed model 4 | # You call use it by itself if the model is already trained 5 | # Several aspects needs to be customized at config.txt 6 | 7 | # Read the parameters for the scripts 8 | source config.txt 9 | 10 | model=example_scripts/models/${folder}"Based"${dim}"DimModel" 11 | 12 | # Create a folder to store produced gesture sequences 13 | mkdir -p gestures 14 | 15 | # Remove previous results 16 | cd .. 
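# (On the very first run the predicted-gestures directory may not exist yet; it is created further down with mkdir -p, so the rm below may simply report that there is nothing to remove.)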
17 | rm evaluation/data/predicted/$speech_features/*
18 |
19 | # Make predictions for all the test sequences
20 | # (replace 1094 by 1093 for the dev sequences)
21 | for seq in `seq 1094 2 1182`;
22 | do
23 | echo
24 | echo 'Predicting sequence' $seq
25 | # Step 1: Predict representation
26 | CUDA_VISIBLE_DEVICES=$gpu python predict.py $model.hdf5 $data_dir/test_inputs/X_test_audio${seq}.npy enc_${dim}_prediction$seq.txt
27 | mv enc_${dim}_prediction$seq.txt motion_repr_learning/ae/
28 | cd motion_repr_learning/ae/
29 | # Step 2: Decode representation into motion
30 | CUDA_VISIBLE_DEVICES=$gpu python decode.py $data_dir enc_${dim}_prediction${seq}.txt ../../example_scripts/gestures/gesture${seq}.txt -restore=True -pretrain=False -layer1_width=$dim -chkpt_dir='/home/taras/tmp/MoCap/'$dim -batch_size=8
31 | # Remove encoded prediction
32 | rm enc_${dim}_pred*
33 | cd ../..
34 | done
35 |
36 | echo 'Removing the velocities ...'
37 | python helpers/remove_velocity.py -g example_scripts/gestures
38 | cd example_scripts/gestures
39 |
40 | # Remove gestures that still contain velocities
41 | rm *.txt
42 |
43 | # Move gestures without velocities to the corresponding folder
44 | mkdir -p ../../evaluation/data/predicted/$speech_features/
45 | mv no_vel/*.txt ../../evaluation/data/predicted/$speech_features/
46 | cd ../../evaluation
47 |
48 | # For the evaluation to be correct, ONLY the ground-truth motion 3D coordinates in txt format for the
49 | # same sequences as used in the loop above (1094, 1096, ...) have to be in evaluation/data/original;
50 | # if evaluation/data/original contains all the sequences (1093, 1094, ...) the results will be wrong.
51 | # See the "evaluation" folder for info on how to transform the true gestures from bvh to txt format.
52 |
53 | echo 'Evaluating ...'
54 | echo "Evaluating "${model}" ..."
>> ../results.txt 55 | python calc_errors.py -g $speech_features -m ape >> ../results.txt 56 | python calc_errors.py -g $speech_features -m mae >> ../results.txt 57 | python calc_jerk.py -g $speech_features >> ../results.txt 58 | python calc_jerk.py -g $speech_features -m acceleration >> ../results.txt 59 | # Where to store the results can be customized 60 | -------------------------------------------------------------------------------- /example_scripts/proposed_train_n_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # This script contain both training and testing 4 | # of the autoencoder based gesture generation neural network 5 | # You might need to customize it using config.txt file 6 | 7 | # (Optional) Activate your virtual env 8 | source activate CondaEnvPy3Tf 9 | 10 | # Read the parameters for the scripts 11 | source config.txt 12 | 13 | model=${folder}"Based"${dim}"DimModel" 14 | 15 | echo "Training "${model} 16 | 17 | # Do timing 18 | START=$(date +%s) 19 | 20 | cd ../motion_repr_learning/ae/ 21 | 22 | # Create a folder for the encoded dataset 23 | mkdir -p $data_dir/325 24 | 25 | # Learn dataset encoding 26 | CUDA_VISIBLE_DEVICES=$gpu python learn_dataset_encoding.py $data_dir -chkpt_dir='/home/taras/tmp/MoCap/'$dim -layer1_width=$dim 27 | 28 | #Encode dataset 29 | echo "Encoding the dataset" 30 | CUDA_VISIBLE_DEVICES=$gpu python encode_dataset.py $data_dir -chkpt_dir='/home/taras/tmp/MoCap/'$dim -restore=True -pretrain=False -layer1_width=$dim 31 | 32 | # Copy input data 33 | Encoding=$(date +%s) 34 | 35 | cd ../../example_scripts 36 | 37 | Tr_START=$(date +%s) 38 | 39 | # Train model on the reprentation 40 | CUDA_VISIBLE_DEVICES=$gpu python ../train.py models/$model.hdf5 100 $data_dir $numb_in_features True $dim 41 | 42 | Tr_FINISH=$(date +%s) 43 | 44 | # Evaluate the model 45 | ./proposed_test.sh 46 | 47 | # Compress and save the results 48 | archive=${model}Results.tar 49 | echo "Compressing the results:" 50 | tar -czvf $archive ../evaluation/data/predicted/$speech_features/*.txt 51 | echo "The results were compressed into example_scripts/models/"$archive 52 | 53 | END=$(date +%s) 54 | DIFF=$(( $END - $START )) 55 | echo "The whole cicle took $[DIFF/60] minutes" 56 | 57 | DIFF=$(( $Encoding - $START )) 58 | echo "Learning repr. and encoding took $[DIFF/60] minutes" 59 | 60 | DIFF=$(( $Tr_FINISH - $Tr_START )) 61 | echo "Learning speech-motion mapping took $[DIFF/60] minutes" 62 | -------------------------------------------------------------------------------- /helpers/README.md: -------------------------------------------------------------------------------- 1 | # How to use the helper scripts 2 | 3 | This directory provides data handling scripts for our gesture generation framework. It provides the following functionality: 4 | - Velocity remover for predicted gestures 5 | 6 | (The neural network outputs coordinates and velocities to regularize training and we remove velocities as postprocessing) 7 | - Original gesture converter to create the ground truth 8 | 9 | (Converting the original motion for joint angles space in .bvh format to 3d coordinates in txt coordinates) 10 | - Temporal filters for motion smoothing 11 | 12 | (Can be applied as postprocessing. Were not used in the experiments from the paper) 13 | 14 | ## Data preparation 15 | 1. Run `../predict.py` to predict gestures from speech audio as described in the root folder. 16 | 2. Put the predicted gestures (e.g. 
`predict_1094.txt, ...`) into a directory, say, `your_prediction_dir/`. 17 | 18 | ### Velocity remover 19 | 20 | `remove_velocity.py` removes velocities from raw predicted gestures. This produces gesture files containing `(x, y, z) x 64 joints = 192` white space separated data for each line. 21 | **You have to remove the velocities before using the evaluation scripts or the animation server.** 22 | 23 | ```sh 24 | # Remove velocities 25 | python remove_velocity.py -g your_prediction_dir 26 | ``` 27 | The resulting files will be stored in the subfolder: `your_prediction_dir/no_vel` 28 | 29 | ### Original gesture converter 30 | 31 | `convert_original.py` converts `.bvh` files in the test set to ground truth body keypoint positions. **You need the ground truth for the quantitative evaluation.** 32 | 33 | ```sh 34 | # Convert test bvh to ground truth 35 | python convert_original.py 36 | ``` 37 | 38 | Note: `convert_original.py` assumes that the `.bvh` files are stored in `../data/test/labels/` by default. You can use `--data` or `-d` option to specify a different directory. You can specify the output directory by `--out` or `-o` option (default: `../evaluation/data/original/`). 39 | 40 | ### Temporal filters 41 | 42 | We support two types of temporal filters, 1€ filter and Simple Moving Average (SMA) filter, to smooth gesture motion. 43 | 44 | To apply filters, you can use `apply_filters.py`. 45 | You can change the averaging window size for SMA filter by `--window` or `-w` option (default: 5). 46 | 47 | ```sh 48 | # Apply temporal filters 49 | python apply_filters.py -g your_prediction_dir -w 5 50 | ``` 51 | 52 | Note: `apply_filters.py` produces three types of smoothed gestures (1€, SMA, and 1€ + SMA). The smoothed gestures will be stored in `euro/`, `sma/`, and `euro_sma/` subfolders of `your_prediction_dir/`. 53 | -------------------------------------------------------------------------------- /helpers/apply_filters.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Apply smoothing filters as postprocessing 4 | 5 | @author: kaneko.naoshi 6 | """ 7 | 8 | 9 | import argparse 10 | import glob 11 | import os 12 | 13 | import numpy as np 14 | 15 | from filters.ma_filter import simple_moving_average 16 | from filters.one_euro_filter import apply_one_euro 17 | 18 | 19 | def save_positions(out_dir, gesture_name, positions): 20 | """Save body keypoint positions into file 21 | 22 | Args: 23 | out_dir: output directory 24 | gesture_name: basename of the output file 25 | positions: keypoint positions to save 26 | """ 27 | 28 | filename = os.path.join(out_dir, gesture_name + '.txt') 29 | np.savetxt(filename, positions, fmt='%s') 30 | 31 | 32 | def main(): 33 | parser = argparse.ArgumentParser( 34 | description='Apply filters to the generated gestures') 35 | parser.add_argument('--gesture', '-g', required=True, 36 | help='Path to the gesture directory to filter') 37 | parser.add_argument('--window', '-w', type=int, default=5, 38 | help='Windows size for moving average (must be odd)') 39 | args = parser.parse_args() 40 | 41 | print('Apply temporal filters to the ' 42 | 'gestures in "{}"'.format(args.gesture)) 43 | print('') 44 | 45 | # List of gesture files 46 | txt_paths = sorted(glob.glob(os.path.join(args.gesture, '*.txt'))) 47 | 48 | # Check file existence 49 | if not txt_paths: 50 | raise ValueError('Could not find the gesture files in "{}". ' 51 | 'Please specify correct folder as --gesture flag.' 
52 | .format(args.gesture)) 53 | 54 | # Filter types 55 | types = { 56 | 'euro': 'euro', 57 | 'sma': 'sma{}'.format(args.window), 58 | 'euro_sma': 'euro_sma{}'.format(args.window)} 59 | 60 | # Make output directories 61 | euro_dir = os.path.join(args.gesture, types['euro']) 62 | sma_dir = os.path.join(args.gesture, types['sma']) 63 | euro_sma_dir = os.path.join(args.gesture, types['euro_sma']) 64 | for d in [euro_dir, sma_dir, euro_sma_dir]: 65 | if not os.path.isdir(d): 66 | os.makedirs(d) 67 | 68 | for txt_path in txt_paths: 69 | print('Process "{}"'.format(txt_path)) 70 | 71 | raw_pos = np.loadtxt(txt_path) 72 | 73 | # One Euro filter 74 | euro_pos = apply_one_euro(raw_pos) 75 | 76 | # Moving average filter 77 | sma_pos = simple_moving_average(raw_pos, args.window) 78 | 79 | # Combined 80 | euro_sma_pos = simple_moving_average(euro_pos, args.window) 81 | 82 | gesture_name, _ = os.path.splitext(os.path.basename(txt_path)) 83 | save_positions(euro_dir, gesture_name, euro_pos) 84 | save_positions(sma_dir, gesture_name, sma_pos) 85 | save_positions(euro_sma_dir, gesture_name, euro_sma_pos) 86 | 87 | print('') 88 | print('Results were written under "{}"'.format(args.gesture)) 89 | print('') 90 | 91 | 92 | if __name__ == '__main__': 93 | main() 94 | -------------------------------------------------------------------------------- /helpers/convert_original.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Convert ground truth gestures from joint angles in bvh format to the 3d coordinates in text format 4 | 5 | @author: kaneko.naoshi 6 | """ 7 | 8 | 9 | import argparse 10 | import glob 11 | import os 12 | 13 | import numpy as np 14 | import pyquaternion as pyq 15 | 16 | 17 | def create_hierarchy_nodes(filename): 18 | """Load bvh hierarchy nodes 19 | 20 | Args: 21 | filename: name of the hierarchy file 22 | 23 | Returns: 24 | nodes: bvh hierarchy nodes 25 | """ 26 | 27 | # Read BVH hierarchy 28 | with open(filename, 'r') as f: 29 | hierarchy = f.readlines() 30 | 31 | joint_offsets = [] 32 | joint_names = [] 33 | 34 | for idx, line in enumerate(hierarchy): 35 | hierarchy[idx] = hierarchy[idx].split() 36 | 37 | if not len(hierarchy[idx]) == 0: 38 | line_type = hierarchy[idx][0] 39 | if line_type == 'OFFSET': 40 | offset = np.array([float(hierarchy[idx][1]), 41 | float(hierarchy[idx][2]), 42 | float(hierarchy[idx][3])]) 43 | joint_offsets.append(offset) 44 | elif line_type == 'ROOT' or line_type == 'JOINT': 45 | joint_names.append(hierarchy[idx][1]) 46 | elif line_type == 'End': 47 | joint_names.append('End Site') 48 | 49 | nodes = [] 50 | for idx, name in enumerate(joint_names): 51 | if idx == 0: 52 | parent = None 53 | elif idx in [6, 30]: # spine1->shoulders 54 | parent = 2 55 | elif idx in [14, 18, 22, 26]: # lefthand->leftfingers 56 | parent = 9 57 | elif idx in [38, 42, 46, 50]: # righthand->rightfingers 58 | parent = 33 59 | elif idx in [54, 59]: # hip->legs 60 | parent = 0 61 | else: 62 | parent = idx - 1 63 | 64 | if name == 'End Site': 65 | children = None 66 | elif idx == 0: # hips 67 | children = [1, 54, 59] 68 | elif idx == 2: # spine1 69 | children = [3, 6, 30] 70 | elif idx == 9: # lefthand 71 | children = [10, 14, 18, 22, 26] 72 | elif idx == 33: # righthand 73 | children = [34, 38, 42, 46, 50] 74 | else: 75 | children = [idx + 1] 76 | 77 | node = dict([('name', name), ('parent', parent), 78 | ('children', children), ('offset', joint_offsets[idx]), 79 | ('rel_degs', None), ('abs_qt', None), 80 | ('rel_pos', 
None), ('abs_pos', None)]) 81 | if idx == 0: 82 | node['rel_pos'] = node['abs_pos'] = [float(0), float(60), float(0)] 83 | node['abs_qt'] = pyq.Quaternion() 84 | nodes.append(node) 85 | 86 | return nodes 87 | 88 | 89 | def load_bvh(filename): 90 | """Load bvh motion frames 91 | 92 | Args: 93 | filename: bvh filename 94 | 95 | Returns: 96 | frames: list of bvh frames 97 | """ 98 | 99 | with open(filename, 'r') as f: 100 | frames = f.readlines() 101 | frametime = frames[310].split()[2] 102 | 103 | del frames[0:311] 104 | bvh_len = len(frames) 105 | 106 | for idx, line in enumerate(frames): 107 | frames[idx] = [float(x) for x in line.split()] 108 | 109 | for i in range(0, bvh_len): 110 | for j in range(0, 306 // 3): 111 | st = j * 3 112 | del frames[i][st:st + 3] 113 | 114 | # If data is approx 24fps, cut it to 20 fps (del every sixth line) 115 | # If data is 100fps, cut it to 20 fps (take every fifth line) 116 | if float(frametime) == 0.0416667: 117 | del frames[::6] 118 | elif float(frametime) == 0.010000: 119 | frames = frames[::5] 120 | else: 121 | print('Unsupported fps {} in {}'.format(frametime, filename)) 122 | 123 | return frames 124 | 125 | 126 | def rotation_to_position(frames, nodes): 127 | """Convert bvh frames to body keypoint positions 128 | 129 | Args: 130 | frames: bvh frames 131 | nodes: bvh hierarchy nodes 132 | 133 | Returns: 134 | out_data: array containing body keypoint positions 135 | """ 136 | 137 | output_lines = [] 138 | 139 | for frame in frames: 140 | node_idx = 0 141 | for i in range(51): 142 | stepi = i * 3 143 | z_deg = float(frame[stepi]) 144 | x_deg = float(frame[stepi + 1]) 145 | y_deg = float(frame[stepi + 2]) 146 | 147 | if nodes[node_idx]['name'] == 'End Site': 148 | node_idx = node_idx + 1 149 | nodes[node_idx]['rel_degs'] = [z_deg, x_deg, y_deg] 150 | current_node = nodes[node_idx] 151 | 152 | node_idx = node_idx + 1 153 | 154 | for start_node in nodes: 155 | abs_pos = np.array([0, 60, 0]) 156 | current_node = start_node 157 | if start_node['children'] is not None: 158 | for child_idx in start_node['children']: 159 | child_node = nodes[child_idx] 160 | 161 | child_offset = np.array(child_node['offset']) 162 | 163 | qz = pyq.Quaternion(axis=[0, 0, 1], 164 | degrees=start_node['rel_degs'][0]) 165 | qx = pyq.Quaternion(axis=[1, 0, 0], 166 | degrees=start_node['rel_degs'][1]) 167 | qy = pyq.Quaternion(axis=[0, 1, 0], 168 | degrees=start_node['rel_degs'][2]) 169 | qrot = qz * qx * qy 170 | offset_rotated = qrot.rotate(child_offset) 171 | child_node['rel_pos'] = start_node['abs_qt'].rotate( 172 | offset_rotated) 173 | 174 | child_node['abs_qt'] = start_node['abs_qt'] * qrot 175 | 176 | while current_node['parent'] is not None: 177 | abs_pos = abs_pos + current_node['rel_pos'] 178 | current_node = nodes[current_node['parent']] 179 | start_node['abs_pos'] = abs_pos 180 | 181 | line = [] 182 | for node in nodes: 183 | line.append(node['abs_pos']) 184 | output_lines.append(line) 185 | 186 | output_array = np.asarray(output_lines) 187 | out_data = np.empty([len(output_array), 192]) 188 | for idx, line in enumerate(output_array): 189 | out_data[idx] = line.flatten() 190 | 191 | return out_data 192 | 193 | 194 | def main(): 195 | parser = argparse.ArgumentParser( 196 | description='Convert original motion data into joint positions') 197 | parser.add_argument('--data', '-d', default='../data/test/labels', 198 | help='Path to the original test motion data directory') 199 | parser.add_argument('--out', '-o', default='../evaluation/data/original', 200 | help='Directory 
to store the resultant position files') 201 | args = parser.parse_args() 202 | 203 | print('Convert original gestures to the ground truth') 204 | if args.data != parser.get_default('data'): 205 | print('Warning: non-default original gesture directory is given: ' 206 | + args.data) 207 | print('') 208 | 209 | # List of bvh files 210 | bvh_paths = sorted(glob.glob(os.path.join(args.data, '*.bvh'))) 211 | 212 | # Check file existence 213 | if not bvh_paths: 214 | raise ValueError( 215 | 'Could not find the ground truth bvh files in "{}". ' 216 | 'Please specify correct folder as --data flag.'.format(args.data)) 217 | 218 | # Read bvh hierarchy 219 | nodes = create_hierarchy_nodes('../hierarchy.txt') 220 | 221 | # Make output directories 222 | if not os.path.isdir(args.out): 223 | os.makedirs(args.out) 224 | 225 | for bvh_path in bvh_paths: 226 | print('Process "{}"'.format(bvh_path)) 227 | frames = load_bvh(bvh_path) 228 | 229 | out_data = rotation_to_position(frames, nodes) 230 | gesture_name, _ = os.path.splitext(os.path.basename(bvh_path)) 231 | out_path = os.path.join(args.out, gesture_name + '.txt') 232 | np.savetxt(out_path, out_data, fmt='%s') 233 | 234 | print('') 235 | print('Results were written in "{}"'.format(args.out)) 236 | print('') 237 | 238 | 239 | if __name__ == '__main__': 240 | main() 241 | -------------------------------------------------------------------------------- /helpers/filters/__pycache__/ma_filter.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/genea-workshop/Speech_driven_gesture_generation_with_autoencoder/59e8ebdb0f6e87b0e81268046c99a4d6c9bf62a8/helpers/filters/__pycache__/ma_filter.cpython-35.pyc -------------------------------------------------------------------------------- /helpers/filters/__pycache__/one_euro_filter.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/genea-workshop/Speech_driven_gesture_generation_with_autoencoder/59e8ebdb0f6e87b0e81268046c99a4d6c9bf62a8/helpers/filters/__pycache__/one_euro_filter.cpython-35.pyc -------------------------------------------------------------------------------- /helpers/filters/ma_filter.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def simple_moving_average(pos_array, winlen): 5 | """Apply simple moving average filter to a gesture 6 | 7 | Args: 8 | pos_array: body keypoint positions to filter 9 | winlen: averaging window size (must be odd) 10 | Returns: 11 | np.ndarray: filtered positions 12 | """ 13 | 14 | pos_columns = [] 15 | winlen_oneside = int((winlen - 1) / 2) 16 | for i in range(len(pos_array[0])): 17 | line = [] 18 | for j in range(len(pos_array)): 19 | line.append(pos_array[j][i]) 20 | pos_columns.append(line) 21 | 22 | res_list = [] 23 | for i, joint in enumerate(pos_columns): 24 | line = [] 25 | for j in range(len(pos_columns[i])): 26 | start_idx = j - winlen_oneside 27 | end_idx = j + winlen_oneside + 1 28 | if start_idx < 0: 29 | line.append(np.mean(pos_columns[i][:end_idx])) 30 | elif end_idx > len(pos_columns[i]): 31 | line.append(np.mean(pos_columns[i][start_idx:])) 32 | else: 33 | line.append(np.mean(pos_columns[i][start_idx:end_idx])) 34 | res_list.append(line) 35 | 36 | res_array = np.array(res_list) 37 | 38 | return res_array.transpose() 39 | -------------------------------------------------------------------------------- 
/helpers/filters/one_euro_filter.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # OneEuroFilter.py - 4 | # 5 | # Author: Nicolas Roussel (nicolas.roussel@inria.fr) 6 | 7 | import math 8 | import numpy as np 9 | 10 | 11 | # ---------------------------------------------------------------------------- 12 | 13 | class LowPassFilter(object): 14 | 15 | def __init__(self, alpha): 16 | self.__setAlpha(alpha) 17 | self.__y = self.__s = None 18 | 19 | def __setAlpha(self, alpha): 20 | alpha = float(alpha) 21 | if alpha <= 0 or alpha > 1.0: 22 | raise ValueError("alpha (%s) should be in (0.0, 1.0]" % alpha) 23 | self.__alpha = alpha 24 | 25 | def __call__(self, value, timestamp=None, alpha=None): 26 | if alpha is not None: 27 | self.__setAlpha(alpha) 28 | if self.__y is None: 29 | s = value 30 | else: 31 | s = self.__alpha * value + (1.0 - self.__alpha) * self.__s 32 | self.__y = value 33 | self.__s = s 34 | return s 35 | 36 | def lastValue(self): 37 | return self.__y 38 | 39 | # ---------------------------------------------------------------------------- 40 | 41 | 42 | class OneEuroFilter(object): 43 | 44 | def __init__(self, freq, mincutoff=1.0, beta=0.0, dcutoff=1.0): 45 | if freq <= 0: 46 | raise ValueError("freq should be >0") 47 | if mincutoff <= 0: 48 | raise ValueError("mincutoff should be >0") 49 | if dcutoff <= 0: 50 | raise ValueError("dcutoff should be >0") 51 | self.__freq = float(freq) 52 | self.__mincutoff = float(mincutoff) 53 | self.__beta = float(beta) 54 | self.__dcutoff = float(dcutoff) 55 | self.__x = LowPassFilter(self.__alpha(self.__mincutoff)) 56 | self.__dx = LowPassFilter(self.__alpha(self.__dcutoff)) 57 | self.__lasttime = None 58 | 59 | def __alpha(self, cutoff): 60 | te = 1.0 / self.__freq 61 | tau = 1.0 / (2 * math.pi * cutoff) 62 | return 1.0 / (1.0 + tau / te) 63 | 64 | def __call__(self, x, timestamp=None): 65 | # ---- update the sampling frequency based on timestamps 66 | if self.__lasttime and timestamp: 67 | self.__freq = 1.0 / (timestamp - self.__lasttime) 68 | self.__lasttime = timestamp 69 | # ---- estimate the current variation per second 70 | prev_x = self.__x.lastValue() 71 | dx = 0.0 if prev_x is None else (x - prev_x) * self.__freq # FIXME: 0.0 or value? 
# noqa 72 | edx = self.__dx(dx, timestamp, alpha=self.__alpha(self.__dcutoff)) 73 | # ---- use it to update the cutoff frequency 74 | cutoff = self.__mincutoff + self.__beta * math.fabs(edx) 75 | # ---- filter the given value 76 | return self.__x(x, timestamp, alpha=self.__alpha(cutoff)) 77 | 78 | # ---------------------------------------------------------------------------- 79 | 80 | 81 | def apply_one_euro(pos_array): 82 | """Apply one euro filter to a gesture 83 | 84 | Original implementation can be downloaded from 85 | http://cristal.univ-lille.fr/~casiez/1euro/ 86 | 87 | Args: 88 | pos_array: body keypoint positions to filter 89 | Returns: 90 | np.ndarray: filtered positions 91 | """ 92 | 93 | pos_along_timestep = pos_array.transpose() 94 | 95 | config = { 96 | 'freq': 20, # Hz 97 | 'mincutoff': 0.1, # Minimum cutoff frequency 98 | 'beta': 0.08, # Cutoff slope 99 | 'dcutoff': 1.0 # Cutoff frequency for derivate 100 | } 101 | 102 | oef = OneEuroFilter(**config) 103 | 104 | filtered_pos = [] 105 | for i, joint in enumerate(pos_along_timestep): 106 | joint_pos = [] 107 | for timestep, pos in enumerate(joint): 108 | if timestep > 0: 109 | timestep = timestep * 1.0 / config["freq"] 110 | filt_num = oef(pos, timestep) 111 | joint_pos.append(filt_num) 112 | filtered_pos.append(joint_pos) 113 | 114 | filtered_pos_array = np.array(filtered_pos) 115 | 116 | return filtered_pos_array.transpose() 117 | -------------------------------------------------------------------------------- /helpers/remove_velocity.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Remove velocity from the network output 4 | (it produces both coordinates and velocities while we need only velocities) 5 | 6 | @author: kaneko.naoshi 7 | """ 8 | 9 | import argparse 10 | import glob 11 | import os 12 | 13 | import numpy as np 14 | 15 | 16 | def save_positions(out_dir, gesture_name, positions): 17 | """Save body keypoint positions into file 18 | 19 | Args: 20 | out_dir: output directory 21 | gesture_name: basename of the output file 22 | positions: keypoint positions to save 23 | """ 24 | 25 | filename = os.path.join(out_dir, gesture_name + '.txt') 26 | np.savetxt(filename, positions, fmt='%s') 27 | 28 | 29 | def remove_velocity(data, dim=3): 30 | """Remove velocity values from raw prediction data 31 | 32 | Args: 33 | data: array containing both position and velocity values 34 | dim: gesture dimensionality 35 | 36 | Returns: 37 | np.ndarray: array containing only position values 38 | """ 39 | 40 | starts = np.arange(0, data.shape[1], dim * 2) 41 | stops = np.arange(dim, data.shape[1], dim * 2) 42 | return np.hstack([data[:, i:j] for i, j in zip(starts, stops)]) 43 | 44 | 45 | def main(): 46 | parser = argparse.ArgumentParser( 47 | description='Remove velocity values from the raw generated gestures') 48 | parser.add_argument('--gesture', '-g', required=True, 49 | help='Path to the raw gesture directory') 50 | args = parser.parse_args() 51 | 52 | print('Remove velocities from the ' 53 | 'gestures in "{}"'.format(args.gesture)) 54 | print('') 55 | 56 | # List of gesture files 57 | txt_paths = sorted(glob.glob(os.path.join(args.gesture, '*.txt'))) 58 | 59 | # Check file existence 60 | if not txt_paths: 61 | raise ValueError('Could not find the gesture files in "{}". ' 62 | 'Please specify correct folder as --gesture flag.' 
63 | .format(args.gesture)) 64 | 65 | # Make output directory 66 | out_dir = os.path.join(args.gesture, 'no_vel') 67 | if not os.path.isdir(out_dir): 68 | os.makedirs(out_dir) 69 | 70 | for txt_path in txt_paths: 71 | print('Process "{}"'.format(txt_path)) 72 | 73 | pos_vel = np.loadtxt(txt_path) 74 | 75 | # Remove velocity values 76 | only_pos = remove_velocity(pos_vel) 77 | 78 | gesture_name, _ = os.path.splitext(os.path.basename(txt_path)) 79 | save_positions(out_dir, gesture_name, only_pos) 80 | 81 | print('') 82 | print('Results were written in "{}"'.format(out_dir)) 83 | print('') 84 | 85 | 86 | if __name__ == '__main__': 87 | main() 88 | -------------------------------------------------------------------------------- /hierarchy.txt: -------------------------------------------------------------------------------- 1 | HIERARCHY 2 | ROOT Hips 3 | { 4 | OFFSET 0.000000 60.000000 0.000000 5 | CHANNELS 3 Zrotation Xrotation Yrotation 6 | JOINT Spine 7 | { 8 | OFFSET 0.000000 4.744019 0.000000 9 | CHANNELS 3 Zrotation Xrotation Yrotation 10 | JOINT Spine1 11 | { 12 | OFFSET 0.000000 11.747704 0.000000 13 | CHANNELS 3 Zrotation Xrotation Yrotation 14 | JOINT Neck 15 | { 16 | OFFSET 0.000000 11.699501 -1.063590 17 | CHANNELS 3 Zrotation Xrotation Yrotation 18 | JOINT Head 19 | { 20 | OFFSET 0.000000 8.099556 1.157080 21 | CHANNELS 3 Zrotation Xrotation Yrotation 22 | End Site 23 | { 24 | OFFSET 0.000000 7.463501 0.000000 25 | } 26 | } 27 | } 28 | JOINT LeftShoulder 29 | { 30 | OFFSET -2.323960 10.457596 0.333555 31 | CHANNELS 3 Zrotation Xrotation Yrotation 32 | JOINT LeftArm 33 | { 34 | OFFSET -7.102620 0.000000 0.000000 35 | CHANNELS 3 Zrotation Xrotation Yrotation 36 | JOINT LeftForeArm 37 | { 38 | OFFSET -15.301900 0.000000 0.000000 39 | CHANNELS 3 Zrotation Xrotation Yrotation 40 | JOINT LeftHand 41 | { 42 | OFFSET -17.165703 0.000000 0.000000 43 | CHANNELS 3 Zrotation Xrotation Yrotation 44 | JOINT LeftHandThumb1 45 | { 46 | OFFSET -1.446360 -0.867805 -2.892700 47 | CHANNELS 3 Zrotation Xrotation Yrotation 48 | JOINT LeftHandThumb2 49 | { 50 | OFFSET -1.735620 0.000000 0.000000 51 | CHANNELS 3 Zrotation Xrotation Yrotation 52 | JOINT LeftHandThumb3 53 | { 54 | OFFSET -1.735620 0.000000 0.000000 55 | CHANNELS 3 Zrotation Xrotation Yrotation 56 | End Site 57 | { 58 | OFFSET -1.673540 0.000000 0.000000 59 | } 60 | } 61 | } 62 | } 63 | JOINT LeftHandIndex1 64 | { 65 | OFFSET -7.345020 0.000000 -2.024890 66 | CHANNELS 3 Zrotation Xrotation Yrotation 67 | JOINT LeftHandIndex2 68 | { 69 | OFFSET -2.892700 0.000000 0.000000 70 | CHANNELS 3 Zrotation Xrotation Yrotation 71 | JOINT LeftHandIndex3 72 | { 73 | OFFSET -1.446362 0.000000 0.000000 74 | CHANNELS 3 Zrotation Xrotation Yrotation 75 | End Site 76 | { 77 | OFFSET -1.394619 0.000000 0.000000 78 | } 79 | } 80 | } 81 | } 82 | JOINT LeftHandMiddle1 83 | { 84 | OFFSET -7.345009 0.000000 -0.671109 85 | CHANNELS 3 Zrotation Xrotation Yrotation 86 | JOINT LeftHandMiddle2 87 | { 88 | OFFSET -3.181961 0.000000 0.000000 89 | CHANNELS 3 Zrotation Xrotation Yrotation 90 | JOINT LeftHandMiddle3 91 | { 92 | OFFSET -1.735611 0.000000 0.000000 93 | CHANNELS 3 Zrotation Xrotation Yrotation 94 | End Site 95 | { 96 | OFFSET -1.673538 0.000000 0.000000 97 | } 98 | } 99 | } 100 | } 101 | JOINT LeftHandRing1 102 | { 103 | OFFSET -5.666491 0.000000 0.671104 104 | CHANNELS 3 Zrotation Xrotation Yrotation 105 | JOINT LeftHandRing2 106 | { 107 | OFFSET -2.892691 0.000000 0.000000 108 | CHANNELS 3 Zrotation Xrotation Yrotation 109 | JOINT LeftHandRing3 110 | { 111 | 
OFFSET -1.446339 0.000000 0.000000 112 | CHANNELS 3 Zrotation Xrotation Yrotation 113 | End Site 114 | { 115 | OFFSET -1.394619 0.000000 0.000000 116 | } 117 | } 118 | } 119 | } 120 | JOINT LeftHandPinky1 121 | { 122 | OFFSET -3.987949 0.000000 2.024890 123 | CHANNELS 3 Zrotation Xrotation Yrotation 124 | JOINT LeftHandPinky2 125 | { 126 | OFFSET -2.314140 0.000000 0.000000 127 | CHANNELS 3 Zrotation Xrotation Yrotation 128 | JOINT LeftHandPinky3 129 | { 130 | OFFSET -1.157070 0.000000 0.000000 131 | CHANNELS 3 Zrotation Xrotation Yrotation 132 | End Site 133 | { 134 | OFFSET -1.115688 0.000000 0.000000 135 | } 136 | } 137 | } 138 | } 139 | } 140 | } 141 | } 142 | } 143 | JOINT RightShoulder 144 | { 145 | OFFSET 2.286459 10.457596 0.333558 146 | CHANNELS 3 Zrotation Xrotation Yrotation 147 | JOINT RightArm 148 | { 149 | OFFSET 7.102619 0.000000 0.000000 150 | CHANNELS 3 Zrotation Xrotation Yrotation 151 | JOINT RightForeArm 152 | { 153 | OFFSET 15.301899 0.000000 0.000000 154 | CHANNELS 3 Zrotation Xrotation Yrotation 155 | JOINT RightHand 156 | { 157 | OFFSET 17.165699 0.000000 0.000000 158 | CHANNELS 3 Zrotation Xrotation Yrotation 159 | JOINT RightHandThumb1 160 | { 161 | OFFSET 1.446362 -0.867805 -2.892700 162 | CHANNELS 3 Zrotation Xrotation Yrotation 163 | JOINT RightHandThumb2 164 | { 165 | OFFSET 1.735611 0.000000 0.000000 166 | CHANNELS 3 Zrotation Xrotation Yrotation 167 | JOINT RightHandThumb3 168 | { 169 | OFFSET 1.735619 0.000000 0.000000 170 | CHANNELS 3 Zrotation Xrotation Yrotation 171 | End Site 172 | { 173 | OFFSET 1.673542 0.000000 0.000000 174 | } 175 | } 176 | } 177 | } 178 | JOINT RightHandIndex1 179 | { 180 | OFFSET 7.345032 0.000000 -2.024890 181 | CHANNELS 3 Zrotation Xrotation Yrotation 182 | JOINT RightHandIndex2 183 | { 184 | OFFSET 2.892723 0.000000 0.000000 185 | CHANNELS 3 Zrotation Xrotation Yrotation 186 | JOINT RightHandIndex3 187 | { 188 | OFFSET 1.446350 0.000000 0.000000 189 | CHANNELS 3 Zrotation Xrotation Yrotation 190 | End Site 191 | { 192 | OFFSET 1.394623 0.000000 0.000000 193 | } 194 | } 195 | } 196 | } 197 | JOINT RightHandMiddle1 198 | { 199 | OFFSET 7.345032 0.000000 -0.671109 200 | CHANNELS 3 Zrotation Xrotation Yrotation 201 | JOINT RightHandMiddle2 202 | { 203 | OFFSET 3.181969 0.000000 0.000000 204 | CHANNELS 3 Zrotation Xrotation Yrotation 205 | JOINT RightHandMiddle3 206 | { 207 | OFFSET 1.735611 0.000000 0.000000 208 | CHANNELS 3 Zrotation Xrotation Yrotation 209 | End Site 210 | { 211 | OFFSET 1.673538 0.000000 0.000000 212 | } 213 | } 214 | } 215 | } 216 | JOINT RightHandRing1 217 | { 218 | OFFSET 5.666489 0.000000 0.671106 219 | CHANNELS 3 Zrotation Xrotation Yrotation 220 | JOINT RightHandRing2 221 | { 222 | OFFSET 2.892708 0.000000 0.000000 223 | CHANNELS 3 Zrotation Xrotation Yrotation 224 | JOINT RightHandRing3 225 | { 226 | OFFSET 1.446358 0.000000 0.000000 227 | CHANNELS 3 Zrotation Xrotation Yrotation 228 | End Site 229 | { 230 | OFFSET 1.394623 0.000000 0.000000 231 | } 232 | } 233 | } 234 | } 235 | JOINT RightHandPinky1 236 | { 237 | OFFSET 3.987961 0.000000 2.024890 238 | CHANNELS 3 Zrotation Xrotation Yrotation 239 | JOINT RightHandPinky2 240 | { 241 | OFFSET 2.314171 0.000000 0.000000 242 | CHANNELS 3 Zrotation Xrotation Yrotation 243 | JOINT RightHandPinky3 244 | { 245 | OFFSET 1.157082 0.000000 0.000000 246 | CHANNELS 3 Zrotation Xrotation Yrotation 247 | End Site 248 | { 249 | OFFSET 1.115692 0.000000 0.000000 250 | } 251 | } 252 | } 253 | } 254 | } 255 | } 256 | } 257 | } 258 | } 259 | } 260 | JOINT LeftUpLeg 261 | { 
262 | OFFSET -5.785400 0.000000 0.000000 263 | CHANNELS 3 Zrotation Xrotation Yrotation 264 | JOINT LeftLeg 265 | { 266 | OFFSET 0.000000 -30.002701 0.000000 267 | CHANNELS 3 Zrotation Xrotation Yrotation 268 | JOINT LeftFoot 269 | { 270 | OFFSET 0.000000 -22.702000 0.000000 271 | CHANNELS 3 Zrotation Xrotation Yrotation 272 | JOINT LeftToeBase 273 | { 274 | OFFSET 0.000000 -3.760510 -8.678090 275 | CHANNELS 3 Zrotation Xrotation Yrotation 276 | End Site 277 | { 278 | OFFSET 0.000000 0.000000 -2.314159 279 | } 280 | } 281 | } 282 | } 283 | } 284 | JOINT RightUpLeg 285 | { 286 | OFFSET 5.785400 0.000000 0.000000 287 | CHANNELS 3 Zrotation Xrotation Yrotation 288 | JOINT RightLeg 289 | { 290 | OFFSET 0.000000 -30.002701 0.000000 291 | CHANNELS 3 Zrotation Xrotation Yrotation 292 | JOINT RightFoot 293 | { 294 | OFFSET 0.000000 -22.702101 0.000000 295 | CHANNELS 3 Zrotation Xrotation Yrotation 296 | JOINT RightToeBase 297 | { 298 | OFFSET 0.000000 -3.760510 -8.678100 299 | CHANNELS 3 Zrotation Xrotation Yrotation 300 | End Site 301 | { 302 | OFFSET 0.000000 0.000000 -2.314159 303 | } 304 | } 305 | } 306 | } 307 | } 308 | } 309 | 310 | -------------------------------------------------------------------------------- /motion_repr_learning/README.md: -------------------------------------------------------------------------------- 1 | # Motion Representation Learning 2 | 3 | This is a folder for learning a compact and informative representation of the human motion sequence. 4 | 5 | ## The main idea 6 | The aim is to learn a better representation of the motion frames using an auto-encoding neural networks, such as Denoising Autoencoder or Variational Autoencoder. 7 | 8 | Encoding (MotionE) is a mapping from a sequence of the 3D positions of the human to a lower-dimensional representation, which will contain enough information to reconstruct original motion sequence, but will have less redundancy and hence will be better for the speech-to-motion mapping. 9 | Decoding (MotionD) is a mapping from the encoded vector back to the 3D motion sequence. 10 | 11 | Once a motion encoder MotionE and a motion decoder MotionD are learned, we train a novel encoder network SpeechE to map from speech to a corresponding low-dimensional motion representation (code for this mapping is given in the main folder of this repository). 12 | 13 | At test time, the speech encoder and the motion decoder networks are combined: SpeechE predicts motion representations based on a given speech signal and MotionD then decodes these representations to produce motion sequences. 14 | 15 | ## Data preparation 16 | 17 | 1. Follow the instruction on data preparation at the root folder of this repository. 18 | 2. Indicate the directory for the data at utils/flags.py as "data_dir" value. 19 | 3. Indicate the directory to the checkpoints (will be used to store the model) at utils/flags as "chkpt_dir" value. 20 | 21 | ## Run 22 | To run the default example execute the following command. 23 | 24 | ```bash 25 | # Learn dataset encoding 26 | python learn_dataset_encoding.py DATA_DIR motion -chkpt_dir=CHKPT_DIR -layer1_width=DIM 27 | 28 | #Encode dataset 29 | python encode_dataset.py DATA_DIR motion -chkpt_dir=CHKPT_DIR -restore=True -pretrain=False -layer1_width=DIM 30 | ``` 31 | 32 | Where DATA_DIR is a directory where the data is stored, CHKPT_DIR is a directory to store the model checkpoints and DIM is dimensionality of the representation. 
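Once the autoencoder is trained, an encoded motion file can be mapped back to 3D joint positions with `decode.py`. The call below is a sketch that mirrors the invocation used in `example_scripts/proposed_test.sh`; the file names are placeholders, and the flags must match the values used during training:

```bash
# Decode an encoded prediction back into a 3D motion sequence
python decode.py DATA_DIR ENCODED_FILE DECODED_FILE -restore=True -pretrain=False -layer1_width=DIM -chkpt_dir=CHKPT_DIR -batch_size=8
```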
33 | 34 | 35 | ## Customizing 36 | You can play around with the run options, including the neural net size and shape, dropout, learning rates, etc. in the file flags.py. 37 | -------------------------------------------------------------------------------- /motion_repr_learning/ae/DAE.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file contains an implementation of a particular type of AE, 3 | namely Denoising Autoendoder. 4 | 5 | To be used in the files learn_dataset_encoding and train.py 6 | 7 | Developed by Taras Kucherenko (tarask@kth.se) 8 | """ 9 | 10 | from __future__ import division 11 | from __future__ import print_function 12 | 13 | import tensorflow as tf 14 | import numpy as np 15 | 16 | from utils.utils import add_noise, loss_reconstruction 17 | from utils.flags import FLAGS 18 | 19 | 20 | class DAE: 21 | """ Denoising Autoendoder (DAE) 22 | 23 | More details about the network in the original paper: 24 | http://www.jmlr.org/papers/v11/vincent10a.html 25 | 26 | The user specifies the structure of this network 27 | by specifying number of inputs, the number of hidden 28 | units for each layer and the number of final outputs. 29 | All this information is set in the utils/flags.py file. 30 | 31 | The number of input neurons is defined as a frame_size*chunk_length, 32 | since it will take a time-window as an input 33 | 34 | """ 35 | 36 | def __init__(self, shape, sess, variance_coef, data_info): 37 | """DAE initializer 38 | 39 | Args: 40 | shape: list of ints specifying 41 | num input, hidden1 units,...hidden_n units, num outputs 42 | sess: tensorflow session object to use 43 | varience_coef: multiplicative factor for the variance of noise wrt the variance of data 44 | data_info: key information about the dataset 45 | """ 46 | 47 | self.__shape = shape # [input_dim,hidden1_dim,...,hidden_n_dim,output_dim] 48 | self.__variables = {} 49 | self.__sess = sess 50 | 51 | self.num_hidden_layers = np.size(shape) - 2 52 | 53 | self.batch_size = FLAGS.batch_size 54 | self.sequence_length = FLAGS.chunk_length 55 | 56 | self.scaling_factor = 1 57 | 58 | # maximal value and mean pose in the dataset (used for scaling it to interval [-1,1] and back) 59 | self.max_val = data_info.max_val 60 | self.mean_pose = data_info.mean_pose 61 | 62 | 63 | #################### Add the DATASETS to the GRAPH ############### 64 | 65 | #### 1 - TRAIN ### 66 | self._train_data_initializer = tf.placeholder(dtype=tf.float32, 67 | shape=data_info.train_shape) 68 | self._train_data = tf.Variable(self._train_data_initializer, 69 | trainable=False, collections=[], name='Train_data') 70 | train_epochs = FLAGS.training_epochs + FLAGS.pretraining_epochs * FLAGS.num_hidden_layers 71 | train_frames = tf.train.slice_input_producer([self._train_data], num_epochs=train_epochs) 72 | self._train_batch = tf.train.shuffle_batch(train_frames, 73 | batch_size=FLAGS.batch_size, capacity=5000, 74 | min_after_dequeue=1000, name='Train_batch') 75 | 76 | #### 2 - VALIDATE, can be used as TEST ### 77 | # When optimizing - this dataset stores as a validation dataset, 78 | # when testing - this dataset stores a test dataset 79 | self._valid_data_initializer = tf.placeholder(dtype=tf.float32, 80 | shape=data_info.eval_shape) 81 | self._valid_data = tf.Variable(self._valid_data_initializer, 82 | trainable=False, collections=[], name='Valid_data') 83 | valid_frames = tf.train.slice_input_producer([self._valid_data], 84 | num_epochs=FLAGS.training_epochs) 85 | self._valid_batch = 
tf.train.shuffle_batch(valid_frames, 86 | batch_size=FLAGS.batch_size, capacity=5000, 87 | min_after_dequeue=1000, name='Valid_batch') 88 | 89 | if FLAGS.weight_decay is not None: 90 | print('\nWe apply weight decay') 91 | 92 | ### Specify tensorflow setup ### 93 | with sess.graph.as_default(): 94 | 95 | ############## SETUP VARIABLES ###################### 96 | 97 | with tf.variable_scope("AE_Variables"): 98 | 99 | for i in range(self.num_hidden_layers + 1): # go over layers 100 | 101 | # create variables for matrices and biases for each layer 102 | self._create_variables(i, FLAGS.weight_decay) 103 | 104 | ############## DEFINE THE NETWORK ################## 105 | 106 | ''' 1 - Setup network for TRAINing ''' 107 | # Input noisy data and reconstruct the original one 108 | # as in Denoising AutoEncoder 109 | self._input_ = add_noise(self._train_batch, variance_coef, data_info.data_sigma) 110 | self._target_ = self._train_batch 111 | 112 | # Define output and loss for the training data 113 | self._output, _, _ = self.construct_graph(self._input_, FLAGS.dropout) 114 | self._reconstruction_loss = loss_reconstruction(self._output, 115 | self._target_, self.max_val) 116 | tf.add_to_collection('losses', self._reconstruction_loss) # add weight decay loses 117 | self._loss = tf.add_n(tf.get_collection('losses'), name='total_loss') 118 | 119 | ''' 2 - Setup network for TESTing ''' 120 | self._valid_input_ = self._valid_batch 121 | self._valid_target_ = self._valid_batch 122 | 123 | # Define output (no dropout) 124 | self._valid_output, self._encode, self._decode = \ 125 | self.construct_graph(self._valid_input_, 1) 126 | 127 | # Define loss 128 | self._valid_loss = loss_reconstruction(self._valid_output, 129 | self._valid_target_, self.max_val) 130 | @property 131 | def session(self): 132 | """ Interface for the session""" 133 | return self.__sess 134 | 135 | @property 136 | def shape(self): 137 | """ Interface for the shape""" 138 | return self.__shape 139 | 140 | # Make more comfortable interface to the network weights 141 | 142 | def _w(self, n, suffix=""): 143 | return self["matrix"+str(n)+suffix] 144 | 145 | def _b(self, n, suffix=""): 146 | return self["bias"+str(n)+suffix] 147 | 148 | @staticmethod 149 | def _feedforward(x, w, b): 150 | """ 151 | Traditional feedforward layer: multiply on weight matrix, add bias vector 152 | and apply activation function 153 | 154 | Args: 155 | x: input ( usually - batch of vectors) 156 | w: matrix to be multiplied on 157 | b: bias to be added 158 | 159 | Returns: 160 | y: result of applying this feedforward layer 161 | """ 162 | 163 | y = tf.tanh(tf.nn.bias_add(tf.matmul(x, w), b)) 164 | return y 165 | 166 | def construct_graph(self, input_seq_pl, dropout): 167 | 168 | """ Construct a TensorFlow graph for the AutoEncoding network 169 | 170 | Args: 171 | input_seq_pl: tf placeholder for input data: size [batch_size, sequence_length * DoF] 172 | dropout: how much of the input neurons will be activated, value in range [0,1] 173 | Returns: 174 | output: output tensor: result of running input placeholder through the network 175 | middle_layer: tensor which is encoding input placeholder into a representation 176 | decoding: tensor which is decoding a representation back into the input vector 177 | """ 178 | 179 | network_input = input_seq_pl 180 | 181 | curr_layer = tf.reshape(network_input, [self.batch_size, 182 | FLAGS.chunk_length * FLAGS.frame_size]) 183 | 184 | numb_layers = self.num_hidden_layers + 1 185 | 186 | with tf.name_scope("Joint_run"): 187 | 188 | # 
Pass through the network 189 | for i in range(numb_layers): 190 | 191 | if i == FLAGS.middle_layer: 192 | # Save middle layer 193 | with tf.name_scope('middle_layer'): 194 | middle_layer = tf.identity(curr_layer) 195 | 196 | with tf.name_scope('hidden'+str(i)): 197 | 198 | # First - Apply Dropout 199 | curr_layer = tf.nn.dropout(curr_layer, dropout) 200 | 201 | w = self._w(i + 1) 202 | b = self._b(i + 1) 203 | 204 | curr_layer = self._feedforward(curr_layer, w, b) 205 | 206 | output = curr_layer 207 | 208 | # Now create a decoding network 209 | 210 | with tf.name_scope("Decoding"): 211 | 212 | layer = self._representation = tf.placeholder\ 213 | (dtype=tf.float32, shape=middle_layer.get_shape().as_list(), name="Respres.") 214 | 215 | for i in range(FLAGS.middle_layer, numb_layers): 216 | 217 | with tf.name_scope('hidden' + str(i)): 218 | 219 | # First - Apply Dropout 220 | layer = tf.nn.dropout(layer, dropout) 221 | 222 | w = self._w(i + 1) 223 | b = self._b(i + 1) 224 | 225 | layer = self._feedforward(layer, w, b) 226 | 227 | decoding = layer 228 | 229 | return output, middle_layer, decoding 230 | 231 | def __getitem__(self, item): 232 | """Get AutoEncoder tf variable 233 | 234 | Returns the specified variable created by this object. 235 | Names are weights#, biases#, biases#_out, weights#_fixed, 236 | biases#_fixed. 237 | 238 | Args: 239 | item: string, variables internal name 240 | Returns: 241 | Tensorflow variable 242 | """ 243 | return self.__variables[item] 244 | 245 | def __setitem__(self, key, value): 246 | """Store a TensorFlow variable 247 | 248 | NOTE: Don't call this explicitly. It should 249 | be used only internally when setting up 250 | variables. 251 | 252 | Args: 253 | key: string, name of variable 254 | value: tensorflow variable 255 | """ 256 | self.__variables[key] = value 257 | 258 | def _create_variables(self, i, wd): 259 | """Helper to create an initialized Variable with weight decay. 260 | Note that the Variable is initialized with a truncated normal distribution. 261 | A weight decay is added only if 'wd' is specified. 262 | If 'wd' is None, weight decay is not added for this Variable. 263 | 264 | This function was taken from the web 265 | 266 | Args: 267 | i: number of hidden layer 268 | wd: add L2Loss weight decay multiplied by this float. 
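Note: in this implementation the weights are actually drawn from a uniform distribution in [-a, a] with a = 2 * sqrt(6 / (fan_in + fan_out)) (a scaled Glorot-style range); see the initialization code below.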
269 | Returns:
270 | Nothing
271 | """
272 |
273 | # Initialize Train weights
274 | w_shape = (self.__shape[i], self.__shape[i + 1])
275 | a = tf.multiply(2.0, tf.sqrt(6.0 / (w_shape[0] + w_shape[1])))
276 | name_w = "matrix"+str(i + 1)
277 | self[name_w] = tf.get_variable("Variables/"+name_w,
278 | initializer=tf.random_uniform(w_shape, -1 * a, a))
279 |
280 | # Add weight to the loss function for weight decay
281 | if wd is not None:
282 | weight_decay = tf.multiply(tf.nn.l2_loss(self[name_w]), wd, name='wgt_'+str(i)+'_loss')
283 | tf.add_to_collection('losses', weight_decay)
284 |
285 | # Add the histogram summary
286 | tf.summary.histogram(name_w, self[name_w])
287 |
288 | # Initialize Train biases
289 | name_b = "bias"+str(i + 1)
290 | b_shape = (self.__shape[i + 1],)
291 | self[name_b] = tf.get_variable("Variables/"+name_b, initializer=tf.zeros(b_shape))
292 |
293 | if i < self.num_hidden_layers:
294 | # Hidden layer pretrained weights
295 | # which are used after pretraining before fine-tuning
296 | self[name_w + "_pretr"] = tf.get_variable(name="Var/" + name_w + "_pretr", initializer=
297 | tf.random_uniform(w_shape, -1 * a, a),
298 | trainable=False)
299 | # Hidden layer pretrained biases
300 | self[name_b + "_pretr"] = tf.get_variable("Var/"+name_b+"_pretr", trainable=False,
301 | initializer=tf.zeros(b_shape))
302 |
303 | # Pretraining output training biases
304 | name_b_out = "bias" + str(i+1) + "_out"
305 | b_shape = (self.__shape[i],)
306 | b_init = tf.zeros(b_shape)
307 | self[name_b_out] = tf.get_variable(name="Var/"+name_b_out, initializer=b_init,
308 | trainable=True)
309 |
310 | def run_less_layers(self, input_pl, n, is_target=False):
311 | """Return the result of the net after n layers or n-1 layers (if is_target is true)
312 | This function will be used for the layer-wise pretraining of the AE
313 | Args:
314 | input_pl: TensorFlow placeholder of AE inputs
315 | n: int specifying pretrain step
316 | is_target: bool specifying if the required tensor
317 | should be the target tensor,
318 | meaning if we should run n layers or n-1 (if is_target)
319 | Returns:
320 | Tensor giving pretraining net result or pretraining target
321 | """
322 | assert n > 0
323 | assert n <= self.num_hidden_layers
324 |
325 | last_output = input_pl
326 |
327 | for i in range(n - 1):
328 | w = self._w(i + 1, "_pretr")
329 | b = self._b(i + 1, "_pretr")
330 |
331 | last_output = self._feedforward(last_output, w, b)
332 |
333 | if is_target:
334 | return last_output
335 |
336 | last_output = self._feedforward(last_output, self._w(n), self._b(n))
337 |
338 | out = self._feedforward(last_output, self._w(n), self["bias" + str(n) + "_out"])
339 |
340 | return out
341 | -------------------------------------------------------------------------------- /motion_repr_learning/ae/decode.py: --------------------------------------------------------------------------------
1 | """
2 | This file contains a usage script that decodes encoded motion representations back into 3D motion.
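Usage example: python decode.py DATA_DIR ENCODED_FILE OUTPUT_FILE
(DATA_DIR is the dataset folder, ENCODED_FILE an encoded prediction, and OUTPUT_FILE where the decoded 3D motion is written; the -chkpt_dir and -layer1_width flags must match training, as in example_scripts/proposed_test.sh.)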
3 | Developed by Taras Kucherenko (tarask@kth.se) 4 | """ 5 | 6 | import train as tr 7 | import utils.data as dt 8 | import utils.flags as fl 9 | from learn_dataset_encoding import create_nn, prepare_motion_data 10 | 11 | import numpy as np 12 | 13 | import sys 14 | 15 | DATA_DIR = sys.argv[1] 16 | TEST_FILE = sys.argv[2] 17 | OUTPUT_FILE = sys.argv[3] 18 | 19 | if __name__ == '__main__': 20 | 21 | # Get the data 22 | Y_train_normalized, Y_train, Y_test_normalized, Y_test, Y_dev_normalized, max_val, mean_pose = prepare_motion_data(DATA_DIR) 23 | 24 | # Train the network 25 | nn = create_nn(Y_train_normalized, Y_dev_normalized, max_val, mean_pose, restoring=True) 26 | 27 | # Read the encoding 28 | encoding = np.loadtxt(TEST_FILE) 29 | 30 | print(encoding.shape) 31 | 32 | # Decode it 33 | decoding = tr.decode(nn, encoding) 34 | 35 | print(decoding.shape) 36 | 37 | np.savetxt(OUTPUT_FILE, decoding, delimiter = ' ') 38 | 39 | # Close Tf session 40 | nn.session.close() 41 | -------------------------------------------------------------------------------- /motion_repr_learning/ae/encode_dataset.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file contains a script for encoding motion dataset. 3 | 4 | Usage example: python encode_dataset.py data_dir 5 | 6 | Developed by Taras Kucherenko (tarask@kth.se) 7 | """ 8 | 9 | import sys 10 | import numpy as np 11 | 12 | import train as tr 13 | import utils.flags as fl 14 | from learn_dataset_encoding import create_nn, prepare_motion_data, check_params, os 15 | 16 | if __name__ == '__main__': 17 | 18 | # Check the parameters 19 | check_params() 20 | 21 | DATA_DIR = sys.argv[1] 22 | 23 | # Additional check 24 | if not os.path.exists(DATA_DIR+"/"+str(fl.FLAGS.layer1_width)): 25 | raise ValueError( 26 | 'Path to the dataset encoding ({}) does not exist!\nPlease, create a folder {} in the DATA_DIR directory' 27 | ''.format(DATA_DIR+"/"+str(fl.FLAGS.layer1_width), str(fl.FLAGS.layer1_width))) 28 | 29 | # Get the data 30 | train_normalized_data, train_data, test_normalized_data, test_data, dev_normalized_data, \ 31 | max_val, mean_pose = prepare_motion_data(DATA_DIR) 32 | 33 | # Restore the network 34 | nn = create_nn(train_normalized_data, dev_normalized_data, max_val, mean_pose, restoring=True) 35 | 36 | debug = 0 37 | 38 | # For debug - shorten the dataset 39 | if debug: 40 | train_normalized_data = train_normalized_data[:12000] 41 | 42 | """ Encode the train data """ 43 | 44 | # Encode it 45 | encoded_train_data = tr.encode(nn, train_normalized_data) 46 | 47 | # And save into file 48 | np.save(DATA_DIR+"/"+str(fl.FLAGS.layer1_width)+"/Y_train_encoded.npy", encoded_train_data) 49 | 50 | if debug: 51 | print(train_normalized_data.shape) 52 | print(encoded_train_data.shape) 53 | 54 | # Decode train 55 | decoded = tr.decode(nn, encoded_train_data) 56 | print(decoded.shape) 57 | 58 | # Reshape back to the frames 59 | decoded = np.reshape(decoded, (-1, fl.FLAGS.frame_size)) 60 | 61 | # And calculate an error 62 | 63 | size = min(train_normalized_data.shape[0], decoded.shape[0]) 64 | error = decoded[:size] - train_data[:size] 65 | rmse = np.sqrt(np.mean(error**2)) 66 | 67 | print("AE Train Error is ", rmse) 68 | 69 | """ Encode the test data """ 70 | 71 | # Encode it 72 | encoded_test_data = tr.encode(nn, test_normalized_data) 73 | 74 | # And save into files 75 | np.save(DATA_DIR+"/"+str(fl.FLAGS.layer1_width)+"/Y_test_encoded.npy", encoded_test_data) 76 | 77 | if debug: 78 | # Decode test 79 | decoded = 
tr.decode(nn, encoded_test_data) 80 | 81 | # Reshape back to the frames 82 | decoded = np.reshape(decoded, (-1, fl.FLAGS.frame_size)) 83 | 84 | size = min(test_normalized_data.shape[0], decoded.shape[0]) 85 | error = decoded[:size] - test_data[:size] 86 | rmse = np.sqrt(np.mean(error**2)) 87 | 88 | print("AE Test Error is ", rmse) 89 | 90 | """ Encode the dev data """ 91 | 92 | # Encode it 93 | encoded_dev_data = tr.encode(nn, dev_normalized_data) 94 | 95 | # And save into files 96 | np.save(DATA_DIR+"/"+str(fl.FLAGS.layer1_width)+"/Y_dev_encoded.npy", encoded_dev_data) 97 | -------------------------------------------------------------------------------- /motion_repr_learning/ae/learn_dataset_encoding.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file contains a script for learning encoding-decoding network 3 | on our dataset. 4 | 5 | Usage example: python learn_dataset_encoding.py data_dir 6 | 7 | Developed by Taras Kucherenko (tarask@kth.se) 8 | """ 9 | 10 | import sys 11 | import numpy as np 12 | import os 13 | 14 | import train as tr 15 | from utils.utils import prepare_motion_data, DataSet, DataSets, fl 16 | 17 | def create_nn(train_data, dev_data, max_val, mean_pose, restoring): 18 | """ 19 | Train or restore a neural network 20 | Args: 21 | train_data: training dataset normalized to the values [-1,1] 22 | dev_data: dev dataset normalized to the values [-1,1] 23 | max_val: maximal values in the dataset 24 | mean_pose: mean pose of the dataset 25 | restoring: weather we are going to just restore already trained model 26 | Returns: 27 | nn: neural network, which is ready to use 28 | """ 29 | 30 | # Create DataSet object 31 | 32 | data = DataSets() 33 | 34 | data.train = DataSet(train_data, fl.FLAGS.batch_size) 35 | data.test = DataSet(dev_data, fl.FLAGS.batch_size) 36 | 37 | # Assign variance 38 | data.train.sigma = np.std(train_data, axis=(0, 1)) 39 | 40 | # Create information about the dataset 41 | data_info = tr.DataInfo(data.train.sigma, data.train._sequences.shape, 42 | data.test._sequences.shape, max_val, mean_pose) 43 | 44 | # Set "restore" flag 45 | fl.FLAGS.restore = restoring 46 | 47 | # Train the network 48 | nn = tr.learning(data, data_info, just_restore=restoring) 49 | 50 | return nn 51 | 52 | def check_params(): 53 | 54 | # Check if script get enough parameters 55 | if len(sys.argv)<2: 56 | raise ValueError('Not enough paramters! 
\nUsage : python '+sys.argv[0].split("/")[-1]+' DATA_DIR') 57 | 58 | # Check if the dataset exists 59 | if not os.path.exists(sys.argv[1]): 60 | raise ValueError('Path to the dataset ({}) does not exist!\nPlease, provide correct DATA_DIR as a script parameter' 61 | ''.format(sys.argv[1])) 62 | 63 | # Check if the flags were set properly 64 | 65 | if not os.path.exists(fl.FLAGS.chkpt_dir): 66 | raise ValueError('Path to the checkpoints ({}) does not exit!\nChange the "chkpt_dir" flag in utils/flags.py' 67 | ''.format(fl.FLAGS.chkpt_dir)) 68 | 69 | if __name__ == '__main__': 70 | 71 | # Check parameters 72 | check_params() 73 | 74 | # Get the data 75 | DATA_DIR = sys.argv[1] 76 | train_normalized_data, train_data, test_normalized_data, test_data, dev_normalized_data, \ 77 | max_val, mean_pose = prepare_motion_data(DATA_DIR) 78 | 79 | # Train an AE network 80 | nn = create_nn(train_normalized_data, dev_normalized_data, max_val, mean_pose, restoring=False) -------------------------------------------------------------------------------- /motion_repr_learning/ae/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/genea-workshop/Speech_driven_gesture_generation_with_autoencoder/59e8ebdb0f6e87b0e81268046c99a4d6c9bf62a8/motion_repr_learning/ae/utils/__init__.py -------------------------------------------------------------------------------- /motion_repr_learning/ae/utils/data.py: -------------------------------------------------------------------------------- 1 | """Functions for downloading,reading and preprocessing CMU data.""" 2 | 3 | import sys 4 | import os 5 | 6 | #sys.path.append('/home/taras/Desktop/Work/Code/Git/MotionCleaning/BVH_format/parser') 7 | #from reader import MyReader 8 | 9 | import matplotlib.pyplot as plt 10 | from mpl_toolkits.mplot3d import Axes3D 11 | import numpy as np 12 | from six.moves import xrange 13 | 14 | import utils.flags as fl 15 | 16 | class DataSet(object): 17 | ''' 18 | A class for storing a dataset and all important information, 19 | which might be needed during training, 20 | such as batch size amount of epochs completed and so on. 
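next_batch() returns consecutive slices of batch_size sequences and reshuffles the whole dataset once an epoch is completed.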
21 | ''' 22 | 23 | 24 | def __init__(self, sequences, batch_size): 25 | self._batch_size = batch_size 26 | self._sequences = sequences # all the sequnces in the dataset 27 | self._num_sequences = sequences.shape[0] 28 | self._epochs_completed = 0 29 | self._index_in_epoch = 0 30 | 31 | @property 32 | def sequences(self): 33 | return self._sequences 34 | 35 | @property 36 | def num_sequences(self): 37 | return self._num_sequences 38 | 39 | @property 40 | def epochs_completed(self): 41 | return self._epochs_completed 42 | 43 | def next_batch(self): 44 | """Return the next batch of sequences from this data set.""" 45 | batch_numb = self._index_in_epoch 46 | self._index_in_epoch += self._batch_size 47 | if self._index_in_epoch > self._num_chunks: 48 | # Finished epoch 49 | self._epochs_completed += 1 50 | # Shuffle the data 51 | perm = np.arange(self._num_sequences) 52 | np.random.shuffle(perm) 53 | self._sequences = self._sequences[perm] 54 | # Start next epoch 55 | batch_numb = 0 56 | self._index_in_epoch = self._batch_size 57 | return self._sequences[batch_numb:batch_numb + self._batch_size:1, :] 58 | 59 | 60 | class DataSets(object): 61 | ''' 62 | A class for storing Train and Eval datasets and all related information, 63 | ''' 64 | pass 65 | 66 | 67 | def read_bvh_file(fileName, test=False): 68 | """ 69 | Reads a file from CMU MoCap dataset in BVH format 70 | 71 | Returns: 72 | sequence [sequence_length,frame_size] - local chanells transformed to the hips-centered coordinates 73 | hips [frame_size] - coordinates of the hips 74 | 75 | """ 76 | 77 | # Read the data 78 | reader = MyReader(fileName); 79 | reader.read(); 80 | sequence = np.array(reader.points) 81 | 82 | # Translate to the hips-center coordinate system 83 | hips = sequence[:,:,0] 84 | sequence = sequence - hips[:,:,np.newaxis] 85 | 86 | # This is a visualization for debug 87 | '''fig = plt.figure() 88 | ax = fig.add_subplot(111, projection='3d') 89 | treshhold = 22 # to show legs in a different color 90 | # use 10 to color only the spine, 16 - spine and right hand, 22 - spine and both arms, 27 - all except left leg, 32 - all 91 | time_step = 10 92 | ax.scatter(sequence[time_step ][2][0:treshhold],sequence[time_step ][0][0:treshhold], sequence[time_step ][1][0:treshhold], 93 | c='r', marker='o') 94 | ax.scatter(sequence[time_step ][2][treshhold:], sequence[time_step ][0][treshhold:], sequence[time_step ][1][treshhold:], 95 | c='g', marker='o')''' 96 | plt.show() 97 | 98 | # Transpose the last 2 dimensions 99 | sequence = np.transpose(sequence, axes = (0,2,1)) 100 | 101 | #Flaten all the coords into one vector [T,3,m] -> [T,3m] 102 | return np.reshape(sequence,(sequence.shape[0],sequence.shape[1]*sequence.shape[2])),hips 103 | 104 | def read_a_folder(curr_dir): 105 | chunk_length = fl.FLAGS.chunk_length 106 | stride = fl.FLAGS.chunking_stride 107 | 108 | data = np.array([]) 109 | 110 | for filename in os.listdir(curr_dir): 111 | curr_sequence,_ = read_bvh_file(curr_dir + '/' + filename) 112 | 113 | # Split sequence into chunks 114 | curr_chunks = np.array([curr_sequence[i:i + chunk_length, :] for i in 115 | xrange(0, len(curr_sequence) - chunk_length, stride)]) 116 | 117 | if curr_chunks.shape[0] > 0: 118 | # Concatanate curr chunks to all of them 119 | data = np.vstack([data, curr_chunks]) if data.size else np.array(curr_chunks) 120 | 121 | print(data.shape) 122 | 123 | data = np.array(data) 124 | 125 | return data 126 | 127 | def read_unlabeled_data(train_dir, evaluate): 128 | """ 129 | Reads all 3 datasets from CMU MoCap 
127 | def read_unlabeled_data(train_dir, evaluate): 128 | """ 129 | Reads all 3 datasets from the CMU MoCap dataset in BVH format 130 | 131 | Args: 132 | train_dir - address to the train, dev and eval datasets 133 | evaluate - flag : whether we want to evaluate a network or we just optimize parameters 134 | Returns: 135 | datasets - object of class DataSets, containing Train and Eval datasets 136 | max_val - maximal value in the raw data ( for post-processing) 137 | mean_pose - mean pose in the raw data ( for post-processing) 138 | """ 139 | 140 | data_sets = DataSets() 141 | 142 | # Get constants from the file 143 | data_dir = fl.FLAGS.data_dir 144 | chunk_length = fl.FLAGS.chunk_length 145 | stride = fl.FLAGS.chunking_stride 146 | 147 | if stride > chunk_length: 148 | print( 149 | 'ERROR! \nYou have a stride bigger than the length of the chunks. ' 150 | 'Please, change those values in flags.py, so that you don\'t ignore the data') 151 | exit(0) 152 | 153 | # ######### Get TRAIN data ########### 154 | print('\nReading train data from the following folder ... ', data_dir + '/train/labels') 155 | 156 | train_data = read_a_folder(data_dir + '/train/labels') 157 | 158 | [amount_of_train_strings, seq_length, DoF] = train_data.shape 159 | print('\n' + str(amount_of_train_strings) + ' sequences with length ' + str( 160 | seq_length) + ' will be used for training') 161 | 162 | # ######### Get TEST data ########### 163 | 164 | if evaluate: 165 | print('\nReading test data from the following folder : ', data_dir + '/eval/labels') 166 | test_data = read_a_folder(data_dir + '/eval/labels') 167 | else: 168 | print('\nReading test data from the following folder : ', data_dir + '/dev/labels') 169 | test_data = read_a_folder(data_dir + '/dev/labels') 170 | 171 | [amount_of_test_strings, seq_length, DoF] = test_data.shape 172 | print('\n' + str(amount_of_test_strings) + ' sequences with length ' 173 | + str(seq_length) + ' will be used for testing') 174 | 175 | # Do mean normalization : subtract mean pose 176 | mean_pose = train_data.mean(axis=(0, 1)) 177 | train_data = train_data - mean_pose[np.newaxis, np.newaxis, :] 178 | test_data = test_data - mean_pose[np.newaxis, np.newaxis, :] 179 | 180 | # Scales all values in the input_data to be between -1 and 1 181 | eps = 1e-8 182 | max_train = np.amax(np.absolute(train_data), axis=(0, 1)) 183 | max_test = np.amax(np.absolute(test_data), axis=(0, 1)) 184 | max_val = np.maximum(max_train, max_test) 185 | train_data = np.divide(train_data, max_val[np.newaxis, np.newaxis, :] + eps) 186 | test_data = np.divide(test_data, max_val[np.newaxis, np.newaxis, :] + eps) 187 | 188 | # Check the data range 189 | max_ = test_data.max() 190 | min_ = test_data.min() 191 | 192 | print("Maximum value in the normalized test dataset : " + str(max_)) 193 | print("Minimum value in the normalized test dataset : " + str(min_)) 194 | 195 | print('\nTrain data shape: ', train_data.shape) 196 | 197 | data_sets.train = DataSet(train_data, fl.FLAGS.batch_size) 198 | data_sets.test = DataSet(test_data, fl.FLAGS.batch_size) 199 | 200 | # Assign variance 201 | data_sets.train.sigma = np.std(train_data, axis=(0, 1)) 202 | 203 | # Check if we have enough data 204 | if data_sets.train._num_sequences < data_sets.train._batch_size:
205 | print('ERROR: We do not have enough data! ' 206 | 'Reduce batch_size or increase the number of subfolders you use.') 207 | exit(1) 208 | 209 | return data_sets, max_val, mean_pose 210 | 211 | 212 | def read_dataset_and_write_in_binary(evaluate): 213 | """ 214 | Reads 3 datasets: "Train","Dev" and "Eval" from the CMU MoCap dataset in BVH format 215 | and writes them in the binary format. 216 | Will get the address of the folder with the data from flags.py 217 | Args: 218 | evaluate - flag: whether we evaluate the system or we optimize parameters 219 | Returns: 220 | will write binary files in the same folder as the original data 221 | """ 222 | 223 | # Get the data 224 | data, max_val, mean_pose = read_unlabeled_data(fl.FLAGS.data_dir, False) # read_all_the_data() 225 | 226 | # Write all important information into binary files 227 | 228 | # Datasets themselves 229 | train_file = open(fl.FLAGS.data_dir + '/train.binary', 'wb') 230 | data.train._sequences.tofile(train_file) 231 | train_file.close() 232 | 233 | eval_file = open(fl.FLAGS.data_dir + '/eval.binary', 'wb') 234 | data.test._sequences.tofile(eval_file) 235 | eval_file.close() 236 | 237 | # Dataset properties 238 | 239 | sigma_file = open(fl.FLAGS.data_dir + '/variance.binary', 'wb') 240 | data.train.sigma.tofile(sigma_file) 241 | sigma_file.close() 242 | 243 | max_val_file = open(fl.FLAGS.data_dir + '/maximums.binary', 'wb') 244 | max_val.tofile(max_val_file) 245 | max_val_file.close() 246 | 247 | mean_file = open(fl.FLAGS.data_dir + '/mean.binary', 'wb') 248 | mean_pose.tofile(mean_file) 249 | mean_file.close() 250 | 251 | print('All the binary files for the dataset were saved in the folder ', fl.FLAGS.data_dir) 252 | 253 | 254 | def read_binary_dataset(dataset_name): 255 | filename = fl.FLAGS.data_dir + '/' + dataset_name + '.binary' 256 | dataset = np.fromfile(filename) 257 | amount_of_frames = int(dataset.shape[0] /(fl.FLAGS.chunk_length * fl.FLAGS.frame_size)) 258 | # Clip array so that it divides exactly into the inputs we want (frame_size * chunk_length) 259 | dataset = dataset[0:amount_of_frames * fl.FLAGS.chunk_length * fl.FLAGS.frame_size] 260 | # Reshape 261 | dataset = dataset.reshape(amount_of_frames, fl.FLAGS.chunk_length, fl.FLAGS.frame_size) 262 | return dataset 263 |
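# Note (annotation, not part of the original file): the .binary files above are raw
# float64 dumps written with numpy's tofile(), so they carry no shape or dtype metadata.
# read_binary_dataset() therefore relies on FLAGS.chunk_length and FLAGS.frame_size to
# restore the [num_chunks, chunk_length, frame_size] layout; if those flags differ
# between writing and reading, the data will be reshaped incorrectly.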
264 | 265 | def read_3_datasets_from_binary(): 266 | """ 267 | Reads train and test datasets and their properties from binary file format 268 | 269 | Will take them from the corresponding file in the folder, which is defined by FLAGS.data_dir 270 | 271 | Returns: 272 | datasets - object of class DataSets, containing Train and Eval datasets 273 | max_val - maximal value in the raw data ( for post-processing) 274 | mean_pose - mean pose in the raw data ( for post-processing) 275 | 276 | """ 277 | data_sets = DataSets() 278 | 279 | # ######### Get TRAIN data ########### 280 | 281 | train_data = read_binary_dataset('train') 282 | [amount_of_train_strings, seq_length, DoF] = train_data.shape 283 | print('\n' + str(amount_of_train_strings) + ' sequences with length ' + str(fl.FLAGS.chunk_length) 284 | + ' frames in each will be used for training') 285 | 286 | # Merge all the time-frames together 287 | train_data = np.reshape(train_data, [amount_of_train_strings, seq_length * DoF]) 288 | 289 | # ######### Get TEST data ########### 290 | 291 | test_data = read_binary_dataset('eval') 292 | [amount_of_test_strings, seq_length, DoF] = test_data.shape 293 | print(str(amount_of_test_strings) + ' sequences will be used for testing') 294 | 295 | # Merge all the time-frames together 296 | test_data = np.reshape(test_data, [amount_of_test_strings, seq_length * DoF]) 297 | 298 | # Shuffle the data 299 | perm = np.arange(amount_of_train_strings) 300 | np.random.shuffle(perm) 301 | train_data = train_data[perm] 302 | 303 | data_sets.train = DataSet(train_data, fl.FLAGS.batch_size) 304 | data_sets.test = DataSet(test_data, fl.FLAGS.batch_size) 305 | 306 | # Assign variance 307 | data_sets.train.sigma = np.std(train_data, axis=(0, 1)) 308 | 309 | # Read maximal value and mean pose before normalization 310 | max_val = np.fromfile(fl.FLAGS.data_dir + '/maximums.binary') 311 | mean_pose = np.fromfile(fl.FLAGS.data_dir + '/mean.binary') 312 | 313 | # Check if we have enough data 314 | if data_sets.train._num_sequences < data_sets.train._batch_size: 315 | print('ERROR: We do not have enough data! ' 316 | 'Reduce batch_size or increase the number of subfolders you use.') 317 | exit(1) 318 | 319 | return data_sets, max_val, mean_pose 320 | 321 | 322 | def write_test_seq_in_binary(input_file_name, output_file_name): 323 | """ Read test sequence in BVH format and 324 | write it into the binary file 325 | 326 | Args: 327 | input_file_name: the name of the input file 328 | output_file_name: the name of the output file 329 | Returns: 330 | nothing 331 | """ 332 | test_file = open(output_file_name, 'wb') 333 | test_seq,_ = read_bvh_file(input_file_name) 334 | test_seq.tofile(test_file) 335 | test_file.close() 336 | print("The test sequence was read from", input_file_name, " and written to", output_file_name) 337 | 338 | 339 | def read_test_seq_from_binary(binary_file_name): 340 | """ Read test sequence from the binary file 341 | 342 | Args: 343 | binary_file_name: the name of the input binary file 344 | Returns: 345 | read_seq: test sequence 346 | """ 347 | # Read the sequence 348 | read_seq = np.fromfile(binary_file_name) 349 | # Reshape 350 | read_seq = read_seq.reshape(-1, fl.FLAGS.frame_size) 351 | amount_of_frames = int(read_seq.shape[0] / (fl.FLAGS.chunk_length)) 352 | if amount_of_frames > 0: 353 | # Clip array so that it divides exactly into the inputs we want (frame_size * chunk_length) 354 | read_seq = read_seq[0:amount_of_frames * fl.FLAGS.chunk_length] 355 | 356 | # Reshape 357 | read_seq = read_seq.reshape(-1, fl.FLAGS.frame_size * fl.FLAGS.chunk_length) #?
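# (Annotation, not part of the original file: after this reshape each row of read_seq packs
# chunk_length consecutive frames into one flat vector of length frame_size * chunk_length,
# matching the chunked input layout used for training; the clipping above drops trailing
# frames that do not fill a complete chunk.)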
358 | 359 | return read_seq 360 | 361 | 362 | def visualize(mocap_seq, test=False): 363 | all_3d_coords = mocap_seq.reshape(-1, 3, int(fl.FLAGS.frame_size/3)) # Concatanate all coords into one vector 364 | 365 | # For debug - Visualize the skeleton 366 | fig = plt.figure() 367 | ax = fig.add_subplot(111, projection='3d') 368 | 369 | start_frame = 40 370 | treshhold_0 = 14 371 | treshhold_1 = 20 372 | treshhold_2 = 27 373 | coef = 100 374 | for step in range(start_frame, start_frame + 30, 10): 375 | 376 | # Visualize a 3D point cloud 377 | ax.scatter3D(all_3d_coords[step][0][:treshhold_0], 378 | np.add(all_3d_coords[step][1][:treshhold_0], (step - start_frame) * coef), 379 | all_3d_coords[step][2][:treshhold_0], c='c', marker='o') 380 | ax.scatter3D(all_3d_coords[step][0][treshhold_0:treshhold_1], 381 | np.add(all_3d_coords[step][1][treshhold_0:treshhold_1], 382 | (step - start_frame) * coef), 383 | all_3d_coords[step][2][treshhold_0:treshhold_1], c='r', marker='o') 384 | ax.scatter3D(all_3d_coords[step][0][treshhold_1:treshhold_2], 385 | np.add(all_3d_coords[step][1][treshhold_1:treshhold_2], 386 | (step - start_frame) * coef), 387 | all_3d_coords[step][2][treshhold_1:treshhold_2], c='y', marker='o') 388 | ax.scatter3D(all_3d_coords[step][0][treshhold_2:], 389 | np.add(all_3d_coords[step][1][treshhold_2:], (step - start_frame) * coef), 390 | all_3d_coords[step][2][treshhold_2:], c='b', marker='o') 391 | 392 | # Find which points are present 393 | 394 | key_point_arm = [] 395 | for point in list([0, 1, 2, 7, 8, 9]): 396 | if all_3d_coords[step][0][point] != 0 and all_3d_coords[step][0][point + 1] != 0: 397 | if all_3d_coords[step][1][point] != 0 and all_3d_coords[step][1][point + 1] != 0: 398 | if all_3d_coords[step][2][point] != 0 and all_3d_coords[step][2][point + 1] != 0: 399 | key_point_arm.append(point) 400 | 401 | key_point_arm = np.array(key_point_arm) 402 | 403 | key_point_leg = [] 404 | for point in list([27, 34]): # 28, 35 405 | if all_3d_coords[step][0][point] != 0 and all_3d_coords[step][0][point + 1] != 0: 406 | if all_3d_coords[step][1][point] != 0 and all_3d_coords[step][1][point + 1] != 0: 407 | if all_3d_coords[step][2][point] != 0 and all_3d_coords[step][2][point + 1] != 0: 408 | key_point_leg.append(point) 409 | key_point_leg = np.array(key_point_leg) 410 | 411 | # Add lines in between 412 | 413 | for point in key_point_arm: 414 | xline = all_3d_coords[step][0][point:point + 2] 415 | yline = np.add(all_3d_coords[step][1][point:point + 2], (step - start_frame) * coef) 416 | zline = all_3d_coords[step][2][point:point + 2] 417 | ax.plot(xline, yline, zline, c='c') 418 | for point in key_point_leg: 419 | xline = all_3d_coords[step][0][point:point + 3:2] 420 | yline = np.add(all_3d_coords[step][1][point:point + 3:2], (step - start_frame) * coef) 421 | zline = all_3d_coords[step][2][point:point + 3:2] 422 | ax.plot(xline, yline, zline, c='b') 423 | 424 | plt.show() 425 | 426 | 427 | if __name__ == '__main__': 428 | 429 | # Do some testing 430 | 431 | Test = False 432 | 433 | if Test: 434 | input_file_name = '/home/taras/Documents/Datasets/SpeechToMotion/Japanese/TheLAtest/dataset/motion/gesture22.bvh' 435 | output_file_name = fl.FLAGS.data_dir + '/talking2.csv' 436 | 437 | test_file = open(output_file_name, 'wb') 438 | test_seq, _ = read_bvh_file(input_file_name) 439 | 440 | visualize(test_seq, test=False) 441 | 442 | # Save the data into a file 443 | with open(output_file_name, 'w') as fp: 444 | np.savetxt(fp, test_seq, delimiter=",") 445 | 446 | print("The test sequence was 
read from", input_file_name, " and written to", output_file_name) 447 | 448 | write_test_seq_in_binary('/home/taras/Documents/Datasets/SpeechToMotion/Japanese/TheLAtest/dataset/motion/gesture1093.bvh', 449 | fl.FLAGS.data_dir + '/test_1.binary') 450 | write_test_seq_in_binary('/home/taras/Documents/Datasets/SpeechToMotion/Japanese/TheLAtest/dataset/motion/gesture1097.bvh', 451 | fl.FLAGS.data_dir + '/test_2.binary') 452 | 453 | else: 454 | read_dataset_and_write_in_binary(True) 455 | -------------------------------------------------------------------------------- /motion_repr_learning/ae/utils/flags.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contrains all the flags for the motion representation learning repository 3 | """ 4 | from __future__ import division 5 | import os 6 | from os.path import join as pjoin 7 | 8 | import tensorflow as tf 9 | 10 | # Modify this function to set your home directory for this repo 11 | def home_out(path): 12 | return pjoin(os.environ['HOME'], 'tmp', 'MoCap', path) 13 | 14 | flags = tf.app.flags 15 | FLAGS = flags.FLAGS 16 | 17 | """ Fine-tuning Parameters """ 18 | 19 | # Flags about the sequence processing 20 | 21 | flags.DEFINE_integer('chunk_length', 1, 'Length of the chunks, for the data processing.') 22 | 23 | # Flags about training 24 | flags.DEFINE_float('learning_rate', 0.0001, 25 | 'learning rate for training .') 26 | flags.DEFINE_float('pretraining_learning_rate', 0.001 , 27 | 'learning rate for training .') 28 | 29 | flags.DEFINE_float('variance_of_noise', 0.05, 'Coefficient for the gaussian noise ' 30 | 'added to every point in input during the training') 31 | 32 | flags.DEFINE_boolean('pretrain', False,' Whether we pretrain the model in a layerwise way') 33 | flags.DEFINE_boolean('restore', False,' Whether we restore the model from the checkpoint') 34 | 35 | flags.DEFINE_boolean('evaluate', False, ' Whether we are evaluating the system') 36 | 37 | flags.DEFINE_float('dropout', 0.9, 'Probability to keep the neuron on') 38 | 39 | flags.DEFINE_integer('batch_size', 128, 40 | 'Size of the mini batch') 41 | 42 | flags.DEFINE_integer('training_epochs', 20, 43 | "Number of training epochs for pretraining layers") 44 | flags.DEFINE_integer('pretraining_epochs', 5, 45 | "Number of training epochs for pretraining layers") 46 | 47 | flags.DEFINE_float('weight_decay', 0.5, ' Whether we apply weight decay') 48 | 49 | flags.DEFINE_boolean('early_stopping', True, ' Whether we do early stopping') 50 | flags.DEFINE_float('delta_for_early_stopping', 0.5, 'How much worst the results must get in order' 51 | ' for training to be terminated.' 
54 | # Network Architecture Specific Flags 55 | flags.DEFINE_integer('frame_size', 384, 'Dimensionality of the input for a single frame') 56 | 57 | flags.DEFINE_integer("num_hidden_layers", 1, "Number of hidden layers") 58 | flags.DEFINE_integer("middle_layer", 1, "Index of the middle (bottleneck) hidden layer") 59 | 60 | flags.DEFINE_integer('layer1_width', 312, 'Number of units in the first hidden layer') 61 | flags.DEFINE_integer('layer2_width', 248, 'Number of units in the second hidden layer') 62 | flags.DEFINE_integer('layer3_width', 312, 'Number of units in the third hidden layer') 63 | 64 | # Constants 65 | 66 | flags.DEFINE_integer('seed', 123456, 'Random seed') 67 | 68 | flags.DEFINE_string('summary_dir', home_out('summaries_exp'), 69 | 'Directory to put the summary data') 70 | 71 | flags.DEFINE_string('chkpt_dir', home_out('chkpts_exp'), 72 | 'Directory to put the model checkpoints') 73 | 74 | flags.DEFINE_string('results_file', home_out('results.txt'), 75 | 'File to put the experimental results') 76 | -------------------------------------------------------------------------------- /motion_repr_learning/ae/utils/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file contains helper functions for the training and testing of the AE 3 | """ 4 | 5 | 6 | import numpy as np 7 | import tensorflow as tf 8 | 9 | 10 | import utils.flags as fl 11 | 12 | """ Dataset class""" 13 | 14 | class DataSet(object): 15 | ''' 16 | A class for storing a dataset and all important information, 17 | which might be needed during training, 18 | such as batch size, number of epochs completed and so on. 19 | ''' 20 | 21 | 22 | def __init__(self, sequences, batch_size): 23 | self._batch_size = batch_size 24 | self._sequences = sequences # all the sequences in the dataset 25 | self._num_sequences = sequences.shape[0] 26 | self._epochs_completed = 0 27 | self._index_in_epoch = 0 28 | 29 | # Make interface to the protected variables 30 | @property 31 | def sequences(self): 32 | return self._sequences 33 | 34 | @property 35 | def num_sequences(self): 36 | return self._num_sequences 37 | 38 | class DataSets(object): 39 | ''' 40 | A class for storing Train and Eval datasets and all related information, 41 | ''' 42 | pass 43 | 44 | def read_test_seq_from_binary(binary_file_name): 45 | """ Read test sequence from the binary file 46 | Args: 47 | binary_file_name: the name of the input binary file 48 | Returns: 49 | read_seq: test sequence 50 | """ 51 | # Read the sequence 52 | read_seq = np.fromfile(binary_file_name) 53 | # Reshape 54 | read_seq = read_seq.reshape(-1, fl.FLAGS.frame_size) 55 | amount_of_frames = int(read_seq.shape[0] / (fl.FLAGS.chunk_length)) 56 | if amount_of_frames > 0: 57 | # Clip array so that it divides exactly into the inputs we want (frame_size * chunk_length) 58 | read_seq = read_seq[0:amount_of_frames * fl.FLAGS.chunk_length] 59 | 60 | # Reshape 61 | read_seq = read_seq.reshape(-1, fl.FLAGS.frame_size * fl.FLAGS.chunk_length) #?
62 | 63 | return read_seq 64 | 65 | def add_noise(x, variance_multiplier, sigma): 66 | """ 67 | Add Gaussian noise to the data 68 | Args: 69 | x - input vector 70 | variance_multiplier - coefficient to multiply the variance of the noise by 71 | sigma - variance of the dataset 72 | Returns: 73 | x - output vector, noisy data 74 | """ 75 | eps = 1e-15 76 | noise = tf.random_normal(x.shape, 0.0, stddev=np.multiply(sigma, variance_multiplier) + eps) 77 | x = x + noise 78 | return x 79 | 80 | def loss_reconstruction(output, target, max_vals, pretrain=False): 81 | """ Reconstruction error: the square of the RMSE (i.e. the mean squared error) 82 | 83 | Args: 84 | output: tensor of net output 85 | target: tensor we are trying to reconstruct 86 | max_vals: array of absolute maximal values in the dataset, 87 | is used for scaling an error to the original space 88 | pretrain: whether we are using it during the pretraining phase 89 | Returns: 90 | Scalar tensor of mean squared Euclidean distance 91 | """ 92 | with tf.name_scope("reconstruction_loss"): 93 | net_output_tf = tf.convert_to_tensor(tf.cast(output, tf.float32), name='input') 94 | target_tf = tf.convert_to_tensor(tf.cast(target, tf.float32), name='target') 95 | 96 | # Euclidean distance between net_output_tf, target_tf 97 | error = tf.subtract(net_output_tf, target_tf) 98 | 99 | if not pretrain: 100 | # Convert it back from the [-1,1] to original values 101 | error_scaled = tf.multiply(error, max_vals[np.newaxis, :] + 1e-15) 102 | else: 103 | error_scaled = error 104 | 105 | squared_error = tf.reduce_mean(tf.square(error_scaled, name="square"), name="averaging") 106 | return squared_error 107 | 108 | def convert_back_to_3d_coords(sequence, max_val, mean_pose): 109 | ''' 110 | Convert back from the normalized values between -1 and 1 to original 3d coordinates 111 | and unroll them into the sequence 112 | 113 | Args: 114 | sequence: sequence of the normalized values 115 | max_val: maximal value in the dataset 116 | mean_pose: mean value in the dataset 117 | 118 | Return: 119 | 3d coordinates corresponding to the batch 120 | ''' 121 | 122 | # Convert it back from the [-1,1] to original values 123 | reconstructed = np.multiply(sequence, max_val[np.newaxis, :] + 1e-15) 124 | 125 | # Add the mean pose back 126 | reconstructed = reconstructed + mean_pose[np.newaxis, :] 127 | 128 | # Unroll batches into the sequence 129 | reconstructed = reconstructed.reshape(-1, reconstructed.shape[-1]) 130 | 131 | return reconstructed 132 | 133 | def reshape_dataset(dataset): 134 | """ 135 | Changing the shape of the dataset array to correspond to the frame dimensionality 136 | 137 | Args: 138 | dataset: an array of the dataset 139 | Return: 140 | dataset_final: array of the dataset in a proper shape 141 | """ 142 | 143 | amount_of_train_chunks = int(dataset.shape[0] / fl.FLAGS.chunk_length) 144 | dataset_shorten = dataset[:amount_of_train_chunks * fl.FLAGS.chunk_length, :fl.FLAGS.frame_size] 145 | dataset_chunks = np.reshape(dataset_shorten, (-1, fl.FLAGS.chunk_length * fl.FLAGS.frame_size)) 146 | 147 | # Merge all the time-frames together 148 | dataset_final = np.reshape(dataset_chunks, [amount_of_train_chunks, 149 | fl.FLAGS.chunk_length * fl.FLAGS.frame_size]) 150 | 151 | return dataset_final 152 |
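# Worked example (annotation, not part of the original file): with frame_size = 384 and
# chunk_length = 3, reshape_dataset() turns an array of shape [T, 384] into [T // 3, 1152]:
# frames are grouped into non-overlapping chunks of 3 and each chunk is flattened into a
# single input vector; trailing frames that do not fill a complete chunk are dropped.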
153 | def prepare_motion_data(data_dir): 154 | """ 155 | Read and preprocess the motion dataset 156 | 157 | Args: 158 | data_dir: a directory with the dataset 159 | Return: 160 | Y_train: an array of the training dataset 161 | Y_train_normalized: training dataset normalized to the values [-1,1] 162 | Y_test: an array of the test dataset 163 | Y_test_normalized: test dataset normalized to the values [-1,1] 164 | Y_dev_normalized: dev dataset normalized to the values [-1,1] 165 | max_val: maximal values in the dataset 166 | mean_pose: mean pose of the dataset 167 | """ 168 | 169 | # Get the data 170 | 171 | Y_train = np.load(data_dir + '/Y_train.npy') 172 | Y_dev = np.load(data_dir + '/Y_dev.npy') 173 | Y_test = np.load(data_dir + '/Y_test.npy') 174 | 175 | # Normalize dataset 176 | max_val = np.amax(np.absolute(Y_train), axis=(0)) 177 | mean_pose = Y_train.mean(axis=(0)) 178 | 179 | Y_train_centered = Y_train - mean_pose[np.newaxis, :] 180 | Y_dev_centered = Y_dev - mean_pose[np.newaxis, :] 181 | Y_test_centered = Y_test - mean_pose[np.newaxis, :] 182 | 183 | # Scales all values in the input_data to be between -1 and 1 184 | eps = 1e-8 185 | Y_train_normalized = np.divide(Y_train_centered, max_val[np.newaxis, :] + eps) 186 | Y_dev_normalized = np.divide(Y_dev_centered, max_val[np.newaxis, :] + eps) 187 | Y_test_normalized = np.divide(Y_test_centered, max_val[np.newaxis, :] + eps) 188 | 189 | # Reshape to accommodate multiple frames at each input 190 | 191 | if fl.FLAGS.chunk_length > 1: 192 | Y_train_normalized = reshape_dataset(Y_train_normalized) 193 | Y_dev_normalized = reshape_dataset(Y_dev_normalized) 194 | Y_test_normalized = reshape_dataset(Y_test_normalized) 195 | 196 | # Pad max values and the mean pose, if needed 197 | if fl.FLAGS.chunk_length > 1: 198 | max_val = np.tile(max_val, fl.FLAGS.chunk_length) 199 | mean_pose = np.tile(mean_pose, fl.FLAGS.chunk_length) 200 | 201 | 202 | # Some tests for flags 203 | if fl.FLAGS.restore and fl.FLAGS.pretrain: 204 | print('ERROR! You cannot restore and pretrain at the same time!' 205 | ' Please, choose one of these options') 206 | exit(1) 207 | 208 | if fl.FLAGS.middle_layer > fl.FLAGS.num_hidden_layers: 209 | print('ERROR! Middle layer cannot be larger than the number of hidden layers!' 210 | ' Please, update flags') 211 | exit(1) 212 | 213 | return Y_train_normalized, Y_train, Y_test_normalized, Y_test,\ 214 | Y_dev_normalized, max_val, mean_pose 215 | -------------------------------------------------------------------------------- /predict.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script generates gesture output based on the speech input. 3 | The gestures will be written to a text file: 4 | 3d coordinates together with the velocities. 5 | """ 6 | 7 | import sys 8 | from keras.models import load_model 9 | import numpy as np 10 | 11 | 12 | def predict(model_name, input_file, output_file): 13 | """ Predict human gesture based on the speech 14 | 15 | Args: 16 | model_name: name of the Keras model to be used 17 | input_file: file name of the audio input 18 | output_file: file name for the gesture output 19 | 20 | Returns: 21 | 22 | """ 23 | model = load_model(model_name) 24 | X = np.load(input_file) 25 | 26 | predicted = np.array(model.predict(X)) 27 | print(predicted.shape) 28 | np.savetxt(output_file, predicted) 29 | 30 | 31 | if __name__ == "__main__": 32 | 33 | # Check if the script got enough parameters 34 | if len(sys.argv) < 4: 35 | raise ValueError('Not enough parameters! 
\nUsage : python ' + sys.argv[0].split("/")[-1] + 36 | ' MODEL_NAME INPUT_FILE OUTPUT_FILE') 37 | 38 | predict(sys.argv[1], sys.argv[2], sys.argv[3]) 39 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | h5py==2.10.0 2 | Keras==2.3.1 3 | Keras-Applications==1.0.8 4 | Keras-Preprocessing==1.1.0 5 | librosa==0.7.1 6 | matplotlib==3.1.1 7 | numpy==1.17.2 8 | pandas==0.25.2 9 | praat-parselmouth==0.3.3 10 | pydub==0.23.1 11 | pyquaternion==0.9.5 12 | pysptk==0.1.17 13 | python-speech-features==0.6 14 | scikit-learn==0.21.3 15 | scipy==1.3.1 16 | seaborn==0.7.1 -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is the main script for the training. 3 | It contains the speech-motion neural network implemented in Keras. 4 | This script should be used to train the model, as described in README.md 5 | """ 6 | 7 | import sys 8 | import numpy as np 9 | from sklearn.model_selection import train_test_split 10 | 11 | from keras.models import Sequential 12 | from keras.layers import Dense, Activation, Dropout 13 | from keras.layers.recurrent import SimpleRNN, LSTM, GRU 14 | from keras.optimizers import SGD, Adam 15 | from keras.layers.wrappers import TimeDistributed, Bidirectional 16 | from keras.layers.normalization import BatchNormalization 17 | 18 | import matplotlib 19 | matplotlib.use('Agg') 20 | from matplotlib import pyplot 21 | 22 | # Check if the script got enough parameters 23 | if len(sys.argv) < 6: 24 | raise ValueError( 25 | 'Not enough parameters! \nUsage : python train.py MODEL_NAME EPOCHS DATA_DIR N_INPUT ENCODE (DIM)') 26 | ENCODED = sys.argv[5].lower() == 'true' 27 | 28 | if ENCODED: 29 | if len(sys.argv) < 7: 30 | raise ValueError( 31 | 'Not enough parameters! 
\nUsage : python train.py MODEL_NAME EPOCHS DATA_DIR N_INPUT ENCODE DIM') 32 | else: 33 | N_OUTPUT = int(sys.argv[6]) # Representation dimensionality 34 | else: 35 | N_OUTPUT = 192 * 2 # Number of Gesture Feature (position + velocity) 36 | 37 | 38 | EPOCHS = int(sys.argv[2]) 39 | DATA_DIR = sys.argv[3] 40 | N_INPUT = int(sys.argv[4]) # Number of input features 41 | 42 | BATCH_SIZE = 2056 43 | N_HIDDEN = 256 44 | 45 | N_CONTEXT = 60 + 1 # The number of frames in the context 46 | 47 | 48 | def train(model_file): 49 | """ 50 | Train a neural network to take speech as input and produce gesture as an output 51 | 52 | Args: 53 | model_file: file to store the model 54 | 55 | Returns: 56 | 57 | """ 58 | 59 | # Get the data 60 | X = np.load(DATA_DIR + '/X_train.npy') 61 | 62 | if ENCODED: 63 | 64 | # If we learn speech-representation mapping we use encoded motion as output 65 | Y = np.load(DATA_DIR + '/' + str(N_OUTPUT)+ '/Y_train_encoded.npy') 66 | 67 | # Correct the sizes 68 | train_size = min(X.shape[0], Y.shape[0]) 69 | X = X[:train_size] 70 | Y = Y[:train_size] 71 | 72 | else: 73 | Y = np.load(DATA_DIR + '/Y_train.npy') 74 | 75 | N_train = int(len(X)*0.9) 76 | N_validation = len(X) - N_train 77 | 78 | # Split on training and validation 79 | X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, test_size=N_validation) 80 | 81 | # Define Keras model 82 | 83 | model = Sequential() 84 | model.add(TimeDistributed(Dense(N_HIDDEN), input_shape=(N_CONTEXT, N_INPUT))) 85 | model.add(BatchNormalization()) 86 | model.add(Activation('relu')) 87 | model.add(Dropout(0.1)) 88 | 89 | model.add(TimeDistributed(Dense(N_HIDDEN))) 90 | model.add(BatchNormalization()) 91 | model.add(Activation('relu')) 92 | model.add(Dropout(0.1)) 93 | 94 | model.add(TimeDistributed(Dense(N_HIDDEN))) 95 | model.add(BatchNormalization()) 96 | model.add(Activation('relu')) 97 | model.add(Dropout(0.1)) 98 | 99 | model.add(GRU(N_HIDDEN, return_sequences=False)) 100 | model.add(BatchNormalization()) 101 | model.add(Activation('relu')) 102 | model.add(Dropout(0.1)) 103 | 104 | model.add(Dense(N_OUTPUT)) 105 | model.add(Activation('linear')) 106 | 107 | print(model.summary()) 108 | 109 | optimizer = Adam(lr=0.001, beta_1=0.9, beta_2=0.999) 110 | model.compile(loss='mean_squared_error', optimizer=optimizer) 111 | 112 | hist = model.fit(X_train, Y_train, batch_size=BATCH_SIZE, epochs=EPOCHS, validation_data=(X_validation, Y_validation)) 113 | 114 | model.save(model_file) 115 | 116 | # Save convergence results into an image 117 | pyplot.plot(hist.history['loss'], linewidth=3, label='train') 118 | pyplot.plot(hist.history['val_loss'], linewidth=3, label='valid') 119 | pyplot.grid() 120 | pyplot.legend() 121 | pyplot.xlabel('epoch') 122 | pyplot.ylabel('loss') 123 | pyplot.savefig(model_file.replace('hdf5', 'png')) 124 | 125 | 126 | if __name__ == "__main__": 127 | 128 | train(sys.argv[1]) 129 | -------------------------------------------------------------------------------- /visuals/SpeechReprMotion.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/genea-workshop/Speech_driven_gesture_generation_with_autoencoder/59e8ebdb0f6e87b0e81268046c99a4d6c9bf62a8/visuals/SpeechReprMotion.png --------------------------------------------------------------------------------