├── .gitignore ├── LICENSE ├── README.md ├── datasets ├── __init__.py ├── audio.py ├── preprocessor.py └── wavenet_preprocessor.py ├── docker └── Dockerfile ├── griffin_lim_synthesis_tool.ipynb ├── hparams.py ├── infolog.py ├── paper_hparams.py ├── papers ├── (content+location) attention.pdf ├── ClariNet.pdf ├── Tacotron 2 revised.pdf ├── bahdanau (content) attention.pdf ├── deepvoice 3.pdf ├── effective approaches attention.pdf ├── fast_wavenet.pdf ├── tacotron.pdf ├── tacotron2.pdf └── wavenet.pdf ├── preprocess.py ├── requirements.txt ├── sentences.txt ├── synthesize.py ├── tacotron ├── __init__.py ├── feeder.py ├── models │ ├── Architecture_wrappers.py │ ├── __init__.py │ ├── attention.py │ ├── custom_decoder.py │ ├── helpers.py │ ├── modules.py │ └── tacotron.py ├── synthesize.py ├── synthesizer.py ├── train.py └── utils │ ├── __init__.py │ ├── cleaners.py │ ├── cmudict.py │ ├── numbers.py │ ├── plot.py │ ├── symbols.py │ └── text.py ├── test_wavenet_feeder.py ├── train.py ├── wavenet_preprocess.py └── wavenet_vocoder ├── __init__.py ├── feeder.py ├── models ├── __init__.py ├── gaussian.py ├── mixture.py ├── modules.py └── wavenet.py ├── synthesize.py ├── synthesizer.py ├── train.py └── util.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # Tacotron 2 oddities 107 | logs-*/ 108 | training_data/ 109 | 110 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Rayhane Mama 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Tacotron-2: 2 | Tensorflow implementation of DeepMind's Tacotron-2. A deep neural network architecture described in this paper: [Natural TTS synthesis by conditioning Wavenet on MEL spectogram predictions](https://arxiv.org/pdf/1712.05884.pdf) 3 | 4 | This Repository contains additional improvements and attempts over the paper, we thus propose **paper_hparams.py** file which holds the exact hyperparameters to reproduce the paper results without any additional extras. 5 | 6 | Suggested **hparams.py** file which is default in use, contains the hyperparameters with extras that proved to provide better results in most cases. Feel free to toy with the parameters as needed. 7 | 8 | DIFFERENCES WILL BE HIGHLIGHTED IN DOCUMENTATION SHORTLY. 
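Individual hyperparameters can also be overridden at run time through the **--hparams** flag of the preprocessing/training/synthesis scripts, passed as a comma-separated list of name=value pairs (the names exist in **hparams.py**; the values below are purely illustrative):

> python preprocess.py --hparams='sample_rate=22050,n_fft=2048'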
9 | 10 | 11 | # Repository Structure: 12 | Tacotron-2 13 | ├── datasets 14 | ├── en_UK (0) 15 | │   └── by_book 16 | │   └── female 17 | ├── en_US (0) 18 | │   └── by_book 19 | │   ├── female 20 | │   └── male 21 | ├── LJSpeech-1.1 (0) 22 | │   └── wavs 23 | ├── logs-Tacotron (2) 24 | │   ├── eval_-dir 25 | │   │  ├── plots 26 | │  │  └── wavs 27 | │   ├── mel-spectrograms 28 | │   ├── plots 29 | │   ├── taco_pretrained 30 | │   ├── metas 31 | │   └── wavs 32 | ├── logs-Wavenet (4) 33 | │   ├── eval-dir 34 | │   │  ├── plots 35 | │  │  └── wavs 36 | │   ├── plots 37 | │   ├── wave_pretrained 38 | │   ├── metas 39 | │   └── wavs 40 | ├── logs-Tacotron-2 ( * ) 41 | │   ├── eval-dir 42 | │   │  ├── plots 43 | │  │  └── wavs 44 | │   ├── plots 45 | │   ├── taco_pretrained 46 | │   ├── wave_pretrained 47 | │   ├── metas 48 | │   └── wavs 49 | ├── papers 50 | ├── tacotron 51 | │   ├── models 52 | │   └── utils 53 | ├── tacotron_output (3) 54 | │   ├── eval 55 | │   ├── gta 56 | │   ├── logs-eval 57 | │   │   ├── plots 58 | │   │   └── wavs 59 | │   └── natural 60 | ├── wavenet_output (5) 61 | │   ├── plots 62 | │   └── wavs 63 | ├── training_data (1) 64 | │   ├── audio 65 | │   ├── linear 66 | │ └── mels 67 | └── wavenet_vocoder 68 | └── models 69 | 70 | 71 | The previous tree shows the current state of the repository (separate training, one step at a time). 72 | 73 | - Step **(0)**: Get your dataset, here I have set the examples of **Ljspeech**, **en_US** and **en_UK** (from **M-AILABS**). 74 | - Step **(1)**: Preprocess your data. This will give you the **training_data** folder. 75 | - Step **(2)**: Train your Tacotron model. Yields the **logs-Tacotron** folder. 76 | - Step **(3)**: Synthesize/Evaluate the Tacotron model. Gives the **tacotron_output** folder. 77 | - Step **(4)**: Train your Wavenet model. Yield the **logs-Wavenet** folder. 78 | - Step **(5)**: Synthesize audio using the Wavenet model. Gives the **wavenet_output** folder. 79 | 80 | - Note: Steps 2, 3, and 4 can be made with a simple run for both Tacotron and WaveNet (Tacotron-2, step ( * )). 81 | 82 | 83 | Note: 84 | - **Our preprocessing only supports Ljspeech and Ljspeech-like datasets (M-AILABS speech data)!** If running on datasets stored differently, you will probably need to make your own preprocessing script. 85 | - In the previous tree, files **were not represented** and **max depth was set to 3** for simplicity. 86 | - If you run training of both **models at the same time**, repository structure will be different. 87 | 88 | # Pretrained model and Samples: 89 | Pre-trained models and audio samples will be added at a later date. You can however check some primary insights of the model performance (at early stages of training) [here](https://github.com/Rayhane-mamah/Tacotron-2/issues/4#issuecomment-378741465). THIS IS VERY OUTDATED, I WILL UPDATE THIS SOON 90 | 91 | # Model Architecture: 92 |
93 | [Tacotron-2 architecture diagram] 94 |
95 | 96 | The model described by the authors can be divided in two parts: 97 | - Spectrogram prediction network 98 | - Wavenet vocoder 99 | 100 | To have an in-depth exploration of the model architecture, training procedure and preprocessing logic, refer to [our wiki](https://github.com/Rayhane-mamah/Tacotron-2/wiki) 101 | 102 | # Current state: 103 | 104 | To have an overview of our advance on this project, please refer to [this discussion](https://github.com/Rayhane-mamah/Tacotron-2/issues/4) 105 | 106 | since the two parts of the global model are trained separately, we can start by training the feature prediction model to use his predictions later during the wavenet training. 107 | 108 | # How to start 109 | - **Machine Setup:** 110 | 111 | First, you need to have python 3 installed along with [Tensorflow](https://www.tensorflow.org/install/). 112 | 113 | Next, you need to install some Linux dependencies to ensure audio libraries work properly: 114 | 115 | > apt-get install -y libasound-dev portaudio19-dev libportaudio2 libportaudiocpp0 ffmpeg libav-tools 116 | 117 | Finally, you can install the requirements. If you are an Anaconda user: (else replace **pip** with **pip3** and **python** with **python3**) 118 | 119 | > pip install -r requirements.txt 120 | 121 | - **Docker:** 122 | 123 | Alternatively, one can build the **docker image** to ensure everything is setup automatically and use the project inside the docker containers. 124 | **Dockerfile is insider "docker" folder** 125 | 126 | docker image can be built with: 127 | 128 | > docker build -t tacotron-2_image docker/ 129 | 130 | Then containers are runnable with: 131 | 132 | > docker run -i --name new_container tacotron-2_image 133 | 134 | Please report any issues with the Docker usage with our models, I'll get to it. Thanks! 135 | 136 | # Dataset: 137 | We tested the code above on the [ljspeech dataset](https://keithito.com/LJ-Speech-Dataset/), which has almost 24 hours of labeled single actress voice recording. (further info on the dataset are available in the README file when you download it) 138 | 139 | We are also running current tests on the [new M-AILABS speech dataset](http://www.m-ailabs.bayern/en/the-mailabs-speech-dataset/) which contains more than 700h of speech (more than 80 Gb of data) for more than 10 languages. 140 | 141 | After **downloading** the dataset, **extract** the compressed file, and **place the folder inside the cloned repository.** 142 | 143 | # Hparams setting: 144 | Before proceeding, you must pick the hyperparameters that suit best your needs. While it is possible to change the hyper parameters from command line during preprocessing/training, I still recommend making the changes once and for all on the **hparams.py** file directly. 145 | 146 | To pick optimal fft parameters, I have made a **griffin_lim_synthesis_tool** notebook that you can use to invert real extracted mel/linear spectrograms and choose how good your preprocessing is. All other options are well explained in the **hparams.py** and have meaningful names so that you can try multiple things with them. 147 | 148 | AWAIT DOCUMENTATION ON HPARAMS SHORTLY!! 149 | 150 | # Preprocessing 151 | Before running the following steps, please make sure you are inside **Tacotron-2 folder** 152 | 153 | > cd Tacotron-2 154 | 155 | Preprocessing can then be started using: 156 | 157 | > python preprocess.py 158 | 159 | dataset can be chosen using the **--dataset** argument. 
If using M-AILABS dataset, you need to provide the **language, voice, reader, merge_books and book arguments** for your custom need. Default is **Ljspeech**. 160 | 161 | Example M-AILABS: 162 | 163 | > python preprocess.py --dataset='M-AILABS' --language='en_US' --voice='female' --reader='mary_ann' --merge_books=False --book='northandsouth' 164 | 165 | or if you want to use all books for a single speaker: 166 | 167 | > python preprocess.py --dataset='M-AILABS' --language='en_US' --voice='female' --reader='mary_ann' --merge_books=True 168 | 169 | This should take no longer than a **few minutes.** 170 | 171 | # Training: 172 | To **train both models** sequentially (one after the other): 173 | 174 | > python train.py --model='Tacotron-2' 175 | 176 | 177 | Feature prediction model can **separately** be **trained** using: 178 | 179 | > python train.py --model='Tacotron' 180 | 181 | checkpoints will be made each **5000 steps** and stored under **logs-Tacotron folder.** 182 | 183 | Naturally, **training the wavenet separately** is done by: 184 | 185 | > python train.py --model='WaveNet' 186 | 187 | logs will be stored inside **logs-Wavenet**. 188 | 189 | **Note:** 190 | - If model argument is not provided, training will default to Tacotron-2 model training. (both models) 191 | - Please refer to train arguments under [train.py](https://github.com/Rayhane-mamah/Tacotron-2/blob/master/train.py) for a set of options you can use. 192 | - It is now possible to make wavenet preprocessing alone using **wavenet_proprocess.py**. 193 | 194 | # Synthesis 195 | To **synthesize audio** in an **End-to-End** (text to audio) manner (both models at work): 196 | 197 | > python synthesize.py --model='Tacotron-2' 198 | 199 | For the spectrogram prediction network (separately), there are **three types** of mel spectrograms synthesis: 200 | 201 | - **Evaluation** (synthesis on custom sentences). This is what we'll usually use after having a full end to end model. 202 | 203 | > python synthesize.py --model='Tacotron' 204 | 205 | - **Natural synthesis** (let the model make predictions alone by feeding last decoder output to the next time step). 206 | 207 | > python synthesize.py --model='Tacotron' --mode='synthesis' --GTA=False 208 | 209 | 210 | - **Ground Truth Aligned synthesis** (DEFAULT: the model is assisted by true labels in a teacher forcing manner). This synthesis method is used when predicting mel spectrograms used to train the wavenet vocoder. (yields better results as stated in the paper) 211 | 212 | > python synthesize.py --model='Tacotron' --mode='synthesis' --GTA=True 213 | 214 | Synthesizing the **waveforms** conditionned on previously synthesized Mel-spectrograms (separately) can be done with: 215 | 216 | > python synthesize.py --model='WaveNet' 217 | 218 | **Note:** 219 | - If model argument is not provided, synthesis will default to Tacotron-2 model synthesis. (End-to-End TTS) 220 | - Please refer to synthesis arguments under [synthesize.py](https://github.com/Rayhane-mamah/Tacotron-2/blob/master/synthesize.py) for a set of options you can use. 
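To quickly check preprocessing/Griffin-Lim quality outside the notebook, the following is a minimal sketch (it assumes preprocessing has already produced **training_data/mels/**; the mel file name is just an example) that inverts a saved mel spectrogram back to audio with the utilities from **datasets/audio.py**:

```python
import os

import numpy as np

from datasets.audio import inv_mel_spectrogram, save_wav
from hparams import hparams

# Any mel produced by preprocess.py works here; this file name is illustrative.
mel_path = 'training_data/mels/mel-LJ001-0001.npy'
mel = np.load(mel_path)  # stored on disk as (frames, num_mels)

# inv_mel_spectrogram expects (num_mels, frames), hence the transpose.
wav = inv_mel_spectrogram(mel.T, hparams)

os.makedirs('wav_out', exist_ok=True)
save_wav(wav, os.path.join('wav_out', 'griffin_lim_check.wav'), sr=hparams.sample_rate)
```

This is essentially what **griffin_lim_synthesis_tool.ipynb** does, minus the plotting.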
221 | 222 | 223 | # References and Resources: 224 | - [Natural TTS synthesis by conditioning Wavenet on MEL spectogram predictions](https://arxiv.org/pdf/1712.05884.pdf) 225 | - [Original tacotron paper](https://arxiv.org/pdf/1703.10135.pdf) 226 | - [Attention-Based Models for Speech Recognition](https://arxiv.org/pdf/1506.07503.pdf) 227 | - [Wavenet: A generative model for raw audio](https://arxiv.org/pdf/1609.03499.pdf) 228 | - [Fast Wavenet](https://arxiv.org/pdf/1611.09482.pdf) 229 | - [r9y9/wavenet_vocoder](https://github.com/r9y9/wavenet_vocoder) 230 | - [keithito/tacotron](https://github.com/keithito/tacotron) 231 | 232 | -------------------------------------------------------------------------------- /datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------- /datasets/audio.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import librosa.filters 3 | import numpy as np 4 | import tensorflow as tf 5 | from scipy import signal 6 | from scipy.io import wavfile 7 | 8 | 9 | def load_wav(path, sr): 10 | return librosa.core.load(path, sr=sr)[0] 11 | 12 | def save_wav(wav, path, sr): 13 | wav *= 32767 / max(0.01, np.max(np.abs(wav))) 14 | #proposed by @dsmiller 15 | wavfile.write(path, sr, wav.astype(np.int16)) 16 | 17 | def save_wavenet_wav(wav, path, sr, inv_preemphasize, k): 18 | # wav = inv_preemphasis(wav, k, inv_preemphasize) 19 | wav *= 32767 / max(0.01, np.max(np.abs(wav))) 20 | wavfile.write(path, sr, wav.astype(np.int16)) 21 | 22 | def preemphasis(wav, k, preemphasize=True): 23 | if preemphasize: 24 | return signal.lfilter([1, -k], [1], wav) 25 | return wav 26 | 27 | def inv_preemphasis(wav, k, inv_preemphasize=True): 28 | if inv_preemphasize: 29 | return signal.lfilter([1], [1, -k], wav) 30 | return wav 31 | 32 | #From https://github.com/r9y9/wavenet_vocoder/blob/master/audio.py 33 | def start_and_end_indices(quantized, silence_threshold=2): 34 | for start in range(quantized.size): 35 | if abs(quantized[start] - 127) > silence_threshold: 36 | break 37 | for end in range(quantized.size - 1, 1, -1): 38 | if abs(quantized[end] - 127) > silence_threshold: 39 | break 40 | 41 | assert abs(quantized[start] - 127) > silence_threshold 42 | assert abs(quantized[end] - 127) > silence_threshold 43 | 44 | return start, end 45 | 46 | def trim_silence(wav, hparams): 47 | '''Trim leading and trailing silence 48 | 49 | Useful for M-AILABS dataset if we choose to trim the extra 0.5 silence at beginning and end. 50 | ''' 51 | #Thanks @begeekmyfriend and @lautjy for pointing out the params contradiction. These params are separate and tunable per dataset. 
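	#librosa.effects.trim returns (trimmed_signal, nonsilent_interval); the [0] below keeps only the trimmed signal.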
52 | return librosa.effects.trim(wav, top_db= hparams.trim_top_db, frame_length=hparams.trim_fft_size, hop_length=hparams.trim_hop_size)[0] 53 | 54 | def get_hop_size(hparams): 55 | hop_size = hparams.hop_size 56 | if hop_size is None: 57 | assert hparams.frame_shift_ms is not None 58 | hop_size = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate) 59 | return hop_size 60 | 61 | def linearspectrogram(wav, hparams): 62 | # D = _stft(preemphasis(wav, hparams.preemphasis, hparams.preemphasize), hparams) 63 | D = _stft(wav, hparams) 64 | S = _amp_to_db(np.abs(D)**hparams.magnitude_power, hparams) - hparams.ref_level_db 65 | 66 | if hparams.signal_normalization: 67 | return _normalize(S, hparams) 68 | return S 69 | 70 | def melspectrogram(wav, hparams): 71 | # D = _stft(preemphasis(wav, hparams.preemphasis, hparams.preemphasize), hparams) 72 | D = _stft(wav, hparams) 73 | S = _amp_to_db(_linear_to_mel(np.abs(D)**hparams.magnitude_power, hparams), hparams) - hparams.ref_level_db 74 | 75 | if hparams.signal_normalization: 76 | return _normalize(S, hparams) 77 | return S 78 | 79 | def inv_linear_spectrogram(linear_spectrogram, hparams): 80 | '''Converts linear spectrogram to waveform using librosa''' 81 | if hparams.signal_normalization: 82 | D = _denormalize(linear_spectrogram, hparams) 83 | else: 84 | D = linear_spectrogram 85 | 86 | S = _db_to_amp(D + hparams.ref_level_db)**(1/hparams.magnitude_power) #Convert back to linear 87 | 88 | if hparams.use_lws: 89 | processor = _lws_processor(hparams) 90 | D = processor.run_lws(S.astype(np.float64).T ** hparams.power) 91 | y = processor.istft(D).astype(np.float32) 92 | return inv_preemphasis(y, hparams.preemphasis, hparams.preemphasize) 93 | else: 94 | return inv_preemphasis(_griffin_lim(S ** hparams.power, hparams), hparams.preemphasis, hparams.preemphasize) 95 | 96 | 97 | def inv_mel_spectrogram(mel_spectrogram, hparams): 98 | '''Converts mel spectrogram to waveform using librosa''' 99 | if hparams.signal_normalization: 100 | D = _denormalize(mel_spectrogram, hparams) 101 | else: 102 | D = mel_spectrogram 103 | 104 | S = _mel_to_linear(_db_to_amp(D + hparams.ref_level_db)**(1/hparams.magnitude_power), hparams) # Convert back to linear 105 | 106 | if hparams.use_lws: 107 | processor = _lws_processor(hparams) 108 | D = processor.run_lws(S.astype(np.float64).T ** hparams.power) 109 | y = processor.istft(D).astype(np.float32) 110 | return inv_preemphasis(y, hparams.preemphasis, hparams.preemphasize) 111 | else: 112 | return inv_preemphasis(_griffin_lim(S ** hparams.power, hparams), hparams.preemphasis, hparams.preemphasize) 113 | 114 | ########################################################################################### 115 | # tensorflow Griffin-Lim 116 | # Thanks to @begeekmyfriend: https://github.com/begeekmyfriend/Tacotron-2/blob/mandarin-new/datasets/audio.py 117 | 118 | def inv_linear_spectrogram_tensorflow(spectrogram, hparams): 119 | '''Builds computational graph to convert spectrogram to waveform using TensorFlow. 120 | Unlike inv_spectrogram, this does NOT invert the preemphasis. The caller should call 121 | inv_preemphasis on the output after running the graph. 
122 | ''' 123 | if hparams.signal_normalization: 124 | D = _denormalize_tensorflow(spectrogram, hparams) 125 | else: 126 | D = linear_spectrogram 127 | 128 | S = tf.pow(_db_to_amp_tensorflow(D + hparams.ref_level_db), (1/hparams.magnitude_power)) 129 | return _griffin_lim_tensorflow(tf.pow(S, hparams.power), hparams) 130 | 131 | def inv_mel_spectrogram_tensorflow(mel_spectrogram, hparams): 132 | '''Builds computational graph to convert mel spectrogram to waveform using TensorFlow. 133 | Unlike inv_mel_spectrogram, this does NOT invert the preemphasis. The caller should call 134 | inv_preemphasis on the output after running the graph. 135 | ''' 136 | if hparams.signal_normalization: 137 | D = _denormalize_tensorflow(mel_spectrogram, hparams) 138 | else: 139 | D = mel_spectrogram 140 | 141 | S = tf.pow(_db_to_amp_tensorflow(D + hparams.ref_level_db), (1/hparams.magnitude_power)) 142 | S = _mel_to_linear_tensorflow(S, hparams) # Convert back to linear 143 | return _griffin_lim_tensorflow(tf.pow(S, hparams.power), hparams) 144 | 145 | ########################################################################################### 146 | 147 | def _lws_processor(hparams): 148 | import lws 149 | return lws.lws(hparams.n_fft, get_hop_size(hparams), fftsize=hparams.win_size, mode="speech") 150 | 151 | def _griffin_lim(S, hparams): 152 | '''librosa implementation of Griffin-Lim 153 | Based on https://github.com/librosa/librosa/issues/434 154 | ''' 155 | angles = np.exp(2j * np.pi * np.random.rand(*S.shape)) 156 | S_complex = np.abs(S).astype(np.complex) 157 | y = _istft(S_complex * angles, hparams) 158 | for i in range(hparams.griffin_lim_iters): 159 | angles = np.exp(1j * np.angle(_stft(y, hparams))) 160 | y = _istft(S_complex * angles, hparams) 161 | return y 162 | 163 | def _griffin_lim_tensorflow(S, hparams): 164 | '''TensorFlow implementation of Griffin-Lim 165 | Based on https://github.com/Kyubyong/tensorflow-exercises/blob/master/Audio_Processing.ipynb 166 | ''' 167 | with tf.variable_scope('griffinlim'): 168 | # TensorFlow's stft and istft operate on a batch of spectrograms; create batch of size 1 169 | S = tf.expand_dims(S, 0) 170 | S_complex = tf.identity(tf.cast(S, dtype=tf.complex64)) 171 | y = tf.contrib.signal.inverse_stft(S_complex, hparams.win_size, get_hop_size(hparams), hparams.n_fft) 172 | for i in range(hparams.griffin_lim_iters): 173 | est = tf.contrib.signal.stft(y, hparams.win_size, get_hop_size(hparams), hparams.n_fft) 174 | angles = est / tf.cast(tf.maximum(1e-8, tf.abs(est)), tf.complex64) 175 | y = tf.contrib.signal.inverse_stft(S_complex * angles, hparams.win_size, get_hop_size(hparams), hparams.n_fft) 176 | return tf.squeeze(y, 0) 177 | 178 | def _stft(y, hparams): 179 | if hparams.use_lws: 180 | return _lws_processor(hparams).stft(y).T 181 | else: 182 | return librosa.stft(y=y, n_fft=hparams.n_fft, hop_length=get_hop_size(hparams), win_length=hparams.win_size, pad_mode='constant') 183 | 184 | def _istft(y, hparams): 185 | return librosa.istft(y, hop_length=get_hop_size(hparams), win_length=hparams.win_size) 186 | 187 | ########################################################## 188 | #Those are only correct when using lws!!! (This was messing with Wavenet quality for a long time!) 
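#Worked example with illustrative numbers (not actual hparams): for a 16000-sample signal,
#fsize=2048 and fshift=512 give pad = 2048 - 512 = 1536,
#num_frames(16000, 2048, 512) = (16000 + 2*1536 - 2048)//512 + 2 = 35 frames,
#and pad_lr(x, 2048, 512) returns (1536, 1920), so the padded signal has (35 - 1)*512 + 2048 = 19456 samples.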
189 | def num_frames(length, fsize, fshift): 190 | """Compute number of time frames of spectrogram 191 | """ 192 | pad = (fsize - fshift) 193 | if length % fshift == 0: 194 | M = (length + pad * 2 - fsize) // fshift + 1 195 | else: 196 | M = (length + pad * 2 - fsize) // fshift + 2 197 | return M 198 | 199 | 200 | def pad_lr(x, fsize, fshift): 201 | """Compute left and right padding 202 | """ 203 | M = num_frames(len(x), fsize, fshift) 204 | pad = (fsize - fshift) 205 | T = len(x) + 2 * pad 206 | r = (M - 1) * fshift + fsize - T 207 | return pad, pad + r 208 | ########################################################## 209 | #Librosa correct padding 210 | def librosa_pad_lr(x, fsize, fshift, pad_sides=1): 211 | '''compute right padding (final frame) or both sides padding (first and final frames) 212 | ''' 213 | assert pad_sides in (1, 2) 214 | # return int(fsize // 2) 215 | pad = (x.shape[0] // fshift + 1) * fshift - x.shape[0] 216 | if pad_sides == 1: 217 | return 0, pad 218 | else: 219 | return pad // 2, pad // 2 + pad % 2 220 | 221 | # Conversions 222 | _mel_basis = None 223 | _inv_mel_basis = None 224 | 225 | def _linear_to_mel(spectogram, hparams): 226 | global _mel_basis 227 | if _mel_basis is None: 228 | _mel_basis = _build_mel_basis(hparams) 229 | return np.dot(_mel_basis, spectogram) 230 | 231 | def _mel_to_linear(mel_spectrogram, hparams): 232 | global _inv_mel_basis 233 | if _inv_mel_basis is None: 234 | _inv_mel_basis = np.linalg.pinv(_build_mel_basis(hparams)) 235 | return np.maximum(1e-10, np.dot(_inv_mel_basis, mel_spectrogram)) 236 | 237 | def _mel_to_linear_tensorflow(mel_spectrogram, hparams): 238 | global _inv_mel_basis 239 | if _inv_mel_basis is None: 240 | _inv_mel_basis = np.linalg.pinv(_build_mel_basis(hparams)) 241 | return tf.transpose(tf.maximum(1e-10, tf.matmul(tf.cast(_inv_mel_basis, tf.float32), tf.transpose(mel_spectrogram, [1, 0]))), [1, 0]) 242 | 243 | def _build_mel_basis(hparams): 244 | assert hparams.fmax <= hparams.sample_rate // 2 245 | return librosa.filters.mel(hparams.sample_rate, hparams.n_fft, n_mels=hparams.num_mels, 246 | fmin=hparams.fmin, fmax=hparams.fmax) 247 | 248 | def _amp_to_db(x, hparams): 249 | min_level = np.exp(hparams.min_level_db / 20 * np.log(10)) 250 | return 20 * np.log10(np.maximum(min_level, x)) 251 | 252 | def _db_to_amp(x): 253 | return np.power(10.0, (x) * 0.05) 254 | 255 | def _db_to_amp_tensorflow(x): 256 | return tf.pow(tf.ones(tf.shape(x)) * 10.0, x * 0.05) 257 | 258 | def _normalize(S, hparams): 259 | if hparams.allow_clipping_in_normalization: 260 | if hparams.symmetric_mels: 261 | return np.clip((2 * hparams.max_abs_value) * ((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value, 262 | -hparams.max_abs_value, hparams.max_abs_value) 263 | else: 264 | return np.clip(hparams.max_abs_value * ((S - hparams.min_level_db) / (-hparams.min_level_db)), 0, hparams.max_abs_value) 265 | 266 | assert S.max() <= 0 and S.min() - hparams.min_level_db >= 0 267 | if hparams.symmetric_mels: 268 | return (2 * hparams.max_abs_value) * ((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value 269 | else: 270 | return hparams.max_abs_value * ((S - hparams.min_level_db) / (-hparams.min_level_db)) 271 | 272 | def _denormalize(D, hparams): 273 | if hparams.allow_clipping_in_normalization: 274 | if hparams.symmetric_mels: 275 | return (((np.clip(D, -hparams.max_abs_value, 276 | hparams.max_abs_value) + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value)) 277 | + 
hparams.min_level_db) 278 | else: 279 | return ((np.clip(D, 0, hparams.max_abs_value) * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db) 280 | 281 | if hparams.symmetric_mels: 282 | return (((D + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value)) + hparams.min_level_db) 283 | else: 284 | return ((D * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db) 285 | 286 | def _denormalize_tensorflow(D, hparams): 287 | if hparams.allow_clipping_in_normalization: 288 | if hparams.symmetric_mels: 289 | return (((tf.clip_by_value(D, -hparams.max_abs_value, 290 | hparams.max_abs_value) + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value)) 291 | + hparams.min_level_db) 292 | else: 293 | return ((tf.clip_by_value(D, 0, hparams.max_abs_value) * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db) 294 | 295 | if hparams.symmetric_mels: 296 | return (((D + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value)) + hparams.min_level_db) 297 | else: 298 | return ((D * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db) 299 | -------------------------------------------------------------------------------- /datasets/preprocessor.py: -------------------------------------------------------------------------------- 1 | import os 2 | from concurrent.futures import ProcessPoolExecutor 3 | from functools import partial 4 | 5 | import numpy as np 6 | from datasets import audio 7 | from wavenet_vocoder.util import is_mulaw, is_mulaw_quantize, mulaw, mulaw_quantize 8 | 9 | 10 | def build_from_path(hparams, input_dirs, mel_dir, linear_dir, wav_dir, n_jobs=12, tqdm=lambda x: x): 11 | """ 12 | Preprocesses the speech dataset from a gven input path to given output directories 13 | 14 | Args: 15 | - hparams: hyper parameters 16 | - input_dir: input directory that contains the files to prerocess 17 | - mel_dir: output directory of the preprocessed speech mel-spectrogram dataset 18 | - linear_dir: output directory of the preprocessed speech linear-spectrogram dataset 19 | - wav_dir: output directory of the preprocessed speech audio dataset 20 | - n_jobs: Optional, number of worker process to parallelize across 21 | - tqdm: Optional, provides a nice progress bar 22 | 23 | Returns: 24 | - A list of tuple describing the train examples. 
this should be written to train.txt 25 | """ 26 | 27 | # We use ProcessPoolExecutor to parallelize across processes, this is just for 28 | # optimization purposes and it can be omited 29 | executor = ProcessPoolExecutor(max_workers=n_jobs) 30 | futures = [] 31 | index = 1 32 | for input_dir in input_dirs: 33 | with open(os.path.join(input_dir, 'metadata.csv'), encoding='utf-8') as f: 34 | for line in f: 35 | parts = line.strip().split('|') 36 | basename = parts[0] 37 | wav_path = os.path.join(input_dir, 'wavs', '{}.wav'.format(basename)) 38 | text = parts[2] 39 | futures.append(executor.submit(partial(_process_utterance, mel_dir, linear_dir, wav_dir, basename, wav_path, text, hparams))) 40 | index += 1 41 | 42 | return [future.result() for future in tqdm(futures) if future.result() is not None] 43 | 44 | 45 | def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text, hparams): 46 | """ 47 | Preprocesses a single utterance wav/text pair 48 | 49 | this writes the mel scale spectogram to disk and return a tuple to write 50 | to the train.txt file 51 | 52 | Args: 53 | - mel_dir: the directory to write the mel spectograms into 54 | - linear_dir: the directory to write the linear spectrograms into 55 | - wav_dir: the directory to write the preprocessed wav into 56 | - index: the numeric index to use in the spectogram filename 57 | - wav_path: path to the audio file containing the speech input 58 | - text: text spoken in the input audio file 59 | - hparams: hyper parameters 60 | 61 | Returns: 62 | - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text) 63 | """ 64 | try: 65 | # Load the audio as numpy array 66 | wav = audio.load_wav(wav_path, sr=hparams.sample_rate) 67 | except FileNotFoundError: #catch missing wav exception 68 | print('file {} present in csv metadata is not present in wav folder. skipping!'.format( 69 | wav_path)) 70 | return None 71 | 72 | #Trim lead/trail silences 73 | if hparams.trim_silence: 74 | wav = audio.trim_silence(wav, hparams) 75 | 76 | #Pre-emphasize 77 | preem_wav = audio.preemphasis(wav, hparams.preemphasis, hparams.preemphasize) 78 | 79 | #rescale wav 80 | if hparams.rescale: 81 | wav = wav / np.abs(wav).max() * hparams.rescaling_max 82 | preem_wav = preem_wav / np.abs(preem_wav).max() * hparams.rescaling_max 83 | 84 | #Assert all audio is in [-1, 1] 85 | if (wav > 1.).any() or (wav < -1.).any(): 86 | raise RuntimeError('wav has invalid value: {}'.format(wav_path)) 87 | if (preem_wav > 1.).any() or (preem_wav < -1.).any(): 88 | raise RuntimeError('wav has invalid value: {}'.format(wav_path)) 89 | 90 | #Mu-law quantize 91 | if is_mulaw_quantize(hparams.input_type): 92 | #[0, quantize_channels) 93 | out = mulaw_quantize(wav, hparams.quantize_channels) 94 | 95 | #Trim silences 96 | start, end = audio.start_and_end_indices(out, hparams.silence_threshold) 97 | wav = wav[start: end] 98 | preem_wav = preem_wav[start: end] 99 | out = out[start: end] 100 | 101 | constant_values = mulaw_quantize(0, hparams.quantize_channels) 102 | out_dtype = np.int16 103 | 104 | elif is_mulaw(hparams.input_type): 105 | #[-1, 1] 106 | out = mulaw(wav, hparams.quantize_channels) 107 | constant_values = mulaw(0., hparams.quantize_channels) 108 | out_dtype = np.float32 109 | 110 | else: 111 | #[-1, 1] 112 | out = wav 113 | constant_values = 0. 
114 | out_dtype = np.float32 115 | 116 | # Compute the mel scale spectrogram from the wav 117 | mel_spectrogram = audio.melspectrogram(preem_wav, hparams).astype(np.float32) 118 | mel_frames = mel_spectrogram.shape[1] 119 | 120 | if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length: 121 | return None 122 | 123 | #Compute the linear scale spectrogram from the wav 124 | linear_spectrogram = audio.linearspectrogram(preem_wav, hparams).astype(np.float32) 125 | linear_frames = linear_spectrogram.shape[1] 126 | 127 | #sanity check 128 | assert linear_frames == mel_frames 129 | 130 | if hparams.use_lws: 131 | #Ensure time resolution adjustement between audio and mel-spectrogram 132 | fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size 133 | l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams)) 134 | 135 | #Zero pad audio signal 136 | out = np.pad(out, (l, r), mode='constant', constant_values=constant_values) 137 | else: 138 | #Ensure time resolution adjustement between audio and mel-spectrogram 139 | l_pad, r_pad = audio.librosa_pad_lr(wav, hparams.n_fft, audio.get_hop_size(hparams), hparams.wavenet_pad_sides) 140 | 141 | #Reflect pad audio signal on the right (Just like it's done in Librosa to avoid frame inconsistency) 142 | out = np.pad(out, (l_pad, r_pad), mode='constant', constant_values=constant_values) 143 | 144 | assert len(out) >= mel_frames * audio.get_hop_size(hparams) 145 | 146 | #time resolution adjustement 147 | #ensure length of raw audio is multiple of hop size so that we can use 148 | #transposed convolution to upsample 149 | out = out[:mel_frames * audio.get_hop_size(hparams)] 150 | assert len(out) % audio.get_hop_size(hparams) == 0 151 | time_steps = len(out) 152 | 153 | # Write the spectrogram and audio to disk 154 | audio_filename = 'audio-{}.npy'.format(index) 155 | mel_filename = 'mel-{}.npy'.format(index) 156 | linear_filename = 'linear-{}.npy'.format(index) 157 | np.save(os.path.join(wav_dir, audio_filename), out.astype(out_dtype), allow_pickle=False) 158 | np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False) 159 | np.save(os.path.join(linear_dir, linear_filename), linear_spectrogram.T, allow_pickle=False) 160 | 161 | # Return a tuple describing this training example 162 | return (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text) 163 | -------------------------------------------------------------------------------- /datasets/wavenet_preprocessor.py: -------------------------------------------------------------------------------- 1 | import os 2 | from concurrent.futures import ProcessPoolExecutor 3 | from functools import partial 4 | 5 | import numpy as np 6 | from datasets import audio 7 | from wavenet_vocoder.util import is_mulaw, is_mulaw_quantize, mulaw, mulaw_quantize 8 | 9 | 10 | def build_from_path(hparams, input_dir, mel_dir, wav_dir, n_jobs=12, tqdm=lambda x: x): 11 | """ 12 | Preprocesses the speech dataset from a gven input path to given output directories 13 | 14 | Args: 15 | - hparams: hyper parameters 16 | - input_dir: input directory that contains the files to prerocess 17 | - mel_dir: output directory of the preprocessed speech mel-spectrogram dataset 18 | - linear_dir: output directory of the preprocessed speech linear-spectrogram dataset 19 | - wav_dir: output directory of the preprocessed speech audio dataset 20 | - n_jobs: Optional, number of worker process to parallelize across 21 | - tqdm: Optional, provides a nice progress bar 22 | 23 | Returns: 
24 | - A list of tuple describing the train examples. this should be written to train.txt 25 | """ 26 | 27 | # We use ProcessPoolExecutor to parallelize across processes, this is just for 28 | # optimization purposes and it can be omited 29 | executor = ProcessPoolExecutor(max_workers=n_jobs) 30 | futures = [] 31 | for file in os.listdir(input_dir): 32 | wav_path = os.path.join(input_dir, file) 33 | basename = os.path.basename(wav_path).replace('.wav', '') 34 | futures.append(executor.submit(partial(_process_utterance, mel_dir, wav_dir, basename, wav_path, hparams))) 35 | 36 | return [future.result() for future in tqdm(futures) if future.result() is not None] 37 | 38 | 39 | def _process_utterance(mel_dir, wav_dir, index, wav_path, hparams): 40 | """ 41 | Preprocesses a single utterance wav/text pair 42 | 43 | this writes the mel scale spectogram to disk and return a tuple to write 44 | to the train.txt file 45 | 46 | Args: 47 | - mel_dir: the directory to write the mel spectograms into 48 | - linear_dir: the directory to write the linear spectrograms into 49 | - wav_dir: the directory to write the preprocessed wav into 50 | - index: the numeric index to use in the spectrogram filename 51 | - wav_path: path to the audio file containing the speech input 52 | - text: text spoken in the input audio file 53 | - hparams: hyper parameters 54 | 55 | Returns: 56 | - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text) 57 | """ 58 | try: 59 | # Load the audio as numpy array 60 | wav = audio.load_wav(wav_path, sr=hparams.sample_rate) 61 | except FileNotFoundError: #catch missing wav exception 62 | print('file {} present in csv metadata is not present in wav folder. skipping!'.format( 63 | wav_path)) 64 | return None 65 | 66 | #M-AILABS extra silence specific 67 | if hparams.trim_silence: 68 | wav = audio.trim_silence(wav, hparams) 69 | 70 | #Pre-emphasize 71 | preem_wav = audio.preemphasis(wav, hparams.preemphasis, hparams.preemphasize) 72 | 73 | #rescale wav 74 | if hparams.rescale: 75 | wav = wav / np.abs(wav).max() * hparams.rescaling_max 76 | preem_wav = preem_wav / np.abs(preem_wav).max() * hparams.rescaling_max 77 | 78 | #Assert all audio is in [-1, 1] 79 | if (wav > 1.).any() or (wav < -1.).any(): 80 | raise RuntimeError('wav has invalid value: {}'.format(wav_path)) 81 | if (preem_wav > 1.).any() or (preem_wav < -1.).any(): 82 | raise RuntimeError('wav has invalid value: {}'.format(wav_path)) 83 | 84 | #Mu-law quantize 85 | if is_mulaw_quantize(hparams.input_type): 86 | #[0, quantize_channels) 87 | out = mulaw_quantize(wav, hparams.quantize_channels) 88 | 89 | #Trim silences 90 | start, end = audio.start_and_end_indices(out, hparams.silence_threshold) 91 | wav = wav[start: end] 92 | preem_wav = preem_wav[start: end] 93 | out = out[start: end] 94 | 95 | constant_values = mulaw_quantize(0, hparams.quantize_channels) 96 | out_dtype = np.int16 97 | 98 | elif is_mulaw(hparams.input_type): 99 | #[-1, 1] 100 | out = mulaw(wav, hparams.quantize_channels) 101 | constant_values = mulaw(0., hparams.quantize_channels) 102 | out_dtype = np.float32 103 | 104 | else: 105 | #[-1, 1] 106 | out = wav 107 | constant_values = 0. 
108 | out_dtype = np.float32 109 | 110 | # Compute the mel scale spectrogram from the wav 111 | mel_spectrogram = audio.melspectrogram(preem_wav, hparams).astype(np.float32) 112 | mel_frames = mel_spectrogram.shape[1] 113 | 114 | if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length: 115 | return None 116 | 117 | if hparams.use_lws: 118 | #Ensure time resolution adjustement between audio and mel-spectrogram 119 | fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size 120 | l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams)) 121 | 122 | #Zero pad audio signal 123 | out = np.pad(out, (l, r), mode='constant', constant_values=constant_values) 124 | else: 125 | #Ensure time resolution adjustement between audio and mel-spectrogram 126 | l_pad, r_pad = audio.librosa_pad_lr(wav, hparams.n_fft, audio.get_hop_size(hparams)) 127 | 128 | #Reflect pad audio signal (Just like it's done in Librosa to avoid frame inconsistency) 129 | out = np.pad(out, (l_pad, r_pad), mode='constant', constant_values=constant_values) 130 | 131 | assert len(out) >= mel_frames * audio.get_hop_size(hparams) 132 | 133 | #time resolution adjustement 134 | #ensure length of raw audio is multiple of hop size so that we can use 135 | #transposed convolution to upsample 136 | out = out[:mel_frames * audio.get_hop_size(hparams)] 137 | assert len(out) % audio.get_hop_size(hparams) == 0 138 | time_steps = len(out) 139 | 140 | # Write the spectrogram and audio to disk 141 | audio_filename = os.path.join(wav_dir, 'audio-{}.npy'.format(index)) 142 | mel_filename = os.path.join(mel_dir, 'mel-{}.npy'.format(index)) 143 | np.save(audio_filename, out.astype(out_dtype), allow_pickle=False) 144 | np.save(mel_filename, mel_spectrogram.T, allow_pickle=False) 145 | 146 | #global condition features 147 | if hparams.gin_channels > 0: 148 | raise RuntimeError('When activating global conditions, please set your speaker_id rules in line 129 of datasets/wavenet_preprocessor.py to use them during training') 149 | speaker_id = '' #put the rule to determine how to assign speaker ids (using file names maybe? file basenames are available in "index" variable) 150 | else: 151 | speaker_id = '' 152 | 153 | # Return a tuple describing this training example 154 | return (audio_filename, mel_filename, mel_filename, speaker_id, time_steps, mel_frames) 155 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM continuumio/anaconda3:latest 2 | FROM tensorflow/tensorflow:latest-gpu-py3 3 | 4 | RUN apt-get update 5 | RUN apt-get install -y libasound-dev portaudio19-dev libportaudio2 libportaudiocpp0 ffmpeg libav-tools wget git vim 6 | 7 | RUN wget http://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2 8 | RUN tar -jxvf LJSpeech-1.1.tar.bz2 9 | 10 | RUN git clone https://github.com/Rayhane-mamah/Tacotron-2.git 11 | 12 | WORKDIR Tacotron-2 13 | RUN ln -s ../LJSpeech-1.1 . 
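# The symlink above exposes the extracted LJSpeech-1.1 dataset inside the repository, where preprocess.py looks for it by default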
14 | RUN pip install -r requirements.txt -------------------------------------------------------------------------------- /griffin_lim_synthesis_tool.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "scrolled": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "from datasets.audio import *\n", 13 | "import os\n", 14 | "from hparams import hparams\n", 15 | "\n", 16 | "n_sample = 0 #Change n_steps here\n", 17 | "mel_folder = 'logs-Tacotron/mel-spectrograms' #Or change file path\n", 18 | "mel_file = 'mel-prediction-step-{}.npy'.format(n_sample) #Or file name (for other generated mels)\n", 19 | "out_dir = 'wav_out'\n", 20 | "\n", 21 | "os.makedirs(out_dir, exist_ok=True)\n", 22 | "\n", 23 | "#mel_file = os.path.join(mel_folder, mel_file)\n", 24 | "mel_file = 'training_data/mels/mel-LJ001-0001.npy'\n", 25 | "mel_spectro = np.load(mel_file)\n", 26 | "mel_spectro.shape" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "wav = inv_mel_spectrogram(mel_spectro.T, hparams) \n", 36 | "#save the wav under test__\n", 37 | "save_wav(wav, os.path.join(out_dir, 'test_mel_{}.wav'.format(mel_file.replace('/', '_').replace('\\\\', '_').replace('.npy', ''))),\n", 38 | " sr=hparams.sample_rate)" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "from tacotron.utils.plot import *\n", 48 | "\n", 49 | "plot_spectrogram(mel_spectro, path=os.path.join(out_dir, 'test_mel_{}.png'.format(mel_file.replace('/', '_').replace('\\\\', '_').replace('.npy', ''))))" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "lin_file = 'training_data/linear/linear-LJ001-0001.npy'\n", 59 | "lin_spectro = np.load(lin_file)\n", 60 | "lin_spectro.shape" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "wav = inv_linear_spectrogram(lin_spectro.T, hparams)\n", 70 | "save_wav(wav, os.path.join(out_dir, 'test_linear_{}.wav'.format(mel_file.replace('/', '_').replace('\\\\', '_').replace('.npy', ''))),\n", 71 | " sr=hparams.sample_rate)" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "plot_spectrogram(lin_spectro, path=os.path.join(out_dir, 'test_linear_{}.png'.format(mel_file.replace('/', '_').replace('\\\\', '_').replace('.npy', ''))),\n", 81 | " auto_aspect=True)" 82 | ] 83 | } 84 | ], 85 | "metadata": { 86 | "kernelspec": { 87 | "display_name": "Python 3", 88 | "language": "python", 89 | "name": "python3" 90 | }, 91 | "language_info": { 92 | "codemirror_mode": { 93 | "name": "ipython", 94 | "version": 3 95 | }, 96 | "file_extension": ".py", 97 | "mimetype": "text/x-python", 98 | "name": "python", 99 | "nbconvert_exporter": "python", 100 | "pygments_lexer": "ipython3", 101 | "version": "3.6.4" 102 | } 103 | }, 104 | "nbformat": 4, 105 | "nbformat_minor": 2 106 | } 107 | -------------------------------------------------------------------------------- /infolog.py: -------------------------------------------------------------------------------- 1 | import atexit 2 | import json 3 | from datetime import datetime 4 | from 
threading import Thread 5 | from urllib.request import Request, urlopen 6 | 7 | _format = '%Y-%m-%d %H:%M:%S.%f' 8 | _file = None 9 | _run_name = None 10 | _slack_url = None 11 | 12 | 13 | def init(filename, run_name, slack_url=None): 14 | global _file, _run_name, _slack_url 15 | _close_logfile() 16 | _file = open(filename, 'a') 17 | _file = open(filename, 'a') 18 | _file.write('\n-----------------------------------------------------------------\n') 19 | _file.write('Starting new {} training run\n'.format(run_name)) 20 | _file.write('-----------------------------------------------------------------\n') 21 | _run_name = run_name 22 | _slack_url = slack_url 23 | 24 | 25 | def log(msg, end='\n', slack=False): 26 | print(msg, end=end) 27 | if _file is not None: 28 | _file.write('[%s] %s\n' % (datetime.now().strftime(_format)[:-3], msg)) 29 | if slack and _slack_url is not None: 30 | Thread(target=_send_slack, args=(msg,)).start() 31 | 32 | 33 | def _close_logfile(): 34 | global _file 35 | if _file is not None: 36 | _file.close() 37 | _file = None 38 | 39 | 40 | def _send_slack(msg): 41 | req = Request(_slack_url) 42 | req.add_header('Content-Type', 'application/json') 43 | urlopen(req, json.dumps({ 44 | 'username': 'tacotron', 45 | 'icon_emoji': ':taco:', 46 | 'text': '*%s*: %s' % (_run_name, msg) 47 | }).encode()) 48 | 49 | 50 | atexit.register(_close_logfile) 51 | -------------------------------------------------------------------------------- /papers/(content+location) attention.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Rayhane-mamah/Tacotron-2/ab5cb08a931fc842d3892ebeb27c8b8734ddd4b8/papers/(content+location) attention.pdf -------------------------------------------------------------------------------- /papers/ClariNet.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Rayhane-mamah/Tacotron-2/ab5cb08a931fc842d3892ebeb27c8b8734ddd4b8/papers/ClariNet.pdf -------------------------------------------------------------------------------- /papers/Tacotron 2 revised.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Rayhane-mamah/Tacotron-2/ab5cb08a931fc842d3892ebeb27c8b8734ddd4b8/papers/Tacotron 2 revised.pdf -------------------------------------------------------------------------------- /papers/bahdanau (content) attention.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Rayhane-mamah/Tacotron-2/ab5cb08a931fc842d3892ebeb27c8b8734ddd4b8/papers/bahdanau (content) attention.pdf -------------------------------------------------------------------------------- /papers/deepvoice 3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Rayhane-mamah/Tacotron-2/ab5cb08a931fc842d3892ebeb27c8b8734ddd4b8/papers/deepvoice 3.pdf -------------------------------------------------------------------------------- /papers/effective approaches attention.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Rayhane-mamah/Tacotron-2/ab5cb08a931fc842d3892ebeb27c8b8734ddd4b8/papers/effective approaches attention.pdf -------------------------------------------------------------------------------- /papers/fast_wavenet.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Rayhane-mamah/Tacotron-2/ab5cb08a931fc842d3892ebeb27c8b8734ddd4b8/papers/fast_wavenet.pdf -------------------------------------------------------------------------------- /papers/tacotron.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Rayhane-mamah/Tacotron-2/ab5cb08a931fc842d3892ebeb27c8b8734ddd4b8/papers/tacotron.pdf -------------------------------------------------------------------------------- /papers/tacotron2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Rayhane-mamah/Tacotron-2/ab5cb08a931fc842d3892ebeb27c8b8734ddd4b8/papers/tacotron2.pdf -------------------------------------------------------------------------------- /papers/wavenet.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Rayhane-mamah/Tacotron-2/ab5cb08a931fc842d3892ebeb27c8b8734ddd4b8/papers/wavenet.pdf -------------------------------------------------------------------------------- /preprocess.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from multiprocessing import cpu_count 4 | 5 | from datasets import preprocessor 6 | from hparams import hparams 7 | from tqdm import tqdm 8 | 9 | 10 | def preprocess(args, input_folders, out_dir, hparams): 11 | mel_dir = os.path.join(out_dir, 'mels') 12 | wav_dir = os.path.join(out_dir, 'audio') 13 | linear_dir = os.path.join(out_dir, 'linear') 14 | os.makedirs(mel_dir, exist_ok=True) 15 | os.makedirs(wav_dir, exist_ok=True) 16 | os.makedirs(linear_dir, exist_ok=True) 17 | metadata = preprocessor.build_from_path(hparams, input_folders, mel_dir, linear_dir, wav_dir, args.n_jobs, tqdm=tqdm) 18 | write_metadata(metadata, out_dir) 19 | 20 | def write_metadata(metadata, out_dir): 21 | with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f: 22 | for m in metadata: 23 | f.write('|'.join([str(x) for x in m]) + '\n') 24 | mel_frames = sum([int(m[4]) for m in metadata]) 25 | timesteps = sum([int(m[3]) for m in metadata]) 26 | sr = hparams.sample_rate 27 | hours = timesteps / sr / 3600 28 | print('Write {} utterances, {} mel frames, {} audio timesteps, ({:.2f} hours)'.format( 29 | len(metadata), mel_frames, timesteps, hours)) 30 | print('Max input length (text chars): {}'.format(max(len(m[5]) for m in metadata))) 31 | print('Max mel frames length: {}'.format(max(int(m[4]) for m in metadata))) 32 | print('Max audio timesteps length: {}'.format(max(m[3] for m in metadata))) 33 | 34 | def norm_data(args): 35 | 36 | merge_books = (args.merge_books=='True') 37 | 38 | print('Selecting data folders..') 39 | supported_datasets = ['LJSpeech-1.0', 'LJSpeech-1.1', 'M-AILABS'] 40 | if args.dataset not in supported_datasets: 41 | raise ValueError('dataset value entered {} does not belong to supported datasets: {}'.format( 42 | args.dataset, supported_datasets)) 43 | 44 | if args.dataset.startswith('LJSpeech'): 45 | return [os.path.join(args.base_dir, args.dataset)] 46 | 47 | 48 | if args.dataset == 'M-AILABS': 49 | supported_languages = ['en_US', 'en_UK', 'fr_FR', 'it_IT', 'de_DE', 'es_ES', 'ru_RU', 50 | 'uk_UK', 'pl_PL', 'nl_NL', 'pt_PT', 'fi_FI', 'se_SE', 'tr_TR', 'ar_SA'] 51 | if args.language not in supported_languages: 52 | raise ValueError('Please enter a supported language to use from M-AILABS dataset! 
\n{}'.format( 53 | supported_languages)) 54 | 55 | supported_voices = ['female', 'male', 'mix'] 56 | if args.voice not in supported_voices: 57 | raise ValueError('Please enter a supported voice option to use from M-AILABS dataset! \n{}'.format( 58 | supported_voices)) 59 | 60 | path = os.path.join(args.base_dir, args.language, 'by_book', args.voice) 61 | supported_readers = [e for e in os.listdir(path) if os.path.isdir(os.path.join(path,e))] 62 | if args.reader not in supported_readers: 63 | raise ValueError('Please enter a valid reader for your language and voice settings! \n{}'.format( 64 | supported_readers)) 65 | 66 | path = os.path.join(path, args.reader) 67 | supported_books = [e for e in os.listdir(path) if os.path.isdir(os.path.join(path,e))] 68 | if merge_books: 69 | return [os.path.join(path, book) for book in supported_books] 70 | 71 | else: 72 | if args.book not in supported_books: 73 | raise ValueError('Please enter a valid book for your reader settings! \n{}'.format( 74 | supported_books)) 75 | 76 | return [os.path.join(path, args.book)] 77 | 78 | 79 | def run_preprocess(args, hparams): 80 | input_folders = norm_data(args) 81 | output_folder = os.path.join(args.base_dir, args.output) 82 | 83 | preprocess(args, input_folders, output_folder, hparams) 84 | 85 | 86 | def main(): 87 | print('initializing preprocessing..') 88 | parser = argparse.ArgumentParser() 89 | parser.add_argument('--base_dir', default='') 90 | parser.add_argument('--hparams', default='', 91 | help='Hyperparameter overrides as a comma-separated list of name=value pairs') 92 | parser.add_argument('--dataset', default='LJSpeech-1.1') 93 | parser.add_argument('--language', default='en_US') 94 | parser.add_argument('--voice', default='female') 95 | parser.add_argument('--reader', default='mary_ann') 96 | parser.add_argument('--merge_books', default='False') 97 | parser.add_argument('--book', default='northandsouth') 98 | parser.add_argument('--output', default='training_data') 99 | parser.add_argument('--n_jobs', type=int, default=cpu_count()) 100 | args = parser.parse_args() 101 | 102 | modified_hp = hparams.parse(args.hparams) 103 | 104 | assert args.merge_books in ('False', 'True') 105 | 106 | run_preprocess(args, modified_hp) 107 | 108 | 109 | if __name__ == '__main__': 110 | main() 111 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | falcon==1.2.0 2 | inflect==0.2.5 3 | audioread==2.1.5 4 | librosa==0.5.1 5 | matplotlib==2.0.2 6 | numpy==1.14.0 7 | scipy==1.0.0 8 | tqdm==4.11.2 9 | Unidecode==0.4.20 10 | pyaudio==0.2.11 11 | sounddevice==0.3.10 12 | lws 13 | keras -------------------------------------------------------------------------------- /sentences.txt: -------------------------------------------------------------------------------- 1 | Scientists at the CERN laboratory say they have discovered a new particle. 2 | There's a way to measure the acute emotional intelligence that has never gone out of style. 3 | President Trump met with other leaders at the Group of 20 conference. 4 | The Senate's bill to repeal and replace the Affordable Care Act is now imperiled. 5 | Generative adversarial network or variational auto-encoder. 6 | Basilar membrane and otolaryngology are not auto-correlations. 7 | He has read the whole thing. 8 | He reads books. 9 | He thought it was time to present the present. 10 | Thisss isrealy awhsome. 11 | Punctuation sensitivity, is working. 
12 | Punctuation sensitivity is working. 13 | Peter Piper picked a peck of pickled peppers. How many pickled peppers did Peter Piper pick? 14 | She sells sea-shells on the sea-shore. The shells she sells are sea-shells I'm sure. 15 | Tajima Airport serves Toyooka. 16 | On offering to help the blind man, the man who then stole his car, had not, at that precise moment, had any evil intention, quite the contrary, what he did was nothing more than obey those feelings of generosity and altruism which, as everyone knows, are the two best traits of human nature and to be found in much more hardened criminals than this one, a simple car-thief without any hope of advancing in his profession, exploited by the real owners this enterprise, for it is they who take advantage of the needs of the poor. 17 | Thank you so much for your support! -------------------------------------------------------------------------------- /synthesize.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from warnings import warn 4 | from time import sleep 5 | 6 | import tensorflow as tf 7 | 8 | from hparams import hparams 9 | from infolog import log 10 | from tacotron.synthesize import tacotron_synthesize 11 | from wavenet_vocoder.synthesize import wavenet_synthesize 12 | 13 | 14 | def prepare_run(args): 15 | modified_hp = hparams.parse(args.hparams) 16 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 17 | 18 | run_name = args.name or args.tacotron_name or args.model 19 | taco_checkpoint = os.path.join('logs-' + run_name, 'taco_' + args.checkpoint) 20 | 21 | run_name = args.name or args.wavenet_name or args.model 22 | wave_checkpoint = os.path.join('logs-' + run_name, 'wave_' + args.checkpoint) 23 | return taco_checkpoint, wave_checkpoint, modified_hp 24 | 25 | def get_sentences(args): 26 | if args.text_list != '': 27 | with open(args.text_list, 'rb') as f: 28 | sentences = list(map(lambda l: l.decode("utf-8")[:-1], f.readlines())) 29 | else: 30 | sentences = hparams.sentences 31 | return sentences 32 | 33 | def synthesize(args, hparams, taco_checkpoint, wave_checkpoint, sentences): 34 | log('Running End-to-End TTS Evaluation. Model: {}'.format(args.name or args.model)) 35 | log('Synthesizing mel-spectrograms from text..') 36 | wavenet_in_dir = tacotron_synthesize(args, hparams, taco_checkpoint, sentences) 37 | #Delete Tacotron model from graph 38 | tf.reset_default_graph() 39 | #Sleep 1/2 second to let previous graph close and avoid error messages while Wavenet is synthesizing 40 | sleep(0.5) 41 | log('Synthesizing audio from mel-spectrograms.. (This may take a while)') 42 | wavenet_synthesize(args, hparams, wave_checkpoint) 43 | log('Tacotron-2 TTS synthesis complete!') 44 | 45 | 46 | 47 | def main(): 48 | accepted_modes = ['eval', 'synthesis', 'live'] 49 | parser = argparse.ArgumentParser() 50 | parser.add_argument('--checkpoint', default='pretrained/', help='Path to model checkpoint') 51 | parser.add_argument('--hparams', default='', 52 | help='Hyperparameter overrides as a comma-separated list of name=value pairs') 53 | parser.add_argument('--name', help='Name of logging directory if the two models were trained together.') 54 | parser.add_argument('--tacotron_name', help='Name of logging directory of Tacotron. If trained separately') 55 | parser.add_argument('--wavenet_name', help='Name of logging directory of WaveNet. 
If trained separately') 56 | parser.add_argument('--model', default='Tacotron-2') 57 | parser.add_argument('--input_dir', default='training_data/', help='folder to contain inputs sentences/targets') 58 | parser.add_argument('--mels_dir', default='tacotron_output/eval/', help='folder to contain mels to synthesize audio from using the Wavenet') 59 | parser.add_argument('--output_dir', default='output/', help='folder to contain synthesized mel spectrograms') 60 | parser.add_argument('--mode', default='eval', help='mode of run: can be one of {}'.format(accepted_modes)) 61 | parser.add_argument('--GTA', default='True', help='Ground truth aligned synthesis, defaults to True, only considered in synthesis mode') 62 | parser.add_argument('--text_list', default='', help='Text file contains list of texts to be synthesized. Valid if mode=eval') 63 | parser.add_argument('--speaker_id', default=None, help='Defines the speakers ids to use when running standalone Wavenet on a folder of mels. this variable must be a comma-separated list of ids') 64 | args = parser.parse_args() 65 | 66 | accepted_models = ['Tacotron', 'WaveNet', 'Tacotron-2'] 67 | 68 | if args.model not in accepted_models: 69 | raise ValueError('please enter a valid model to synthesize with: {}'.format(accepted_models)) 70 | 71 | if args.mode not in accepted_modes: 72 | raise ValueError('accepted modes are: {}, found {}'.format(accepted_modes, args.mode)) 73 | 74 | if args.mode == 'live' and args.model == 'Wavenet': 75 | raise RuntimeError('Wavenet vocoder cannot be tested live due to its slow generation. Live only works with Tacotron!') 76 | 77 | if args.GTA not in ('True', 'False'): 78 | raise ValueError('GTA option must be either True or False') 79 | 80 | if args.model == 'Tacotron-2': 81 | if args.mode == 'live': 82 | warn('Requested a live evaluation with Tacotron-2, Wavenet will not be used!') 83 | if args.mode == 'synthesis': 84 | raise ValueError('I don\'t recommend running WaveNet on entire dataset.. The world might end before the synthesis :) (only eval allowed)') 85 | 86 | taco_checkpoint, wave_checkpoint, hparams = prepare_run(args) 87 | sentences = get_sentences(args) 88 | 89 | if args.model == 'Tacotron': 90 | _ = tacotron_synthesize(args, hparams, taco_checkpoint, sentences) 91 | elif args.model == 'WaveNet': 92 | wavenet_synthesize(args, hparams, wave_checkpoint) 93 | elif args.model == 'Tacotron-2': 94 | synthesize(args, hparams, taco_checkpoint, wave_checkpoint, sentences) 95 | else: 96 | raise ValueError('Model provided {} unknown! {}'.format(args.model, accepted_models)) 97 | 98 | 99 | if __name__ == '__main__': 100 | main() 101 | -------------------------------------------------------------------------------- /tacotron/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------- /tacotron/feeder.py: -------------------------------------------------------------------------------- 1 | import os 2 | import threading 3 | import time 4 | import traceback 5 | 6 | import numpy as np 7 | import tensorflow as tf 8 | from infolog import log 9 | from sklearn.model_selection import train_test_split 10 | from tacotron.utils.text import text_to_sequence 11 | 12 | _batches_per_group = 64 13 | 14 | class Feeder: 15 | """ 16 | Feeds batches of data into queue on a background thread. 
17 | """ 18 | 19 | def __init__(self, coordinator, metadata_filename, hparams): 20 | super(Feeder, self).__init__() 21 | self._coord = coordinator 22 | self._hparams = hparams 23 | self._cleaner_names = [x.strip() for x in hparams.cleaners.split(',')] 24 | self._train_offset = 0 25 | self._test_offset = 0 26 | 27 | # Load metadata 28 | self._mel_dir = os.path.join(os.path.dirname(metadata_filename), 'mels') 29 | self._linear_dir = os.path.join(os.path.dirname(metadata_filename), 'linear') 30 | with open(metadata_filename, encoding='utf-8') as f: 31 | self._metadata = [line.strip().split('|') for line in f] 32 | frame_shift_ms = hparams.hop_size / hparams.sample_rate 33 | hours = sum([int(x[4]) for x in self._metadata]) * frame_shift_ms / (3600) 34 | log('Loaded metadata for {} examples ({:.2f} hours)'.format(len(self._metadata), hours)) 35 | 36 | #Train test split 37 | if hparams.tacotron_test_size is None: 38 | assert hparams.tacotron_test_batches is not None 39 | 40 | test_size = (hparams.tacotron_test_size if hparams.tacotron_test_size is not None 41 | else hparams.tacotron_test_batches * hparams.tacotron_batch_size) 42 | indices = np.arange(len(self._metadata)) 43 | train_indices, test_indices = train_test_split(indices, 44 | test_size=test_size, random_state=hparams.tacotron_data_random_state) 45 | 46 | #Make sure test_indices is a multiple of batch_size else round down 47 | len_test_indices = self._round_down(len(test_indices), hparams.tacotron_batch_size) 48 | extra_test = test_indices[len_test_indices:] 49 | test_indices = test_indices[:len_test_indices] 50 | train_indices = np.concatenate([train_indices, extra_test]) 51 | 52 | self._train_meta = list(np.array(self._metadata)[train_indices]) 53 | self._test_meta = list(np.array(self._metadata)[test_indices]) 54 | 55 | self.test_steps = len(self._test_meta) // hparams.tacotron_batch_size 56 | 57 | if hparams.tacotron_test_size is None: 58 | assert hparams.tacotron_test_batches == self.test_steps 59 | 60 | #pad input sequences with the 0 ( _ ) 61 | self._pad = 0 62 | #explicitely setting the padding to a value that doesn't originally exist in the spectogram 63 | #to avoid any possible conflicts, without affecting the output range of the model too much 64 | if hparams.symmetric_mels: 65 | self._target_pad = -hparams.max_abs_value 66 | else: 67 | self._target_pad = 0. 68 | #Mark finished sequences with 1s 69 | self._token_pad = 1. 70 | 71 | with tf.device('/cpu:0'): 72 | # Create placeholders for inputs and targets. Don't specify batch size because we want 73 | # to be able to feed different batch sizes at eval time. 
74 | self._placeholders = [ 75 | tf.placeholder(tf.int32, shape=(None, None), name='inputs'), 76 | tf.placeholder(tf.int32, shape=(None, ), name='input_lengths'), 77 | tf.placeholder(tf.float32, shape=(None, None, hparams.num_mels), name='mel_targets'), 78 | tf.placeholder(tf.float32, shape=(None, None), name='token_targets'), 79 | tf.placeholder(tf.float32, shape=(None, None, hparams.num_freq), name='linear_targets'), 80 | tf.placeholder(tf.int32, shape=(None, ), name='targets_lengths'), 81 | tf.placeholder(tf.int32, shape=(hparams.tacotron_num_gpus, None), name='split_infos'), 82 | ] 83 | 84 | # Create queue for buffering data 85 | queue = tf.FIFOQueue(8, [tf.int32, tf.int32, tf.float32, tf.float32, tf.float32, tf.int32, tf.int32], name='input_queue') 86 | self._enqueue_op = queue.enqueue(self._placeholders) 87 | self.inputs, self.input_lengths, self.mel_targets, self.token_targets, self.linear_targets, self.targets_lengths, self.split_infos = queue.dequeue() 88 | 89 | self.inputs.set_shape(self._placeholders[0].shape) 90 | self.input_lengths.set_shape(self._placeholders[1].shape) 91 | self.mel_targets.set_shape(self._placeholders[2].shape) 92 | self.token_targets.set_shape(self._placeholders[3].shape) 93 | self.linear_targets.set_shape(self._placeholders[4].shape) 94 | self.targets_lengths.set_shape(self._placeholders[5].shape) 95 | self.split_infos.set_shape(self._placeholders[6].shape) 96 | 97 | # Create eval queue for buffering eval data 98 | eval_queue = tf.FIFOQueue(1, [tf.int32, tf.int32, tf.float32, tf.float32, tf.float32, tf.int32, tf.int32], name='eval_queue') 99 | self._eval_enqueue_op = eval_queue.enqueue(self._placeholders) 100 | self.eval_inputs, self.eval_input_lengths, self.eval_mel_targets, self.eval_token_targets, \ 101 | self.eval_linear_targets, self.eval_targets_lengths, self.eval_split_infos = eval_queue.dequeue() 102 | 103 | self.eval_inputs.set_shape(self._placeholders[0].shape) 104 | self.eval_input_lengths.set_shape(self._placeholders[1].shape) 105 | self.eval_mel_targets.set_shape(self._placeholders[2].shape) 106 | self.eval_token_targets.set_shape(self._placeholders[3].shape) 107 | self.eval_linear_targets.set_shape(self._placeholders[4].shape) 108 | self.eval_targets_lengths.set_shape(self._placeholders[5].shape) 109 | self.eval_split_infos.set_shape(self._placeholders[6].shape) 110 | 111 | def start_threads(self, session): 112 | self._session = session 113 | thread = threading.Thread(name='background', target=self._enqueue_next_train_group) 114 | thread.daemon = True #Thread will close when parent quits 115 | thread.start() 116 | 117 | thread = threading.Thread(name='background', target=self._enqueue_next_test_group) 118 | thread.daemon = True #Thread will close when parent quits 119 | thread.start() 120 | 121 | def _get_test_groups(self): 122 | meta = self._test_meta[self._test_offset] 123 | self._test_offset += 1 124 | 125 | text = meta[5] 126 | 127 | input_data = np.asarray(text_to_sequence(text, self._cleaner_names), dtype=np.int32) 128 | mel_target = np.load(os.path.join(self._mel_dir, meta[1])) 129 | #Create parallel sequences containing zeros to represent a non finished sequence 130 | token_target = np.asarray([0.] 
* (len(mel_target) - 1)) 131 | linear_target = np.load(os.path.join(self._linear_dir, meta[2])) 132 | return (input_data, mel_target, token_target, linear_target, len(mel_target)) 133 | 134 | def make_test_batches(self): 135 | start = time.time() 136 | 137 | # Read a group of examples 138 | n = self._hparams.tacotron_batch_size 139 | r = self._hparams.outputs_per_step 140 | 141 | #Test on entire test set 142 | examples = [self._get_test_groups() for i in range(len(self._test_meta))] 143 | 144 | # Bucket examples based on similar output sequence length for efficiency 145 | examples.sort(key=lambda x: x[-1]) 146 | batches = [examples[i: i+n] for i in range(0, len(examples), n)] 147 | np.random.shuffle(batches) 148 | 149 | log('\nGenerated {} test batches of size {} in {:.3f} sec'.format(len(batches), n, time.time() - start)) 150 | return batches, r 151 | 152 | def _enqueue_next_train_group(self): 153 | while not self._coord.should_stop(): 154 | start = time.time() 155 | 156 | # Read a group of examples 157 | n = self._hparams.tacotron_batch_size 158 | r = self._hparams.outputs_per_step 159 | examples = [self._get_next_example() for i in range(n * _batches_per_group)] 160 | 161 | # Bucket examples based on similar output sequence length for efficiency 162 | examples.sort(key=lambda x: x[-1]) 163 | batches = [examples[i: i+n] for i in range(0, len(examples), n)] 164 | np.random.shuffle(batches) 165 | 166 | log('\nGenerated {} train batches of size {} in {:.3f} sec'.format(len(batches), n, time.time() - start)) 167 | for batch in batches: 168 | feed_dict = dict(zip(self._placeholders, self._prepare_batch(batch, r))) 169 | self._session.run(self._enqueue_op, feed_dict=feed_dict) 170 | 171 | def _enqueue_next_test_group(self): 172 | #Create test batches once and evaluate on them for all test steps 173 | test_batches, r = self.make_test_batches() 174 | while not self._coord.should_stop(): 175 | for batch in test_batches: 176 | feed_dict = dict(zip(self._placeholders, self._prepare_batch(batch, r))) 177 | self._session.run(self._eval_enqueue_op, feed_dict=feed_dict) 178 | 179 | def _get_next_example(self): 180 | """Gets a single example (input, mel_target, token_target, linear_target, mel_length) from_ disk 181 | """ 182 | if self._train_offset >= len(self._train_meta): 183 | self._train_offset = 0 184 | np.random.shuffle(self._train_meta) 185 | 186 | meta = self._train_meta[self._train_offset] 187 | self._train_offset += 1 188 | 189 | text = meta[5] 190 | 191 | input_data = np.asarray(text_to_sequence(text, self._cleaner_names), dtype=np.int32) 192 | mel_target = np.load(os.path.join(self._mel_dir, meta[1])) 193 | #Create parallel sequences containing zeros to represent a non finished sequence 194 | token_target = np.asarray([0.] 
* (len(mel_target) - 1)) 195 | linear_target = np.load(os.path.join(self._linear_dir, meta[2])) 196 | return (input_data, mel_target, token_target, linear_target, len(mel_target)) 197 | 198 | def _prepare_batch(self, batches, outputs_per_step): 199 | assert 0 == len(batches) % self._hparams.tacotron_num_gpus 200 | size_per_device = int(len(batches) / self._hparams.tacotron_num_gpus) 201 | np.random.shuffle(batches) 202 | 203 | inputs = None 204 | mel_targets = None 205 | token_targets = None 206 | linear_targets = None 207 | targets_lengths = None 208 | split_infos = [] 209 | 210 | targets_lengths = np.asarray([x[-1] for x in batches], dtype=np.int32) #Used to mask loss 211 | input_lengths = np.asarray([len(x[0]) for x in batches], dtype=np.int32) 212 | 213 | #Produce inputs/targets of variables lengths for different GPUs 214 | for i in range(self._hparams.tacotron_num_gpus): 215 | batch = batches[size_per_device * i: size_per_device * (i + 1)] 216 | input_cur_device, input_max_len = self._prepare_inputs([x[0] for x in batch]) 217 | inputs = np.concatenate((inputs, input_cur_device), axis=1) if inputs is not None else input_cur_device 218 | mel_target_cur_device, mel_target_max_len = self._prepare_targets([x[1] for x in batch], outputs_per_step) 219 | mel_targets = np.concatenate(( mel_targets, mel_target_cur_device), axis=1) if mel_targets is not None else mel_target_cur_device 220 | 221 | #Pad sequences with 1 to infer that the sequence is done 222 | token_target_cur_device, token_target_max_len = self._prepare_token_targets([x[2] for x in batch], outputs_per_step) 223 | token_targets = np.concatenate((token_targets, token_target_cur_device),axis=1) if token_targets is not None else token_target_cur_device 224 | linear_targets_cur_device, linear_target_max_len = self._prepare_targets([x[3] for x in batch], outputs_per_step) 225 | linear_targets = np.concatenate((linear_targets, linear_targets_cur_device), axis=1) if linear_targets is not None else linear_targets_cur_device 226 | split_infos.append([input_max_len, mel_target_max_len, token_target_max_len, linear_target_max_len]) 227 | 228 | split_infos = np.asarray(split_infos, dtype=np.int32) 229 | return (inputs, input_lengths, mel_targets, token_targets, linear_targets, targets_lengths, split_infos) 230 | 231 | def _prepare_inputs(self, inputs): 232 | max_len = max([len(x) for x in inputs]) 233 | return np.stack([self._pad_input(x, max_len) for x in inputs]), max_len 234 | 235 | def _prepare_targets(self, targets, alignment): 236 | max_len = max([len(t) for t in targets]) 237 | data_len = self._round_up(max_len, alignment) 238 | return np.stack([self._pad_target(t, data_len) for t in targets]), data_len 239 | 240 | def _prepare_token_targets(self, targets, alignment): 241 | max_len = max([len(t) for t in targets]) + 1 242 | data_len = self._round_up(max_len, alignment) 243 | return np.stack([self._pad_token_target(t, data_len) for t in targets]), data_len 244 | 245 | def _pad_input(self, x, length): 246 | return np.pad(x, (0, length - x.shape[0]), mode='constant', constant_values=self._pad) 247 | 248 | def _pad_target(self, t, length): 249 | return np.pad(t, [(0, length - t.shape[0]), (0, 0)], mode='constant', constant_values=self._target_pad) 250 | 251 | def _pad_token_target(self, t, length): 252 | return np.pad(t, (0, length - t.shape[0]), mode='constant', constant_values=self._token_pad) 253 | 254 | def _round_up(self, x, multiple): 255 | remainder = x % multiple 256 | return x if remainder == 0 else x + multiple - remainder 257 | 
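# Example: with outputs_per_step = 2, _round_up(13, 2) -> 14 (targets are padded up to a
# multiple of the reduction factor), while _round_down(13, 2) -> 12 (the test split is trimmed
# down to a multiple of the batch size).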
258 | def _round_down(self, x, multiple): 259 | remainder = x % multiple 260 | return x if remainder == 0 else x - remainder 261 | -------------------------------------------------------------------------------- /tacotron/models/Architecture_wrappers.py: -------------------------------------------------------------------------------- 1 | """A set of wrappers usefull for tacotron 2 architecture 2 | All notations and variable names were used in concordance with originial tensorflow implementation 3 | """ 4 | import collections 5 | 6 | import numpy as np 7 | import tensorflow as tf 8 | from tacotron.models.attention import _compute_attention 9 | from tensorflow.contrib.rnn import RNNCell 10 | from tensorflow.python.framework import ops, tensor_shape 11 | from tensorflow.python.ops import array_ops, check_ops, rnn_cell_impl, tensor_array_ops 12 | from tensorflow.python.util import nest 13 | 14 | _zero_state_tensors = rnn_cell_impl._zero_state_tensors 15 | 16 | 17 | 18 | class TacotronEncoderCell(RNNCell): 19 | """Tacotron 2 Encoder Cell 20 | Passes inputs through a stack of convolutional layers then through a bidirectional LSTM 21 | layer to predict the hidden representation vector (or memory) 22 | """ 23 | 24 | def __init__(self, convolutional_layers, lstm_layer): 25 | """Initialize encoder parameters 26 | 27 | Args: 28 | convolutional_layers: Encoder convolutional block class 29 | lstm_layer: encoder bidirectional lstm layer class 30 | """ 31 | super(TacotronEncoderCell, self).__init__() 32 | #Initialize encoder layers 33 | self._convolutions = convolutional_layers 34 | self._cell = lstm_layer 35 | 36 | def __call__(self, inputs, input_lengths=None): 37 | #Pass input sequence through a stack of convolutional layers 38 | conv_output = self._convolutions(inputs) 39 | 40 | #Extract hidden representation from encoder lstm cells 41 | hidden_representation = self._cell(conv_output, input_lengths) 42 | 43 | #For shape visualization 44 | self.conv_output_shape = conv_output.shape 45 | return hidden_representation 46 | 47 | 48 | class TacotronDecoderCellState( 49 | collections.namedtuple("TacotronDecoderCellState", 50 | ("cell_state", "attention", "time", "alignments", 51 | "alignment_history", "max_attentions"))): 52 | """`namedtuple` storing the state of a `TacotronDecoderCell`. 53 | Contains: 54 | - `cell_state`: The state of the wrapped `RNNCell` at the previous time 55 | step. 56 | - `attention`: The attention emitted at the previous time step. 57 | - `time`: int32 scalar containing the current time step. 58 | - `alignments`: A single or tuple of `Tensor`(s) containing the alignments 59 | emitted at the previous time step for each attention mechanism. 60 | - `alignment_history`: a single or tuple of `TensorArray`(s) 61 | containing alignment matrices from all time steps for each attention 62 | mechanism. Call `stack()` on each to convert to a `Tensor`. 63 | """ 64 | def replace(self, **kwargs): 65 | """Clones the current state while overwriting components provided by kwargs. 
66 | """ 67 | return super(TacotronDecoderCellState, self)._replace(**kwargs) 68 | 69 | class TacotronDecoderCell(RNNCell): 70 | """Tactron 2 Decoder Cell 71 | Decodes encoder output and previous mel frames into next r frames 72 | 73 | Decoder Step i: 74 | 1) Prenet to compress last output information 75 | 2) Concat compressed inputs with previous context vector (input feeding) * 76 | 3) Decoder RNN (actual decoding) to predict current state s_{i} * 77 | 4) Compute new context vector c_{i} based on s_{i} and a cumulative sum of previous alignments * 78 | 5) Predict new output y_{i} using s_{i} and c_{i} (concatenated) 79 | 6) Predict output ys_{i} using s_{i} and c_{i} (concatenated) 80 | 81 | * : This is typically taking a vanilla LSTM, wrapping it using tensorflow's attention wrapper, 82 | and wrap that with the prenet before doing an input feeding, and with the prediction layer 83 | that uses RNN states to project on output space. Actions marked with (*) can be replaced with 84 | tensorflow's attention wrapper call if it was using cumulative alignments instead of previous alignments only. 85 | """ 86 | 87 | def __init__(self, prenet, attention_mechanism, rnn_cell, frame_projection, stop_projection): 88 | """Initialize decoder parameters 89 | 90 | Args: 91 | prenet: A tensorflow fully connected layer acting as the decoder pre-net 92 | attention_mechanism: A _BaseAttentionMechanism instance, usefull to 93 | learn encoder-decoder alignments 94 | rnn_cell: Instance of RNNCell, main body of the decoder 95 | frame_projection: tensorflow fully connected layer with r * num_mels output units 96 | stop_projection: tensorflow fully connected layer, expected to project to a scalar 97 | and through a sigmoid activation 98 | mask_finished: Boolean, Whether to mask decoder frames after the 99 | """ 100 | super(TacotronDecoderCell, self).__init__() 101 | #Initialize decoder layers 102 | self._prenet = prenet 103 | self._attention_mechanism = attention_mechanism 104 | self._cell = rnn_cell 105 | self._frame_projection = frame_projection 106 | self._stop_projection = stop_projection 107 | 108 | self._attention_layer_size = self._attention_mechanism.values.get_shape()[-1].value 109 | 110 | def _batch_size_checks(self, batch_size, error_message): 111 | return [check_ops.assert_equal(batch_size, 112 | self._attention_mechanism.batch_size, 113 | message=error_message)] 114 | 115 | @property 116 | def output_size(self): 117 | return self._frame_projection.shape 118 | 119 | @property 120 | def state_size(self): 121 | """The `state_size` property of `TacotronDecoderCell`. 122 | 123 | Returns: 124 | An `TacotronDecoderCell` tuple containing shapes used by this object. 125 | """ 126 | return TacotronDecoderCellState( 127 | cell_state=self._cell._cell.state_size, 128 | time=tensor_shape.TensorShape([]), 129 | attention=self._attention_layer_size, 130 | alignments=self._attention_mechanism.alignments_size, 131 | alignment_history=(), 132 | max_attentions=()) 133 | 134 | def zero_state(self, batch_size, dtype): 135 | """Return an initial (zero) state tuple for this `AttentionWrapper`. 136 | 137 | Args: 138 | batch_size: `0D` integer tensor: the batch size. 139 | dtype: The internal state data type. 140 | Returns: 141 | An `TacotronDecoderCellState` tuple containing zeroed out tensors and, 142 | possibly, empty `TensorArray` objects. 
143 | Raises: 144 | ValueError: (or, possibly at runtime, InvalidArgument), if 145 | `batch_size` does not match the output size of the encoder passed 146 | to the wrapper object at initialization time. 147 | """ 148 | with ops.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]): 149 | cell_state = self._cell._cell.zero_state(batch_size, dtype) 150 | error_message = ( 151 | "When calling zero_state of TacotronDecoderCell %s: " % self._base_name + 152 | "Non-matching batch sizes between the memory " 153 | "(encoder output) and the requested batch size.") 154 | with ops.control_dependencies( 155 | self._batch_size_checks(batch_size, error_message)): 156 | cell_state = nest.map_structure( 157 | lambda s: array_ops.identity(s, name="checked_cell_state"), 158 | cell_state) 159 | return TacotronDecoderCellState( 160 | cell_state=cell_state, 161 | time=array_ops.zeros([], dtype=tf.int32), 162 | attention=_zero_state_tensors(self._attention_layer_size, batch_size, 163 | dtype), 164 | alignments=self._attention_mechanism.initial_alignments(batch_size, dtype), 165 | alignment_history=tensor_array_ops.TensorArray(dtype=dtype, size=0, 166 | dynamic_size=True), 167 | max_attentions=tf.zeros((batch_size, ), dtype=tf.int32)) 168 | 169 | def __call__(self, inputs, state): 170 | #Information bottleneck (essential for learning attention) 171 | prenet_output = self._prenet(inputs) 172 | 173 | #Concat context vector and prenet output to form LSTM cells input (input feeding) 174 | LSTM_input = tf.concat([prenet_output, state.attention], axis=-1) 175 | 176 | #Unidirectional LSTM layers 177 | LSTM_output, next_cell_state = self._cell(LSTM_input, state.cell_state) 178 | 179 | 180 | #Compute the attention (context) vector and alignments using 181 | #the new decoder cell hidden state as query vector 182 | #and cumulative alignments to extract location features 183 | #The choice of the new cell hidden state (s_{i}) of the last 184 | #decoder RNN Cell is based on Luong et Al. 
(2015): 185 | #https://arxiv.org/pdf/1508.04025.pdf 186 | previous_alignments = state.alignments 187 | previous_alignment_history = state.alignment_history 188 | context_vector, alignments, cumulated_alignments, max_attentions = _compute_attention(self._attention_mechanism, 189 | LSTM_output, 190 | previous_alignments, 191 | attention_layer=None, 192 | prev_max_attentions=state.max_attentions) 193 | 194 | #Concat LSTM outputs and context vector to form projections inputs 195 | projections_input = tf.concat([LSTM_output, context_vector], axis=-1) 196 | 197 | #Compute predicted frames and predicted 198 | cell_outputs = self._frame_projection(projections_input) 199 | stop_tokens = self._stop_projection(projections_input) 200 | 201 | #Save alignment history 202 | alignment_history = previous_alignment_history.write(state.time, alignments) 203 | 204 | #Prepare next decoder state 205 | next_state = TacotronDecoderCellState( 206 | time=state.time + 1, 207 | cell_state=next_cell_state, 208 | attention=context_vector, 209 | alignments=cumulated_alignments, 210 | alignment_history=alignment_history, 211 | max_attentions=max_attentions) 212 | 213 | return (cell_outputs, stop_tokens), next_state 214 | -------------------------------------------------------------------------------- /tacotron/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .tacotron import Tacotron 2 | 3 | 4 | def create_model(name, hparams): 5 | if name == 'Tacotron': 6 | return Tacotron(hparams) 7 | else: 8 | raise Exception('Unknown model: ' + name) 9 | -------------------------------------------------------------------------------- /tacotron/models/attention.py: -------------------------------------------------------------------------------- 1 | """Attention file for location based attention (compatible with tensorflow attention wrapper)""" 2 | 3 | import tensorflow as tf 4 | from tensorflow.contrib.seq2seq.python.ops.attention_wrapper import BahdanauAttention 5 | from tensorflow.python.layers import core as layers_core 6 | from tensorflow.python.ops import array_ops, math_ops, nn_ops, variable_scope 7 | 8 | 9 | #From https://github.com/tensorflow/tensorflow/blob/r1.7/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py 10 | def _compute_attention(attention_mechanism, cell_output, attention_state, 11 | attention_layer, prev_max_attentions): 12 | """Computes the attention and alignments for a given attention_mechanism.""" 13 | alignments, next_attention_state, max_attentions = attention_mechanism( 14 | cell_output, state=attention_state, prev_max_attentions=prev_max_attentions) 15 | 16 | # Reshape from [batch_size, memory_time] to [batch_size, 1, memory_time] 17 | expanded_alignments = array_ops.expand_dims(alignments, 1) 18 | # Context is the inner product of alignments and values along the 19 | # memory time dimension. 20 | # alignments shape is 21 | # [batch_size, 1, memory_time] 22 | # attention_mechanism.values shape is 23 | # [batch_size, memory_time, memory_size] 24 | # the batched matmul is over memory_time, so the output shape is 25 | # [batch_size, 1, memory_size]. 26 | # we then squeeze out the singleton dim. 
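# Example shapes: with batch_size=32, memory_time=150 and memory_size=512, the batched matmul is
# [32, 1, 150] x [32, 150, 512] -> [32, 1, 512], squeezed to a [32, 512] context vector.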
27 | context = math_ops.matmul(expanded_alignments, attention_mechanism.values) 28 | context = array_ops.squeeze(context, [1]) 29 | 30 | if attention_layer is not None: 31 | attention = attention_layer(array_ops.concat([cell_output, context], 1)) 32 | else: 33 | attention = context 34 | 35 | return attention, alignments, next_attention_state, max_attentions 36 | 37 | 38 | def _location_sensitive_score(W_query, W_fil, W_keys): 39 | """Impelements Bahdanau-style (cumulative) scoring function. 40 | This attention is described in: 41 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben- 42 | gio, “Attention-based models for speech recognition,” in Ad- 43 | vances in Neural Information Processing Systems, 2015, pp. 44 | 577–585. 45 | 46 | ############################################################################# 47 | hybrid attention (content-based + location-based) 48 | f = F * α_{i-1} 49 | energy = dot(v_a, tanh(W_keys(h_enc) + W_query(h_dec) + W_fil(f) + b_a)) 50 | ############################################################################# 51 | 52 | Args: 53 | W_query: Tensor, shape '[batch_size, 1, attention_dim]' to compare to location features. 54 | W_location: processed previous alignments into location features, shape '[batch_size, max_time, attention_dim]' 55 | W_keys: Tensor, shape '[batch_size, max_time, attention_dim]', typically the encoder outputs. 56 | Returns: 57 | A '[batch_size, max_time]' attention score (energy) 58 | """ 59 | # Get the number of hidden units from the trailing dimension of keys 60 | dtype = W_query.dtype 61 | num_units = W_keys.shape[-1].value or array_ops.shape(W_keys)[-1] 62 | 63 | v_a = tf.get_variable( 64 | 'attention_variable_projection', shape=[num_units], dtype=dtype, 65 | initializer=tf.contrib.layers.xavier_initializer()) 66 | b_a = tf.get_variable( 67 | 'attention_bias', shape=[num_units], dtype=dtype, 68 | initializer=tf.zeros_initializer()) 69 | 70 | return tf.reduce_sum(v_a * tf.tanh(W_keys + W_query + W_fil + b_a), [2]) 71 | 72 | def _smoothing_normalization(e): 73 | """Applies a smoothing normalization function instead of softmax 74 | Introduced in: 75 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben- 76 | gio, “Attention-based models for speech recognition,” in Ad- 77 | vances in Neural Information Processing Systems, 2015, pp. 78 | 577–585. 79 | 80 | ############################################################################ 81 | Smoothing normalization function 82 | a_{i, j} = sigmoid(e_{i, j}) / sum_j(sigmoid(e_{i, j})) 83 | ############################################################################ 84 | 85 | Args: 86 | e: matrix [batch_size, max_time(memory_time)]: expected to be energy (score) 87 | values of an attention mechanism 88 | Returns: 89 | matrix [batch_size, max_time]: [0, 1] normalized alignments with possible 90 | attendance to multiple memory time steps. 91 | """ 92 | return tf.nn.sigmoid(e) / tf.reduce_sum(tf.nn.sigmoid(e), axis=-1, keepdims=True) 93 | 94 | 95 | class LocationSensitiveAttention(BahdanauAttention): 96 | """Impelements Bahdanau-style (cumulative) scoring function. 97 | Usually referred to as "hybrid" attention (content-based + location-based) 98 | Extends the additive attention described in: 99 | "D. Bahdanau, K. Cho, and Y. Bengio, “Neural machine transla- 100 | tion by jointly learning to align and translate,” in Proceedings 101 | of ICLR, 2015." 102 | to use previous alignments as additional location features. 103 | 104 | This attention is described in: 105 | J. K. 
Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben- 106 | gio, “Attention-based models for speech recognition,” in Ad- 107 | vances in Neural Information Processing Systems, 2015, pp. 108 | 577–585. 109 | """ 110 | 111 | def __init__(self, 112 | num_units, 113 | memory, 114 | hparams, 115 | is_training, 116 | mask_encoder=True, 117 | memory_sequence_length=None, 118 | smoothing=False, 119 | cumulate_weights=True, 120 | name='LocationSensitiveAttention'): 121 | """Construct the Attention mechanism. 122 | Args: 123 | num_units: The depth of the query mechanism. 124 | memory: The memory to query; usually the output of an RNN encoder. This 125 | tensor should be shaped `[batch_size, max_time, ...]`. 126 | mask_encoder (optional): Boolean, whether to mask encoder paddings. 127 | memory_sequence_length (optional): Sequence lengths for the batch entries 128 | in memory. If provided, the memory tensor rows are masked with zeros 129 | for values past the respective sequence lengths. Only relevant if mask_encoder = True. 130 | smoothing (optional): Boolean. Determines which normalization function to use. 131 | Default normalization function (probablity_fn) is softmax. If smoothing is 132 | enabled, we replace softmax with: 133 | a_{i, j} = sigmoid(e_{i, j}) / sum_j(sigmoid(e_{i, j})) 134 | Introduced in: 135 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben- 136 | gio, “Attention-based models for speech recognition,” in Ad- 137 | vances in Neural Information Processing Systems, 2015, pp. 138 | 577–585. 139 | This is mainly used if the model wants to attend to multiple input parts 140 | at the same decoding step. We probably won't be using it since multiple sound 141 | frames may depend on the same character/phone, probably not the way around. 142 | Note: 143 | We still keep it implemented in case we want to test it. They used it in the 144 | paper in the context of speech recognition, where one phoneme may depend on 145 | multiple subsequent sound frames. 146 | name: Name to use when creating ops. 147 | """ 148 | #Create normalization function 149 | #Setting it to None defaults in using softmax 150 | normalization_function = _smoothing_normalization if (smoothing == True) else None 151 | memory_length = memory_sequence_length if (mask_encoder==True) else None 152 | super(LocationSensitiveAttention, self).__init__( 153 | num_units=num_units, 154 | memory=memory, 155 | memory_sequence_length=memory_length, 156 | probability_fn=normalization_function, 157 | name=name) 158 | 159 | self.location_convolution = tf.layers.Conv1D(filters=hparams.attention_filters, 160 | kernel_size=hparams.attention_kernel, padding='same', use_bias=True, 161 | bias_initializer=tf.zeros_initializer(), name='location_features_convolution') 162 | self.location_layer = tf.layers.Dense(units=num_units, use_bias=False, 163 | dtype=tf.float32, name='location_features_layer') 164 | self._cumulate = cumulate_weights 165 | self.synthesis_constraint = hparams.synthesis_constraint and not is_training 166 | self.attention_win_size = tf.convert_to_tensor(hparams.attention_win_size, dtype=tf.int32) 167 | self.constraint_type = hparams.synthesis_constraint_type 168 | 169 | def __call__(self, query, state, prev_max_attentions): 170 | """Score the query based on the keys and values. 171 | Args: 172 | query: Tensor of dtype matching `self.values` and shape 173 | `[batch_size, query_depth]`. 
174 | state (previous alignments): Tensor of dtype matching `self.values` and shape 175 | `[batch_size, alignments_size]` 176 | (`alignments_size` is memory's `max_time`). 177 | Returns: 178 | alignments: Tensor of dtype matching `self.values` and shape 179 | `[batch_size, alignments_size]` (`alignments_size` is memory's 180 | `max_time`). 181 | """ 182 | previous_alignments = state 183 | with variable_scope.variable_scope(None, "Location_Sensitive_Attention", [query]): 184 | 185 | # processed_query shape [batch_size, query_depth] -> [batch_size, attention_dim] 186 | processed_query = self.query_layer(query) if self.query_layer else query 187 | # -> [batch_size, 1, attention_dim] 188 | processed_query = tf.expand_dims(processed_query, 1) 189 | 190 | # processed_location_features shape [batch_size, max_time, attention dimension] 191 | # [batch_size, max_time] -> [batch_size, max_time, 1] 192 | expanded_alignments = tf.expand_dims(previous_alignments, axis=2) 193 | # location features [batch_size, max_time, filters] 194 | f = self.location_convolution(expanded_alignments) 195 | # Projected location features [batch_size, max_time, attention_dim] 196 | processed_location_features = self.location_layer(f) 197 | 198 | # energy shape [batch_size, max_time] 199 | energy = _location_sensitive_score(processed_query, processed_location_features, self.keys) 200 | 201 | if self.synthesis_constraint: 202 | Tx = tf.shape(energy)[-1] 203 | # prev_max_attentions = tf.squeeze(prev_max_attentions, [-1]) 204 | if self.constraint_type == 'monotonic': 205 | key_masks = tf.sequence_mask(prev_max_attentions, Tx) 206 | reverse_masks = tf.sequence_mask(Tx - self.attention_win_size - prev_max_attentions, Tx)[:, ::-1] 207 | else: 208 | assert self.constraint_type == 'window' 209 | key_masks = tf.sequence_mask(prev_max_attentions - (self.attention_win_size // 2 + (self.attention_win_size % 2 != 0)), Tx) 210 | reverse_masks = tf.sequence_mask(Tx - (self.attention_win_size // 2) - prev_max_attentions, Tx)[:, ::-1] 211 | 212 | masks = tf.logical_or(key_masks, reverse_masks) 213 | paddings = tf.ones_like(energy) * (-2 ** 32 + 1) # (N, Ty/r, Tx) 214 | energy = tf.where(tf.equal(masks, False), energy, paddings) 215 | 216 | # alignments shape = energy shape = [batch_size, max_time] 217 | alignments = self._probability_fn(energy, previous_alignments) 218 | max_attentions = tf.argmax(alignments, -1, output_type=tf.int32) # (N, Ty/r) 219 | 220 | # Cumulate alignments 221 | if self._cumulate: 222 | next_state = alignments + previous_alignments 223 | else: 224 | next_state = alignments 225 | 226 | return alignments, next_state, max_attentions 227 | -------------------------------------------------------------------------------- /tacotron/models/custom_decoder.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | 3 | import collections 4 | 5 | import tensorflow as tf 6 | from tacotron.models.helpers import TacoTestHelper, TacoTrainingHelper 7 | from tensorflow.contrib.seq2seq.python.ops import decoder 8 | from tensorflow.contrib.seq2seq.python.ops import helper as helper_py 9 | from tensorflow.python.framework import ops, tensor_shape 10 | from tensorflow.python.layers import base as layers_base 11 | from tensorflow.python.ops import rnn_cell_impl 12 | from tensorflow.python.util import nest 13 | 14 | 15 | class CustomDecoderOutput( 16 | collections.namedtuple("CustomDecoderOutput", ("rnn_output", "token_output", "sample_id"))): 
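# rnn_output holds the decoder frame projections, token_output the stop-token predictions,
# and sample_id the helper's (ignored) sample ids; the subclass only names the namedtuple fields.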
17 | pass 18 | 19 | 20 | class CustomDecoder(decoder.Decoder): 21 | """Custom sampling decoder. 22 | 23 | Allows for stop token prediction at inference time 24 | and returns equivalent loss in training time. 25 | 26 | Note: 27 | Only use this decoder with Tacotron 2 as it only accepts tacotron custom helpers 28 | """ 29 | 30 | def __init__(self, cell, helper, initial_state, output_layer=None): 31 | """Initialize CustomDecoder. 32 | Args: 33 | cell: An `RNNCell` instance. 34 | helper: A `Helper` instance. 35 | initial_state: A (possibly nested tuple of...) tensors and TensorArrays. 36 | The initial state of the RNNCell. 37 | output_layer: (Optional) An instance of `tf.layers.Layer`, i.e., 38 | `tf.layers.Dense`. Optional layer to apply to the RNN output prior 39 | to storing the result or sampling. 40 | Raises: 41 | TypeError: if `cell`, `helper` or `output_layer` have an incorrect type. 42 | """ 43 | rnn_cell_impl.assert_like_rnncell(type(cell), cell) 44 | if not isinstance(helper, helper_py.Helper): 45 | raise TypeError("helper must be a Helper, received: %s" % type(helper)) 46 | if (output_layer is not None 47 | and not isinstance(output_layer, layers_base.Layer)): 48 | raise TypeError( 49 | "output_layer must be a Layer, received: %s" % type(output_layer)) 50 | self._cell = cell 51 | self._helper = helper 52 | self._initial_state = initial_state 53 | self._output_layer = output_layer 54 | 55 | @property 56 | def batch_size(self): 57 | return self._helper.batch_size 58 | 59 | def _rnn_output_size(self): 60 | size = self._cell.output_size 61 | if self._output_layer is None: 62 | return size 63 | else: 64 | # To use layer's compute_output_shape, we need to convert the 65 | # RNNCell's output_size entries into shapes with an unknown 66 | # batch size. We then pass this through the layer's 67 | # compute_output_shape and read off all but the first (batch) 68 | # dimensions to get the output size of the rnn with the layer 69 | # applied to the top. 70 | output_shape_with_unknown_batch = nest.map_structure( 71 | lambda s: tensor_shape.TensorShape([None]).concatenate(s), 72 | size) 73 | layer_output_shape = self._output_layer._compute_output_shape( # pylint: disable=protected-access 74 | output_shape_with_unknown_batch) 75 | return nest.map_structure(lambda s: s[1:], layer_output_shape) 76 | 77 | @property 78 | def output_size(self): 79 | # Return the cell output and the id 80 | return CustomDecoderOutput( 81 | rnn_output=self._rnn_output_size(), 82 | token_output=self._helper.token_output_size, 83 | sample_id=self._helper.sample_ids_shape) 84 | 85 | @property 86 | def output_dtype(self): 87 | # Assume the dtype of the cell is the output_size structure 88 | # containing the input_state's first component's dtype. 89 | # Return that structure and the sample_ids_dtype from the helper. 90 | dtype = nest.flatten(self._initial_state)[0].dtype 91 | return CustomDecoderOutput( 92 | nest.map_structure(lambda _: dtype, self._rnn_output_size()), 93 | tf.float32, 94 | self._helper.sample_ids_dtype) 95 | 96 | def initialize(self, name=None): 97 | """Initialize the decoder. 98 | Args: 99 | name: Name scope for any created operations. 100 | Returns: 101 | `(finished, first_inputs, initial_state)`. 102 | """ 103 | return self._helper.initialize() + (self._initial_state,) 104 | 105 | def step(self, time, inputs, state, name=None): 106 | """Perform a custom decoding step. 107 | Enables for dyanmic prediction 108 | Args: 109 | time: scalar `int32` tensor. 110 | inputs: A (structure of) input tensors. 
111 | state: A (structure of) state tensors and TensorArrays. 112 | name: Name scope for any created operations. 113 | Returns: 114 | `(outputs, next_state, next_inputs, finished)`. 115 | """ 116 | with ops.name_scope(name, "CustomDecoderStep", (time, inputs, state)): 117 | #Call outputprojection wrapper cell 118 | (cell_outputs, stop_token), cell_state = self._cell(inputs, state) 119 | 120 | #apply output_layer (if existant) 121 | if self._output_layer is not None: 122 | cell_outputs = self._output_layer(cell_outputs) 123 | sample_ids = self._helper.sample( 124 | time=time, outputs=cell_outputs, state=cell_state) 125 | 126 | (finished, next_inputs, next_state) = self._helper.next_inputs( 127 | time=time, 128 | outputs=cell_outputs, 129 | state=cell_state, 130 | sample_ids=sample_ids, 131 | stop_token_prediction=stop_token) 132 | 133 | outputs = CustomDecoderOutput(cell_outputs, stop_token, sample_ids) 134 | return (outputs, next_state, next_inputs, finished) 135 | -------------------------------------------------------------------------------- /tacotron/models/helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from tensorflow.contrib.seq2seq import Helper 4 | 5 | 6 | class TacoTestHelper(Helper): 7 | def __init__(self, batch_size, hparams): 8 | with tf.name_scope('TacoTestHelper'): 9 | self._batch_size = batch_size 10 | self._output_dim = hparams.num_mels 11 | self._reduction_factor = hparams.outputs_per_step 12 | self.stop_at_any = hparams.stop_at_any 13 | 14 | @property 15 | def batch_size(self): 16 | return self._batch_size 17 | 18 | @property 19 | def token_output_size(self): 20 | return self._reduction_factor 21 | 22 | @property 23 | def sample_ids_shape(self): 24 | return tf.TensorShape([]) 25 | 26 | @property 27 | def sample_ids_dtype(self): 28 | return np.int32 29 | 30 | def initialize(self, name=None): 31 | return (tf.tile([False], [self._batch_size]), _go_frames(self._batch_size, self._output_dim)) 32 | 33 | def sample(self, time, outputs, state, name=None): 34 | return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them 35 | 36 | def next_inputs(self, time, outputs, state, sample_ids, stop_token_prediction, name=None): 37 | '''Stop on EOS. Otherwise, pass the last output as the next input and pass through state.''' 38 | with tf.name_scope('TacoTestHelper'): 39 | #A sequence is finished when the output probability is > 0.5 40 | finished = tf.cast(tf.round(stop_token_prediction), tf.bool) 41 | 42 | #Since we are predicting r frames at each step, two modes are 43 | #then possible: 44 | # Stop when the model outputs a p > 0.5 for any frame between r frames (Recommended) 45 | # Stop when the model outputs a p > 0.5 for all r frames (Safer) 46 | #Note: 47 | # With enough training steps, the model should be able to predict when to stop correctly 48 | # and the use of stop_at_any = True would be recommended. If however the model didn't 49 | # learn to stop correctly yet, (stops too soon) one could choose to use the safer option 50 | # to get a correct synthesis 51 | if self.stop_at_any: 52 | finished = tf.reduce_any(tf.reduce_all(finished, axis=0)) #Recommended 53 | else: 54 | finished = tf.reduce_all(tf.reduce_all(finished, axis=0)) #Safer option 55 | 56 | # Feed last output frame as next input. 
outputs is [N, output_dim * r] 57 | next_inputs = outputs[:, -self._output_dim:] 58 | next_state = state 59 | return (finished, next_inputs, next_state) 60 | 61 | 62 | class TacoTrainingHelper(Helper): 63 | def __init__(self, batch_size, targets, hparams, gta, evaluating, global_step): 64 | # inputs is [N, T_in], targets is [N, T_out, D] 65 | with tf.name_scope('TacoTrainingHelper'): 66 | self._batch_size = batch_size 67 | self._output_dim = hparams.num_mels 68 | self._reduction_factor = hparams.outputs_per_step 69 | self._ratio = tf.convert_to_tensor(hparams.tacotron_teacher_forcing_ratio) 70 | self.gta = gta 71 | self.eval = evaluating 72 | self._hparams = hparams 73 | self.global_step = global_step 74 | 75 | r = self._reduction_factor 76 | # Feed every r-th target frame as input 77 | self._targets = targets[:, r-1::r, :] 78 | 79 | #Maximal sequence length 80 | self._lengths = tf.tile([tf.shape(self._targets)[1]], [self._batch_size]) 81 | 82 | @property 83 | def batch_size(self): 84 | return self._batch_size 85 | 86 | @property 87 | def token_output_size(self): 88 | return self._reduction_factor 89 | 90 | @property 91 | def sample_ids_shape(self): 92 | return tf.TensorShape([]) 93 | 94 | @property 95 | def sample_ids_dtype(self): 96 | return np.int32 97 | 98 | def initialize(self, name=None): 99 | #Compute teacher forcing ratio for this global step. 100 | #In GTA mode, override teacher forcing scheme to work with full teacher forcing 101 | if self.gta: 102 | self._ratio = tf.convert_to_tensor(1.) #Force GTA model to always feed ground-truth 103 | elif self.eval and self._hparams.tacotron_natural_eval: 104 | self._ratio = tf.convert_to_tensor(0.) #Force eval model to always feed predictions 105 | else: 106 | if self._hparams.tacotron_teacher_forcing_mode == 'scheduled': 107 | self._ratio = _teacher_forcing_ratio_decay(self._hparams.tacotron_teacher_forcing_init_ratio, 108 | self.global_step, self._hparams) 109 | 110 | return (tf.tile([False], [self._batch_size]), _go_frames(self._batch_size, self._output_dim)) 111 | 112 | def sample(self, time, outputs, state, name=None): 113 | return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them 114 | 115 | def next_inputs(self, time, outputs, state, sample_ids, stop_token_prediction, name=None): 116 | with tf.name_scope(name or 'TacoTrainingHelper'): 117 | #synthesis stop (we let the model see paddings as we mask them when computing loss functions) 118 | finished = (time + 1 >= self._lengths) 119 | 120 | #Pick previous outputs randomly with respect to teacher forcing ratio 121 | next_inputs = tf.cond( 122 | tf.less(tf.random_uniform([], minval=0, maxval=1, dtype=tf.float32), self._ratio), 123 | lambda: self._targets[:, time, :], #Teacher-forcing: return true frame 124 | lambda: outputs[:,-self._output_dim:]) 125 | 126 | #Pass on state 127 | next_state = state 128 | return (finished, next_inputs, next_state) 129 | 130 | 131 | def _go_frames(batch_size, output_dim): 132 | '''Returns all-zero frames for a given batch size and output dimension''' 133 | return tf.tile([[0.0]], [batch_size, output_dim]) 134 | 135 | def _teacher_forcing_ratio_decay(init_tfr, global_step, hparams): 136 | ################################################################# 137 | # Narrow Cosine Decay: 138 | 139 | # Phase 1: tfr = init 140 | # We only start learning rate decay after 10k steps 141 | 142 | # Phase 2: tfr in ]init, final[ 143 | # decay reach minimal value at step ~40k 144 | 145 | # Phase 3: tfr = final 146 | # clip by minimal teacher forcing ratio 
value (step >~ 40k) 147 | ################################################################# 148 | #Pick final teacher forcing rate value 149 | if hparams.tacotron_teacher_forcing_final_ratio is not None: 150 | alpha = float(hparams.tacotron_teacher_forcing_final_ratio / hparams.tacotron_teacher_forcing_init_ratio) 151 | 152 | else: 153 | assert hparams.tacotron_teacher_forcing_decay_alpha is not None 154 | alpha = hparams.tacotron_teacher_forcing_decay_alpha 155 | 156 | #Compute natural cosine decay 157 | tfr = tf.train.cosine_decay(init_tfr, 158 | global_step=global_step - hparams.tacotron_teacher_forcing_start_decay, #tfr ~= init at step 10k 159 | decay_steps=hparams.tacotron_teacher_forcing_decay_steps, #tfr ~= final at step ~40k 160 | alpha=alpha, #tfr = alpha% of init_tfr as final value 161 | name='tfr_cosine_decay') 162 | 163 | #force teacher forcing ratio to take initial value when global step < start decay step. 164 | narrow_tfr = tf.cond( 165 | tf.less(global_step, tf.convert_to_tensor(hparams.tacotron_teacher_forcing_start_decay)), 166 | lambda: tf.convert_to_tensor(init_tfr), 167 | lambda: tfr) 168 | 169 | return narrow_tfr -------------------------------------------------------------------------------- /tacotron/synthesize.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import re 4 | import time 5 | from time import sleep 6 | 7 | import tensorflow as tf 8 | from hparams import hparams, hparams_debug_string 9 | from infolog import log 10 | from tacotron.synthesizer import Synthesizer 11 | from tqdm import tqdm 12 | 13 | 14 | def generate_fast(model, text): 15 | model.synthesize([text], None, None, None, None) 16 | 17 | 18 | def run_live(args, checkpoint_path, hparams): 19 | #Log to Terminal without keeping any records in files 20 | log(hparams_debug_string()) 21 | synth = Synthesizer() 22 | synth.load(checkpoint_path, hparams) 23 | 24 | #Generate fast greeting message 25 | greetings = 'Hello, Welcome to the Live testing tool. Please type a message and I will try to read it!' 26 | log(greetings) 27 | generate_fast(synth, greetings) 28 | 29 | #Interaction loop 30 | while True: 31 | try: 32 | text = input() 33 | generate_fast(synth, text) 34 | 35 | except KeyboardInterrupt: 36 | leave = 'Thank you for testing our features. see you soon.' 
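# Synthesize the farewell message, then pause briefly before leaving the interaction loop.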
37 | log(leave) 38 | generate_fast(synth, leave) 39 | sleep(2) 40 | break 41 | 42 | def run_eval(args, checkpoint_path, output_dir, hparams, sentences): 43 | eval_dir = os.path.join(output_dir, 'eval') 44 | log_dir = os.path.join(output_dir, 'logs-eval') 45 | 46 | if args.model == 'Tacotron-2': 47 | assert os.path.normpath(eval_dir) == os.path.normpath(args.mels_dir) 48 | 49 | #Create output path if it doesn't exist 50 | os.makedirs(eval_dir, exist_ok=True) 51 | os.makedirs(log_dir, exist_ok=True) 52 | os.makedirs(os.path.join(log_dir, 'wavs'), exist_ok=True) 53 | os.makedirs(os.path.join(log_dir, 'plots'), exist_ok=True) 54 | 55 | log(hparams_debug_string()) 56 | synth = Synthesizer() 57 | synth.load(checkpoint_path, hparams) 58 | 59 | #Set inputs batch wise 60 | sentences = [sentences[i: i+hparams.tacotron_synthesis_batch_size] for i in range(0, len(sentences), hparams.tacotron_synthesis_batch_size)] 61 | 62 | log('Starting Synthesis') 63 | with open(os.path.join(eval_dir, 'map.txt'), 'w') as file: 64 | for i, texts in enumerate(tqdm(sentences)): 65 | start = time.time() 66 | basenames = ['batch_{}_sentence_{}'.format(i, j) for j in range(len(texts))] 67 | mel_filenames, speaker_ids = synth.synthesize(texts, basenames, eval_dir, log_dir, None) 68 | 69 | for elems in zip(texts, mel_filenames, speaker_ids): 70 | file.write('|'.join([str(x) for x in elems]) + '\n') 71 | log('synthesized mel spectrograms at {}'.format(eval_dir)) 72 | return eval_dir 73 | 74 | def run_synthesis(args, checkpoint_path, output_dir, hparams): 75 | GTA = (args.GTA == 'True') 76 | if GTA: 77 | synth_dir = os.path.join(output_dir, 'gta') 78 | 79 | #Create output path if it doesn't exist 80 | os.makedirs(synth_dir, exist_ok=True) 81 | else: 82 | synth_dir = os.path.join(output_dir, 'natural') 83 | 84 | #Create output path if it doesn't exist 85 | os.makedirs(synth_dir, exist_ok=True) 86 | 87 | 88 | metadata_filename = os.path.join(args.input_dir, 'train.txt') 89 | log(hparams_debug_string()) 90 | synth = Synthesizer() 91 | synth.load(checkpoint_path, hparams, gta=GTA) 92 | with open(metadata_filename, encoding='utf-8') as f: 93 | metadata = [line.strip().split('|') for line in f] 94 | frame_shift_ms = hparams.hop_size / hparams.sample_rate 95 | hours = sum([int(x[4]) for x in metadata]) * frame_shift_ms / (3600) 96 | log('Loaded metadata for {} examples ({:.2f} hours)'.format(len(metadata), hours)) 97 | 98 | #Set inputs batch wise 99 | metadata = [metadata[i: i+hparams.tacotron_synthesis_batch_size] for i in range(0, len(metadata), hparams.tacotron_synthesis_batch_size)] 100 | 101 | log('Starting Synthesis') 102 | mel_dir = os.path.join(args.input_dir, 'mels') 103 | wav_dir = os.path.join(args.input_dir, 'audio') 104 | with open(os.path.join(synth_dir, 'map.txt'), 'w') as file: 105 | for i, meta in enumerate(tqdm(metadata)): 106 | texts = [m[5] for m in meta] 107 | mel_filenames = [os.path.join(mel_dir, m[1]) for m in meta] 108 | wav_filenames = [os.path.join(wav_dir, m[0]) for m in meta] 109 | basenames = [os.path.basename(m).replace('.npy', '').replace('mel-', '') for m in mel_filenames] 110 | mel_output_filenames, speaker_ids = synth.synthesize(texts, basenames, synth_dir, None, mel_filenames) 111 | 112 | for elems in zip(wav_filenames, mel_filenames, mel_output_filenames, speaker_ids, texts): 113 | file.write('|'.join([str(x) for x in elems]) + '\n') 114 | log('synthesized mel spectrograms at {}'.format(synth_dir)) 115 | return os.path.join(synth_dir, 'map.txt') 116 | 117 | def tacotron_synthesize(args, hparams, 
checkpoint, sentences=None): 118 | output_dir = 'tacotron_' + args.output_dir 119 | 120 | try: 121 | checkpoint_path = tf.train.get_checkpoint_state(checkpoint).model_checkpoint_path 122 | log('loaded model at {}'.format(checkpoint_path)) 123 | except: 124 | raise RuntimeError('Failed to load checkpoint at {}'.format(checkpoint)) 125 | 126 | if hparams.tacotron_synthesis_batch_size < hparams.tacotron_num_gpus: 127 | raise ValueError('Defined synthesis batch size {} is smaller than minimum required {} (num_gpus)! Please verify your synthesis batch size choice.'.format( 128 | hparams.tacotron_synthesis_batch_size, hparams.tacotron_num_gpus)) 129 | 130 | if hparams.tacotron_synthesis_batch_size % hparams.tacotron_num_gpus != 0: 131 | raise ValueError('Defined synthesis batch size {} is not a multiple of {} (num_gpus)! Please verify your synthesis batch size choice!'.format( 132 | hparams.tacotron_synthesis_batch_size, hparams.tacotron_num_gpus)) 133 | 134 | if args.mode == 'eval': 135 | return run_eval(args, checkpoint_path, output_dir, hparams, sentences) 136 | elif args.mode == 'synthesis': 137 | return run_synthesis(args, checkpoint_path, output_dir, hparams) 138 | else: 139 | run_live(args, checkpoint_path, hparams) 140 | -------------------------------------------------------------------------------- /tacotron/synthesizer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import platform 3 | from datetime import datetime 4 | 5 | import numpy as np 6 | import pyaudio 7 | import sounddevice as sd 8 | import tensorflow as tf 9 | from datasets import audio 10 | from infolog import log 11 | from librosa import effects 12 | from tacotron.models import create_model 13 | from tacotron.utils import plot 14 | from tacotron.utils.text import text_to_sequence 15 | 16 | 17 | class Synthesizer: 18 | def load(self, checkpoint_path, hparams, gta=False, model_name='Tacotron'): 19 | log('Constructing model: %s' % model_name) 20 | #Force the batch size to be known in order to use attention masking in batch synthesis 21 | inputs = tf.placeholder(tf.int32, (None, None), name='inputs') 22 | input_lengths = tf.placeholder(tf.int32, (None), name='input_lengths') 23 | targets = tf.placeholder(tf.float32, (None, None, hparams.num_mels), name='mel_targets') 24 | split_infos = tf.placeholder(tf.int32, shape=(hparams.tacotron_num_gpus, None), name='split_infos') 25 | with tf.variable_scope('Tacotron_model', reuse=tf.AUTO_REUSE) as scope: 26 | self.model = create_model(model_name, hparams) 27 | if gta: 28 | self.model.initialize(inputs, input_lengths, targets, gta=gta, split_infos=split_infos) 29 | else: 30 | self.model.initialize(inputs, input_lengths, split_infos=split_infos) 31 | 32 | self.mel_outputs = self.model.tower_mel_outputs 33 | self.linear_outputs = self.model.tower_linear_outputs if (hparams.predict_linear and not gta) else None 34 | self.alignments = self.model.tower_alignments 35 | self.stop_token_prediction = self.model.tower_stop_token_prediction 36 | self.targets = targets 37 | 38 | if hparams.GL_on_GPU: 39 | self.GLGPU_mel_inputs = tf.placeholder(tf.float32, (None, hparams.num_mels), name='GLGPU_mel_inputs') 40 | self.GLGPU_lin_inputs = tf.placeholder(tf.float32, (None, hparams.num_freq), name='GLGPU_lin_inputs') 41 | 42 | self.GLGPU_mel_outputs = audio.inv_mel_spectrogram_tensorflow(self.GLGPU_mel_inputs, hparams) 43 | self.GLGPU_lin_outputs = audio.inv_linear_spectrogram_tensorflow(self.GLGPU_lin_inputs, hparams) 44 | 45 | self.gta = gta 46 |
self._hparams = hparams 47 | #pad input sequences with the 0 ( _ ) 48 | self._pad = 0 49 | #explicitely setting the padding to a value that doesn't originally exist in the spectogram 50 | #to avoid any possible conflicts, without affecting the output range of the model too much 51 | if hparams.symmetric_mels: 52 | self._target_pad = -hparams.max_abs_value 53 | else: 54 | self._target_pad = 0. 55 | 56 | self.inputs = inputs 57 | self.input_lengths = input_lengths 58 | self.targets = targets 59 | self.split_infos = split_infos 60 | 61 | log('Loading checkpoint: %s' % checkpoint_path) 62 | #Memory allocation on the GPUs as needed 63 | config = tf.ConfigProto() 64 | config.gpu_options.allow_growth = True 65 | config.allow_soft_placement = True 66 | 67 | self.session = tf.Session(config=config) 68 | self.session.run(tf.global_variables_initializer()) 69 | 70 | saver = tf.train.Saver() 71 | saver.restore(self.session, checkpoint_path) 72 | 73 | 74 | def synthesize(self, texts, basenames, out_dir, log_dir, mel_filenames): 75 | hparams = self._hparams 76 | cleaner_names = [x.strip() for x in hparams.cleaners.split(',')] 77 | #[-max, max] or [0,max] 78 | T2_output_range = (-hparams.max_abs_value, hparams.max_abs_value) if hparams.symmetric_mels else (0, hparams.max_abs_value) 79 | 80 | #Repeat last sample until number of samples is dividable by the number of GPUs (last run scenario) 81 | while len(texts) % hparams.tacotron_synthesis_batch_size != 0: 82 | texts.append(texts[-1]) 83 | basenames.append(basenames[-1]) 84 | if mel_filenames is not None: 85 | mel_filenames.append(mel_filenames[-1]) 86 | 87 | assert 0 == len(texts) % self._hparams.tacotron_num_gpus 88 | seqs = [np.asarray(text_to_sequence(text, cleaner_names)) for text in texts] 89 | input_lengths = [len(seq) for seq in seqs] 90 | 91 | size_per_device = len(seqs) // self._hparams.tacotron_num_gpus 92 | 93 | #Pad inputs according to each GPU max length 94 | input_seqs = None 95 | split_infos = [] 96 | for i in range(self._hparams.tacotron_num_gpus): 97 | device_input = seqs[size_per_device*i: size_per_device*(i+1)] 98 | device_input, max_seq_len = self._prepare_inputs(device_input) 99 | input_seqs = np.concatenate((input_seqs, device_input), axis=1) if input_seqs is not None else device_input 100 | split_infos.append([max_seq_len, 0, 0, 0]) 101 | 102 | feed_dict = { 103 | self.inputs: input_seqs, 104 | self.input_lengths: np.asarray(input_lengths, dtype=np.int32), 105 | } 106 | 107 | if self.gta: 108 | np_targets = [np.load(mel_filename) for mel_filename in mel_filenames] 109 | target_lengths = [len(np_target) for np_target in np_targets] 110 | 111 | #pad targets according to each GPU max length 112 | target_seqs = None 113 | for i in range(self._hparams.tacotron_num_gpus): 114 | device_target = np_targets[size_per_device*i: size_per_device*(i+1)] 115 | device_target, max_target_len = self._prepare_targets(device_target, self._hparams.outputs_per_step) 116 | target_seqs = np.concatenate((target_seqs, device_target), axis=1) if target_seqs is not None else device_target 117 | split_infos[i][1] = max_target_len #Not really used but setting it in case for future development maybe? 
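# Note: _prepare_targets (defined further down in this file) pads every mel in the device group
# to one common length, rounded up to a multiple of outputs_per_step (the decoder reduction
# factor) and filled with self._target_pad. E.g. with outputs_per_step=2, if the longest mel in
# a group has 113 frames, every mel in that group is padded to 114 frames.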
118 | 119 | feed_dict[self.targets] = target_seqs 120 | assert len(np_targets) == len(texts) 121 | 122 | feed_dict[self.split_infos] = np.asarray(split_infos, dtype=np.int32) 123 | 124 | if self.gta or not hparams.predict_linear: 125 | mels, alignments, stop_tokens = self.session.run([self.mel_outputs, self.alignments, self.stop_token_prediction], feed_dict=feed_dict) 126 | 127 | #Linearize outputs (n_gpus -> 1D) 128 | mels = [mel for gpu_mels in mels for mel in gpu_mels] 129 | alignments = [align for gpu_aligns in alignments for align in gpu_aligns] 130 | stop_tokens = [token for gpu_token in stop_tokens for token in gpu_token] 131 | 132 | if not self.gta: 133 | #Natural batch synthesis 134 | #Get Mel lengths for the entire batch from stop_tokens predictions 135 | target_lengths = self._get_output_lengths(stop_tokens) 136 | 137 | #Take off the batch wise padding 138 | mels = [mel[:target_length, :] for mel, target_length in zip(mels, target_lengths)] 139 | assert len(mels) == len(texts) 140 | 141 | else: 142 | linears, mels, alignments, stop_tokens = self.session.run([self.linear_outputs, self.mel_outputs, self.alignments, self.stop_token_prediction], feed_dict=feed_dict) 143 | 144 | #Linearize outputs (1D arrays) 145 | linears = [linear for gpu_linear in linears for linear in gpu_linear] 146 | mels = [mel for gpu_mels in mels for mel in gpu_mels] 147 | alignments = [align for gpu_aligns in alignments for align in gpu_aligns] 148 | stop_tokens = [token for gpu_token in stop_tokens for token in gpu_token] 149 | 150 | #Natural batch synthesis 151 | #Get Mel/Linear lengths for the entire batch from stop_tokens predictions 152 | target_lengths = self._get_output_lengths(stop_tokens) 153 | 154 | #Take off the batch wise padding 155 | mels = [mel[:target_length, :] for mel, target_length in zip(mels, target_lengths)] 156 | linears = [linear[:target_length, :] for linear, target_length in zip(linears, target_lengths)] 157 | linears = np.clip(linears, T2_output_range[0], T2_output_range[1]) 158 | assert len(mels) == len(linears) == len(texts) 159 | 160 | mels = np.clip(mels, T2_output_range[0], T2_output_range[1]) 161 | 162 | if basenames is None: 163 | #Generate wav and read it 164 | if hparams.GL_on_GPU: 165 | wav = self.session.run(self.GLGPU_mel_outputs, feed_dict={self.GLGPU_mel_inputs: mels[0]}) 166 | wav = audio.inv_preemphasis(wav, hparams.preemphasis, hparams.preemphasize) 167 | else: 168 | wav = audio.inv_mel_spectrogram(mels[0].T, hparams) 169 | audio.save_wav(wav, 'temp.wav', sr=hparams.sample_rate) #Find a better way 170 | 171 | if platform.system() == 'Linux': 172 | #Linux wav reader 173 | os.system('aplay temp.wav') 174 | 175 | elif platform.system() == 'Windows': 176 | #windows wav reader 177 | os.system('start /min mplay32 /play /close temp.wav') 178 | 179 | else: 180 | raise RuntimeError('Your OS type is not supported yet, please add it to "tacotron/synthesizer.py, line-165" and feel free to make a Pull Request ;) Thanks!') 181 | 182 | return 183 | 184 | 185 | saved_mels_paths = [] 186 | speaker_ids = [] 187 | for i, mel in enumerate(mels): 188 | #Get speaker id for global conditioning (only used with GTA generally) 189 | if hparams.gin_channels > 0: 190 | raise RuntimeError('Please set the speaker_id rule in line 99 of tacotron/synthesizer.py to allow for global condition usage later.') 191 | speaker_id = '' #set the rule to determine speaker id. By using the file basename maybe? 
(basenames are inside "basenames" variable) 192 | speaker_ids.append(speaker_id) #finish by appending the speaker id. (allows for different speakers per batch if your model is multispeaker) 193 | else: 194 | speaker_id = '' 195 | speaker_ids.append(speaker_id) 196 | 197 | # Write the spectrogram to disk 198 | # Note: outputs mel-spectrogram files and target ones have same names, just different folders 199 | mel_filename = os.path.join(out_dir, 'mel-{}.npy'.format(basenames[i])) 200 | np.save(mel_filename, mel, allow_pickle=False) 201 | saved_mels_paths.append(mel_filename) 202 | 203 | if log_dir is not None: 204 | #save wav (mel -> wav) 205 | if hparams.GL_on_GPU: 206 | wav = self.session.run(self.GLGPU_mel_outputs, feed_dict={self.GLGPU_mel_inputs: mel}) 207 | wav = audio.inv_preemphasis(wav, hparams.preemphasis, hparams.preemphasize) 208 | else: 209 | wav = audio.inv_mel_spectrogram(mel.T, hparams) 210 | audio.save_wav(wav, os.path.join(log_dir, 'wavs/wav-{}-mel.wav'.format(basenames[i])), sr=hparams.sample_rate) 211 | 212 | #save alignments 213 | plot.plot_alignment(alignments[i], os.path.join(log_dir, 'plots/alignment-{}.png'.format(basenames[i])), 214 | title='{}'.format(texts[i]), split_title=True, max_len=target_lengths[i]) 215 | 216 | #save mel spectrogram plot 217 | plot.plot_spectrogram(mel, os.path.join(log_dir, 'plots/mel-{}.png'.format(basenames[i])), 218 | title='{}'.format(texts[i]), split_title=True) 219 | 220 | if hparams.predict_linear: 221 | #save wav (linear -> wav) 222 | if hparams.GL_on_GPU: 223 | wav = self.session.run(self.GLGPU_lin_outputs, feed_dict={self.GLGPU_lin_inputs: linears[i]}) 224 | wav = audio.inv_preemphasis(wav, hparams.preemphasis, hparams.preemphasize) 225 | else: 226 | wav = audio.inv_linear_spectrogram(linears[i].T, hparams) 227 | audio.save_wav(wav, os.path.join(log_dir, 'wavs/wav-{}-linear.wav'.format(basenames[i])), sr=hparams.sample_rate) 228 | 229 | #save linear spectrogram plot 230 | plot.plot_spectrogram(linears[i], os.path.join(log_dir, 'plots/linear-{}.png'.format(basenames[i])), 231 | title='{}'.format(texts[i]), split_title=True, auto_aspect=True) 232 | 233 | return saved_mels_paths, speaker_ids 234 | 235 | def _round_up(self, x, multiple): 236 | remainder = x % multiple 237 | return x if remainder == 0 else x + multiple - remainder 238 | 239 | def _prepare_inputs(self, inputs): 240 | max_len = max([len(x) for x in inputs]) 241 | return np.stack([self._pad_input(x, max_len) for x in inputs]), max_len 242 | 243 | def _pad_input(self, x, length): 244 | return np.pad(x, (0, length - x.shape[0]), mode='constant', constant_values=self._pad) 245 | 246 | def _prepare_targets(self, targets, alignment): 247 | max_len = max([len(t) for t in targets]) 248 | data_len = self._round_up(max_len, alignment) 249 | return np.stack([self._pad_target(t, data_len) for t in targets]), data_len 250 | 251 | def _pad_target(self, t, length): 252 | return np.pad(t, [(0, length - t.shape[0]), (0, 0)], mode='constant', constant_values=self._target_pad) 253 | 254 | def _get_output_lengths(self, stop_tokens): 255 | #Determine each mel length by the stop token predictions. 
(len = first occurence of 1 in stop_tokens row wise) 256 | output_lengths = [row.index(1) if 1 in row else len(row) for row in np.round(stop_tokens).tolist()] 257 | return output_lengths 258 | -------------------------------------------------------------------------------- /tacotron/train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import subprocess 4 | import time 5 | import traceback 6 | from datetime import datetime 7 | 8 | import infolog 9 | import numpy as np 10 | import tensorflow as tf 11 | from datasets import audio 12 | from hparams import hparams_debug_string 13 | from tacotron.feeder import Feeder 14 | from tacotron.models import create_model 15 | from tacotron.utils import ValueWindow, plot 16 | from tacotron.utils.text import sequence_to_text 17 | from tacotron.utils.symbols import symbols 18 | from tqdm import tqdm 19 | 20 | log = infolog.log 21 | 22 | 23 | def time_string(): 24 | return datetime.now().strftime('%Y-%m-%d %H:%M') 25 | 26 | def add_embedding_stats(summary_writer, embedding_names, paths_to_meta, checkpoint_path): 27 | #Create tensorboard projector 28 | config = tf.contrib.tensorboard.plugins.projector.ProjectorConfig() 29 | config.model_checkpoint_path = checkpoint_path 30 | 31 | for embedding_name, path_to_meta in zip(embedding_names, paths_to_meta): 32 | #Initialize config 33 | embedding = config.embeddings.add() 34 | #Specifiy the embedding variable and the metadata 35 | embedding.tensor_name = embedding_name 36 | embedding.metadata_path = path_to_meta 37 | 38 | #Project the embeddings to space dimensions for visualization 39 | tf.contrib.tensorboard.plugins.projector.visualize_embeddings(summary_writer, config) 40 | 41 | def add_train_stats(model, hparams): 42 | with tf.variable_scope('stats') as scope: 43 | for i in range(hparams.tacotron_num_gpus): 44 | tf.summary.histogram('mel_outputs %d' % i, model.tower_mel_outputs[i]) 45 | tf.summary.histogram('mel_targets %d' % i, model.tower_mel_targets[i]) 46 | tf.summary.scalar('before_loss', model.before_loss) 47 | tf.summary.scalar('after_loss', model.after_loss) 48 | 49 | if hparams.predict_linear: 50 | tf.summary.scalar('linear_loss', model.linear_loss) 51 | for i in range(hparams.tacotron_num_gpus): 52 | tf.summary.histogram('linear_outputs %d' % i, model.tower_linear_outputs[i]) 53 | tf.summary.histogram('linear_targets %d' % i, model.tower_linear_targets[i]) 54 | 55 | tf.summary.scalar('regularization_loss', model.regularization_loss) 56 | tf.summary.scalar('stop_token_loss', model.stop_token_loss) 57 | tf.summary.scalar('loss', model.loss) 58 | tf.summary.scalar('learning_rate', model.learning_rate) #Control learning rate decay speed 59 | if hparams.tacotron_teacher_forcing_mode == 'scheduled': 60 | tf.summary.scalar('teacher_forcing_ratio', model.ratio) #Control teacher forcing ratio decay when mode = 'scheduled' 61 | gradient_norms = [tf.norm(grad) for grad in model.gradients] 62 | tf.summary.histogram('gradient_norm', gradient_norms) 63 | tf.summary.scalar('max_gradient_norm', tf.reduce_max(gradient_norms)) #visualize gradients (in case of explosion) 64 | return tf.summary.merge_all() 65 | 66 | def add_eval_stats(summary_writer, step, linear_loss, before_loss, after_loss, stop_token_loss, loss): 67 | values = [ 68 | tf.Summary.Value(tag='Tacotron_eval_model/eval_stats/eval_before_loss', simple_value=before_loss), 69 | tf.Summary.Value(tag='Tacotron_eval_model/eval_stats/eval_after_loss', simple_value=after_loss), 70 | 
tf.Summary.Value(tag='Tacotron_eval_model/eval_stats/stop_token_loss', simple_value=stop_token_loss), 71 | tf.Summary.Value(tag='Tacotron_eval_model/eval_stats/eval_loss', simple_value=loss), 72 | ] 73 | if linear_loss is not None: 74 | values.append(tf.Summary.Value(tag='Tacotron_eval_model/eval_stats/eval_linear_loss', simple_value=linear_loss)) 75 | test_summary = tf.Summary(value=values) 76 | summary_writer.add_summary(test_summary, step) 77 | 78 | def model_train_mode(args, feeder, hparams, global_step): 79 | with tf.variable_scope('Tacotron_model', reuse=tf.AUTO_REUSE) as scope: 80 | model_name = None 81 | if args.model == 'Tacotron-2': 82 | model_name = 'Tacotron' 83 | model = create_model(model_name or args.model, hparams) 84 | if hparams.predict_linear: 85 | model.initialize(feeder.inputs, feeder.input_lengths, feeder.mel_targets, feeder.token_targets, linear_targets=feeder.linear_targets, 86 | targets_lengths=feeder.targets_lengths, global_step=global_step, 87 | is_training=True, split_infos=feeder.split_infos) 88 | else: 89 | model.initialize(feeder.inputs, feeder.input_lengths, feeder.mel_targets, feeder.token_targets, 90 | targets_lengths=feeder.targets_lengths, global_step=global_step, 91 | is_training=True, split_infos=feeder.split_infos) 92 | model.add_loss() 93 | model.add_optimizer(global_step) 94 | stats = add_train_stats(model, hparams) 95 | return model, stats 96 | 97 | def model_test_mode(args, feeder, hparams, global_step): 98 | with tf.variable_scope('Tacotron_model', reuse=tf.AUTO_REUSE) as scope: 99 | model_name = None 100 | if args.model == 'Tacotron-2': 101 | model_name = 'Tacotron' 102 | model = create_model(model_name or args.model, hparams) 103 | if hparams.predict_linear: 104 | model.initialize(feeder.eval_inputs, feeder.eval_input_lengths, feeder.eval_mel_targets, feeder.eval_token_targets, 105 | linear_targets=feeder.eval_linear_targets, targets_lengths=feeder.eval_targets_lengths, global_step=global_step, 106 | is_training=False, is_evaluating=True, split_infos=feeder.eval_split_infos) 107 | else: 108 | model.initialize(feeder.eval_inputs, feeder.eval_input_lengths, feeder.eval_mel_targets, feeder.eval_token_targets, 109 | targets_lengths=feeder.eval_targets_lengths, global_step=global_step, is_training=False, is_evaluating=True, 110 | split_infos=feeder.eval_split_infos) 111 | model.add_loss() 112 | return model 113 | 114 | def train(log_dir, args, hparams): 115 | save_dir = os.path.join(log_dir, 'taco_pretrained') 116 | plot_dir = os.path.join(log_dir, 'plots') 117 | wav_dir = os.path.join(log_dir, 'wavs') 118 | mel_dir = os.path.join(log_dir, 'mel-spectrograms') 119 | eval_dir = os.path.join(log_dir, 'eval-dir') 120 | eval_plot_dir = os.path.join(eval_dir, 'plots') 121 | eval_wav_dir = os.path.join(eval_dir, 'wavs') 122 | tensorboard_dir = os.path.join(log_dir, 'tacotron_events') 123 | meta_folder = os.path.join(log_dir, 'metas') 124 | os.makedirs(save_dir, exist_ok=True) 125 | os.makedirs(plot_dir, exist_ok=True) 126 | os.makedirs(wav_dir, exist_ok=True) 127 | os.makedirs(mel_dir, exist_ok=True) 128 | os.makedirs(eval_dir, exist_ok=True) 129 | os.makedirs(eval_plot_dir, exist_ok=True) 130 | os.makedirs(eval_wav_dir, exist_ok=True) 131 | os.makedirs(tensorboard_dir, exist_ok=True) 132 | os.makedirs(meta_folder, exist_ok=True) 133 | 134 | checkpoint_path = os.path.join(save_dir, 'tacotron_model.ckpt') 135 | input_path = os.path.join(args.base_dir, args.tacotron_input) 136 | 137 | if hparams.predict_linear: 138 | linear_dir = os.path.join(log_dir, 
'linear-spectrograms') 139 | os.makedirs(linear_dir, exist_ok=True) 140 | 141 | log('Checkpoint path: {}'.format(checkpoint_path)) 142 | log('Loading training data from: {}'.format(input_path)) 143 | log('Using model: {}'.format(args.model)) 144 | log(hparams_debug_string()) 145 | 146 | #Start by setting a seed for repeatability 147 | tf.set_random_seed(hparams.tacotron_random_seed) 148 | 149 | #Set up data feeder 150 | coord = tf.train.Coordinator() 151 | with tf.variable_scope('datafeeder') as scope: 152 | feeder = Feeder(coord, input_path, hparams) 153 | 154 | #Set up model: 155 | global_step = tf.Variable(0, name='global_step', trainable=False) 156 | model, stats = model_train_mode(args, feeder, hparams, global_step) 157 | eval_model = model_test_mode(args, feeder, hparams, global_step) 158 | 159 | #Embeddings metadata 160 | char_embedding_meta = os.path.join(meta_folder, 'CharacterEmbeddings.tsv') 161 | if not os.path.isfile(char_embedding_meta): 162 | with open(char_embedding_meta, 'w', encoding='utf-8') as f: 163 | for symbol in symbols: 164 | if symbol == ' ': 165 | symbol = '\\s' #For visual purposes, swap space with \s 166 | 167 | f.write('{}\n'.format(symbol)) 168 | 169 | char_embedding_meta = char_embedding_meta.replace(log_dir, '..') 170 | 171 | #Potential Griffin-Lim GPU setup 172 | if hparams.GL_on_GPU: 173 | GLGPU_mel_inputs = tf.placeholder(tf.float32, (None, hparams.num_mels), name='GLGPU_mel_inputs') 174 | GLGPU_lin_inputs = tf.placeholder(tf.float32, (None, hparams.num_freq), name='GLGPU_lin_inputs') 175 | 176 | GLGPU_mel_outputs = audio.inv_mel_spectrogram_tensorflow(GLGPU_mel_inputs, hparams) 177 | GLGPU_lin_outputs = audio.inv_linear_spectrogram_tensorflow(GLGPU_lin_inputs, hparams) 178 | 179 | #Book keeping 180 | step = 0 181 | time_window = ValueWindow(100) 182 | loss_window = ValueWindow(100) 183 | saver = tf.train.Saver(max_to_keep=20) 184 | 185 | log('Tacotron training set to a maximum of {} steps'.format(args.tacotron_train_steps)) 186 | 187 | #Memory allocation on the GPU as needed 188 | config = tf.ConfigProto() 189 | config.gpu_options.allow_growth = True 190 | config.allow_soft_placement = True 191 | 192 | #Train 193 | with tf.Session(config=config) as sess: 194 | try: 195 | summary_writer = tf.summary.FileWriter(tensorboard_dir, sess.graph) 196 | 197 | sess.run(tf.global_variables_initializer()) 198 | 199 | #saved model restoring 200 | if args.restore: 201 | # Restore saved model if the user requested it, default = True 202 | try: 203 | checkpoint_state = tf.train.get_checkpoint_state(save_dir) 204 | 205 | if (checkpoint_state and checkpoint_state.model_checkpoint_path): 206 | log('Loading checkpoint {}'.format(checkpoint_state.model_checkpoint_path), slack=True) 207 | saver.restore(sess, checkpoint_state.model_checkpoint_path) 208 | 209 | else: 210 | log('No model to load at {}'.format(save_dir), slack=True) 211 | saver.save(sess, checkpoint_path, global_step=global_step) 212 | 213 | except tf.errors.OutOfRangeError as e: 214 | log('Cannot restore checkpoint: {}'.format(e), slack=True) 215 | else: 216 | log('Starting new training!', slack=True) 217 | saver.save(sess, checkpoint_path, global_step=global_step) 218 | 219 | #initializing feeder 220 | feeder.start_threads(sess) 221 | 222 | #Training loop 223 | while not coord.should_stop() and step < args.tacotron_train_steps: 224 | start_time = time.time() 225 | step, loss, opt = sess.run([global_step, model.loss, model.optimize]) 226 | time_window.append(time.time() - start_time) 227 | 
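# time_window and loss_window are ValueWindow instances (see tacotron/utils/__init__.py): they
# keep only the most recent 100 values, so the averages logged below are running means over the
# last 100 steps rather than over the whole run.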
loss_window.append(loss) 228 | message = 'Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]'.format( 229 | step, time_window.average, loss, loss_window.average) 230 | log(message, end='\r', slack=(step % args.checkpoint_interval == 0)) 231 | 232 | if np.isnan(loss) or loss > 100.: 233 | log('Loss exploded to {:.5f} at step {}'.format(loss, step)) 234 | raise Exception('Loss exploded') 235 | 236 | if step % args.summary_interval == 0: 237 | log('\nWriting summary at step {}'.format(step)) 238 | summary_writer.add_summary(sess.run(stats), step) 239 | 240 | if step % args.eval_interval == 0: 241 | #Run eval and save eval stats 242 | log('\nRunning evaluation at step {}'.format(step)) 243 | 244 | eval_losses = [] 245 | before_losses = [] 246 | after_losses = [] 247 | stop_token_losses = [] 248 | linear_losses = [] 249 | linear_loss = None 250 | 251 | if hparams.predict_linear: 252 | for i in tqdm(range(feeder.test_steps)): 253 | eloss, before_loss, after_loss, stop_token_loss, linear_loss, mel_p, mel_t, t_len, align, lin_p, lin_t = sess.run([ 254 | eval_model.tower_loss[0], eval_model.tower_before_loss[0], eval_model.tower_after_loss[0], 255 | eval_model.tower_stop_token_loss[0], eval_model.tower_linear_loss[0], eval_model.tower_mel_outputs[0][0], 256 | eval_model.tower_mel_targets[0][0], eval_model.tower_targets_lengths[0][0], 257 | eval_model.tower_alignments[0][0], eval_model.tower_linear_outputs[0][0], 258 | eval_model.tower_linear_targets[0][0], 259 | ]) 260 | eval_losses.append(eloss) 261 | before_losses.append(before_loss) 262 | after_losses.append(after_loss) 263 | stop_token_losses.append(stop_token_loss) 264 | linear_losses.append(linear_loss) 265 | linear_loss = sum(linear_losses) / len(linear_losses) 266 | 267 | if hparams.GL_on_GPU: 268 | wav = sess.run(GLGPU_lin_outputs, feed_dict={GLGPU_lin_inputs: lin_p}) 269 | wav = audio.inv_preemphasis(wav, hparams.preemphasis, hparams.preemphasize) 270 | else: 271 | wav = audio.inv_linear_spectrogram(lin_p.T, hparams) 272 | audio.save_wav(wav, os.path.join(eval_wav_dir, 'step-{}-eval-wave-from-linear.wav'.format(step)), sr=hparams.sample_rate) 273 | 274 | else: 275 | for i in tqdm(range(feeder.test_steps)): 276 | eloss, before_loss, after_loss, stop_token_loss, mel_p, mel_t, t_len, align = sess.run([ 277 | eval_model.tower_loss[0], eval_model.tower_before_loss[0], eval_model.tower_after_loss[0], 278 | eval_model.tower_stop_token_loss[0], eval_model.tower_mel_outputs[0][0], eval_model.tower_mel_targets[0][0], 279 | eval_model.tower_targets_lengths[0][0], eval_model.tower_alignments[0][0] 280 | ]) 281 | eval_losses.append(eloss) 282 | before_losses.append(before_loss) 283 | after_losses.append(after_loss) 284 | stop_token_losses.append(stop_token_loss) 285 | 286 | eval_loss = sum(eval_losses) / len(eval_losses) 287 | before_loss = sum(before_losses) / len(before_losses) 288 | after_loss = sum(after_losses) / len(after_losses) 289 | stop_token_loss = sum(stop_token_losses) / len(stop_token_losses) 290 | 291 | log('Saving eval log to {}..'.format(eval_dir)) 292 | #Save some log to monitor model improvement on same unseen sequence 293 | if hparams.GL_on_GPU: 294 | wav = sess.run(GLGPU_mel_outputs, feed_dict={GLGPU_mel_inputs: mel_p}) 295 | wav = audio.inv_preemphasis(wav, hparams.preemphasis, hparams.preemphasize) 296 | else: 297 | wav = audio.inv_mel_spectrogram(mel_p.T, hparams) 298 | audio.save_wav(wav, os.path.join(eval_wav_dir, 'step-{}-eval-wave-from-mel.wav'.format(step)), sr=hparams.sample_rate) 299 | 300 | 
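# Save the eval alignment plot and the mel (plus linear, when predict_linear is set)
# spectrogram comparison plots for this evaluation step.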
plot.plot_alignment(align, os.path.join(eval_plot_dir, 'step-{}-eval-align.png'.format(step)), 301 | title='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, eval_loss), 302 | max_len=t_len // hparams.outputs_per_step) 303 | plot.plot_spectrogram(mel_p, os.path.join(eval_plot_dir, 'step-{}-eval-mel-spectrogram.png'.format(step)), 304 | title='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, eval_loss), target_spectrogram=mel_t, 305 | max_len=t_len) 306 | 307 | if hparams.predict_linear: 308 | plot.plot_spectrogram(lin_p, os.path.join(eval_plot_dir, 'step-{}-eval-linear-spectrogram.png'.format(step)), 309 | title='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, eval_loss), target_spectrogram=lin_t, 310 | max_len=t_len, auto_aspect=True) 311 | 312 | log('Eval loss for global step {}: {:.3f}'.format(step, eval_loss)) 313 | log('Writing eval summary!') 314 | add_eval_stats(summary_writer, step, linear_loss, before_loss, after_loss, stop_token_loss, eval_loss) 315 | 316 | 317 | if step % args.checkpoint_interval == 0 or step == args.tacotron_train_steps or step == 300: 318 | #Save model and current global step 319 | saver.save(sess, checkpoint_path, global_step=global_step) 320 | 321 | log('\nSaving alignment, Mel-Spectrograms and griffin-lim inverted waveform..') 322 | if hparams.predict_linear: 323 | input_seq, mel_prediction, linear_prediction, alignment, target, target_length, linear_target = sess.run([ 324 | model.tower_inputs[0][0], 325 | model.tower_mel_outputs[0][0], 326 | model.tower_linear_outputs[0][0], 327 | model.tower_alignments[0][0], 328 | model.tower_mel_targets[0][0], 329 | model.tower_targets_lengths[0][0], 330 | model.tower_linear_targets[0][0], 331 | ]) 332 | 333 | #save predicted linear spectrogram to disk (debug) 334 | linear_filename = 'linear-prediction-step-{}.npy'.format(step) 335 | np.save(os.path.join(linear_dir, linear_filename), linear_prediction.T, allow_pickle=False) 336 | 337 | #save griffin lim inverted wav for debug (linear -> wav) 338 | if hparams.GL_on_GPU: 339 | wav = sess.run(GLGPU_lin_outputs, feed_dict={GLGPU_lin_inputs: linear_prediction}) 340 | wav = audio.inv_preemphasis(wav, hparams.preemphasis, hparams.preemphasize) 341 | else: 342 | wav = audio.inv_linear_spectrogram(linear_prediction.T, hparams) 343 | audio.save_wav(wav, os.path.join(wav_dir, 'step-{}-wave-from-linear.wav'.format(step)), sr=hparams.sample_rate) 344 | 345 | #Save real and predicted linear-spectrogram plot to disk (control purposes) 346 | plot.plot_spectrogram(linear_prediction, os.path.join(plot_dir, 'step-{}-linear-spectrogram.png'.format(step)), 347 | title='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, loss), target_spectrogram=linear_target, 348 | max_len=target_length, auto_aspect=True) 349 | 350 | else: 351 | input_seq, mel_prediction, alignment, target, target_length = sess.run([ 352 | model.tower_inputs[0][0], 353 | model.tower_mel_outputs[0][0], 354 | model.tower_alignments[0][0], 355 | model.tower_mel_targets[0][0], 356 | model.tower_targets_lengths[0][0], 357 | ]) 358 | 359 | #save predicted mel spectrogram to disk (debug) 360 | mel_filename = 'mel-prediction-step-{}.npy'.format(step) 361 | np.save(os.path.join(mel_dir, mel_filename), mel_prediction.T, allow_pickle=False) 362 | 363 | #save griffin lim inverted wav for debug (mel -> wav) 364 | if hparams.GL_on_GPU: 365 | wav = sess.run(GLGPU_mel_outputs, feed_dict={GLGPU_mel_inputs: mel_prediction}) 366 | wav = 
audio.inv_preemphasis(wav, hparams.preemphasis, hparams.preemphasize) 367 | else: 368 | wav = audio.inv_mel_spectrogram(mel_prediction.T, hparams) 369 | audio.save_wav(wav, os.path.join(wav_dir, 'step-{}-wave-from-mel.wav'.format(step)), sr=hparams.sample_rate) 370 | 371 | #save alignment plot to disk (control purposes) 372 | plot.plot_alignment(alignment, os.path.join(plot_dir, 'step-{}-align.png'.format(step)), 373 | title='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, loss), 374 | max_len=target_length // hparams.outputs_per_step) 375 | #save real and predicted mel-spectrogram plot to disk (control purposes) 376 | plot.plot_spectrogram(mel_prediction, os.path.join(plot_dir, 'step-{}-mel-spectrogram.png'.format(step)), 377 | title='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, loss), target_spectrogram=target, 378 | max_len=target_length) 379 | log('Input at step {}: {}'.format(step, sequence_to_text(input_seq))) 380 | 381 | if step % args.embedding_interval == 0 or step == args.tacotron_train_steps or step == 1: 382 | #Get current checkpoint state 383 | checkpoint_state = tf.train.get_checkpoint_state(save_dir) 384 | 385 | #Update Projector 386 | log('\nSaving Model Character Embeddings visualization..') 387 | add_embedding_stats(summary_writer, [model.embedding_table.name], [char_embedding_meta], checkpoint_state.model_checkpoint_path) 388 | log('Tacotron Character embeddings have been updated on tensorboard!') 389 | 390 | log('Tacotron training complete after {} global steps!'.format(args.tacotron_train_steps), slack=True) 391 | return save_dir 392 | 393 | except Exception as e: 394 | log('Exiting due to exception: {}'.format(e), slack=True) 395 | traceback.print_exc() 396 | coord.request_stop(e) 397 | 398 | def tacotron_train(args, log_dir, hparams): 399 | return train(log_dir, args, hparams) 400 | -------------------------------------------------------------------------------- /tacotron/utils/__init__.py: -------------------------------------------------------------------------------- 1 | class ValueWindow(): 2 | def __init__(self, window_size=100): 3 | self._window_size = window_size 4 | self._values = [] 5 | 6 | def append(self, x): 7 | self._values = self._values[-(self._window_size - 1):] + [x] 8 | 9 | @property 10 | def sum(self): 11 | return sum(self._values) 12 | 13 | @property 14 | def count(self): 15 | return len(self._values) 16 | 17 | @property 18 | def average(self): 19 | return self.sum / max(1, self.count) 20 | 21 | def reset(self): 22 | self._values = [] 23 | -------------------------------------------------------------------------------- /tacotron/utils/cleaners.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Cleaners are transformations that run over the input text at both training and eval time. 3 | 4 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" 5 | hyperparameter. Some cleaners are English-specific. You'll typically want to use: 6 | 1. "english_cleaners" for English text 7 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using 8 | the Unidecode library (https://pypi.python.org/pypi/Unidecode) 9 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update 10 | the symbols in symbols.py to match your data). 
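At training/synthesis time the "cleaners" hyperparameter is split on commas and each named
function in this module is applied in order (see _clean_text in tacotron/utils/text.py),
roughly:

    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    for name in cleaner_names:
        text = getattr(cleaners, name)(text)  # e.g. english_cleaners(text)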
11 | ''' 12 | 13 | import re 14 | 15 | from unidecode import unidecode 16 | 17 | from .numbers import normalize_numbers 18 | 19 | # Regular expression matching whitespace: 20 | _whitespace_re = re.compile(r'\s+') 21 | 22 | # List of (regular expression, replacement) pairs for abbreviations: 23 | _abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [ 24 | ('mrs', 'misess'), 25 | ('mr', 'mister'), 26 | ('dr', 'doctor'), 27 | ('st', 'saint'), 28 | ('co', 'company'), 29 | ('jr', 'junior'), 30 | ('maj', 'major'), 31 | ('gen', 'general'), 32 | ('drs', 'doctors'), 33 | ('rev', 'reverend'), 34 | ('lt', 'lieutenant'), 35 | ('hon', 'honorable'), 36 | ('sgt', 'sergeant'), 37 | ('capt', 'captain'), 38 | ('esq', 'esquire'), 39 | ('ltd', 'limited'), 40 | ('col', 'colonel'), 41 | ('ft', 'fort'), 42 | ]] 43 | 44 | 45 | def expand_abbreviations(text): 46 | for regex, replacement in _abbreviations: 47 | text = re.sub(regex, replacement, text) 48 | return text 49 | 50 | 51 | def expand_numbers(text): 52 | return normalize_numbers(text) 53 | 54 | 55 | def lowercase(text): 56 | '''lowercase input tokens. 57 | ''' 58 | return text.lower() 59 | 60 | 61 | def collapse_whitespace(text): 62 | return re.sub(_whitespace_re, ' ', text) 63 | 64 | 65 | def convert_to_ascii(text): 66 | return unidecode(text) 67 | 68 | 69 | def basic_cleaners(text): 70 | '''Basic pipeline that lowercases and collapses whitespace without transliteration.''' 71 | text = lowercase(text) 72 | text = collapse_whitespace(text) 73 | return text 74 | 75 | 76 | def transliteration_cleaners(text): 77 | '''Pipeline for non-English text that transliterates to ASCII.''' 78 | text = convert_to_ascii(text) 79 | text = lowercase(text) 80 | text = collapse_whitespace(text) 81 | return text 82 | 83 | 84 | def english_cleaners(text): 85 | '''Pipeline for English text, including number and abbreviation expansion.''' 86 | text = convert_to_ascii(text) 87 | # text = lowercase(text) 88 | text = expand_numbers(text) 89 | text = expand_abbreviations(text) 90 | text = collapse_whitespace(text) 91 | return text 92 | -------------------------------------------------------------------------------- /tacotron/utils/cmudict.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | valid_symbols = [ 4 | 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2', 5 | 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2', 6 | 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY', 7 | 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1', 8 | 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 9 | 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 10 | 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH' 11 | ] 12 | 13 | _valid_symbol_set = set(valid_symbols) 14 | 15 | 16 | class CMUDict: 17 | '''Thin wrapper around CMUDict data. 
http://www.speech.cs.cmu.edu/cgi-bin/cmudict''' 18 | def __init__(self, file_or_path, keep_ambiguous=True): 19 | if isinstance(file_or_path, str): 20 | with open(file_or_path, encoding='latin-1') as f: 21 | entries = _parse_cmudict(f) 22 | else: 23 | entries = _parse_cmudict(file_or_path) 24 | if not keep_ambiguous: 25 | entries = {word: pron for word, pron in entries.items() if len(pron) == 1} 26 | self._entries = entries 27 | 28 | 29 | def __len__(self): 30 | return len(self._entries) 31 | 32 | 33 | def lookup(self, word): 34 | '''Returns list of ARPAbet pronunciations of the given word.''' 35 | return self._entries.get(word.upper()) 36 | 37 | 38 | 39 | _alt_re = re.compile(r'\([0-9]+\)') 40 | 41 | 42 | def _parse_cmudict(file): 43 | cmudict = {} 44 | for line in file: 45 | if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"): 46 | parts = line.split(' ') 47 | word = re.sub(_alt_re, '', parts[0]) 48 | pronunciation = _get_pronunciation(parts[1]) 49 | if pronunciation: 50 | if word in cmudict: 51 | cmudict[word].append(pronunciation) 52 | else: 53 | cmudict[word] = [pronunciation] 54 | return cmudict 55 | 56 | 57 | def _get_pronunciation(s): 58 | parts = s.strip().split(' ') 59 | for part in parts: 60 | if part not in _valid_symbol_set: 61 | return None 62 | return ' '.join(parts) 63 | -------------------------------------------------------------------------------- /tacotron/utils/numbers.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import inflect 4 | 5 | _inflect = inflect.engine() 6 | _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') 7 | _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') 8 | _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') 9 | _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') 10 | _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') 11 | _number_re = re.compile(r'[0-9]+') 12 | 13 | 14 | def _remove_commas(m): 15 | return m.group(1).replace(',', '') 16 | 17 | 18 | def _expand_decimal_point(m): 19 | return m.group(1).replace('.', ' point ') 20 | 21 | 22 | def _expand_dollars(m): 23 | match = m.group(1) 24 | parts = match.split('.') 25 | if len(parts) > 2: 26 | return match + ' dollars' # Unexpected format 27 | dollars = int(parts[0]) if parts[0] else 0 28 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 29 | if dollars and cents: 30 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 31 | cent_unit = 'cent' if cents == 1 else 'cents' 32 | return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) 33 | elif dollars: 34 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 35 | return '%s %s' % (dollars, dollar_unit) 36 | elif cents: 37 | cent_unit = 'cent' if cents == 1 else 'cents' 38 | return '%s %s' % (cents, cent_unit) 39 | else: 40 | return 'zero dollars' 41 | 42 | 43 | def _expand_ordinal(m): 44 | return _inflect.number_to_words(m.group(0)) 45 | 46 | 47 | def _expand_number(m): 48 | num = int(m.group(0)) 49 | if num > 1000 and num < 3000: 50 | if num == 2000: 51 | return 'two thousand' 52 | elif num > 2000 and num < 2010: 53 | return 'two thousand ' + _inflect.number_to_words(num % 100) 54 | elif num % 100 == 0: 55 | return _inflect.number_to_words(num // 100) + ' hundred' 56 | else: 57 | return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') 58 | else: 59 | return _inflect.number_to_words(num, andword='') 60 | 61 | 62 | def normalize_numbers(text): 63 | text = re.sub(_comma_number_re, _remove_commas, text) 64 | text = 
re.sub(_pounds_re, r'\1 pounds', text) 65 | text = re.sub(_dollars_re, _expand_dollars, text) 66 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 67 | text = re.sub(_ordinal_re, _expand_ordinal, text) 68 | text = re.sub(_number_re, _expand_number, text) 69 | return text 70 | -------------------------------------------------------------------------------- /tacotron/utils/plot.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | matplotlib.use('Agg') 3 | import matplotlib.pyplot as plt 4 | 5 | import numpy as np 6 | 7 | 8 | def split_title_line(title_text, max_words=5): 9 | """ 10 | A function that splits any string based on specific character 11 | (returning it with the string), with maximum number of words on it 12 | """ 13 | seq = title_text.split() 14 | return '\n'.join([' '.join(seq[i:i + max_words]) for i in range(0, len(seq), max_words)]) 15 | 16 | def plot_alignment(alignment, path, title=None, split_title=False, max_len=None): 17 | if max_len is not None: 18 | alignment = alignment[:, :max_len] 19 | 20 | fig = plt.figure(figsize=(8, 6)) 21 | ax = fig.add_subplot(111) 22 | 23 | im = ax.imshow( 24 | alignment, 25 | aspect='auto', 26 | origin='lower', 27 | interpolation='none') 28 | fig.colorbar(im, ax=ax) 29 | xlabel = 'Decoder timestep' 30 | 31 | if split_title: 32 | title = split_title_line(title) 33 | 34 | plt.xlabel(xlabel) 35 | plt.title(title) 36 | plt.ylabel('Encoder timestep') 37 | plt.tight_layout() 38 | plt.savefig(path, format='png') 39 | plt.close() 40 | 41 | 42 | def plot_spectrogram(pred_spectrogram, path, title=None, split_title=False, target_spectrogram=None, max_len=None, auto_aspect=False): 43 | if max_len is not None: 44 | target_spectrogram = target_spectrogram[:max_len] 45 | pred_spectrogram = pred_spectrogram[:max_len] 46 | 47 | if split_title: 48 | title = split_title_line(title) 49 | 50 | fig = plt.figure(figsize=(10, 8)) 51 | # Set common labels 52 | fig.text(0.5, 0.18, title, horizontalalignment='center', fontsize=16) 53 | 54 | #target spectrogram subplot 55 | if target_spectrogram is not None: 56 | ax1 = fig.add_subplot(311) 57 | ax2 = fig.add_subplot(312) 58 | 59 | if auto_aspect: 60 | im = ax1.imshow(np.rot90(target_spectrogram), aspect='auto', interpolation='none') 61 | else: 62 | im = ax1.imshow(np.rot90(target_spectrogram), interpolation='none') 63 | ax1.set_title('Target Mel-Spectrogram') 64 | fig.colorbar(mappable=im, shrink=0.65, orientation='horizontal', ax=ax1) 65 | ax2.set_title('Predicted Mel-Spectrogram') 66 | else: 67 | ax2 = fig.add_subplot(211) 68 | 69 | if auto_aspect: 70 | im = ax2.imshow(np.rot90(pred_spectrogram), aspect='auto', interpolation='none') 71 | else: 72 | im = ax2.imshow(np.rot90(pred_spectrogram), interpolation='none') 73 | fig.colorbar(mappable=im, shrink=0.65, orientation='horizontal', ax=ax2) 74 | 75 | plt.tight_layout() 76 | plt.savefig(path, format='png') 77 | plt.close() 78 | -------------------------------------------------------------------------------- /tacotron/utils/symbols.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Defines the set of symbols used in text input to the model. 3 | 4 | The default is a set of ASCII characters that works well for English or text that has been run 5 | through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. 6 | ''' 7 | from . 
import cmudict 8 | 9 | _pad = '_' 10 | _eos = '~' 11 | _characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'\"(),-.:;? ' 12 | 13 | # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): 14 | #_arpabet = ['@' + s for s in cmudict.valid_symbols] 15 | 16 | # Export all symbols: 17 | symbols = [_pad, _eos] + list(_characters) #+ _arpabet 18 | -------------------------------------------------------------------------------- /tacotron/utils/text.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from . import cleaners 4 | from .symbols import symbols 5 | 6 | # Mappings from symbol to numeric ID and vice versa: 7 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 8 | _id_to_symbol = {i: s for i, s in enumerate(symbols)} 9 | 10 | # Regular expression matching text enclosed in curly braces: 11 | _curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)') 12 | 13 | 14 | def text_to_sequence(text, cleaner_names): 15 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 16 | 17 | The text can optionally have ARPAbet sequences enclosed in curly braces embedded 18 | in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street." 19 | 20 | Args: 21 | text: string to convert to a sequence 22 | cleaner_names: names of the cleaner functions to run the text through 23 | 24 | Returns: 25 | List of integers corresponding to the symbols in the text 26 | ''' 27 | sequence = [] 28 | 29 | # Check for curly braces and treat their contents as ARPAbet: 30 | while len(text): 31 | m = _curly_re.match(text) 32 | if not m: 33 | sequence += _symbols_to_sequence(_clean_text(text, cleaner_names)) 34 | break 35 | sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names)) 36 | sequence += _arpabet_to_sequence(m.group(2)) 37 | text = m.group(3) 38 | 39 | # Append EOS token 40 | sequence.append(_symbol_to_id['~']) 41 | return sequence 42 | 43 | 44 | def sequence_to_text(sequence): 45 | '''Converts a sequence of IDs back to a string''' 46 | result = '' 47 | for symbol_id in sequence: 48 | if symbol_id in _id_to_symbol: 49 | s = _id_to_symbol[symbol_id] 50 | # Enclose ARPAbet back in curly braces: 51 | if len(s) > 1 and s[0] == '@': 52 | s = '{%s}' % s[1:] 53 | result += s 54 | return result.replace('}{', ' ') 55 | 56 | 57 | def _clean_text(text, cleaner_names): 58 | for name in cleaner_names: 59 | cleaner = getattr(cleaners, name) 60 | if not cleaner: 61 | raise Exception('Unknown cleaner: %s' % name) 62 | text = cleaner(text) 63 | return text 64 | 65 | 66 | def _symbols_to_sequence(symbols): 67 | return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)] 68 | 69 | 70 | def _arpabet_to_sequence(text): 71 | return _symbols_to_sequence(['@' + s for s in text.split()]) 72 | 73 | 74 | def _should_keep_symbol(s): 75 | return s in _symbol_to_id and s is not '_' and s is not '~' 76 | -------------------------------------------------------------------------------- /test_wavenet_feeder.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import argparse 4 | from hparams import hparams 5 | from datasets import audio 6 | from tqdm import tqdm 7 | 8 | 9 | 10 | def _limit_time(hparams): 11 | '''Limit time resolution to save GPU memory. 
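The limit is returned in samples: max_time_sec * sample_rate when max_time_sec is set
(e.g. a 10 second cap at 22050 Hz gives 220500 samples), otherwise max_time_steps, otherwise None.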
12 | ''' 13 | if hparams.max_time_sec is not None: 14 | return int(hparams.max_time_sec * hparams.sample_rate) 15 | elif hparams.max_time_steps is not None: 16 | return hparams.max_time_steps 17 | else: 18 | return None 19 | 20 | 21 | def get_groups(args, hparams, meta, local_condition): 22 | if hparams.train_with_GTA: 23 | mel_file = meta[2] 24 | else: 25 | mel_file = meta[1] 26 | audio_file = meta[0] 27 | 28 | input_data = np.load(os.path.join(args.base_dir, audio_file)) 29 | 30 | if local_condition: 31 | local_condition_features = np.load(os.path.join(args.base_dir, mel_file)) 32 | else: 33 | local_condition_features = None 34 | 35 | return (input_data, local_condition_features, None, len(input_data)) 36 | 37 | def _adjust_time_resolution(hparams, batch, local_condition, max_time_steps): 38 | '''Adjust time resolution between audio and local condition 39 | ''' 40 | if local_condition: 41 | new_batch = [] 42 | for b in batch: 43 | x, c, g, l = b 44 | _assert_ready_for_upsample(hparams, x, c) 45 | if max_time_steps is not None: 46 | max_steps = _ensure_divisible(max_time_steps, audio.get_hop_size(hparams), True) 47 | if len(x) > max_time_steps: 48 | max_time_frames = max_steps // audio.get_hop_size(hparams) 49 | start = np.random.randint(0, len(c) - max_time_frames) 50 | time_start = start * audio.get_hop_size(hparams) 51 | x = x[time_start: time_start + max_time_frames * audio.get_hop_size(hparams)] 52 | c = c[start: start + max_time_frames, :] 53 | _assert_ready_for_upsample(hparams, x, c) 54 | 55 | new_batch.append((x, c, g, l)) 56 | return new_batch 57 | else: 58 | new_batch = [] 59 | for b in batch: 60 | x, c, g, l = b 61 | x = audio.trim_silence(x, hparams) 62 | if max_time_steps is not None and len(x) > max_time_steps: 63 | start = np.random.randint(0, len(c) - max_time_steps) 64 | x = x[start: start + max_time_steps] 65 | new_batch.append((x, c, g, l)) 66 | return new_batch 67 | 68 | def _assert_ready_for_upsample(hparams, x, c): 69 | assert len(x) % len(c) == 0 and len(x) // len(c) == audio.get_hop_size(hparams) 70 | 71 | def check_time_alignment(hparams, batch, local_condition): 72 | #No need to check beyond this step when preparing data 73 | #Limit time steps to save GPU Memory usage 74 | max_time_steps = _limit_time(hparams) 75 | #Adjust time resolution for upsampling 76 | batch = _adjust_time_resolution(hparams, batch, local_condition, max_time_steps) 77 | 78 | def _ensure_divisible(length, divisible_by=256, lower=True): 79 | if length % divisible_by == 0: 80 | return length 81 | if lower: 82 | return length - length % divisible_by 83 | else: 84 | return length + (divisible_by - length % divisible_by) 85 | 86 | def run(args, hparams): 87 | with open(args.metadata, 'r') as file: 88 | metadata = [line.strip().split('|') for line in file] 89 | 90 | local_condition = hparams.cin_channels > 0 91 | 92 | examples = [get_groups(args, hparams, meta, local_condition) for meta in metadata] 93 | batches = [examples[i: i+hparams.wavenet_batch_size] for i in range(0, len(examples), hparams.wavenet_batch_size)] 94 | 95 | for batch in tqdm(batches): 96 | check_time_alignment(hparams, batch, local_condition) 97 | 98 | 99 | 100 | def main(): 101 | parser = argparse.ArgumentParser() 102 | parser.add_argument('--base_dir', default='') 103 | parser.add_argument('--hparams', default='', 104 | help='Hyperparameter overrides as a comma-separated list of name=value pairs') 105 | parser.add_argument('--metadata', default='tacotron_output/gta/map.txt') 106 | args = parser.parse_args() 107 | 108 | 
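# Apply any --hparams overrides (comma-separated name=value pairs) before running the time-alignment checks.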
modified_hparams = hparams.parse(args.hparams) 109 | run(args, modified_hparams) 110 | 111 | 112 | if __name__ == '__main__': 113 | main() -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from time import sleep 4 | 5 | import infolog 6 | import tensorflow as tf 7 | from hparams import hparams 8 | from infolog import log 9 | from tacotron.synthesize import tacotron_synthesize 10 | from tacotron.train import tacotron_train 11 | from wavenet_vocoder.train import wavenet_train 12 | 13 | log = infolog.log 14 | 15 | 16 | def save_seq(file, sequence, input_path): 17 | '''Save Tacotron-2 training state to disk. (To skip for future runs) 18 | ''' 19 | sequence = [str(int(s)) for s in sequence] + [input_path] 20 | with open(file, 'w') as f: 21 | f.write('|'.join(sequence)) 22 | 23 | def read_seq(file): 24 | '''Load Tacotron-2 training state from disk. (To skip if not first run) 25 | ''' 26 | if os.path.isfile(file): 27 | with open(file, 'r') as f: 28 | sequence = f.read().split('|') 29 | 30 | return [bool(int(s)) for s in sequence[:-1]], sequence[-1] 31 | else: 32 | return [0, 0, 0], '' 33 | 34 | def prepare_run(args): 35 | modified_hp = hparams.parse(args.hparams) 36 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = str(args.tf_log_level) 37 | run_name = args.name or args.model 38 | log_dir = os.path.join(args.base_dir, 'logs-{}'.format(run_name)) 39 | os.makedirs(log_dir, exist_ok=True) 40 | infolog.init(os.path.join(log_dir, 'Terminal_train_log'), run_name, args.slack_url) 41 | return log_dir, modified_hp 42 | 43 | def train(args, log_dir, hparams): 44 | state_file = os.path.join(log_dir, 'state_log') 45 | #Get training states 46 | (taco_state, GTA_state, wave_state), input_path = read_seq(state_file) 47 | 48 | if not taco_state: 49 | log('\n#############################################################\n') 50 | log('Tacotron Train\n') 51 | log('###########################################################\n') 52 | checkpoint = tacotron_train(args, log_dir, hparams) 53 | tf.reset_default_graph() 54 | #Sleep 1/2 second to let previous graph close and avoid error messages while synthesis 55 | sleep(0.5) 56 | if checkpoint is None: 57 | raise('Error occured while training Tacotron, Exiting!') 58 | taco_state = 1 59 | save_seq(state_file, [taco_state, GTA_state, wave_state], input_path) 60 | else: 61 | checkpoint = os.path.join(log_dir, 'taco_pretrained/') 62 | 63 | if not GTA_state: 64 | log('\n#############################################################\n') 65 | log('Tacotron GTA Synthesis\n') 66 | log('###########################################################\n') 67 | input_path = tacotron_synthesize(args, hparams, checkpoint) 68 | tf.reset_default_graph() 69 | #Sleep 1/2 second to let previous graph close and avoid error messages while Wavenet is training 70 | sleep(0.5) 71 | GTA_state = 1 72 | save_seq(state_file, [taco_state, GTA_state, wave_state], input_path) 73 | else: 74 | input_path = os.path.join('tacotron_' + args.output_dir, 'gta', 'map.txt') 75 | 76 | if input_path == '' or input_path is None: 77 | raise RuntimeError('input_path has an unpleasant value -> {}'.format(input_path)) 78 | 79 | if not wave_state: 80 | log('\n#############################################################\n') 81 | log('Wavenet Train\n') 82 | log('###########################################################\n') 83 | checkpoint = wavenet_train(args, log_dir, 
hparams, input_path) 84 | if checkpoint is None: 85 | raise ('Error occured while training Wavenet, Exiting!') 86 | wave_state = 1 87 | save_seq(state_file, [taco_state, GTA_state, wave_state], input_path) 88 | 89 | if wave_state and GTA_state and taco_state: 90 | log('TRAINING IS ALREADY COMPLETE!!') 91 | 92 | def main(): 93 | parser = argparse.ArgumentParser() 94 | parser.add_argument('--base_dir', default='') 95 | parser.add_argument('--hparams', default='', 96 | help='Hyperparameter overrides as a comma-separated list of name=value pairs') 97 | parser.add_argument('--tacotron_input', default='training_data/train.txt') 98 | parser.add_argument('--wavenet_input', default='tacotron_output/gta/map.txt') 99 | parser.add_argument('--name', help='Name of logging directory.') 100 | parser.add_argument('--model', default='Tacotron-2') 101 | parser.add_argument('--input_dir', default='training_data', help='folder to contain inputs sentences/targets') 102 | parser.add_argument('--output_dir', default='output', help='folder to contain synthesized mel spectrograms') 103 | parser.add_argument('--mode', default='synthesis', help='mode for synthesis of tacotron after training') 104 | parser.add_argument('--GTA', default='True', help='Ground truth aligned synthesis, defaults to True, only considered in Tacotron synthesis mode') 105 | parser.add_argument('--restore', type=bool, default=True, help='Set this to False to do a fresh training') 106 | parser.add_argument('--summary_interval', type=int, default=250, 107 | help='Steps between running summary ops') 108 | parser.add_argument('--embedding_interval', type=int, default=5000, 109 | help='Steps between updating embeddings projection visualization') 110 | parser.add_argument('--checkpoint_interval', type=int, default=2500, 111 | help='Steps between writing checkpoints') 112 | parser.add_argument('--eval_interval', type=int, default=5000, 113 | help='Steps between eval on test data') 114 | parser.add_argument('--tacotron_train_steps', type=int, default=100000, help='total number of tacotron training steps') 115 | parser.add_argument('--wavenet_train_steps', type=int, default=500000, help='total number of wavenet training steps') 116 | parser.add_argument('--tf_log_level', type=int, default=1, help='Tensorflow C++ log level.') 117 | parser.add_argument('--slack_url', default=None, help='slack webhook notification destination link') 118 | args = parser.parse_args() 119 | 120 | accepted_models = ['Tacotron', 'WaveNet', 'Tacotron-2'] 121 | 122 | if args.model not in accepted_models: 123 | raise ValueError('please enter a valid model to train: {}'.format(accepted_models)) 124 | 125 | log_dir, hparams = prepare_run(args) 126 | 127 | if args.model == 'Tacotron': 128 | tacotron_train(args, log_dir, hparams) 129 | elif args.model == 'WaveNet': 130 | wavenet_train(args, log_dir, hparams, args.wavenet_input) 131 | elif args.model == 'Tacotron-2': 132 | train(args, log_dir, hparams) 133 | else: 134 | raise ValueError('Model provided {} unknown! 
135 | 136 | 137 | if __name__ == '__main__': 138 | main() 139 | -------------------------------------------------------------------------------- /wavenet_preprocess.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from multiprocessing import cpu_count 4 | 5 | from datasets import wavenet_preprocessor 6 | from hparams import hparams 7 | from tqdm import tqdm 8 | 9 | 10 | def preprocess(args, input_dir, out_dir, hparams): 11 | mel_dir = os.path.join(out_dir, 'mels') 12 | wav_dir = os.path.join(out_dir, 'audio') 13 | os.makedirs(mel_dir, exist_ok=True) 14 | os.makedirs(wav_dir, exist_ok=True) 15 | metadata = wavenet_preprocessor.build_from_path(hparams, input_dir, mel_dir, wav_dir, args.n_jobs, tqdm=tqdm) 16 | write_metadata(metadata, out_dir) 17 | 18 | def write_metadata(metadata, out_dir): 19 | with open(os.path.join(out_dir, 'map.txt'), 'w', encoding='utf-8') as f: 20 | for m in metadata: 21 | f.write('|'.join([str(x) for x in m]) + '\n') 22 | mel_frames = sum([int(m[5]) for m in metadata]) 23 | timesteps = sum([int(m[4]) for m in metadata]) 24 | sr = hparams.sample_rate 25 | hours = timesteps / sr / 3600 26 | print('Wrote {} utterances, {} audio timesteps, ({:.2f} hours)'.format( 27 | len(metadata), timesteps, hours)) 28 | print('Max mel frames length: {}'.format(max(int(m[5]) for m in metadata))) 29 | print('Max audio timesteps length: {}'.format(max(m[4] for m in metadata))) 30 | 31 | def run_preprocess(args, hparams): 32 | output_folder = os.path.join(args.base_dir, args.output) 33 | 34 | preprocess(args, args.input_dir, output_folder, hparams) 35 | 36 | def main(): 37 | print('initializing preprocessing..') 38 | parser = argparse.ArgumentParser() 39 | parser.add_argument('--base_dir', default='') 40 | parser.add_argument('--hparams', default='', 41 | help='Hyperparameter overrides as a comma-separated list of name=value pairs') 42 | parser.add_argument('--input_dir', default='LJSpeech-1.1/wavs') 43 | parser.add_argument('--output', default='tacotron_output/gta/') 44 | parser.add_argument('--n_jobs', type=int, default=cpu_count()) 45 | args = parser.parse_args() 46 | 47 | modified_hp = hparams.parse(args.hparams) 48 | 49 | run_preprocess(args, modified_hp) 50 | 51 | if __name__ == '__main__': 52 | main() 53 | -------------------------------------------------------------------------------- /wavenet_vocoder/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------- /wavenet_vocoder/feeder.py: -------------------------------------------------------------------------------- 1 | import os 2 | import threading 3 | import time 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | from datasets import audio 8 | from infolog import log 9 | from keras.utils import np_utils 10 | from sklearn.model_selection import train_test_split 11 | 12 | from .util import is_mulaw_quantize, is_scalar_input 13 | 14 | 15 | 16 | _batches_per_group = 64 17 | 18 | 19 | class Feeder: 20 | """ 21 | Feeds batches of data into a queue in a background thread.
22 | """ 23 | def __init__(self, coordinator, metadata_filename, base_dir, hparams): 24 | super(Feeder, self).__init__() 25 | 26 | self._coord = coordinator 27 | self._hparams = hparams 28 | self._train_offset = 0 29 | self._test_offset = 0 30 | 31 | if hparams.symmetric_mels: 32 | self._spec_pad = -hparams.max_abs_value 33 | else: 34 | self._spec_pad = 0. 35 | 36 | #Base directory of the project (to map files from different locations) 37 | self._base_dir = base_dir 38 | 39 | #Load metadata 40 | self._data_dir = os.path.dirname(metadata_filename) 41 | with open(metadata_filename, 'r') as f: 42 | self._metadata = [line.strip().split('|') for line in f] 43 | 44 | #Train test split 45 | if hparams.wavenet_test_size is None: 46 | assert hparams.wavenet_test_batches is not None 47 | 48 | test_size = (hparams.wavenet_test_size if hparams.wavenet_test_size is not None 49 | else hparams.wavenet_test_batches * hparams.wavenet_batch_size) 50 | indices = np.arange(len(self._metadata)) 51 | train_indices, test_indices = train_test_split(indices, 52 | test_size=test_size, random_state=hparams.wavenet_data_random_state) 53 | 54 | #Make sure test size is a multiple of batch size else round up 55 | len_test_indices = _round_down(len(test_indices), hparams.wavenet_batch_size) 56 | extra_test = test_indices[len_test_indices:] 57 | test_indices = test_indices[:len_test_indices] 58 | train_indices = np.concatenate([train_indices, extra_test]) 59 | 60 | self._train_meta = list(np.array(self._metadata)[train_indices]) 61 | self._test_meta = list(np.array(self._metadata)[test_indices]) 62 | 63 | self.test_steps = len(self._test_meta) // hparams.wavenet_batch_size 64 | 65 | if hparams.wavenet_test_size is None: 66 | assert hparams.wavenet_test_batches == self.test_steps 67 | 68 | #Get conditioning status 69 | self.local_condition, self.global_condition = self._check_conditions() 70 | 71 | with tf.device('/cpu:0'): 72 | # Create placeholders for inputs and targets. Don't specify batch size because we want 73 | # to be able to feed different batch sizes at eval time. 
74 | if is_scalar_input(hparams.input_type): 75 | input_placeholder = tf.placeholder(tf.float32, shape=(None, 1, None), name='audio_inputs') 76 | target_placeholder = tf.placeholder(tf.float32, shape=(None, None, 1), name='audio_targets') 77 | target_type = tf.float32 78 | else: 79 | input_placeholder = tf.placeholder(tf.float32, shape=(None, hparams.quantize_channels, None), name='audio_inputs') 80 | target_placeholder = tf.placeholder(tf.int32, shape=(None, None, 1), name='audio_targets') 81 | target_type = tf.int32 82 | 83 | self._placeholders = [ 84 | input_placeholder, 85 | target_placeholder, 86 | tf.placeholder(tf.int32, shape=(None, ), name='input_lengths'), 87 | ] 88 | 89 | queue_types = [tf.float32, target_type, tf.int32] 90 | 91 | if self.local_condition: 92 | self._placeholders.append(tf.placeholder(tf.float32, shape=(None, hparams.num_mels, None), name='local_condition_features')) 93 | queue_types.append(tf.float32) 94 | if self.global_condition: 95 | self._placeholders.append(tf.placeholder(tf.int32, shape=(None, 1), name='global_condition_features')) 96 | queue_types.append(tf.int32) 97 | 98 | # Create queue for buffering data 99 | queue = tf.FIFOQueue(8, queue_types, name='input_queue') 100 | self._enqueue_op = queue.enqueue(self._placeholders) 101 | variables = queue.dequeue() 102 | 103 | self.inputs = variables[0] 104 | self.inputs.set_shape(self._placeholders[0].shape) 105 | self.targets = variables[1] 106 | self.targets.set_shape(self._placeholders[1].shape) 107 | self.input_lengths = variables[2] 108 | self.input_lengths.set_shape(self._placeholders[2].shape) 109 | 110 | idx = 3 111 | 112 | #If local conditioning disabled override c inputs with None 113 | if hparams.cin_channels < 0: 114 | self.local_condition_features = None 115 | else: 116 | self.local_condition_features = variables[idx] 117 | self.local_condition_features.set_shape(self._placeholders[idx].shape) 118 | idx += 1 119 | 120 | #If global conditioning disabled override g inputs with None 121 | if hparams.gin_channels < 0: 122 | self.global_condition_features = None 123 | else: 124 | self.global_condition_features = variables[idx] 125 | self.global_condition_features.set_shape(self._placeholders[idx].shape) 126 | 127 | # Create queue for buffering eval data 128 | eval_queue = tf.FIFOQueue(1, queue_types, name='eval_queue') 129 | self._eval_enqueue_op = eval_queue.enqueue(self._placeholders) 130 | eval_variables = eval_queue.dequeue() 131 | 132 | self.eval_inputs = eval_variables[0] 133 | self.eval_inputs.set_shape(self._placeholders[0].shape) 134 | self.eval_targets = eval_variables[1] 135 | self.eval_targets.set_shape(self._placeholders[1].shape) 136 | self.eval_input_lengths = eval_variables[2] 137 | self.eval_input_lengths.set_shape(self._placeholders[2].shape) 138 | 139 | eval_idx = 3 140 | 141 | #If local conditioning disabled override c inputs with None 142 | if hparams.cin_channels < 0: 143 | self.eval_local_condition_features = None 144 | else: 145 | self.eval_local_condition_features = eval_variables[eval_idx] 146 | self.eval_local_condition_features.set_shape(self._placeholders[eval_idx].shape) 147 | eval_idx += 1 148 | 149 | #If global conditioning disabled override g inputs with None 150 | if hparams.gin_channels < 0: 151 | self.eval_global_condition_features = None 152 | else: 153 | self.eval_global_condition_features = eval_variables[eval_idx] 154 | self.eval_global_condition_features.set_shape(self._placeholders[eval_idx].shape) 155 | 156 | 157 | def start_threads(self, session): 158 | 
self._session = session 159 | thread = threading.Thread(name='background', target=self._enqueue_next_train_group) 160 | thread.daemon = True #Thread will close when parent quits 161 | thread.start() 162 | 163 | thread = threading.Thread(name='background', target=self._enqueue_next_test_group) 164 | thread.daemon = True #Thread will close when parent quits 165 | thread.start() 166 | 167 | def _get_test_groups(self): 168 | meta = self._test_meta[self._test_offset] 169 | self._test_offset += 1 170 | 171 | if self._hparams.train_with_GTA: 172 | mel_file = meta[2] 173 | else: 174 | mel_file = meta[1] 175 | audio_file = meta[0] 176 | 177 | input_data = np.load(os.path.join(self._base_dir, audio_file)) 178 | 179 | if self.local_condition: 180 | local_condition_features = np.load(os.path.join(self._base_dir, mel_file)) 181 | else: 182 | local_condition_features = None 183 | 184 | if self.global_condition: 185 | global_condition_features = meta[3] 186 | if global_condition_features == '': 187 | raise RuntimeError('Please redo the wavenet preprocessing (or GTA synthesis) to assign global condition features!') 188 | else: 189 | global_condition_features = None 190 | 191 | return (input_data, local_condition_features, global_condition_features, len(input_data)) 192 | 193 | def make_test_batches(self): 194 | start = time.time() 195 | 196 | #Read one example for evaluation 197 | n = 1 198 | 199 | #Test on entire test set (one sample at an evaluation step) 200 | examples = [self._get_test_groups() for i in range(len(self._test_meta))] 201 | batches = [examples[i: i+n] for i in range(0, len(examples), n)] 202 | np.random.shuffle(batches) 203 | 204 | log('\nGenerated {} test batches of size {} in {:.3f} sec'.format(len(batches), n, time.time() - start)) 205 | return batches 206 | 207 | def _enqueue_next_train_group(self): 208 | while not self._coord.should_stop(): 209 | start = time.time() 210 | 211 | # Read a group of examples 212 | n = self._hparams.wavenet_batch_size 213 | examples = [self._get_next_example() for i in range(n * _batches_per_group)] 214 | 215 | # Bucket examples base on similiar output length for efficiency 216 | examples.sort(key=lambda x: x[-1]) 217 | batches = [examples[i: i+n] for i in range(0, len(examples), n)] 218 | np.random.shuffle(batches) 219 | 220 | log('\nGenerated {} train batches of size {} in {:.3f} sec'.format(len(batches), n, time.time() - start)) 221 | for batch in batches: 222 | feed_dict = dict(zip(self._placeholders, self._prepare_batch(batch))) 223 | self._session.run(self._enqueue_op, feed_dict=feed_dict) 224 | 225 | def _enqueue_next_test_group(self): 226 | test_batches = self.make_test_batches() 227 | while not self._coord.should_stop(): 228 | for batch in test_batches: 229 | feed_dict = dict(zip(self._placeholders, self._prepare_batch(batch))) 230 | self._session.run(self._eval_enqueue_op, feed_dict=feed_dict) 231 | 232 | def _get_next_example(self): 233 | '''Get a single example (input, output, len_output) from disk 234 | ''' 235 | if self._train_offset >= len(self._train_meta): 236 | self._train_offset = 0 237 | np.random.shuffle(self._train_meta) 238 | meta = self._train_meta[self._train_offset] 239 | self._train_offset += 1 240 | 241 | if self._hparams.train_with_GTA: 242 | mel_file = meta[2] 243 | if 'linear' in mel_file: 244 | raise RuntimeError('Linear spectrogram files selected instead of GTA mels, did you specify the wrong metadata?') 245 | else: 246 | mel_file = meta[1] 247 | audio_file = meta[0] 248 | 249 | input_data = 
np.load(os.path.join(self._base_dir, audio_file)) 250 | 251 | if self.local_condition: 252 | local_condition_features = np.load(os.path.join(self._base_dir, mel_file)) 253 | else: 254 | local_condition_features = None 255 | 256 | if self.global_condition: 257 | global_condition_features = meta[3] 258 | if global_condition_features == '': 259 | raise RuntimeError('Please redo the wavenet preprocessing (or GTA synthesis) to assign global condition features!') 260 | else: 261 | global_condition_features = None 262 | 263 | return (input_data, local_condition_features, global_condition_features, len(input_data)) 264 | 265 | 266 | def _prepare_batch(self, batches): 267 | assert 0 == len(batches) % self._hparams.wavenet_num_gpus 268 | size_per_device = int(len(batches) / self._hparams.wavenet_num_gpus) 269 | np.random.shuffle(batches) 270 | 271 | #Limit time steps to save GPU Memory usage 272 | max_time_steps = self._limit_time() 273 | #Adjust time resolution for upsampling 274 | batches = self._adjust_time_resolution(batches, self.local_condition, max_time_steps) 275 | 276 | #time lengths 277 | input_lengths = np.asarray([len(x[0]) for x in batches], np.int32) 278 | max_input_length = max(input_lengths) 279 | 280 | #Since all inputs/targets will have the same lengths for all GPUs, we can simply treat all GPUs batches as one big batch and stack all data. (fixed length) 281 | inputs = self._prepare_inputs([x[0] for x in batches], max_input_length) 282 | targets = self._prepare_targets([x[0] for x in batches], max_input_length) 283 | local_condition_features = self._prepare_local_conditions(self.local_condition, [x[1] for x in batches]) 284 | global_condition_features = self._prepare_global_conditions(self.global_condition, [x[2] for x in batches]) 285 | 286 | #Create final batches 287 | new_batches = (inputs, targets, input_lengths) 288 | if local_condition_features is not None: 289 | new_batches += (local_condition_features, ) 290 | if global_condition_features is not None: 291 | new_batches += (global_condition_features, ) 292 | 293 | return new_batches 294 | 295 | def _prepare_inputs(self, inputs, maxlen): 296 | if is_mulaw_quantize(self._hparams.input_type): 297 | #[batch_size, time_steps, quantize_channels] 298 | x_batch = np.stack([_pad_inputs(np_utils.to_categorical( 299 | x, num_classes=self._hparams.quantize_channels), maxlen) for x in inputs]).astype(np.float32) 300 | else: 301 | #[batch_size, time_steps, 1] 302 | x_batch = np.stack([_pad_inputs(x.reshape(-1, 1), maxlen) for x in inputs]).astype(np.float32) 303 | assert len(x_batch.shape) == 3 304 | #Convert to channels first [batch_size, quantize_channels (or 1), time_steps] 305 | x_batch = np.transpose(x_batch, (0, 2, 1)) 306 | return x_batch 307 | 308 | def _prepare_targets(self, targets, maxlen): 309 | #[batch_size, time_steps] 310 | if is_mulaw_quantize(self._hparams.input_type): 311 | y_batch = np.stack([_pad_targets(x, maxlen) for x in targets]).astype(np.int32) 312 | else: 313 | y_batch = np.stack([_pad_targets(x, maxlen) for x in targets]).astype(np.float32) 314 | assert len(y_batch.shape) == 2 315 | #Add extra axis (make 3 dimension) 316 | y_batch = np.expand_dims(y_batch, axis=-1) 317 | return y_batch 318 | 319 | def _prepare_local_conditions(self, local_condition, c_features): 320 | if local_condition: 321 | maxlen = max([len(x) for x in c_features]) 322 | #[-max, max] or [0,max] 323 | T2_output_range = (-self._hparams.max_abs_value, self._hparams.max_abs_value) if self._hparams.symmetric_mels else (0, 
self._hparams.max_abs_value) 324 | 325 | if self._hparams.clip_for_wavenet: 326 | c_features = [np.clip(x, T2_output_range[0], T2_output_range[1]) for x in c_features] 327 | 328 | c_batch = np.stack([_pad_inputs(x, maxlen, _pad=T2_output_range[0]) for x in c_features]).astype(np.float32) 329 | assert len(c_batch.shape) == 3 330 | #[batch_size, c_channels, time_steps] 331 | c_batch = np.transpose(c_batch, (0, 2, 1)) 332 | 333 | if self._hparams.normalize_for_wavenet: 334 | #rescale to [0, 1] 335 | c_batch = _interp(c_batch, T2_output_range).astype(np.float32) 336 | 337 | else: 338 | c_batch = None 339 | 340 | return c_batch 341 | 342 | def _prepare_global_conditions(self, global_condition, g_features): 343 | if global_condition: 344 | g_batch = np.array(g_features).astype(np.int32).reshape(-1, 1) 345 | 346 | else: 347 | g_batch = None 348 | 349 | return g_batch 350 | 351 | def _check_conditions(self): 352 | local_condition = self._hparams.cin_channels > 0 353 | global_condition = self._hparams.gin_channels > 0 354 | return local_condition, global_condition 355 | 356 | def _limit_time(self): 357 | '''Limit time resolution to save GPU memory. 358 | ''' 359 | if self._hparams.max_time_sec is not None: 360 | return int(self._hparams.max_time_sec * self._hparams.sample_rate) 361 | 362 | elif self._hparams.max_time_steps is not None: 363 | return self._hparams.max_time_steps 364 | 365 | else: 366 | return None 367 | 368 | def _adjust_time_resolution(self, batch, local_condition, max_time_steps): 369 | '''Adjust time resolution between audio and local condition 370 | ''' 371 | if local_condition: 372 | new_batch = [] 373 | for b in batch: 374 | x, c, g, l = b 375 | self._assert_ready_for_upsample(x, c) 376 | if max_time_steps is not None: 377 | max_steps = _ensure_divisible(max_time_steps, audio.get_hop_size(self._hparams), True) 378 | if len(x) > max_time_steps: 379 | max_time_frames = max_steps // audio.get_hop_size(self._hparams) 380 | start = np.random.randint(0, len(c) - max_time_frames) 381 | time_start = start * audio.get_hop_size(self._hparams) 382 | x = x[time_start: time_start + max_time_frames * audio.get_hop_size(self._hparams)] 383 | c = c[start: start + max_time_frames, :] 384 | self._assert_ready_for_upsample(x, c) 385 | 386 | new_batch.append((x, c, g, l)) 387 | return new_batch 388 | 389 | else: 390 | new_batch = [] 391 | for b in batch: 392 | x, c, g, l = b 393 | x = audio.trim_silence(x, self._hparams) 394 | if max_time_steps is not None and len(x) > max_time_steps: 395 | start = np.random.randint(0, len(x) - max_time_steps) 396 | x = x[start: start + max_time_steps] 397 | new_batch.append((x, c, g, l)) 398 | return new_batch 399 | 400 | def _assert_ready_for_upsample(self, x, c): 401 | assert len(x) % len(c) == 0 and len(x) // len(c) == audio.get_hop_size(self._hparams) 402 | 403 | 404 | def _pad_inputs(x, maxlen, _pad=0): 405 | return np.pad(x, [(0, maxlen - len(x)), (0, 0)], mode='constant', constant_values=_pad) 406 | 407 | def _pad_targets(x, maxlen, _pad=0): 408 | return np.pad(x, (0, maxlen - len(x)), mode='constant', constant_values=_pad) 409 | 410 | def _round_up(x, multiple): 411 | remainder = x % multiple 412 | return x if remainder == 0 else x + multiple - remainder 413 | 414 | def _round_down(x, multiple): 415 | remainder = x % multiple 416 | return x if remainder == 0 else x - remainder 417 | 418 | def _ensure_divisible(length, divisible_by=256, lower=True): 419 | if length % divisible_by == 0: 420 | return length 421 | if lower: 422 | return length - length % divisible_by
423 | else: 424 | return length + (divisible_by - length % divisible_by) 425 | 426 | def _interp(feats, in_range): 427 | #rescales from [-max, max] (or [0, max]) to [0, 1] 428 | return (feats - in_range[0]) / (in_range[1] - in_range[0]) 429 | -------------------------------------------------------------------------------- /wavenet_vocoder/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .wavenet import WaveNet 2 | from warnings import warn 3 | from wavenet_vocoder.util import is_mulaw_quantize 4 | 5 | def create_model(name, hparams, init=False): 6 | if is_mulaw_quantize(hparams.input_type): 7 | if hparams.out_channels != hparams.quantize_channels: 8 | raise RuntimeError( 9 | "out_channels must be equal to quantize_channels if input_type is 'mulaw-quantize'") 10 | 11 | if name == 'WaveNet': 12 | return WaveNet(hparams, init) 13 | else: 14 | raise Exception('Unknown model: {}'.format(name)) 15 | -------------------------------------------------------------------------------- /wavenet_vocoder/models/gaussian.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | 5 | def gaussian_maximum_likelihood_estimation_loss(y_hat, y, log_scale_min_gauss, num_classes, use_cdf=True, reduce=True): 6 | '''Compute the Gaussian MLE loss''' 7 | with tf.control_dependencies([tf.assert_equal(tf.shape(y_hat)[1], 2), tf.assert_equal(tf.rank(y_hat), 3)]): 8 | #[batch_size, time_steps, channels] 9 | y_hat = tf.transpose(y_hat, [0, 2, 1]) 10 | 11 | #Unpack parameters: mean and log_scale outputs 12 | mean = y_hat[:, :, 0] 13 | log_scale = tf.maximum(y_hat[:, :, 1], log_scale_min_gauss) 14 | y = tf.squeeze(y, [-1]) 15 | 16 | if use_cdf: 17 | #Compute log_probs using the CDF trick (normalized loss value and more stable training than with the natural log prob) 18 | #Instantiate a Normal distribution with model outputs 19 | gaussian = tf.contrib.distributions.Normal(loc=mean, scale=tf.exp(log_scale)) 20 | 21 | #Draw CDF+ and CDF- neighbors of the true sample y 22 | cdf_plus = gaussian.cdf(y + 1. / (num_classes - 1)) 23 | cdf_min = gaussian.cdf(y - 1. / (num_classes - 1)) 24 | 25 | #Maximize the difference between CDF+ and CDF- (or its log) 26 | log_prob = tf.log(tf.maximum(cdf_plus - cdf_min, 1e-12)) 27 | 28 | else: 29 | #Get the log probability of each sample under this distribution in a computationally stable fashion 30 | #This is the log(PDF) 31 | log_prob = -0.5 * (np.log(2. * np.pi) + 2. * log_scale + tf.square(y - mean) * tf.exp(-2. * log_scale)) 32 | 33 | #Loss (maximize the log probability by minimizing its negative) 34 | if reduce: 35 | return -tf.reduce_sum(log_prob) 36 | else: 37 | return -tf.expand_dims(log_prob, [-1]) 38 | 39 | def sample_from_gaussian(y, log_scale_min_gauss): 40 | '''Sample from the learned Gaussian distribution''' 41 | with tf.control_dependencies([tf.assert_equal(tf.shape(y)[1], 2)]): 42 | #[batch_size, time_length, channels] 43 | y = tf.transpose(y, [0, 2, 1]) 44 | 45 | mean = y[:, :, 0] 46 | log_scale = tf.maximum(y[:, :, 1], log_scale_min_gauss) 47 | scale = tf.exp(log_scale) 48 | 49 | gaussian_dist = tf.contrib.distributions.Normal(loc=mean, scale=scale, allow_nan_stats=False) 50 | x = gaussian_dist.sample() 51 | 52 | return tf.minimum(tf.maximum(x, -1.), 1.)
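Note on the loss above: with use_cdf=True, the likelihood of each target sample is approximated by the probability mass the predicted Normal assigns to the quantization bin of width 2/(num_classes - 1) around y (audio is assumed to be scaled to [-1, 1]). The snippet below is a minimal NumPy/SciPy sketch of that computation, not part of the repository; the function name and the scipy dependency are illustrative.

import numpy as np
from scipy.stats import norm

def gaussian_mle_loss_np(mean, log_scale, y, num_classes):
    #Probability mass of the bin [y - 1/(num_classes-1), y + 1/(num_classes-1)]
    scale = np.exp(log_scale)
    half_bin = 1. / (num_classes - 1)
    cdf_plus = norm.cdf(y + half_bin, loc=mean, scale=scale)
    cdf_min = norm.cdf(y - half_bin, loc=mean, scale=scale)
    #Clamp before the log for numerical stability, as in the TF code above
    log_prob = np.log(np.maximum(cdf_plus - cdf_min, 1e-12))
    return -np.sum(log_prob)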
53 | -------------------------------------------------------------------------------- /wavenet_vocoder/models/mixture.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | 5 | def log_sum_exp(x): 6 | """ numerically stable log_sum_exp implementation that prevents overflow """ 7 | axis = len(x.get_shape())-1 8 | m = tf.reduce_max(x, axis) 9 | m2 = tf.reduce_max(x, axis, keepdims=True) 10 | return m + tf.log(tf.reduce_sum(tf.exp(x-m2), axis)) 11 | 12 | def log_prob_from_logits(x): 13 | """ numerically stable log_softmax implementation that prevents overflow """ 14 | axis = len(x.get_shape())-1 15 | m = tf.reduce_max(x, axis, keepdims=True) 16 | return x - m - tf.log(tf.reduce_sum(tf.exp(x-m), axis, keepdims=True)) 17 | 18 | def discretized_mix_logistic_loss(y_hat, y, num_classes=256, 19 | log_scale_min=-7.0, reduce=True): 20 | '''Discretized mix of logistic distributions loss. 21 | 22 | Note that it is assumed that input is scaled to [-1, 1] 23 | 24 | Args: 25 | y_hat: Tensor [batch_size, channels, time_length], predicted output. 26 | y: Tensor [batch_size, time_length, 1], Target. 27 | Returns: 28 | Tensor loss 29 | ''' 30 | with tf.control_dependencies([tf.assert_equal(tf.mod(tf.shape(y_hat)[1], 3), 0), tf.assert_equal(tf.rank(y_hat), 3)]): 31 | nr_mix = tf.shape(y_hat)[1] // 3 32 | 33 | #[Batch_size, time_length, channels] 34 | y_hat = tf.transpose(y_hat, [0, 2, 1]) 35 | 36 | #unpack parameters. [batch_size, time_length, num_mixtures] x 3 37 | logit_probs = y_hat[:, :, :nr_mix] 38 | means = y_hat[:, :, nr_mix:2 * nr_mix] 39 | log_scales = tf.maximum(y_hat[:, :, 2* nr_mix: 3 * nr_mix], log_scale_min) 40 | 41 | #[batch_size, time_length, 1] -> [batch_size, time_length, num_mixtures] 42 | y = y * tf.ones(shape=[1, 1, nr_mix], dtype=tf.float32) 43 | 44 | centered_y = y - means 45 | inv_stdv = tf.exp(-log_scales) 46 | plus_in = inv_stdv * (centered_y + 1. / (num_classes - 1)) 47 | cdf_plus = tf.nn.sigmoid(plus_in) 48 | min_in = inv_stdv * (centered_y - 1. / (num_classes - 1)) 49 | cdf_min = tf.nn.sigmoid(min_in) 50 | 51 | log_cdf_plus = plus_in - tf.nn.softplus(plus_in) # log probability for edge case of 0 (before scaling) 52 | log_one_minus_cdf_min = -tf.nn.softplus(min_in) # log probability for edge case of 255 (before scaling) 53 | 54 | #probability for all other cases 55 | cdf_delta = cdf_plus - cdf_min 56 | 57 | mid_in = inv_stdv * centered_y 58 | #log probability in the center of the bin, to be used in extreme cases 59 | #(not actually used in this code) 60 | log_pdf_mid = mid_in - log_scales - 2. 
* tf.nn.softplus(mid_in) 61 | 62 | log_probs = tf.where(y < -0.999, log_cdf_plus, 63 | tf.where(y > 0.999, log_one_minus_cdf_min, 64 | tf.where(cdf_delta > 1e-5, 65 | tf.log(tf.maximum(cdf_delta, 1e-12)), 66 | log_pdf_mid - np.log((num_classes - 1) / 2)))) 67 | 68 | #log_probs = log_probs + tf.nn.log_softmax(logit_probs, -1) 69 | log_probs = log_probs + log_prob_from_logits(logit_probs) 70 | 71 | if reduce: 72 | return -tf.reduce_sum(log_sum_exp(log_probs)) 73 | else: 74 | return -tf.expand_dims(log_sum_exp(log_probs), [-1]) 75 | 76 | def sample_from_discretized_mix_logistic(y, log_scale_min=-7.): 77 | ''' 78 | Args: 79 | y: Tensor, [batch_size, channels, time_length] 80 | Returns: 81 | Tensor: sample in range of [-1, 1] 82 | ''' 83 | with tf.control_dependencies([tf.assert_equal(tf.mod(tf.shape(y)[1], 3), 0)]): 84 | nr_mix = tf.shape(y)[1] // 3 85 | 86 | #[batch_size, time_length, channels] 87 | y = tf.transpose(y, [0, 2, 1]) 88 | logit_probs = y[:, :, :nr_mix] 89 | 90 | #sample mixture indicator from softmax 91 | temp = tf.random_uniform(tf.shape(logit_probs), minval=1e-5, maxval=1. - 1e-5) 92 | temp = logit_probs - tf.log(-tf.log(temp)) 93 | argmax = tf.argmax(temp, -1) 94 | 95 | #[batch_size, time_length] -> [batch_size, time_length, nr_mix] 96 | one_hot = tf.one_hot(argmax, depth=nr_mix, dtype=tf.float32) 97 | #select logistic parameters 98 | means = tf.reduce_sum(y[:, :, nr_mix:2 * nr_mix] * one_hot, axis=-1) 99 | log_scales = tf.maximum(tf.reduce_sum( 100 | y[:, :, 2 * nr_mix:3 * nr_mix] * one_hot, axis=-1), log_scale_min) 101 | 102 | #sample from logistic & clip to interval 103 | #we don't actually round to the nearest 8-bit value when sampling 104 | u = tf.random_uniform(tf.shape(means), minval=1e-5, maxval=1. - 1e-5) 105 | x = means + tf.exp(log_scales) * (tf.log(u) - tf.log(1 -u)) 106 | 107 | return tf.minimum(tf.maximum(x, -1.), 1.) 
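sample_from_discretized_mix_logistic above proceeds in two steps: it draws a mixture component with the Gumbel-max trick, then samples from the selected logistic distribution via its inverse CDF and clips the result to the waveform range. The following is a small NumPy sketch of the same two steps, not part of the repository; names and the hard-coded log-scale floor are illustrative.

import numpy as np

def sample_mix_logistic_np(logit_probs, means, log_scales, rng=np.random):
    #Gumbel-max trick: argmax(logits + Gumbel noise) is a draw from Categorical(softmax(logits))
    u = rng.uniform(1e-5, 1. - 1e-5, size=logit_probs.shape)
    component = np.argmax(logit_probs - np.log(-np.log(u)), axis=-1)
    one_hot = np.eye(logit_probs.shape[-1])[component]
    #Select the parameters of the chosen component
    mean = np.sum(means * one_hot, axis=-1)
    log_scale = np.maximum(np.sum(log_scales * one_hot, axis=-1), -7.)
    #Inverse CDF of the logistic distribution, clipped to [-1, 1]
    u = rng.uniform(1e-5, 1. - 1e-5, size=mean.shape)
    x = mean + np.exp(log_scale) * (np.log(u) - np.log(1. - u))
    return np.clip(x, -1., 1.)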
108 | -------------------------------------------------------------------------------- /wavenet_vocoder/synthesize.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import numpy as np 5 | import tensorflow as tf 6 | from hparams import hparams, hparams_debug_string 7 | from infolog import log 8 | from tqdm import tqdm 9 | from wavenet_vocoder.synthesizer import Synthesizer 10 | 11 | 12 | def run_synthesis(args, checkpoint_path, output_dir, hparams): 13 | log_dir = os.path.join(output_dir, 'plots') 14 | wav_dir = os.path.join(output_dir, 'wavs') 15 | 16 | #We suppose the user will provide the correct folder depending on the training method 17 | log(hparams_debug_string()) 18 | synth = Synthesizer() 19 | synth.load(checkpoint_path, hparams) 20 | 21 | if args.model == 'Tacotron-2': 22 | #If running the full Tacotron-2, synthesize audio from evaluated mels 23 | metadata_filename = os.path.join(args.mels_dir, 'map.txt') 24 | with open(metadata_filename, encoding='utf-8') as f: 25 | metadata = np.array([line.strip().split('|') for line in f]) 26 | 27 | speaker_ids = metadata[:, 2] 28 | mel_files = metadata[:, 1] 29 | texts = metadata[:, 0] 30 | 31 | speaker_ids = None if (speaker_ids == '').all() else speaker_ids 32 | else: 33 | #else get all npy files in input_dir (supposing they are mels) 34 | mel_files = sorted([os.path.join(args.mels_dir, f) for f in os.listdir(args.mels_dir) if f.split('.')[-1] == 'npy']) 35 | speaker_ids = None if args.speaker_id is None else args.speaker_id.replace(' ', '').split(',') 36 | if speaker_ids is not None: 37 | assert len(speaker_ids) == len(mel_files) 38 | 39 | texts = None 40 | 41 | log('Starting synthesis! (this will take a while..)') 42 | os.makedirs(log_dir, exist_ok=True) 43 | os.makedirs(wav_dir, exist_ok=True) 44 | 45 | mel_files = [mel_files[i: i+hparams.wavenet_synthesis_batch_size] for i in range(0, len(mel_files), hparams.wavenet_synthesis_batch_size)] 46 | speaker_ids = None if speaker_ids is None else [speaker_ids[i: i+hparams.wavenet_synthesis_batch_size] for i in range(0, len(speaker_ids), hparams.wavenet_synthesis_batch_size)] 47 | texts = None if texts is None else [texts[i: i+hparams.wavenet_synthesis_batch_size] for i in range(0, len(texts), hparams.wavenet_synthesis_batch_size)] 48 | 49 | with open(os.path.join(wav_dir, 'map.txt'), 'w') as file: 50 | for i, mel_batch in enumerate(tqdm(mel_files)): 51 | mel_spectros = [np.load(mel_file) for mel_file in mel_batch] 52 | 53 | basenames = [os.path.basename(mel_file).replace('.npy', '') for mel_file in mel_batch] 54 | speaker_id_batch = None if speaker_ids is None else speaker_ids[i] 55 | audio_files = synth.synthesize(mel_spectros, speaker_id_batch, basenames, wav_dir, log_dir) 56 | 57 | speaker_logs = [''] * len(mel_batch) if speaker_id_batch is None else speaker_id_batch 58 | 59 | for j, mel_file in enumerate(mel_batch): 60 | if texts is None: 61 | file.write('{}|{}|{}\n'.format(mel_file, audio_files[j], speaker_logs[j])) 62 | else: 63 | file.write('{}|{}|{}|{}\n'.format(texts[i][j], mel_file, audio_files[j], speaker_logs[j])) 64 | 65 | log('Synthesized audio waveforms at {}'.format(wav_dir)) 66 | 67 | 68 | 69 | def wavenet_synthesize(args, hparams, checkpoint): 70 | output_dir = 'wavenet_' + args.output_dir 71 | 72 | try: 73 | checkpoint_path = tf.train.get_checkpoint_state(checkpoint).model_checkpoint_path 74 | log('loaded model at {}'.format(checkpoint_path)) 75 | except: 76 | raise RuntimeError('Failed to load checkpoint at 
{}'.format(checkpoint)) 77 | 78 | run_synthesis(args, checkpoint_path, output_dir, hparams) 79 | -------------------------------------------------------------------------------- /wavenet_vocoder/synthesizer.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import tensorflow as tf 5 | from datasets.audio import save_wavenet_wav, get_hop_size, melspectrogram 6 | from infolog import log 7 | from wavenet_vocoder.models import create_model 8 | from wavenet_vocoder.train import create_shadow_saver, load_averaged_model 9 | from wavenet_vocoder.feeder import _interp 10 | 11 | from . import util 12 | 13 | 14 | class Synthesizer: 15 | def load(self, checkpoint_path, hparams, model_name='WaveNet'): 16 | log('Constructing model: {}'.format(model_name)) 17 | self._hparams = hparams 18 | local_cond, global_cond = self._check_conditions() 19 | 20 | self.local_conditions = tf.placeholder(tf.float32, shape=(None, None, hparams.num_mels), name='local_condition_features') if local_cond else None 21 | self.global_conditions = tf.placeholder(tf.int32, shape=(None, 1), name='global_condition_features') if global_cond else None 22 | self.synthesis_length = tf.placeholder(tf.int32, shape=(), name='synthesis_length') if not local_cond else None 23 | self.targets = tf.placeholder(tf.float32, shape=(1, None, 1), name='audio_targets') if hparams.wavenet_synth_debug else None #Debug only with 1 wav 24 | self.input_lengths = tf.placeholder(tf.int32, shape=(1, ), name='input_lengths') if hparams.wavenet_synth_debug else None 25 | self.synth_debug = hparams.wavenet_synth_debug 26 | 27 | with tf.variable_scope('WaveNet_model') as scope: 28 | self.model = create_model(model_name, hparams) 29 | self.model.initialize(y=None, c=self.local_conditions, g=self.global_conditions, 30 | input_lengths=self.input_lengths, synthesis_length=self.synthesis_length, test_inputs=self.targets) 31 | 32 | self._hparams = hparams 33 | sh_saver = create_shadow_saver(self.model) 34 | 35 | log('Loading checkpoint: {}'.format(checkpoint_path)) 36 | #Memory allocation on the GPU as needed 37 | config = tf.ConfigProto() 38 | config.gpu_options.allow_growth = True 39 | config.allow_soft_placement = True 40 | 41 | self.session = tf.Session(config=config) 42 | self.session.run(tf.global_variables_initializer()) 43 | 44 | load_averaged_model(self.session, sh_saver, checkpoint_path) 45 | 46 | def synthesize(self, mel_spectrograms, speaker_ids, basenames, out_dir, log_dir): 47 | hparams = self._hparams 48 | local_cond, global_cond = self._check_conditions() 49 | 50 | #Switch mels in case of debug 51 | if self.synth_debug: 52 | assert len(hparams.wavenet_debug_mels) == len(hparams.wavenet_debug_wavs) 53 | mel_spectrograms = [np.load(mel_file) for mel_file in hparams.wavenet_debug_mels] 54 | 55 | #Get True length of audio to be synthesized: audio_len = mel_len * hop_size 56 | audio_lengths = [len(x) * get_hop_size(self._hparams) for x in mel_spectrograms] 57 | 58 | #Prepare local condition batch 59 | maxlen = max([len(x) for x in mel_spectrograms]) 60 | #[-max, max] or [0,max] 61 | T2_output_range = (-self._hparams.max_abs_value, self._hparams.max_abs_value) if self._hparams.symmetric_mels else (0, self._hparams.max_abs_value) 62 | 63 | if self._hparams.clip_for_wavenet: 64 | mel_spectrograms = [np.clip(x, T2_output_range[0], T2_output_range[1]) for x in mel_spectrograms] 65 | 66 | c_batch = np.stack([_pad_inputs(x, maxlen, _pad=T2_output_range[0]) for x in 
mel_spectrograms]).astype(np.float32) 67 | 68 | if self._hparams.normalize_for_wavenet: 69 | #rerange to [0, 1] 70 | c_batch = _interp(c_batch, T2_output_range).astype(np.float32) 71 | 72 | g = None if speaker_ids is None else np.asarray(speaker_ids, dtype=np.int32).reshape(len(c_batch), 1) 73 | feed_dict = {} 74 | 75 | if local_cond: 76 | feed_dict[self.local_conditions] = c_batch 77 | else: 78 | feed_dict[self.synthesis_length] = 100 79 | 80 | if global_cond: 81 | feed_dict[self.global_conditions] = g 82 | 83 | if self.synth_debug: 84 | debug_wavs = hparams.wavenet_debug_wavs 85 | assert len(debug_wavs) % hparams.wavenet_num_gpus == 0 86 | test_wavs = [np.load(debug_wav).reshape(-1, 1) for debug_wav in debug_wavs] 87 | 88 | #pad wavs to same length 89 | max_test_len = max([len(x) for x in test_wavs]) 90 | test_wavs = np.stack([_pad_inputs(x, max_test_len) for x in test_wavs]).astype(np.float32) 91 | 92 | assert len(test_wavs) == len(debug_wavs) 93 | feed_dict[self.targets] = test_wavs.reshape(len(test_wavs), max_test_len, 1) 94 | feed_dict[self.input_lengths] = np.asarray([test_wavs.shape[1]]) 95 | 96 | #Generate wavs and clip extra padding to select Real speech parts 97 | generated_wavs, upsampled_features = self.session.run([self.model.tower_y_hat, self.model.tower_synth_upsampled_local_features], feed_dict=feed_dict) 98 | 99 | #Linearize outputs (n_gpus -> 1D) 100 | generated_wavs = [wav for gpu_wavs in generated_wavs for wav in gpu_wavs] 101 | upsampled_features = [feat for gpu_feats in upsampled_features for feat in gpu_feats] 102 | 103 | generated_wavs = [generated_wav[:length] for generated_wav, length in zip(generated_wavs, audio_lengths)] 104 | upsampled_features = [upsampled_feature[:, :length] for upsampled_feature, length in zip(upsampled_features, audio_lengths)] 105 | 106 | audio_filenames = [] 107 | for i, (generated_wav, input_mel, upsampled_feature) in enumerate(zip(generated_wavs, mel_spectrograms, upsampled_features)): 108 | #Save wav to disk 109 | audio_filename = os.path.join(out_dir, 'wavenet-audio-{}.wav'.format(basenames[i])) 110 | save_wavenet_wav(generated_wav, audio_filename, sr=hparams.sample_rate, inv_preemphasize=hparams.preemphasize, k=hparams.preemphasis) 111 | audio_filenames.append(audio_filename) 112 | 113 | #Compare generated wav mel with original input mel to evaluate wavenet audio reconstruction performance 114 | #Both mels should match on low frequency information, wavenet mel should contain more high frequency detail when compared to Tacotron mels. 115 | generated_mel = melspectrogram(generated_wav, hparams).T 116 | util.plot_spectrogram(generated_mel, os.path.join(log_dir, 'wavenet-mel-spectrogram-{}.png'.format(basenames[i])), 117 | title='Local Condition vs Reconstructed Audio Mel-Spectrogram analysis', target_spectrogram=input_mel) 118 | #Save upsampled features to visualize checkerboard artifacts. 
119 | util.plot_spectrogram(upsampled_feature.T, os.path.join(log_dir, 'wavenet-upsampled_features-{}.png'.format(basenames[i])), 120 | title='Upmsampled Local Condition features', auto_aspect=True) 121 | 122 | #Save waveplot to disk 123 | if log_dir is not None: 124 | plot_filename = os.path.join(log_dir, 'wavenet-waveplot-{}.png'.format(basenames[i])) 125 | util.waveplot(plot_filename, generated_wav, None, hparams, title='WaveNet generated Waveform.') 126 | 127 | return audio_filenames 128 | 129 | def _check_conditions(self): 130 | local_condition = self._hparams.cin_channels > 0 131 | global_condition = self._hparams.gin_channels > 0 132 | return local_condition, global_condition 133 | 134 | 135 | def _pad_inputs(x, maxlen, _pad=0): 136 | return np.pad(x, [(0, maxlen - len(x)), (0, 0)], mode='constant', constant_values=_pad) 137 | -------------------------------------------------------------------------------- /wavenet_vocoder/train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | import time 5 | import traceback 6 | from datetime import datetime 7 | 8 | import infolog 9 | import librosa 10 | import numpy as np 11 | import tensorflow as tf 12 | from hparams import hparams_debug_string 13 | from datasets.audio import save_wavenet_wav, melspectrogram 14 | from tacotron.utils import ValueWindow 15 | from wavenet_vocoder.feeder import Feeder, _interp 16 | from wavenet_vocoder.models import create_model 17 | 18 | from . import util 19 | 20 | log = infolog.log 21 | 22 | 23 | def time_string(): 24 | return datetime.now().strftime('%Y-%m-%d %H:%M') 25 | 26 | def add_embedding_stats(summary_writer, embedding_names, paths_to_meta, checkpoint_path): 27 | #Create tensorboard projector 28 | config = tf.contrib.tensorboard.plugins.projector.ProjectorConfig() 29 | config.model_checkpoint_path = checkpoint_path 30 | 31 | for embedding_name, path_to_meta in zip(embedding_names, paths_to_meta): 32 | #Initialize config 33 | embedding = config.embeddings.add() 34 | #Specifiy the embedding variable and the metadata 35 | embedding.tensor_name = embedding_name 36 | embedding.metadata_path = path_to_meta 37 | 38 | #Project the embeddings to space dimensions for visualization 39 | tf.contrib.tensorboard.plugins.projector.visualize_embeddings(summary_writer, config) 40 | 41 | def add_train_stats(model, hparams): 42 | with tf.variable_scope('stats') as scope: 43 | for i in range(hparams.wavenet_num_gpus): 44 | tf.summary.histogram('wav_outputs %d' % i, model.tower_y_hat_log[i]) 45 | tf.summary.histogram('wav_targets %d' % i, model.tower_y_log[i]) 46 | if model.tower_means[i] is not None: 47 | tf.summary.histogram('gaussian_means %d' % i, model.tower_means[i]) 48 | tf.summary.histogram('gaussian_log_scales %d' % i, model.tower_log_scales[i]) 49 | 50 | tf.summary.scalar('wavenet_learning_rate', model.learning_rate) 51 | tf.summary.scalar('wavenet_loss', model.loss) 52 | 53 | gradient_norms = [tf.norm(grad) for grad in model.gradients if grad is not None] 54 | tf.summary.histogram('gradient_norm', gradient_norms) 55 | tf.summary.scalar('max_gradient_norm', tf.reduce_max(gradient_norms)) #visualize gradients (in case of explosion) 56 | return tf.summary.merge_all() 57 | 58 | def add_test_stats(summary_writer, step, eval_loss, hparams): 59 | values = [ 60 | tf.Summary.Value(tag='Wavenet_eval_model/eval_stats/wavenet_eval_loss', simple_value=eval_loss), 61 | ] 62 | 63 | test_summary = tf.Summary(value=values) 64 | 
summary_writer.add_summary(test_summary, step) 65 | 66 | 67 | def create_shadow_saver(model, global_step=None): 68 | '''Load shadow variables of saved model. 69 | 70 | Inspired by: https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage 71 | 72 | Can also use: shadow_dict = model.ema.variables_to_restore() 73 | ''' 74 | #Add global step to saved variables to save checkpoints correctly 75 | shadow_variables = [model.ema.average_name(v) for v in model.variables] 76 | variables = model.variables 77 | 78 | if global_step is not None: 79 | shadow_variables += ['global_step'] 80 | variables += [global_step] 81 | 82 | shadow_dict = dict(zip(shadow_variables, variables)) #dict(zip(keys, values)) -> {key1: value1, key2: value2, ...} 83 | return tf.train.Saver(shadow_dict, max_to_keep=20) 84 | 85 | def load_averaged_model(sess, sh_saver, checkpoint_path): 86 | sh_saver.restore(sess, checkpoint_path) 87 | 88 | 89 | def eval_step(sess, global_step, model, plot_dir, wav_dir, summary_writer, hparams, model_name): 90 | '''Evaluate model during training. 91 | Supposes that model variables are averaged. 92 | ''' 93 | start_time = time.time() 94 | y_hat, y_target, loss, input_mel, upsampled_features = sess.run([model.tower_y_hat[0], model.tower_y_target[0], 95 | model.eval_loss, model.tower_eval_c[0], model.tower_eval_upsampled_local_features[0]]) 96 | duration = time.time() - start_time 97 | log('Time Evaluation: Generation of {} audio frames took {:.3f} sec ({:.3f} frames/sec)'.format( 98 | len(y_target), duration, len(y_target)/duration)) 99 | 100 | #Make audio and plot paths 101 | pred_wav_path = os.path.join(wav_dir, 'step-{}-pred.wav'.format(global_step)) 102 | target_wav_path = os.path.join(wav_dir, 'step-{}-real.wav'.format(global_step)) 103 | plot_path = os.path.join(plot_dir, 'step-{}-waveplot.png'.format(global_step)) 104 | mel_path = os.path.join(plot_dir, 'step-{}-reconstruction-mel-spectrogram.png'.format(global_step)) 105 | upsampled_path = os.path.join(plot_dir, 'step-{}-upsampled-features.png'.format(global_step)) 106 | 107 | #Save figure 108 | util.waveplot(plot_path, y_hat, y_target, model._hparams, title='{}, {}, step={}, loss={:.5f}'.format(model_name, time_string(), global_step, loss)) 109 | log('Eval loss for global step {}: {:.3f}'.format(global_step, loss)) 110 | 111 | #Compare generated wav mel with original input mel to evaluate wavenet audio reconstruction performance 112 | #Both mels should match on low frequency information, wavenet mel should contain more high frequency detail when compared to Tacotron mels. 113 | T2_output_range = (-hparams.max_abs_value, hparams.max_abs_value) if hparams.symmetric_mels else (0, hparams.max_abs_value) 114 | generated_mel = _interp(melspectrogram(y_hat, hparams).T, T2_output_range) 115 | util.plot_spectrogram(generated_mel, mel_path, title='Local Condition vs Reconst. 
Mel-Spectrogram, step={}, loss={:.5f}'.format( 116 | global_step, loss), target_spectrogram=input_mel.T) 117 | util.plot_spectrogram(upsampled_features.T, upsampled_path, title='Upsampled Local Condition features, step={}, loss={:.5f}'.format( 118 | global_step, loss), auto_aspect=True) 119 | 120 | #Save Audio 121 | save_wavenet_wav(y_hat, pred_wav_path, sr=hparams.sample_rate, inv_preemphasize=hparams.preemphasize, k=hparams.preemphasis) 122 | save_wavenet_wav(y_target, target_wav_path, sr=hparams.sample_rate, inv_preemphasize=hparams.preemphasize, k=hparams.preemphasis) 123 | 124 | #Write eval summary to tensorboard 125 | log('Writing eval summary!') 126 | add_test_stats(summary_writer, global_step, loss, hparams=hparams) 127 | 128 | def save_log(sess, global_step, model, plot_dir, wav_dir, hparams, model_name): 129 | log('\nSaving intermediate states at step {}'.format(global_step)) 130 | idx = 0 131 | y_hat, y, loss, length, input_mel, upsampled_features = sess.run([model.tower_y_hat_log[0][idx], 132 | model.tower_y_log[0][idx], 133 | model.loss, 134 | model.tower_input_lengths[0][idx], 135 | model.tower_c[0][idx], model.tower_upsampled_local_features[0][idx]]) 136 | 137 | #mask by length 138 | y_hat[length:] = 0 139 | y[length:] = 0 140 | 141 | #Make audio and plot paths 142 | pred_wav_path = os.path.join(wav_dir, 'step-{}-pred.wav'.format(global_step)) 143 | target_wav_path = os.path.join(wav_dir, 'step-{}-real.wav'.format(global_step)) 144 | plot_path = os.path.join(plot_dir, 'step-{}-waveplot.png'.format(global_step)) 145 | mel_path = os.path.join(plot_dir, 'step-{}-reconstruction-mel-spectrogram.png'.format(global_step)) 146 | upsampled_path = os.path.join(plot_dir, 'step-{}-upsampled-features.png'.format(global_step)) 147 | 148 | #Save figure 149 | util.waveplot(plot_path, y_hat, y, hparams, title='{}, {}, step={}, loss={:.5f}'.format(model_name, time_string(), global_step, loss)) 150 | 151 | #Compare generated wav mel with original input mel to evaluate wavenet audio reconstruction performance 152 | #Both mels should match on low frequency information, wavenet mel should contain more high frequency detail when compared to Tacotron mels. 153 | T2_output_range = (-hparams.max_abs_value, hparams.max_abs_value) if hparams.symmetric_mels else (0, hparams.max_abs_value) 154 | generated_mel = _interp(melspectrogram(y_hat, hparams).T, T2_output_range) 155 | util.plot_spectrogram(generated_mel, mel_path, title='Local Condition vs Reconst. 
Mel-Spectrogram, step={}, loss={:.5f}'.format( 156 | global_step, loss), target_spectrogram=input_mel.T) 157 | util.plot_spectrogram(upsampled_features.T, upsampled_path, title='Upsampled Local Condition features, step={}, loss={:.5f}'.format( 158 | global_step, loss), auto_aspect=True) 159 | 160 | #Save audio 161 | save_wavenet_wav(y_hat, pred_wav_path, sr=hparams.sample_rate, inv_preemphasize=hparams.preemphasize, k=hparams.preemphasis) 162 | save_wavenet_wav(y, target_wav_path, sr=hparams.sample_rate, inv_preemphasize=hparams.preemphasize, k=hparams.preemphasis) 163 | 164 | def save_checkpoint(sess, saver, checkpoint_path, global_step): 165 | saver.save(sess, checkpoint_path, global_step=global_step) 166 | 167 | 168 | def model_train_mode(args, feeder, hparams, global_step, init=False): 169 | with tf.variable_scope('WaveNet_model', reuse=tf.AUTO_REUSE) as scope: 170 | model_name = None 171 | if args.model == 'Tacotron-2': 172 | model_name = 'WaveNet' 173 | model = create_model(model_name or args.model, hparams, init) 174 | #initialize model to train mode 175 | model.initialize(feeder.targets, feeder.local_condition_features, feeder.global_condition_features, 176 | feeder.input_lengths, x=feeder.inputs) 177 | model.add_loss() 178 | model.add_optimizer(global_step) 179 | stats = add_train_stats(model, hparams) 180 | return model, stats 181 | 182 | def model_test_mode(args, feeder, hparams, global_step): 183 | with tf.variable_scope('WaveNet_model', reuse=tf.AUTO_REUSE) as scope: 184 | model_name = None 185 | if args.model == 'Tacotron-2': 186 | model_name = 'WaveNet' 187 | model = create_model(model_name or args.model, hparams) 188 | #initialize model to test mode 189 | model.initialize(feeder.eval_targets, feeder.eval_local_condition_features, feeder.eval_global_condition_features, 190 | feeder.eval_input_lengths) 191 | model.add_loss() 192 | return model 193 | 194 | def train(log_dir, args, hparams, input_path): 195 | save_dir = os.path.join(log_dir, 'wave_pretrained') 196 | plot_dir = os.path.join(log_dir, 'plots') 197 | wav_dir = os.path.join(log_dir, 'wavs') 198 | eval_dir = os.path.join(log_dir, 'eval-dir') 199 | eval_plot_dir = os.path.join(eval_dir, 'plots') 200 | eval_wav_dir = os.path.join(eval_dir, 'wavs') 201 | tensorboard_dir = os.path.join(log_dir, 'wavenet_events') 202 | meta_folder = os.path.join(log_dir, 'metas') 203 | os.makedirs(save_dir, exist_ok=True) 204 | os.makedirs(plot_dir, exist_ok=True) 205 | os.makedirs(wav_dir, exist_ok=True) 206 | os.makedirs(eval_dir, exist_ok=True) 207 | os.makedirs(eval_plot_dir, exist_ok=True) 208 | os.makedirs(eval_wav_dir, exist_ok=True) 209 | os.makedirs(tensorboard_dir, exist_ok=True) 210 | os.makedirs(meta_folder, exist_ok=True) 211 | 212 | checkpoint_path = os.path.join(save_dir, 'wavenet_model.ckpt') 213 | input_path = os.path.join(args.base_dir, input_path) 214 | 215 | log('Checkpoint_path: {}'.format(checkpoint_path)) 216 | log('Loading training data from: {}'.format(input_path)) 217 | log('Using model: {}'.format(args.model)) 218 | log(hparams_debug_string()) 219 | 220 | #Start by setting a seed for repeatability 221 | tf.set_random_seed(hparams.wavenet_random_seed) 222 | 223 | #Set up data feeder 224 | coord = tf.train.Coordinator() 225 | with tf.variable_scope('datafeeder') as scope: 226 | feeder = Feeder(coord, input_path, args.base_dir, hparams) 227 | 228 | #Set up model 229 | global_step = tf.Variable(0, name='global_step', trainable=False) 230 | model, stats = model_train_mode(args, feeder, hparams, global_step) 231 | 
eval_model = model_test_mode(args, feeder, hparams, global_step) 232 | 233 | #Speaker Embeddings metadata 234 | if hparams.speakers_path is not None: 235 | speaker_embedding_meta = hparams.speakers_path 236 | 237 | else: 238 | speaker_embedding_meta = os.path.join(meta_folder, 'SpeakerEmbeddings.tsv') 239 | if not os.path.isfile(speaker_embedding_meta): 240 | with open(speaker_embedding_meta, 'w', encoding='utf-8') as f: 241 | for speaker in hparams.speakers: 242 | f.write('{}\n'.format(speaker)) 243 | 244 | speaker_embedding_meta = speaker_embedding_meta.replace(log_dir, '..') 245 | 246 | #book keeping 247 | step = 0 248 | time_window = ValueWindow(100) 249 | loss_window = ValueWindow(100) 250 | sh_saver = create_shadow_saver(model, global_step) 251 | 252 | log('Wavenet training set to a maximum of {} steps'.format(args.wavenet_train_steps)) 253 | 254 | #Memory allocation on the memory 255 | config = tf.ConfigProto() 256 | config.gpu_options.allow_growth = True 257 | config.allow_soft_placement = True 258 | run_init = False 259 | 260 | #Train 261 | with tf.Session(config=config) as sess: 262 | try: 263 | summary_writer = tf.summary.FileWriter(tensorboard_dir, sess.graph) 264 | sess.run(tf.global_variables_initializer()) 265 | 266 | #saved model restoring 267 | if args.restore: 268 | # Restore saved model if the user requested it, default = True 269 | try: 270 | checkpoint_state = tf.train.get_checkpoint_state(save_dir) 271 | 272 | if (checkpoint_state and checkpoint_state.model_checkpoint_path): 273 | log('Loading checkpoint {}'.format(checkpoint_state.model_checkpoint_path), slack=True) 274 | load_averaged_model(sess, sh_saver, checkpoint_state.model_checkpoint_path) 275 | else: 276 | log('No model to load at {}'.format(save_dir), slack=True) 277 | if hparams.wavenet_weight_normalization: 278 | run_init = True 279 | 280 | except tf.errors.OutOfRangeError as e: 281 | log('Cannot restore checkpoint: {}'.format(e), slack=True) 282 | else: 283 | log('Starting new training!', slack=True) 284 | if hparams.wavenet_weight_normalization: 285 | run_init = True 286 | 287 | if run_init: 288 | log('\nApplying Weight normalization in fresh training. Applying data dependent initialization forward pass..') 289 | #Create init_model 290 | init_model, _ = model_train_mode(args, feeder, hparams, global_step, init=True) 291 | 292 | #initializing feeder 293 | feeder.start_threads(sess) 294 | 295 | if run_init: 296 | #Run one forward pass for model parameters initialization (make prediction on init_batch) 297 | _ = sess.run(init_model.tower_y_hat) 298 | log('Data dependent initialization done. 
Starting training!') 299 | 300 | #Training loop 301 | while not coord.should_stop() and step < args.wavenet_train_steps: 302 | start_time = time.time() 303 | step, loss, opt = sess.run([global_step, model.loss, model.optimize]) 304 | time_window.append(time.time() - start_time) 305 | loss_window.append(loss) 306 | 307 | message = 'Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]'.format( 308 | step, time_window.average, loss, loss_window.average) 309 | log(message, end='\r', slack=(step % args.checkpoint_interval == 0)) 310 | 311 | if np.isnan(loss) or loss > 100: 312 | log('Loss exploded to {:.5f} at step {}'.format(loss, step)) 313 | raise Exception('Loss exploded') 314 | 315 | if step % args.summary_interval == 0: 316 | log('\nWriting summary at step {}'.format(step)) 317 | summary_writer.add_summary(sess.run(stats), step) 318 | 319 | if step % args.checkpoint_interval == 0 or step == args.wavenet_train_steps: 320 | save_log(sess, step, model, plot_dir, wav_dir, hparams=hparams, model_name=args.model) 321 | save_checkpoint(sess, sh_saver, checkpoint_path, global_step) 322 | 323 | if step % args.eval_interval == 0: 324 | log('\nEvaluating at step {}'.format(step)) 325 | eval_step(sess, step, eval_model, eval_plot_dir, eval_wav_dir, summary_writer=summary_writer , hparams=model._hparams, model_name=args.model) 326 | 327 | if hparams.gin_channels > 0 and (step % args.embedding_interval == 0 or step == args.wavenet_train_steps or step == 1): 328 | #Get current checkpoint state 329 | checkpoint_state = tf.train.get_checkpoint_state(save_dir) 330 | 331 | #Update Projector 332 | log('\nSaving Model Speaker Embeddings visualization..') 333 | add_embedding_stats(summary_writer, [model.embedding_table.name], [speaker_embedding_meta], checkpoint_state.model_checkpoint_path) 334 | log('WaveNet Speaker embeddings have been updated on tensorboard!') 335 | 336 | log('Wavenet training complete after {} global steps'.format(args.wavenet_train_steps), slack=True) 337 | return save_dir 338 | 339 | except Exception as e: 340 | log('Exiting due to exception: {}'.format(e), slack=True) 341 | traceback.print_exc() 342 | coord.request_stop(e) 343 | 344 | 345 | def wavenet_train(args, log_dir, hparams, input_path): 346 | return train(log_dir, args, hparams, input_path) 347 | -------------------------------------------------------------------------------- /wavenet_vocoder/util.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | matplotlib.use('Agg') 3 | import matplotlib.pyplot as plt 4 | 5 | import librosa.display as dsp 6 | import numpy as np 7 | import tensorflow as tf 8 | 9 | 10 | def _assert_valid_input_type(s): 11 | assert s == 'mulaw-quantize' or s == 'mulaw' or s == 'raw' 12 | 13 | def is_mulaw_quantize(s): 14 | _assert_valid_input_type(s) 15 | return s == 'mulaw-quantize' 16 | 17 | def is_mulaw(s): 18 | _assert_valid_input_type(s) 19 | return s == 'mulaw' 20 | 21 | def is_raw(s): 22 | _assert_valid_input_type(s) 23 | return s == 'raw' 24 | 25 | def is_scalar_input(s): 26 | return is_raw(s) or is_mulaw(s) 27 | 28 | 29 | #From https://github.com/r9y9/nnmnkwii/blob/master/nnmnkwii/preprocessing/generic.py 30 | def mulaw(x, mu=256): 31 | """Mu-Law companding 32 | Method described in paper [1]_. 33 | .. math:: 34 | f(x) = sign(x) ln (1 + mu |x|) / ln (1 + mu) 35 | Args: 36 | x (array-like): Input signal. Each value of input signal must be in 37 | range of [-1, 1]. 38 | mu (number): Compression parameter ``μ``. 
39 | Returns: 40 | array-like: Compressed signal ([-1, 1]) 41 | See also: 42 | :func:`nnmnkwii.preprocessing.inv_mulaw` 43 | :func:`nnmnkwii.preprocessing.mulaw_quantize` 44 | :func:`nnmnkwii.preprocessing.inv_mulaw_quantize` 45 | .. [1] Brokish, Charles W., and Michele Lewis. "A-law and mu-law companding 46 | implementations using the tms320c54x." SPRA163 (1997). 47 | """ 48 | mu = 255 49 | return _sign(x) * _log1p(mu * _abs(x)) / _log1p(mu) 50 | 51 | 52 | def inv_mulaw(y, mu=256): 53 | """Inverse of mu-law companding (mu-law expansion) 54 | .. math:: 55 | f^{-1}(x) = sign(y) (1 / mu) (1 + mu)^{|y|} - 1) 56 | Args: 57 | y (array-like): Compressed signal. Each value of input signal must be in 58 | range of [-1, 1]. 59 | mu (number): Compression parameter ``μ``. 60 | Returns: 61 | array-like: Uncomprresed signal (-1 <= x <= 1) 62 | See also: 63 | :func:`nnmnkwii.preprocessing.inv_mulaw` 64 | :func:`nnmnkwii.preprocessing.mulaw_quantize` 65 | :func:`nnmnkwii.preprocessing.inv_mulaw_quantize` 66 | """ 67 | mu = 255 68 | return _sign(y) * (1.0 / mu) * ((1.0 + mu)**_abs(y) - 1.0) 69 | 70 | 71 | def mulaw_quantize(x, mu=256): 72 | """Mu-Law companding + quantize 73 | Args: 74 | x (array-like): Input signal. Each value of input signal must be in 75 | range of [-1, 1]. 76 | mu (number): Compression parameter ``μ``. 77 | Returns: 78 | array-like: Quantized signal (dtype=int) 79 | - y ∈ [0, mu] if x ∈ [-1, 1] 80 | - y ∈ [0, mu) if x ∈ [-1, 1) 81 | .. note:: 82 | If you want to get quantized values of range [0, mu) (not [0, mu]), 83 | then you need to provide input signal of range [-1, 1). 84 | Examples: 85 | >>> from scipy.io import wavfile 86 | >>> import pysptk 87 | >>> import numpy as np 88 | >>> from nnmnkwii import preprocessing as P 89 | >>> fs, x = wavfile.read(pysptk.util.example_audio_file()) 90 | >>> x = (x / 32768.0).astype(np.float32) 91 | >>> y = P.mulaw_quantize(x) 92 | >>> print(y.min(), y.max(), y.dtype) 93 | 15 246 int64 94 | See also: 95 | :func:`nnmnkwii.preprocessing.mulaw` 96 | :func:`nnmnkwii.preprocessing.inv_mulaw` 97 | :func:`nnmnkwii.preprocessing.inv_mulaw_quantize` 98 | """ 99 | mu = 255 100 | y = mulaw(x, mu) 101 | # scale [-1, 1] to [0, mu] 102 | return _asint((y + 1) / 2 * mu) 103 | 104 | 105 | def inv_mulaw_quantize(y, mu=256): 106 | """Inverse of mu-law companding + quantize 107 | Args: 108 | y (array-like): Quantized signal (∈ [0, mu]). 109 | mu (number): Compression parameter ``μ``. 
110 | 	Returns:
111 | 		array-like: Uncompressed signal ([-1, 1])
112 | 	Examples:
113 | 		>>> from scipy.io import wavfile
114 | 		>>> import pysptk
115 | 		>>> import numpy as np
116 | 		>>> from nnmnkwii import preprocessing as P
117 | 		>>> fs, x = wavfile.read(pysptk.util.example_audio_file())
118 | 		>>> x = (x / 32768.0).astype(np.float32)
119 | 		>>> x_hat = P.inv_mulaw_quantize(P.mulaw_quantize(x))
120 | 		>>> x_hat = (x_hat * 32768).astype(np.int16)
121 | 	See also:
122 | 		:func:`nnmnkwii.preprocessing.mulaw`
123 | 		:func:`nnmnkwii.preprocessing.inv_mulaw`
124 | 		:func:`nnmnkwii.preprocessing.mulaw_quantize`
125 | 	"""
126 | 	# [0, mu] to [-1, 1]
127 | 	mu = 255  #mu is fixed to 255; see mulaw()
128 | 	y = 2 * _asfloat(y) / mu - 1
129 | 	return inv_mulaw(y, mu)
130 | 
131 | def _sign(x):
132 | 	#wrapper to support tensorflow tensors/numpy arrays
133 | 	isnumpy = isinstance(x, np.ndarray)
134 | 	isscalar = np.isscalar(x)
135 | 	return np.sign(x) if (isnumpy or isscalar) else tf.sign(x)
136 | 
137 | 
138 | def _log1p(x):
139 | 	#wrapper to support tensorflow tensors/numpy arrays
140 | 	isnumpy = isinstance(x, np.ndarray)
141 | 	isscalar = np.isscalar(x)
142 | 	return np.log1p(x) if (isnumpy or isscalar) else tf.log1p(x)
143 | 
144 | 
145 | def _abs(x):
146 | 	#wrapper to support tensorflow tensors/numpy arrays
147 | 	isnumpy = isinstance(x, np.ndarray)
148 | 	isscalar = np.isscalar(x)
149 | 	return np.abs(x) if (isnumpy or isscalar) else tf.abs(x)
150 | 
151 | 
152 | def _asint(x):
153 | 	#wrapper to support tensorflow tensors/numpy arrays
154 | 	isnumpy = isinstance(x, np.ndarray)
155 | 	isscalar = np.isscalar(x)
156 | 	return x.astype(np.int64) if isnumpy else int(x) if isscalar else tf.cast(x, tf.int32)
157 | 
158 | 
159 | def _asfloat(x):
160 | 	#wrapper to support tensorflow tensors/numpy arrays
161 | 	isnumpy = isinstance(x, np.ndarray)
162 | 	isscalar = np.isscalar(x)
163 | 	return x.astype(np.float32) if isnumpy else float(x) if isscalar else tf.cast(x, tf.float32)
164 | 
165 | def sequence_mask(input_lengths, max_len=None, expand=True):
166 | 	if max_len is None:
167 | 		max_len = tf.reduce_max(input_lengths)
168 | 
169 | 	if expand:
170 | 		return tf.expand_dims(tf.sequence_mask(input_lengths, max_len, dtype=tf.float32), axis=-1)
171 | 	return tf.sequence_mask(input_lengths, max_len, dtype=tf.float32)
172 | 
173 | 
174 | def waveplot(path, y_hat, y_target, hparams, title=None):
175 | 	sr = hparams.sample_rate
176 | 
177 | 	fig = plt.figure(figsize=(12, 4))
178 | 	if y_target is not None:
179 | 		ax = plt.subplot(3, 1, 1)
180 | 		dsp.waveplot(y_target, sr=sr)
181 | 		ax.set_title('Target waveform')
182 | 		ax = plt.subplot(3, 1, 2)
183 | 		dsp.waveplot(y_hat, sr=sr)
184 | 		ax.set_title('Predicted waveform')
185 | 	else:
186 | 		ax = plt.subplot(2, 1, 1)
187 | 		dsp.waveplot(y_hat, sr=sr)
188 | 		ax.set_title('Generated waveform')
189 | 
190 | 	if title is not None:
191 | 		# Set common labels
192 | 		fig.text(0.5, 0.18, title, horizontalalignment='center', fontsize=16)
193 | 
194 | 	plt.tight_layout()
195 | 	plt.savefig(path, format="png")
196 | 	plt.close()
197 | 
198 | def plot_spectrogram(pred_spectrogram, path, title=None, split_title=False, target_spectrogram=None, max_len=None, auto_aspect=False):
199 | 	if max_len is not None:
200 | 		target_spectrogram = target_spectrogram[:max_len] if target_spectrogram is not None else None
201 | 		pred_spectrogram = pred_spectrogram[:max_len]
202 | 
203 | 	if split_title:
204 | 		title = split_title_line(title)  #note: split_title_line is not defined in this module and must be provided by the caller
205 | 
206 | 	fig = plt.figure(figsize=(10, 8))
207 | 	# Set common labels
208 | 	fig.text(0.5, 0.18, title, horizontalalignment='center', fontsize=16)
209 | 
210 | 	#target spectrogram subplot
211 | 	if target_spectrogram is not None:
212 | 		ax1 = fig.add_subplot(311)
213 | 		ax2 = fig.add_subplot(312)
214 | 
215 | 		if auto_aspect:
216 | 			im = ax1.imshow(np.rot90(target_spectrogram), aspect='auto', interpolation='none')
217 | 		else:
218 | 			im = ax1.imshow(np.rot90(target_spectrogram), interpolation='none')
219 | 		ax1.set_title('Target Mel-Spectrogram')
220 | 		fig.colorbar(mappable=im, shrink=0.65, orientation='horizontal', ax=ax1)
221 | 		ax2.set_title('Predicted Mel-Spectrogram')
222 | 	else:
223 | 		ax2 = fig.add_subplot(211)
224 | 
225 | 	if auto_aspect:
226 | 		im = ax2.imshow(np.rot90(pred_spectrogram), aspect='auto', interpolation='none')
227 | 	else:
228 | 		im = ax2.imshow(np.rot90(pred_spectrogram), interpolation='none')
229 | 	fig.colorbar(mappable=im, shrink=0.65, orientation='horizontal', ax=ax2)
230 | 
231 | 	plt.tight_layout()
232 | 	plt.savefig(path, format='png')
233 | 	plt.close()
234 | 
--------------------------------------------------------------------------------
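The mu-law helpers in wavenet_vocoder/util.py carry the WaveNet input pipeline when the input type is set to 'mulaw-quantize': a waveform in [-1, 1] is companded, mapped to integer class labels in [0, 255], and expanded back to a float waveform after synthesis. Below is a minimal round-trip sketch, not part of the repository, assuming its requirements (numpy, tensorflow, librosa, matplotlib) are installed and the repository root is on PYTHONPATH; the toy sine signal is purely illustrative.

    import numpy as np
    from wavenet_vocoder.util import mulaw_quantize, inv_mulaw_quantize

    # Toy 1-second sine at 440 Hz, normalized to [-1, 1] like a preprocessed waveform.
    x = np.sin(2 * np.pi * 440.0 * np.arange(16000) / 16000.0).astype(np.float32)

    # Compand + quantize: integer classes in [0, 255] (mu is fixed to 255 internally).
    y = mulaw_quantize(x)
    assert 0 <= y.min() and y.max() <= 255

    # Expand back to a float waveform in [-1, 1] for listening or metrics.
    x_hat = inv_mulaw_quantize(y)
    print('max round-trip error:', np.abs(x - x_hat).max())

The same functions also accept TensorFlow tensors, since the private _sign/_log1p/_abs/_asint/_asfloat wrappers dispatch to numpy or tf ops depending on the input type.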