├── .gitignore ├── LICENSE ├── README.md ├── datasets ├── __init__.py ├── audio.py ├── preprocessor.py └── wavenet_preprocessor.py ├── docker └── Dockerfile ├── griffin_lim_synthesis_tool.ipynb ├── hparams.py ├── infolog.py ├── paper_hparams.py ├── papers ├── (content+location) attention.pdf ├── ClariNet.pdf ├── Tacotron 2 revised.pdf ├── bahdanau (content) attention.pdf ├── deepvoice 3.pdf ├── effective approaches attention.pdf ├── fast_wavenet.pdf ├── tacotron.pdf ├── tacotron2.pdf └── wavenet.pdf ├── preprocess.py ├── requirements.txt ├── sentences.txt ├── synthesize.py ├── tacotron ├── __init__.py ├── feeder.py ├── models │ ├── Architecture_wrappers.py │ ├── __init__.py │ ├── attention.py │ ├── custom_decoder.py │ ├── helpers.py │ ├── modules.py │ └── tacotron.py ├── synthesize.py ├── synthesizer.py ├── train.py └── utils │ ├── __init__.py │ ├── cleaners.py │ ├── cmudict.py │ ├── numbers.py │ ├── plot.py │ ├── symbols.py │ └── text.py ├── test_wavenet_feeder.py ├── train.py ├── wavenet_preprocess.py └── wavenet_vocoder ├── __init__.py ├── feeder.py ├── models ├── __init__.py ├── gaussian.py ├── mixture.py ├── modules.py └── wavenet.py ├── synthesize.py ├── synthesizer.py ├── train.py └── util.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # Tacotron 2 oddities 107 | logs-*/ 108 | training_data/ 109 | 110 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Rayhane Mama 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Tacotron-2: 2 | Tensorflow implementation of DeepMind's Tacotron-2. A deep neural network architecture described in this paper: [Natural TTS synthesis by conditioning Wavenet on MEL spectogram predictions](https://arxiv.org/pdf/1712.05884.pdf) 3 | 4 | This Repository contains additional improvements and attempts over the paper, we thus propose **paper_hparams.py** file which holds the exact hyperparameters to reproduce the paper results without any additional extras. 5 | 6 | Suggested **hparams.py** file which is default in use, contains the hyperparameters with extras that proved to provide better results in most cases. Feel free to toy with the parameters as needed. 7 | 8 | DIFFERENCES WILL BE HIGHLIGHTED IN DOCUMENTATION SHORTLY. 
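Individual hyperparameters can also be overridden at run time through the **--hparams** flag of the preprocessing/training/synthesis scripts, passed as a comma-separated list of name=value pairs (the names exist in **hparams.py**; the values below are purely illustrative):

> python preprocess.py --hparams='sample_rate=22050,n_fft=2048'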
9 | 10 | 11 | # Repository Structure: 12 | Tacotron-2 13 | ├── datasets 14 | ├── en_UK (0) 15 | │   └── by_book 16 | │   └── female 17 | ├── en_US (0) 18 | │   └── by_book 19 | │   ├── female 20 | │   └── male 21 | ├── LJSpeech-1.1 (0) 22 | │   └── wavs 23 | ├── logs-Tacotron (2) 24 | │   ├── eval_-dir 25 | │   │  ├── plots 26 | │  │  └── wavs 27 | │   ├── mel-spectrograms 28 | │   ├── plots 29 | │   ├── taco_pretrained 30 | │   ├── metas 31 | │   └── wavs 32 | ├── logs-Wavenet (4) 33 | │   ├── eval-dir 34 | │   │  ├── plots 35 | │  │  └── wavs 36 | │   ├── plots 37 | │   ├── wave_pretrained 38 | │   ├── metas 39 | │   └── wavs 40 | ├── logs-Tacotron-2 ( * ) 41 | │   ├── eval-dir 42 | │   │  ├── plots 43 | │  │  └── wavs 44 | │   ├── plots 45 | │   ├── taco_pretrained 46 | │   ├── wave_pretrained 47 | │   ├── metas 48 | │   └── wavs 49 | ├── papers 50 | ├── tacotron 51 | │   ├── models 52 | │   └── utils 53 | ├── tacotron_output (3) 54 | │   ├── eval 55 | │   ├── gta 56 | │   ├── logs-eval 57 | │   │   ├── plots 58 | │   │   └── wavs 59 | │   └── natural 60 | ├── wavenet_output (5) 61 | │   ├── plots 62 | │   └── wavs 63 | ├── training_data (1) 64 | │   ├── audio 65 | │   ├── linear 66 | │ └── mels 67 | └── wavenet_vocoder 68 | └── models 69 | 70 | 71 | The previous tree shows the current state of the repository (separate training, one step at a time). 72 | 73 | - Step **(0)**: Get your dataset, here I have set the examples of **Ljspeech**, **en_US** and **en_UK** (from **M-AILABS**). 74 | - Step **(1)**: Preprocess your data. This will give you the **training_data** folder. 75 | - Step **(2)**: Train your Tacotron model. Yields the **logs-Tacotron** folder. 76 | - Step **(3)**: Synthesize/Evaluate the Tacotron model. Gives the **tacotron_output** folder. 77 | - Step **(4)**: Train your Wavenet model. Yield the **logs-Wavenet** folder. 78 | - Step **(5)**: Synthesize audio using the Wavenet model. Gives the **wavenet_output** folder. 79 | 80 | - Note: Steps 2, 3, and 4 can be made with a simple run for both Tacotron and WaveNet (Tacotron-2, step ( * )). 81 | 82 | 83 | Note: 84 | - **Our preprocessing only supports Ljspeech and Ljspeech-like datasets (M-AILABS speech data)!** If running on datasets stored differently, you will probably need to make your own preprocessing script. 85 | - In the previous tree, files **were not represented** and **max depth was set to 3** for simplicity. 86 | - If you run training of both **models at the same time**, repository structure will be different. 87 | 88 | # Pretrained model and Samples: 89 | Pre-trained models and audio samples will be added at a later date. You can however check some primary insights of the model performance (at early stages of training) [here](https://github.com/Rayhane-mamah/Tacotron-2/issues/4#issuecomment-378741465). THIS IS VERY OUTDATED, I WILL UPDATE THIS SOON 90 | 91 | # Model Architecture: 92 |
93 | [Tacotron-2 architecture diagram] 94 |
95 | 96 | The model described by the authors can be divided in two parts: 97 | - Spectrogram prediction network 98 | - Wavenet vocoder 99 | 100 | To have an in-depth exploration of the model architecture, training procedure and preprocessing logic, refer to [our wiki](https://github.com/Rayhane-mamah/Tacotron-2/wiki) 101 | 102 | # Current state: 103 | 104 | To have an overview of our advance on this project, please refer to [this discussion](https://github.com/Rayhane-mamah/Tacotron-2/issues/4) 105 | 106 | since the two parts of the global model are trained separately, we can start by training the feature prediction model to use his predictions later during the wavenet training. 107 | 108 | # How to start 109 | - **Machine Setup:** 110 | 111 | First, you need to have python 3 installed along with [Tensorflow](https://www.tensorflow.org/install/). 112 | 113 | Next, you need to install some Linux dependencies to ensure audio libraries work properly: 114 | 115 | > apt-get install -y libasound-dev portaudio19-dev libportaudio2 libportaudiocpp0 ffmpeg libav-tools 116 | 117 | Finally, you can install the requirements. If you are an Anaconda user: (else replace **pip** with **pip3** and **python** with **python3**) 118 | 119 | > pip install -r requirements.txt 120 | 121 | - **Docker:** 122 | 123 | Alternatively, one can build the **docker image** to ensure everything is setup automatically and use the project inside the docker containers. 124 | **Dockerfile is insider "docker" folder** 125 | 126 | docker image can be built with: 127 | 128 | > docker build -t tacotron-2_image docker/ 129 | 130 | Then containers are runnable with: 131 | 132 | > docker run -i --name new_container tacotron-2_image 133 | 134 | Please report any issues with the Docker usage with our models, I'll get to it. Thanks! 135 | 136 | # Dataset: 137 | We tested the code above on the [ljspeech dataset](https://keithito.com/LJ-Speech-Dataset/), which has almost 24 hours of labeled single actress voice recording. (further info on the dataset are available in the README file when you download it) 138 | 139 | We are also running current tests on the [new M-AILABS speech dataset](http://www.m-ailabs.bayern/en/the-mailabs-speech-dataset/) which contains more than 700h of speech (more than 80 Gb of data) for more than 10 languages. 140 | 141 | After **downloading** the dataset, **extract** the compressed file, and **place the folder inside the cloned repository.** 142 | 143 | # Hparams setting: 144 | Before proceeding, you must pick the hyperparameters that suit best your needs. While it is possible to change the hyper parameters from command line during preprocessing/training, I still recommend making the changes once and for all on the **hparams.py** file directly. 145 | 146 | To pick optimal fft parameters, I have made a **griffin_lim_synthesis_tool** notebook that you can use to invert real extracted mel/linear spectrograms and choose how good your preprocessing is. All other options are well explained in the **hparams.py** and have meaningful names so that you can try multiple things with them. 147 | 148 | AWAIT DOCUMENTATION ON HPARAMS SHORTLY!! 149 | 150 | # Preprocessing 151 | Before running the following steps, please make sure you are inside **Tacotron-2 folder** 152 | 153 | > cd Tacotron-2 154 | 155 | Preprocessing can then be started using: 156 | 157 | > python preprocess.py 158 | 159 | dataset can be chosen using the **--dataset** argument. 
If using M-AILABS dataset, you need to provide the **language, voice, reader, merge_books and book arguments** for your custom need. Default is **Ljspeech**. 160 | 161 | Example M-AILABS: 162 | 163 | > python preprocess.py --dataset='M-AILABS' --language='en_US' --voice='female' --reader='mary_ann' --merge_books=False --book='northandsouth' 164 | 165 | or if you want to use all books for a single speaker: 166 | 167 | > python preprocess.py --dataset='M-AILABS' --language='en_US' --voice='female' --reader='mary_ann' --merge_books=True 168 | 169 | This should take no longer than a **few minutes.** 170 | 171 | # Training: 172 | To **train both models** sequentially (one after the other): 173 | 174 | > python train.py --model='Tacotron-2' 175 | 176 | 177 | Feature prediction model can **separately** be **trained** using: 178 | 179 | > python train.py --model='Tacotron' 180 | 181 | checkpoints will be made each **5000 steps** and stored under **logs-Tacotron folder.** 182 | 183 | Naturally, **training the wavenet separately** is done by: 184 | 185 | > python train.py --model='WaveNet' 186 | 187 | logs will be stored inside **logs-Wavenet**. 188 | 189 | **Note:** 190 | - If model argument is not provided, training will default to Tacotron-2 model training. (both models) 191 | - Please refer to train arguments under [train.py](https://github.com/Rayhane-mamah/Tacotron-2/blob/master/train.py) for a set of options you can use. 192 | - It is now possible to make wavenet preprocessing alone using **wavenet_proprocess.py**. 193 | 194 | # Synthesis 195 | To **synthesize audio** in an **End-to-End** (text to audio) manner (both models at work): 196 | 197 | > python synthesize.py --model='Tacotron-2' 198 | 199 | For the spectrogram prediction network (separately), there are **three types** of mel spectrograms synthesis: 200 | 201 | - **Evaluation** (synthesis on custom sentences). This is what we'll usually use after having a full end to end model. 202 | 203 | > python synthesize.py --model='Tacotron' 204 | 205 | - **Natural synthesis** (let the model make predictions alone by feeding last decoder output to the next time step). 206 | 207 | > python synthesize.py --model='Tacotron' --mode='synthesis' --GTA=False 208 | 209 | 210 | - **Ground Truth Aligned synthesis** (DEFAULT: the model is assisted by true labels in a teacher forcing manner). This synthesis method is used when predicting mel spectrograms used to train the wavenet vocoder. (yields better results as stated in the paper) 211 | 212 | > python synthesize.py --model='Tacotron' --mode='synthesis' --GTA=True 213 | 214 | Synthesizing the **waveforms** conditionned on previously synthesized Mel-spectrograms (separately) can be done with: 215 | 216 | > python synthesize.py --model='WaveNet' 217 | 218 | **Note:** 219 | - If model argument is not provided, synthesis will default to Tacotron-2 model synthesis. (End-to-End TTS) 220 | - Please refer to synthesis arguments under [synthesize.py](https://github.com/Rayhane-mamah/Tacotron-2/blob/master/synthesize.py) for a set of options you can use. 
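To quickly check preprocessing/Griffin-Lim quality outside the notebook, the following is a minimal sketch (it assumes preprocessing has already produced **training_data/mels/**; the mel file name is just an example) that inverts a saved mel spectrogram back to audio with the utilities from **datasets/audio.py**:

```python
import os

import numpy as np

from datasets.audio import inv_mel_spectrogram, save_wav
from hparams import hparams

# Any mel produced by preprocess.py works here; this file name is illustrative.
mel_path = 'training_data/mels/mel-LJ001-0001.npy'
mel = np.load(mel_path)  # stored on disk as (frames, num_mels)

# inv_mel_spectrogram expects (num_mels, frames), hence the transpose.
wav = inv_mel_spectrogram(mel.T, hparams)

os.makedirs('wav_out', exist_ok=True)
save_wav(wav, os.path.join('wav_out', 'griffin_lim_check.wav'), sr=hparams.sample_rate)
```

This is essentially what **griffin_lim_synthesis_tool.ipynb** does, minus the plotting.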
221 | 222 | 223 | # References and Resources: 224 | - [Natural TTS synthesis by conditioning Wavenet on MEL spectogram predictions](https://arxiv.org/pdf/1712.05884.pdf) 225 | - [Original tacotron paper](https://arxiv.org/pdf/1703.10135.pdf) 226 | - [Attention-Based Models for Speech Recognition](https://arxiv.org/pdf/1506.07503.pdf) 227 | - [Wavenet: A generative model for raw audio](https://arxiv.org/pdf/1609.03499.pdf) 228 | - [Fast Wavenet](https://arxiv.org/pdf/1611.09482.pdf) 229 | - [r9y9/wavenet_vocoder](https://github.com/r9y9/wavenet_vocoder) 230 | - [keithito/tacotron](https://github.com/keithito/tacotron) 231 | 232 | -------------------------------------------------------------------------------- /datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------- /datasets/audio.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import librosa.filters 3 | import numpy as np 4 | import tensorflow as tf 5 | from scipy import signal 6 | from scipy.io import wavfile 7 | 8 | 9 | def load_wav(path, sr): 10 | return librosa.core.load(path, sr=sr)[0] 11 | 12 | def save_wav(wav, path, sr): 13 | wav *= 32767 / max(0.01, np.max(np.abs(wav))) 14 | #proposed by @dsmiller 15 | wavfile.write(path, sr, wav.astype(np.int16)) 16 | 17 | def save_wavenet_wav(wav, path, sr, inv_preemphasize, k): 18 | # wav = inv_preemphasis(wav, k, inv_preemphasize) 19 | wav *= 32767 / max(0.01, np.max(np.abs(wav))) 20 | wavfile.write(path, sr, wav.astype(np.int16)) 21 | 22 | def preemphasis(wav, k, preemphasize=True): 23 | if preemphasize: 24 | return signal.lfilter([1, -k], [1], wav) 25 | return wav 26 | 27 | def inv_preemphasis(wav, k, inv_preemphasize=True): 28 | if inv_preemphasize: 29 | return signal.lfilter([1], [1, -k], wav) 30 | return wav 31 | 32 | #From https://github.com/r9y9/wavenet_vocoder/blob/master/audio.py 33 | def start_and_end_indices(quantized, silence_threshold=2): 34 | for start in range(quantized.size): 35 | if abs(quantized[start] - 127) > silence_threshold: 36 | break 37 | for end in range(quantized.size - 1, 1, -1): 38 | if abs(quantized[end] - 127) > silence_threshold: 39 | break 40 | 41 | assert abs(quantized[start] - 127) > silence_threshold 42 | assert abs(quantized[end] - 127) > silence_threshold 43 | 44 | return start, end 45 | 46 | def trim_silence(wav, hparams): 47 | '''Trim leading and trailing silence 48 | 49 | Useful for M-AILABS dataset if we choose to trim the extra 0.5 silence at beginning and end. 50 | ''' 51 | #Thanks @begeekmyfriend and @lautjy for pointing out the params contradiction. These params are separate and tunable per dataset. 
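	#librosa.effects.trim returns (trimmed_signal, nonsilent_interval); the [0] below keeps only the trimmed signal.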
52 | return librosa.effects.trim(wav, top_db= hparams.trim_top_db, frame_length=hparams.trim_fft_size, hop_length=hparams.trim_hop_size)[0] 53 | 54 | def get_hop_size(hparams): 55 | hop_size = hparams.hop_size 56 | if hop_size is None: 57 | assert hparams.frame_shift_ms is not None 58 | hop_size = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate) 59 | return hop_size 60 | 61 | def linearspectrogram(wav, hparams): 62 | # D = _stft(preemphasis(wav, hparams.preemphasis, hparams.preemphasize), hparams) 63 | D = _stft(wav, hparams) 64 | S = _amp_to_db(np.abs(D)**hparams.magnitude_power, hparams) - hparams.ref_level_db 65 | 66 | if hparams.signal_normalization: 67 | return _normalize(S, hparams) 68 | return S 69 | 70 | def melspectrogram(wav, hparams): 71 | # D = _stft(preemphasis(wav, hparams.preemphasis, hparams.preemphasize), hparams) 72 | D = _stft(wav, hparams) 73 | S = _amp_to_db(_linear_to_mel(np.abs(D)**hparams.magnitude_power, hparams), hparams) - hparams.ref_level_db 74 | 75 | if hparams.signal_normalization: 76 | return _normalize(S, hparams) 77 | return S 78 | 79 | def inv_linear_spectrogram(linear_spectrogram, hparams): 80 | '''Converts linear spectrogram to waveform using librosa''' 81 | if hparams.signal_normalization: 82 | D = _denormalize(linear_spectrogram, hparams) 83 | else: 84 | D = linear_spectrogram 85 | 86 | S = _db_to_amp(D + hparams.ref_level_db)**(1/hparams.magnitude_power) #Convert back to linear 87 | 88 | if hparams.use_lws: 89 | processor = _lws_processor(hparams) 90 | D = processor.run_lws(S.astype(np.float64).T ** hparams.power) 91 | y = processor.istft(D).astype(np.float32) 92 | return inv_preemphasis(y, hparams.preemphasis, hparams.preemphasize) 93 | else: 94 | return inv_preemphasis(_griffin_lim(S ** hparams.power, hparams), hparams.preemphasis, hparams.preemphasize) 95 | 96 | 97 | def inv_mel_spectrogram(mel_spectrogram, hparams): 98 | '''Converts mel spectrogram to waveform using librosa''' 99 | if hparams.signal_normalization: 100 | D = _denormalize(mel_spectrogram, hparams) 101 | else: 102 | D = mel_spectrogram 103 | 104 | S = _mel_to_linear(_db_to_amp(D + hparams.ref_level_db)**(1/hparams.magnitude_power), hparams) # Convert back to linear 105 | 106 | if hparams.use_lws: 107 | processor = _lws_processor(hparams) 108 | D = processor.run_lws(S.astype(np.float64).T ** hparams.power) 109 | y = processor.istft(D).astype(np.float32) 110 | return inv_preemphasis(y, hparams.preemphasis, hparams.preemphasize) 111 | else: 112 | return inv_preemphasis(_griffin_lim(S ** hparams.power, hparams), hparams.preemphasis, hparams.preemphasize) 113 | 114 | ########################################################################################### 115 | # tensorflow Griffin-Lim 116 | # Thanks to @begeekmyfriend: https://github.com/begeekmyfriend/Tacotron-2/blob/mandarin-new/datasets/audio.py 117 | 118 | def inv_linear_spectrogram_tensorflow(spectrogram, hparams): 119 | '''Builds computational graph to convert spectrogram to waveform using TensorFlow. 120 | Unlike inv_spectrogram, this does NOT invert the preemphasis. The caller should call 121 | inv_preemphasis on the output after running the graph. 
122 | ''' 123 | if hparams.signal_normalization: 124 | D = _denormalize_tensorflow(spectrogram, hparams) 125 | else: 126 | D = linear_spectrogram 127 | 128 | S = tf.pow(_db_to_amp_tensorflow(D + hparams.ref_level_db), (1/hparams.magnitude_power)) 129 | return _griffin_lim_tensorflow(tf.pow(S, hparams.power), hparams) 130 | 131 | def inv_mel_spectrogram_tensorflow(mel_spectrogram, hparams): 132 | '''Builds computational graph to convert mel spectrogram to waveform using TensorFlow. 133 | Unlike inv_mel_spectrogram, this does NOT invert the preemphasis. The caller should call 134 | inv_preemphasis on the output after running the graph. 135 | ''' 136 | if hparams.signal_normalization: 137 | D = _denormalize_tensorflow(mel_spectrogram, hparams) 138 | else: 139 | D = mel_spectrogram 140 | 141 | S = tf.pow(_db_to_amp_tensorflow(D + hparams.ref_level_db), (1/hparams.magnitude_power)) 142 | S = _mel_to_linear_tensorflow(S, hparams) # Convert back to linear 143 | return _griffin_lim_tensorflow(tf.pow(S, hparams.power), hparams) 144 | 145 | ########################################################################################### 146 | 147 | def _lws_processor(hparams): 148 | import lws 149 | return lws.lws(hparams.n_fft, get_hop_size(hparams), fftsize=hparams.win_size, mode="speech") 150 | 151 | def _griffin_lim(S, hparams): 152 | '''librosa implementation of Griffin-Lim 153 | Based on https://github.com/librosa/librosa/issues/434 154 | ''' 155 | angles = np.exp(2j * np.pi * np.random.rand(*S.shape)) 156 | S_complex = np.abs(S).astype(np.complex) 157 | y = _istft(S_complex * angles, hparams) 158 | for i in range(hparams.griffin_lim_iters): 159 | angles = np.exp(1j * np.angle(_stft(y, hparams))) 160 | y = _istft(S_complex * angles, hparams) 161 | return y 162 | 163 | def _griffin_lim_tensorflow(S, hparams): 164 | '''TensorFlow implementation of Griffin-Lim 165 | Based on https://github.com/Kyubyong/tensorflow-exercises/blob/master/Audio_Processing.ipynb 166 | ''' 167 | with tf.variable_scope('griffinlim'): 168 | # TensorFlow's stft and istft operate on a batch of spectrograms; create batch of size 1 169 | S = tf.expand_dims(S, 0) 170 | S_complex = tf.identity(tf.cast(S, dtype=tf.complex64)) 171 | y = tf.contrib.signal.inverse_stft(S_complex, hparams.win_size, get_hop_size(hparams), hparams.n_fft) 172 | for i in range(hparams.griffin_lim_iters): 173 | est = tf.contrib.signal.stft(y, hparams.win_size, get_hop_size(hparams), hparams.n_fft) 174 | angles = est / tf.cast(tf.maximum(1e-8, tf.abs(est)), tf.complex64) 175 | y = tf.contrib.signal.inverse_stft(S_complex * angles, hparams.win_size, get_hop_size(hparams), hparams.n_fft) 176 | return tf.squeeze(y, 0) 177 | 178 | def _stft(y, hparams): 179 | if hparams.use_lws: 180 | return _lws_processor(hparams).stft(y).T 181 | else: 182 | return librosa.stft(y=y, n_fft=hparams.n_fft, hop_length=get_hop_size(hparams), win_length=hparams.win_size, pad_mode='constant') 183 | 184 | def _istft(y, hparams): 185 | return librosa.istft(y, hop_length=get_hop_size(hparams), win_length=hparams.win_size) 186 | 187 | ########################################################## 188 | #Those are only correct when using lws!!! (This was messing with Wavenet quality for a long time!) 
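#Worked example with illustrative numbers (not actual hparams): for a 16000-sample signal,
#fsize=2048 and fshift=512 give pad = 2048 - 512 = 1536,
#num_frames(16000, 2048, 512) = (16000 + 2*1536 - 2048)//512 + 2 = 35 frames,
#and pad_lr(x, 2048, 512) returns (1536, 1920), so the padded signal has (35 - 1)*512 + 2048 = 19456 samples.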
189 | def num_frames(length, fsize, fshift): 190 | """Compute number of time frames of spectrogram 191 | """ 192 | pad = (fsize - fshift) 193 | if length % fshift == 0: 194 | M = (length + pad * 2 - fsize) // fshift + 1 195 | else: 196 | M = (length + pad * 2 - fsize) // fshift + 2 197 | return M 198 | 199 | 200 | def pad_lr(x, fsize, fshift): 201 | """Compute left and right padding 202 | """ 203 | M = num_frames(len(x), fsize, fshift) 204 | pad = (fsize - fshift) 205 | T = len(x) + 2 * pad 206 | r = (M - 1) * fshift + fsize - T 207 | return pad, pad + r 208 | ########################################################## 209 | #Librosa correct padding 210 | def librosa_pad_lr(x, fsize, fshift, pad_sides=1): 211 | '''compute right padding (final frame) or both sides padding (first and final frames) 212 | ''' 213 | assert pad_sides in (1, 2) 214 | # return int(fsize // 2) 215 | pad = (x.shape[0] // fshift + 1) * fshift - x.shape[0] 216 | if pad_sides == 1: 217 | return 0, pad 218 | else: 219 | return pad // 2, pad // 2 + pad % 2 220 | 221 | # Conversions 222 | _mel_basis = None 223 | _inv_mel_basis = None 224 | 225 | def _linear_to_mel(spectogram, hparams): 226 | global _mel_basis 227 | if _mel_basis is None: 228 | _mel_basis = _build_mel_basis(hparams) 229 | return np.dot(_mel_basis, spectogram) 230 | 231 | def _mel_to_linear(mel_spectrogram, hparams): 232 | global _inv_mel_basis 233 | if _inv_mel_basis is None: 234 | _inv_mel_basis = np.linalg.pinv(_build_mel_basis(hparams)) 235 | return np.maximum(1e-10, np.dot(_inv_mel_basis, mel_spectrogram)) 236 | 237 | def _mel_to_linear_tensorflow(mel_spectrogram, hparams): 238 | global _inv_mel_basis 239 | if _inv_mel_basis is None: 240 | _inv_mel_basis = np.linalg.pinv(_build_mel_basis(hparams)) 241 | return tf.transpose(tf.maximum(1e-10, tf.matmul(tf.cast(_inv_mel_basis, tf.float32), tf.transpose(mel_spectrogram, [1, 0]))), [1, 0]) 242 | 243 | def _build_mel_basis(hparams): 244 | assert hparams.fmax <= hparams.sample_rate // 2 245 | return librosa.filters.mel(hparams.sample_rate, hparams.n_fft, n_mels=hparams.num_mels, 246 | fmin=hparams.fmin, fmax=hparams.fmax) 247 | 248 | def _amp_to_db(x, hparams): 249 | min_level = np.exp(hparams.min_level_db / 20 * np.log(10)) 250 | return 20 * np.log10(np.maximum(min_level, x)) 251 | 252 | def _db_to_amp(x): 253 | return np.power(10.0, (x) * 0.05) 254 | 255 | def _db_to_amp_tensorflow(x): 256 | return tf.pow(tf.ones(tf.shape(x)) * 10.0, x * 0.05) 257 | 258 | def _normalize(S, hparams): 259 | if hparams.allow_clipping_in_normalization: 260 | if hparams.symmetric_mels: 261 | return np.clip((2 * hparams.max_abs_value) * ((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value, 262 | -hparams.max_abs_value, hparams.max_abs_value) 263 | else: 264 | return np.clip(hparams.max_abs_value * ((S - hparams.min_level_db) / (-hparams.min_level_db)), 0, hparams.max_abs_value) 265 | 266 | assert S.max() <= 0 and S.min() - hparams.min_level_db >= 0 267 | if hparams.symmetric_mels: 268 | return (2 * hparams.max_abs_value) * ((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value 269 | else: 270 | return hparams.max_abs_value * ((S - hparams.min_level_db) / (-hparams.min_level_db)) 271 | 272 | def _denormalize(D, hparams): 273 | if hparams.allow_clipping_in_normalization: 274 | if hparams.symmetric_mels: 275 | return (((np.clip(D, -hparams.max_abs_value, 276 | hparams.max_abs_value) + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value)) 277 | + 
hparams.min_level_db) 278 | else: 279 | return ((np.clip(D, 0, hparams.max_abs_value) * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db) 280 | 281 | if hparams.symmetric_mels: 282 | return (((D + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value)) + hparams.min_level_db) 283 | else: 284 | return ((D * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db) 285 | 286 | def _denormalize_tensorflow(D, hparams): 287 | if hparams.allow_clipping_in_normalization: 288 | if hparams.symmetric_mels: 289 | return (((tf.clip_by_value(D, -hparams.max_abs_value, 290 | hparams.max_abs_value) + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value)) 291 | + hparams.min_level_db) 292 | else: 293 | return ((tf.clip_by_value(D, 0, hparams.max_abs_value) * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db) 294 | 295 | if hparams.symmetric_mels: 296 | return (((D + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value)) + hparams.min_level_db) 297 | else: 298 | return ((D * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db) 299 | -------------------------------------------------------------------------------- /datasets/preprocessor.py: -------------------------------------------------------------------------------- 1 | import os 2 | from concurrent.futures import ProcessPoolExecutor 3 | from functools import partial 4 | 5 | import numpy as np 6 | from datasets import audio 7 | from wavenet_vocoder.util import is_mulaw, is_mulaw_quantize, mulaw, mulaw_quantize 8 | 9 | 10 | def build_from_path(hparams, input_dirs, mel_dir, linear_dir, wav_dir, n_jobs=12, tqdm=lambda x: x): 11 | """ 12 | Preprocesses the speech dataset from a gven input path to given output directories 13 | 14 | Args: 15 | - hparams: hyper parameters 16 | - input_dir: input directory that contains the files to prerocess 17 | - mel_dir: output directory of the preprocessed speech mel-spectrogram dataset 18 | - linear_dir: output directory of the preprocessed speech linear-spectrogram dataset 19 | - wav_dir: output directory of the preprocessed speech audio dataset 20 | - n_jobs: Optional, number of worker process to parallelize across 21 | - tqdm: Optional, provides a nice progress bar 22 | 23 | Returns: 24 | - A list of tuple describing the train examples. 
this should be written to train.txt 25 | """ 26 | 27 | # We use ProcessPoolExecutor to parallelize across processes, this is just for 28 | # optimization purposes and it can be omited 29 | executor = ProcessPoolExecutor(max_workers=n_jobs) 30 | futures = [] 31 | index = 1 32 | for input_dir in input_dirs: 33 | with open(os.path.join(input_dir, 'metadata.csv'), encoding='utf-8') as f: 34 | for line in f: 35 | parts = line.strip().split('|') 36 | basename = parts[0] 37 | wav_path = os.path.join(input_dir, 'wavs', '{}.wav'.format(basename)) 38 | text = parts[2] 39 | futures.append(executor.submit(partial(_process_utterance, mel_dir, linear_dir, wav_dir, basename, wav_path, text, hparams))) 40 | index += 1 41 | 42 | return [future.result() for future in tqdm(futures) if future.result() is not None] 43 | 44 | 45 | def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text, hparams): 46 | """ 47 | Preprocesses a single utterance wav/text pair 48 | 49 | this writes the mel scale spectogram to disk and return a tuple to write 50 | to the train.txt file 51 | 52 | Args: 53 | - mel_dir: the directory to write the mel spectograms into 54 | - linear_dir: the directory to write the linear spectrograms into 55 | - wav_dir: the directory to write the preprocessed wav into 56 | - index: the numeric index to use in the spectogram filename 57 | - wav_path: path to the audio file containing the speech input 58 | - text: text spoken in the input audio file 59 | - hparams: hyper parameters 60 | 61 | Returns: 62 | - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text) 63 | """ 64 | try: 65 | # Load the audio as numpy array 66 | wav = audio.load_wav(wav_path, sr=hparams.sample_rate) 67 | except FileNotFoundError: #catch missing wav exception 68 | print('file {} present in csv metadata is not present in wav folder. skipping!'.format( 69 | wav_path)) 70 | return None 71 | 72 | #Trim lead/trail silences 73 | if hparams.trim_silence: 74 | wav = audio.trim_silence(wav, hparams) 75 | 76 | #Pre-emphasize 77 | preem_wav = audio.preemphasis(wav, hparams.preemphasis, hparams.preemphasize) 78 | 79 | #rescale wav 80 | if hparams.rescale: 81 | wav = wav / np.abs(wav).max() * hparams.rescaling_max 82 | preem_wav = preem_wav / np.abs(preem_wav).max() * hparams.rescaling_max 83 | 84 | #Assert all audio is in [-1, 1] 85 | if (wav > 1.).any() or (wav < -1.).any(): 86 | raise RuntimeError('wav has invalid value: {}'.format(wav_path)) 87 | if (preem_wav > 1.).any() or (preem_wav < -1.).any(): 88 | raise RuntimeError('wav has invalid value: {}'.format(wav_path)) 89 | 90 | #Mu-law quantize 91 | if is_mulaw_quantize(hparams.input_type): 92 | #[0, quantize_channels) 93 | out = mulaw_quantize(wav, hparams.quantize_channels) 94 | 95 | #Trim silences 96 | start, end = audio.start_and_end_indices(out, hparams.silence_threshold) 97 | wav = wav[start: end] 98 | preem_wav = preem_wav[start: end] 99 | out = out[start: end] 100 | 101 | constant_values = mulaw_quantize(0, hparams.quantize_channels) 102 | out_dtype = np.int16 103 | 104 | elif is_mulaw(hparams.input_type): 105 | #[-1, 1] 106 | out = mulaw(wav, hparams.quantize_channels) 107 | constant_values = mulaw(0., hparams.quantize_channels) 108 | out_dtype = np.float32 109 | 110 | else: 111 | #[-1, 1] 112 | out = wav 113 | constant_values = 0. 
114 | out_dtype = np.float32 115 | 116 | # Compute the mel scale spectrogram from the wav 117 | mel_spectrogram = audio.melspectrogram(preem_wav, hparams).astype(np.float32) 118 | mel_frames = mel_spectrogram.shape[1] 119 | 120 | if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length: 121 | return None 122 | 123 | #Compute the linear scale spectrogram from the wav 124 | linear_spectrogram = audio.linearspectrogram(preem_wav, hparams).astype(np.float32) 125 | linear_frames = linear_spectrogram.shape[1] 126 | 127 | #sanity check 128 | assert linear_frames == mel_frames 129 | 130 | if hparams.use_lws: 131 | #Ensure time resolution adjustement between audio and mel-spectrogram 132 | fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size 133 | l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams)) 134 | 135 | #Zero pad audio signal 136 | out = np.pad(out, (l, r), mode='constant', constant_values=constant_values) 137 | else: 138 | #Ensure time resolution adjustement between audio and mel-spectrogram 139 | l_pad, r_pad = audio.librosa_pad_lr(wav, hparams.n_fft, audio.get_hop_size(hparams), hparams.wavenet_pad_sides) 140 | 141 | #Reflect pad audio signal on the right (Just like it's done in Librosa to avoid frame inconsistency) 142 | out = np.pad(out, (l_pad, r_pad), mode='constant', constant_values=constant_values) 143 | 144 | assert len(out) >= mel_frames * audio.get_hop_size(hparams) 145 | 146 | #time resolution adjustement 147 | #ensure length of raw audio is multiple of hop size so that we can use 148 | #transposed convolution to upsample 149 | out = out[:mel_frames * audio.get_hop_size(hparams)] 150 | assert len(out) % audio.get_hop_size(hparams) == 0 151 | time_steps = len(out) 152 | 153 | # Write the spectrogram and audio to disk 154 | audio_filename = 'audio-{}.npy'.format(index) 155 | mel_filename = 'mel-{}.npy'.format(index) 156 | linear_filename = 'linear-{}.npy'.format(index) 157 | np.save(os.path.join(wav_dir, audio_filename), out.astype(out_dtype), allow_pickle=False) 158 | np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False) 159 | np.save(os.path.join(linear_dir, linear_filename), linear_spectrogram.T, allow_pickle=False) 160 | 161 | # Return a tuple describing this training example 162 | return (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text) 163 | -------------------------------------------------------------------------------- /datasets/wavenet_preprocessor.py: -------------------------------------------------------------------------------- 1 | import os 2 | from concurrent.futures import ProcessPoolExecutor 3 | from functools import partial 4 | 5 | import numpy as np 6 | from datasets import audio 7 | from wavenet_vocoder.util import is_mulaw, is_mulaw_quantize, mulaw, mulaw_quantize 8 | 9 | 10 | def build_from_path(hparams, input_dir, mel_dir, wav_dir, n_jobs=12, tqdm=lambda x: x): 11 | """ 12 | Preprocesses the speech dataset from a gven input path to given output directories 13 | 14 | Args: 15 | - hparams: hyper parameters 16 | - input_dir: input directory that contains the files to prerocess 17 | - mel_dir: output directory of the preprocessed speech mel-spectrogram dataset 18 | - linear_dir: output directory of the preprocessed speech linear-spectrogram dataset 19 | - wav_dir: output directory of the preprocessed speech audio dataset 20 | - n_jobs: Optional, number of worker process to parallelize across 21 | - tqdm: Optional, provides a nice progress bar 22 | 23 | Returns: 
24 | - A list of tuple describing the train examples. this should be written to train.txt 25 | """ 26 | 27 | # We use ProcessPoolExecutor to parallelize across processes, this is just for 28 | # optimization purposes and it can be omited 29 | executor = ProcessPoolExecutor(max_workers=n_jobs) 30 | futures = [] 31 | for file in os.listdir(input_dir): 32 | wav_path = os.path.join(input_dir, file) 33 | basename = os.path.basename(wav_path).replace('.wav', '') 34 | futures.append(executor.submit(partial(_process_utterance, mel_dir, wav_dir, basename, wav_path, hparams))) 35 | 36 | return [future.result() for future in tqdm(futures) if future.result() is not None] 37 | 38 | 39 | def _process_utterance(mel_dir, wav_dir, index, wav_path, hparams): 40 | """ 41 | Preprocesses a single utterance wav/text pair 42 | 43 | this writes the mel scale spectogram to disk and return a tuple to write 44 | to the train.txt file 45 | 46 | Args: 47 | - mel_dir: the directory to write the mel spectograms into 48 | - linear_dir: the directory to write the linear spectrograms into 49 | - wav_dir: the directory to write the preprocessed wav into 50 | - index: the numeric index to use in the spectrogram filename 51 | - wav_path: path to the audio file containing the speech input 52 | - text: text spoken in the input audio file 53 | - hparams: hyper parameters 54 | 55 | Returns: 56 | - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text) 57 | """ 58 | try: 59 | # Load the audio as numpy array 60 | wav = audio.load_wav(wav_path, sr=hparams.sample_rate) 61 | except FileNotFoundError: #catch missing wav exception 62 | print('file {} present in csv metadata is not present in wav folder. skipping!'.format( 63 | wav_path)) 64 | return None 65 | 66 | #M-AILABS extra silence specific 67 | if hparams.trim_silence: 68 | wav = audio.trim_silence(wav, hparams) 69 | 70 | #Pre-emphasize 71 | preem_wav = audio.preemphasis(wav, hparams.preemphasis, hparams.preemphasize) 72 | 73 | #rescale wav 74 | if hparams.rescale: 75 | wav = wav / np.abs(wav).max() * hparams.rescaling_max 76 | preem_wav = preem_wav / np.abs(preem_wav).max() * hparams.rescaling_max 77 | 78 | #Assert all audio is in [-1, 1] 79 | if (wav > 1.).any() or (wav < -1.).any(): 80 | raise RuntimeError('wav has invalid value: {}'.format(wav_path)) 81 | if (preem_wav > 1.).any() or (preem_wav < -1.).any(): 82 | raise RuntimeError('wav has invalid value: {}'.format(wav_path)) 83 | 84 | #Mu-law quantize 85 | if is_mulaw_quantize(hparams.input_type): 86 | #[0, quantize_channels) 87 | out = mulaw_quantize(wav, hparams.quantize_channels) 88 | 89 | #Trim silences 90 | start, end = audio.start_and_end_indices(out, hparams.silence_threshold) 91 | wav = wav[start: end] 92 | preem_wav = preem_wav[start: end] 93 | out = out[start: end] 94 | 95 | constant_values = mulaw_quantize(0, hparams.quantize_channels) 96 | out_dtype = np.int16 97 | 98 | elif is_mulaw(hparams.input_type): 99 | #[-1, 1] 100 | out = mulaw(wav, hparams.quantize_channels) 101 | constant_values = mulaw(0., hparams.quantize_channels) 102 | out_dtype = np.float32 103 | 104 | else: 105 | #[-1, 1] 106 | out = wav 107 | constant_values = 0. 
108 | out_dtype = np.float32 109 | 110 | # Compute the mel scale spectrogram from the wav 111 | mel_spectrogram = audio.melspectrogram(preem_wav, hparams).astype(np.float32) 112 | mel_frames = mel_spectrogram.shape[1] 113 | 114 | if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length: 115 | return None 116 | 117 | if hparams.use_lws: 118 | #Ensure time resolution adjustement between audio and mel-spectrogram 119 | fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size 120 | l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams)) 121 | 122 | #Zero pad audio signal 123 | out = np.pad(out, (l, r), mode='constant', constant_values=constant_values) 124 | else: 125 | #Ensure time resolution adjustement between audio and mel-spectrogram 126 | l_pad, r_pad = audio.librosa_pad_lr(wav, hparams.n_fft, audio.get_hop_size(hparams)) 127 | 128 | #Reflect pad audio signal (Just like it's done in Librosa to avoid frame inconsistency) 129 | out = np.pad(out, (l_pad, r_pad), mode='constant', constant_values=constant_values) 130 | 131 | assert len(out) >= mel_frames * audio.get_hop_size(hparams) 132 | 133 | #time resolution adjustement 134 | #ensure length of raw audio is multiple of hop size so that we can use 135 | #transposed convolution to upsample 136 | out = out[:mel_frames * audio.get_hop_size(hparams)] 137 | assert len(out) % audio.get_hop_size(hparams) == 0 138 | time_steps = len(out) 139 | 140 | # Write the spectrogram and audio to disk 141 | audio_filename = os.path.join(wav_dir, 'audio-{}.npy'.format(index)) 142 | mel_filename = os.path.join(mel_dir, 'mel-{}.npy'.format(index)) 143 | np.save(audio_filename, out.astype(out_dtype), allow_pickle=False) 144 | np.save(mel_filename, mel_spectrogram.T, allow_pickle=False) 145 | 146 | #global condition features 147 | if hparams.gin_channels > 0: 148 | raise RuntimeError('When activating global conditions, please set your speaker_id rules in line 129 of datasets/wavenet_preprocessor.py to use them during training') 149 | speaker_id = '' #put the rule to determine how to assign speaker ids (using file names maybe? file basenames are available in "index" variable) 150 | else: 151 | speaker_id = '' 152 | 153 | # Return a tuple describing this training example 154 | return (audio_filename, mel_filename, mel_filename, speaker_id, time_steps, mel_frames) 155 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM continuumio/anaconda3:latest 2 | FROM tensorflow/tensorflow:latest-gpu-py3 3 | 4 | RUN apt-get update 5 | RUN apt-get install -y libasound-dev portaudio19-dev libportaudio2 libportaudiocpp0 ffmpeg libav-tools wget git vim 6 | 7 | RUN wget http://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2 8 | RUN tar -jxvf LJSpeech-1.1.tar.bz2 9 | 10 | RUN git clone https://github.com/Rayhane-mamah/Tacotron-2.git 11 | 12 | WORKDIR Tacotron-2 13 | RUN ln -s ../LJSpeech-1.1 . 
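# The symlink above exposes the extracted LJSpeech-1.1 dataset inside the repository, where preprocess.py looks for it by default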
14 | RUN pip install -r requirements.txt -------------------------------------------------------------------------------- /griffin_lim_synthesis_tool.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "scrolled": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "from datasets.audio import *\n", 13 | "import os\n", 14 | "from hparams import hparams\n", 15 | "\n", 16 | "n_sample = 0 #Change n_steps here\n", 17 | "mel_folder = 'logs-Tacotron/mel-spectrograms' #Or change file path\n", 18 | "mel_file = 'mel-prediction-step-{}.npy'.format(n_sample) #Or file name (for other generated mels)\n", 19 | "out_dir = 'wav_out'\n", 20 | "\n", 21 | "os.makedirs(out_dir, exist_ok=True)\n", 22 | "\n", 23 | "#mel_file = os.path.join(mel_folder, mel_file)\n", 24 | "mel_file = 'training_data/mels/mel-LJ001-0001.npy'\n", 25 | "mel_spectro = np.load(mel_file)\n", 26 | "mel_spectro.shape" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "wav = inv_mel_spectrogram(mel_spectro.T, hparams) \n", 36 | "#save the wav under test__\n", 37 | "save_wav(wav, os.path.join(out_dir, 'test_mel_{}.wav'.format(mel_file.replace('/', '_').replace('\\\\', '_').replace('.npy', ''))),\n", 38 | " sr=hparams.sample_rate)" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "from tacotron.utils.plot import *\n", 48 | "\n", 49 | "plot_spectrogram(mel_spectro, path=os.path.join(out_dir, 'test_mel_{}.png'.format(mel_file.replace('/', '_').replace('\\\\', '_').replace('.npy', ''))))" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "lin_file = 'training_data/linear/linear-LJ001-0001.npy'\n", 59 | "lin_spectro = np.load(lin_file)\n", 60 | "lin_spectro.shape" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "wav = inv_linear_spectrogram(lin_spectro.T, hparams)\n", 70 | "save_wav(wav, os.path.join(out_dir, 'test_linear_{}.wav'.format(mel_file.replace('/', '_').replace('\\\\', '_').replace('.npy', ''))),\n", 71 | " sr=hparams.sample_rate)" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "plot_spectrogram(lin_spectro, path=os.path.join(out_dir, 'test_linear_{}.png'.format(mel_file.replace('/', '_').replace('\\\\', '_').replace('.npy', ''))),\n", 81 | " auto_aspect=True)" 82 | ] 83 | } 84 | ], 85 | "metadata": { 86 | "kernelspec": { 87 | "display_name": "Python 3", 88 | "language": "python", 89 | "name": "python3" 90 | }, 91 | "language_info": { 92 | "codemirror_mode": { 93 | "name": "ipython", 94 | "version": 3 95 | }, 96 | "file_extension": ".py", 97 | "mimetype": "text/x-python", 98 | "name": "python", 99 | "nbconvert_exporter": "python", 100 | "pygments_lexer": "ipython3", 101 | "version": "3.6.4" 102 | } 103 | }, 104 | "nbformat": 4, 105 | "nbformat_minor": 2 106 | } 107 | -------------------------------------------------------------------------------- /infolog.py: -------------------------------------------------------------------------------- 1 | import atexit 2 | import json 3 | from datetime import datetime 4 | from 
threading import Thread 5 | from urllib.request import Request, urlopen 6 | 7 | _format = '%Y-%m-%d %H:%M:%S.%f' 8 | _file = None 9 | _run_name = None 10 | _slack_url = None 11 | 12 | 13 | def init(filename, run_name, slack_url=None): 14 | global _file, _run_name, _slack_url 15 | _close_logfile() 16 | _file = open(filename, 'a') 17 | _file = open(filename, 'a') 18 | _file.write('\n-----------------------------------------------------------------\n') 19 | _file.write('Starting new {} training run\n'.format(run_name)) 20 | _file.write('-----------------------------------------------------------------\n') 21 | _run_name = run_name 22 | _slack_url = slack_url 23 | 24 | 25 | def log(msg, end='\n', slack=False): 26 | print(msg, end=end) 27 | if _file is not None: 28 | _file.write('[%s] %s\n' % (datetime.now().strftime(_format)[:-3], msg)) 29 | if slack and _slack_url is not None: 30 | Thread(target=_send_slack, args=(msg,)).start() 31 | 32 | 33 | def _close_logfile(): 34 | global _file 35 | if _file is not None: 36 | _file.close() 37 | _file = None 38 | 39 | 40 | def _send_slack(msg): 41 | req = Request(_slack_url) 42 | req.add_header('Content-Type', 'application/json') 43 | urlopen(req, json.dumps({ 44 | 'username': 'tacotron', 45 | 'icon_emoji': ':taco:', 46 | 'text': '*%s*: %s' % (_run_name, msg) 47 | }).encode()) 48 | 49 | 50 | atexit.register(_close_logfile) 51 | -------------------------------------------------------------------------------- /papers/(content+location) attention.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Rayhane-mamah/Tacotron-2/ab5cb08a931fc842d3892ebeb27c8b8734ddd4b8/papers/(content+location) attention.pdf -------------------------------------------------------------------------------- /papers/ClariNet.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Rayhane-mamah/Tacotron-2/ab5cb08a931fc842d3892ebeb27c8b8734ddd4b8/papers/ClariNet.pdf -------------------------------------------------------------------------------- /papers/Tacotron 2 revised.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Rayhane-mamah/Tacotron-2/ab5cb08a931fc842d3892ebeb27c8b8734ddd4b8/papers/Tacotron 2 revised.pdf -------------------------------------------------------------------------------- /papers/bahdanau (content) attention.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Rayhane-mamah/Tacotron-2/ab5cb08a931fc842d3892ebeb27c8b8734ddd4b8/papers/bahdanau (content) attention.pdf -------------------------------------------------------------------------------- /papers/deepvoice 3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Rayhane-mamah/Tacotron-2/ab5cb08a931fc842d3892ebeb27c8b8734ddd4b8/papers/deepvoice 3.pdf -------------------------------------------------------------------------------- /papers/effective approaches attention.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Rayhane-mamah/Tacotron-2/ab5cb08a931fc842d3892ebeb27c8b8734ddd4b8/papers/effective approaches attention.pdf -------------------------------------------------------------------------------- /papers/fast_wavenet.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Rayhane-mamah/Tacotron-2/ab5cb08a931fc842d3892ebeb27c8b8734ddd4b8/papers/fast_wavenet.pdf -------------------------------------------------------------------------------- /papers/tacotron.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Rayhane-mamah/Tacotron-2/ab5cb08a931fc842d3892ebeb27c8b8734ddd4b8/papers/tacotron.pdf -------------------------------------------------------------------------------- /papers/tacotron2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Rayhane-mamah/Tacotron-2/ab5cb08a931fc842d3892ebeb27c8b8734ddd4b8/papers/tacotron2.pdf -------------------------------------------------------------------------------- /papers/wavenet.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Rayhane-mamah/Tacotron-2/ab5cb08a931fc842d3892ebeb27c8b8734ddd4b8/papers/wavenet.pdf -------------------------------------------------------------------------------- /preprocess.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from multiprocessing import cpu_count 4 | 5 | from datasets import preprocessor 6 | from hparams import hparams 7 | from tqdm import tqdm 8 | 9 | 10 | def preprocess(args, input_folders, out_dir, hparams): 11 | mel_dir = os.path.join(out_dir, 'mels') 12 | wav_dir = os.path.join(out_dir, 'audio') 13 | linear_dir = os.path.join(out_dir, 'linear') 14 | os.makedirs(mel_dir, exist_ok=True) 15 | os.makedirs(wav_dir, exist_ok=True) 16 | os.makedirs(linear_dir, exist_ok=True) 17 | metadata = preprocessor.build_from_path(hparams, input_folders, mel_dir, linear_dir, wav_dir, args.n_jobs, tqdm=tqdm) 18 | write_metadata(metadata, out_dir) 19 | 20 | def write_metadata(metadata, out_dir): 21 | with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f: 22 | for m in metadata: 23 | f.write('|'.join([str(x) for x in m]) + '\n') 24 | mel_frames = sum([int(m[4]) for m in metadata]) 25 | timesteps = sum([int(m[3]) for m in metadata]) 26 | sr = hparams.sample_rate 27 | hours = timesteps / sr / 3600 28 | print('Write {} utterances, {} mel frames, {} audio timesteps, ({:.2f} hours)'.format( 29 | len(metadata), mel_frames, timesteps, hours)) 30 | print('Max input length (text chars): {}'.format(max(len(m[5]) for m in metadata))) 31 | print('Max mel frames length: {}'.format(max(int(m[4]) for m in metadata))) 32 | print('Max audio timesteps length: {}'.format(max(m[3] for m in metadata))) 33 | 34 | def norm_data(args): 35 | 36 | merge_books = (args.merge_books=='True') 37 | 38 | print('Selecting data folders..') 39 | supported_datasets = ['LJSpeech-1.0', 'LJSpeech-1.1', 'M-AILABS'] 40 | if args.dataset not in supported_datasets: 41 | raise ValueError('dataset value entered {} does not belong to supported datasets: {}'.format( 42 | args.dataset, supported_datasets)) 43 | 44 | if args.dataset.startswith('LJSpeech'): 45 | return [os.path.join(args.base_dir, args.dataset)] 46 | 47 | 48 | if args.dataset == 'M-AILABS': 49 | supported_languages = ['en_US', 'en_UK', 'fr_FR', 'it_IT', 'de_DE', 'es_ES', 'ru_RU', 50 | 'uk_UK', 'pl_PL', 'nl_NL', 'pt_PT', 'fi_FI', 'se_SE', 'tr_TR', 'ar_SA'] 51 | if args.language not in supported_languages: 52 | raise ValueError('Please enter a supported language to use from M-AILABS dataset! 
\n{}'.format( 53 | supported_languages)) 54 | 55 | supported_voices = ['female', 'male', 'mix'] 56 | if args.voice not in supported_voices: 57 | raise ValueError('Please enter a supported voice option to use from M-AILABS dataset! \n{}'.format( 58 | supported_voices)) 59 | 60 | path = os.path.join(args.base_dir, args.language, 'by_book', args.voice) 61 | supported_readers = [e for e in os.listdir(path) if os.path.isdir(os.path.join(path,e))] 62 | if args.reader not in supported_readers: 63 | raise ValueError('Please enter a valid reader for your language and voice settings! \n{}'.format( 64 | supported_readers)) 65 | 66 | path = os.path.join(path, args.reader) 67 | supported_books = [e for e in os.listdir(path) if os.path.isdir(os.path.join(path,e))] 68 | if merge_books: 69 | return [os.path.join(path, book) for book in supported_books] 70 | 71 | else: 72 | if args.book not in supported_books: 73 | raise ValueError('Please enter a valid book for your reader settings! \n{}'.format( 74 | supported_books)) 75 | 76 | return [os.path.join(path, args.book)] 77 | 78 | 79 | def run_preprocess(args, hparams): 80 | input_folders = norm_data(args) 81 | output_folder = os.path.join(args.base_dir, args.output) 82 | 83 | preprocess(args, input_folders, output_folder, hparams) 84 | 85 | 86 | def main(): 87 | print('initializing preprocessing..') 88 | parser = argparse.ArgumentParser() 89 | parser.add_argument('--base_dir', default='') 90 | parser.add_argument('--hparams', default='', 91 | help='Hyperparameter overrides as a comma-separated list of name=value pairs') 92 | parser.add_argument('--dataset', default='LJSpeech-1.1') 93 | parser.add_argument('--language', default='en_US') 94 | parser.add_argument('--voice', default='female') 95 | parser.add_argument('--reader', default='mary_ann') 96 | parser.add_argument('--merge_books', default='False') 97 | parser.add_argument('--book', default='northandsouth') 98 | parser.add_argument('--output', default='training_data') 99 | parser.add_argument('--n_jobs', type=int, default=cpu_count()) 100 | args = parser.parse_args() 101 | 102 | modified_hp = hparams.parse(args.hparams) 103 | 104 | assert args.merge_books in ('False', 'True') 105 | 106 | run_preprocess(args, modified_hp) 107 | 108 | 109 | if __name__ == '__main__': 110 | main() 111 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | falcon==1.2.0 2 | inflect==0.2.5 3 | audioread==2.1.5 4 | librosa==0.5.1 5 | matplotlib==2.0.2 6 | numpy==1.14.0 7 | scipy==1.0.0 8 | tqdm==4.11.2 9 | Unidecode==0.4.20 10 | pyaudio==0.2.11 11 | sounddevice==0.3.10 12 | lws 13 | keras -------------------------------------------------------------------------------- /sentences.txt: -------------------------------------------------------------------------------- 1 | Scientists at the CERN laboratory say they have discovered a new particle. 2 | There's a way to measure the acute emotional intelligence that has never gone out of style. 3 | President Trump met with other leaders at the Group of 20 conference. 4 | The Senate's bill to repeal and replace the Affordable Care Act is now imperiled. 5 | Generative adversarial network or variational auto-encoder. 6 | Basilar membrane and otolaryngology are not auto-correlations. 7 | He has read the whole thing. 8 | He reads books. 9 | He thought it was time to present the present. 10 | Thisss isrealy awhsome. 11 | Punctuation sensitivity, is working. 
12 | Punctuation sensitivity is working. 13 | Peter Piper picked a peck of pickled peppers. How many pickled peppers did Peter Piper pick? 14 | She sells sea-shells on the sea-shore. The shells she sells are sea-shells I'm sure. 15 | Tajima Airport serves Toyooka. 16 | On offering to help the blind man, the man who then stole his car, had not, at that precise moment, had any evil intention, quite the contrary, what he did was nothing more than obey those feelings of generosity and altruism which, as everyone knows, are the two best traits of human nature and to be found in much more hardened criminals than this one, a simple car-thief without any hope of advancing in his profession, exploited by the real owners this enterprise, for it is they who take advantage of the needs of the poor. 17 | Thank you so much for your support! -------------------------------------------------------------------------------- /synthesize.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from warnings import warn 4 | from time import sleep 5 | 6 | import tensorflow as tf 7 | 8 | from hparams import hparams 9 | from infolog import log 10 | from tacotron.synthesize import tacotron_synthesize 11 | from wavenet_vocoder.synthesize import wavenet_synthesize 12 | 13 | 14 | def prepare_run(args): 15 | modified_hp = hparams.parse(args.hparams) 16 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 17 | 18 | run_name = args.name or args.tacotron_name or args.model 19 | taco_checkpoint = os.path.join('logs-' + run_name, 'taco_' + args.checkpoint) 20 | 21 | run_name = args.name or args.wavenet_name or args.model 22 | wave_checkpoint = os.path.join('logs-' + run_name, 'wave_' + args.checkpoint) 23 | return taco_checkpoint, wave_checkpoint, modified_hp 24 | 25 | def get_sentences(args): 26 | if args.text_list != '': 27 | with open(args.text_list, 'rb') as f: 28 | sentences = list(map(lambda l: l.decode("utf-8")[:-1], f.readlines())) 29 | else: 30 | sentences = hparams.sentences 31 | return sentences 32 | 33 | def synthesize(args, hparams, taco_checkpoint, wave_checkpoint, sentences): 34 | log('Running End-to-End TTS Evaluation. Model: {}'.format(args.name or args.model)) 35 | log('Synthesizing mel-spectrograms from text..') 36 | wavenet_in_dir = tacotron_synthesize(args, hparams, taco_checkpoint, sentences) 37 | #Delete Tacotron model from graph 38 | tf.reset_default_graph() 39 | #Sleep 1/2 second to let previous graph close and avoid error messages while Wavenet is synthesizing 40 | sleep(0.5) 41 | log('Synthesizing audio from mel-spectrograms.. (This may take a while)') 42 | wavenet_synthesize(args, hparams, wave_checkpoint) 43 | log('Tacotron-2 TTS synthesis complete!') 44 | 45 | 46 | 47 | def main(): 48 | accepted_modes = ['eval', 'synthesis', 'live'] 49 | parser = argparse.ArgumentParser() 50 | parser.add_argument('--checkpoint', default='pretrained/', help='Path to model checkpoint') 51 | parser.add_argument('--hparams', default='', 52 | help='Hyperparameter overrides as a comma-separated list of name=value pairs') 53 | parser.add_argument('--name', help='Name of logging directory if the two models were trained together.') 54 | parser.add_argument('--tacotron_name', help='Name of logging directory of Tacotron. If trained separately') 55 | parser.add_argument('--wavenet_name', help='Name of logging directory of WaveNet. 
If trained separately') 56 | parser.add_argument('--model', default='Tacotron-2') 57 | parser.add_argument('--input_dir', default='training_data/', help='folder to contain inputs sentences/targets') 58 | parser.add_argument('--mels_dir', default='tacotron_output/eval/', help='folder to contain mels to synthesize audio from using the Wavenet') 59 | parser.add_argument('--output_dir', default='output/', help='folder to contain synthesized mel spectrograms') 60 | parser.add_argument('--mode', default='eval', help='mode of run: can be one of {}'.format(accepted_modes)) 61 | parser.add_argument('--GTA', default='True', help='Ground truth aligned synthesis, defaults to True, only considered in synthesis mode') 62 | parser.add_argument('--text_list', default='', help='Text file contains list of texts to be synthesized. Valid if mode=eval') 63 | parser.add_argument('--speaker_id', default=None, help='Defines the speakers ids to use when running standalone Wavenet on a folder of mels. this variable must be a comma-separated list of ids') 64 | args = parser.parse_args() 65 | 66 | accepted_models = ['Tacotron', 'WaveNet', 'Tacotron-2'] 67 | 68 | if args.model not in accepted_models: 69 | raise ValueError('please enter a valid model to synthesize with: {}'.format(accepted_models)) 70 | 71 | if args.mode not in accepted_modes: 72 | raise ValueError('accepted modes are: {}, found {}'.format(accepted_modes, args.mode)) 73 | 74 | if args.mode == 'live' and args.model == 'Wavenet': 75 | raise RuntimeError('Wavenet vocoder cannot be tested live due to its slow generation. Live only works with Tacotron!') 76 | 77 | if args.GTA not in ('True', 'False'): 78 | raise ValueError('GTA option must be either True or False') 79 | 80 | if args.model == 'Tacotron-2': 81 | if args.mode == 'live': 82 | warn('Requested a live evaluation with Tacotron-2, Wavenet will not be used!') 83 | if args.mode == 'synthesis': 84 | raise ValueError('I don\'t recommend running WaveNet on entire dataset.. The world might end before the synthesis :) (only eval allowed)') 85 | 86 | taco_checkpoint, wave_checkpoint, hparams = prepare_run(args) 87 | sentences = get_sentences(args) 88 | 89 | if args.model == 'Tacotron': 90 | _ = tacotron_synthesize(args, hparams, taco_checkpoint, sentences) 91 | elif args.model == 'WaveNet': 92 | wavenet_synthesize(args, hparams, wave_checkpoint) 93 | elif args.model == 'Tacotron-2': 94 | synthesize(args, hparams, taco_checkpoint, wave_checkpoint, sentences) 95 | else: 96 | raise ValueError('Model provided {} unknown! {}'.format(args.model, accepted_models)) 97 | 98 | 99 | if __name__ == '__main__': 100 | main() 101 | -------------------------------------------------------------------------------- /tacotron/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------- /tacotron/feeder.py: -------------------------------------------------------------------------------- 1 | import os 2 | import threading 3 | import time 4 | import traceback 5 | 6 | import numpy as np 7 | import tensorflow as tf 8 | from infolog import log 9 | from sklearn.model_selection import train_test_split 10 | from tacotron.utils.text import text_to_sequence 11 | 12 | _batches_per_group = 64 13 | 14 | class Feeder: 15 | """ 16 | Feeds batches of data into queue on a background thread. 
17 | """ 18 | 19 | def __init__(self, coordinator, metadata_filename, hparams): 20 | super(Feeder, self).__init__() 21 | self._coord = coordinator 22 | self._hparams = hparams 23 | self._cleaner_names = [x.strip() for x in hparams.cleaners.split(',')] 24 | self._train_offset = 0 25 | self._test_offset = 0 26 | 27 | # Load metadata 28 | self._mel_dir = os.path.join(os.path.dirname(metadata_filename), 'mels') 29 | self._linear_dir = os.path.join(os.path.dirname(metadata_filename), 'linear') 30 | with open(metadata_filename, encoding='utf-8') as f: 31 | self._metadata = [line.strip().split('|') for line in f] 32 | frame_shift_ms = hparams.hop_size / hparams.sample_rate 33 | hours = sum([int(x[4]) for x in self._metadata]) * frame_shift_ms / (3600) 34 | log('Loaded metadata for {} examples ({:.2f} hours)'.format(len(self._metadata), hours)) 35 | 36 | #Train test split 37 | if hparams.tacotron_test_size is None: 38 | assert hparams.tacotron_test_batches is not None 39 | 40 | test_size = (hparams.tacotron_test_size if hparams.tacotron_test_size is not None 41 | else hparams.tacotron_test_batches * hparams.tacotron_batch_size) 42 | indices = np.arange(len(self._metadata)) 43 | train_indices, test_indices = train_test_split(indices, 44 | test_size=test_size, random_state=hparams.tacotron_data_random_state) 45 | 46 | #Make sure test_indices is a multiple of batch_size else round down 47 | len_test_indices = self._round_down(len(test_indices), hparams.tacotron_batch_size) 48 | extra_test = test_indices[len_test_indices:] 49 | test_indices = test_indices[:len_test_indices] 50 | train_indices = np.concatenate([train_indices, extra_test]) 51 | 52 | self._train_meta = list(np.array(self._metadata)[train_indices]) 53 | self._test_meta = list(np.array(self._metadata)[test_indices]) 54 | 55 | self.test_steps = len(self._test_meta) // hparams.tacotron_batch_size 56 | 57 | if hparams.tacotron_test_size is None: 58 | assert hparams.tacotron_test_batches == self.test_steps 59 | 60 | #pad input sequences with the 0 ( _ ) 61 | self._pad = 0 62 | #explicitely setting the padding to a value that doesn't originally exist in the spectogram 63 | #to avoid any possible conflicts, without affecting the output range of the model too much 64 | if hparams.symmetric_mels: 65 | self._target_pad = -hparams.max_abs_value 66 | else: 67 | self._target_pad = 0. 68 | #Mark finished sequences with 1s 69 | self._token_pad = 1. 70 | 71 | with tf.device('/cpu:0'): 72 | # Create placeholders for inputs and targets. Don't specify batch size because we want 73 | # to be able to feed different batch sizes at eval time. 
74 | self._placeholders = [ 75 | tf.placeholder(tf.int32, shape=(None, None), name='inputs'), 76 | tf.placeholder(tf.int32, shape=(None, ), name='input_lengths'), 77 | tf.placeholder(tf.float32, shape=(None, None, hparams.num_mels), name='mel_targets'), 78 | tf.placeholder(tf.float32, shape=(None, None), name='token_targets'), 79 | tf.placeholder(tf.float32, shape=(None, None, hparams.num_freq), name='linear_targets'), 80 | tf.placeholder(tf.int32, shape=(None, ), name='targets_lengths'), 81 | tf.placeholder(tf.int32, shape=(hparams.tacotron_num_gpus, None), name='split_infos'), 82 | ] 83 | 84 | # Create queue for buffering data 85 | queue = tf.FIFOQueue(8, [tf.int32, tf.int32, tf.float32, tf.float32, tf.float32, tf.int32, tf.int32], name='input_queue') 86 | self._enqueue_op = queue.enqueue(self._placeholders) 87 | self.inputs, self.input_lengths, self.mel_targets, self.token_targets, self.linear_targets, self.targets_lengths, self.split_infos = queue.dequeue() 88 | 89 | self.inputs.set_shape(self._placeholders[0].shape) 90 | self.input_lengths.set_shape(self._placeholders[1].shape) 91 | self.mel_targets.set_shape(self._placeholders[2].shape) 92 | self.token_targets.set_shape(self._placeholders[3].shape) 93 | self.linear_targets.set_shape(self._placeholders[4].shape) 94 | self.targets_lengths.set_shape(self._placeholders[5].shape) 95 | self.split_infos.set_shape(self._placeholders[6].shape) 96 | 97 | # Create eval queue for buffering eval data 98 | eval_queue = tf.FIFOQueue(1, [tf.int32, tf.int32, tf.float32, tf.float32, tf.float32, tf.int32, tf.int32], name='eval_queue') 99 | self._eval_enqueue_op = eval_queue.enqueue(self._placeholders) 100 | self.eval_inputs, self.eval_input_lengths, self.eval_mel_targets, self.eval_token_targets, \ 101 | self.eval_linear_targets, self.eval_targets_lengths, self.eval_split_infos = eval_queue.dequeue() 102 | 103 | self.eval_inputs.set_shape(self._placeholders[0].shape) 104 | self.eval_input_lengths.set_shape(self._placeholders[1].shape) 105 | self.eval_mel_targets.set_shape(self._placeholders[2].shape) 106 | self.eval_token_targets.set_shape(self._placeholders[3].shape) 107 | self.eval_linear_targets.set_shape(self._placeholders[4].shape) 108 | self.eval_targets_lengths.set_shape(self._placeholders[5].shape) 109 | self.eval_split_infos.set_shape(self._placeholders[6].shape) 110 | 111 | def start_threads(self, session): 112 | self._session = session 113 | thread = threading.Thread(name='background', target=self._enqueue_next_train_group) 114 | thread.daemon = True #Thread will close when parent quits 115 | thread.start() 116 | 117 | thread = threading.Thread(name='background', target=self._enqueue_next_test_group) 118 | thread.daemon = True #Thread will close when parent quits 119 | thread.start() 120 | 121 | def _get_test_groups(self): 122 | meta = self._test_meta[self._test_offset] 123 | self._test_offset += 1 124 | 125 | text = meta[5] 126 | 127 | input_data = np.asarray(text_to_sequence(text, self._cleaner_names), dtype=np.int32) 128 | mel_target = np.load(os.path.join(self._mel_dir, meta[1])) 129 | #Create parallel sequences containing zeros to represent a non finished sequence 130 | token_target = np.asarray([0.] 
* (len(mel_target) - 1)) 131 | linear_target = np.load(os.path.join(self._linear_dir, meta[2])) 132 | return (input_data, mel_target, token_target, linear_target, len(mel_target)) 133 | 134 | def make_test_batches(self): 135 | start = time.time() 136 | 137 | # Read a group of examples 138 | n = self._hparams.tacotron_batch_size 139 | r = self._hparams.outputs_per_step 140 | 141 | #Test on entire test set 142 | examples = [self._get_test_groups() for i in range(len(self._test_meta))] 143 | 144 | # Bucket examples based on similar output sequence length for efficiency 145 | examples.sort(key=lambda x: x[-1]) 146 | batches = [examples[i: i+n] for i in range(0, len(examples), n)] 147 | np.random.shuffle(batches) 148 | 149 | log('\nGenerated {} test batches of size {} in {:.3f} sec'.format(len(batches), n, time.time() - start)) 150 | return batches, r 151 | 152 | def _enqueue_next_train_group(self): 153 | while not self._coord.should_stop(): 154 | start = time.time() 155 | 156 | # Read a group of examples 157 | n = self._hparams.tacotron_batch_size 158 | r = self._hparams.outputs_per_step 159 | examples = [self._get_next_example() for i in range(n * _batches_per_group)] 160 | 161 | # Bucket examples based on similar output sequence length for efficiency 162 | examples.sort(key=lambda x: x[-1]) 163 | batches = [examples[i: i+n] for i in range(0, len(examples), n)] 164 | np.random.shuffle(batches) 165 | 166 | log('\nGenerated {} train batches of size {} in {:.3f} sec'.format(len(batches), n, time.time() - start)) 167 | for batch in batches: 168 | feed_dict = dict(zip(self._placeholders, self._prepare_batch(batch, r))) 169 | self._session.run(self._enqueue_op, feed_dict=feed_dict) 170 | 171 | def _enqueue_next_test_group(self): 172 | #Create test batches once and evaluate on them for all test steps 173 | test_batches, r = self.make_test_batches() 174 | while not self._coord.should_stop(): 175 | for batch in test_batches: 176 | feed_dict = dict(zip(self._placeholders, self._prepare_batch(batch, r))) 177 | self._session.run(self._eval_enqueue_op, feed_dict=feed_dict) 178 | 179 | def _get_next_example(self): 180 | """Gets a single example (input, mel_target, token_target, linear_target, mel_length) from_ disk 181 | """ 182 | if self._train_offset >= len(self._train_meta): 183 | self._train_offset = 0 184 | np.random.shuffle(self._train_meta) 185 | 186 | meta = self._train_meta[self._train_offset] 187 | self._train_offset += 1 188 | 189 | text = meta[5] 190 | 191 | input_data = np.asarray(text_to_sequence(text, self._cleaner_names), dtype=np.int32) 192 | mel_target = np.load(os.path.join(self._mel_dir, meta[1])) 193 | #Create parallel sequences containing zeros to represent a non finished sequence 194 | token_target = np.asarray([0.] 
* (len(mel_target) - 1)) 195 | linear_target = np.load(os.path.join(self._linear_dir, meta[2])) 196 | return (input_data, mel_target, token_target, linear_target, len(mel_target)) 197 | 198 | def _prepare_batch(self, batches, outputs_per_step): 199 | assert 0 == len(batches) % self._hparams.tacotron_num_gpus 200 | size_per_device = int(len(batches) / self._hparams.tacotron_num_gpus) 201 | np.random.shuffle(batches) 202 | 203 | inputs = None 204 | mel_targets = None 205 | token_targets = None 206 | linear_targets = None 207 | targets_lengths = None 208 | split_infos = [] 209 | 210 | targets_lengths = np.asarray([x[-1] for x in batches], dtype=np.int32) #Used to mask loss 211 | input_lengths = np.asarray([len(x[0]) for x in batches], dtype=np.int32) 212 | 213 | #Produce inputs/targets of variables lengths for different GPUs 214 | for i in range(self._hparams.tacotron_num_gpus): 215 | batch = batches[size_per_device * i: size_per_device * (i + 1)] 216 | input_cur_device, input_max_len = self._prepare_inputs([x[0] for x in batch]) 217 | inputs = np.concatenate((inputs, input_cur_device), axis=1) if inputs is not None else input_cur_device 218 | mel_target_cur_device, mel_target_max_len = self._prepare_targets([x[1] for x in batch], outputs_per_step) 219 | mel_targets = np.concatenate(( mel_targets, mel_target_cur_device), axis=1) if mel_targets is not None else mel_target_cur_device 220 | 221 | #Pad sequences with 1 to infer that the sequence is done 222 | token_target_cur_device, token_target_max_len = self._prepare_token_targets([x[2] for x in batch], outputs_per_step) 223 | token_targets = np.concatenate((token_targets, token_target_cur_device),axis=1) if token_targets is not None else token_target_cur_device 224 | linear_targets_cur_device, linear_target_max_len = self._prepare_targets([x[3] for x in batch], outputs_per_step) 225 | linear_targets = np.concatenate((linear_targets, linear_targets_cur_device), axis=1) if linear_targets is not None else linear_targets_cur_device 226 | split_infos.append([input_max_len, mel_target_max_len, token_target_max_len, linear_target_max_len]) 227 | 228 | split_infos = np.asarray(split_infos, dtype=np.int32) 229 | return (inputs, input_lengths, mel_targets, token_targets, linear_targets, targets_lengths, split_infos) 230 | 231 | def _prepare_inputs(self, inputs): 232 | max_len = max([len(x) for x in inputs]) 233 | return np.stack([self._pad_input(x, max_len) for x in inputs]), max_len 234 | 235 | def _prepare_targets(self, targets, alignment): 236 | max_len = max([len(t) for t in targets]) 237 | data_len = self._round_up(max_len, alignment) 238 | return np.stack([self._pad_target(t, data_len) for t in targets]), data_len 239 | 240 | def _prepare_token_targets(self, targets, alignment): 241 | max_len = max([len(t) for t in targets]) + 1 242 | data_len = self._round_up(max_len, alignment) 243 | return np.stack([self._pad_token_target(t, data_len) for t in targets]), data_len 244 | 245 | def _pad_input(self, x, length): 246 | return np.pad(x, (0, length - x.shape[0]), mode='constant', constant_values=self._pad) 247 | 248 | def _pad_target(self, t, length): 249 | return np.pad(t, [(0, length - t.shape[0]), (0, 0)], mode='constant', constant_values=self._target_pad) 250 | 251 | def _pad_token_target(self, t, length): 252 | return np.pad(t, (0, length - t.shape[0]), mode='constant', constant_values=self._token_pad) 253 | 254 | def _round_up(self, x, multiple): 255 | remainder = x % multiple 256 | return x if remainder == 0 else x + multiple - remainder 257 | 
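# Example: with outputs_per_step = 2, _round_up(13, 2) -> 14 (targets are padded up to a
# multiple of the reduction factor), while _round_down(13, 2) -> 12 (the test split is trimmed
# down to a multiple of the batch size).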
258 | def _round_down(self, x, multiple): 259 | remainder = x % multiple 260 | return x if remainder == 0 else x - remainder 261 | -------------------------------------------------------------------------------- /tacotron/models/Architecture_wrappers.py: -------------------------------------------------------------------------------- 1 | """A set of wrappers usefull for tacotron 2 architecture 2 | All notations and variable names were used in concordance with originial tensorflow implementation 3 | """ 4 | import collections 5 | 6 | import numpy as np 7 | import tensorflow as tf 8 | from tacotron.models.attention import _compute_attention 9 | from tensorflow.contrib.rnn import RNNCell 10 | from tensorflow.python.framework import ops, tensor_shape 11 | from tensorflow.python.ops import array_ops, check_ops, rnn_cell_impl, tensor_array_ops 12 | from tensorflow.python.util import nest 13 | 14 | _zero_state_tensors = rnn_cell_impl._zero_state_tensors 15 | 16 | 17 | 18 | class TacotronEncoderCell(RNNCell): 19 | """Tacotron 2 Encoder Cell 20 | Passes inputs through a stack of convolutional layers then through a bidirectional LSTM 21 | layer to predict the hidden representation vector (or memory) 22 | """ 23 | 24 | def __init__(self, convolutional_layers, lstm_layer): 25 | """Initialize encoder parameters 26 | 27 | Args: 28 | convolutional_layers: Encoder convolutional block class 29 | lstm_layer: encoder bidirectional lstm layer class 30 | """ 31 | super(TacotronEncoderCell, self).__init__() 32 | #Initialize encoder layers 33 | self._convolutions = convolutional_layers 34 | self._cell = lstm_layer 35 | 36 | def __call__(self, inputs, input_lengths=None): 37 | #Pass input sequence through a stack of convolutional layers 38 | conv_output = self._convolutions(inputs) 39 | 40 | #Extract hidden representation from encoder lstm cells 41 | hidden_representation = self._cell(conv_output, input_lengths) 42 | 43 | #For shape visualization 44 | self.conv_output_shape = conv_output.shape 45 | return hidden_representation 46 | 47 | 48 | class TacotronDecoderCellState( 49 | collections.namedtuple("TacotronDecoderCellState", 50 | ("cell_state", "attention", "time", "alignments", 51 | "alignment_history", "max_attentions"))): 52 | """`namedtuple` storing the state of a `TacotronDecoderCell`. 53 | Contains: 54 | - `cell_state`: The state of the wrapped `RNNCell` at the previous time 55 | step. 56 | - `attention`: The attention emitted at the previous time step. 57 | - `time`: int32 scalar containing the current time step. 58 | - `alignments`: A single or tuple of `Tensor`(s) containing the alignments 59 | emitted at the previous time step for each attention mechanism. 60 | - `alignment_history`: a single or tuple of `TensorArray`(s) 61 | containing alignment matrices from all time steps for each attention 62 | mechanism. Call `stack()` on each to convert to a `Tensor`. 63 | """ 64 | def replace(self, **kwargs): 65 | """Clones the current state while overwriting components provided by kwargs. 
66 | """ 67 | return super(TacotronDecoderCellState, self)._replace(**kwargs) 68 | 69 | class TacotronDecoderCell(RNNCell): 70 | """Tactron 2 Decoder Cell 71 | Decodes encoder output and previous mel frames into next r frames 72 | 73 | Decoder Step i: 74 | 1) Prenet to compress last output information 75 | 2) Concat compressed inputs with previous context vector (input feeding) * 76 | 3) Decoder RNN (actual decoding) to predict current state s_{i} * 77 | 4) Compute new context vector c_{i} based on s_{i} and a cumulative sum of previous alignments * 78 | 5) Predict new output y_{i} using s_{i} and c_{i} (concatenated) 79 | 6) Predict output ys_{i} using s_{i} and c_{i} (concatenated) 80 | 81 | * : This is typically taking a vanilla LSTM, wrapping it using tensorflow's attention wrapper, 82 | and wrap that with the prenet before doing an input feeding, and with the prediction layer 83 | that uses RNN states to project on output space. Actions marked with (*) can be replaced with 84 | tensorflow's attention wrapper call if it was using cumulative alignments instead of previous alignments only. 85 | """ 86 | 87 | def __init__(self, prenet, attention_mechanism, rnn_cell, frame_projection, stop_projection): 88 | """Initialize decoder parameters 89 | 90 | Args: 91 | prenet: A tensorflow fully connected layer acting as the decoder pre-net 92 | attention_mechanism: A _BaseAttentionMechanism instance, usefull to 93 | learn encoder-decoder alignments 94 | rnn_cell: Instance of RNNCell, main body of the decoder 95 | frame_projection: tensorflow fully connected layer with r * num_mels output units 96 | stop_projection: tensorflow fully connected layer, expected to project to a scalar 97 | and through a sigmoid activation 98 | mask_finished: Boolean, Whether to mask decoder frames after the 99 | """ 100 | super(TacotronDecoderCell, self).__init__() 101 | #Initialize decoder layers 102 | self._prenet = prenet 103 | self._attention_mechanism = attention_mechanism 104 | self._cell = rnn_cell 105 | self._frame_projection = frame_projection 106 | self._stop_projection = stop_projection 107 | 108 | self._attention_layer_size = self._attention_mechanism.values.get_shape()[-1].value 109 | 110 | def _batch_size_checks(self, batch_size, error_message): 111 | return [check_ops.assert_equal(batch_size, 112 | self._attention_mechanism.batch_size, 113 | message=error_message)] 114 | 115 | @property 116 | def output_size(self): 117 | return self._frame_projection.shape 118 | 119 | @property 120 | def state_size(self): 121 | """The `state_size` property of `TacotronDecoderCell`. 122 | 123 | Returns: 124 | An `TacotronDecoderCell` tuple containing shapes used by this object. 125 | """ 126 | return TacotronDecoderCellState( 127 | cell_state=self._cell._cell.state_size, 128 | time=tensor_shape.TensorShape([]), 129 | attention=self._attention_layer_size, 130 | alignments=self._attention_mechanism.alignments_size, 131 | alignment_history=(), 132 | max_attentions=()) 133 | 134 | def zero_state(self, batch_size, dtype): 135 | """Return an initial (zero) state tuple for this `AttentionWrapper`. 136 | 137 | Args: 138 | batch_size: `0D` integer tensor: the batch size. 139 | dtype: The internal state data type. 140 | Returns: 141 | An `TacotronDecoderCellState` tuple containing zeroed out tensors and, 142 | possibly, empty `TensorArray` objects. 
143 | Raises: 144 | ValueError: (or, possibly at runtime, InvalidArgument), if 145 | `batch_size` does not match the output size of the encoder passed 146 | to the wrapper object at initialization time. 147 | """ 148 | with ops.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]): 149 | cell_state = self._cell._cell.zero_state(batch_size, dtype) 150 | error_message = ( 151 | "When calling zero_state of TacotronDecoderCell %s: " % self._base_name + 152 | "Non-matching batch sizes between the memory " 153 | "(encoder output) and the requested batch size.") 154 | with ops.control_dependencies( 155 | self._batch_size_checks(batch_size, error_message)): 156 | cell_state = nest.map_structure( 157 | lambda s: array_ops.identity(s, name="checked_cell_state"), 158 | cell_state) 159 | return TacotronDecoderCellState( 160 | cell_state=cell_state, 161 | time=array_ops.zeros([], dtype=tf.int32), 162 | attention=_zero_state_tensors(self._attention_layer_size, batch_size, 163 | dtype), 164 | alignments=self._attention_mechanism.initial_alignments(batch_size, dtype), 165 | alignment_history=tensor_array_ops.TensorArray(dtype=dtype, size=0, 166 | dynamic_size=True), 167 | max_attentions=tf.zeros((batch_size, ), dtype=tf.int32)) 168 | 169 | def __call__(self, inputs, state): 170 | #Information bottleneck (essential for learning attention) 171 | prenet_output = self._prenet(inputs) 172 | 173 | #Concat context vector and prenet output to form LSTM cells input (input feeding) 174 | LSTM_input = tf.concat([prenet_output, state.attention], axis=-1) 175 | 176 | #Unidirectional LSTM layers 177 | LSTM_output, next_cell_state = self._cell(LSTM_input, state.cell_state) 178 | 179 | 180 | #Compute the attention (context) vector and alignments using 181 | #the new decoder cell hidden state as query vector 182 | #and cumulative alignments to extract location features 183 | #The choice of the new cell hidden state (s_{i}) of the last 184 | #decoder RNN Cell is based on Luong et Al. 
(2015): 185 | #https://arxiv.org/pdf/1508.04025.pdf 186 | previous_alignments = state.alignments 187 | previous_alignment_history = state.alignment_history 188 | context_vector, alignments, cumulated_alignments, max_attentions = _compute_attention(self._attention_mechanism, 189 | LSTM_output, 190 | previous_alignments, 191 | attention_layer=None, 192 | prev_max_attentions=state.max_attentions) 193 | 194 | #Concat LSTM outputs and context vector to form projections inputs 195 | projections_input = tf.concat([LSTM_output, context_vector], axis=-1) 196 | 197 | #Compute predicted frames and predicted 198 | cell_outputs = self._frame_projection(projections_input) 199 | stop_tokens = self._stop_projection(projections_input) 200 | 201 | #Save alignment history 202 | alignment_history = previous_alignment_history.write(state.time, alignments) 203 | 204 | #Prepare next decoder state 205 | next_state = TacotronDecoderCellState( 206 | time=state.time + 1, 207 | cell_state=next_cell_state, 208 | attention=context_vector, 209 | alignments=cumulated_alignments, 210 | alignment_history=alignment_history, 211 | max_attentions=max_attentions) 212 | 213 | return (cell_outputs, stop_tokens), next_state 214 | -------------------------------------------------------------------------------- /tacotron/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .tacotron import Tacotron 2 | 3 | 4 | def create_model(name, hparams): 5 | if name == 'Tacotron': 6 | return Tacotron(hparams) 7 | else: 8 | raise Exception('Unknown model: ' + name) 9 | -------------------------------------------------------------------------------- /tacotron/models/attention.py: -------------------------------------------------------------------------------- 1 | """Attention file for location based attention (compatible with tensorflow attention wrapper)""" 2 | 3 | import tensorflow as tf 4 | from tensorflow.contrib.seq2seq.python.ops.attention_wrapper import BahdanauAttention 5 | from tensorflow.python.layers import core as layers_core 6 | from tensorflow.python.ops import array_ops, math_ops, nn_ops, variable_scope 7 | 8 | 9 | #From https://github.com/tensorflow/tensorflow/blob/r1.7/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py 10 | def _compute_attention(attention_mechanism, cell_output, attention_state, 11 | attention_layer, prev_max_attentions): 12 | """Computes the attention and alignments for a given attention_mechanism.""" 13 | alignments, next_attention_state, max_attentions = attention_mechanism( 14 | cell_output, state=attention_state, prev_max_attentions=prev_max_attentions) 15 | 16 | # Reshape from [batch_size, memory_time] to [batch_size, 1, memory_time] 17 | expanded_alignments = array_ops.expand_dims(alignments, 1) 18 | # Context is the inner product of alignments and values along the 19 | # memory time dimension. 20 | # alignments shape is 21 | # [batch_size, 1, memory_time] 22 | # attention_mechanism.values shape is 23 | # [batch_size, memory_time, memory_size] 24 | # the batched matmul is over memory_time, so the output shape is 25 | # [batch_size, 1, memory_size]. 26 | # we then squeeze out the singleton dim. 
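# Example shapes: with batch_size=32, memory_time=150 and memory_size=512, the batched matmul is
# [32, 1, 150] x [32, 150, 512] -> [32, 1, 512], squeezed to a [32, 512] context vector.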
27 | context = math_ops.matmul(expanded_alignments, attention_mechanism.values) 28 | context = array_ops.squeeze(context, [1]) 29 | 30 | if attention_layer is not None: 31 | attention = attention_layer(array_ops.concat([cell_output, context], 1)) 32 | else: 33 | attention = context 34 | 35 | return attention, alignments, next_attention_state, max_attentions 36 | 37 | 38 | def _location_sensitive_score(W_query, W_fil, W_keys): 39 | """Impelements Bahdanau-style (cumulative) scoring function. 40 | This attention is described in: 41 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben- 42 | gio, “Attention-based models for speech recognition,” in Ad- 43 | vances in Neural Information Processing Systems, 2015, pp. 44 | 577–585. 45 | 46 | ############################################################################# 47 | hybrid attention (content-based + location-based) 48 | f = F * α_{i-1} 49 | energy = dot(v_a, tanh(W_keys(h_enc) + W_query(h_dec) + W_fil(f) + b_a)) 50 | ############################################################################# 51 | 52 | Args: 53 | W_query: Tensor, shape '[batch_size, 1, attention_dim]' to compare to location features. 54 | W_location: processed previous alignments into location features, shape '[batch_size, max_time, attention_dim]' 55 | W_keys: Tensor, shape '[batch_size, max_time, attention_dim]', typically the encoder outputs. 56 | Returns: 57 | A '[batch_size, max_time]' attention score (energy) 58 | """ 59 | # Get the number of hidden units from the trailing dimension of keys 60 | dtype = W_query.dtype 61 | num_units = W_keys.shape[-1].value or array_ops.shape(W_keys)[-1] 62 | 63 | v_a = tf.get_variable( 64 | 'attention_variable_projection', shape=[num_units], dtype=dtype, 65 | initializer=tf.contrib.layers.xavier_initializer()) 66 | b_a = tf.get_variable( 67 | 'attention_bias', shape=[num_units], dtype=dtype, 68 | initializer=tf.zeros_initializer()) 69 | 70 | return tf.reduce_sum(v_a * tf.tanh(W_keys + W_query + W_fil + b_a), [2]) 71 | 72 | def _smoothing_normalization(e): 73 | """Applies a smoothing normalization function instead of softmax 74 | Introduced in: 75 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben- 76 | gio, “Attention-based models for speech recognition,” in Ad- 77 | vances in Neural Information Processing Systems, 2015, pp. 78 | 577–585. 79 | 80 | ############################################################################ 81 | Smoothing normalization function 82 | a_{i, j} = sigmoid(e_{i, j}) / sum_j(sigmoid(e_{i, j})) 83 | ############################################################################ 84 | 85 | Args: 86 | e: matrix [batch_size, max_time(memory_time)]: expected to be energy (score) 87 | values of an attention mechanism 88 | Returns: 89 | matrix [batch_size, max_time]: [0, 1] normalized alignments with possible 90 | attendance to multiple memory time steps. 91 | """ 92 | return tf.nn.sigmoid(e) / tf.reduce_sum(tf.nn.sigmoid(e), axis=-1, keepdims=True) 93 | 94 | 95 | class LocationSensitiveAttention(BahdanauAttention): 96 | """Impelements Bahdanau-style (cumulative) scoring function. 97 | Usually referred to as "hybrid" attention (content-based + location-based) 98 | Extends the additive attention described in: 99 | "D. Bahdanau, K. Cho, and Y. Bengio, “Neural machine transla- 100 | tion by jointly learning to align and translate,” in Proceedings 101 | of ICLR, 2015." 102 | to use previous alignments as additional location features. 103 | 104 | This attention is described in: 105 | J. K. 
Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben- 106 | gio, “Attention-based models for speech recognition,” in Ad- 107 | vances in Neural Information Processing Systems, 2015, pp. 108 | 577–585. 109 | """ 110 | 111 | def __init__(self, 112 | num_units, 113 | memory, 114 | hparams, 115 | is_training, 116 | mask_encoder=True, 117 | memory_sequence_length=None, 118 | smoothing=False, 119 | cumulate_weights=True, 120 | name='LocationSensitiveAttention'): 121 | """Construct the Attention mechanism. 122 | Args: 123 | num_units: The depth of the query mechanism. 124 | memory: The memory to query; usually the output of an RNN encoder. This 125 | tensor should be shaped `[batch_size, max_time, ...]`. 126 | mask_encoder (optional): Boolean, whether to mask encoder paddings. 127 | memory_sequence_length (optional): Sequence lengths for the batch entries 128 | in memory. If provided, the memory tensor rows are masked with zeros 129 | for values past the respective sequence lengths. Only relevant if mask_encoder = True. 130 | smoothing (optional): Boolean. Determines which normalization function to use. 131 | Default normalization function (probablity_fn) is softmax. If smoothing is 132 | enabled, we replace softmax with: 133 | a_{i, j} = sigmoid(e_{i, j}) / sum_j(sigmoid(e_{i, j})) 134 | Introduced in: 135 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben- 136 | gio, “Attention-based models for speech recognition,” in Ad- 137 | vances in Neural Information Processing Systems, 2015, pp. 138 | 577–585. 139 | This is mainly used if the model wants to attend to multiple input parts 140 | at the same decoding step. We probably won't be using it since multiple sound 141 | frames may depend on the same character/phone, probably not the way around. 142 | Note: 143 | We still keep it implemented in case we want to test it. They used it in the 144 | paper in the context of speech recognition, where one phoneme may depend on 145 | multiple subsequent sound frames. 146 | name: Name to use when creating ops. 147 | """ 148 | #Create normalization function 149 | #Setting it to None defaults in using softmax 150 | normalization_function = _smoothing_normalization if (smoothing == True) else None 151 | memory_length = memory_sequence_length if (mask_encoder==True) else None 152 | super(LocationSensitiveAttention, self).__init__( 153 | num_units=num_units, 154 | memory=memory, 155 | memory_sequence_length=memory_length, 156 | probability_fn=normalization_function, 157 | name=name) 158 | 159 | self.location_convolution = tf.layers.Conv1D(filters=hparams.attention_filters, 160 | kernel_size=hparams.attention_kernel, padding='same', use_bias=True, 161 | bias_initializer=tf.zeros_initializer(), name='location_features_convolution') 162 | self.location_layer = tf.layers.Dense(units=num_units, use_bias=False, 163 | dtype=tf.float32, name='location_features_layer') 164 | self._cumulate = cumulate_weights 165 | self.synthesis_constraint = hparams.synthesis_constraint and not is_training 166 | self.attention_win_size = tf.convert_to_tensor(hparams.attention_win_size, dtype=tf.int32) 167 | self.constraint_type = hparams.synthesis_constraint_type 168 | 169 | def __call__(self, query, state, prev_max_attentions): 170 | """Score the query based on the keys and values. 171 | Args: 172 | query: Tensor of dtype matching `self.values` and shape 173 | `[batch_size, query_depth]`. 
174 | state (previous alignments): Tensor of dtype matching `self.values` and shape 175 | `[batch_size, alignments_size]` 176 | (`alignments_size` is memory's `max_time`). 177 | Returns: 178 | alignments: Tensor of dtype matching `self.values` and shape 179 | `[batch_size, alignments_size]` (`alignments_size` is memory's 180 | `max_time`). 181 | """ 182 | previous_alignments = state 183 | with variable_scope.variable_scope(None, "Location_Sensitive_Attention", [query]): 184 | 185 | # processed_query shape [batch_size, query_depth] -> [batch_size, attention_dim] 186 | processed_query = self.query_layer(query) if self.query_layer else query 187 | # -> [batch_size, 1, attention_dim] 188 | processed_query = tf.expand_dims(processed_query, 1) 189 | 190 | # processed_location_features shape [batch_size, max_time, attention dimension] 191 | # [batch_size, max_time] -> [batch_size, max_time, 1] 192 | expanded_alignments = tf.expand_dims(previous_alignments, axis=2) 193 | # location features [batch_size, max_time, filters] 194 | f = self.location_convolution(expanded_alignments) 195 | # Projected location features [batch_size, max_time, attention_dim] 196 | processed_location_features = self.location_layer(f) 197 | 198 | # energy shape [batch_size, max_time] 199 | energy = _location_sensitive_score(processed_query, processed_location_features, self.keys) 200 | 201 | if self.synthesis_constraint: 202 | Tx = tf.shape(energy)[-1] 203 | # prev_max_attentions = tf.squeeze(prev_max_attentions, [-1]) 204 | if self.constraint_type == 'monotonic': 205 | key_masks = tf.sequence_mask(prev_max_attentions, Tx) 206 | reverse_masks = tf.sequence_mask(Tx - self.attention_win_size - prev_max_attentions, Tx)[:, ::-1] 207 | else: 208 | assert self.constraint_type == 'window' 209 | key_masks = tf.sequence_mask(prev_max_attentions - (self.attention_win_size // 2 + (self.attention_win_size % 2 != 0)), Tx) 210 | reverse_masks = tf.sequence_mask(Tx - (self.attention_win_size // 2) - prev_max_attentions, Tx)[:, ::-1] 211 | 212 | masks = tf.logical_or(key_masks, reverse_masks) 213 | paddings = tf.ones_like(energy) * (-2 ** 32 + 1) # (N, Ty/r, Tx) 214 | energy = tf.where(tf.equal(masks, False), energy, paddings) 215 | 216 | # alignments shape = energy shape = [batch_size, max_time] 217 | alignments = self._probability_fn(energy, previous_alignments) 218 | max_attentions = tf.argmax(alignments, -1, output_type=tf.int32) # (N, Ty/r) 219 | 220 | # Cumulate alignments 221 | if self._cumulate: 222 | next_state = alignments + previous_alignments 223 | else: 224 | next_state = alignments 225 | 226 | return alignments, next_state, max_attentions 227 | -------------------------------------------------------------------------------- /tacotron/models/custom_decoder.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | 3 | import collections 4 | 5 | import tensorflow as tf 6 | from tacotron.models.helpers import TacoTestHelper, TacoTrainingHelper 7 | from tensorflow.contrib.seq2seq.python.ops import decoder 8 | from tensorflow.contrib.seq2seq.python.ops import helper as helper_py 9 | from tensorflow.python.framework import ops, tensor_shape 10 | from tensorflow.python.layers import base as layers_base 11 | from tensorflow.python.ops import rnn_cell_impl 12 | from tensorflow.python.util import nest 13 | 14 | 15 | class CustomDecoderOutput( 16 | collections.namedtuple("CustomDecoderOutput", ("rnn_output", "token_output", "sample_id"))): 
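# rnn_output holds the decoder frame projections, token_output the stop-token predictions,
# and sample_id the helper's (ignored) sample ids; the subclass only names the namedtuple fields.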
17 | pass 18 | 19 | 20 | class CustomDecoder(decoder.Decoder): 21 | """Custom sampling decoder. 22 | 23 | Allows for stop token prediction at inference time 24 | and returns equivalent loss in training time. 25 | 26 | Note: 27 | Only use this decoder with Tacotron 2 as it only accepts tacotron custom helpers 28 | """ 29 | 30 | def __init__(self, cell, helper, initial_state, output_layer=None): 31 | """Initialize CustomDecoder. 32 | Args: 33 | cell: An `RNNCell` instance. 34 | helper: A `Helper` instance. 35 | initial_state: A (possibly nested tuple of...) tensors and TensorArrays. 36 | The initial state of the RNNCell. 37 | output_layer: (Optional) An instance of `tf.layers.Layer`, i.e., 38 | `tf.layers.Dense`. Optional layer to apply to the RNN output prior 39 | to storing the result or sampling. 40 | Raises: 41 | TypeError: if `cell`, `helper` or `output_layer` have an incorrect type. 42 | """ 43 | rnn_cell_impl.assert_like_rnncell(type(cell), cell) 44 | if not isinstance(helper, helper_py.Helper): 45 | raise TypeError("helper must be a Helper, received: %s" % type(helper)) 46 | if (output_layer is not None 47 | and not isinstance(output_layer, layers_base.Layer)): 48 | raise TypeError( 49 | "output_layer must be a Layer, received: %s" % type(output_layer)) 50 | self._cell = cell 51 | self._helper = helper 52 | self._initial_state = initial_state 53 | self._output_layer = output_layer 54 | 55 | @property 56 | def batch_size(self): 57 | return self._helper.batch_size 58 | 59 | def _rnn_output_size(self): 60 | size = self._cell.output_size 61 | if self._output_layer is None: 62 | return size 63 | else: 64 | # To use layer's compute_output_shape, we need to convert the 65 | # RNNCell's output_size entries into shapes with an unknown 66 | # batch size. We then pass this through the layer's 67 | # compute_output_shape and read off all but the first (batch) 68 | # dimensions to get the output size of the rnn with the layer 69 | # applied to the top. 70 | output_shape_with_unknown_batch = nest.map_structure( 71 | lambda s: tensor_shape.TensorShape([None]).concatenate(s), 72 | size) 73 | layer_output_shape = self._output_layer._compute_output_shape( # pylint: disable=protected-access 74 | output_shape_with_unknown_batch) 75 | return nest.map_structure(lambda s: s[1:], layer_output_shape) 76 | 77 | @property 78 | def output_size(self): 79 | # Return the cell output and the id 80 | return CustomDecoderOutput( 81 | rnn_output=self._rnn_output_size(), 82 | token_output=self._helper.token_output_size, 83 | sample_id=self._helper.sample_ids_shape) 84 | 85 | @property 86 | def output_dtype(self): 87 | # Assume the dtype of the cell is the output_size structure 88 | # containing the input_state's first component's dtype. 89 | # Return that structure and the sample_ids_dtype from the helper. 90 | dtype = nest.flatten(self._initial_state)[0].dtype 91 | return CustomDecoderOutput( 92 | nest.map_structure(lambda _: dtype, self._rnn_output_size()), 93 | tf.float32, 94 | self._helper.sample_ids_dtype) 95 | 96 | def initialize(self, name=None): 97 | """Initialize the decoder. 98 | Args: 99 | name: Name scope for any created operations. 100 | Returns: 101 | `(finished, first_inputs, initial_state)`. 102 | """ 103 | return self._helper.initialize() + (self._initial_state,) 104 | 105 | def step(self, time, inputs, state, name=None): 106 | """Perform a custom decoding step. 107 | Enables for dyanmic prediction 108 | Args: 109 | time: scalar `int32` tensor. 110 | inputs: A (structure of) input tensors. 
111 | state: A (structure of) state tensors and TensorArrays. 112 | name: Name scope for any created operations. 113 | Returns: 114 | `(outputs, next_state, next_inputs, finished)`. 115 | """ 116 | with ops.name_scope(name, "CustomDecoderStep", (time, inputs, state)): 117 | #Call outputprojection wrapper cell 118 | (cell_outputs, stop_token), cell_state = self._cell(inputs, state) 119 | 120 | #apply output_layer (if existant) 121 | if self._output_layer is not None: 122 | cell_outputs = self._output_layer(cell_outputs) 123 | sample_ids = self._helper.sample( 124 | time=time, outputs=cell_outputs, state=cell_state) 125 | 126 | (finished, next_inputs, next_state) = self._helper.next_inputs( 127 | time=time, 128 | outputs=cell_outputs, 129 | state=cell_state, 130 | sample_ids=sample_ids, 131 | stop_token_prediction=stop_token) 132 | 133 | outputs = CustomDecoderOutput(cell_outputs, stop_token, sample_ids) 134 | return (outputs, next_state, next_inputs, finished) 135 | -------------------------------------------------------------------------------- /tacotron/models/helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from tensorflow.contrib.seq2seq import Helper 4 | 5 | 6 | class TacoTestHelper(Helper): 7 | def __init__(self, batch_size, hparams): 8 | with tf.name_scope('TacoTestHelper'): 9 | self._batch_size = batch_size 10 | self._output_dim = hparams.num_mels 11 | self._reduction_factor = hparams.outputs_per_step 12 | self.stop_at_any = hparams.stop_at_any 13 | 14 | @property 15 | def batch_size(self): 16 | return self._batch_size 17 | 18 | @property 19 | def token_output_size(self): 20 | return self._reduction_factor 21 | 22 | @property 23 | def sample_ids_shape(self): 24 | return tf.TensorShape([]) 25 | 26 | @property 27 | def sample_ids_dtype(self): 28 | return np.int32 29 | 30 | def initialize(self, name=None): 31 | return (tf.tile([False], [self._batch_size]), _go_frames(self._batch_size, self._output_dim)) 32 | 33 | def sample(self, time, outputs, state, name=None): 34 | return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them 35 | 36 | def next_inputs(self, time, outputs, state, sample_ids, stop_token_prediction, name=None): 37 | '''Stop on EOS. Otherwise, pass the last output as the next input and pass through state.''' 38 | with tf.name_scope('TacoTestHelper'): 39 | #A sequence is finished when the output probability is > 0.5 40 | finished = tf.cast(tf.round(stop_token_prediction), tf.bool) 41 | 42 | #Since we are predicting r frames at each step, two modes are 43 | #then possible: 44 | # Stop when the model outputs a p > 0.5 for any frame between r frames (Recommended) 45 | # Stop when the model outputs a p > 0.5 for all r frames (Safer) 46 | #Note: 47 | # With enough training steps, the model should be able to predict when to stop correctly 48 | # and the use of stop_at_any = True would be recommended. If however the model didn't 49 | # learn to stop correctly yet, (stops too soon) one could choose to use the safer option 50 | # to get a correct synthesis 51 | if self.stop_at_any: 52 | finished = tf.reduce_any(tf.reduce_all(finished, axis=0)) #Recommended 53 | else: 54 | finished = tf.reduce_all(tf.reduce_all(finished, axis=0)) #Safer option 55 | 56 | # Feed last output frame as next input. 
outputs is [N, output_dim * r] 57 | next_inputs = outputs[:, -self._output_dim:] 58 | next_state = state 59 | return (finished, next_inputs, next_state) 60 | 61 | 62 | class TacoTrainingHelper(Helper): 63 | def __init__(self, batch_size, targets, hparams, gta, evaluating, global_step): 64 | # inputs is [N, T_in], targets is [N, T_out, D] 65 | with tf.name_scope('TacoTrainingHelper'): 66 | self._batch_size = batch_size 67 | self._output_dim = hparams.num_mels 68 | self._reduction_factor = hparams.outputs_per_step 69 | self._ratio = tf.convert_to_tensor(hparams.tacotron_teacher_forcing_ratio) 70 | self.gta = gta 71 | self.eval = evaluating 72 | self._hparams = hparams 73 | self.global_step = global_step 74 | 75 | r = self._reduction_factor 76 | # Feed every r-th target frame as input 77 | self._targets = targets[:, r-1::r, :] 78 | 79 | #Maximal sequence length 80 | self._lengths = tf.tile([tf.shape(self._targets)[1]], [self._batch_size]) 81 | 82 | @property 83 | def batch_size(self): 84 | return self._batch_size 85 | 86 | @property 87 | def token_output_size(self): 88 | return self._reduction_factor 89 | 90 | @property 91 | def sample_ids_shape(self): 92 | return tf.TensorShape([]) 93 | 94 | @property 95 | def sample_ids_dtype(self): 96 | return np.int32 97 | 98 | def initialize(self, name=None): 99 | #Compute teacher forcing ratio for this global step. 100 | #In GTA mode, override teacher forcing scheme to work with full teacher forcing 101 | if self.gta: 102 | self._ratio = tf.convert_to_tensor(1.) #Force GTA model to always feed ground-truth 103 | elif self.eval and self._hparams.tacotron_natural_eval: 104 | self._ratio = tf.convert_to_tensor(0.) #Force eval model to always feed predictions 105 | else: 106 | if self._hparams.tacotron_teacher_forcing_mode == 'scheduled': 107 | self._ratio = _teacher_forcing_ratio_decay(self._hparams.tacotron_teacher_forcing_init_ratio, 108 | self.global_step, self._hparams) 109 | 110 | return (tf.tile([False], [self._batch_size]), _go_frames(self._batch_size, self._output_dim)) 111 | 112 | def sample(self, time, outputs, state, name=None): 113 | return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them 114 | 115 | def next_inputs(self, time, outputs, state, sample_ids, stop_token_prediction, name=None): 116 | with tf.name_scope(name or 'TacoTrainingHelper'): 117 | #synthesis stop (we let the model see paddings as we mask them when computing loss functions) 118 | finished = (time + 1 >= self._lengths) 119 | 120 | #Pick previous outputs randomly with respect to teacher forcing ratio 121 | next_inputs = tf.cond( 122 | tf.less(tf.random_uniform([], minval=0, maxval=1, dtype=tf.float32), self._ratio), 123 | lambda: self._targets[:, time, :], #Teacher-forcing: return true frame 124 | lambda: outputs[:,-self._output_dim:]) 125 | 126 | #Pass on state 127 | next_state = state 128 | return (finished, next_inputs, next_state) 129 | 130 | 131 | def _go_frames(batch_size, output_dim): 132 | '''Returns all-zero frames for a given batch size and output dimension''' 133 | return tf.tile([[0.0]], [batch_size, output_dim]) 134 | 135 | def _teacher_forcing_ratio_decay(init_tfr, global_step, hparams): 136 | ################################################################# 137 | # Narrow Cosine Decay: 138 | 139 | # Phase 1: tfr = init 140 | # We only start learning rate decay after 10k steps 141 | 142 | # Phase 2: tfr in ]init, final[ 143 | # decay reach minimal value at step ~40k 144 | 145 | # Phase 3: tfr = final 146 | # clip by minimal teacher forcing ratio 
value (step >~ 40k) 147 | ################################################################# 148 | #Pick final teacher forcing rate value 149 | if hparams.tacotron_teacher_forcing_final_ratio is not None: 150 | alpha = float(hparams.tacotron_teacher_forcing_final_ratio / hparams.tacotron_teacher_forcing_init_ratio) 151 | 152 | else: 153 | assert hparams.tacotron_teacher_forcing_decay_alpha is not None 154 | alpha = hparams.tacotron_teacher_forcing_decay_alpha 155 | 156 | #Compute natural cosine decay 157 | tfr = tf.train.cosine_decay(init_tfr, 158 | global_step=global_step - hparams.tacotron_teacher_forcing_start_decay, #tfr ~= init at step 10k 159 | decay_steps=hparams.tacotron_teacher_forcing_decay_steps, #tfr ~= final at step ~40k 160 | alpha=alpha, #tfr = alpha% of init_tfr as final value 161 | name='tfr_cosine_decay') 162 | 163 | #force teacher forcing ratio to take initial value when global step < start decay step. 164 | narrow_tfr = tf.cond( 165 | tf.less(global_step, tf.convert_to_tensor(hparams.tacotron_teacher_forcing_start_decay)), 166 | lambda: tf.convert_to_tensor(init_tfr), 167 | lambda: tfr) 168 | 169 | return narrow_tfr -------------------------------------------------------------------------------- /tacotron/synthesize.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import re 4 | import time 5 | from time import sleep 6 | 7 | import tensorflow as tf 8 | from hparams import hparams, hparams_debug_string 9 | from infolog import log 10 | from tacotron.synthesizer import Synthesizer 11 | from tqdm import tqdm 12 | 13 | 14 | def generate_fast(model, text): 15 | model.synthesize([text], None, None, None, None) 16 | 17 | 18 | def run_live(args, checkpoint_path, hparams): 19 | #Log to Terminal without keeping any records in files 20 | log(hparams_debug_string()) 21 | synth = Synthesizer() 22 | synth.load(checkpoint_path, hparams) 23 | 24 | #Generate fast greeting message 25 | greetings = 'Hello, Welcome to the Live testing tool. Please type a message and I will try to read it!' 26 | log(greetings) 27 | generate_fast(synth, greetings) 28 | 29 | #Interaction loop 30 | while True: 31 | try: 32 | text = input() 33 | generate_fast(synth, text) 34 | 35 | except KeyboardInterrupt: 36 | leave = 'Thank you for testing our features. see you soon.' 
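# Synthesize the farewell message, then pause briefly before leaving the interaction loop.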
37 | log(leave) 38 | generate_fast(synth, leave) 39 | sleep(2) 40 | break 41 | 42 | def run_eval(args, checkpoint_path, output_dir, hparams, sentences): 43 | eval_dir = os.path.join(output_dir, 'eval') 44 | log_dir = os.path.join(output_dir, 'logs-eval') 45 | 46 | if args.model == 'Tacotron-2': 47 | assert os.path.normpath(eval_dir) == os.path.normpath(args.mels_dir) 48 | 49 | #Create output path if it doesn't exist 50 | os.makedirs(eval_dir, exist_ok=True) 51 | os.makedirs(log_dir, exist_ok=True) 52 | os.makedirs(os.path.join(log_dir, 'wavs'), exist_ok=True) 53 | os.makedirs(os.path.join(log_dir, 'plots'), exist_ok=True) 54 | 55 | log(hparams_debug_string()) 56 | synth = Synthesizer() 57 | synth.load(checkpoint_path, hparams) 58 | 59 | #Set inputs batch wise 60 | sentences = [sentences[i: i+hparams.tacotron_synthesis_batch_size] for i in range(0, len(sentences), hparams.tacotron_synthesis_batch_size)] 61 | 62 | log('Starting Synthesis') 63 | with open(os.path.join(eval_dir, 'map.txt'), 'w') as file: 64 | for i, texts in enumerate(tqdm(sentences)): 65 | start = time.time() 66 | basenames = ['batch_{}_sentence_{}'.format(i, j) for j in range(len(texts))] 67 | mel_filenames, speaker_ids = synth.synthesize(texts, basenames, eval_dir, log_dir, None) 68 | 69 | for elems in zip(texts, mel_filenames, speaker_ids): 70 | file.write('|'.join([str(x) for x in elems]) + '\n') 71 | log('synthesized mel spectrograms at {}'.format(eval_dir)) 72 | return eval_dir 73 | 74 | def run_synthesis(args, checkpoint_path, output_dir, hparams): 75 | GTA = (args.GTA == 'True') 76 | if GTA: 77 | synth_dir = os.path.join(output_dir, 'gta') 78 | 79 | #Create output path if it doesn't exist 80 | os.makedirs(synth_dir, exist_ok=True) 81 | else: 82 | synth_dir = os.path.join(output_dir, 'natural') 83 | 84 | #Create output path if it doesn't exist 85 | os.makedirs(synth_dir, exist_ok=True) 86 | 87 | 88 | metadata_filename = os.path.join(args.input_dir, 'train.txt') 89 | log(hparams_debug_string()) 90 | synth = Synthesizer() 91 | synth.load(checkpoint_path, hparams, gta=GTA) 92 | with open(metadata_filename, encoding='utf-8') as f: 93 | metadata = [line.strip().split('|') for line in f] 94 | frame_shift_ms = hparams.hop_size / hparams.sample_rate 95 | hours = sum([int(x[4]) for x in metadata]) * frame_shift_ms / (3600) 96 | log('Loaded metadata for {} examples ({:.2f} hours)'.format(len(metadata), hours)) 97 | 98 | #Set inputs batch wise 99 | metadata = [metadata[i: i+hparams.tacotron_synthesis_batch_size] for i in range(0, len(metadata), hparams.tacotron_synthesis_batch_size)] 100 | 101 | log('Starting Synthesis') 102 | mel_dir = os.path.join(args.input_dir, 'mels') 103 | wav_dir = os.path.join(args.input_dir, 'audio') 104 | with open(os.path.join(synth_dir, 'map.txt'), 'w') as file: 105 | for i, meta in enumerate(tqdm(metadata)): 106 | texts = [m[5] for m in meta] 107 | mel_filenames = [os.path.join(mel_dir, m[1]) for m in meta] 108 | wav_filenames = [os.path.join(wav_dir, m[0]) for m in meta] 109 | basenames = [os.path.basename(m).replace('.npy', '').replace('mel-', '') for m in mel_filenames] 110 | mel_output_filenames, speaker_ids = synth.synthesize(texts, basenames, synth_dir, None, mel_filenames) 111 | 112 | for elems in zip(wav_filenames, mel_filenames, mel_output_filenames, speaker_ids, texts): 113 | file.write('|'.join([str(x) for x in elems]) + '\n') 114 | log('synthesized mel spectrograms at {}'.format(synth_dir)) 115 | return os.path.join(synth_dir, 'map.txt') 116 | 117 | def tacotron_synthesize(args, hparams, 
checkpoint, sentences=None): 118 | output_dir = 'tacotron_' + args.output_dir 119 | 120 | try: 121 | checkpoint_path = tf.train.get_checkpoint_state(checkpoint).model_checkpoint_path 122 | log('loaded model at {}'.format(checkpoint_path)) 123 | except: 124 | raise RuntimeError('Failed to load checkpoint at {}'.format(checkpoint)) 125 | 126 | if hparams.tacotron_synthesis_batch_size < hparams.tacotron_num_gpus: 127 | raise ValueError('Defined synthesis batch size {} is smaller than minimum required {} (num_gpus)! Please verify your synthesis batch size choice.'.format( 128 | hparams.tacotron_synthesis_batch_size, hparams.tacotron_num_gpus)) 129 | 130 | if hparams.tacotron_synthesis_batch_size % hparams.tacotron_num_gpus != 0: 131 | raise ValueError('Defined synthesis batch size {} is not a multiple of {} (num_gpus)! Please verify your synthesis batch size choice!'.format( 132 | hparams.tacotron_synthesis_batch_size, hparams.tacotron_num_gpus)) 133 | 134 | if args.mode == 'eval': 135 | return run_eval(args, checkpoint_path, output_dir, hparams, sentences) 136 | elif args.mode == 'synthesis': 137 | return run_synthesis(args, checkpoint_path, output_dir, hparams) 138 | else: 139 | run_live(args, checkpoint_path, hparams) 140 | -------------------------------------------------------------------------------- /tacotron/synthesizer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import platform 3 | from datetime import datetime 4 | 5 | import numpy as np 6 | import pyaudio 7 | import sounddevice as sd 8 | import tensorflow as tf 9 | from datasets import audio 10 | from infolog import log 11 | from librosa import effects 12 | from tacotron.models import create_model 13 | from tacotron.utils import plot 14 | from tacotron.utils.text import text_to_sequence 15 | 16 | 17 | class Synthesizer: 18 | def load(self, checkpoint_path, hparams, gta=False, model_name='Tacotron'): 19 | log('Constructing model: %s' % model_name) 20 | #Force the batch size to be known in order to use attention masking in batch synthesis 21 | inputs = tf.placeholder(tf.int32, (None, None), name='inputs') 22 | input_lengths = tf.placeholder(tf.int32, (None), name='input_lengths') 23 | targets = tf.placeholder(tf.float32, (None, None, hparams.num_mels), name='mel_targets') 24 | split_infos = tf.placeholder(tf.int32, shape=(hparams.tacotron_num_gpus, None), name='split_infos') 25 | with tf.variable_scope('Tacotron_model', reuse=tf.AUTO_REUSE) as scope: 26 | self.model = create_model(model_name, hparams) 27 | if gta: 28 | self.model.initialize(inputs, input_lengths, targets, gta=gta, split_infos=split_infos) 29 | else: 30 | self.model.initialize(inputs, input_lengths, split_infos=split_infos) 31 | 32 | self.mel_outputs = self.model.tower_mel_outputs 33 | self.linear_outputs = self.model.tower_linear_outputs if (hparams.predict_linear and not gta) else None 34 | self.alignments = self.model.tower_alignments 35 | self.stop_token_prediction = self.model.tower_stop_token_prediction 36 | self.targets = targets 37 | 38 | if hparams.GL_on_GPU: 39 | self.GLGPU_mel_inputs = tf.placeholder(tf.float32, (None, hparams.num_mels), name='GLGPU_mel_inputs') 40 | self.GLGPU_lin_inputs = tf.placeholder(tf.float32, (None, hparams.num_freq), name='GLGPU_lin_inputs') 41 | 42 | self.GLGPU_mel_outputs = audio.inv_mel_spectrogram_tensorflow(self.GLGPU_mel_inputs, hparams) 43 | self.GLGPU_lin_outputs = audio.inv_linear_spectrogram_tensorflow(self.GLGPU_lin_inputs, hparams) 44 | 45 | self.gta = gta 46 |
self._hparams = hparams 47 | #pad input sequences with the 0 ( _ ) 48 | self._pad = 0 49 | #explicitely setting the padding to a value that doesn't originally exist in the spectogram 50 | #to avoid any possible conflicts, without affecting the output range of the model too much 51 | if hparams.symmetric_mels: 52 | self._target_pad = -hparams.max_abs_value 53 | else: 54 | self._target_pad = 0. 55 | 56 | self.inputs = inputs 57 | self.input_lengths = input_lengths 58 | self.targets = targets 59 | self.split_infos = split_infos 60 | 61 | log('Loading checkpoint: %s' % checkpoint_path) 62 | #Memory allocation on the GPUs as needed 63 | config = tf.ConfigProto() 64 | config.gpu_options.allow_growth = True 65 | config.allow_soft_placement = True 66 | 67 | self.session = tf.Session(config=config) 68 | self.session.run(tf.global_variables_initializer()) 69 | 70 | saver = tf.train.Saver() 71 | saver.restore(self.session, checkpoint_path) 72 | 73 | 74 | def synthesize(self, texts, basenames, out_dir, log_dir, mel_filenames): 75 | hparams = self._hparams 76 | cleaner_names = [x.strip() for x in hparams.cleaners.split(',')] 77 | #[-max, max] or [0,max] 78 | T2_output_range = (-hparams.max_abs_value, hparams.max_abs_value) if hparams.symmetric_mels else (0, hparams.max_abs_value) 79 | 80 | #Repeat last sample until number of samples is dividable by the number of GPUs (last run scenario) 81 | while len(texts) % hparams.tacotron_synthesis_batch_size != 0: 82 | texts.append(texts[-1]) 83 | basenames.append(basenames[-1]) 84 | if mel_filenames is not None: 85 | mel_filenames.append(mel_filenames[-1]) 86 | 87 | assert 0 == len(texts) % self._hparams.tacotron_num_gpus 88 | seqs = [np.asarray(text_to_sequence(text, cleaner_names)) for text in texts] 89 | input_lengths = [len(seq) for seq in seqs] 90 | 91 | size_per_device = len(seqs) // self._hparams.tacotron_num_gpus 92 | 93 | #Pad inputs according to each GPU max length 94 | input_seqs = None 95 | split_infos = [] 96 | for i in range(self._hparams.tacotron_num_gpus): 97 | device_input = seqs[size_per_device*i: size_per_device*(i+1)] 98 | device_input, max_seq_len = self._prepare_inputs(device_input) 99 | input_seqs = np.concatenate((input_seqs, device_input), axis=1) if input_seqs is not None else device_input 100 | split_infos.append([max_seq_len, 0, 0, 0]) 101 | 102 | feed_dict = { 103 | self.inputs: input_seqs, 104 | self.input_lengths: np.asarray(input_lengths, dtype=np.int32), 105 | } 106 | 107 | if self.gta: 108 | np_targets = [np.load(mel_filename) for mel_filename in mel_filenames] 109 | target_lengths = [len(np_target) for np_target in np_targets] 110 | 111 | #pad targets according to each GPU max length 112 | target_seqs = None 113 | for i in range(self._hparams.tacotron_num_gpus): 114 | device_target = np_targets[size_per_device*i: size_per_device*(i+1)] 115 | device_target, max_target_len = self._prepare_targets(device_target, self._hparams.outputs_per_step) 116 | target_seqs = np.concatenate((target_seqs, device_target), axis=1) if target_seqs is not None else device_target 117 | split_infos[i][1] = max_target_len #Not really used but setting it in case for future development maybe? 
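# Note: _prepare_targets (defined further down in this file) pads every mel in the device group
# to one common length, rounded up to a multiple of outputs_per_step (the decoder reduction
# factor) and filled with self._target_pad. E.g. with outputs_per_step=2, if the longest mel in
# a group has 113 frames, every mel in that group is padded to 114 frames.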
118 | 119 | feed_dict[self.targets] = target_seqs 120 | assert len(np_targets) == len(texts) 121 | 122 | feed_dict[self.split_infos] = np.asarray(split_infos, dtype=np.int32) 123 | 124 | if self.gta or not hparams.predict_linear: 125 | mels, alignments, stop_tokens = self.session.run([self.mel_outputs, self.alignments, self.stop_token_prediction], feed_dict=feed_dict) 126 | 127 | #Linearize outputs (n_gpus -> 1D) 128 | mels = [mel for gpu_mels in mels for mel in gpu_mels] 129 | alignments = [align for gpu_aligns in alignments for align in gpu_aligns] 130 | stop_tokens = [token for gpu_token in stop_tokens for token in gpu_token] 131 | 132 | if not self.gta: 133 | #Natural batch synthesis 134 | #Get Mel lengths for the entire batch from stop_tokens predictions 135 | target_lengths = self._get_output_lengths(stop_tokens) 136 | 137 | #Take off the batch wise padding 138 | mels = [mel[:target_length, :] for mel, target_length in zip(mels, target_lengths)] 139 | assert len(mels) == len(texts) 140 | 141 | else: 142 | linears, mels, alignments, stop_tokens = self.session.run([self.linear_outputs, self.mel_outputs, self.alignments, self.stop_token_prediction], feed_dict=feed_dict) 143 | 144 | #Linearize outputs (1D arrays) 145 | linears = [linear for gpu_linear in linears for linear in gpu_linear] 146 | mels = [mel for gpu_mels in mels for mel in gpu_mels] 147 | alignments = [align for gpu_aligns in alignments for align in gpu_aligns] 148 | stop_tokens = [token for gpu_token in stop_tokens for token in gpu_token] 149 | 150 | #Natural batch synthesis 151 | #Get Mel/Linear lengths for the entire batch from stop_tokens predictions 152 | target_lengths = self._get_output_lengths(stop_tokens) 153 | 154 | #Take off the batch wise padding 155 | mels = [mel[:target_length, :] for mel, target_length in zip(mels, target_lengths)] 156 | linears = [linear[:target_length, :] for linear, target_length in zip(linears, target_lengths)] 157 | linears = np.clip(linears, T2_output_range[0], T2_output_range[1]) 158 | assert len(mels) == len(linears) == len(texts) 159 | 160 | mels = np.clip(mels, T2_output_range[0], T2_output_range[1]) 161 | 162 | if basenames is None: 163 | #Generate wav and read it 164 | if hparams.GL_on_GPU: 165 | wav = self.session.run(self.GLGPU_mel_outputs, feed_dict={self.GLGPU_mel_inputs: mels[0]}) 166 | wav = audio.inv_preemphasis(wav, hparams.preemphasis, hparams.preemphasize) 167 | else: 168 | wav = audio.inv_mel_spectrogram(mels[0].T, hparams) 169 | audio.save_wav(wav, 'temp.wav', sr=hparams.sample_rate) #Find a better way 170 | 171 | if platform.system() == 'Linux': 172 | #Linux wav reader 173 | os.system('aplay temp.wav') 174 | 175 | elif platform.system() == 'Windows': 176 | #windows wav reader 177 | os.system('start /min mplay32 /play /close temp.wav') 178 | 179 | else: 180 | raise RuntimeError('Your OS type is not supported yet, please add it to "tacotron/synthesizer.py, line-165" and feel free to make a Pull Request ;) Thanks!') 181 | 182 | return 183 | 184 | 185 | saved_mels_paths = [] 186 | speaker_ids = [] 187 | for i, mel in enumerate(mels): 188 | #Get speaker id for global conditioning (only used with GTA generally) 189 | if hparams.gin_channels > 0: 190 | raise RuntimeError('Please set the speaker_id rule in line 99 of tacotron/synthesizer.py to allow for global condition usage later.') 191 | speaker_id = '' #set the rule to determine speaker id. By using the file basename maybe? 
(basenames are inside "basenames" variable) 192 | speaker_ids.append(speaker_id) #finish by appending the speaker id. (allows for different speakers per batch if your model is multispeaker) 193 | else: 194 | speaker_id = '' 195 | speaker_ids.append(speaker_id) 196 | 197 | # Write the spectrogram to disk 198 | # Note: outputs mel-spectrogram files and target ones have same names, just different folders 199 | mel_filename = os.path.join(out_dir, 'mel-{}.npy'.format(basenames[i])) 200 | np.save(mel_filename, mel, allow_pickle=False) 201 | saved_mels_paths.append(mel_filename) 202 | 203 | if log_dir is not None: 204 | #save wav (mel -> wav) 205 | if hparams.GL_on_GPU: 206 | wav = self.session.run(self.GLGPU_mel_outputs, feed_dict={self.GLGPU_mel_inputs: mel}) 207 | wav = audio.inv_preemphasis(wav, hparams.preemphasis, hparams.preemphasize) 208 | else: 209 | wav = audio.inv_mel_spectrogram(mel.T, hparams) 210 | audio.save_wav(wav, os.path.join(log_dir, 'wavs/wav-{}-mel.wav'.format(basenames[i])), sr=hparams.sample_rate) 211 | 212 | #save alignments 213 | plot.plot_alignment(alignments[i], os.path.join(log_dir, 'plots/alignment-{}.png'.format(basenames[i])), 214 | title='{}'.format(texts[i]), split_title=True, max_len=target_lengths[i]) 215 | 216 | #save mel spectrogram plot 217 | plot.plot_spectrogram(mel, os.path.join(log_dir, 'plots/mel-{}.png'.format(basenames[i])), 218 | title='{}'.format(texts[i]), split_title=True) 219 | 220 | if hparams.predict_linear: 221 | #save wav (linear -> wav) 222 | if hparams.GL_on_GPU: 223 | wav = self.session.run(self.GLGPU_lin_outputs, feed_dict={self.GLGPU_lin_inputs: linears[i]}) 224 | wav = audio.inv_preemphasis(wav, hparams.preemphasis, hparams.preemphasize) 225 | else: 226 | wav = audio.inv_linear_spectrogram(linears[i].T, hparams) 227 | audio.save_wav(wav, os.path.join(log_dir, 'wavs/wav-{}-linear.wav'.format(basenames[i])), sr=hparams.sample_rate) 228 | 229 | #save linear spectrogram plot 230 | plot.plot_spectrogram(linears[i], os.path.join(log_dir, 'plots/linear-{}.png'.format(basenames[i])), 231 | title='{}'.format(texts[i]), split_title=True, auto_aspect=True) 232 | 233 | return saved_mels_paths, speaker_ids 234 | 235 | def _round_up(self, x, multiple): 236 | remainder = x % multiple 237 | return x if remainder == 0 else x + multiple - remainder 238 | 239 | def _prepare_inputs(self, inputs): 240 | max_len = max([len(x) for x in inputs]) 241 | return np.stack([self._pad_input(x, max_len) for x in inputs]), max_len 242 | 243 | def _pad_input(self, x, length): 244 | return np.pad(x, (0, length - x.shape[0]), mode='constant', constant_values=self._pad) 245 | 246 | def _prepare_targets(self, targets, alignment): 247 | max_len = max([len(t) for t in targets]) 248 | data_len = self._round_up(max_len, alignment) 249 | return np.stack([self._pad_target(t, data_len) for t in targets]), data_len 250 | 251 | def _pad_target(self, t, length): 252 | return np.pad(t, [(0, length - t.shape[0]), (0, 0)], mode='constant', constant_values=self._target_pad) 253 | 254 | def _get_output_lengths(self, stop_tokens): 255 | #Determine each mel length by the stop token predictions. 
(len = first occurence of 1 in stop_tokens row wise) 256 | output_lengths = [row.index(1) if 1 in row else len(row) for row in np.round(stop_tokens).tolist()] 257 | return output_lengths 258 | -------------------------------------------------------------------------------- /tacotron/train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import subprocess 4 | import time 5 | import traceback 6 | from datetime import datetime 7 | 8 | import infolog 9 | import numpy as np 10 | import tensorflow as tf 11 | from datasets import audio 12 | from hparams import hparams_debug_string 13 | from tacotron.feeder import Feeder 14 | from tacotron.models import create_model 15 | from tacotron.utils import ValueWindow, plot 16 | from tacotron.utils.text import sequence_to_text 17 | from tacotron.utils.symbols import symbols 18 | from tqdm import tqdm 19 | 20 | log = infolog.log 21 | 22 | 23 | def time_string(): 24 | return datetime.now().strftime('%Y-%m-%d %H:%M') 25 | 26 | def add_embedding_stats(summary_writer, embedding_names, paths_to_meta, checkpoint_path): 27 | #Create tensorboard projector 28 | config = tf.contrib.tensorboard.plugins.projector.ProjectorConfig() 29 | config.model_checkpoint_path = checkpoint_path 30 | 31 | for embedding_name, path_to_meta in zip(embedding_names, paths_to_meta): 32 | #Initialize config 33 | embedding = config.embeddings.add() 34 | #Specifiy the embedding variable and the metadata 35 | embedding.tensor_name = embedding_name 36 | embedding.metadata_path = path_to_meta 37 | 38 | #Project the embeddings to space dimensions for visualization 39 | tf.contrib.tensorboard.plugins.projector.visualize_embeddings(summary_writer, config) 40 | 41 | def add_train_stats(model, hparams): 42 | with tf.variable_scope('stats') as scope: 43 | for i in range(hparams.tacotron_num_gpus): 44 | tf.summary.histogram('mel_outputs %d' % i, model.tower_mel_outputs[i]) 45 | tf.summary.histogram('mel_targets %d' % i, model.tower_mel_targets[i]) 46 | tf.summary.scalar('before_loss', model.before_loss) 47 | tf.summary.scalar('after_loss', model.after_loss) 48 | 49 | if hparams.predict_linear: 50 | tf.summary.scalar('linear_loss', model.linear_loss) 51 | for i in range(hparams.tacotron_num_gpus): 52 | tf.summary.histogram('linear_outputs %d' % i, model.tower_linear_outputs[i]) 53 | tf.summary.histogram('linear_targets %d' % i, model.tower_linear_targets[i]) 54 | 55 | tf.summary.scalar('regularization_loss', model.regularization_loss) 56 | tf.summary.scalar('stop_token_loss', model.stop_token_loss) 57 | tf.summary.scalar('loss', model.loss) 58 | tf.summary.scalar('learning_rate', model.learning_rate) #Control learning rate decay speed 59 | if hparams.tacotron_teacher_forcing_mode == 'scheduled': 60 | tf.summary.scalar('teacher_forcing_ratio', model.ratio) #Control teacher forcing ratio decay when mode = 'scheduled' 61 | gradient_norms = [tf.norm(grad) for grad in model.gradients] 62 | tf.summary.histogram('gradient_norm', gradient_norms) 63 | tf.summary.scalar('max_gradient_norm', tf.reduce_max(gradient_norms)) #visualize gradients (in case of explosion) 64 | return tf.summary.merge_all() 65 | 66 | def add_eval_stats(summary_writer, step, linear_loss, before_loss, after_loss, stop_token_loss, loss): 67 | values = [ 68 | tf.Summary.Value(tag='Tacotron_eval_model/eval_stats/eval_before_loss', simple_value=before_loss), 69 | tf.Summary.Value(tag='Tacotron_eval_model/eval_stats/eval_after_loss', simple_value=after_loss), 70 | 
tf.Summary.Value(tag='Tacotron_eval_model/eval_stats/stop_token_loss', simple_value=stop_token_loss), 71 | tf.Summary.Value(tag='Tacotron_eval_model/eval_stats/eval_loss', simple_value=loss), 72 | ] 73 | if linear_loss is not None: 74 | values.append(tf.Summary.Value(tag='Tacotron_eval_model/eval_stats/eval_linear_loss', simple_value=linear_loss)) 75 | test_summary = tf.Summary(value=values) 76 | summary_writer.add_summary(test_summary, step) 77 | 78 | def model_train_mode(args, feeder, hparams, global_step): 79 | with tf.variable_scope('Tacotron_model', reuse=tf.AUTO_REUSE) as scope: 80 | model_name = None 81 | if args.model == 'Tacotron-2': 82 | model_name = 'Tacotron' 83 | model = create_model(model_name or args.model, hparams) 84 | if hparams.predict_linear: 85 | model.initialize(feeder.inputs, feeder.input_lengths, feeder.mel_targets, feeder.token_targets, linear_targets=feeder.linear_targets, 86 | targets_lengths=feeder.targets_lengths, global_step=global_step, 87 | is_training=True, split_infos=feeder.split_infos) 88 | else: 89 | model.initialize(feeder.inputs, feeder.input_lengths, feeder.mel_targets, feeder.token_targets, 90 | targets_lengths=feeder.targets_lengths, global_step=global_step, 91 | is_training=True, split_infos=feeder.split_infos) 92 | model.add_loss() 93 | model.add_optimizer(global_step) 94 | stats = add_train_stats(model, hparams) 95 | return model, stats 96 | 97 | def model_test_mode(args, feeder, hparams, global_step): 98 | with tf.variable_scope('Tacotron_model', reuse=tf.AUTO_REUSE) as scope: 99 | model_name = None 100 | if args.model == 'Tacotron-2': 101 | model_name = 'Tacotron' 102 | model = create_model(model_name or args.model, hparams) 103 | if hparams.predict_linear: 104 | model.initialize(feeder.eval_inputs, feeder.eval_input_lengths, feeder.eval_mel_targets, feeder.eval_token_targets, 105 | linear_targets=feeder.eval_linear_targets, targets_lengths=feeder.eval_targets_lengths, global_step=global_step, 106 | is_training=False, is_evaluating=True, split_infos=feeder.eval_split_infos) 107 | else: 108 | model.initialize(feeder.eval_inputs, feeder.eval_input_lengths, feeder.eval_mel_targets, feeder.eval_token_targets, 109 | targets_lengths=feeder.eval_targets_lengths, global_step=global_step, is_training=False, is_evaluating=True, 110 | split_infos=feeder.eval_split_infos) 111 | model.add_loss() 112 | return model 113 | 114 | def train(log_dir, args, hparams): 115 | save_dir = os.path.join(log_dir, 'taco_pretrained') 116 | plot_dir = os.path.join(log_dir, 'plots') 117 | wav_dir = os.path.join(log_dir, 'wavs') 118 | mel_dir = os.path.join(log_dir, 'mel-spectrograms') 119 | eval_dir = os.path.join(log_dir, 'eval-dir') 120 | eval_plot_dir = os.path.join(eval_dir, 'plots') 121 | eval_wav_dir = os.path.join(eval_dir, 'wavs') 122 | tensorboard_dir = os.path.join(log_dir, 'tacotron_events') 123 | meta_folder = os.path.join(log_dir, 'metas') 124 | os.makedirs(save_dir, exist_ok=True) 125 | os.makedirs(plot_dir, exist_ok=True) 126 | os.makedirs(wav_dir, exist_ok=True) 127 | os.makedirs(mel_dir, exist_ok=True) 128 | os.makedirs(eval_dir, exist_ok=True) 129 | os.makedirs(eval_plot_dir, exist_ok=True) 130 | os.makedirs(eval_wav_dir, exist_ok=True) 131 | os.makedirs(tensorboard_dir, exist_ok=True) 132 | os.makedirs(meta_folder, exist_ok=True) 133 | 134 | checkpoint_path = os.path.join(save_dir, 'tacotron_model.ckpt') 135 | input_path = os.path.join(args.base_dir, args.tacotron_input) 136 | 137 | if hparams.predict_linear: 138 | linear_dir = os.path.join(log_dir, 
'linear-spectrograms') 139 | os.makedirs(linear_dir, exist_ok=True) 140 | 141 | log('Checkpoint path: {}'.format(checkpoint_path)) 142 | log('Loading training data from: {}'.format(input_path)) 143 | log('Using model: {}'.format(args.model)) 144 | log(hparams_debug_string()) 145 | 146 | #Start by setting a seed for repeatability 147 | tf.set_random_seed(hparams.tacotron_random_seed) 148 | 149 | #Set up data feeder 150 | coord = tf.train.Coordinator() 151 | with tf.variable_scope('datafeeder') as scope: 152 | feeder = Feeder(coord, input_path, hparams) 153 | 154 | #Set up model: 155 | global_step = tf.Variable(0, name='global_step', trainable=False) 156 | model, stats = model_train_mode(args, feeder, hparams, global_step) 157 | eval_model = model_test_mode(args, feeder, hparams, global_step) 158 | 159 | #Embeddings metadata 160 | char_embedding_meta = os.path.join(meta_folder, 'CharacterEmbeddings.tsv') 161 | if not os.path.isfile(char_embedding_meta): 162 | with open(char_embedding_meta, 'w', encoding='utf-8') as f: 163 | for symbol in symbols: 164 | if symbol == ' ': 165 | symbol = '\\s' #For visual purposes, swap space with \s 166 | 167 | f.write('{}\n'.format(symbol)) 168 | 169 | char_embedding_meta = char_embedding_meta.replace(log_dir, '..') 170 | 171 | #Potential Griffin-Lim GPU setup 172 | if hparams.GL_on_GPU: 173 | GLGPU_mel_inputs = tf.placeholder(tf.float32, (None, hparams.num_mels), name='GLGPU_mel_inputs') 174 | GLGPU_lin_inputs = tf.placeholder(tf.float32, (None, hparams.num_freq), name='GLGPU_lin_inputs') 175 | 176 | GLGPU_mel_outputs = audio.inv_mel_spectrogram_tensorflow(GLGPU_mel_inputs, hparams) 177 | GLGPU_lin_outputs = audio.inv_linear_spectrogram_tensorflow(GLGPU_lin_inputs, hparams) 178 | 179 | #Book keeping 180 | step = 0 181 | time_window = ValueWindow(100) 182 | loss_window = ValueWindow(100) 183 | saver = tf.train.Saver(max_to_keep=20) 184 | 185 | log('Tacotron training set to a maximum of {} steps'.format(args.tacotron_train_steps)) 186 | 187 | #Memory allocation on the GPU as needed 188 | config = tf.ConfigProto() 189 | config.gpu_options.allow_growth = True 190 | config.allow_soft_placement = True 191 | 192 | #Train 193 | with tf.Session(config=config) as sess: 194 | try: 195 | summary_writer = tf.summary.FileWriter(tensorboard_dir, sess.graph) 196 | 197 | sess.run(tf.global_variables_initializer()) 198 | 199 | #saved model restoring 200 | if args.restore: 201 | # Restore saved model if the user requested it, default = True 202 | try: 203 | checkpoint_state = tf.train.get_checkpoint_state(save_dir) 204 | 205 | if (checkpoint_state and checkpoint_state.model_checkpoint_path): 206 | log('Loading checkpoint {}'.format(checkpoint_state.model_checkpoint_path), slack=True) 207 | saver.restore(sess, checkpoint_state.model_checkpoint_path) 208 | 209 | else: 210 | log('No model to load at {}'.format(save_dir), slack=True) 211 | saver.save(sess, checkpoint_path, global_step=global_step) 212 | 213 | except tf.errors.OutOfRangeError as e: 214 | log('Cannot restore checkpoint: {}'.format(e), slack=True) 215 | else: 216 | log('Starting new training!', slack=True) 217 | saver.save(sess, checkpoint_path, global_step=global_step) 218 | 219 | #initializing feeder 220 | feeder.start_threads(sess) 221 | 222 | #Training loop 223 | while not coord.should_stop() and step < args.tacotron_train_steps: 224 | start_time = time.time() 225 | step, loss, opt = sess.run([global_step, model.loss, model.optimize]) 226 | time_window.append(time.time() - start_time) 227 | 
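# time_window and loss_window are ValueWindow instances (see tacotron/utils/__init__.py): they
# keep only the most recent 100 values, so the averages logged below are running means over the
# last 100 steps rather than over the whole run.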
loss_window.append(loss) 228 | message = 'Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]'.format( 229 | step, time_window.average, loss, loss_window.average) 230 | log(message, end='\r', slack=(step % args.checkpoint_interval == 0)) 231 | 232 | if np.isnan(loss) or loss > 100.: 233 | log('Loss exploded to {:.5f} at step {}'.format(loss, step)) 234 | raise Exception('Loss exploded') 235 | 236 | if step % args.summary_interval == 0: 237 | log('\nWriting summary at step {}'.format(step)) 238 | summary_writer.add_summary(sess.run(stats), step) 239 | 240 | if step % args.eval_interval == 0: 241 | #Run eval and save eval stats 242 | log('\nRunning evaluation at step {}'.format(step)) 243 | 244 | eval_losses = [] 245 | before_losses = [] 246 | after_losses = [] 247 | stop_token_losses = [] 248 | linear_losses = [] 249 | linear_loss = None 250 | 251 | if hparams.predict_linear: 252 | for i in tqdm(range(feeder.test_steps)): 253 | eloss, before_loss, after_loss, stop_token_loss, linear_loss, mel_p, mel_t, t_len, align, lin_p, lin_t = sess.run([ 254 | eval_model.tower_loss[0], eval_model.tower_before_loss[0], eval_model.tower_after_loss[0], 255 | eval_model.tower_stop_token_loss[0], eval_model.tower_linear_loss[0], eval_model.tower_mel_outputs[0][0], 256 | eval_model.tower_mel_targets[0][0], eval_model.tower_targets_lengths[0][0], 257 | eval_model.tower_alignments[0][0], eval_model.tower_linear_outputs[0][0], 258 | eval_model.tower_linear_targets[0][0], 259 | ]) 260 | eval_losses.append(eloss) 261 | before_losses.append(before_loss) 262 | after_losses.append(after_loss) 263 | stop_token_losses.append(stop_token_loss) 264 | linear_losses.append(linear_loss) 265 | linear_loss = sum(linear_losses) / len(linear_losses) 266 | 267 | if hparams.GL_on_GPU: 268 | wav = sess.run(GLGPU_lin_outputs, feed_dict={GLGPU_lin_inputs: lin_p}) 269 | wav = audio.inv_preemphasis(wav, hparams.preemphasis, hparams.preemphasize) 270 | else: 271 | wav = audio.inv_linear_spectrogram(lin_p.T, hparams) 272 | audio.save_wav(wav, os.path.join(eval_wav_dir, 'step-{}-eval-wave-from-linear.wav'.format(step)), sr=hparams.sample_rate) 273 | 274 | else: 275 | for i in tqdm(range(feeder.test_steps)): 276 | eloss, before_loss, after_loss, stop_token_loss, mel_p, mel_t, t_len, align = sess.run([ 277 | eval_model.tower_loss[0], eval_model.tower_before_loss[0], eval_model.tower_after_loss[0], 278 | eval_model.tower_stop_token_loss[0], eval_model.tower_mel_outputs[0][0], eval_model.tower_mel_targets[0][0], 279 | eval_model.tower_targets_lengths[0][0], eval_model.tower_alignments[0][0] 280 | ]) 281 | eval_losses.append(eloss) 282 | before_losses.append(before_loss) 283 | after_losses.append(after_loss) 284 | stop_token_losses.append(stop_token_loss) 285 | 286 | eval_loss = sum(eval_losses) / len(eval_losses) 287 | before_loss = sum(before_losses) / len(before_losses) 288 | after_loss = sum(after_losses) / len(after_losses) 289 | stop_token_loss = sum(stop_token_losses) / len(stop_token_losses) 290 | 291 | log('Saving eval log to {}..'.format(eval_dir)) 292 | #Save some log to monitor model improvement on same unseen sequence 293 | if hparams.GL_on_GPU: 294 | wav = sess.run(GLGPU_mel_outputs, feed_dict={GLGPU_mel_inputs: mel_p}) 295 | wav = audio.inv_preemphasis(wav, hparams.preemphasis, hparams.preemphasize) 296 | else: 297 | wav = audio.inv_mel_spectrogram(mel_p.T, hparams) 298 | audio.save_wav(wav, os.path.join(eval_wav_dir, 'step-{}-eval-wave-from-mel.wav'.format(step)), sr=hparams.sample_rate) 299 | 300 | 
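# Save the eval alignment plot and the mel (plus linear, when predict_linear is set)
# spectrogram comparison plots for this evaluation step.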
plot.plot_alignment(align, os.path.join(eval_plot_dir, 'step-{}-eval-align.png'.format(step)), 301 | title='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, eval_loss), 302 | max_len=t_len // hparams.outputs_per_step) 303 | plot.plot_spectrogram(mel_p, os.path.join(eval_plot_dir, 'step-{}-eval-mel-spectrogram.png'.format(step)), 304 | title='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, eval_loss), target_spectrogram=mel_t, 305 | max_len=t_len) 306 | 307 | if hparams.predict_linear: 308 | plot.plot_spectrogram(lin_p, os.path.join(eval_plot_dir, 'step-{}-eval-linear-spectrogram.png'.format(step)), 309 | title='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, eval_loss), target_spectrogram=lin_t, 310 | max_len=t_len, auto_aspect=True) 311 | 312 | log('Eval loss for global step {}: {:.3f}'.format(step, eval_loss)) 313 | log('Writing eval summary!') 314 | add_eval_stats(summary_writer, step, linear_loss, before_loss, after_loss, stop_token_loss, eval_loss) 315 | 316 | 317 | if step % args.checkpoint_interval == 0 or step == args.tacotron_train_steps or step == 300: 318 | #Save model and current global step 319 | saver.save(sess, checkpoint_path, global_step=global_step) 320 | 321 | log('\nSaving alignment, Mel-Spectrograms and griffin-lim inverted waveform..') 322 | if hparams.predict_linear: 323 | input_seq, mel_prediction, linear_prediction, alignment, target, target_length, linear_target = sess.run([ 324 | model.tower_inputs[0][0], 325 | model.tower_mel_outputs[0][0], 326 | model.tower_linear_outputs[0][0], 327 | model.tower_alignments[0][0], 328 | model.tower_mel_targets[0][0], 329 | model.tower_targets_lengths[0][0], 330 | model.tower_linear_targets[0][0], 331 | ]) 332 | 333 | #save predicted linear spectrogram to disk (debug) 334 | linear_filename = 'linear-prediction-step-{}.npy'.format(step) 335 | np.save(os.path.join(linear_dir, linear_filename), linear_prediction.T, allow_pickle=False) 336 | 337 | #save griffin lim inverted wav for debug (linear -> wav) 338 | if hparams.GL_on_GPU: 339 | wav = sess.run(GLGPU_lin_outputs, feed_dict={GLGPU_lin_inputs: linear_prediction}) 340 | wav = audio.inv_preemphasis(wav, hparams.preemphasis, hparams.preemphasize) 341 | else: 342 | wav = audio.inv_linear_spectrogram(linear_prediction.T, hparams) 343 | audio.save_wav(wav, os.path.join(wav_dir, 'step-{}-wave-from-linear.wav'.format(step)), sr=hparams.sample_rate) 344 | 345 | #Save real and predicted linear-spectrogram plot to disk (control purposes) 346 | plot.plot_spectrogram(linear_prediction, os.path.join(plot_dir, 'step-{}-linear-spectrogram.png'.format(step)), 347 | title='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, loss), target_spectrogram=linear_target, 348 | max_len=target_length, auto_aspect=True) 349 | 350 | else: 351 | input_seq, mel_prediction, alignment, target, target_length = sess.run([ 352 | model.tower_inputs[0][0], 353 | model.tower_mel_outputs[0][0], 354 | model.tower_alignments[0][0], 355 | model.tower_mel_targets[0][0], 356 | model.tower_targets_lengths[0][0], 357 | ]) 358 | 359 | #save predicted mel spectrogram to disk (debug) 360 | mel_filename = 'mel-prediction-step-{}.npy'.format(step) 361 | np.save(os.path.join(mel_dir, mel_filename), mel_prediction.T, allow_pickle=False) 362 | 363 | #save griffin lim inverted wav for debug (mel -> wav) 364 | if hparams.GL_on_GPU: 365 | wav = sess.run(GLGPU_mel_outputs, feed_dict={GLGPU_mel_inputs: mel_prediction}) 366 | wav = 
audio.inv_preemphasis(wav, hparams.preemphasis, hparams.preemphasize) 367 | else: 368 | wav = audio.inv_mel_spectrogram(mel_prediction.T, hparams) 369 | audio.save_wav(wav, os.path.join(wav_dir, 'step-{}-wave-from-mel.wav'.format(step)), sr=hparams.sample_rate) 370 | 371 | #save alignment plot to disk (control purposes) 372 | plot.plot_alignment(alignment, os.path.join(plot_dir, 'step-{}-align.png'.format(step)), 373 | title='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, loss), 374 | max_len=target_length // hparams.outputs_per_step) 375 | #save real and predicted mel-spectrogram plot to disk (control purposes) 376 | plot.plot_spectrogram(mel_prediction, os.path.join(plot_dir, 'step-{}-mel-spectrogram.png'.format(step)), 377 | title='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, loss), target_spectrogram=target, 378 | max_len=target_length) 379 | log('Input at step {}: {}'.format(step, sequence_to_text(input_seq))) 380 | 381 | if step % args.embedding_interval == 0 or step == args.tacotron_train_steps or step == 1: 382 | #Get current checkpoint state 383 | checkpoint_state = tf.train.get_checkpoint_state(save_dir) 384 | 385 | #Update Projector 386 | log('\nSaving Model Character Embeddings visualization..') 387 | add_embedding_stats(summary_writer, [model.embedding_table.name], [char_embedding_meta], checkpoint_state.model_checkpoint_path) 388 | log('Tacotron Character embeddings have been updated on tensorboard!') 389 | 390 | log('Tacotron training complete after {} global steps!'.format(args.tacotron_train_steps), slack=True) 391 | return save_dir 392 | 393 | except Exception as e: 394 | log('Exiting due to exception: {}'.format(e), slack=True) 395 | traceback.print_exc() 396 | coord.request_stop(e) 397 | 398 | def tacotron_train(args, log_dir, hparams): 399 | return train(log_dir, args, hparams) 400 | -------------------------------------------------------------------------------- /tacotron/utils/__init__.py: -------------------------------------------------------------------------------- 1 | class ValueWindow(): 2 | def __init__(self, window_size=100): 3 | self._window_size = window_size 4 | self._values = [] 5 | 6 | def append(self, x): 7 | self._values = self._values[-(self._window_size - 1):] + [x] 8 | 9 | @property 10 | def sum(self): 11 | return sum(self._values) 12 | 13 | @property 14 | def count(self): 15 | return len(self._values) 16 | 17 | @property 18 | def average(self): 19 | return self.sum / max(1, self.count) 20 | 21 | def reset(self): 22 | self._values = [] 23 | -------------------------------------------------------------------------------- /tacotron/utils/cleaners.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Cleaners are transformations that run over the input text at both training and eval time. 3 | 4 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" 5 | hyperparameter. Some cleaners are English-specific. You'll typically want to use: 6 | 1. "english_cleaners" for English text 7 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using 8 | the Unidecode library (https://pypi.python.org/pypi/Unidecode) 9 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update 10 | the symbols in symbols.py to match your data). 
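At training/synthesis time the "cleaners" hyperparameter is split on commas and each named
function in this module is applied in order (see _clean_text in tacotron/utils/text.py),
roughly:

    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    for name in cleaner_names:
        text = getattr(cleaners, name)(text)  # e.g. english_cleaners(text)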
11 | ''' 12 | 13 | import re 14 | 15 | from unidecode import unidecode 16 | 17 | from .numbers import normalize_numbers 18 | 19 | # Regular expression matching whitespace: 20 | _whitespace_re = re.compile(r'\s+') 21 | 22 | # List of (regular expression, replacement) pairs for abbreviations: 23 | _abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [ 24 | ('mrs', 'misess'), 25 | ('mr', 'mister'), 26 | ('dr', 'doctor'), 27 | ('st', 'saint'), 28 | ('co', 'company'), 29 | ('jr', 'junior'), 30 | ('maj', 'major'), 31 | ('gen', 'general'), 32 | ('drs', 'doctors'), 33 | ('rev', 'reverend'), 34 | ('lt', 'lieutenant'), 35 | ('hon', 'honorable'), 36 | ('sgt', 'sergeant'), 37 | ('capt', 'captain'), 38 | ('esq', 'esquire'), 39 | ('ltd', 'limited'), 40 | ('col', 'colonel'), 41 | ('ft', 'fort'), 42 | ]] 43 | 44 | 45 | def expand_abbreviations(text): 46 | for regex, replacement in _abbreviations: 47 | text = re.sub(regex, replacement, text) 48 | return text 49 | 50 | 51 | def expand_numbers(text): 52 | return normalize_numbers(text) 53 | 54 | 55 | def lowercase(text): 56 | '''lowercase input tokens. 57 | ''' 58 | return text.lower() 59 | 60 | 61 | def collapse_whitespace(text): 62 | return re.sub(_whitespace_re, ' ', text) 63 | 64 | 65 | def convert_to_ascii(text): 66 | return unidecode(text) 67 | 68 | 69 | def basic_cleaners(text): 70 | '''Basic pipeline that lowercases and collapses whitespace without transliteration.''' 71 | text = lowercase(text) 72 | text = collapse_whitespace(text) 73 | return text 74 | 75 | 76 | def transliteration_cleaners(text): 77 | '''Pipeline for non-English text that transliterates to ASCII.''' 78 | text = convert_to_ascii(text) 79 | text = lowercase(text) 80 | text = collapse_whitespace(text) 81 | return text 82 | 83 | 84 | def english_cleaners(text): 85 | '''Pipeline for English text, including number and abbreviation expansion.''' 86 | text = convert_to_ascii(text) 87 | # text = lowercase(text) 88 | text = expand_numbers(text) 89 | text = expand_abbreviations(text) 90 | text = collapse_whitespace(text) 91 | return text 92 | -------------------------------------------------------------------------------- /tacotron/utils/cmudict.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | valid_symbols = [ 4 | 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2', 5 | 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2', 6 | 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY', 7 | 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1', 8 | 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 9 | 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 10 | 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH' 11 | ] 12 | 13 | _valid_symbol_set = set(valid_symbols) 14 | 15 | 16 | class CMUDict: 17 | '''Thin wrapper around CMUDict data. 
http://www.speech.cs.cmu.edu/cgi-bin/cmudict''' 18 | def __init__(self, file_or_path, keep_ambiguous=True): 19 | if isinstance(file_or_path, str): 20 | with open(file_or_path, encoding='latin-1') as f: 21 | entries = _parse_cmudict(f) 22 | else: 23 | entries = _parse_cmudict(file_or_path) 24 | if not keep_ambiguous: 25 | entries = {word: pron for word, pron in entries.items() if len(pron) == 1} 26 | self._entries = entries 27 | 28 | 29 | def __len__(self): 30 | return len(self._entries) 31 | 32 | 33 | def lookup(self, word): 34 | '''Returns list of ARPAbet pronunciations of the given word.''' 35 | return self._entries.get(word.upper()) 36 | 37 | 38 | 39 | _alt_re = re.compile(r'\([0-9]+\)') 40 | 41 | 42 | def _parse_cmudict(file): 43 | cmudict = {} 44 | for line in file: 45 | if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"): 46 | parts = line.split(' ') 47 | word = re.sub(_alt_re, '', parts[0]) 48 | pronunciation = _get_pronunciation(parts[1]) 49 | if pronunciation: 50 | if word in cmudict: 51 | cmudict[word].append(pronunciation) 52 | else: 53 | cmudict[word] = [pronunciation] 54 | return cmudict 55 | 56 | 57 | def _get_pronunciation(s): 58 | parts = s.strip().split(' ') 59 | for part in parts: 60 | if part not in _valid_symbol_set: 61 | return None 62 | return ' '.join(parts) 63 | -------------------------------------------------------------------------------- /tacotron/utils/numbers.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import inflect 4 | 5 | _inflect = inflect.engine() 6 | _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') 7 | _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') 8 | _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') 9 | _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') 10 | _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') 11 | _number_re = re.compile(r'[0-9]+') 12 | 13 | 14 | def _remove_commas(m): 15 | return m.group(1).replace(',', '') 16 | 17 | 18 | def _expand_decimal_point(m): 19 | return m.group(1).replace('.', ' point ') 20 | 21 | 22 | def _expand_dollars(m): 23 | match = m.group(1) 24 | parts = match.split('.') 25 | if len(parts) > 2: 26 | return match + ' dollars' # Unexpected format 27 | dollars = int(parts[0]) if parts[0] else 0 28 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 29 | if dollars and cents: 30 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 31 | cent_unit = 'cent' if cents == 1 else 'cents' 32 | return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) 33 | elif dollars: 34 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 35 | return '%s %s' % (dollars, dollar_unit) 36 | elif cents: 37 | cent_unit = 'cent' if cents == 1 else 'cents' 38 | return '%s %s' % (cents, cent_unit) 39 | else: 40 | return 'zero dollars' 41 | 42 | 43 | def _expand_ordinal(m): 44 | return _inflect.number_to_words(m.group(0)) 45 | 46 | 47 | def _expand_number(m): 48 | num = int(m.group(0)) 49 | if num > 1000 and num < 3000: 50 | if num == 2000: 51 | return 'two thousand' 52 | elif num > 2000 and num < 2010: 53 | return 'two thousand ' + _inflect.number_to_words(num % 100) 54 | elif num % 100 == 0: 55 | return _inflect.number_to_words(num // 100) + ' hundred' 56 | else: 57 | return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') 58 | else: 59 | return _inflect.number_to_words(num, andword='') 60 | 61 | 62 | def normalize_numbers(text): 63 | text = re.sub(_comma_number_re, _remove_commas, text) 64 | text = 
re.sub(_pounds_re, r'\1 pounds', text) 65 | text = re.sub(_dollars_re, _expand_dollars, text) 66 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 67 | text = re.sub(_ordinal_re, _expand_ordinal, text) 68 | text = re.sub(_number_re, _expand_number, text) 69 | return text 70 | -------------------------------------------------------------------------------- /tacotron/utils/plot.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | matplotlib.use('Agg') 3 | import matplotlib.pyplot as plt 4 | 5 | import numpy as np 6 | 7 | 8 | def split_title_line(title_text, max_words=5): 9 | """ 10 | A function that splits any string based on specific character 11 | (returning it with the string), with maximum number of words on it 12 | """ 13 | seq = title_text.split() 14 | return '\n'.join([' '.join(seq[i:i + max_words]) for i in range(0, len(seq), max_words)]) 15 | 16 | def plot_alignment(alignment, path, title=None, split_title=False, max_len=None): 17 | if max_len is not None: 18 | alignment = alignment[:, :max_len] 19 | 20 | fig = plt.figure(figsize=(8, 6)) 21 | ax = fig.add_subplot(111) 22 | 23 | im = ax.imshow( 24 | alignment, 25 | aspect='auto', 26 | origin='lower', 27 | interpolation='none') 28 | fig.colorbar(im, ax=ax) 29 | xlabel = 'Decoder timestep' 30 | 31 | if split_title: 32 | title = split_title_line(title) 33 | 34 | plt.xlabel(xlabel) 35 | plt.title(title) 36 | plt.ylabel('Encoder timestep') 37 | plt.tight_layout() 38 | plt.savefig(path, format='png') 39 | plt.close() 40 | 41 | 42 | def plot_spectrogram(pred_spectrogram, path, title=None, split_title=False, target_spectrogram=None, max_len=None, auto_aspect=False): 43 | if max_len is not None: 44 | target_spectrogram = target_spectrogram[:max_len] 45 | pred_spectrogram = pred_spectrogram[:max_len] 46 | 47 | if split_title: 48 | title = split_title_line(title) 49 | 50 | fig = plt.figure(figsize=(10, 8)) 51 | # Set common labels 52 | fig.text(0.5, 0.18, title, horizontalalignment='center', fontsize=16) 53 | 54 | #target spectrogram subplot 55 | if target_spectrogram is not None: 56 | ax1 = fig.add_subplot(311) 57 | ax2 = fig.add_subplot(312) 58 | 59 | if auto_aspect: 60 | im = ax1.imshow(np.rot90(target_spectrogram), aspect='auto', interpolation='none') 61 | else: 62 | im = ax1.imshow(np.rot90(target_spectrogram), interpolation='none') 63 | ax1.set_title('Target Mel-Spectrogram') 64 | fig.colorbar(mappable=im, shrink=0.65, orientation='horizontal', ax=ax1) 65 | ax2.set_title('Predicted Mel-Spectrogram') 66 | else: 67 | ax2 = fig.add_subplot(211) 68 | 69 | if auto_aspect: 70 | im = ax2.imshow(np.rot90(pred_spectrogram), aspect='auto', interpolation='none') 71 | else: 72 | im = ax2.imshow(np.rot90(pred_spectrogram), interpolation='none') 73 | fig.colorbar(mappable=im, shrink=0.65, orientation='horizontal', ax=ax2) 74 | 75 | plt.tight_layout() 76 | plt.savefig(path, format='png') 77 | plt.close() 78 | -------------------------------------------------------------------------------- /tacotron/utils/symbols.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Defines the set of symbols used in text input to the model. 3 | 4 | The default is a set of ASCII characters that works well for English or text that has been run 5 | through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. 6 | ''' 7 | from . 
import cmudict 8 | 9 | _pad = '_' 10 | _eos = '~' 11 | _characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'\"(),-.:;? ' 12 | 13 | # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): 14 | #_arpabet = ['@' + s for s in cmudict.valid_symbols] 15 | 16 | # Export all symbols: 17 | symbols = [_pad, _eos] + list(_characters) #+ _arpabet 18 | -------------------------------------------------------------------------------- /tacotron/utils/text.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from . import cleaners 4 | from .symbols import symbols 5 | 6 | # Mappings from symbol to numeric ID and vice versa: 7 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 8 | _id_to_symbol = {i: s for i, s in enumerate(symbols)} 9 | 10 | # Regular expression matching text enclosed in curly braces: 11 | _curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)') 12 | 13 | 14 | def text_to_sequence(text, cleaner_names): 15 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 16 | 17 | The text can optionally have ARPAbet sequences enclosed in curly braces embedded 18 | in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street." 19 | 20 | Args: 21 | text: string to convert to a sequence 22 | cleaner_names: names of the cleaner functions to run the text through 23 | 24 | Returns: 25 | List of integers corresponding to the symbols in the text 26 | ''' 27 | sequence = [] 28 | 29 | # Check for curly braces and treat their contents as ARPAbet: 30 | while len(text): 31 | m = _curly_re.match(text) 32 | if not m: 33 | sequence += _symbols_to_sequence(_clean_text(text, cleaner_names)) 34 | break 35 | sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names)) 36 | sequence += _arpabet_to_sequence(m.group(2)) 37 | text = m.group(3) 38 | 39 | # Append EOS token 40 | sequence.append(_symbol_to_id['~']) 41 | return sequence 42 | 43 | 44 | def sequence_to_text(sequence): 45 | '''Converts a sequence of IDs back to a string''' 46 | result = '' 47 | for symbol_id in sequence: 48 | if symbol_id in _id_to_symbol: 49 | s = _id_to_symbol[symbol_id] 50 | # Enclose ARPAbet back in curly braces: 51 | if len(s) > 1 and s[0] == '@': 52 | s = '{%s}' % s[1:] 53 | result += s 54 | return result.replace('}{', ' ') 55 | 56 | 57 | def _clean_text(text, cleaner_names): 58 | for name in cleaner_names: 59 | cleaner = getattr(cleaners, name) 60 | if not cleaner: 61 | raise Exception('Unknown cleaner: %s' % name) 62 | text = cleaner(text) 63 | return text 64 | 65 | 66 | def _symbols_to_sequence(symbols): 67 | return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)] 68 | 69 | 70 | def _arpabet_to_sequence(text): 71 | return _symbols_to_sequence(['@' + s for s in text.split()]) 72 | 73 | 74 | def _should_keep_symbol(s): 75 | return s in _symbol_to_id and s is not '_' and s is not '~' 76 | -------------------------------------------------------------------------------- /test_wavenet_feeder.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import argparse 4 | from hparams import hparams 5 | from datasets import audio 6 | from tqdm import tqdm 7 | 8 | 9 | 10 | def _limit_time(hparams): 11 | '''Limit time resolution to save GPU memory. 
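The limit is returned in samples: max_time_sec * sample_rate when max_time_sec is set
(e.g. a 10 second cap at 22050 Hz gives 220500 samples), otherwise max_time_steps, otherwise None.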
12 | ''' 13 | if hparams.max_time_sec is not None: 14 | return int(hparams.max_time_sec * hparams.sample_rate) 15 | elif hparams.max_time_steps is not None: 16 | return hparams.max_time_steps 17 | else: 18 | return None 19 | 20 | 21 | def get_groups(args, hparams, meta, local_condition): 22 | if hparams.train_with_GTA: 23 | mel_file = meta[2] 24 | else: 25 | mel_file = meta[1] 26 | audio_file = meta[0] 27 | 28 | input_data = np.load(os.path.join(args.base_dir, audio_file)) 29 | 30 | if local_condition: 31 | local_condition_features = np.load(os.path.join(args.base_dir, mel_file)) 32 | else: 33 | local_condition_features = None 34 | 35 | return (input_data, local_condition_features, None, len(input_data)) 36 | 37 | def _adjust_time_resolution(hparams, batch, local_condition, max_time_steps): 38 | '''Adjust time resolution between audio and local condition 39 | ''' 40 | if local_condition: 41 | new_batch = [] 42 | for b in batch: 43 | x, c, g, l = b 44 | _assert_ready_for_upsample(hparams, x, c) 45 | if max_time_steps is not None: 46 | max_steps = _ensure_divisible(max_time_steps, audio.get_hop_size(hparams), True) 47 | if len(x) > max_time_steps: 48 | max_time_frames = max_steps // audio.get_hop_size(hparams) 49 | start = np.random.randint(0, len(c) - max_time_frames) 50 | time_start = start * audio.get_hop_size(hparams) 51 | x = x[time_start: time_start + max_time_frames * audio.get_hop_size(hparams)] 52 | c = c[start: start + max_time_frames, :] 53 | _assert_ready_for_upsample(hparams, x, c) 54 | 55 | new_batch.append((x, c, g, l)) 56 | return new_batch 57 | else: 58 | new_batch = [] 59 | for b in batch: 60 | x, c, g, l = b 61 | x = audio.trim_silence(x, hparams) 62 | if max_time_steps is not None and len(x) > max_time_steps: 63 | start = np.random.randint(0, len(c) - max_time_steps) 64 | x = x[start: start + max_time_steps] 65 | new_batch.append((x, c, g, l)) 66 | return new_batch 67 | 68 | def _assert_ready_for_upsample(hparams, x, c): 69 | assert len(x) % len(c) == 0 and len(x) // len(c) == audio.get_hop_size(hparams) 70 | 71 | def check_time_alignment(hparams, batch, local_condition): 72 | #No need to check beyond this step when preparing data 73 | #Limit time steps to save GPU Memory usage 74 | max_time_steps = _limit_time(hparams) 75 | #Adjust time resolution for upsampling 76 | batch = _adjust_time_resolution(hparams, batch, local_condition, max_time_steps) 77 | 78 | def _ensure_divisible(length, divisible_by=256, lower=True): 79 | if length % divisible_by == 0: 80 | return length 81 | if lower: 82 | return length - length % divisible_by 83 | else: 84 | return length + (divisible_by - length % divisible_by) 85 | 86 | def run(args, hparams): 87 | with open(args.metadata, 'r') as file: 88 | metadata = [line.strip().split('|') for line in file] 89 | 90 | local_condition = hparams.cin_channels > 0 91 | 92 | examples = [get_groups(args, hparams, meta, local_condition) for meta in metadata] 93 | batches = [examples[i: i+hparams.wavenet_batch_size] for i in range(0, len(examples), hparams.wavenet_batch_size)] 94 | 95 | for batch in tqdm(batches): 96 | check_time_alignment(hparams, batch, local_condition) 97 | 98 | 99 | 100 | def main(): 101 | parser = argparse.ArgumentParser() 102 | parser.add_argument('--base_dir', default='') 103 | parser.add_argument('--hparams', default='', 104 | help='Hyperparameter overrides as a comma-separated list of name=value pairs') 105 | parser.add_argument('--metadata', default='tacotron_output/gta/map.txt') 106 | args = parser.parse_args() 107 | 108 | 
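# Apply any --hparams overrides (comma-separated name=value pairs) before running the time-alignment checks.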
modified_hparams = hparams.parse(args.hparams) 109 | run(args, modified_hparams) 110 | 111 | 112 | if __name__ == '__main__': 113 | main() -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from time import sleep 4 | 5 | import infolog 6 | import tensorflow as tf 7 | from hparams import hparams 8 | from infolog import log 9 | from tacotron.synthesize import tacotron_synthesize 10 | from tacotron.train import tacotron_train 11 | from wavenet_vocoder.train import wavenet_train 12 | 13 | log = infolog.log 14 | 15 | 16 | def save_seq(file, sequence, input_path): 17 | '''Save Tacotron-2 training state to disk. (To skip for future runs) 18 | ''' 19 | sequence = [str(int(s)) for s in sequence] + [input_path] 20 | with open(file, 'w') as f: 21 | f.write('|'.join(sequence)) 22 | 23 | def read_seq(file): 24 | '''Load Tacotron-2 training state from disk. (To skip if not first run) 25 | ''' 26 | if os.path.isfile(file): 27 | with open(file, 'r') as f: 28 | sequence = f.read().split('|') 29 | 30 | return [bool(int(s)) for s in sequence[:-1]], sequence[-1] 31 | else: 32 | return [0, 0, 0], '' 33 | 34 | def prepare_run(args): 35 | modified_hp = hparams.parse(args.hparams) 36 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = str(args.tf_log_level) 37 | run_name = args.name or args.model 38 | log_dir = os.path.join(args.base_dir, 'logs-{}'.format(run_name)) 39 | os.makedirs(log_dir, exist_ok=True) 40 | infolog.init(os.path.join(log_dir, 'Terminal_train_log'), run_name, args.slack_url) 41 | return log_dir, modified_hp 42 | 43 | def train(args, log_dir, hparams): 44 | state_file = os.path.join(log_dir, 'state_log') 45 | #Get training states 46 | (taco_state, GTA_state, wave_state), input_path = read_seq(state_file) 47 | 48 | if not taco_state: 49 | log('\n#############################################################\n') 50 | log('Tacotron Train\n') 51 | log('###########################################################\n') 52 | checkpoint = tacotron_train(args, log_dir, hparams) 53 | tf.reset_default_graph() 54 | #Sleep 1/2 second to let previous graph close and avoid error messages while synthesis 55 | sleep(0.5) 56 | if checkpoint is None: 57 | raise('Error occured while training Tacotron, Exiting!') 58 | taco_state = 1 59 | save_seq(state_file, [taco_state, GTA_state, wave_state], input_path) 60 | else: 61 | checkpoint = os.path.join(log_dir, 'taco_pretrained/') 62 | 63 | if not GTA_state: 64 | log('\n#############################################################\n') 65 | log('Tacotron GTA Synthesis\n') 66 | log('###########################################################\n') 67 | input_path = tacotron_synthesize(args, hparams, checkpoint) 68 | tf.reset_default_graph() 69 | #Sleep 1/2 second to let previous graph close and avoid error messages while Wavenet is training 70 | sleep(0.5) 71 | GTA_state = 1 72 | save_seq(state_file, [taco_state, GTA_state, wave_state], input_path) 73 | else: 74 | input_path = os.path.join('tacotron_' + args.output_dir, 'gta', 'map.txt') 75 | 76 | if input_path == '' or input_path is None: 77 | raise RuntimeError('input_path has an unpleasant value -> {}'.format(input_path)) 78 | 79 | if not wave_state: 80 | log('\n#############################################################\n') 81 | log('Wavenet Train\n') 82 | log('###########################################################\n') 83 | checkpoint = wavenet_train(args, log_dir, 
hparams, input_path) 84 | if checkpoint is None: 85 | raise ('Error occured while training Wavenet, Exiting!') 86 | wave_state = 1 87 | save_seq(state_file, [taco_state, GTA_state, wave_state], input_path) 88 | 89 | if wave_state and GTA_state and taco_state: 90 | log('TRAINING IS ALREADY COMPLETE!!') 91 | 92 | def main(): 93 | parser = argparse.ArgumentParser() 94 | parser.add_argument('--base_dir', default='') 95 | parser.add_argument('--hparams', default='', 96 | help='Hyperparameter overrides as a comma-separated list of name=value pairs') 97 | parser.add_argument('--tacotron_input', default='training_data/train.txt') 98 | parser.add_argument('--wavenet_input', default='tacotron_output/gta/map.txt') 99 | parser.add_argument('--name', help='Name of logging directory.') 100 | parser.add_argument('--model', default='Tacotron-2') 101 | parser.add_argument('--input_dir', default='training_data', help='folder to contain inputs sentences/targets') 102 | parser.add_argument('--output_dir', default='output', help='folder to contain synthesized mel spectrograms') 103 | parser.add_argument('--mode', default='synthesis', help='mode for synthesis of tacotron after training') 104 | parser.add_argument('--GTA', default='True', help='Ground truth aligned synthesis, defaults to True, only considered in Tacotron synthesis mode') 105 | parser.add_argument('--restore', type=bool, default=True, help='Set this to False to do a fresh training') 106 | parser.add_argument('--summary_interval', type=int, default=250, 107 | help='Steps between running summary ops') 108 | parser.add_argument('--embedding_interval', type=int, default=5000, 109 | help='Steps between updating embeddings projection visualization') 110 | parser.add_argument('--checkpoint_interval', type=int, default=2500, 111 | help='Steps between writing checkpoints') 112 | parser.add_argument('--eval_interval', type=int, default=5000, 113 | help='Steps between eval on test data') 114 | parser.add_argument('--tacotron_train_steps', type=int, default=100000, help='total number of tacotron training steps') 115 | parser.add_argument('--wavenet_train_steps', type=int, default=500000, help='total number of wavenet training steps') 116 | parser.add_argument('--tf_log_level', type=int, default=1, help='Tensorflow C++ log level.') 117 | parser.add_argument('--slack_url', default=None, help='slack webhook notification destination link') 118 | args = parser.parse_args() 119 | 120 | accepted_models = ['Tacotron', 'WaveNet', 'Tacotron-2'] 121 | 122 | if args.model not in accepted_models: 123 | raise ValueError('please enter a valid model to train: {}'.format(accepted_models)) 124 | 125 | log_dir, hparams = prepare_run(args) 126 | 127 | if args.model == 'Tacotron': 128 | tacotron_train(args, log_dir, hparams) 129 | elif args.model == 'WaveNet': 130 | wavenet_train(args, log_dir, hparams, args.wavenet_input) 131 | elif args.model == 'Tacotron-2': 132 | train(args, log_dir, hparams) 133 | else: 134 | raise ValueError('Model provided {} unknown! 
135 | 136 | 137 | if __name__ == '__main__': 138 | main() 139 | -------------------------------------------------------------------------------- /wavenet_preprocess.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from multiprocessing import cpu_count 4 | 5 | from datasets import wavenet_preprocessor 6 | from hparams import hparams 7 | from tqdm import tqdm 8 | 9 | 10 | def preprocess(args, input_dir, out_dir, hparams): 11 | mel_dir = os.path.join(out_dir, 'mels') 12 | wav_dir = os.path.join(out_dir, 'audio') 13 | os.makedirs(mel_dir, exist_ok=True) 14 | os.makedirs(wav_dir, exist_ok=True) 15 | metadata = wavenet_preprocessor.build_from_path(hparams, input_dir, mel_dir, wav_dir, args.n_jobs, tqdm=tqdm) 16 | write_metadata(metadata, out_dir) 17 | 18 | def write_metadata(metadata, out_dir): 19 | with open(os.path.join(out_dir, 'map.txt'), 'w', encoding='utf-8') as f: 20 | for m in metadata: 21 | f.write('|'.join([str(x) for x in m]) + '\n') 22 | mel_frames = sum([int(m[5]) for m in metadata]) 23 | timesteps = sum([int(m[4]) for m in metadata]) 24 | sr = hparams.sample_rate 25 | hours = timesteps / sr / 3600 26 | print('Wrote {} utterances, {} audio timesteps, ({:.2f} hours)'.format( 27 | len(metadata), timesteps, hours)) 28 | print('Max mel frames length: {}'.format(max(int(m[5]) for m in metadata))) 29 | print('Max audio timesteps length: {}'.format(max(m[4] for m in metadata))) 30 | 31 | def run_preprocess(args, hparams): 32 | output_folder = os.path.join(args.base_dir, args.output) 33 | 34 | preprocess(args, args.input_dir, output_folder, hparams) 35 | 36 | def main(): 37 | print('initializing preprocessing..') 38 | parser = argparse.ArgumentParser() 39 | parser.add_argument('--base_dir', default='') 40 | parser.add_argument('--hparams', default='', 41 | help='Hyperparameter overrides as a comma-separated list of name=value pairs') 42 | parser.add_argument('--input_dir', default='LJSpeech-1.1/wavs') 43 | parser.add_argument('--output', default='tacotron_output/gta/') 44 | parser.add_argument('--n_jobs', type=int, default=cpu_count()) 45 | args = parser.parse_args() 46 | 47 | modified_hp = hparams.parse(args.hparams) 48 | 49 | run_preprocess(args, modified_hp) 50 | 51 | if __name__ == '__main__': 52 | main() 53 | -------------------------------------------------------------------------------- /wavenet_vocoder/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------- /wavenet_vocoder/feeder.py: -------------------------------------------------------------------------------- 1 | import os 2 | import threading 3 | import time 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | from datasets import audio 8 | from infolog import log 9 | from keras.utils import np_utils 10 | from sklearn.model_selection import train_test_split 11 | 12 | from .util import is_mulaw_quantize, is_scalar_input 13 | 14 | 15 | 16 | _batches_per_group = 64 17 | 18 | 19 | class Feeder: 20 | """ 21 | Feeds batches of data into a queue in a background thread.
22 | """ 23 | def __init__(self, coordinator, metadata_filename, base_dir, hparams): 24 | super(Feeder, self).__init__() 25 | 26 | self._coord = coordinator 27 | self._hparams = hparams 28 | self._train_offset = 0 29 | self._test_offset = 0 30 | 31 | if hparams.symmetric_mels: 32 | self._spec_pad = -hparams.max_abs_value 33 | else: 34 | self._spec_pad = 0. 35 | 36 | #Base directory of the project (to map files from different locations) 37 | self._base_dir = base_dir 38 | 39 | #Load metadata 40 | self._data_dir = os.path.dirname(metadata_filename) 41 | with open(metadata_filename, 'r') as f: 42 | self._metadata = [line.strip().split('|') for line in f] 43 | 44 | #Train test split 45 | if hparams.wavenet_test_size is None: 46 | assert hparams.wavenet_test_batches is not None 47 | 48 | test_size = (hparams.wavenet_test_size if hparams.wavenet_test_size is not None 49 | else hparams.wavenet_test_batches * hparams.wavenet_batch_size) 50 | indices = np.arange(len(self._metadata)) 51 | train_indices, test_indices = train_test_split(indices, 52 | test_size=test_size, random_state=hparams.wavenet_data_random_state) 53 | 54 | #Make sure test size is a multiple of batch size else round up 55 | len_test_indices = _round_down(len(test_indices), hparams.wavenet_batch_size) 56 | extra_test = test_indices[len_test_indices:] 57 | test_indices = test_indices[:len_test_indices] 58 | train_indices = np.concatenate([train_indices, extra_test]) 59 | 60 | self._train_meta = list(np.array(self._metadata)[train_indices]) 61 | self._test_meta = list(np.array(self._metadata)[test_indices]) 62 | 63 | self.test_steps = len(self._test_meta) // hparams.wavenet_batch_size 64 | 65 | if hparams.wavenet_test_size is None: 66 | assert hparams.wavenet_test_batches == self.test_steps 67 | 68 | #Get conditioning status 69 | self.local_condition, self.global_condition = self._check_conditions() 70 | 71 | with tf.device('/cpu:0'): 72 | # Create placeholders for inputs and targets. Don't specify batch size because we want 73 | # to be able to feed different batch sizes at eval time. 
74 | if is_scalar_input(hparams.input_type): 75 | input_placeholder = tf.placeholder(tf.float32, shape=(None, 1, None), name='audio_inputs') 76 | target_placeholder = tf.placeholder(tf.float32, shape=(None, None, 1), name='audio_targets') 77 | target_type = tf.float32 78 | else: 79 | input_placeholder = tf.placeholder(tf.float32, shape=(None, hparams.quantize_channels, None), name='audio_inputs') 80 | target_placeholder = tf.placeholder(tf.int32, shape=(None, None, 1), name='audio_targets') 81 | target_type = tf.int32 82 | 83 | self._placeholders = [ 84 | input_placeholder, 85 | target_placeholder, 86 | tf.placeholder(tf.int32, shape=(None, ), name='input_lengths'), 87 | ] 88 | 89 | queue_types = [tf.float32, target_type, tf.int32] 90 | 91 | if self.local_condition: 92 | self._placeholders.append(tf.placeholder(tf.float32, shape=(None, hparams.num_mels, None), name='local_condition_features')) 93 | queue_types.append(tf.float32) 94 | if self.global_condition: 95 | self._placeholders.append(tf.placeholder(tf.int32, shape=(None, 1), name='global_condition_features')) 96 | queue_types.append(tf.int32) 97 | 98 | # Create queue for buffering data 99 | queue = tf.FIFOQueue(8, queue_types, name='input_queue') 100 | self._enqueue_op = queue.enqueue(self._placeholders) 101 | variables = queue.dequeue() 102 | 103 | self.inputs = variables[0] 104 | self.inputs.set_shape(self._placeholders[0].shape) 105 | self.targets = variables[1] 106 | self.targets.set_shape(self._placeholders[1].shape) 107 | self.input_lengths = variables[2] 108 | self.input_lengths.set_shape(self._placeholders[2].shape) 109 | 110 | idx = 3 111 | 112 | #If local conditioning disabled override c inputs with None 113 | if hparams.cin_channels < 0: 114 | self.local_condition_features = None 115 | else: 116 | self.local_condition_features = variables[idx] 117 | self.local_condition_features.set_shape(self._placeholders[idx].shape) 118 | idx += 1 119 | 120 | #If global conditioning disabled override g inputs with None 121 | if hparams.gin_channels < 0: 122 | self.global_condition_features = None 123 | else: 124 | self.global_condition_features = variables[idx] 125 | self.global_condition_features.set_shape(self._placeholders[idx].shape) 126 | 127 | # Create queue for buffering eval data 128 | eval_queue = tf.FIFOQueue(1, queue_types, name='eval_queue') 129 | self._eval_enqueue_op = eval_queue.enqueue(self._placeholders) 130 | eval_variables = eval_queue.dequeue() 131 | 132 | self.eval_inputs = eval_variables[0] 133 | self.eval_inputs.set_shape(self._placeholders[0].shape) 134 | self.eval_targets = eval_variables[1] 135 | self.eval_targets.set_shape(self._placeholders[1].shape) 136 | self.eval_input_lengths = eval_variables[2] 137 | self.eval_input_lengths.set_shape(self._placeholders[2].shape) 138 | 139 | eval_idx = 3 140 | 141 | #If local conditioning disabled override c inputs with None 142 | if hparams.cin_channels < 0: 143 | self.eval_local_condition_features = None 144 | else: 145 | self.eval_local_condition_features = eval_variables[eval_idx] 146 | self.eval_local_condition_features.set_shape(self._placeholders[eval_idx].shape) 147 | eval_idx += 1 148 | 149 | #If global conditioning disabled override g inputs with None 150 | if hparams.gin_channels < 0: 151 | self.eval_global_condition_features = None 152 | else: 153 | self.eval_global_condition_features = eval_variables[eval_idx] 154 | self.eval_global_condition_features.set_shape(self._placeholders[eval_idx].shape) 155 | 156 | 157 | def start_threads(self, session): 158 | 
self._session = session 159 | thread = threading.Thread(name='background', target=self._enqueue_next_train_group) 160 | thread.daemon = True #Thread will close when parent quits 161 | thread.start() 162 | 163 | thread = threading.Thread(name='background', target=self._enqueue_next_test_group) 164 | thread.daemon = True #Thread will close when parent quits 165 | thread.start() 166 | 167 | def _get_test_groups(self): 168 | meta = self._test_meta[self._test_offset] 169 | self._test_offset += 1 170 | 171 | if self._hparams.train_with_GTA: 172 | mel_file = meta[2] 173 | else: 174 | mel_file = meta[1] 175 | audio_file = meta[0] 176 | 177 | input_data = np.load(os.path.join(self._base_dir, audio_file)) 178 | 179 | if self.local_condition: 180 | local_condition_features = np.load(os.path.join(self._base_dir, mel_file)) 181 | else: 182 | local_condition_features = None 183 | 184 | if self.global_condition: 185 | global_condition_features = meta[3] 186 | if global_condition_features == '': 187 | raise RuntimeError('Please redo the wavenet preprocessing (or GTA synthesis) to assign global condition features!') 188 | else: 189 | global_condition_features = None 190 | 191 | return (input_data, local_condition_features, global_condition_features, len(input_data)) 192 | 193 | def make_test_batches(self): 194 | start = time.time() 195 | 196 | #Read one example for evaluation 197 | n = 1 198 | 199 | #Test on entire test set (one sample at an evaluation step) 200 | examples = [self._get_test_groups() for i in range(len(self._test_meta))] 201 | batches = [examples[i: i+n] for i in range(0, len(examples), n)] 202 | np.random.shuffle(batches) 203 | 204 | log('\nGenerated {} test batches of size {} in {:.3f} sec'.format(len(batches), n, time.time() - start)) 205 | return batches 206 | 207 | def _enqueue_next_train_group(self): 208 | while not self._coord.should_stop(): 209 | start = time.time() 210 | 211 | # Read a group of examples 212 | n = self._hparams.wavenet_batch_size 213 | examples = [self._get_next_example() for i in range(n * _batches_per_group)] 214 | 215 | # Bucket examples base on similiar output length for efficiency 216 | examples.sort(key=lambda x: x[-1]) 217 | batches = [examples[i: i+n] for i in range(0, len(examples), n)] 218 | np.random.shuffle(batches) 219 | 220 | log('\nGenerated {} train batches of size {} in {:.3f} sec'.format(len(batches), n, time.time() - start)) 221 | for batch in batches: 222 | feed_dict = dict(zip(self._placeholders, self._prepare_batch(batch))) 223 | self._session.run(self._enqueue_op, feed_dict=feed_dict) 224 | 225 | def _enqueue_next_test_group(self): 226 | test_batches = self.make_test_batches() 227 | while not self._coord.should_stop(): 228 | for batch in test_batches: 229 | feed_dict = dict(zip(self._placeholders, self._prepare_batch(batch))) 230 | self._session.run(self._eval_enqueue_op, feed_dict=feed_dict) 231 | 232 | def _get_next_example(self): 233 | '''Get a single example (input, output, len_output) from disk 234 | ''' 235 | if self._train_offset >= len(self._train_meta): 236 | self._train_offset = 0 237 | np.random.shuffle(self._train_meta) 238 | meta = self._train_meta[self._train_offset] 239 | self._train_offset += 1 240 | 241 | if self._hparams.train_with_GTA: 242 | mel_file = meta[2] 243 | if 'linear' in mel_file: 244 | raise RuntimeError('Linear spectrogram files selected instead of GTA mels, did you specify the wrong metadata?') 245 | else: 246 | mel_file = meta[1] 247 | audio_file = meta[0] 248 | 249 | input_data = 
np.load(os.path.join(self._base_dir, audio_file)) 250 | 251 | if self.local_condition: 252 | local_condition_features = np.load(os.path.join(self._base_dir, mel_file)) 253 | else: 254 | local_condition_features = None 255 | 256 | if self.global_condition: 257 | global_condition_features = meta[3] 258 | if global_condition_features == '': 259 | raise RuntimeError('Please redo the wavenet preprocessing (or GTA synthesis) to assign global condition features!') 260 | else: 261 | global_condition_features = None 262 | 263 | return (input_data, local_condition_features, global_condition_features, len(input_data)) 264 | 265 | 266 | def _prepare_batch(self, batches): 267 | assert 0 == len(batches) % self._hparams.wavenet_num_gpus 268 | size_per_device = int(len(batches) / self._hparams.wavenet_num_gpus) 269 | np.random.shuffle(batches) 270 | 271 | #Limit time steps to save GPU Memory usage 272 | max_time_steps = self._limit_time() 273 | #Adjust time resolution for upsampling 274 | batches = self._adjust_time_resolution(batches, self.local_condition, max_time_steps) 275 | 276 | #time lengths 277 | input_lengths = np.asarray([len(x[0]) for x in batches], np.int32) 278 | max_input_length = max(input_lengths) 279 | 280 | #Since all inputs/targets will have the same lengths for all GPUs, we can simply treat all GPUs batches as one big batch and stack all data. (fixed length) 281 | inputs = self._prepare_inputs([x[0] for x in batches], max_input_length) 282 | targets = self._prepare_targets([x[0] for x in batches], max_input_length) 283 | local_condition_features = self._prepare_local_conditions(self.local_condition, [x[1] for x in batches]) 284 | global_condition_features = self._prepare_global_conditions(self.global_condition, [x[2] for x in batches]) 285 | 286 | #Create final batches 287 | new_batches = (inputs, targets, input_lengths) 288 | if local_condition_features is not None: 289 | new_batches += (local_condition_features, ) 290 | if global_condition_features is not None: 291 | new_batches += (global_condition_features, ) 292 | 293 | return new_batches 294 | 295 | def _prepare_inputs(self, inputs, maxlen): 296 | if is_mulaw_quantize(self._hparams.input_type): 297 | #[batch_size, time_steps, quantize_channels] 298 | x_batch = np.stack([_pad_inputs(np_utils.to_categorical( 299 | x, num_classes=self._hparams.quantize_channels), maxlen) for x in inputs]).astype(np.float32) 300 | else: 301 | #[batch_size, time_steps, 1] 302 | x_batch = np.stack([_pad_inputs(x.reshape(-1, 1), maxlen) for x in inputs]).astype(np.float32) 303 | assert len(x_batch.shape) == 3 304 | #Convert to channels first [batch_size, quantize_channels (or 1), time_steps] 305 | x_batch = np.transpose(x_batch, (0, 2, 1)) 306 | return x_batch 307 | 308 | def _prepare_targets(self, targets, maxlen): 309 | #[batch_size, time_steps] 310 | if is_mulaw_quantize(self._hparams.input_type): 311 | y_batch = np.stack([_pad_targets(x, maxlen) for x in targets]).astype(np.int32) 312 | else: 313 | y_batch = np.stack([_pad_targets(x, maxlen) for x in targets]).astype(np.float32) 314 | assert len(y_batch.shape) == 2 315 | #Add extra axis (make 3 dimension) 316 | y_batch = np.expand_dims(y_batch, axis=-1) 317 | return y_batch 318 | 319 | def _prepare_local_conditions(self, local_condition, c_features): 320 | if local_condition: 321 | maxlen = max([len(x) for x in c_features]) 322 | #[-max, max] or [0,max] 323 | T2_output_range = (-self._hparams.max_abs_value, self._hparams.max_abs_value) if self._hparams.symmetric_mels else (0, 
self._hparams.max_abs_value) 324 | 325 | if self._hparams.clip_for_wavenet: 326 | c_features = [np.clip(x, T2_output_range[0], T2_output_range[1]) for x in c_features] 327 | 328 | c_batch = np.stack([_pad_inputs(x, maxlen, _pad=T2_output_range[0]) for x in c_features]).astype(np.float32) 329 | assert len(c_batch.shape) == 3 330 | #[batch_size, c_channels, time_steps] 331 | c_batch = np.transpose(c_batch, (0, 2, 1)) 332 | 333 | if self._hparams.normalize_for_wavenet: 334 | #rescale to [0, 1] 335 | c_batch = _interp(c_batch, T2_output_range).astype(np.float32) 336 | 337 | else: 338 | c_batch = None 339 | 340 | return c_batch 341 | 342 | def _prepare_global_conditions(self, global_condition, g_features): 343 | if global_condition: 344 | g_batch = np.array(g_features).astype(np.int32).reshape(-1, 1) 345 | 346 | else: 347 | g_batch = None 348 | 349 | return g_batch 350 | 351 | def _check_conditions(self): 352 | local_condition = self._hparams.cin_channels > 0 353 | global_condition = self._hparams.gin_channels > 0 354 | return local_condition, global_condition 355 | 356 | def _limit_time(self): 357 | '''Limit time resolution to save GPU memory. 358 | ''' 359 | if self._hparams.max_time_sec is not None: 360 | return int(self._hparams.max_time_sec * self._hparams.sample_rate) 361 | 362 | elif self._hparams.max_time_steps is not None: 363 | return self._hparams.max_time_steps 364 | 365 | else: 366 | return None 367 | 368 | def _adjust_time_resolution(self, batch, local_condition, max_time_steps): 369 | '''Adjust time resolution between audio and local condition 370 | ''' 371 | if local_condition: 372 | new_batch = [] 373 | for b in batch: 374 | x, c, g, l = b 375 | self._assert_ready_for_upsample(x, c) 376 | if max_time_steps is not None: 377 | max_steps = _ensure_divisible(max_time_steps, audio.get_hop_size(self._hparams), True) 378 | if len(x) > max_time_steps: 379 | max_time_frames = max_steps // audio.get_hop_size(self._hparams) 380 | start = np.random.randint(0, len(c) - max_time_frames) 381 | time_start = start * audio.get_hop_size(self._hparams) 382 | x = x[time_start: time_start + max_time_frames * audio.get_hop_size(self._hparams)] 383 | c = c[start: start + max_time_frames, :] 384 | self._assert_ready_for_upsample(x, c) 385 | 386 | new_batch.append((x, c, g, l)) 387 | return new_batch 388 | 389 | else: 390 | new_batch = [] 391 | for b in batch: 392 | x, c, g, l = b 393 | x = audio.trim_silence(x, self._hparams) 394 | if max_time_steps is not None and len(x) > max_time_steps: 395 | start = np.random.randint(0, len(x) - max_time_steps) 396 | x = x[start: start + max_time_steps] 397 | new_batch.append((x, c, g, l)) 398 | return new_batch 399 | 400 | def _assert_ready_for_upsample(self, x, c): 401 | assert len(x) % len(c) == 0 and len(x) // len(c) == audio.get_hop_size(self._hparams) 402 | 403 | 404 | def _pad_inputs(x, maxlen, _pad=0): 405 | return np.pad(x, [(0, maxlen - len(x)), (0, 0)], mode='constant', constant_values=_pad) 406 | 407 | def _pad_targets(x, maxlen, _pad=0): 408 | return np.pad(x, (0, maxlen - len(x)), mode='constant', constant_values=_pad) 409 | 410 | def _round_up(x, multiple): 411 | remainder = x % multiple 412 | return x if remainder == 0 else x + multiple - remainder 413 | 414 | def _round_down(x, multiple): 415 | remainder = x % multiple 416 | return x if remainder == 0 else x - remainder 417 | 418 | def _ensure_divisible(length, divisible_by=256, lower=True): 419 | if length % divisible_by == 0: 420 | return length 421 | if lower: 422 | return length - length % divisible_by
423 | else: 424 | return length + (divisible_by - length % divisible_by) 425 | 426 | def _interp(feats, in_range): 427 | #rescales from [-max, max] (or [0, max]) to [0, 1] 428 | return (feats - in_range[0]) / (in_range[1] - in_range[0]) 429 | -------------------------------------------------------------------------------- /wavenet_vocoder/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .wavenet import WaveNet 2 | from warnings import warn 3 | from wavenet_vocoder.util import is_mulaw_quantize 4 | 5 | def create_model(name, hparams, init=False): 6 | if is_mulaw_quantize(hparams.input_type): 7 | if hparams.out_channels != hparams.quantize_channels: 8 | raise RuntimeError( 9 | "out_channels must be equal to quantize_channels if input_type is 'mulaw-quantize'") 10 | 11 | if name == 'WaveNet': 12 | return WaveNet(hparams, init) 13 | else: 14 | raise Exception('Unknown model: {}'.format(name)) 15 | -------------------------------------------------------------------------------- /wavenet_vocoder/models/gaussian.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | 5 | def gaussian_maximum_likelihood_estimation_loss(y_hat, y, log_scale_min_gauss, num_classes, use_cdf=True, reduce=True): 6 | '''Compute the Gaussian MLE loss''' 7 | with tf.control_dependencies([tf.assert_equal(tf.shape(y_hat)[1], 2), tf.assert_equal(tf.rank(y_hat), 3)]): 8 | #[batch_size, time_steps, channels] 9 | y_hat = tf.transpose(y_hat, [0, 2, 1]) 10 | 11 | #Unpack parameters: mean and log_scale outputs 12 | mean = y_hat[:, :, 0] 13 | log_scale = tf.maximum(y_hat[:, :, 1], log_scale_min_gauss) 14 | y = tf.squeeze(y, [-1]) 15 | 16 | if use_cdf: 17 | #Compute log_probs using the CDF trick (normalized loss value and more stable training than with the natural log prob) 18 | #Instantiate a Normal distribution with model outputs 19 | gaussian = tf.contrib.distributions.Normal(loc=mean, scale=tf.exp(log_scale)) 20 | 21 | #Draw CDF+ and CDF- neighbors of the true sample y 22 | cdf_plus = gaussian.cdf(y + 1. / (num_classes - 1)) 23 | cdf_min = gaussian.cdf(y - 1. / (num_classes - 1)) 24 | 25 | #Maximize the difference between CDF+ and CDF- (or its log) 26 | log_prob = tf.log(tf.maximum(cdf_plus - cdf_min, 1e-12)) 27 | 28 | else: 29 | #Get the log probability of each sample under this distribution in a computationally stable fashion 30 | #This is the log(PDF) 31 | log_prob = -0.5 * (np.log(2. * np.pi) + 2. * log_scale + tf.square(y - mean) * tf.exp(-2. * log_scale)) 32 | 33 | #Loss (maximize the log probability by minimizing its negative) 34 | if reduce: 35 | return -tf.reduce_sum(log_prob) 36 | else: 37 | return -tf.expand_dims(log_prob, [-1]) 38 | 39 | def sample_from_gaussian(y, log_scale_min_gauss): 40 | '''Sample from the learned Gaussian distribution''' 41 | with tf.control_dependencies([tf.assert_equal(tf.shape(y)[1], 2)]): 42 | #[batch_size, time_length, channels] 43 | y = tf.transpose(y, [0, 2, 1]) 44 | 45 | mean = y[:, :, 0] 46 | log_scale = tf.maximum(y[:, :, 1], log_scale_min_gauss) 47 | scale = tf.exp(log_scale) 48 | 49 | gaussian_dist = tf.contrib.distributions.Normal(loc=mean, scale=scale, allow_nan_stats=False) 50 | x = gaussian_dist.sample() 51 | 52 | return tf.minimum(tf.maximum(x, -1.), 1.)
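Note on the loss above: with use_cdf=True, the likelihood of each target sample is approximated by the probability mass the predicted Normal assigns to the quantization bin of width 2/(num_classes - 1) around y (audio is assumed to be scaled to [-1, 1]). The snippet below is a minimal NumPy/SciPy sketch of that computation, not part of the repository; the function name and the scipy dependency are illustrative.

import numpy as np
from scipy.stats import norm

def gaussian_mle_loss_np(mean, log_scale, y, num_classes):
    #Probability mass of the bin [y - 1/(num_classes-1), y + 1/(num_classes-1)]
    scale = np.exp(log_scale)
    half_bin = 1. / (num_classes - 1)
    cdf_plus = norm.cdf(y + half_bin, loc=mean, scale=scale)
    cdf_min = norm.cdf(y - half_bin, loc=mean, scale=scale)
    #Clamp before the log for numerical stability, as in the TF code above
    log_prob = np.log(np.maximum(cdf_plus - cdf_min, 1e-12))
    return -np.sum(log_prob)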
53 | -------------------------------------------------------------------------------- /wavenet_vocoder/models/mixture.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | 5 | def log_sum_exp(x): 6 | """ numerically stable log_sum_exp implementation that prevents overflow """ 7 | axis = len(x.get_shape())-1 8 | m = tf.reduce_max(x, axis) 9 | m2 = tf.reduce_max(x, axis, keepdims=True) 10 | return m + tf.log(tf.reduce_sum(tf.exp(x-m2), axis)) 11 | 12 | def log_prob_from_logits(x): 13 | """ numerically stable log_softmax implementation that prevents overflow """ 14 | axis = len(x.get_shape())-1 15 | m = tf.reduce_max(x, axis, keepdims=True) 16 | return x - m - tf.log(tf.reduce_sum(tf.exp(x-m), axis, keepdims=True)) 17 | 18 | def discretized_mix_logistic_loss(y_hat, y, num_classes=256, 19 | log_scale_min=-7.0, reduce=True): 20 | '''Discretized mix of logistic distributions loss. 21 | 22 | Note that it is assumed that input is scaled to [-1, 1] 23 | 24 | Args: 25 | y_hat: Tensor [batch_size, channels, time_length], predicted output. 26 | y: Tensor [batch_size, time_length, 1], Target. 27 | Returns: 28 | Tensor loss 29 | ''' 30 | with tf.control_dependencies([tf.assert_equal(tf.mod(tf.shape(y_hat)[1], 3), 0), tf.assert_equal(tf.rank(y_hat), 3)]): 31 | nr_mix = tf.shape(y_hat)[1] // 3 32 | 33 | #[Batch_size, time_length, channels] 34 | y_hat = tf.transpose(y_hat, [0, 2, 1]) 35 | 36 | #unpack parameters. [batch_size, time_length, num_mixtures] x 3 37 | logit_probs = y_hat[:, :, :nr_mix] 38 | means = y_hat[:, :, nr_mix:2 * nr_mix] 39 | log_scales = tf.maximum(y_hat[:, :, 2* nr_mix: 3 * nr_mix], log_scale_min) 40 | 41 | #[batch_size, time_length, 1] -> [batch_size, time_length, num_mixtures] 42 | y = y * tf.ones(shape=[1, 1, nr_mix], dtype=tf.float32) 43 | 44 | centered_y = y - means 45 | inv_stdv = tf.exp(-log_scales) 46 | plus_in = inv_stdv * (centered_y + 1. / (num_classes - 1)) 47 | cdf_plus = tf.nn.sigmoid(plus_in) 48 | min_in = inv_stdv * (centered_y - 1. / (num_classes - 1)) 49 | cdf_min = tf.nn.sigmoid(min_in) 50 | 51 | log_cdf_plus = plus_in - tf.nn.softplus(plus_in) # log probability for edge case of 0 (before scaling) 52 | log_one_minus_cdf_min = -tf.nn.softplus(min_in) # log probability for edge case of 255 (before scaling) 53 | 54 | #probability for all other cases 55 | cdf_delta = cdf_plus - cdf_min 56 | 57 | mid_in = inv_stdv * centered_y 58 | #log probability in the center of the bin, to be used in extreme cases 59 | #(not actually used in this code) 60 | log_pdf_mid = mid_in - log_scales - 2. 
* tf.nn.softplus(mid_in) 61 | 62 | log_probs = tf.where(y < -0.999, log_cdf_plus, 63 | tf.where(y > 0.999, log_one_minus_cdf_min, 64 | tf.where(cdf_delta > 1e-5, 65 | tf.log(tf.maximum(cdf_delta, 1e-12)), 66 | log_pdf_mid - np.log((num_classes - 1) / 2)))) 67 | 68 | #log_probs = log_probs + tf.nn.log_softmax(logit_probs, -1) 69 | log_probs = log_probs + log_prob_from_logits(logit_probs) 70 | 71 | if reduce: 72 | return -tf.reduce_sum(log_sum_exp(log_probs)) 73 | else: 74 | return -tf.expand_dims(log_sum_exp(log_probs), [-1]) 75 | 76 | def sample_from_discretized_mix_logistic(y, log_scale_min=-7.): 77 | ''' 78 | Args: 79 | y: Tensor, [batch_size, channels, time_length] 80 | Returns: 81 | Tensor: sample in range of [-1, 1] 82 | ''' 83 | with tf.control_dependencies([tf.assert_equal(tf.mod(tf.shape(y)[1], 3), 0)]): 84 | nr_mix = tf.shape(y)[1] // 3 85 | 86 | #[batch_size, time_length, channels] 87 | y = tf.transpose(y, [0, 2, 1]) 88 | logit_probs = y[:, :, :nr_mix] 89 | 90 | #sample mixture indicator from softmax 91 | temp = tf.random_uniform(tf.shape(logit_probs), minval=1e-5, maxval=1. - 1e-5) 92 | temp = logit_probs - tf.log(-tf.log(temp)) 93 | argmax = tf.argmax(temp, -1) 94 | 95 | #[batch_size, time_length] -> [batch_size, time_length, nr_mix] 96 | one_hot = tf.one_hot(argmax, depth=nr_mix, dtype=tf.float32) 97 | #select logistic parameters 98 | means = tf.reduce_sum(y[:, :, nr_mix:2 * nr_mix] * one_hot, axis=-1) 99 | log_scales = tf.maximum(tf.reduce_sum( 100 | y[:, :, 2 * nr_mix:3 * nr_mix] * one_hot, axis=-1), log_scale_min) 101 | 102 | #sample from logistic & clip to interval 103 | #we don't actually round to the nearest 8-bit value when sampling 104 | u = tf.random_uniform(tf.shape(means), minval=1e-5, maxval=1. - 1e-5) 105 | x = means + tf.exp(log_scales) * (tf.log(u) - tf.log(1 -u)) 106 | 107 | return tf.minimum(tf.maximum(x, -1.), 1.) 
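sample_from_discretized_mix_logistic above proceeds in two steps: it draws a mixture component with the Gumbel-max trick, then samples from the selected logistic distribution via its inverse CDF and clips the result to the waveform range. The following is a small NumPy sketch of the same two steps, not part of the repository; names and the hard-coded log-scale floor are illustrative.

import numpy as np

def sample_mix_logistic_np(logit_probs, means, log_scales, rng=np.random):
    #Gumbel-max trick: argmax(logits + Gumbel noise) is a draw from Categorical(softmax(logits))
    u = rng.uniform(1e-5, 1. - 1e-5, size=logit_probs.shape)
    component = np.argmax(logit_probs - np.log(-np.log(u)), axis=-1)
    one_hot = np.eye(logit_probs.shape[-1])[component]
    #Select the parameters of the chosen component
    mean = np.sum(means * one_hot, axis=-1)
    log_scale = np.maximum(np.sum(log_scales * one_hot, axis=-1), -7.)
    #Inverse CDF of the logistic distribution, clipped to [-1, 1]
    u = rng.uniform(1e-5, 1. - 1e-5, size=mean.shape)
    x = mean + np.exp(log_scale) * (np.log(u) - np.log(1. - u))
    return np.clip(x, -1., 1.)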
108 | -------------------------------------------------------------------------------- /wavenet_vocoder/synthesize.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import numpy as np 5 | import tensorflow as tf 6 | from hparams import hparams, hparams_debug_string 7 | from infolog import log 8 | from tqdm import tqdm 9 | from wavenet_vocoder.synthesizer import Synthesizer 10 | 11 | 12 | def run_synthesis(args, checkpoint_path, output_dir, hparams): 13 | log_dir = os.path.join(output_dir, 'plots') 14 | wav_dir = os.path.join(output_dir, 'wavs') 15 | 16 | #We suppose the user will provide the correct folder depending on the training method 17 | log(hparams_debug_string()) 18 | synth = Synthesizer() 19 | synth.load(checkpoint_path, hparams) 20 | 21 | if args.model == 'Tacotron-2': 22 | #If running the full Tacotron-2, synthesize audio from evaluated mels 23 | metadata_filename = os.path.join(args.mels_dir, 'map.txt') 24 | with open(metadata_filename, encoding='utf-8') as f: 25 | metadata = np.array([line.strip().split('|') for line in f]) 26 | 27 | speaker_ids = metadata[:, 2] 28 | mel_files = metadata[:, 1] 29 | texts = metadata[:, 0] 30 | 31 | speaker_ids = None if (speaker_ids == '').all() else speaker_ids 32 | else: 33 | #else get all npy files in input_dir (supposing they are mels) 34 | mel_files = sorted([os.path.join(args.mels_dir, f) for f in os.listdir(args.mels_dir) if f.split('.')[-1] == 'npy']) 35 | speaker_ids = None if args.speaker_id is None else args.speaker_id.replace(' ', '').split(',') 36 | if speaker_ids is not None: 37 | assert len(speaker_ids) == len(mel_files) 38 | 39 | texts = None 40 | 41 | log('Starting synthesis! (this will take a while..)') 42 | os.makedirs(log_dir, exist_ok=True) 43 | os.makedirs(wav_dir, exist_ok=True) 44 | 45 | mel_files = [mel_files[i: i+hparams.wavenet_synthesis_batch_size] for i in range(0, len(mel_files), hparams.wavenet_synthesis_batch_size)] 46 | speaker_ids = None if speaker_ids is None else [speaker_ids[i: i+hparams.wavenet_synthesis_batch_size] for i in range(0, len(speaker_ids), hparams.wavenet_synthesis_batch_size)] 47 | texts = None if texts is None else [texts[i: i+hparams.wavenet_synthesis_batch_size] for i in range(0, len(texts), hparams.wavenet_synthesis_batch_size)] 48 | 49 | with open(os.path.join(wav_dir, 'map.txt'), 'w') as file: 50 | for i, mel_batch in enumerate(tqdm(mel_files)): 51 | mel_spectros = [np.load(mel_file) for mel_file in mel_batch] 52 | 53 | basenames = [os.path.basename(mel_file).replace('.npy', '') for mel_file in mel_batch] 54 | speaker_id_batch = None if speaker_ids is None else speaker_ids[i] 55 | audio_files = synth.synthesize(mel_spectros, speaker_id_batch, basenames, wav_dir, log_dir) 56 | 57 | speaker_logs = [''] * len(mel_batch) if speaker_id_batch is None else speaker_id_batch 58 | 59 | for j, mel_file in enumerate(mel_batch): 60 | if texts is None: 61 | file.write('{}|{}|{}\n'.format(mel_file, audio_files[j], speaker_logs[j])) 62 | else: 63 | file.write('{}|{}|{}|{}\n'.format(texts[i][j], mel_file, audio_files[j], speaker_logs[j])) 64 | 65 | log('Synthesized audio waveforms at {}'.format(wav_dir)) 66 | 67 | 68 | 69 | def wavenet_synthesize(args, hparams, checkpoint): 70 | output_dir = 'wavenet_' + args.output_dir 71 | 72 | try: 73 | checkpoint_path = tf.train.get_checkpoint_state(checkpoint).model_checkpoint_path 74 | log('loaded model at {}'.format(checkpoint_path)) 75 | except: 76 | raise RuntimeError('Failed to load checkpoint at 
{}'.format(checkpoint)) 77 | 78 | run_synthesis(args, checkpoint_path, output_dir, hparams) 79 | -------------------------------------------------------------------------------- /wavenet_vocoder/synthesizer.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import tensorflow as tf 5 | from datasets.audio import save_wavenet_wav, get_hop_size, melspectrogram 6 | from infolog import log 7 | from wavenet_vocoder.models import create_model 8 | from wavenet_vocoder.train import create_shadow_saver, load_averaged_model 9 | from wavenet_vocoder.feeder import _interp 10 | 11 | from . import util 12 | 13 | 14 | class Synthesizer: 15 | def load(self, checkpoint_path, hparams, model_name='WaveNet'): 16 | log('Constructing model: {}'.format(model_name)) 17 | self._hparams = hparams 18 | local_cond, global_cond = self._check_conditions() 19 | 20 | self.local_conditions = tf.placeholder(tf.float32, shape=(None, None, hparams.num_mels), name='local_condition_features') if local_cond else None 21 | self.global_conditions = tf.placeholder(tf.int32, shape=(None, 1), name='global_condition_features') if global_cond else None 22 | self.synthesis_length = tf.placeholder(tf.int32, shape=(), name='synthesis_length') if not local_cond else None 23 | self.targets = tf.placeholder(tf.float32, shape=(1, None, 1), name='audio_targets') if hparams.wavenet_synth_debug else None #Debug only with 1 wav 24 | self.input_lengths = tf.placeholder(tf.int32, shape=(1, ), name='input_lengths') if hparams.wavenet_synth_debug else None 25 | self.synth_debug = hparams.wavenet_synth_debug 26 | 27 | with tf.variable_scope('WaveNet_model') as scope: 28 | self.model = create_model(model_name, hparams) 29 | self.model.initialize(y=None, c=self.local_conditions, g=self.global_conditions, 30 | input_lengths=self.input_lengths, synthesis_length=self.synthesis_length, test_inputs=self.targets) 31 | 32 | self._hparams = hparams 33 | sh_saver = create_shadow_saver(self.model) 34 | 35 | log('Loading checkpoint: {}'.format(checkpoint_path)) 36 | #Memory allocation on the GPU as needed 37 | config = tf.ConfigProto() 38 | config.gpu_options.allow_growth = True 39 | config.allow_soft_placement = True 40 | 41 | self.session = tf.Session(config=config) 42 | self.session.run(tf.global_variables_initializer()) 43 | 44 | load_averaged_model(self.session, sh_saver, checkpoint_path) 45 | 46 | def synthesize(self, mel_spectrograms, speaker_ids, basenames, out_dir, log_dir): 47 | hparams = self._hparams 48 | local_cond, global_cond = self._check_conditions() 49 | 50 | #Switch mels in case of debug 51 | if self.synth_debug: 52 | assert len(hparams.wavenet_debug_mels) == len(hparams.wavenet_debug_wavs) 53 | mel_spectrograms = [np.load(mel_file) for mel_file in hparams.wavenet_debug_mels] 54 | 55 | #Get True length of audio to be synthesized: audio_len = mel_len * hop_size 56 | audio_lengths = [len(x) * get_hop_size(self._hparams) for x in mel_spectrograms] 57 | 58 | #Prepare local condition batch 59 | maxlen = max([len(x) for x in mel_spectrograms]) 60 | #[-max, max] or [0,max] 61 | T2_output_range = (-self._hparams.max_abs_value, self._hparams.max_abs_value) if self._hparams.symmetric_mels else (0, self._hparams.max_abs_value) 62 | 63 | if self._hparams.clip_for_wavenet: 64 | mel_spectrograms = [np.clip(x, T2_output_range[0], T2_output_range[1]) for x in mel_spectrograms] 65 | 66 | c_batch = np.stack([_pad_inputs(x, maxlen, _pad=T2_output_range[0]) for x in 
mel_spectrograms]).astype(np.float32) 67 | 68 | if self._hparams.normalize_for_wavenet: 69 | #rerange to [0, 1] 70 | c_batch = _interp(c_batch, T2_output_range).astype(np.float32) 71 | 72 | g = None if speaker_ids is None else np.asarray(speaker_ids, dtype=np.int32).reshape(len(c_batch), 1) 73 | feed_dict = {} 74 | 75 | if local_cond: 76 | feed_dict[self.local_conditions] = c_batch 77 | else: 78 | feed_dict[self.synthesis_length] = 100 79 | 80 | if global_cond: 81 | feed_dict[self.global_conditions] = g 82 | 83 | if self.synth_debug: 84 | debug_wavs = hparams.wavenet_debug_wavs 85 | assert len(debug_wavs) % hparams.wavenet_num_gpus == 0 86 | test_wavs = [np.load(debug_wav).reshape(-1, 1) for debug_wav in debug_wavs] 87 | 88 | #pad wavs to same length 89 | max_test_len = max([len(x) for x in test_wavs]) 90 | test_wavs = np.stack([_pad_inputs(x, max_test_len) for x in test_wavs]).astype(np.float32) 91 | 92 | assert len(test_wavs) == len(debug_wavs) 93 | feed_dict[self.targets] = test_wavs.reshape(len(test_wavs), max_test_len, 1) 94 | feed_dict[self.input_lengths] = np.asarray([test_wavs.shape[1]]) 95 | 96 | #Generate wavs and clip extra padding to select Real speech parts 97 | generated_wavs, upsampled_features = self.session.run([self.model.tower_y_hat, self.model.tower_synth_upsampled_local_features], feed_dict=feed_dict) 98 | 99 | #Linearize outputs (n_gpus -> 1D) 100 | generated_wavs = [wav for gpu_wavs in generated_wavs for wav in gpu_wavs] 101 | upsampled_features = [feat for gpu_feats in upsampled_features for feat in gpu_feats] 102 | 103 | generated_wavs = [generated_wav[:length] for generated_wav, length in zip(generated_wavs, audio_lengths)] 104 | upsampled_features = [upsampled_feature[:, :length] for upsampled_feature, length in zip(upsampled_features, audio_lengths)] 105 | 106 | audio_filenames = [] 107 | for i, (generated_wav, input_mel, upsampled_feature) in enumerate(zip(generated_wavs, mel_spectrograms, upsampled_features)): 108 | #Save wav to disk 109 | audio_filename = os.path.join(out_dir, 'wavenet-audio-{}.wav'.format(basenames[i])) 110 | save_wavenet_wav(generated_wav, audio_filename, sr=hparams.sample_rate, inv_preemphasize=hparams.preemphasize, k=hparams.preemphasis) 111 | audio_filenames.append(audio_filename) 112 | 113 | #Compare generated wav mel with original input mel to evaluate wavenet audio reconstruction performance 114 | #Both mels should match on low frequency information, wavenet mel should contain more high frequency detail when compared to Tacotron mels. 115 | generated_mel = melspectrogram(generated_wav, hparams).T 116 | util.plot_spectrogram(generated_mel, os.path.join(log_dir, 'wavenet-mel-spectrogram-{}.png'.format(basenames[i])), 117 | title='Local Condition vs Reconstructed Audio Mel-Spectrogram analysis', target_spectrogram=input_mel) 118 | #Save upsampled features to visualize checkerboard artifacts. 
119 | util.plot_spectrogram(upsampled_feature.T, os.path.join(log_dir, 'wavenet-upsampled_features-{}.png'.format(basenames[i])), 120 | title='Upmsampled Local Condition features', auto_aspect=True) 121 | 122 | #Save waveplot to disk 123 | if log_dir is not None: 124 | plot_filename = os.path.join(log_dir, 'wavenet-waveplot-{}.png'.format(basenames[i])) 125 | util.waveplot(plot_filename, generated_wav, None, hparams, title='WaveNet generated Waveform.') 126 | 127 | return audio_filenames 128 | 129 | def _check_conditions(self): 130 | local_condition = self._hparams.cin_channels > 0 131 | global_condition = self._hparams.gin_channels > 0 132 | return local_condition, global_condition 133 | 134 | 135 | def _pad_inputs(x, maxlen, _pad=0): 136 | return np.pad(x, [(0, maxlen - len(x)), (0, 0)], mode='constant', constant_values=_pad) 137 | -------------------------------------------------------------------------------- /wavenet_vocoder/train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | import time 5 | import traceback 6 | from datetime import datetime 7 | 8 | import infolog 9 | import librosa 10 | import numpy as np 11 | import tensorflow as tf 12 | from hparams import hparams_debug_string 13 | from datasets.audio import save_wavenet_wav, melspectrogram 14 | from tacotron.utils import ValueWindow 15 | from wavenet_vocoder.feeder import Feeder, _interp 16 | from wavenet_vocoder.models import create_model 17 | 18 | from . import util 19 | 20 | log = infolog.log 21 | 22 | 23 | def time_string(): 24 | return datetime.now().strftime('%Y-%m-%d %H:%M') 25 | 26 | def add_embedding_stats(summary_writer, embedding_names, paths_to_meta, checkpoint_path): 27 | #Create tensorboard projector 28 | config = tf.contrib.tensorboard.plugins.projector.ProjectorConfig() 29 | config.model_checkpoint_path = checkpoint_path 30 | 31 | for embedding_name, path_to_meta in zip(embedding_names, paths_to_meta): 32 | #Initialize config 33 | embedding = config.embeddings.add() 34 | #Specifiy the embedding variable and the metadata 35 | embedding.tensor_name = embedding_name 36 | embedding.metadata_path = path_to_meta 37 | 38 | #Project the embeddings to space dimensions for visualization 39 | tf.contrib.tensorboard.plugins.projector.visualize_embeddings(summary_writer, config) 40 | 41 | def add_train_stats(model, hparams): 42 | with tf.variable_scope('stats') as scope: 43 | for i in range(hparams.wavenet_num_gpus): 44 | tf.summary.histogram('wav_outputs %d' % i, model.tower_y_hat_log[i]) 45 | tf.summary.histogram('wav_targets %d' % i, model.tower_y_log[i]) 46 | if model.tower_means[i] is not None: 47 | tf.summary.histogram('gaussian_means %d' % i, model.tower_means[i]) 48 | tf.summary.histogram('gaussian_log_scales %d' % i, model.tower_log_scales[i]) 49 | 50 | tf.summary.scalar('wavenet_learning_rate', model.learning_rate) 51 | tf.summary.scalar('wavenet_loss', model.loss) 52 | 53 | gradient_norms = [tf.norm(grad) for grad in model.gradients if grad is not None] 54 | tf.summary.histogram('gradient_norm', gradient_norms) 55 | tf.summary.scalar('max_gradient_norm', tf.reduce_max(gradient_norms)) #visualize gradients (in case of explosion) 56 | return tf.summary.merge_all() 57 | 58 | def add_test_stats(summary_writer, step, eval_loss, hparams): 59 | values = [ 60 | tf.Summary.Value(tag='Wavenet_eval_model/eval_stats/wavenet_eval_loss', simple_value=eval_loss), 61 | ] 62 | 63 | test_summary = tf.Summary(value=values) 64 | 
summary_writer.add_summary(test_summary, step) 65 | 66 | 67 | def create_shadow_saver(model, global_step=None): 68 | '''Load shadow variables of saved model. 69 | 70 | Inspired by: https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage 71 | 72 | Can also use: shadow_dict = model.ema.variables_to_restore() 73 | ''' 74 | #Add global step to saved variables to save checkpoints correctly 75 | shadow_variables = [model.ema.average_name(v) for v in model.variables] 76 | variables = model.variables 77 | 78 | if global_step is not None: 79 | shadow_variables += ['global_step'] 80 | variables += [global_step] 81 | 82 | shadow_dict = dict(zip(shadow_variables, variables)) #dict(zip(keys, values)) -> {key1: value1, key2: value2, ...} 83 | return tf.train.Saver(shadow_dict, max_to_keep=20) 84 | 85 | def load_averaged_model(sess, sh_saver, checkpoint_path): 86 | sh_saver.restore(sess, checkpoint_path) 87 | 88 | 89 | def eval_step(sess, global_step, model, plot_dir, wav_dir, summary_writer, hparams, model_name): 90 | '''Evaluate model during training. 91 | Supposes that model variables are averaged. 92 | ''' 93 | start_time = time.time() 94 | y_hat, y_target, loss, input_mel, upsampled_features = sess.run([model.tower_y_hat[0], model.tower_y_target[0], 95 | model.eval_loss, model.tower_eval_c[0], model.tower_eval_upsampled_local_features[0]]) 96 | duration = time.time() - start_time 97 | log('Time Evaluation: Generation of {} audio frames took {:.3f} sec ({:.3f} frames/sec)'.format( 98 | len(y_target), duration, len(y_target)/duration)) 99 | 100 | #Make audio and plot paths 101 | pred_wav_path = os.path.join(wav_dir, 'step-{}-pred.wav'.format(global_step)) 102 | target_wav_path = os.path.join(wav_dir, 'step-{}-real.wav'.format(global_step)) 103 | plot_path = os.path.join(plot_dir, 'step-{}-waveplot.png'.format(global_step)) 104 | mel_path = os.path.join(plot_dir, 'step-{}-reconstruction-mel-spectrogram.png'.format(global_step)) 105 | upsampled_path = os.path.join(plot_dir, 'step-{}-upsampled-features.png'.format(global_step)) 106 | 107 | #Save figure 108 | util.waveplot(plot_path, y_hat, y_target, model._hparams, title='{}, {}, step={}, loss={:.5f}'.format(model_name, time_string(), global_step, loss)) 109 | log('Eval loss for global step {}: {:.3f}'.format(global_step, loss)) 110 | 111 | #Compare generated wav mel with original input mel to evaluate wavenet audio reconstruction performance 112 | #Both mels should match on low frequency information, wavenet mel should contain more high frequency detail when compared to Tacotron mels. 113 | T2_output_range = (-hparams.max_abs_value, hparams.max_abs_value) if hparams.symmetric_mels else (0, hparams.max_abs_value) 114 | generated_mel = _interp(melspectrogram(y_hat, hparams).T, T2_output_range) 115 | util.plot_spectrogram(generated_mel, mel_path, title='Local Condition vs Reconst. 
Mel-Spectrogram, step={}, loss={:.5f}'.format( 116 | global_step, loss), target_spectrogram=input_mel.T) 117 | util.plot_spectrogram(upsampled_features.T, upsampled_path, title='Upsampled Local Condition features, step={}, loss={:.5f}'.format( 118 | global_step, loss), auto_aspect=True) 119 | 120 | #Save Audio 121 | save_wavenet_wav(y_hat, pred_wav_path, sr=hparams.sample_rate, inv_preemphasize=hparams.preemphasize, k=hparams.preemphasis) 122 | save_wavenet_wav(y_target, target_wav_path, sr=hparams.sample_rate, inv_preemphasize=hparams.preemphasize, k=hparams.preemphasis) 123 | 124 | #Write eval summary to tensorboard 125 | log('Writing eval summary!') 126 | add_test_stats(summary_writer, global_step, loss, hparams=hparams) 127 | 128 | def save_log(sess, global_step, model, plot_dir, wav_dir, hparams, model_name): 129 | log('\nSaving intermediate states at step {}'.format(global_step)) 130 | idx = 0 131 | y_hat, y, loss, length, input_mel, upsampled_features = sess.run([model.tower_y_hat_log[0][idx], 132 | model.tower_y_log[0][idx], 133 | model.loss, 134 | model.tower_input_lengths[0][idx], 135 | model.tower_c[0][idx], model.tower_upsampled_local_features[0][idx]]) 136 | 137 | #mask by length 138 | y_hat[length:] = 0 139 | y[length:] = 0 140 | 141 | #Make audio and plot paths 142 | pred_wav_path = os.path.join(wav_dir, 'step-{}-pred.wav'.format(global_step)) 143 | target_wav_path = os.path.join(wav_dir, 'step-{}-real.wav'.format(global_step)) 144 | plot_path = os.path.join(plot_dir, 'step-{}-waveplot.png'.format(global_step)) 145 | mel_path = os.path.join(plot_dir, 'step-{}-reconstruction-mel-spectrogram.png'.format(global_step)) 146 | upsampled_path = os.path.join(plot_dir, 'step-{}-upsampled-features.png'.format(global_step)) 147 | 148 | #Save figure 149 | util.waveplot(plot_path, y_hat, y, hparams, title='{}, {}, step={}, loss={:.5f}'.format(model_name, time_string(), global_step, loss)) 150 | 151 | #Compare generated wav mel with original input mel to evaluate wavenet audio reconstruction performance 152 | #Both mels should match on low frequency information, wavenet mel should contain more high frequency detail when compared to Tacotron mels. 153 | T2_output_range = (-hparams.max_abs_value, hparams.max_abs_value) if hparams.symmetric_mels else (0, hparams.max_abs_value) 154 | generated_mel = _interp(melspectrogram(y_hat, hparams).T, T2_output_range) 155 | util.plot_spectrogram(generated_mel, mel_path, title='Local Condition vs Reconst. 
Mel-Spectrogram, step={}, loss={:.5f}'.format( 156 | global_step, loss), target_spectrogram=input_mel.T) 157 | util.plot_spectrogram(upsampled_features.T, upsampled_path, title='Upsampled Local Condition features, step={}, loss={:.5f}'.format( 158 | global_step, loss), auto_aspect=True) 159 | 160 | #Save audio 161 | save_wavenet_wav(y_hat, pred_wav_path, sr=hparams.sample_rate, inv_preemphasize=hparams.preemphasize, k=hparams.preemphasis) 162 | save_wavenet_wav(y, target_wav_path, sr=hparams.sample_rate, inv_preemphasize=hparams.preemphasize, k=hparams.preemphasis) 163 | 164 | def save_checkpoint(sess, saver, checkpoint_path, global_step): 165 | saver.save(sess, checkpoint_path, global_step=global_step) 166 | 167 | 168 | def model_train_mode(args, feeder, hparams, global_step, init=False): 169 | with tf.variable_scope('WaveNet_model', reuse=tf.AUTO_REUSE) as scope: 170 | model_name = None 171 | if args.model == 'Tacotron-2': 172 | model_name = 'WaveNet' 173 | model = create_model(model_name or args.model, hparams, init) 174 | #initialize model to train mode 175 | model.initialize(feeder.targets, feeder.local_condition_features, feeder.global_condition_features, 176 | feeder.input_lengths, x=feeder.inputs) 177 | model.add_loss() 178 | model.add_optimizer(global_step) 179 | stats = add_train_stats(model, hparams) 180 | return model, stats 181 | 182 | def model_test_mode(args, feeder, hparams, global_step): 183 | with tf.variable_scope('WaveNet_model', reuse=tf.AUTO_REUSE) as scope: 184 | model_name = None 185 | if args.model == 'Tacotron-2': 186 | model_name = 'WaveNet' 187 | model = create_model(model_name or args.model, hparams) 188 | #initialize model to test mode 189 | model.initialize(feeder.eval_targets, feeder.eval_local_condition_features, feeder.eval_global_condition_features, 190 | feeder.eval_input_lengths) 191 | model.add_loss() 192 | return model 193 | 194 | def train(log_dir, args, hparams, input_path): 195 | save_dir = os.path.join(log_dir, 'wave_pretrained') 196 | plot_dir = os.path.join(log_dir, 'plots') 197 | wav_dir = os.path.join(log_dir, 'wavs') 198 | eval_dir = os.path.join(log_dir, 'eval-dir') 199 | eval_plot_dir = os.path.join(eval_dir, 'plots') 200 | eval_wav_dir = os.path.join(eval_dir, 'wavs') 201 | tensorboard_dir = os.path.join(log_dir, 'wavenet_events') 202 | meta_folder = os.path.join(log_dir, 'metas') 203 | os.makedirs(save_dir, exist_ok=True) 204 | os.makedirs(plot_dir, exist_ok=True) 205 | os.makedirs(wav_dir, exist_ok=True) 206 | os.makedirs(eval_dir, exist_ok=True) 207 | os.makedirs(eval_plot_dir, exist_ok=True) 208 | os.makedirs(eval_wav_dir, exist_ok=True) 209 | os.makedirs(tensorboard_dir, exist_ok=True) 210 | os.makedirs(meta_folder, exist_ok=True) 211 | 212 | checkpoint_path = os.path.join(save_dir, 'wavenet_model.ckpt') 213 | input_path = os.path.join(args.base_dir, input_path) 214 | 215 | log('Checkpoint_path: {}'.format(checkpoint_path)) 216 | log('Loading training data from: {}'.format(input_path)) 217 | log('Using model: {}'.format(args.model)) 218 | log(hparams_debug_string()) 219 | 220 | #Start by setting a seed for repeatability 221 | tf.set_random_seed(hparams.wavenet_random_seed) 222 | 223 | #Set up data feeder 224 | coord = tf.train.Coordinator() 225 | with tf.variable_scope('datafeeder') as scope: 226 | feeder = Feeder(coord, input_path, args.base_dir, hparams) 227 | 228 | #Set up model 229 | global_step = tf.Variable(0, name='global_step', trainable=False) 230 | model, stats = model_train_mode(args, feeder, hparams, global_step) 231 | 
eval_model = model_test_mode(args, feeder, hparams, global_step) 232 | 233 | #Speaker Embeddings metadata 234 | if hparams.speakers_path is not None: 235 | speaker_embedding_meta = hparams.speakers_path 236 | 237 | else: 238 | speaker_embedding_meta = os.path.join(meta_folder, 'SpeakerEmbeddings.tsv') 239 | if not os.path.isfile(speaker_embedding_meta): 240 | with open(speaker_embedding_meta, 'w', encoding='utf-8') as f: 241 | for speaker in hparams.speakers: 242 | f.write('{}\n'.format(speaker)) 243 | 244 | speaker_embedding_meta = speaker_embedding_meta.replace(log_dir, '..') 245 | 246 | #book keeping 247 | step = 0 248 | time_window = ValueWindow(100) 249 | loss_window = ValueWindow(100) 250 | sh_saver = create_shadow_saver(model, global_step) 251 | 252 | log('Wavenet training set to a maximum of {} steps'.format(args.wavenet_train_steps)) 253 | 254 | #Memory allocation on the memory 255 | config = tf.ConfigProto() 256 | config.gpu_options.allow_growth = True 257 | config.allow_soft_placement = True 258 | run_init = False 259 | 260 | #Train 261 | with tf.Session(config=config) as sess: 262 | try: 263 | summary_writer = tf.summary.FileWriter(tensorboard_dir, sess.graph) 264 | sess.run(tf.global_variables_initializer()) 265 | 266 | #saved model restoring 267 | if args.restore: 268 | # Restore saved model if the user requested it, default = True 269 | try: 270 | checkpoint_state = tf.train.get_checkpoint_state(save_dir) 271 | 272 | if (checkpoint_state and checkpoint_state.model_checkpoint_path): 273 | log('Loading checkpoint {}'.format(checkpoint_state.model_checkpoint_path), slack=True) 274 | load_averaged_model(sess, sh_saver, checkpoint_state.model_checkpoint_path) 275 | else: 276 | log('No model to load at {}'.format(save_dir), slack=True) 277 | if hparams.wavenet_weight_normalization: 278 | run_init = True 279 | 280 | except tf.errors.OutOfRangeError as e: 281 | log('Cannot restore checkpoint: {}'.format(e), slack=True) 282 | else: 283 | log('Starting new training!', slack=True) 284 | if hparams.wavenet_weight_normalization: 285 | run_init = True 286 | 287 | if run_init: 288 | log('\nApplying Weight normalization in fresh training. Applying data dependent initialization forward pass..') 289 | #Create init_model 290 | init_model, _ = model_train_mode(args, feeder, hparams, global_step, init=True) 291 | 292 | #initializing feeder 293 | feeder.start_threads(sess) 294 | 295 | if run_init: 296 | #Run one forward pass for model parameters initialization (make prediction on init_batch) 297 | _ = sess.run(init_model.tower_y_hat) 298 | log('Data dependent initialization done. 
Starting training!') 299 | 300 | #Training loop 301 | while not coord.should_stop() and step < args.wavenet_train_steps: 302 | start_time = time.time() 303 | step, loss, opt = sess.run([global_step, model.loss, model.optimize]) 304 | time_window.append(time.time() - start_time) 305 | loss_window.append(loss) 306 | 307 | message = 'Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]'.format( 308 | step, time_window.average, loss, loss_window.average) 309 | log(message, end='\r', slack=(step % args.checkpoint_interval == 0)) 310 | 311 | if np.isnan(loss) or loss > 100: 312 | log('Loss exploded to {:.5f} at step {}'.format(loss, step)) 313 | raise Exception('Loss exploded') 314 | 315 | if step % args.summary_interval == 0: 316 | log('\nWriting summary at step {}'.format(step)) 317 | summary_writer.add_summary(sess.run(stats), step) 318 | 319 | if step % args.checkpoint_interval == 0 or step == args.wavenet_train_steps: 320 | save_log(sess, step, model, plot_dir, wav_dir, hparams=hparams, model_name=args.model) 321 | save_checkpoint(sess, sh_saver, checkpoint_path, global_step) 322 | 323 | if step % args.eval_interval == 0: 324 | log('\nEvaluating at step {}'.format(step)) 325 | eval_step(sess, step, eval_model, eval_plot_dir, eval_wav_dir, summary_writer=summary_writer , hparams=model._hparams, model_name=args.model) 326 | 327 | if hparams.gin_channels > 0 and (step % args.embedding_interval == 0 or step == args.wavenet_train_steps or step == 1): 328 | #Get current checkpoint state 329 | checkpoint_state = tf.train.get_checkpoint_state(save_dir) 330 | 331 | #Update Projector 332 | log('\nSaving Model Speaker Embeddings visualization..') 333 | add_embedding_stats(summary_writer, [model.embedding_table.name], [speaker_embedding_meta], checkpoint_state.model_checkpoint_path) 334 | log('WaveNet Speaker embeddings have been updated on tensorboard!') 335 | 336 | log('Wavenet training complete after {} global steps'.format(args.wavenet_train_steps), slack=True) 337 | return save_dir 338 | 339 | except Exception as e: 340 | log('Exiting due to exception: {}'.format(e), slack=True) 341 | traceback.print_exc() 342 | coord.request_stop(e) 343 | 344 | 345 | def wavenet_train(args, log_dir, hparams, input_path): 346 | return train(log_dir, args, hparams, input_path) 347 | -------------------------------------------------------------------------------- /wavenet_vocoder/util.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | matplotlib.use('Agg') 3 | import matplotlib.pyplot as plt 4 | 5 | import librosa.display as dsp 6 | import numpy as np 7 | import tensorflow as tf 8 | 9 | 10 | def _assert_valid_input_type(s): 11 | assert s == 'mulaw-quantize' or s == 'mulaw' or s == 'raw' 12 | 13 | def is_mulaw_quantize(s): 14 | _assert_valid_input_type(s) 15 | return s == 'mulaw-quantize' 16 | 17 | def is_mulaw(s): 18 | _assert_valid_input_type(s) 19 | return s == 'mulaw' 20 | 21 | def is_raw(s): 22 | _assert_valid_input_type(s) 23 | return s == 'raw' 24 | 25 | def is_scalar_input(s): 26 | return is_raw(s) or is_mulaw(s) 27 | 28 | 29 | #From https://github.com/r9y9/nnmnkwii/blob/master/nnmnkwii/preprocessing/generic.py 30 | def mulaw(x, mu=256): 31 | """Mu-Law companding 32 | Method described in paper [1]_. 33 | .. math:: 34 | f(x) = sign(x) ln (1 + mu |x|) / ln (1 + mu) 35 | Args: 36 | x (array-like): Input signal. Each value of input signal must be in 37 | range of [-1, 1]. 38 | mu (number): Compression parameter ``μ``. 
39 | Returns: 40 | array-like: Compressed signal ([-1, 1]) 41 | See also: 42 | :func:`nnmnkwii.preprocessing.inv_mulaw` 43 | :func:`nnmnkwii.preprocessing.mulaw_quantize` 44 | :func:`nnmnkwii.preprocessing.inv_mulaw_quantize` 45 | .. [1] Brokish, Charles W., and Michele Lewis. "A-law and mu-law companding 46 | implementations using the tms320c54x." SPRA163 (1997). 47 | """ 48 | mu = 255 49 | return _sign(x) * _log1p(mu * _abs(x)) / _log1p(mu) 50 | 51 | 52 | def inv_mulaw(y, mu=256): 53 | """Inverse of mu-law companding (mu-law expansion) 54 | .. math:: 55 | f^{-1}(x) = sign(y) (1 / mu) (1 + mu)^{|y|} - 1) 56 | Args: 57 | y (array-like): Compressed signal. Each value of input signal must be in 58 | range of [-1, 1]. 59 | mu (number): Compression parameter ``μ``. 60 | Returns: 61 | array-like: Uncomprresed signal (-1 <= x <= 1) 62 | See also: 63 | :func:`nnmnkwii.preprocessing.inv_mulaw` 64 | :func:`nnmnkwii.preprocessing.mulaw_quantize` 65 | :func:`nnmnkwii.preprocessing.inv_mulaw_quantize` 66 | """ 67 | mu = 255 68 | return _sign(y) * (1.0 / mu) * ((1.0 + mu)**_abs(y) - 1.0) 69 | 70 | 71 | def mulaw_quantize(x, mu=256): 72 | """Mu-Law companding + quantize 73 | Args: 74 | x (array-like): Input signal. Each value of input signal must be in 75 | range of [-1, 1]. 76 | mu (number): Compression parameter ``μ``. 77 | Returns: 78 | array-like: Quantized signal (dtype=int) 79 | - y ∈ [0, mu] if x ∈ [-1, 1] 80 | - y ∈ [0, mu) if x ∈ [-1, 1) 81 | .. note:: 82 | If you want to get quantized values of range [0, mu) (not [0, mu]), 83 | then you need to provide input signal of range [-1, 1). 84 | Examples: 85 | >>> from scipy.io import wavfile 86 | >>> import pysptk 87 | >>> import numpy as np 88 | >>> from nnmnkwii import preprocessing as P 89 | >>> fs, x = wavfile.read(pysptk.util.example_audio_file()) 90 | >>> x = (x / 32768.0).astype(np.float32) 91 | >>> y = P.mulaw_quantize(x) 92 | >>> print(y.min(), y.max(), y.dtype) 93 | 15 246 int64 94 | See also: 95 | :func:`nnmnkwii.preprocessing.mulaw` 96 | :func:`nnmnkwii.preprocessing.inv_mulaw` 97 | :func:`nnmnkwii.preprocessing.inv_mulaw_quantize` 98 | """ 99 | mu = 255 100 | y = mulaw(x, mu) 101 | # scale [-1, 1] to [0, mu] 102 | return _asint((y + 1) / 2 * mu) 103 | 104 | 105 | def inv_mulaw_quantize(y, mu=256): 106 | """Inverse of mu-law companding + quantize 107 | Args: 108 | y (array-like): Quantized signal (∈ [0, mu]). 109 | mu (number): Compression parameter ``μ``. 
110 | 	Returns:
111 | 		array-like: Uncompressed signal ([-1, 1])
112 | 	Examples:
113 | 		>>> from scipy.io import wavfile
114 | 		>>> import pysptk
115 | 		>>> import numpy as np
116 | 		>>> from nnmnkwii import preprocessing as P
117 | 		>>> fs, x = wavfile.read(pysptk.util.example_audio_file())
118 | 		>>> x = (x / 32768.0).astype(np.float32)
119 | 		>>> x_hat = P.inv_mulaw_quantize(P.mulaw_quantize(x))
120 | 		>>> x_hat = (x_hat * 32768).astype(np.int16)
121 | 	See also:
122 | 		:func:`nnmnkwii.preprocessing.mulaw`
123 | 		:func:`nnmnkwii.preprocessing.inv_mulaw`
124 | 		:func:`nnmnkwii.preprocessing.mulaw_quantize`
125 | 	"""
126 | 	# [0, mu] to [-1, 1]
127 | 	mu = 255  #mu is fixed to 255; see mulaw()
128 | 	y = 2 * _asfloat(y) / mu - 1
129 | 	return inv_mulaw(y, mu)
130 | 
131 | def _sign(x):
132 | 	#wrapper to support tensorflow tensors/numpy arrays
133 | 	isnumpy = isinstance(x, np.ndarray)
134 | 	isscalar = np.isscalar(x)
135 | 	return np.sign(x) if (isnumpy or isscalar) else tf.sign(x)
136 | 
137 | 
138 | def _log1p(x):
139 | 	#wrapper to support tensorflow tensors/numpy arrays
140 | 	isnumpy = isinstance(x, np.ndarray)
141 | 	isscalar = np.isscalar(x)
142 | 	return np.log1p(x) if (isnumpy or isscalar) else tf.log1p(x)
143 | 
144 | 
145 | def _abs(x):
146 | 	#wrapper to support tensorflow tensors/numpy arrays
147 | 	isnumpy = isinstance(x, np.ndarray)
148 | 	isscalar = np.isscalar(x)
149 | 	return np.abs(x) if (isnumpy or isscalar) else tf.abs(x)
150 | 
151 | 
152 | def _asint(x):
153 | 	#wrapper to support tensorflow tensors/numpy arrays
154 | 	isnumpy = isinstance(x, np.ndarray)
155 | 	isscalar = np.isscalar(x)
156 | 	return x.astype(np.int64) if isnumpy else int(x) if isscalar else tf.cast(x, tf.int32)
157 | 
158 | 
159 | def _asfloat(x):
160 | 	#wrapper to support tensorflow tensors/numpy arrays
161 | 	isnumpy = isinstance(x, np.ndarray)
162 | 	isscalar = np.isscalar(x)
163 | 	return x.astype(np.float32) if isnumpy else float(x) if isscalar else tf.cast(x, tf.float32)
164 | 
165 | def sequence_mask(input_lengths, max_len=None, expand=True):
166 | 	if max_len is None:
167 | 		max_len = tf.reduce_max(input_lengths)
168 | 
169 | 	if expand:
170 | 		return tf.expand_dims(tf.sequence_mask(input_lengths, max_len, dtype=tf.float32), axis=-1)
171 | 	return tf.sequence_mask(input_lengths, max_len, dtype=tf.float32)
172 | 
173 | 
174 | def waveplot(path, y_hat, y_target, hparams, title=None):
175 | 	sr = hparams.sample_rate
176 | 
177 | 	fig = plt.figure(figsize=(12, 4))
178 | 	if y_target is not None:
179 | 		ax = plt.subplot(3, 1, 1)
180 | 		dsp.waveplot(y_target, sr=sr)
181 | 		ax.set_title('Target waveform')
182 | 		ax = plt.subplot(3, 1, 2)
183 | 		dsp.waveplot(y_hat, sr=sr)
184 | 		ax.set_title('Predicted waveform')
185 | 	else:
186 | 		ax = plt.subplot(2, 1, 1)
187 | 		dsp.waveplot(y_hat, sr=sr)
188 | 		ax.set_title('Generated waveform')
189 | 
190 | 	if title is not None:
191 | 		# Set common labels
192 | 		fig.text(0.5, 0.18, title, horizontalalignment='center', fontsize=16)
193 | 
194 | 	plt.tight_layout()
195 | 	plt.savefig(path, format="png")
196 | 	plt.close()
197 | 
198 | def plot_spectrogram(pred_spectrogram, path, title=None, split_title=False, target_spectrogram=None, max_len=None, auto_aspect=False):
199 | 	if max_len is not None:
200 | 		target_spectrogram = target_spectrogram[:max_len] if target_spectrogram is not None else None
201 | 		pred_spectrogram = pred_spectrogram[:max_len]
202 | 
203 | 	if split_title:
204 | 		title = split_title_line(title)  #note: split_title_line is not defined in this module and must be provided by the caller
205 | 
206 | 	fig = plt.figure(figsize=(10, 8))
207 | 	# Set common labels
208 | 	fig.text(0.5, 0.18, title, horizontalalignment='center', fontsize=16)
209 | 
210 | 	#target spectrogram subplot
211 | 	if target_spectrogram is not None:
212 | 		ax1 = fig.add_subplot(311)
213 | 		ax2 = fig.add_subplot(312)
214 | 
215 | 		if auto_aspect:
216 | 			im = ax1.imshow(np.rot90(target_spectrogram), aspect='auto', interpolation='none')
217 | 		else:
218 | 			im = ax1.imshow(np.rot90(target_spectrogram), interpolation='none')
219 | 		ax1.set_title('Target Mel-Spectrogram')
220 | 		fig.colorbar(mappable=im, shrink=0.65, orientation='horizontal', ax=ax1)
221 | 		ax2.set_title('Predicted Mel-Spectrogram')
222 | 	else:
223 | 		ax2 = fig.add_subplot(211)
224 | 
225 | 	if auto_aspect:
226 | 		im = ax2.imshow(np.rot90(pred_spectrogram), aspect='auto', interpolation='none')
227 | 	else:
228 | 		im = ax2.imshow(np.rot90(pred_spectrogram), interpolation='none')
229 | 	fig.colorbar(mappable=im, shrink=0.65, orientation='horizontal', ax=ax2)
230 | 
231 | 	plt.tight_layout()
232 | 	plt.savefig(path, format='png')
233 | 	plt.close()
234 | 
--------------------------------------------------------------------------------
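The mu-law helpers in wavenet_vocoder/util.py carry the WaveNet input pipeline when the input type is set to 'mulaw-quantize': a waveform in [-1, 1] is companded, mapped to integer class labels in [0, 255], and expanded back to a float waveform after synthesis. Below is a minimal round-trip sketch, not part of the repository, assuming its requirements (numpy, tensorflow, librosa, matplotlib) are installed and the repository root is on PYTHONPATH; the toy sine signal is purely illustrative.

    import numpy as np
    from wavenet_vocoder.util import mulaw_quantize, inv_mulaw_quantize

    # Toy 1-second sine at 440 Hz, normalized to [-1, 1] like a preprocessed waveform.
    x = np.sin(2 * np.pi * 440.0 * np.arange(16000) / 16000.0).astype(np.float32)

    # Compand + quantize: integer classes in [0, 255] (mu is fixed to 255 internally).
    y = mulaw_quantize(x)
    assert 0 <= y.min() and y.max() <= 255

    # Expand back to a float waveform in [-1, 1] for listening or metrics.
    x_hat = inv_mulaw_quantize(y)
    print('max round-trip error:', np.abs(x - x_hat).max())

The same functions also accept TensorFlow tensors, since the private _sign/_log1p/_abs/_asint/_asfloat wrappers dispatch to numpy or tf ops depending on the input type.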