├── .gitignore ├── LICENSE ├── README.md ├── datasets ├── __init__.py ├── audio.py ├── preprocessor.py └── util.py ├── hparams.py ├── preprocess.py ├── requirements.txt ├── synthesize.py ├── tacotron ├── __init__.py ├── feeder.py ├── models │ ├── Architecture_wrappers.py │ ├── __init__.py │ ├── attention.py │ ├── custom_decoder.py │ ├── helpers.py │ ├── modules.py │ ├── tacotron.py │ └── zoneout_LSTM.py ├── synthesize.py ├── synthesizer.py ├── train.py └── utils │ ├── __init__.py │ ├── audio.py │ ├── cleaners.py │ ├── cmudict.py │ ├── infolog.py │ ├── numbers.py │ ├── plot.py │ ├── symbols.py │ ├── text.py │ └── util.py └── train.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | papers/ 6 | # C extensions 7 | *.so 8 | .idea/ 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # Tacotron 2 oddities 107 | logs-*/ 108 | training_data/ 109 | 110 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Rayhane Mama 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # VAE Tacotron-2: 2 | Unofficial Implementation of [Learning latent representations for style control and transfer in end-to-end speech synthesis](https://arxiv.org/pdf/1812.04342.pdf) 3 | 4 | 5 | # Repository Structure: 6 | Tacotron-2 7 | ├── datasets 8 | ├── LJSpeech-1.1 (0) 9 | │   └── wavs 10 | ├── logs-Tacotron (2) 11 | │   ├── mel-spectrograms 12 | │   ├── plots 13 | │   ├── pretrained 14 | │   └── wavs 15 | ├── papers 16 | ├── tacotron 17 | │   ├── models 18 | │   └── utils 19 | ├── tacotron_output (3) 20 | │   ├── eval 21 | │   ├── gta 22 | │   ├── logs-eval 23 | │   │   ├── plots 24 | │   │   └── wavs 25 | │   └── natural 26 | └── training_data (1) 27 |    ├── audio 28 |    └── mels 29 | 30 | 31 | 32 | 33 | 34 | The previous tree shows the current state of the repository. 35 | 36 | - Step **(0)**: Get your dataset; the example here uses **LJSpeech**. 37 | - Step **(1)**: Preprocess your data. This will give you the **training_data** folder. 38 | - Step **(2)**: Train your Tacotron model. Yields the **logs-Tacotron** folder. 39 | - Step **(3)**: Synthesize/Evaluate the Tacotron model. Gives the **tacotron_output** folder. 40 | 41 | 42 | # Requirements 43 | First, you need to have Python 3.5 installed along with [Tensorflow v1.6](https://www.tensorflow.org/install/). 44 | 45 | Next, you can install the requirements: 46 | 47 | > pip install -r requirements.txt 48 | 49 | or: 50 | 51 | > pip3 install -r requirements.txt 52 | 53 | # Dataset: 54 | This repo has been tested on the [LJSpeech dataset](https://keithito.com/LJ-Speech-Dataset/), which has almost 24 hours of labeled recordings of a single female speaker. 55 | 56 | # Preprocessing 57 | Before running the following steps, please make sure you are inside the **Tacotron-2** folder: 58 | 59 | > cd Tacotron-2 60 | 61 | Preprocessing can then be started using: 62 | 63 | > python preprocess.py 64 | 65 | or 66 | 67 | > python3 preprocess.py 68 | 69 | The dataset can be chosen using the **--dataset** argument. Default is **LJSpeech**. 70 | 71 | # Training: 72 | The feature prediction model can be **trained** using: 73 | 74 | > python train.py --model='Tacotron' 75 | 76 | or 77 | 78 | > python3 train.py --model='Tacotron' 79 | 80 | # Synthesis 81 | There are **three types** of mel spectrogram synthesis for the spectrogram prediction network (Tacotron): 82 | 83 | - **Evaluation** (synthesis on custom sentences). This is what we'll usually use after having a full end-to-end model. 84 | 85 | > python synthesize.py --model='Tacotron' --mode='eval' --reference_audio='ref_1.wav' 86 | 87 | or 88 | 89 | > python3 synthesize.py --model='Tacotron' --mode='eval' --reference_audio='ref_1.wav' 90 | 91 | **Note:** 92 | - This implementation is not completely tested for all scenarios, but training and synthesis with a reference audio are working.
93 | - So far it has only been tested on synthesis without GTA and in `eval` mode. 94 | - After training for 250k steps with a batch size of 32 on LJSpeech, the KL error settled down near zero (around 0.001), but style transfer and control are still not good. This may be because the model was trained on LJSpeech, which is not a very expressive dataset and has only 24 hours of data; it might produce better results on an expressive dataset such as the `Blizzard 2013 voice dataset` (the author of the paper used 105 hours of the Blizzard Challenge 2013 dataset). 95 | - In my testing I haven't gotten good results on the style transfer side so far; some more tweaking may be required. This implementation can easily be integrated with `wavenet` as well as `WaveRNN`. 96 | - Feel free to suggest changes or, even better, raise a PR. 97 | 98 | # Pretrained model and Samples: 99 | TODO 100 | Samples claimed by the research paper: http://home.ustc.edu.cn/~zyj008/ICASSP2019 101 | 102 | # References and Resources: 103 | - [Tensorflow original tacotron implementation](https://github.com/keithito/tacotron) 104 | - [Original tacotron paper](https://arxiv.org/pdf/1703.10135.pdf) 105 | - [Attention-Based Models for Speech Recognition](https://arxiv.org/pdf/1506.07503.pdf) 106 | - [Natural TTS synthesis by conditioning Wavenet on mel spectrogram predictions](https://arxiv.org/pdf/1712.05884.pdf) 107 | - [r9y9/Tacotron-2](https://github.com/r9y9/Tacotron-2) 108 | - [yanggeng1995/vae_tacotron](https://github.com/yanggeng1995/vae_tacotron) 109 | 110 | **Work in progress** 111 | -------------------------------------------------------------------------------- /datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------- /datasets/audio.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import librosa.filters 3 | import numpy as np 4 | from scipy import signal 5 | from hparams import hparams 6 | import tensorflow as tf 7 | from scipy.io import wavfile 8 | 9 | 10 | def load_wav(path): 11 | return librosa.core.load(path, sr=hparams.sample_rate)[0] 12 | 13 | def save_wav(wav, path): 14 | wav *= 32767 / max(0.01, np.max(np.abs(wav))) 15 | #proposed by @dsmiller 16 | wavfile.write(path, hparams.sample_rate, wav.astype(np.int16)) 17 | 18 | def start_and_end_indices(quantized, silence_threshold=2): 19 | for start in range(quantized.size): 20 | if abs(quantized[start] - 127) > silence_threshold: 21 | break 22 | for end in range(quantized.size - 1, 1, -1): 23 | if abs(quantized[end] - 127) > silence_threshold: 24 | break 25 | 26 | assert abs(quantized[start] - 127) > silence_threshold 27 | assert abs(quantized[end] - 127) > silence_threshold 28 | 29 | return start, end 30 | 31 | def trim_silence(wav): 32 | '''Trim leading and trailing silence 33 | 34 | Useful for the M-AILABS dataset if we choose to trim the extra 0.5 seconds of silence.
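	Illustrative usage sketch (added example, not part of the original docstring; the wav path is hypothetical):
	    >>> wav = load_wav('LJSpeech-1.1/wavs/LJ001-0001.wav')  # resampled to hparams.sample_rate
	    >>> trimmed = trim_silence(wav)                          # librosa.effects.trim under the hood
	    >>> len(trimmed) <= len(wav)
	    True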
35 | ''' 36 | return librosa.effects.trim(wav)[0] 37 | 38 | def preemphasis(x): 39 | return signal.lfilter([1, -hparams.preemphasis], [1], x) 40 | 41 | def inv_preemphasis(x): 42 | return signal.lfilter([1], [1, -hparams.preemphasis], x) 43 | 44 | def get_hop_size(): 45 | hop_size = hparams.hop_size 46 | if hop_size is None: 47 | assert hparams.frame_shift_ms is not None 48 | hop_size = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate) 49 | return hop_size 50 | 51 | def linearspectrogram(wav): 52 | D = _stft(wav) 53 | S = _amp_to_db(np.abs(D)) - hparams.ref_level_db 54 | 55 | if hparams.signal_normalization: 56 | return _normalize(S) 57 | return S 58 | 59 | def melspectrogram(wav): 60 | D = _stft(wav) 61 | S = _amp_to_db(_linear_to_mel(np.abs(D))) - hparams.ref_level_db 62 | 63 | if hparams.signal_normalization: 64 | return _normalize(S) 65 | return S 66 | 67 | def inv_linear_spectrogram(linear_spectrogram): 68 | '''Converts linear spectrogram to waveform using librosa''' 69 | if hparams.signal_normalization: 70 | D = _denormalize(linear_spectrogram) 71 | else: 72 | D = linear_spectrogram 73 | 74 | S = _db_to_amp(D + hparams.ref_level_db) #Convert back to linear 75 | 76 | return _griffin_lim(S ** hparams.power) 77 | 78 | 79 | def inv_mel_spectrogram(mel_spectrogram): 80 | '''Converts mel spectrogram to waveform using librosa''' 81 | if hparams.signal_normalization: 82 | D = _denormalize(mel_spectrogram) 83 | else: 84 | D = mel_spectrogram 85 | 86 | S = _mel_to_linear(_db_to_amp(D + hparams.ref_level_db)) # Convert back to linear 87 | 88 | return _griffin_lim(S ** hparams.power) 89 | 90 | def _griffin_lim(S): 91 | '''librosa implementation of Griffin-Lim 92 | Based on https://github.com/librosa/librosa/issues/434 93 | ''' 94 | angles = np.exp(2j * np.pi * np.random.rand(*S.shape)) 95 | S_complex = np.abs(S).astype(np.complex) 96 | y = _istft(S_complex * angles) 97 | for i in range(hparams.griffin_lim_iters): 98 | angles = np.exp(1j * np.angle(_stft(y))) 99 | y = _istft(S_complex * angles) 100 | return y 101 | 102 | def _stft(y): 103 | return librosa.stft(y=y, n_fft=hparams.fft_size, hop_length=get_hop_size()) 104 | 105 | def _istft(y): 106 | return librosa.istft(y, hop_length=get_hop_size()) 107 | 108 | def num_frames(length, fsize, fshift): 109 | """Compute number of time frames of spectrogram 110 | """ 111 | pad = (fsize - fshift) 112 | if length % fshift == 0: 113 | M = (length + pad * 2 - fsize) // fshift + 1 114 | else: 115 | M = (length + pad * 2 - fsize) // fshift + 2 116 | return M 117 | 118 | 119 | def pad_lr(x, fsize, fshift): 120 | """Compute left and right padding 121 | """ 122 | M = num_frames(len(x), fsize, fshift) 123 | pad = (fsize - fshift) 124 | T = len(x) + 2 * pad 125 | r = (M - 1) * fshift + fsize - T 126 | return pad, pad + r 127 | 128 | 129 | # Conversions 130 | _mel_basis = None 131 | _inv_mel_basis = None 132 | 133 | def _linear_to_mel(spectogram): 134 | global _mel_basis 135 | if _mel_basis is None: 136 | _mel_basis = _build_mel_basis() 137 | return np.dot(_mel_basis, spectogram) 138 | 139 | def _mel_to_linear(mel_spectrogram): 140 | global _inv_mel_basis 141 | if _inv_mel_basis is None: 142 | _inv_mel_basis = np.linalg.pinv(_build_mel_basis()) 143 | return np.maximum(1e-10, np.dot(_inv_mel_basis, mel_spectrogram)) 144 | 145 | def _build_mel_basis(): 146 | assert hparams.fmax <= hparams.sample_rate // 2 147 | return librosa.filters.mel(hparams.sample_rate, hparams.fft_size, n_mels=hparams.num_mels, 148 | fmin=hparams.fmin, fmax=hparams.fmax) 149 | 150 | def 
_amp_to_db(x): 151 | min_level = np.exp(hparams.min_level_db / 20 * np.log(10)) 152 | return 20 * np.log10(np.maximum(min_level, x)) 153 | 154 | def _db_to_amp(x): 155 | return np.power(10.0, (x) * 0.05) 156 | 157 | def _normalize(S): 158 | if hparams.allow_clipping_in_normalization: 159 | if hparams.symmetric_mels: 160 | return np.clip((2 * hparams.max_abs_value) * ((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value, 161 | -hparams.max_abs_value, hparams.max_abs_value) 162 | else: 163 | return np.clip(hparams.max_abs_value * ((S - hparams.min_level_db) / (-hparams.min_level_db)), 0, hparams.max_abs_value) 164 | 165 | assert S.max() <= 0 and S.min() - hparams.min_level_db >= 0 166 | if hparams.symmetric_mels: 167 | return (2 * hparams.max_abs_value) * ((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value 168 | else: 169 | return hparams.max_abs_value * ((S - hparams.min_level_db) / (-hparams.min_level_db)) 170 | 171 | def _denormalize(D): 172 | if hparams.allow_clipping_in_normalization: 173 | if hparams.symmetric_mels: 174 | return (((np.clip(D, -hparams.max_abs_value, 175 | hparams.max_abs_value) + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value)) 176 | + hparams.min_level_db) 177 | else: 178 | return ((np.clip(D, 0, hparams.max_abs_value) * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db) 179 | 180 | if hparams.symmetric_mels: 181 | return (((D + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value)) + hparams.min_level_db) 182 | else: 183 | return ((D * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db) -------------------------------------------------------------------------------- /datasets/preprocessor.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ProcessPoolExecutor 2 | from functools import partial 3 | from datasets import audio 4 | import os 5 | import numpy as np 6 | from hparams import hparams 7 | from datasets.util import mulaw_quantize, mulaw, is_mulaw, is_mulaw_quantize 8 | 9 | 10 | def build_from_path(input_dirs, mel_dir, linear_dir, wav_dir, n_jobs=12, tqdm=lambda x: x): 11 | """ 12 | Preprocesses the speech dataset from a gven input path to given output directories 13 | 14 | Args: 15 | - input_dir: input directory that contains the files to prerocess 16 | - mel_dir: output directory of the preprocessed speech mel-spectrogram dataset 17 | - linear_dir: output directory of the preprocessed speech linear-spectrogram dataset 18 | - wav_dir: output directory of the preprocessed speech audio dataset 19 | - n_jobs: Optional, number of worker process to parallelize across 20 | - tqdm: Optional, provides a nice progress bar 21 | 22 | Returns: 23 | - A list of tuple describing the train examples. 
this should be written to train.txt 24 | """ 25 | 26 | # We use ProcessPoolExecutor to parallelize across processes, this is just for 27 | # optimization purposes and it can be omited 28 | executor = ProcessPoolExecutor(max_workers=n_jobs) 29 | futures = [] 30 | index = 1 31 | for input_dir in input_dirs: 32 | with open(os.path.join(input_dir, 'metadata.csv'), encoding='utf-8') as f: 33 | for line in f: 34 | parts = line.strip().split('|') 35 | wav_path = os.path.join(input_dir, 'wavs', '{}.wav'.format(parts[0])) 36 | text = parts[2] 37 | futures.append(executor.submit(partial(_process_utterance, mel_dir, linear_dir, wav_dir, index, wav_path, text))) 38 | index += 1 39 | 40 | return [future.result() for future in tqdm(futures) if future.result() is not None] 41 | 42 | 43 | def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text): 44 | """ 45 | Preprocesses a single utterance wav/text pair 46 | 47 | this writes the mel scale spectogram to disk and return a tuple to write 48 | to the train.txt file 49 | 50 | Args: 51 | - mel_dir: the directory to write the mel spectograms into 52 | - linear_dir: the directory to write the linear spectrograms into 53 | - wav_dir: the directory to write the preprocessed wav into 54 | - index: the numeric index to use in the spectogram filename 55 | - wav_path: path to the audio file containing the speech input 56 | - text: text spoken in the input audio file 57 | 58 | Returns: 59 | - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text) 60 | """ 61 | 62 | try: 63 | # Load the audio as numpy array 64 | wav = audio.load_wav(wav_path) 65 | except FileNotFoundError: #catch missing wav exception 66 | print('file {} present in csv metadata is not present in wav folder. skipping!'.format( 67 | wav_path)) 68 | return None 69 | 70 | #rescale wav 71 | if hparams.rescale: 72 | wav = wav / np.abs(wav).max() * hparams.rescaling_max 73 | 74 | #M-AILABS extra silence specific 75 | if hparams.trim_silence: 76 | wav = audio.trim_silence(wav) 77 | 78 | #Mu-law quantize 79 | if is_mulaw_quantize(hparams.input_type): 80 | #[0, quantize_channels) 81 | out = mulaw_quantize(wav, hparams.quantize_channels) 82 | 83 | #Trim silences 84 | start, end = audio.start_and_end_indices(out, hparams.silence_threshold) 85 | wav = wav[start: end] 86 | out = out[start: end] 87 | 88 | constant_values = mulaw_quantize(0, hparams.quantize_channels) 89 | out_dtype = np.int16 90 | 91 | elif is_mulaw(hparams.input_type): 92 | #[-1, 1] 93 | out = mulaw(wav, hparams.quantize_channels) 94 | constant_values = mulaw(0., hparams.quantize_channels) 95 | out_dtype = np.float32 96 | 97 | else: 98 | #[-1, 1] 99 | out = wav 100 | constant_values = 0. 
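		#(added note, not in the original code: the `constant_values` chosen in the branches above is the
		# padding value used by np.pad further down — 0. for raw/mulaw float audio, and the mu-law code of
		# silence, mulaw_quantize(0, hparams.quantize_channels), for the quantized case)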
101 | out_dtype = np.float32 102 | 103 | # Compute the mel scale spectrogram from the wav 104 | mel_spectrogram = audio.melspectrogram(wav).astype(np.float32) 105 | mel_frames = mel_spectrogram.shape[1] 106 | 107 | #Compute the linear scale spectrogram from the wav 108 | linear_spectrogram = audio.linearspectrogram(wav).astype(np.float32) 109 | linear_frames = linear_spectrogram.shape[1] 110 | 111 | #sanity check 112 | assert linear_frames == mel_frames 113 | 114 | #Ensure time resolution adjustement between audio and mel-spectrogram 115 | l, r = audio.pad_lr(wav, hparams.fft_size, audio.get_hop_size()) 116 | 117 | #Zero pad for quantized signal 118 | out = np.pad(out, (l, r), mode='constant', constant_values=constant_values) 119 | time_steps = len(out) 120 | assert time_steps >= mel_frames * audio.get_hop_size() 121 | 122 | #time resolution adjustement 123 | #ensure length of raw audio is multiple of hop size so that we can use 124 | #transposed convolution to upsample 125 | out = out[:mel_frames * audio.get_hop_size()] 126 | assert time_steps % audio.get_hop_size() == 0 127 | 128 | # Write the spectrogram and audio to disk 129 | audio_filename = 'speech-audio-{:05d}.npy'.format(index) 130 | mel_filename = 'speech-mel-{:05d}.npy'.format(index) 131 | linear_filename = 'speech-linear-{:05d}.npy'.format(index) 132 | np.save(os.path.join(wav_dir, audio_filename), out.astype(out_dtype), allow_pickle=False) 133 | np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False) 134 | np.save(os.path.join(linear_dir, linear_filename), linear_spectrogram.T, allow_pickle=False) 135 | 136 | # Return a tuple describing this training example 137 | return (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text) 138 | -------------------------------------------------------------------------------- /datasets/util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | 5 | def _assert_valid_input_type(s): 6 | assert s == 'mulaw-quantize' or s == 'mulaw' or s == 'raw' 7 | 8 | def is_mulaw_quantize(s): 9 | _assert_valid_input_type(s) 10 | return s == 'mulaw-quantize' 11 | 12 | def is_mulaw(s): 13 | _assert_valid_input_type(s) 14 | return s == 'mulaw' 15 | 16 | def is_raw(s): 17 | _assert_valid_input_type(s) 18 | return s == 'raw' 19 | 20 | def is_scalar_input(s): 21 | return is_raw(s) or is_mulaw(s) 22 | 23 | 24 | #From https://github.com/r9y9/nnmnkwii/blob/master/nnmnkwii/preprocessing/generic.py 25 | def mulaw(x, mu=256): 26 | """Mu-Law companding 27 | Method described in paper [1]_. 28 | .. math:: 29 | f(x) = sign(x) \ln (1 + \mu |x|) / \ln (1 + \mu) 30 | Args: 31 | x (array-like): Input signal. Each value of input signal must be in 32 | range of [-1, 1]. 33 | mu (number): Compression parameter ``μ``. 34 | Returns: 35 | array-like: Compressed signal ([-1, 1]) 36 | See also: 37 | :func:`nnmnkwii.preprocessing.inv_mulaw` 38 | :func:`nnmnkwii.preprocessing.mulaw_quantize` 39 | :func:`nnmnkwii.preprocessing.inv_mulaw_quantize` 40 | .. [1] Brokish, Charles W., and Michele Lewis. "A-law and mu-law companding 41 | implementations using the tms320c54x." SPRA163 (1997). 42 | """ 43 | return _sign(x) * _log1p(mu * _abs(x)) / _log1p(mu) 44 | 45 | 46 | def inv_mulaw(y, mu=256): 47 | """Inverse of mu-law companding (mu-law expansion) 48 | .. math:: 49 | f^{-1}(x) = sign(y) (1 / \mu) (1 + \mu)^{|y|} - 1) 50 | Args: 51 | y (array-like): Compressed signal. Each value of input signal must be in 52 | range of [-1, 1]. 
53 | mu (number): Compression parameter ``μ``. 54 | Returns: 55 | array-like: Uncomprresed signal (-1 <= x <= 1) 56 | See also: 57 | :func:`nnmnkwii.preprocessing.inv_mulaw` 58 | :func:`nnmnkwii.preprocessing.mulaw_quantize` 59 | :func:`nnmnkwii.preprocessing.inv_mulaw_quantize` 60 | """ 61 | return _sign(y) * (1.0 / mu) * ((1.0 + mu)**_abs(y) - 1.0) 62 | 63 | 64 | def mulaw_quantize(x, mu=256): 65 | """Mu-Law companding + quantize 66 | Args: 67 | x (array-like): Input signal. Each value of input signal must be in 68 | range of [-1, 1]. 69 | mu (number): Compression parameter ``μ``. 70 | Returns: 71 | array-like: Quantized signal (dtype=int) 72 | - y ∈ [0, mu] if x ∈ [-1, 1] 73 | - y ∈ [0, mu) if x ∈ [-1, 1) 74 | .. note:: 75 | If you want to get quantized values of range [0, mu) (not [0, mu]), 76 | then you need to provide input signal of range [-1, 1). 77 | Examples: 78 | >>> from scipy.io import wavfile 79 | >>> import pysptk 80 | >>> import numpy as np 81 | >>> from nnmnkwii import preprocessing as P 82 | >>> fs, x = wavfile.read(pysptk.util.example_audio_file()) 83 | >>> x = (x / 32768.0).astype(np.float32) 84 | >>> y = P.mulaw_quantize(x) 85 | >>> print(y.min(), y.max(), y.dtype) 86 | 15 246 int64 87 | See also: 88 | :func:`nnmnkwii.preprocessing.mulaw` 89 | :func:`nnmnkwii.preprocessing.inv_mulaw` 90 | :func:`nnmnkwii.preprocessing.inv_mulaw_quantize` 91 | """ 92 | y = mulaw(x, mu) 93 | # scale [-1, 1] to [0, mu] 94 | return _asint((y + 1) / 2 * mu) 95 | 96 | 97 | def inv_mulaw_quantize(y, mu=256): 98 | """Inverse of mu-law companding + quantize 99 | Args: 100 | y (array-like): Quantized signal (∈ [0, mu]). 101 | mu (number): Compression parameter ``μ``. 102 | Returns: 103 | array-like: Uncompressed signal ([-1, 1]) 104 | Examples: 105 | >>> from scipy.io import wavfile 106 | >>> import pysptk 107 | >>> import numpy as np 108 | >>> from nnmnkwii import preprocessing as P 109 | >>> fs, x = wavfile.read(pysptk.util.example_audio_file()) 110 | >>> x = (x / 32768.0).astype(np.float32) 111 | >>> x_hat = P.inv_mulaw_quantize(P.mulaw_quantize(x)) 112 | >>> x_hat = (x_hat * 32768).astype(np.int16) 113 | See also: 114 | :func:`nnmnkwii.preprocessing.mulaw` 115 | :func:`nnmnkwii.preprocessing.inv_mulaw` 116 | :func:`nnmnkwii.preprocessing.mulaw_quantize` 117 | """ 118 | # [0, m) to [-1, 1] 119 | y = 2 * _asfloat(y) / mu - 1 120 | return inv_mulaw(y, mu) 121 | 122 | def _sign(x): 123 | isnumpy = isinstance(x, np.ndarray) 124 | isscalar = np.isscalar(x) 125 | return np.sign(x) if isnumpy or isscalar else x.sign() 126 | 127 | 128 | def _log1p(x): 129 | isnumpy = isinstance(x, np.ndarray) 130 | isscalar = np.isscalar(x) 131 | return np.log1p(x) if isnumpy or isscalar else x.log1p() 132 | 133 | 134 | def _abs(x): 135 | isnumpy = isinstance(x, np.ndarray) 136 | isscalar = np.isscalar(x) 137 | return np.abs(x) if isnumpy or isscalar else x.abs() 138 | 139 | 140 | def _asint(x): 141 | # ugly wrapper to support torch/numpy arrays 142 | isnumpy = isinstance(x, np.ndarray) 143 | isscalar = np.isscalar(x) 144 | return x.astype(np.int) if isnumpy else int(x) if isscalar else x.long() 145 | 146 | 147 | def _asfloat(x): 148 | # ugly wrapper to support torch/numpy arrays 149 | isnumpy = isinstance(x, np.ndarray) 150 | isscalar = np.isscalar(x) 151 | return x.astype(np.float32) if isnumpy else float(x) if isscalar else x.float() 152 | 153 | 154 | #From https://github.com/r9y9/wavenet_vocoder/blob/master/lrschedule.py 155 | def noam_learning_rate_decay(init_lr, global_step, warmup_steps=4000): 156 | # Noam 
scheme from tensor2tensor: 157 | warmup_steps = float(warmup_steps) 158 | step = global_step + 1. 159 | lr = init_lr * warmup_steps**0.5 * np.minimum( 160 | step * warmup_steps**-1.5, step**-0.5) 161 | return lr 162 | 163 | 164 | def step_learning_rate_decay(init_lr, global_step, 165 | anneal_rate=0.98, 166 | anneal_interval=30000): 167 | return init_lr * anneal_rate ** (global_step // anneal_interval) 168 | 169 | 170 | def cyclic_cosine_annealing(init_lr, global_step, T, M): 171 | """Cyclic cosine annealing 172 | 173 | https://arxiv.org/pdf/1704.00109.pdf 174 | 175 | Args: 176 | init_lr (float): Initial learning rate 177 | global_step (int): Current iteration number 178 | T (int): Total iteration number (i,e. nepoch) 179 | M (int): Number of ensembles we want 180 | 181 | Returns: 182 | float: Annealed learning rate 183 | """ 184 | TdivM = T // M 185 | return init_lr / 2.0 * (np.cos(np.pi * ((global_step - 1) % TdivM) / TdivM) + 1.0) -------------------------------------------------------------------------------- /hparams.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | 5 | # Default hyperparameters 6 | hparams = tf.contrib.training.HParams( 7 | # Comma-separated list of cleaners to run on text prior to training and eval. For non-English 8 | # text, you may want to use "basic_cleaners" or "transliteration_cleaners". 9 | cleaners='english_cleaners', 10 | 11 | 12 | #Audio 13 | num_mels = 80, 14 | num_freq = 513, #only used when adding linear spectrograms post processing network 15 | rescale = True, 16 | rescaling_max = 0.999, 17 | trim_silence = True, 18 | 19 | #Mel spectrogram 20 | fft_size = 1024, 21 | hop_size = 256, 22 | sample_rate = 22050, #22050 Hz (corresponding to ljspeech dataset) 23 | frame_shift_ms = None, 24 | 25 | #Mel and Linear spectrograms normalization/scaling and clipping 26 | mel_normalization = False, 27 | signal_normalization = True, 28 | allow_clipping_in_normalization = True, #Only relevant if mel_normalization = True 29 | symmetric_mels = True, #Whether to scale the data to be symmetric around 0 30 | max_abs_value = 4., #max absolute value of data. 
If symmetric, data will be [-max, max] else [0, max] 31 | 32 | #Limits 33 | min_level_db =- 100, 34 | ref_level_db = 20, 35 | fmin = 125, 36 | fmax = 7600, 37 | 38 | #Griffin Lim 39 | power = 1.55, 40 | griffin_lim_iters = 60, 41 | 42 | # VAE: 43 | use_vae=True, 44 | vae_dim=32, 45 | vae_warming_up=15000, 46 | init_vae_weights=0.001, 47 | vae_weight_multiler=0.002, 48 | filters=[32, 32, 64, 64, 128, 128], 49 | 50 | #Tacotron 51 | outputs_per_step = 1, #number of frames to generate at each decoding step (speeds up computation and allows for higher batch size) 52 | stop_at_any = True, #Determines whether the decoder should stop when predicting to any frame or to all of them 53 | 54 | embedding_dim = 512, #dimension of embedding space 55 | 56 | enc_conv_num_layers = 3, #number of encoder convolutional layers 57 | enc_conv_kernel_size = (5, ), #size of encoder convolution filters for each layer 58 | enc_conv_channels = 512, #number of encoder convolutions filters for each layer 59 | encoder_lstm_units = 256, #number of lstm units for each direction (forward and backward) 60 | encoder_depth=512, 61 | smoothing = False, #Whether to smooth the attention normalization function 62 | attention_dim = 128, #dimension of attention space 63 | attention_filters = 32, #number of attention convolution filters 64 | attention_kernel = (31, ), #kernel size of attention convolution 65 | cumulative_weights = True, #Whether to cumulate (sum) all previous attention weights or simply feed previous weights (Recommended: True) 66 | 67 | prenet_layers = [256, 256], #number of layers and number of units of prenet 68 | decoder_layers = 2, #number of decoder lstm layers 69 | decoder_lstm_units = 1024, #number of decoder lstm units on each layer 70 | max_iters = 2500, #Max decoder steps during inference (Just for safety from infinite loop cases) 71 | 72 | postnet_num_layers = 5, #number of postnet convolutional layers 73 | postnet_kernel_size = (5, ), #size of postnet convolution filters for each layer 74 | postnet_channels = 512, #number of postnet convolution filters for each layer 75 | 76 | mask_encoder = False, #whether to mask encoder padding while computing attention 77 | impute_finished = False, #Whether to use loss mask for padded sequences 78 | mask_finished = False, #Whether to mask alignments beyond the (False for debug, True for style) 79 | 80 | predict_linear = False, #Whether to add a post-processing network to the Tacotron to predict linear spectrograms (True mode Not tested!!) 81 | 82 | 83 | #Wavenet 84 | # Input type: 85 | # 1. raw [-1, 1] 86 | # 2. mulaw [-1, 1] 87 | # 3. mulaw-quantize [0, mu] 88 | # If input_type is raw or mulaw, network assumes scalar input and 89 | # discretized mixture of logistic distributions output, otherwise one-hot 90 | # input and softmax output are assumed. 91 | # **NOTE**: if you change the one of the two parameters below, you need to 92 | # re-run preprocessing before training. 93 | # **NOTE**: scaler input (raw or mulaw) is experimental. Use it your own risk. 
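	# Added reminder (not in the original comments): after changing input_type or quantize_channels,
	# re-run preprocessing so the saved audio targets match, e.g. (command taken from this repo's README):
	#   python preprocess.py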
94 | input_type="mulaw-quantize", 95 | quantize_channels=256, # 65536 or 256 96 | 97 | silence_threshold=2, 98 | 99 | # Mixture of logistic distributions: 100 | log_scale_min=float(np.log(1e-14)), 101 | 102 | #TODO model params 103 | 104 | 105 | #Tacotron Training 106 | tacotron_batch_size = 32, #number of training samples on each training steps 107 | tacotron_reg_weight = 1e-6, #regularization weight (for l2 regularization) 108 | tacotron_scale_regularization = True, #Whether to rescale regularization weight to adapt for outputs range (used when reg_weight is high and biasing the model) 109 | 110 | tacotron_decay_learning_rate = True, #boolean, determines if the learning rate will follow an exponential decay 111 | tacotron_start_decay = 50000, #Step at which learning decay starts 112 | tacotron_decay_steps = 50000, #starting point for learning rate decay (and determines the decay slope) (UNDER TEST) 113 | tacotron_decay_rate = 0.4, #learning rate decay rate (UNDER TEST) 114 | tacotron_initial_learning_rate = 1e-3, #starting learning rate 115 | tacotron_final_learning_rate = 1e-5, #minimal learning rate 116 | 117 | tacotron_adam_beta1 = 0.9, #AdamOptimizer beta1 parameter 118 | tacotron_adam_beta2 = 0.999, #AdamOptimizer beta2 parameter 119 | tacotron_adam_epsilon = 1e-6, #AdamOptimizer beta3 parameter 120 | 121 | tacotron_zoneout_rate = 0.1, #zoneout rate for all LSTM cells in the network 122 | tacotron_dropout_rate = 0.5, #dropout rate for all convolutional layers + prenet 123 | 124 | tacotron_teacher_forcing_ratio = 1., #Value from [0., 1.], 0.=0%, 1.=100%, determines the % of times we force next decoder inputs 125 | 126 | 127 | #Wavenet Training TODO 128 | 129 | 130 | 131 | #Eval sentences 132 | sentences = [ 133 | # From July 8, 2017 New York Times: 134 | 'Scientists at the CERN laboratory say they have discovered a new particle.', 135 | 'There\'s a way to measure the acute emotional intelligence that has never gone out of style.', 136 | 'President Trump met with other leaders at the Group of 20 conference.', 137 | 'The Senate\'s bill to repeal and replace the Affordable Care Act is now imperiled.', 138 | # From Google's Tacotron example page: 139 | 'Generative adversarial network or variational auto-encoder.', 140 | 'Basilar membrane and otolaryngology are not auto-correlations.', 141 | 'He has read the whole thing.', 142 | 'He reads books.', 143 | "Don't desert me here in the desert!", 144 | 'He thought it was time to present the present.', 145 | 'Thisss isrealy awhsome.', 146 | 'Punctuation sensitivity, is working.', 147 | 'Punctuation sensitivity is working.', 148 | "The buses aren't the problem, they actually provide a solution.", 149 | "The buses aren't the PROBLEM, they actually provide a SOLUTION.", 150 | "The quick brown fox jumps over the lazy dog.", 151 | "Does the quick brown fox jump over the lazy dog?", 152 | "Peter Piper picked a peck of pickled peppers. How many pickled peppers did Peter Piper pick?", 153 | "She sells sea-shells on the sea-shore. 
The shells she sells are sea-shells I'm sure.", 154 | "The blue lagoon is a nineteen eighty American romance adventure film.", 155 | "Tajima Airport serves Toyooka.", 156 | 'Talib Kweli confirmed to AllHipHop that he will be releasing an album in the next year.', 157 | #From Training data: 158 | 'the rest being provided with barrack beds, and in dimensions varying from thirty feet by fifteen to fifteen feet by ten.', 159 | 'in giltspur street compter, where he was first lodged.', 160 | 'a man named burnett came with his wife and took up his residence at whitchurch, hampshire, at no great distance from laverstock,', 161 | 'it appears that oswald had only one caller in response to all of his fpcc activities,', 162 | 'he relied on the absence of the strychnia.', 163 | 'scoggins thought it was lighter.', 164 | '''would, it is probable, have eventually overcome the reluctance of some of the prisoners at least, 165 | and would have possessed so much moral dignity''', 166 | '''the only purpose of this whole sentence is to evaluate the scalability of the model for very long sentences. 167 | This is not even a long sentence anymore, it has become an entire paragraph. 168 | Should I stop now? Let\'s add this last sentence in which we talk about nothing special.''', 169 | 'Thank you so much for your support!!' 170 | ] 171 | 172 | ) 173 | 174 | def hparams_debug_string(): 175 | values = hparams.values() 176 | hp = [' %s: %s' % (name, values[name]) for name in sorted(values) if name != 'sentences'] 177 | return 'Hyperparameters:\n' + '\n'.join(hp) 178 | -------------------------------------------------------------------------------- /preprocess.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from multiprocessing import cpu_count 3 | import os 4 | from tqdm import tqdm 5 | from datasets import preprocessor 6 | from hparams import hparams 7 | 8 | 9 | def preprocess(args, input_folders, out_dir): 10 | mel_dir = os.path.join(out_dir, 'mels') 11 | wav_dir = os.path.join(out_dir, 'audio') 12 | linear_dir = os.path.join(out_dir, 'linear') 13 | os.makedirs(mel_dir, exist_ok=True) 14 | os.makedirs(wav_dir, exist_ok=True) 15 | os.makedirs(linear_dir, exist_ok=True) 16 | metadata = preprocessor.build_from_path(input_folders, mel_dir, linear_dir, wav_dir, args.n_jobs, tqdm=tqdm) 17 | write_metadata(metadata, out_dir) 18 | 19 | def write_metadata(metadata, out_dir): 20 | with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f: 21 | for m in metadata: 22 | f.write('|'.join([str(x) for x in m]) + '\n') 23 | mel_frames = sum([int(m[4]) for m in metadata]) 24 | timesteps = sum([int(m[3]) for m in metadata]) 25 | sr = hparams.sample_rate 26 | hours = timesteps / sr / 3600 27 | print('Write {} utterances, {} mel frames, {} audio timesteps, ({:.2f} hours)'.format( 28 | len(metadata), mel_frames, timesteps, hours)) 29 | print('Max input length (text chars): {}'.format(max(len(m[5]) for m in metadata))) 30 | print('Max mel frames length: {}'.format(max(int(m[4]) for m in metadata))) 31 | print('Max audio timesteps length: {}'.format(max(m[3] for m in metadata))) 32 | 33 | def norm_data(args): 34 | print('Selecting data folders..') 35 | supported_datasets = ['LJSpeech-1.1', 'M-AILABS'] 36 | if args.dataset not in supported_datasets: 37 | raise ValueError('dataset value entered {} does not belong to supported datasets: {}'.format( 38 | args.dataset, supported_datasets)) 39 | 40 | if args.dataset == 'LJSpeech-1.1': 41 | return 
[os.path.join(args.base_dir, args.dataset)] 42 | 43 | 44 | if args.dataset == 'M-AILABS': 45 | supported_languages = ['en_US', 'en_UK', 'fr_FR', 'it_IT', 'de_DE', 'es_ES', 'ru_RU', 46 | 'uk_UK', 'pl_PL', 'nl_NL', 'pt_PT', 'fi_FI', 'se_SE', 'tr_TR', 'ar_SA'] 47 | if args.language not in supported_languages: 48 | raise ValueError('Please enter a supported language to use from M-AILABS dataset! \n{}'.format( 49 | supported_languages)) 50 | 51 | supported_voices = ['female', 'male', 'mix'] 52 | if args.voice not in supported_voices: 53 | raise ValueError('Please enter a supported voice option to use from M-AILABS dataset! \n{}'.format( 54 | supported_voices)) 55 | 56 | path = os.path.join(args.base_dir, args.language, 'by_book', args.voice) 57 | supported_readers = [e for e in os.listdir(path) if 'DS_Store' not in e] 58 | if args.reader not in supported_readers: 59 | raise ValueError('Please enter a valid reader for your language and voice settings! \n{}'.format( 60 | supported_readers)) 61 | 62 | path = os.path.join(path, args.reader) 63 | supported_books = [e for e in os.listdir(path) if e != '.DS_Store'] 64 | 65 | if args.merge_books: 66 | return [os.path.join(path, book) for book in supported_books] 67 | 68 | else: 69 | if args.book not in supported_books: 70 | raise ValueError('Please enter a valid book for your reader settings! \n{}'.format( 71 | supported_books)) 72 | 73 | return [os.path.join(path, args.book)] 74 | 75 | 76 | def run_preprocess(args): 77 | input_folders = norm_data(args) 78 | output_folder = os.path.join(args.base_dir, args.output) 79 | 80 | preprocess(args, input_folders, output_folder) 81 | 82 | 83 | def main(): 84 | print('initializing preprocessing..') 85 | parser = argparse.ArgumentParser() 86 | parser.add_argument('--base_dir', default='') 87 | parser.add_argument('--dataset', default='LJSpeech-1.1') 88 | parser.add_argument('--language', default='en_US') 89 | parser.add_argument('--voice', default='female') 90 | parser.add_argument('--reader', default='mary_ann') 91 | parser.add_argument('--merge_books', type=bool, default=False) 92 | parser.add_argument('--book', default='northandsouth') 93 | parser.add_argument('--output', default='training_data') 94 | parser.add_argument('--n_jobs', type=int, default=cpu_count()) 95 | args = parser.parse_args() 96 | 97 | run_preprocess(args) 98 | 99 | 100 | if __name__ == '__main__': 101 | main() -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | falcon==1.2.0 2 | inflect==0.2.5 3 | librosa==0.5.1 4 | matplotlib==2.0.2 5 | numpy==1.13.0 6 | scipy==1.0.0 7 | tqdm==4.11.2 8 | Unidecode==0.4.20 -------------------------------------------------------------------------------- /synthesize.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from tacotron.synthesize import tacotron_synthesize 3 | 4 | 5 | def main(): 6 | accepted_modes = ['eval', 'synthesis'] 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('--checkpoint', default='logs-Tacotron/pretrained/', help='Path to model checkpoint') 9 | parser.add_argument('--hparams', default='', 10 | help='Hyperparameter overrides as a comma-separated list of name=value pairs') 11 | parser.add_argument('--reference_audio', required=True) 12 | parser.add_argument('--model', default='Tacotron') 13 | parser.add_argument('--input_dir', default='training_data/', help='folder to contain inputs 
sentences/targets') 14 | parser.add_argument('--output_dir', default='output/', help='folder to contain synthesized mel spectrograms') 15 | parser.add_argument('--mode', default='synthesis', help='mode of run: can be one of {}'.format(accepted_modes)) 16 | parser.add_argument('--GTA', default=False, help='Ground truth aligned synthesis, defaults to True, only considered in synthesis mode') 17 | args = parser.parse_args() 18 | 19 | accepted_models = ['Tacotron', 'Wavenet'] 20 | 21 | if args.model not in accepted_models: 22 | raise ValueError('please enter a valid model to train: {}'.format(accepted_models)) 23 | 24 | if args.mode not in accepted_modes: 25 | raise ValueError('accepted modes are: {}, found {}'.format(accepted_modes, args.mode)) 26 | 27 | if args.model == 'Tacotron': 28 | tacotron_synthesize(args) 29 | elif args.model == 'Wavenet': 30 | raise NotImplementedError('Wavenet is still a work in progress, thank you for your patience!') 31 | 32 | 33 | if __name__ == '__main__': 34 | main() -------------------------------------------------------------------------------- /tacotron/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------- /tacotron/feeder.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import threading 4 | import time 5 | import traceback 6 | from tacotron.utils.text import text_to_sequence 7 | from tacotron.utils.infolog import log 8 | import tensorflow as tf 9 | from hparams import hparams 10 | 11 | 12 | _batches_per_group = 32 13 | #pad input sequences with the 0 ( _ ) 14 | _pad = 0 15 | #explicitely setting the padding to a value that doesn't originally exist in the spectogram 16 | #to avoid any possible conflicts, without affecting the output range of the model too much 17 | if hparams.symmetric_mels: 18 | _target_pad = -(hparams.max_abs_value + .1) 19 | else: 20 | _target_pad = -0.1 21 | #Mark finished sequences with 1s 22 | _token_pad = 1. 23 | 24 | class Feeder(threading.Thread): 25 | """ 26 | Feeds batches of data into queue on a background thread. 27 | """ 28 | 29 | def __init__(self, coordinator, metadata_filename, hparams): 30 | super(Feeder, self).__init__() 31 | self._coord = coordinator 32 | self._hparams = hparams 33 | self._cleaner_names = [x.strip() for x in hparams.cleaners.split(',')] 34 | self._offset = 0 35 | 36 | # Load metadata 37 | self._mel_dir = os.path.join(os.path.dirname(metadata_filename), 'mels') 38 | self._linear_dir = os.path.join(os.path.dirname(metadata_filename), 'linear') 39 | with open(metadata_filename, encoding='utf-8') as f: 40 | self._metadata = [line.strip().split('|') for line in f] 41 | frame_shift_ms = hparams.hop_size / hparams.sample_rate 42 | hours = sum([int(x[4]) for x in self._metadata]) * frame_shift_ms / (3600) 43 | log('Loaded metadata for {} examples ({:.2f} hours)'.format(len(self._metadata), hours)) 44 | 45 | # Create placeholders for inputs and targets. Don't specify batch size because we want 46 | # to be able to feed different batch sizes at eval time. 
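		# Added note (not in the original code): the order of these placeholders must line up with the
		# dtype list passed to tf.FIFOQueue below and with the unpacking order after queue.dequeue(),
		# because enqueue/dequeue are purely positional.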
47 | self._placeholders = [ 48 | tf.placeholder(tf.int32, shape=(None, None), name='inputs'), 49 | tf.placeholder(tf.int32, shape=(None, ), name='input_lengths'), 50 | tf.placeholder(tf.float32, shape=(None, None, hparams.num_mels), name='mel_targets'), 51 | tf.placeholder(tf.int32,[None],'mel_lengths'), 52 | tf.placeholder(tf.float32, shape=(None, None), name='token_targets'), 53 | tf.placeholder(tf.float32, shape=(None, None, hparams.num_freq), name='linear_targets'), 54 | ] 55 | 56 | # Create queue for buffering data 57 | queue = tf.FIFOQueue(8, [tf.int32, tf.int32, tf.float32, tf.int32, tf.float32, tf.float32], name='input_queue') 58 | self._enqueue_op = queue.enqueue(self._placeholders) 59 | self.inputs, self.input_lengths, self.mel_targets, self.mel_lengths, self.token_targets, self.linear_targets = queue.dequeue() 60 | self.inputs.set_shape(self._placeholders[0].shape) 61 | self.input_lengths.set_shape(self._placeholders[1].shape) 62 | self.mel_targets.set_shape(self._placeholders[2].shape) 63 | self.mel_lengths.set_shape(self._placeholders[3].shape) 64 | self.token_targets.set_shape(self._placeholders[4].shape) 65 | self.linear_targets.set_shape(self._placeholders[5].shape) 66 | 67 | def start_in_session(self, session): 68 | self._session = session 69 | self.start() 70 | 71 | def run(self): 72 | try: 73 | while not self._coord.should_stop(): 74 | self._enqueue_next_group() 75 | except Exception as e: 76 | traceback.print_exc() 77 | self._coord.request_stop(e) 78 | 79 | def _enqueue_next_group(self): 80 | start = time.time() 81 | 82 | # Read a group of examples 83 | n = self._hparams.tacotron_batch_size 84 | r = self._hparams.outputs_per_step 85 | examples = [self._get_next_example() for i in range(n * _batches_per_group)] 86 | 87 | # Bucket examples based on similar output sequence length for efficiency 88 | examples.sort(key=lambda x: x[-1]) 89 | batches = [examples[i: i+n] for i in range(0, len(examples), n)] 90 | np.random.shuffle(batches) 91 | 92 | log('\nGenerated {} batches of size {} in {:.3f} sec'.format(len(batches), n, time.time() - start)) 93 | for batch in batches: 94 | feed_dict = dict(zip(self._placeholders, _prepare_batch(batch, r))) 95 | self._session.run(self._enqueue_op, feed_dict=feed_dict) 96 | 97 | def _get_next_example(self): 98 | """ 99 | Gets a single example (input, mel_target, token_target, linear_target, mel_length) from_ disk 100 | """ 101 | if self._offset >= len(self._metadata): 102 | self._offset = 0 103 | np.random.shuffle(self._metadata) 104 | meta = self._metadata[self._offset] 105 | self._offset += 1 106 | 107 | text = meta[5] 108 | 109 | input_data = np.asarray(text_to_sequence(text, self._cleaner_names), dtype=np.int32) 110 | mel_target = np.load(os.path.join(self._mel_dir, meta[1])) 111 | #Create parallel sequences containing zeros to represent a non finished sequence 112 | token_target = np.asarray([0.] 
* len(mel_target)) 113 | linear_target = np.load(os.path.join(self._linear_dir, meta[2])) 114 | return (input_data, mel_target, token_target, linear_target, len(mel_target)) 115 | 116 | 117 | def _prepare_batch(batch, outputs_per_step): 118 | np.random.shuffle(batch) 119 | inputs = _prepare_inputs([x[0] for x in batch]) 120 | input_lengths = np.asarray([len(x[0]) for x in batch], dtype=np.int32) 121 | mel_targets = _prepare_targets([x[1] for x in batch], outputs_per_step) 122 | mel_lengths= [len(x[1]) for x in batch] 123 | #Pad sequences with 1 to infer that the sequence is done 124 | token_targets = _prepare_token_targets([x[2] for x in batch], outputs_per_step) 125 | linear_targets = _prepare_targets([x[3] for x in batch], outputs_per_step) 126 | return (inputs, input_lengths, mel_targets, mel_lengths, token_targets, linear_targets) 127 | 128 | def _prepare_inputs(inputs): 129 | max_len = max([len(x) for x in inputs]) 130 | return np.stack([_pad_input(x, max_len) for x in inputs]) 131 | 132 | def _prepare_targets(targets, alignment): 133 | max_len = max([len(t) for t in targets]) + 1 134 | return np.stack([_pad_target(t, _round_up(max_len, alignment)) for t in targets]) 135 | 136 | def _prepare_token_targets(targets, alignment): 137 | max_len = max([len(t) for t in targets]) + 1 138 | return np.stack([_pad_token_target(t, _round_up(max_len, alignment)) for t in targets]) 139 | 140 | def _pad_input(x, length): 141 | return np.pad(x, (0, length - x.shape[0]), mode='constant', constant_values=_pad) 142 | 143 | def _pad_target(t, length): 144 | return np.pad(t, [(0, length - t.shape[0]), (0, 0)], mode='constant', constant_values=_target_pad) 145 | 146 | def _pad_token_target(t, length): 147 | return np.pad(t, (0, length - t.shape[0]), mode='constant', constant_values=_token_pad) 148 | 149 | def _round_up(x, multiple): 150 | remainder = x % multiple 151 | return x if remainder == 0 else x + multiple - remainder 152 | -------------------------------------------------------------------------------- /tacotron/models/Architecture_wrappers.py: -------------------------------------------------------------------------------- 1 | """A set of wrappers usefull for tacotron 2 architecture 2 | All notations and variable names were used in concordance with originial tensorflow implementation 3 | """ 4 | import collections 5 | import numpy as np 6 | import tensorflow as tf 7 | from tensorflow.contrib.rnn import RNNCell 8 | from tensorflow.python.framework import ops 9 | from tensorflow.python.ops import rnn_cell_impl 10 | from tensorflow.python.ops import check_ops 11 | from tensorflow.python.util import nest 12 | from tensorflow.python.ops import array_ops 13 | from tensorflow.python.ops import tensor_array_ops 14 | from tensorflow.python.framework import tensor_shape 15 | from tacotron.models.attention import _compute_attention 16 | 17 | _zero_state_tensors = rnn_cell_impl._zero_state_tensors 18 | 19 | 20 | 21 | class TacotronEncoderCell(RNNCell): 22 | """Tacotron 2 Encoder Cell 23 | Passes inputs through a stack of convolutional layers then through a bidirectional LSTM 24 | layer to predict the hidden representation vector (or memory) 25 | """ 26 | 27 | def __init__(self, convolutional_layers, lstm_layer): 28 | """Initialize encoder parameters 29 | 30 | Args: 31 | convolutional_layers: Encoder convolutional block class 32 | lstm_layer: encoder bidirectional lstm layer class 33 | """ 34 | super(TacotronEncoderCell, self).__init__() 35 | #Initialize encoder layers 36 | self._convolutions = 
convolutional_layers 37 | self._cell = lstm_layer 38 | 39 | def __call__(self, inputs, input_lengths=None): 40 | #Pass input sequence through a stack of convolutional layers 41 | conv_output = self._convolutions(inputs) 42 | 43 | #Extract hidden representation from encoder lstm cells 44 | hidden_representation = self._cell(conv_output, input_lengths) 45 | 46 | #For shape visualization 47 | self.conv_output_shape = conv_output.shape 48 | return hidden_representation 49 | 50 | 51 | class TacotronDecoderCellState( 52 | collections.namedtuple("TacotronDecoderCellState", 53 | ("cell_state", "attention", "time", "alignments", 54 | "alignment_history", "finished"))): 55 | """`namedtuple` storing the state of a `TacotronDecoderCell`. 56 | Contains: 57 | - `cell_state`: The state of the wrapped `RNNCell` at the previous time 58 | step. 59 | - `attention`: The attention emitted at the previous time step. 60 | - `time`: int32 scalar containing the current time step. 61 | - `alignments`: A single or tuple of `Tensor`(s) containing the alignments 62 | emitted at the previous time step for each attention mechanism. 63 | - `alignment_history`: a single or tuple of `TensorArray`(s) 64 | containing alignment matrices from all time steps for each attention 65 | mechanism. Call `stack()` on each to convert to a `Tensor`. 66 | """ 67 | def replace(self, **kwargs): 68 | """Clones the current state while overwriting components provided by kwargs. 69 | """ 70 | return super(TacotronDecoderCellState, self)._replace(**kwargs) 71 | 72 | class TacotronDecoderCell(RNNCell): 73 | """Tactron 2 Decoder Cell 74 | Decodes encoder output and previous mel frames into next r frames 75 | 76 | Decoder Step i: 77 | 1) Prenet to compress last output information 78 | 2) Concat compressed inputs with previous context vector (input feeding) * 79 | 3) Decoder RNN (actual decoding) to predict current state s_{i} * 80 | 4) Compute new context vector c_{i} based on s_{i} and a cumulative sum of previous alignments * 81 | 5) Predict new output y_{i} using s_{i} and c_{i} (concatenated) 82 | 6) Predict output ys_{i} using s_{i} and c_{i} (concatenated) 83 | 84 | * : This is typically taking a vanilla LSTM, wrapping it using tensorflow's attention wrapper, 85 | and wrap that with the prenet before doing an input feeding, and with the prediction layer 86 | that uses RNN states to project on output space. Actions marked with (*) can be replaced with 87 | tensorflow's attention wrapper call if it was using cumulative alignments instead of previous alignments only. 
88 | """ 89 | 90 | def __init__(self, prenet, attention_mechanism, rnn_cell, frame_projection, stop_projection, mask_finished=False): 91 | """Initialize decoder parameters 92 | 93 | Args: 94 | prenet: A tensorflow fully connected layer acting as the decoder pre-net 95 | attention_mechanism: A _BaseAttentionMechanism instance, usefull to 96 | learn encoder-decoder alignments 97 | rnn_cell: Instance of RNNCell, main body of the decoder 98 | frame_projection: tensorflow fully connected layer with r * num_mels output units 99 | stop_projection: tensorflow fully connected layer, expected to project to a scalar 100 | and through a sigmoid activation 101 | mask_finished: Boolean, Whether to mask decoder frames after the 102 | """ 103 | super(TacotronDecoderCell, self).__init__() 104 | #Initialize decoder layers 105 | self._prenet = prenet 106 | self._attention_mechanism = attention_mechanism 107 | self._cell = rnn_cell 108 | self._frame_projection = frame_projection 109 | self._stop_projection = stop_projection 110 | 111 | self._mask_finished = mask_finished 112 | self._attention_layer_size = self._attention_mechanism.values.get_shape()[-1].value 113 | 114 | def _batch_size_checks(self, batch_size, error_message): 115 | return [check_ops.assert_equal(batch_size, 116 | self._attention_mechanism.batch_size, 117 | message=error_message)] 118 | 119 | @property 120 | def output_size(self): 121 | return self._frame_projection.shape 122 | 123 | @property 124 | def state_size(self): 125 | """The `state_size` property of `TacotronDecoderCell`. 126 | 127 | Returns: 128 | An `TacotronDecoderCell` tuple containing shapes used by this object. 129 | """ 130 | return TacotronDecoderCellState( 131 | cell_state=self._cell._cell.state_size, 132 | time=tensor_shape.TensorShape([]), 133 | attention=self._attention_layer_size, 134 | alignments=self._attention_mechanism.alignments_size, 135 | alignment_history=(), 136 | finished=()) 137 | 138 | def zero_state(self, batch_size, dtype): 139 | """Return an initial (zero) state tuple for this `AttentionWrapper`. 140 | 141 | Args: 142 | batch_size: `0D` integer tensor: the batch size. 143 | dtype: The internal state data type. 144 | Returns: 145 | An `TacotronDecoderCellState` tuple containing zeroed out tensors and, 146 | possibly, empty `TensorArray` objects. 147 | Raises: 148 | ValueError: (or, possibly at runtime, InvalidArgument), if 149 | `batch_size` does not match the output size of the encoder passed 150 | to the wrapper object at initialization time. 
151 | """ 152 | with ops.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]): 153 | cell_state = self._cell._cell.zero_state(batch_size, dtype) 154 | error_message = ( 155 | "When calling zero_state of TacotronDecoderCell %s: " % self._base_name + 156 | "Non-matching batch sizes between the memory " 157 | "(encoder output) and the requested batch size.") 158 | with ops.control_dependencies( 159 | self._batch_size_checks(batch_size, error_message)): 160 | cell_state = nest.map_structure( 161 | lambda s: array_ops.identity(s, name="checked_cell_state"), 162 | cell_state) 163 | return TacotronDecoderCellState( 164 | cell_state=cell_state, 165 | time=array_ops.zeros([], dtype=tf.int32), 166 | attention=_zero_state_tensors(self._attention_layer_size, batch_size, 167 | dtype), 168 | alignments=self._attention_mechanism.initial_alignments(batch_size, dtype), 169 | alignment_history=tensor_array_ops.TensorArray(dtype=dtype, size=0, 170 | dynamic_size=True), 171 | finished=tf.reshape(tf.tile([0.0], [batch_size]), [-1, 1])) 172 | 173 | def __call__(self, inputs, state): 174 | #Information bottleneck (essential for learning attention) 175 | prenet_output = self._prenet(inputs) 176 | 177 | #Concat context vector and prenet output to form LSTM cells input (input feeding) 178 | LSTM_input = tf.concat([prenet_output, state.attention], axis=-1) 179 | 180 | #Unidirectional LSTM layers 181 | LSTM_output, next_cell_state = self._cell(LSTM_input, state.cell_state) 182 | 183 | #Compute the attention (context) vector and alignments using 184 | #the new decoder cell hidden state as query vector 185 | #and cumulative alignments to extract location features 186 | #The choice of the new cell hidden state (s_{i}) of the last 187 | #decoder RNN Cell is based on Luong et Al. (2015): 188 | #https://arxiv.org/pdf/1508.04025.pdf 189 | previous_alignments = state.alignments 190 | previous_alignment_history = state.alignment_history 191 | context_vector, alignments, cumulated_alignments = _compute_attention(self._attention_mechanism, 192 | LSTM_output, 193 | previous_alignments, 194 | attention_layer=None) 195 | 196 | #Concat LSTM outputs and context vector to form projections inputs 197 | projections_input = tf.concat([LSTM_output, context_vector], axis=-1) 198 | 199 | #Compute predicted frames and predicted 200 | cell_outputs = self._frame_projection(projections_input) 201 | stop_tokens = self._stop_projection(projections_input) 202 | 203 | #mask attention computed for decoding steps where sequence is already finished 204 | #this is purely for visual purposes and will not affect the training of the model 205 | #we don't pay much attention to the alignments of the output paddings if we impute 206 | #the decoder outputs beyond the end of sequence. 
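		#(added clarification, not in the original comments: when an example's `state.finished` entry is 1.0,
		# the tf.where below replaces its alignment row with zeros; this only affects the alignment history
		# kept for plotting — the attention context and cumulative alignments used for decoding are untouched)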
207 | if self._mask_finished: 208 | finished = tf.cast(state.finished * tf.ones(tf.shape(alignments)), tf.bool) 209 | mask = tf.zeros(tf.shape(alignments)) 210 | masked_alignments = tf.where(finished, mask, alignments) 211 | else: 212 | masked_alignments = alignments 213 | 214 | #Save alignment history 215 | alignment_history = previous_alignment_history.write(state.time, masked_alignments) 216 | 217 | #Prepare next decoder state 218 | next_state = TacotronDecoderCellState( 219 | time=state.time + 1, 220 | cell_state=next_cell_state, 221 | attention=context_vector, 222 | alignments=cumulated_alignments, 223 | alignment_history=alignment_history, 224 | finished=state.finished) 225 | 226 | return (cell_outputs, stop_tokens), next_state 227 | -------------------------------------------------------------------------------- /tacotron/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .tacotron import Tacotron 2 | 3 | 4 | def create_model(name, hparams): 5 | if name == 'Tacotron': 6 | return Tacotron(hparams) 7 | else: 8 | raise Exception('Unknown model: ' + name) 9 | -------------------------------------------------------------------------------- /tacotron/models/attention.py: -------------------------------------------------------------------------------- 1 | """Attention file for location based attention (compatible with tensorflow attention wrapper)""" 2 | 3 | import tensorflow as tf 4 | from tensorflow.contrib.seq2seq.python.ops.attention_wrapper import BahdanauAttention 5 | from tensorflow.python.ops import nn_ops 6 | from tensorflow.python.layers import core as layers_core 7 | from tensorflow.python.ops import array_ops 8 | from tensorflow.python.ops import variable_scope 9 | from tensorflow.python.ops import math_ops 10 | from hparams import hparams 11 | 12 | 13 | #From https://github.com/tensorflow/tensorflow/blob/r1.7/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py 14 | def _compute_attention(attention_mechanism, cell_output, attention_state, 15 | attention_layer): 16 | """Computes the attention and alignments for a given attention_mechanism.""" 17 | alignments, next_attention_state = attention_mechanism( 18 | cell_output, state=attention_state) 19 | 20 | # Reshape from [batch_size, memory_time] to [batch_size, 1, memory_time] 21 | expanded_alignments = array_ops.expand_dims(alignments, 1) 22 | # Context is the inner product of alignments and values along the 23 | # memory time dimension. 24 | # alignments shape is 25 | # [batch_size, 1, memory_time] 26 | # attention_mechanism.values shape is 27 | # [batch_size, memory_time, memory_size] 28 | # the batched matmul is over memory_time, so the output shape is 29 | # [batch_size, 1, memory_size]. 30 | # we then squeeze out the singleton dim. 31 | context = math_ops.matmul(expanded_alignments, attention_mechanism.values) 32 | context = array_ops.squeeze(context, [1]) 33 | 34 | if attention_layer is not None: 35 | attention = attention_layer(array_ops.concat([cell_output, context], 1)) 36 | else: 37 | attention = context 38 | 39 | return attention, alignments, next_attention_state 40 | 41 | 42 | def _location_sensitive_score(W_query, W_fil, W_keys): 43 | """Impelements Bahdanau-style (cumulative) scoring function. 44 | This attention is described in: 45 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben- 46 | gio, “Attention-based models for speech recognition,” in Ad- 47 | vances in Neural Information Processing Systems, 2015, pp. 48 | 577–585. 
49 | 50 | ############################################################################# 51 | hybrid attention (content-based + location-based) 52 | f = F * α_{i-1} 53 | energy = dot(v_a, tanh(W_keys(h_enc) + W_query(h_dec) + W_fil(f) + b_a)) 54 | ############################################################################# 55 | 56 | Args: 57 | W_query: Tensor, shape '[batch_size, 1, attention_dim]' to compare to location features. 58 | W_location: processed previous alignments into location features, shape '[batch_size, max_time, attention_dim]' 59 | W_keys: Tensor, shape '[batch_size, max_time, attention_dim]', typically the encoder outputs. 60 | Returns: 61 | A '[batch_size, max_time]' attention score (energy) 62 | """ 63 | # Get the number of hidden units from the trailing dimension of keys 64 | dtype = W_query.dtype 65 | num_units = W_keys.shape[-1].value or array_ops.shape(W_keys)[-1] 66 | 67 | v_a = tf.get_variable( 68 | 'attention_variable', shape=[num_units], dtype=dtype, 69 | initializer=tf.contrib.layers.xavier_initializer()) 70 | b_a = tf.get_variable( 71 | 'attention_bias', shape=[num_units], dtype=dtype, 72 | initializer=tf.zeros_initializer()) 73 | 74 | return tf.reduce_sum(v_a * tf.tanh(W_keys + W_query + W_fil + b_a), [2]) 75 | 76 | def _smoothing_normalization(e): 77 | """Applies a smoothing normalization function instead of softmax 78 | Introduced in: 79 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben- 80 | gio, “Attention-based models for speech recognition,” in Ad- 81 | vances in Neural Information Processing Systems, 2015, pp. 82 | 577–585. 83 | 84 | ############################################################################ 85 | Smoothing normalization function 86 | a_{i, j} = sigmoid(e_{i, j}) / sum_j(sigmoid(e_{i, j})) 87 | ############################################################################ 88 | 89 | Args: 90 | e: matrix [batch_size, max_time(memory_time)]: expected to be energy (score) 91 | values of an attention mechanism 92 | Returns: 93 | matrix [batch_size, max_time]: [0, 1] normalized alignments with possible 94 | attendance to multiple memory time steps. 95 | """ 96 | return tf.nn.sigmoid(e) / tf.reduce_sum(tf.nn.sigmoid(e), axis=-1, keepdims=True) 97 | 98 | 99 | class LocationSensitiveAttention(BahdanauAttention): 100 | """Impelements Bahdanau-style (cumulative) scoring function. 101 | Usually referred to as "hybrid" attention (content-based + location-based) 102 | Extends the additive attention described in: 103 | "D. Bahdanau, K. Cho, and Y. Bengio, “Neural machine transla- 104 | tion by jointly learning to align and translate,” in Proceedings 105 | of ICLR, 2015." 106 | to use previous alignments as additional location features. 107 | 108 | This attention is described in: 109 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben- 110 | gio, “Attention-based models for speech recognition,” in Ad- 111 | vances in Neural Information Processing Systems, 2015, pp. 112 | 577–585. 113 | """ 114 | 115 | def __init__(self, 116 | num_units, 117 | memory, 118 | mask_encoder=True, 119 | memory_sequence_length=None, 120 | smoothing=False, 121 | cumulate_weights=True, 122 | name='LocationSensitiveAttention'): 123 | """Construct the Attention mechanism. 124 | Args: 125 | num_units: The depth of the query mechanism. 126 | memory: The memory to query; usually the output of an RNN encoder. This 127 | tensor should be shaped `[batch_size, max_time, ...]`. 128 | mask_encoder (optional): Boolean, whether to mask encoder paddings. 
129 | memory_sequence_length (optional): Sequence lengths for the batch entries 130 | in memory. If provided, the memory tensor rows are masked with zeros 131 | for values past the respective sequence lengths. Only relevant if mask_encoder = True. 132 | smoothing (optional): Boolean. Determines which normalization function to use. 133 | Default normalization function (probablity_fn) is softmax. If smoothing is 134 | enabled, we replace softmax with: 135 | a_{i, j} = sigmoid(e_{i, j}) / sum_j(sigmoid(e_{i, j})) 136 | Introduced in: 137 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben- 138 | gio, “Attention-based models for speech recognition,” in Ad- 139 | vances in Neural Information Processing Systems, 2015, pp. 140 | 577–585. 141 | This is mainly used if the model wants to attend to multiple inputs parts 142 | at the same decoding step. We probably won't be using it since multiple sound 143 | frames may depend from the same character, probably not the way around. 144 | Note: 145 | We still keep it implemented in case we want to test it. They used it in the 146 | paper in the context of speech recognition, where one phoneme may depend on 147 | multiple subsequent sound frames. 148 | name: Name to use when creating ops. 149 | """ 150 | #Create normalization function 151 | #Setting it to None defaults in using softmax 152 | normalization_function = _smoothing_normalization if (smoothing == True) else None 153 | memory_length = memory_sequence_length if (mask_encoder==True) else None 154 | super(LocationSensitiveAttention, self).__init__( 155 | num_units=num_units, 156 | memory=memory, 157 | memory_sequence_length=memory_length, 158 | probability_fn=normalization_function, 159 | name=name) 160 | 161 | self.location_convolution = tf.layers.Conv1D(filters=hparams.attention_filters, 162 | kernel_size=hparams.attention_kernel, padding='same', use_bias=False, 163 | name='location_features_convolution') 164 | self.location_layer = tf.layers.Dense(units=num_units, use_bias=False, 165 | dtype=tf.float32, name='location_features_layer') 166 | self._cumulate = cumulate_weights 167 | 168 | def __call__(self, query, state): 169 | """Score the query based on the keys and values. 170 | Args: 171 | query: Tensor of dtype matching `self.values` and shape 172 | `[batch_size, query_depth]`. 173 | state (previous alignments): Tensor of dtype matching `self.values` and shape 174 | `[batch_size, alignments_size]` 175 | (`alignments_size` is memory's `max_time`). 176 | Returns: 177 | alignments: Tensor of dtype matching `self.values` and shape 178 | `[batch_size, alignments_size]` (`alignments_size` is memory's 179 | `max_time`). 
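			next_state: cumulated alignments (previous alignments + current alignments)
				when cumulate_weights=True, otherwise the current alignments; fed back as
				`state` at the next decoding step, where the location features are
				computed from it.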
180 | """ 181 | previous_alignments = state 182 | with variable_scope.variable_scope(None, "Location_Sensitive_Attention", [query]): 183 | 184 | # processed_query shape [batch_size, query_depth] -> [batch_size, attention_dim] 185 | processed_query = self.query_layer(query) if self.query_layer else query 186 | # -> [batch_size, 1, attention_dim] 187 | processed_query = tf.expand_dims(processed_query, 1) 188 | 189 | # processed_location_features shape [batch_size, max_time, attention dimension] 190 | # [batch_size, max_time] -> [batch_size, max_time, 1] 191 | expanded_alignments = tf.expand_dims(previous_alignments, axis=2) 192 | # location features [batch_size, max_time, filters] 193 | f = self.location_convolution(expanded_alignments) 194 | # Projected location features [batch_size, max_time, attention_dim] 195 | processed_location_features = self.location_layer(f) 196 | 197 | # energy shape [batch_size, max_time] 198 | energy = _location_sensitive_score(processed_query, processed_location_features, self.keys) 199 | 200 | # alignments shape = energy shape = [batch_size, max_time] 201 | alignments = self._probability_fn(energy, previous_alignments) 202 | 203 | # Cumulate alignments 204 | if self._cumulate: 205 | next_state = alignments + previous_alignments 206 | else: 207 | next_state = alignments 208 | 209 | return alignments, next_state 210 | -------------------------------------------------------------------------------- /tacotron/models/custom_decoder.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import collections 6 | import tensorflow as tf 7 | 8 | from tensorflow.contrib.seq2seq.python.ops import decoder 9 | from tensorflow.contrib.seq2seq.python.ops import helper as helper_py 10 | from tensorflow.python.framework import ops 11 | from tensorflow.python.framework import tensor_shape 12 | from tensorflow.python.layers import base as layers_base 13 | from tensorflow.python.ops import rnn_cell_impl 14 | from tensorflow.python.util import nest 15 | from tacotron.models.helpers import TacoTrainingHelper, TacoTestHelper 16 | 17 | 18 | 19 | class CustomDecoderOutput( 20 | collections.namedtuple("CustomDecoderOutput", ("rnn_output", "token_output", "sample_id"))): 21 | pass 22 | 23 | 24 | class CustomDecoder(decoder.Decoder): 25 | """Custom sampling decoder. 26 | 27 | Allows for stop token prediction at inference time 28 | and returns equivalent loss in training time. 29 | 30 | Note: 31 | Only use this decoder with Tacotron 2 as it only accepts tacotron custom helpers 32 | """ 33 | 34 | def __init__(self, cell, helper, initial_state, output_layer=None): 35 | """Initialize CustomDecoder. 36 | Args: 37 | cell: An `RNNCell` instance. 38 | helper: A `Helper` instance. 39 | initial_state: A (possibly nested tuple of...) tensors and TensorArrays. 40 | The initial state of the RNNCell. 41 | output_layer: (Optional) An instance of `tf.layers.Layer`, i.e., 42 | `tf.layers.Dense`. Optional layer to apply to the RNN output prior 43 | to storing the result or sampling. 44 | Raises: 45 | TypeError: if `cell`, `helper` or `output_layer` have an incorrect type. 
46 | """ 47 | if not rnn_cell_impl._like_rnncell(cell): # pylint: disable=protected-access 48 | raise TypeError("cell must be an RNNCell, received: %s" % type(cell)) 49 | if not isinstance(helper, helper_py.Helper): 50 | raise TypeError("helper must be a Helper, received: %s" % type(helper)) 51 | if (output_layer is not None 52 | and not isinstance(output_layer, layers_base.Layer)): 53 | raise TypeError( 54 | "output_layer must be a Layer, received: %s" % type(output_layer)) 55 | self._cell = cell 56 | self._helper = helper 57 | self._initial_state = initial_state 58 | self._output_layer = output_layer 59 | 60 | @property 61 | def batch_size(self): 62 | return self._helper.batch_size 63 | 64 | def _rnn_output_size(self): 65 | size = self._cell.output_size 66 | if self._output_layer is None: 67 | return size 68 | else: 69 | # To use layer's compute_output_shape, we need to convert the 70 | # RNNCell's output_size entries into shapes with an unknown 71 | # batch size. We then pass this through the layer's 72 | # compute_output_shape and read off all but the first (batch) 73 | # dimensions to get the output size of the rnn with the layer 74 | # applied to the top. 75 | output_shape_with_unknown_batch = nest.map_structure( 76 | lambda s: tensor_shape.TensorShape([None]).concatenate(s), 77 | size) 78 | layer_output_shape = self._output_layer._compute_output_shape( # pylint: disable=protected-access 79 | output_shape_with_unknown_batch) 80 | return nest.map_structure(lambda s: s[1:], layer_output_shape) 81 | 82 | @property 83 | def output_size(self): 84 | # Return the cell output and the id 85 | return CustomDecoderOutput( 86 | rnn_output=self._rnn_output_size(), 87 | token_output=self._helper.token_output_size, 88 | sample_id=self._helper.sample_ids_shape) 89 | 90 | @property 91 | def output_dtype(self): 92 | # Assume the dtype of the cell is the output_size structure 93 | # containing the input_state's first component's dtype. 94 | # Return that structure and the sample_ids_dtype from the helper. 95 | dtype = nest.flatten(self._initial_state)[0].dtype 96 | return CustomDecoderOutput( 97 | nest.map_structure(lambda _: dtype, self._rnn_output_size()), 98 | tf.float32, 99 | self._helper.sample_ids_dtype) 100 | 101 | def initialize(self, name=None): 102 | """Initialize the decoder. 103 | Args: 104 | name: Name scope for any created operations. 105 | Returns: 106 | `(finished, first_inputs, initial_state)`. 107 | """ 108 | return self._helper.initialize() + (self._initial_state,) 109 | 110 | def step(self, time, inputs, state, name=None): 111 | """Perform a custom decoding step. 112 | Enables for dyanmic prediction 113 | Args: 114 | time: scalar `int32` tensor. 115 | inputs: A (structure of) input tensors. 116 | state: A (structure of) state tensors and TensorArrays. 117 | name: Name scope for any created operations. 118 | Returns: 119 | `(outputs, next_state, next_inputs, finished)`. 
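			`outputs` is a CustomDecoderOutput(rnn_output, token_output, sample_id)
			namedtuple, so the predicted frames and the stop-token prediction are
			returned together at every decoding step.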
120 | """ 121 | with ops.name_scope(name, "CustomDecoderStep", (time, inputs, state)): 122 | #Call outputprojection wrapper cell 123 | (cell_outputs, stop_token), cell_state = self._cell(inputs, state) 124 | 125 | #apply output_layer (if existant) 126 | if self._output_layer is not None: 127 | cell_outputs = self._output_layer(cell_outputs) 128 | sample_ids = self._helper.sample( 129 | time=time, outputs=cell_outputs, state=cell_state) 130 | 131 | (finished, next_inputs, next_state) = self._helper.next_inputs( 132 | time=time, 133 | outputs=cell_outputs, 134 | state=cell_state, 135 | sample_ids=sample_ids, 136 | stop_token_prediction=stop_token) 137 | 138 | outputs = CustomDecoderOutput(cell_outputs, stop_token, sample_ids) 139 | return (outputs, next_state, next_inputs, finished) -------------------------------------------------------------------------------- /tacotron/models/helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from tensorflow.contrib.seq2seq import Helper 4 | from hparams import hparams 5 | 6 | 7 | class TacoTestHelper(Helper): 8 | def __init__(self, batch_size, output_dim, r): 9 | with tf.name_scope('TacoTestHelper'): 10 | self._batch_size = batch_size 11 | self._output_dim = output_dim 12 | self._reduction_factor = r 13 | 14 | @property 15 | def batch_size(self): 16 | return self._batch_size 17 | 18 | @property 19 | def token_output_size(self): 20 | return self._reduction_factor 21 | 22 | @property 23 | def sample_ids_shape(self): 24 | return tf.TensorShape([]) 25 | 26 | @property 27 | def sample_ids_dtype(self): 28 | return np.int32 29 | 30 | def initialize(self, name=None): 31 | return (tf.tile([False], [self._batch_size]), _go_frames(self._batch_size, self._output_dim)) 32 | 33 | def sample(self, time, outputs, state, name=None): 34 | return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them 35 | 36 | def next_inputs(self, time, outputs, state, sample_ids, stop_token_prediction, name=None): 37 | '''Stop on EOS. Otherwise, pass the last output as the next input and pass through state.''' 38 | with tf.name_scope('TacoTestHelper'): 39 | #A sequence is finished when the output probability is > 0.5 40 | finished = tf.cast(tf.round(stop_token_prediction), tf.bool) 41 | 42 | #Since we are predicting r frames at each step, two modes are 43 | #then possible: 44 | # Stop when the model outputs a p > 0.5 for any frame between r frames (Recommended) 45 | # Stop when the model outputs a p > 0.5 for all r frames (Safer) 46 | #Note: 47 | # With enough training steps, the model should be able to predict when to stop correctly 48 | # and the use of stop_at_any = True would be recommended. If however the model didn't 49 | # learn to stop correctly yet, (stops too soon) one could choose to use the safer option 50 | # to get a correct synthesis 51 | if hparams.stop_at_any: 52 | finished = tf.reduce_any(finished) #Recommended 53 | else: 54 | finished = tf.reduce_all(finished) #Safer option 55 | 56 | # Feed last output frame as next input. 
outputs is [N, output_dim * r] 57 | next_inputs = outputs[:, -self._output_dim:] 58 | next_state = state 59 | return (finished, next_inputs, next_state) 60 | 61 | 62 | class TacoTrainingHelper(Helper): 63 | def __init__(self, batch_size, targets, stop_targets, output_dim, r, ratio, gta): 64 | # inputs is [N, T_in], targets is [N, T_out, D] 65 | with tf.name_scope('TacoTrainingHelper'): 66 | self._batch_size = batch_size 67 | self._output_dim = output_dim 68 | self._reduction_factor = r 69 | self._ratio = tf.convert_to_tensor(ratio) 70 | self.gta = gta 71 | 72 | # Feed every r-th target frame as input 73 | self._targets = targets[:, r-1::r, :] 74 | 75 | if not gta: 76 | # Detect finished sequence using stop_targets 77 | self._stop_targets = stop_targets[:, r-1::r] 78 | else: 79 | # GTA synthesis 80 | self._lengths = tf.tile([tf.shape(self._targets)[1]], [self._batch_size]) 81 | 82 | @property 83 | def batch_size(self): 84 | return self._batch_size 85 | 86 | @property 87 | def token_output_size(self): 88 | return self._reduction_factor 89 | 90 | @property 91 | def sample_ids_shape(self): 92 | return tf.TensorShape([]) 93 | 94 | @property 95 | def sample_ids_dtype(self): 96 | return np.int32 97 | 98 | def initialize(self, name=None): 99 | return (tf.tile([False], [self._batch_size]), _go_frames(self._batch_size, self._output_dim)) 100 | 101 | def sample(self, time, outputs, state, name=None): 102 | return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them 103 | 104 | def next_inputs(self, time, outputs, state, sample_ids, stop_token_prediction, name=None): 105 | with tf.name_scope(name or 'TacoTrainingHelper'): 106 | if not self.gta: 107 | #mark sequences where stop_target == 1 as finished (for case of imputation) 108 | finished = tf.equal(self._stop_targets[:, time], [1.]) 109 | else: 110 | #GTA synthesis stop 111 | finished = (time + 1 >= self._lengths) 112 | 113 | next_inputs = tf.cond( 114 | tf.less(tf.random_uniform([], minval=0, maxval=1, dtype=tf.float32), self._ratio), 115 | lambda: self._targets[:, time, :], #Teacher-forcing: return true frame 116 | lambda: outputs[:,-self._output_dim:]) 117 | 118 | #Update the finished state 119 | next_state = state.replace(finished=tf.cast(tf.reshape(finished, [-1, 1]), tf.float32)) 120 | return (finished, next_inputs, next_state) 121 | 122 | 123 | def _go_frames(batch_size, output_dim): 124 | '''Returns all-zero frames for a given batch size and output dimension''' 125 | return tf.tile([[0.0]], [batch_size, output_dim]) -------------------------------------------------------------------------------- /tacotron/models/modules.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tacotron.models.zoneout_LSTM import ZoneoutLSTMCell 3 | from tensorflow.contrib.rnn import LSTMBlockCell 4 | from hparams import hparams 5 | from tensorflow.contrib.rnn import GRUCell 6 | from tacotron.utils.util import shape_list 7 | 8 | def VAE(inputs, input_lengths, filters, kernel_size, strides, num_units, is_training, scope): 9 | with tf.variable_scope(scope): 10 | outputs = ReferenceEncoder( 11 | inputs=inputs, 12 | input_lengths=input_lengths, 13 | filters=filters, 14 | kernel_size=kernel_size, 15 | strides=strides, 16 | is_training=is_training) 17 | 18 | mu = tf.layers.dense(outputs, num_units, name='mean') 19 | log_var = tf.layers.dense(outputs, num_units, name='vari') 20 | std = tf.exp(log_var) 21 | z = tf.random_normal(shape=[tf.shape(mu)[0], num_units], mean=0.0, stddev=1.0) 22 | 
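		#Reparameterization trick: draw eps ~ N(0, I) and return mu + std * eps so that
		#gradients can flow back through mu and log_var. Note that std = tf.exp(log_var)
		#above treats log_var as a log standard deviation, while the KL term in
		#tacotron.py (1 + log_var - mu^2 - exp(log_var)) treats it as a log variance;
		#with that KL one would usually write std = tf.exp(0.5 * log_var) here.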
output = mu + z * std 23 | return output, mu, log_var 24 | 25 | def ReferenceEncoder(inputs, input_lengths, filters, kernel_size, strides, is_training, scope='reference_encoder'): 26 | with tf.variable_scope(scope): 27 | reference_output = tf.expand_dims(inputs, axis=-1) 28 | for i, channel in enumerate(filters): 29 | reference_output = conv2d(reference_output, channel, kernel_size, 30 | strides, tf.nn.relu, is_training, 'conv2d_{}'.format(i)) 31 | 32 | shape = shape_list(reference_output) 33 | reference_output = tf.reshape(reference_output, shape[:-2] + [shape[2] * shape[3]]) 34 | 35 | #GRU 36 | encoder_outputs, encoder_state = tf.nn.dynamic_rnn( 37 | cell=GRUCell(128), 38 | inputs=reference_output, 39 | sequence_length=input_lengths, 40 | dtype=tf.float32 41 | ) 42 | return encoder_state 43 | 44 | 45 | def conv1d(inputs, kernel_size, channels, activation, is_training, scope): 46 | drop_rate = hparams.tacotron_dropout_rate 47 | 48 | with tf.variable_scope(scope): 49 | conv1d_output = tf.layers.conv1d( 50 | inputs, 51 | filters=channels, 52 | kernel_size=kernel_size, 53 | activation=None, 54 | padding='same') 55 | batched = tf.layers.batch_normalization(conv1d_output, training=is_training) 56 | activated = activation(batched) 57 | return tf.layers.dropout(activated, rate=drop_rate, training=is_training, 58 | name='dropout_{}'.format(scope)) 59 | 60 | 61 | def conv2d(inputs, filters, kernel_size, strides, activation, is_training, scope): 62 | with tf.variable_scope(scope): 63 | conv2d_output = tf.layers.conv2d( 64 | inputs, filters=filters, kernel_size=kernel_size, strides=strides, padding='same') 65 | 66 | batch_norm_output = tf.layers.batch_normalization( 67 | conv2d_output, training=is_training, name='batch_norm') 68 | if activation is not None: 69 | conv2d_output = activation(batch_norm_output) 70 | 71 | return conv2d_output 72 | 73 | class EncoderConvolutions: 74 | """Encoder convolutional layers used to find local dependencies in inputs characters. 75 | """ 76 | def __init__(self, is_training, kernel_size=(5, ), channels=512, activation=tf.nn.relu, scope=None): 77 | """ 78 | Args: 79 | is_training: Boolean, determines if the model is training or in inference to control dropout 80 | kernel_size: tuple or integer, The size of convolution kernels 81 | channels: integer, number of convolutional kernels 82 | activation: callable, postnet activation function for each convolutional layer 83 | scope: Postnet scope. 84 | """ 85 | super(EncoderConvolutions, self).__init__() 86 | self.is_training = is_training 87 | 88 | self.kernel_size = kernel_size 89 | self.channels = channels 90 | self.activation = activation 91 | self.scope = 'enc_conv_layers' if scope is None else scope 92 | 93 | def __call__(self, inputs): 94 | with tf.variable_scope(self.scope): 95 | x = inputs 96 | for i in range(hparams.enc_conv_num_layers): 97 | x = conv1d(x, self.kernel_size, self.channels, self.activation, 98 | self.is_training, 'conv_layer_{}_'.format(i + 1)+self.scope) 99 | return x 100 | 101 | 102 | class EncoderRNN: 103 | """Encoder bidirectional one layer LSTM 104 | """ 105 | def __init__(self, is_training, size=256, zoneout=0.1, scope=None): 106 | """ 107 | Args: 108 | is_training: Boolean, determines if the model is training or in inference to control zoneout 109 | size: integer, the number of LSTM units for each direction 110 | zoneout: the zoneout factor 111 | scope: EncoderRNN scope. 
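		Note: __call__ concatenates the forward and backward outputs, so the returned
		tensor has depth 2 * size (e.g. 512 when size=256).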
112 | """ 113 | super(EncoderRNN, self).__init__() 114 | self.is_training = is_training 115 | 116 | self.size = size 117 | self.zoneout = zoneout 118 | self.scope = 'encoder_LSTM' if scope is None else scope 119 | 120 | #Create LSTM Cell 121 | self._cell = ZoneoutLSTMCell(size, is_training, 122 | zoneout_factor_cell=zoneout, 123 | zoneout_factor_output=zoneout) 124 | 125 | def __call__(self, inputs, input_lengths): 126 | with tf.variable_scope(self.scope): 127 | outputs, (fw_state, bw_state) = tf.nn.bidirectional_dynamic_rnn( 128 | self._cell, 129 | self._cell, 130 | inputs, 131 | sequence_length=input_lengths, 132 | dtype=tf.float32) 133 | 134 | return tf.concat(outputs, axis=2) # Concat and return forward + backward outputs 135 | 136 | 137 | class Prenet: 138 | """Two fully connected layers used as an information bottleneck for the attention. 139 | """ 140 | def __init__(self, is_training, layer_sizes=[256, 256], activation=tf.nn.relu, scope=None): 141 | """ 142 | Args: 143 | is_training: Boolean, determines if the model is in training or inference to control dropout 144 | layer_sizes: list of integers, the length of the list represents the number of pre-net 145 | layers and the list values represent the layers number of units 146 | activation: callable, activation functions of the prenet layers. 147 | scope: Prenet scope. 148 | """ 149 | super(Prenet, self).__init__() 150 | self.drop_rate = hparams.tacotron_dropout_rate 151 | 152 | self.layer_sizes = layer_sizes 153 | self.is_training = is_training 154 | self.activation = activation 155 | 156 | self.scope = 'prenet' if scope is None else scope 157 | 158 | def __call__(self, inputs): 159 | x = inputs 160 | 161 | with tf.variable_scope(self.scope): 162 | for i, size in enumerate(self.layer_sizes): 163 | dense = tf.layers.dense(x, units=size, activation=self.activation, 164 | name='dense_{}'.format(i + 1)) 165 | #The paper discussed introducing diversity in generation at inference time 166 | #by using a dropout of 0.5 only in prenet layers (in both training and inference). 
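				#Hence training=True in the dropout call below: prenet dropout stays active
				#at synthesis time as well, so two runs over the same text can produce
				#slightly different outputs.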
167 | x = tf.layers.dropout(dense, rate=self.drop_rate, training=True, 168 | name='dropout_{}'.format(i + 1) + self.scope) 169 | return x 170 | 171 | 172 | class DecoderRNN: 173 | """Decoder two uni directional LSTM Cells 174 | """ 175 | def __init__(self, is_training, layers=2, size=1024, zoneout=0.1, scope=None): 176 | """ 177 | Args: 178 | is_training: Boolean, determines if the model is in training or inference to control zoneout 179 | layers: integer, the number of LSTM layers in the decoder 180 | size: integer, the number of LSTM units in each layer 181 | zoneout: the zoneout factor 182 | """ 183 | super(DecoderRNN, self).__init__() 184 | self.is_training = is_training 185 | 186 | self.layers = layers 187 | self.size = size 188 | self.zoneout = zoneout 189 | self.scope = 'decoder_rnn' if scope is None else scope 190 | 191 | #Create a set of LSTM layers 192 | self.rnn_layers = [ZoneoutLSTMCell(size, is_training, 193 | zoneout_factor_cell=zoneout, 194 | zoneout_factor_output=zoneout) for i in range(layers)] 195 | 196 | self._cell = tf.contrib.rnn.MultiRNNCell(self.rnn_layers, state_is_tuple=True) 197 | 198 | def __call__(self, inputs, states): 199 | with tf.variable_scope(self.scope): 200 | return self._cell(inputs, states) 201 | 202 | 203 | class FrameProjection: 204 | """Projection layer to r * num_mels dimensions or num_mels dimensions 205 | """ 206 | def __init__(self, shape=80, activation=None, scope=None): 207 | """ 208 | Args: 209 | shape: integer, dimensionality of output space (r*n_mels for decoder or n_mels for postnet) 210 | activation: callable, activation function 211 | scope: FrameProjection scope. 212 | """ 213 | super(FrameProjection, self).__init__() 214 | 215 | self.shape = shape 216 | self.activation = activation 217 | 218 | self.scope = 'Linear_projection' if scope is None else scope 219 | 220 | def __call__(self, inputs): 221 | with tf.variable_scope(self.scope): 222 | #If activation==None, this returns a simple Linear projection 223 | #else the projection will be passed through an activation function 224 | output = tf.layers.dense(inputs, units=self.shape, activation=self.activation, 225 | name='projection_{}'.format(self.scope)) 226 | 227 | return output 228 | 229 | 230 | class StopProjection: 231 | """Projection to a scalar and through a sigmoid activation 232 | """ 233 | def __init__(self, is_training, shape=hparams.outputs_per_step, activation=tf.nn.sigmoid, scope=None): 234 | """ 235 | Args: 236 | is_training: Boolean, to control the use of sigmoid function as it is useless to use it 237 | during training since it is integrate inside the sigmoid_crossentropy loss 238 | shape: integer, dimensionality of output space. Defaults to 1 (scalar) 239 | activation: callable, activation function. only used during inference 240 | scope: StopProjection scope. 
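		Note: the constructor default is shape=hparams.outputs_per_step, i.e. one
			stop-token logit per frame of a reduction group, not a single scalar.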
241 | """ 242 | super(StopProjection, self).__init__() 243 | self.is_training = is_training 244 | 245 | self.shape = shape 246 | self.activation = activation 247 | self.scope = 'stop_token_projection' if scope is None else scope 248 | 249 | def __call__(self, inputs): 250 | with tf.variable_scope(self.scope): 251 | output = tf.layers.dense(inputs, units=self.shape, 252 | activation=None, name='projection_{}'.format(self.scope)) 253 | 254 | #During training, don't use activation as it is integrated inside the sigmoid_cross_entropy loss function 255 | if self.is_training: 256 | return output 257 | return self.activation(output) 258 | 259 | 260 | class Postnet: 261 | """Postnet that takes final decoder output and fine tunes it (using vision on past and future frames) 262 | """ 263 | def __init__(self, is_training, kernel_size=(5, ), channels=512, activation=tf.nn.tanh, scope=None): 264 | """ 265 | Args: 266 | is_training: Boolean, determines if the model is training or in inference to control dropout 267 | kernel_size: tuple or integer, The size of convolution kernels 268 | channels: integer, number of convolutional kernels 269 | activation: callable, postnet activation function for each convolutional layer 270 | scope: Postnet scope. 271 | """ 272 | super(Postnet, self).__init__() 273 | self.is_training = is_training 274 | 275 | self.kernel_size = kernel_size 276 | self.channels = channels 277 | self.activation = activation 278 | self.scope = 'postnet_convolutions' if scope is None else scope 279 | 280 | def __call__(self, inputs): 281 | with tf.variable_scope(self.scope): 282 | x = inputs 283 | for i in range(hparams.postnet_num_layers - 1): 284 | x = conv1d(x, self.kernel_size, self.channels, self.activation, 285 | self.is_training, 'conv_layer_{}_'.format(i + 1)+self.scope) 286 | x = conv1d(x, self.kernel_size, self.channels, lambda _: _, self.is_training, 'conv_layer_{}_'.format(5)+self.scope) 287 | return x -------------------------------------------------------------------------------- /tacotron/models/tacotron.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tacotron.utils.symbols import symbols 3 | from tacotron.utils.infolog import log 4 | from tacotron.models.helpers import TacoTrainingHelper, TacoTestHelper 5 | from tacotron.models.modules import * 6 | from tacotron.models.zoneout_LSTM import ZoneoutLSTMCell 7 | from tensorflow.contrib.seq2seq import dynamic_decode 8 | from tacotron.models.Architecture_wrappers import TacotronEncoderCell, TacotronDecoderCell 9 | from tacotron.models.custom_decoder import CustomDecoder 10 | from tacotron.models.attention import LocationSensitiveAttention 11 | from tacotron.utils.util import shape_list, vae_weight 12 | 13 | 14 | class Tacotron(): 15 | """vae_tacotron2 Feature prediction Model. 16 | """ 17 | def __init__(self, hparams): 18 | self._hparams = hparams 19 | 20 | def initialize(self, inputs, input_lengths, mel_targets=None, mel_lengths=None, stop_token_targets=None, linear_targets=None, gta=False, reference_mel=None): 21 | """ 22 | Initializes the model for inference 23 | 24 | sets "mel_outputs" and "alignments" fields. 25 | 26 | Args: 27 | - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of 28 | steps in the input time series, and values are character IDs 29 | - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths 30 | of each sequence in inputs. 
31 | - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number 32 | of steps in the output time series, M is num_mels, and values are entries in the mel 33 | spectrogram. Only needed for training. 34 | """ 35 | if mel_targets is None and stop_token_targets is not None: 36 | raise ValueError('no mel targets were provided but token_targets were given') 37 | if mel_targets is not None and stop_token_targets is None and not gta: 38 | raise ValueError('Mel targets are provided without corresponding token_targets') 39 | if gta==False and self._hparams.predict_linear==True and linear_targets is None: 40 | raise ValueError('Model is set to use post processing to predict linear spectrograms in training but no linear targets given!') 41 | if gta and linear_targets is not None: 42 | raise ValueError('Linear spectrogram prediction is not supported in GTA mode!') 43 | 44 | with tf.variable_scope('inference') as scope: 45 | is_training = mel_targets is not None and not gta 46 | batch_size = tf.shape(inputs)[0] 47 | hp = self._hparams 48 | #GTA is only used for predicting mels to train Wavenet vocoder, so we ommit post processing when doing GTA synthesis 49 | post_condition = hp.predict_linear and not gta 50 | 51 | # Embeddings ==> [batch_size, sequence_length, embedding_dim] 52 | embedding_table = tf.get_variable( 53 | 'inputs_embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32) 54 | embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs) 55 | 56 | 57 | #Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units] 58 | encoder_cell = TacotronEncoderCell( 59 | EncoderConvolutions(is_training, kernel_size=hp.enc_conv_kernel_size, 60 | channels=hp.enc_conv_channels, scope='encoder_convolutions'), 61 | EncoderRNN(is_training, size=hp.encoder_lstm_units, 62 | zoneout=hp.tacotron_zoneout_rate, scope='encoder_LSTM')) 63 | 64 | encoder_outputs = encoder_cell(embedded_inputs, input_lengths) 65 | if hp.use_vae: 66 | if is_training: 67 | reference_mel = mel_targets 68 | 69 | style_embeddings, mu, log_var = VAE( 70 | inputs=reference_mel, 71 | input_lengths=mel_lengths, 72 | filters=hp.filters, 73 | kernel_size=(3, 3), 74 | strides=(2, 2), 75 | num_units=hp.vae_dim, 76 | is_training=is_training, 77 | scope='vae') 78 | 79 | self.mu = mu 80 | self.log_var = log_var 81 | style_embeddings = tf.layers.dense(style_embeddings, hp.encoder_depth) 82 | style_embeddings = tf.expand_dims(style_embeddings, axis=1) 83 | style_embeddings = tf.tile(style_embeddings, [1, shape_list(encoder_outputs)[1], 1]) # [N, T_in, 256] 84 | encoder_outputs = encoder_outputs + style_embeddings 85 | 86 | #For shape visualization purpose 87 | enc_conv_output_shape = encoder_cell.conv_output_shape 88 | 89 | 90 | #Decoder Parts 91 | #Attention Decoder Prenet 92 | prenet = Prenet(is_training, layer_sizes=hp.prenet_layers, scope='decoder_prenet') 93 | #Attention Mechanism 94 | attention_mechanism = LocationSensitiveAttention(hp.attention_dim, encoder_outputs, 95 | mask_encoder=hp.mask_encoder, memory_sequence_length=input_lengths, smoothing=hp.smoothing, 96 | cumulate_weights=hp.cumulative_weights) 97 | #Decoder LSTM Cells 98 | decoder_lstm = DecoderRNN(is_training, layers=hp.decoder_layers, 99 | size=hp.decoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope='decoder_lstm') 100 | #Frames Projection layer 101 | frame_projection = FrameProjection(hp.num_mels * hp.outputs_per_step, scope='linear_transform') 102 | # projection layer 103 | stop_projection = StopProjection(is_training, 
scope='stop_token_projection') 104 | 105 | 106 | #Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding) 107 | decoder_cell = TacotronDecoderCell( 108 | prenet, 109 | attention_mechanism, 110 | decoder_lstm, 111 | frame_projection, 112 | stop_projection, 113 | mask_finished=hp.mask_finished) 114 | 115 | 116 | #Define the helper for our decoder 117 | if (is_training or gta) == True: 118 | self.helper = TacoTrainingHelper(batch_size, mel_targets, stop_token_targets, 119 | hp.num_mels, hp.outputs_per_step, hp.tacotron_teacher_forcing_ratio, gta) 120 | else: 121 | self.helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step) 122 | 123 | 124 | #initial decoder state 125 | decoder_init_state = decoder_cell.zero_state(batch_size=batch_size, dtype=tf.float32) 126 | 127 | #Only use max iterations at synthesis time 128 | max_iters = hp.max_iters if not is_training else None 129 | 130 | #Decode 131 | (frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode( 132 | CustomDecoder(decoder_cell, self.helper, decoder_init_state), 133 | impute_finished=hp.impute_finished, 134 | maximum_iterations=max_iters) 135 | 136 | 137 | # Reshape outputs to be one output per entry 138 | #==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels] 139 | decoder_output = tf.reshape(frames_prediction, [batch_size, -1, hp.num_mels]) 140 | stop_token_prediction = tf.reshape(stop_token_prediction, [batch_size, -1]) 141 | 142 | 143 | #Postnet 144 | postnet = Postnet(is_training, kernel_size=hp.postnet_kernel_size, 145 | channels=hp.postnet_channels, scope='postnet_convolutions') 146 | 147 | #Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels] 148 | residual = postnet(decoder_output) 149 | 150 | #Project residual to same dimension as mel spectrogram 151 | #==> [batch_size, decoder_steps * r, num_mels] 152 | residual_projection = FrameProjection(hp.num_mels, scope='postnet_projection') 153 | projected_residual = residual_projection(residual) 154 | 155 | 156 | #Compute the mel spectrogram 157 | mel_outputs = decoder_output + projected_residual 158 | 159 | 160 | if post_condition: 161 | #Based on https://github.com/keithito/tacotron/blob/tacotron2-work-in-progress/models/tacotron.py 162 | #Post-processing Network to map mels to linear spectrograms using same architecture as the encoder 163 | post_processing_cell = TacotronEncoderCell( 164 | EncoderConvolutions(is_training, kernel_size=hp.enc_conv_kernel_size, 165 | channels=hp.enc_conv_channels, scope='post_processing_convolutions'), 166 | EncoderRNN(is_training, size=hp.encoder_lstm_units, 167 | zoneout=hp.tacotron_zoneout_rate, scope='post_processing_LSTM')) 168 | 169 | expand_outputs = post_processing_cell(mel_outputs) 170 | linear_outputs = FrameProjection(hp.num_freq, scope='post_processing_projection')(expand_outputs) 171 | 172 | #Grab alignments from the final decoder state 173 | alignments = tf.transpose(final_decoder_state.alignment_history.stack(), [1, 2, 0]) 174 | 175 | self.inputs = inputs 176 | self.input_lengths = input_lengths 177 | self.decoder_output = decoder_output 178 | self.alignments = alignments 179 | self.stop_token_prediction = stop_token_prediction 180 | self.stop_token_targets = stop_token_targets 181 | self.mel_outputs = mel_outputs 182 | self.reference_mel = reference_mel 183 | if post_condition: 184 | self.linear_outputs = linear_outputs 185 | self.linear_targets = linear_targets 186 | self.mel_targets = mel_targets 187 | self.mel_lengths = 
mel_lengths 188 | log('Initialized Tacotron model. Dimensions (? = dynamic shape): ') 189 | log(' embedding: {}'.format(embedded_inputs.shape)) 190 | log(' enc conv out: {}'.format(enc_conv_output_shape)) 191 | log(' encoder out: {}'.format(encoder_outputs.shape)) 192 | log(' decoder out: {}'.format(decoder_output.shape)) 193 | log(' residual out: {}'.format(residual.shape)) 194 | log(' projected residual out: {}'.format(projected_residual.shape)) 195 | log(' mel out: {}'.format(mel_outputs.shape)) 196 | if post_condition: 197 | log(' linear out: {}'.format(linear_outputs.shape)) 198 | log(' out: {}'.format(stop_token_prediction.shape)) 199 | 200 | 201 | def add_loss(self, global_step): 202 | '''Adds loss to the model. Sets "loss" field. initialize must have been called.''' 203 | with tf.variable_scope('loss') as scope: 204 | hp = self._hparams 205 | 206 | # Compute loss of predictions before postnet 207 | before = tf.losses.mean_squared_error(self.mel_targets, self.decoder_output) 208 | # Compute loss after postnet 209 | after = tf.losses.mean_squared_error(self.mel_targets, self.mel_outputs) 210 | #Compute loss (for learning dynamic generation stop) 211 | stop_token_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits( 212 | labels=self.stop_token_targets, 213 | logits=self.stop_token_prediction)) 214 | 215 | if hp.predict_linear: 216 | #Compute linear loss 217 | #From https://github.com/keithito/tacotron/blob/tacotron2-work-in-progress/models/tacotron.py 218 | #Prioritize loss for frequencies under 2000 Hz. 219 | l1 = tf.abs(self.linear_targets - self.linear_outputs) 220 | n_priority_freq = int(2000 / (hp.sample_rate * 0.5) * hp.num_mels) 221 | linear_loss = 0.5 * tf.reduce_mean(l1) + 0.5 * tf.reduce_mean(l1[:,:,0:n_priority_freq]) 222 | else: 223 | linear_loss = 0. 224 | 225 | # Compute the regularization weight 226 | if hp.tacotron_scale_regularization: 227 | reg_weight_scaler = 1. / (2 * hp.max_abs_value) if hp.symmetric_mels else 1. / (hp.max_abs_value) 228 | reg_weight = hp.tacotron_reg_weight * reg_weight_scaler 229 | else: 230 | reg_weight = hp.tacotron_reg_weight 231 | 232 | # Get all trainable variables 233 | all_vars = tf.trainable_variables() 234 | regularization = tf.add_n([tf.nn.l2_loss(v) for v in all_vars 235 | if not('bias' in v.name or 'Bias' in v.name)]) * reg_weight 236 | 237 | # Compute final loss term 238 | self.before_loss = before 239 | self.after_loss = after 240 | self.stop_token_loss = stop_token_loss 241 | self.regularization_loss = regularization 242 | self.linear_loss = linear_loss 243 | 244 | self.loss = self.before_loss + self.after_loss + self.stop_token_loss + self.regularization_loss + self.linear_loss 245 | 246 | if hp.use_vae: 247 | self.ki_loss = -0.5 * tf.reduce_sum(1 + self.log_var - tf.pow(self.mu, 2) - tf.exp(self.log_var)) 248 | vae_loss_weight = vae_weight(global_step) 249 | self.loss += self.ki_loss * vae_loss_weight 250 | 251 | 252 | def add_optimizer(self, global_step): 253 | '''Adds optimizer. Sets "gradients" and "optimize" fields. add_loss must have been called. 
254 | 255 | Args: 256 | global_step: int32 scalar Tensor representing current global step in training 257 | ''' 258 | with tf.variable_scope('optimizer') as scope: 259 | hp = self._hparams 260 | if hp.tacotron_decay_learning_rate: 261 | self.decay_steps = hp.tacotron_decay_steps 262 | self.decay_rate = hp.tacotron_decay_rate 263 | self.learning_rate = self._learning_rate_decay(hp.tacotron_initial_learning_rate, global_step) 264 | else: 265 | self.learning_rate = tf.convert_to_tensor(hp.tacotron_initial_learning_rate) 266 | 267 | optimizer = tf.train.AdamOptimizer(self.learning_rate, hp.tacotron_adam_beta1, 268 | hp.tacotron_adam_beta2, hp.tacotron_adam_epsilon) 269 | gradients, variables = zip(*optimizer.compute_gradients(self.loss)) 270 | self.gradients = gradients 271 | #Just for causion 272 | #https://github.com/Rayhane-mamah/Tacotron-2/issues/11 273 | clipped_gradients, _ = tf.clip_by_global_norm(gradients, 0.5) 274 | 275 | # Add dependency on UPDATE_OPS; otherwise batchnorm won't work correctly. See: 276 | # https://github.com/tensorflow/tensorflow/issues/1122 277 | with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)): 278 | self.optimize = optimizer.apply_gradients(zip(clipped_gradients, variables), 279 | global_step=global_step) 280 | 281 | def _learning_rate_decay(self, init_lr, global_step): 282 | ################################################################# 283 | # Narrow Exponential Decay: 284 | 285 | # Phase 1: lr = 1e-3 286 | # We only start learning rate decay after 50k steps 287 | 288 | # Phase 2: lr in ]1e-3, 1e-5[ 289 | # decay reach minimal value at step 300k 290 | 291 | # Phase 3: lr = 1e-5 292 | # clip by minimal learning rate value (step > 300k) 293 | ################################################################# 294 | hp = self._hparams 295 | 296 | #Compute natural exponential decay 297 | lr = tf.train.exponential_decay(init_lr, 298 | global_step - hp.tacotron_start_decay, #lr = 1e-3 at step 50k 299 | self.decay_steps, 300 | self.decay_rate, #lr = 1e-5 around step 300k 301 | name='exponential_decay') 302 | 303 | 304 | #clip learning rate by max and min values (initial and final values) 305 | return tf.minimum(tf.maximum(lr, hp.tacotron_final_learning_rate), init_lr) 306 | -------------------------------------------------------------------------------- /tacotron/models/zoneout_LSTM.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from tensorflow.python.ops.rnn_cell import RNNCell 4 | 5 | 6 | # Thanks to 'initializers_enhanced.py' of Project RNN Enhancement: 7 | # https://github.com/nicolas-ivanov/Seq2Seq_Upgrade_TensorFlow/blob/master/rnn_enhancement/initializers_enhanced.py 8 | def orthogonal_initializer(scale=1.0): 9 | def _initializer(shape, dtype=tf.float32): 10 | flat_shape = (shape[0], np.prod(shape[1:])) 11 | a = np.random.normal(0.0, 1.0, flat_shape) 12 | u, _, v = np.linalg.svd(a, full_matrices=False) 13 | q = u if u.shape == flat_shape else v 14 | q = q.reshape(shape) 15 | return tf.constant(scale * q[:shape[0], :shape[1]], dtype=tf.float32) 16 | return _initializer 17 | 18 | 19 | class ZoneoutLSTMCell(RNNCell): 20 | """Zoneout Regularization for LSTM-RNN. 
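	Zoneout (Krueger et al., 2016, https://arxiv.org/abs/1606.01305) stochastically
	preserves the previous cell/hidden values instead of dropping activations:
		c_t = d_c * c_{t-1} + (1 - d_c) * c_t_candidate,  d_c ~ Bernoulli(zoneout_factor_cell)
	and likewise for h_t with zoneout_factor_output, matching the binary masks built
	in __call__ below.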
21 | """ 22 | 23 | def __init__(self, num_units, is_training, input_size=None, 24 | use_peepholes=False, cell_clip=None, 25 | #initializer=orthogonal_initializer(), 26 | initializer=tf.contrib.layers.xavier_initializer(), 27 | num_proj=None, proj_clip=None, ext_proj=None, 28 | forget_bias=1.0, 29 | state_is_tuple=True, 30 | activation=tf.tanh, 31 | zoneout_factor_cell=0.0, 32 | zoneout_factor_output=0.0, 33 | reuse=None): 34 | """Initialize the parameters for an LSTM cell. 35 | Args: 36 | num_units: int, The number of units in the LSTM cell. 37 | is_training: bool, set True when training. 38 | use_peepholes: bool, set True to enable diagonal/peephole 39 | connections. 40 | cell_clip: (optional) A float value, if provided the cell state 41 | is clipped by this value prior to the cell output activation. 42 | initializer: (optional) The initializer to use for the weight 43 | matrices. 44 | num_proj: (optional) int, The output dimensionality for 45 | the projection matrices. If None, no projection is performed. 46 | forget_bias: Biases of the forget gate are initialized by default 47 | to 1 in order to reduce the scale of forgetting at the beginning of 48 | the training. 49 | activation: Activation function of the inner states. 50 | """ 51 | if not state_is_tuple: 52 | tf.logging.warn( 53 | "%s: Using a concatenated state is slower and will soon be " 54 | "deprecated. Use state_is_tuple=True.", self) 55 | if input_size is not None: 56 | tf.logging.warn( 57 | "%s: The input_size parameter is deprecated.", self) 58 | 59 | if not (zoneout_factor_cell >= 0.0 and zoneout_factor_cell <= 1.0): 60 | raise ValueError( 61 | "Parameter zoneout_factor_cell must be in [0 1]") 62 | 63 | if not (zoneout_factor_output >= 0.0 and zoneout_factor_output <= 1.0): 64 | raise ValueError( 65 | "Parameter zoneout_factor_cell must be in [0 1]") 66 | 67 | self.num_units = num_units 68 | self.is_training = is_training 69 | self.use_peepholes = use_peepholes 70 | self.cell_clip = cell_clip 71 | self.num_proj = num_proj 72 | self.proj_clip = proj_clip 73 | self.initializer = initializer 74 | self.forget_bias = forget_bias 75 | self.state_is_tuple = state_is_tuple 76 | self.activation = activation 77 | self.zoneout_factor_cell = zoneout_factor_cell 78 | self.zoneout_factor_output = zoneout_factor_output 79 | 80 | if num_proj: 81 | self._state_size = ( 82 | tf.nn.rnn_cell.LSTMStateTuple(num_units, num_proj) 83 | if state_is_tuple else num_units + num_proj) 84 | self._output_size = num_proj 85 | else: 86 | self._state_size = ( 87 | tf.nn.rnn_cell.LSTMStateTuple(num_units, num_units) 88 | if state_is_tuple else 2 * num_units) 89 | self._output_size = num_units 90 | 91 | self._ext_proj = ext_proj 92 | 93 | @property 94 | def state_size(self): 95 | return self._state_size 96 | 97 | @property 98 | def output_size(self): 99 | if self._ext_proj is None: 100 | return self._output_size 101 | return self._ext_proj 102 | 103 | def __call__(self, inputs, state, scope=None): 104 | 105 | num_proj = self.num_units if self.num_proj is None else self.num_proj 106 | 107 | if self.state_is_tuple: 108 | (c_prev, h_prev) = state 109 | else: 110 | c_prev = tf.slice(state, [0, 0], [-1, self.num_units]) 111 | h_prev = tf.slice(state, [0, self.num_units], [-1, num_proj]) 112 | 113 | # c_prev : Tensor with the size of [batch_size, state_size] 114 | # h_prev : Tensor with the size of [batch_size, state_size/2] 115 | 116 | dtype = inputs.dtype 117 | input_size = inputs.get_shape().with_rank(2)[1] 118 | 119 | with tf.variable_scope(scope or 
type(self).__name__): 120 | if input_size.value is None: 121 | raise ValueError( 122 | "Could not infer input size from inputs.get_shape()[-1]") 123 | 124 | # i = input_gate, j = new_input, f = forget_gate, o = output_gate 125 | lstm_matrix = _linear([inputs, h_prev], 4 * self.num_units, True) 126 | i, j, f, o = tf.split(lstm_matrix, 4, 1) 127 | 128 | # diagonal connections 129 | if self.use_peepholes: 130 | w_f_diag = tf.get_variable( 131 | "W_F_diag", shape=[self.num_units], dtype=dtype) 132 | w_i_diag = tf.get_variable( 133 | "W_I_diag", shape=[self.num_units], dtype=dtype) 134 | w_o_diag = tf.get_variable( 135 | "W_O_diag", shape=[self.num_units], dtype=dtype) 136 | 137 | with tf.name_scope(None, "zoneout"): 138 | # make binary mask tensor for cell 139 | keep_prob_cell = tf.convert_to_tensor( 140 | self.zoneout_factor_cell, 141 | dtype=c_prev.dtype 142 | ) 143 | random_tensor_cell = keep_prob_cell 144 | random_tensor_cell += \ 145 | tf.random_uniform(tf.shape(c_prev), 146 | seed=None, dtype=c_prev.dtype) 147 | binary_mask_cell = tf.floor(random_tensor_cell) 148 | # 0 <-> 1 swap 149 | binary_mask_cell_complement = tf.ones(tf.shape(c_prev)) \ 150 | - binary_mask_cell 151 | 152 | # make binary mask tensor for output 153 | keep_prob_output = tf.convert_to_tensor( 154 | self.zoneout_factor_output, 155 | dtype=h_prev.dtype 156 | ) 157 | random_tensor_output = keep_prob_output 158 | random_tensor_output += \ 159 | tf.random_uniform(tf.shape(h_prev), 160 | seed=None, dtype=h_prev.dtype) 161 | binary_mask_output = tf.floor(random_tensor_output) 162 | # 0 <-> 1 swap 163 | binary_mask_output_complement = tf.ones(tf.shape(h_prev)) \ 164 | - binary_mask_output 165 | 166 | # apply zoneout for cell 167 | if self.use_peepholes: 168 | c_temp = c_prev * \ 169 | tf.sigmoid(f + self.forget_bias + 170 | w_f_diag * c_prev) + \ 171 | tf.sigmoid(i + w_i_diag * c_prev) * \ 172 | self.activation(j) 173 | if self.is_training and self.zoneout_factor_cell > 0.0: 174 | c = binary_mask_cell * c_prev + \ 175 | binary_mask_cell_complement * c_temp 176 | else: 177 | c = c_temp 178 | else: 179 | c_temp = c_prev * tf.sigmoid(f + self.forget_bias) + \ 180 | tf.sigmoid(i) * self.activation(j) 181 | if self.is_training and self.zoneout_factor_cell > 0.0: 182 | c = binary_mask_cell * c_prev + \ 183 | binary_mask_cell_complement * c_temp 184 | else: 185 | c = c_temp 186 | 187 | if self.cell_clip is not None: 188 | c = tf.clip_by_value(c, -self.cell_clip, self.cell_clip) 189 | 190 | # apply zoneout for output 191 | if self.use_peepholes: 192 | h_temp = tf.sigmoid(o + w_o_diag * c) * self.activation(c) 193 | if self.is_training and self.zoneout_factor_output > 0.0: 194 | h = binary_mask_output * h_prev + \ 195 | binary_mask_output_complement * h_temp 196 | else: 197 | h = h_temp 198 | else: 199 | h_temp = tf.sigmoid(o) * self.activation(c) 200 | if self.is_training and self.zoneout_factor_output > 0.0: 201 | h = binary_mask_output * h_prev + \ 202 | binary_mask_output_complement * h_temp 203 | else: 204 | h = h_temp 205 | 206 | # apply prejection 207 | if self.num_proj is not None: 208 | w_proj = tf.get_variable( 209 | "W_P", [self.num_units, num_proj], dtype=dtype) 210 | 211 | h = tf.matmul(h, w_proj) 212 | if self.proj_clip is not None: 213 | h = tf.clip_by_value(h, -self.proj_clip, self.proj_clip) 214 | 215 | new_state = (tf.nn.rnn_cell.LSTMStateTuple(c, h) 216 | if self.state_is_tuple else tf.concat(1, [c, h])) 217 | 218 | return h, new_state 219 | 220 | 221 | def _linear(args, output_size, bias, bias_start=0.0, 
scope=None): 222 | """Linear map: sum_i(args[i] * W[i]), where W[i] is a variable. 223 | Args: 224 | args: a 2D Tensor or a list of 2D, batch x n, Tensors. 225 | output_size: int, second dimension of W[i]. 226 | bias: boolean, whether to add a bias term or not. 227 | bias_start: starting value to initialize the bias; 0 by default. 228 | scope: VariableScope for the created subgraph; defaults to "Linear". 229 | Returns: 230 | A 2D Tensor with shape [batch x output_size] equal to 231 | sum_i(args[i] * W[i]), where W[i]s are newly created matrices. 232 | Raises: 233 | ValueError: if some of the arguments has unspecified or wrong shape. 234 | """ 235 | if args is None or (isinstance(args, (list, tuple)) and not args): 236 | raise ValueError("`args` must be specified") 237 | if not isinstance(args, (list, tuple)): 238 | args = [args] 239 | 240 | # Calculate the total size of arguments on dimension 1. 241 | total_arg_size = 0 242 | shapes = [a.get_shape().as_list() for a in args] 243 | for shape in shapes: 244 | if len(shape) != 2: 245 | raise ValueError( 246 | "Linear is expecting 2D arguments: %s" % str(shapes)) 247 | if not shape[1]: 248 | raise ValueError( 249 | "Linear expects shape[1] of arguments: %s" % str(shapes)) 250 | else: 251 | total_arg_size += shape[1] 252 | 253 | # Now the computation. 254 | with tf.variable_scope(scope or "Linear"): 255 | matrix = tf.get_variable("Matrix", [total_arg_size, output_size]) 256 | if len(args) == 1: 257 | res = tf.matmul(args[0], matrix) 258 | else: 259 | res = tf.matmul(tf.concat(args, 1), matrix) 260 | if not bias: 261 | return res 262 | bias_term = tf.get_variable( 263 | "Bias", [output_size], 264 | initializer=tf.constant_initializer(bias_start)) 265 | return res + bias_term -------------------------------------------------------------------------------- /tacotron/synthesize.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import re 4 | from hparams import hparams, hparams_debug_string 5 | from tacotron.synthesizer import Synthesizer 6 | import tensorflow as tf 7 | import time 8 | from tqdm import tqdm 9 | from tacotron.utils.audio import load_wav, melspectrogram 10 | 11 | def run_eval(args, checkpoint_path, output_dir): 12 | print(hparams_debug_string()) 13 | synth = Synthesizer() 14 | synth.load(checkpoint_path) 15 | eval_dir = os.path.join(output_dir, 'eval') 16 | log_dir = os.path.join(output_dir, 'logs-eval') 17 | wav = load_wav(args.reference_audio) 18 | reference_mel = melspectrogram(wav).transpose() 19 | #Create output path if it doesn't exist 20 | os.makedirs(eval_dir, exist_ok=True) 21 | os.makedirs(log_dir, exist_ok=True) 22 | os.makedirs(os.path.join(log_dir, 'wavs'), exist_ok=True) 23 | os.makedirs(os.path.join(log_dir, 'plots'), exist_ok=True) 24 | 25 | with open(os.path.join(eval_dir, 'map.txt'), 'w') as file: 26 | for i, text in enumerate(tqdm(hparams.sentences)): 27 | start = time.time() 28 | mel_filename = synth.synthesize(text, i+1, eval_dir, log_dir, None, reference_mel) 29 | 30 | file.write('{}|{}\n'.format(text, mel_filename)) 31 | print('synthesized mel spectrograms at {}'.format(eval_dir)) 32 | 33 | def run_synthesis(args, checkpoint_path, output_dir): 34 | metadata_filename = os.path.join(args.input_dir, 'train.txt') 35 | print(hparams_debug_string()) 36 | synth = Synthesizer() 37 | synth.load(checkpoint_path, gta=args.GTA) 38 | with open(metadata_filename, encoding='utf-8') as f: 39 | metadata = [line.strip().split('|') for line in f] 40 | 
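	#Each train.txt row is '|'-separated; as used below, field 0 is the audio filename,
	#field 1 the mel filename, field 4 the mel frame count and field 5 the input text.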
frame_shift_ms = hparams.hop_size / hparams.sample_rate 41 | hours = sum([int(x[4]) for x in metadata]) * frame_shift_ms / (3600) 42 | print('Loaded metadata for {} examples ({:.2f} hours)'.format(len(metadata), hours)) 43 | 44 | if args.GTA==True: 45 | synth_dir = os.path.join(output_dir, 'gta') 46 | else: 47 | synth_dir = os.path.join(output_dir, 'natural') 48 | 49 | #Create output path if it doesn't exist 50 | os.makedirs(synth_dir, exist_ok=True) 51 | 52 | print('starting synthesis') 53 | mel_dir = os.path.join(args.input_dir, 'mels') 54 | wav_dir = os.path.join(args.input_dir, 'audio') 55 | with open(os.path.join(synth_dir, 'map.txt'), 'w') as file: 56 | for i, meta in enumerate(tqdm(metadata)): 57 | text = meta[5] 58 | mel_filename = os.path.join(mel_dir, meta[1]) 59 | wav_filename = os.path.join(wav_dir, meta[0]) 60 | mel_output_filename = synth.synthesize(text, i+1, synth_dir, None, mel_filename, None) 61 | 62 | file.write('{}|{}|{}|{}\n'.format(text, mel_filename, mel_output_filename, wav_filename)) 63 | print('synthesized mel spectrograms at {}'.format(synth_dir)) 64 | 65 | def tacotron_synthesize(args): 66 | hparams.parse(args.hparams) 67 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 68 | output_dir = 'tacotron_' + args.output_dir 69 | 70 | try: 71 | checkpoint_path = tf.train.get_checkpoint_state(args.checkpoint).model_checkpoint_path 72 | print('loaded model at {}'.format(checkpoint_path)) 73 | except: 74 | raise AssertionError('Cannot restore checkpoint: {}, did you train a model?'.format(args.checkpoint)) 75 | 76 | if args.mode == 'eval': 77 | run_eval(args, checkpoint_path, output_dir) 78 | else: 79 | run_synthesis(args, checkpoint_path, output_dir) 80 | -------------------------------------------------------------------------------- /tacotron/synthesizer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import tensorflow as tf 4 | from hparams import hparams 5 | from librosa import effects 6 | from tacotron.models import create_model 7 | from tacotron.utils.text import text_to_sequence 8 | from tacotron.utils import plot 9 | from datasets import audio 10 | from datetime import datetime 11 | 12 | 13 | class Synthesizer: 14 | def load(self, checkpoint_path, gta=False, model_name='Tacotron'): 15 | print('Constructing model: %s' % model_name) 16 | inputs = tf.placeholder(tf.int32, [1, None], 'inputs') 17 | input_lengths = tf.placeholder(tf.int32, [1], 'input_lengths') 18 | 19 | with tf.variable_scope('model') as scope: 20 | self.model = create_model(model_name, hparams) 21 | if hparams.use_vae: 22 | ref_targets = tf.placeholder(tf.float32, [1, None, hparams.num_mels], 'ref_targets') 23 | if gta: 24 | targets = tf.placeholder(tf.float32, [1, None, hparams.num_mels], 'mel_targets') 25 | 26 | if hparams.use_vae: 27 | self.model.initialize(inputs, input_lengths, targets, gta=gta, reference_mel=ref_targets) 28 | else: 29 | self.model.initialize(inputs, input_lengths, targets, gta=gta) 30 | else: 31 | if hparams.use_vae: 32 | self.model.initialize(inputs, input_lengths, reference_mel=ref_targets) 33 | else: 34 | self.model.initialize(inputs, input_lengths) 35 | self.mel_outputs = self.model.mel_outputs 36 | self.alignment = self.model.alignments[0] 37 | 38 | self.gta = gta 39 | print('Loading checkpoint: %s' % checkpoint_path) 40 | self.session = tf.Session() 41 | self.session.run(tf.global_variables_initializer()) 42 | saver = tf.train.Saver() 43 | saver.restore(self.session, checkpoint_path) 44 | 45 | 46 |
def synthesize(self, text, index, out_dir, log_dir, mel_filename, reference_mel): 47 | cleaner_names = [x.strip() for x in hparams.cleaners.split(',')] 48 | seq = text_to_sequence(text, cleaner_names) 49 | feed_dict = { 50 | self.model.inputs: [np.asarray(seq, dtype=np.int32)], 51 | self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32) 52 | } 53 | 54 | if self.gta: 55 | feed_dict[self.model.mel_targets] = np.load(mel_filename).reshape(1, -1, 80) 56 | feed_dict[self.model.reference_mel] = np.load(mel_filename).reshape(1, -1, 80) 57 | elif hparams.use_vae: 58 | reference_mel = [np.asarray(reference_mel, dtype=np.float32)] 59 | feed_dict[self.model.reference_mel] = reference_mel 60 | 61 | 62 | if self.gta or not hparams.predict_linear: 63 | mels, alignment = self.session.run([self.mel_outputs, self.alignment], feed_dict=feed_dict) 64 | 65 | else: 66 | linear, mels, alignment = self.session.run([self.linear_outputs, self.mel_outputs, self.alignment], feed_dict=feed_dict) 67 | linear = linear.reshape(-1, hparams.num_freq) 68 | 69 | mels = mels.reshape(-1, hparams.num_mels) #Thanks to @imdatsolak for pointing this out 70 | 71 | # Write the spectrogram to disk 72 | # Note: outputs mel-spectrogram files and target ones have same names, just different folders 73 | mel_filename = os.path.join(out_dir, 'speech-mel-{:05d}.npy'.format(index)) 74 | np.save(mel_filename, mels, allow_pickle=False) 75 | 76 | if log_dir is not None: 77 | #save wav (mel -> wav) 78 | wav = audio.inv_mel_spectrogram(mels.T) 79 | audio.save_wav(wav, os.path.join(log_dir, 'wavs/speech-wav-{:05d}-mel.wav'.format(index))) 80 | 81 | if hparams.predict_linear: 82 | #save wav (linear -> wav) 83 | wav = audio.inv_linear_spectrogram(linear.T) 84 | audio.save_wav(wav, os.path.join(log_dir, 'wavs/speech-wav-{:05d}-linear.wav'.format(index))) 85 | 86 | #save alignments 87 | plot.plot_alignment(alignment, os.path.join(log_dir, 'plots/speech-alignment-{:05d}.png'.format(index)), 88 | info='{}'.format(text), split_title=True) 89 | 90 | #save mel spectrogram plot 91 | plot.plot_spectrogram(mels, os.path.join(log_dir, 'plots/speech-mel-{:05d}.png'.format(index)), 92 | info='{}'.format(text), split_title=True) 93 | 94 | return mel_filename 95 | -------------------------------------------------------------------------------- /tacotron/train.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from datetime import datetime 3 | import os 4 | import subprocess 5 | import time 6 | import tensorflow as tf 7 | import traceback 8 | import argparse 9 | 10 | from tacotron.feeder import Feeder 11 | from hparams import hparams, hparams_debug_string 12 | from tacotron.models import create_model 13 | from tacotron.utils.text import sequence_to_text 14 | from tacotron.utils import infolog, plot, ValueWindow 15 | from datasets import audio 16 | log = infolog.log 17 | 18 | 19 | def add_stats(model): 20 | with tf.variable_scope('stats') as scope: 21 | tf.summary.histogram('mel_outputs', model.mel_outputs) 22 | tf.summary.histogram('mel_targets', model.mel_targets) 23 | tf.summary.scalar('before_loss', model.before_loss) 24 | tf.summary.scalar('after_loss', model.after_loss) 25 | if hparams.predict_linear: 26 | tf.summary.scalar('linear loss', model.linear_loss) 27 | tf.summary.scalar('regularization_loss', model.regularization_loss) 28 | tf.summary.scalar('stop_token_loss', model.stop_token_loss) 29 | tf.summary.scalar('loss', model.loss) 30 | tf.summary.scalar('learning_rate', 
model.learning_rate) #control learning rate decay speed 31 | # gradient_norms = [tf.norm(grad) for grad in model.gradients] 32 | # tf.summary.histogram('gradient_norm', gradient_norms) 33 | # tf.summary.scalar('max_gradient_norm', tf.reduce_max(gradient_norms)) #visualize gradients (in case of explosion) 34 | if hparams.use_vae: 35 | tf.summary.scalar('ki_loss', model.ki_loss) 36 | return tf.summary.merge_all() 37 | 38 | def time_string(): 39 | return datetime.now().strftime('%Y-%m-%d %H:%M') 40 | 41 | def train(log_dir, args): 42 | save_dir = os.path.join(log_dir, 'pretrained/') 43 | checkpoint_path = os.path.join(save_dir, 'model.ckpt') 44 | input_path = os.path.join(args.base_dir, args.input) 45 | plot_dir = os.path.join(log_dir, 'plots') 46 | wav_dir = os.path.join(log_dir, 'wavs') 47 | mel_dir = os.path.join(log_dir, 'mel-spectrograms') 48 | os.makedirs(plot_dir, exist_ok=True) 49 | os.makedirs(wav_dir, exist_ok=True) 50 | os.makedirs(mel_dir, exist_ok=True) 51 | 52 | if hparams.predict_linear: 53 | linear_dir = os.path.join(log_dir, 'linear-spectrograms') 54 | os.makedirs(linear_dir, exist_ok=True) 55 | 56 | log('Checkpoint path: {}'.format(checkpoint_path)) 57 | log('Loading training data from: {}'.format(input_path)) 58 | log('Using model: {}'.format(args.model)) 59 | log(hparams_debug_string()) 60 | 61 | #Set up data feeder 62 | coord = tf.train.Coordinator() 63 | with tf.variable_scope('datafeeder') as scope: 64 | feeder = Feeder(coord, input_path, hparams) 65 | 66 | #Set up model: 67 | step_count = 0 68 | try: 69 | #simple text file to keep count of global step 70 | with open(os.path.join(log_dir, 'step_counter.txt'), 'r') as file: 71 | step_count = int(file.read()) 72 | except: 73 | print('no step_counter file found, assuming there is no saved checkpoint') 74 | 75 | global_step = tf.Variable(step_count, name='global_step', trainable=False) 76 | with tf.variable_scope('model') as scope: 77 | model = create_model(args.model, hparams) 78 | if hparams.predict_linear: 79 | model.initialize(feeder.inputs, feeder.input_lengths, feeder.mel_targets, feeder.mel_lengths, feeder.token_targets, feeder.linear_targets) 80 | else: 81 | model.initialize(feeder.inputs, feeder.input_lengths, feeder.mel_targets, feeder.mel_lengths, feeder.token_targets) 82 | model.add_loss(global_step) 83 | model.add_optimizer(global_step) 84 | stats = add_stats(model) 85 | 86 | #Book keeping 87 | step = 0 88 | time_window = ValueWindow(100) 89 | loss_window = ValueWindow(100) 90 | saver = tf.train.Saver(max_to_keep=5) 91 | 92 | #Memory allocation on the GPU as needed 93 | config = tf.ConfigProto() 94 | config.gpu_options.allow_growth = True 95 | 96 | #Train 97 | with tf.Session(config=config) as sess: 98 | try: 99 | summary_writer = tf.summary.FileWriter(log_dir, sess.graph) 100 | sess.run(tf.global_variables_initializer()) 101 | 102 | #saved model restoring 103 | if args.restore: 104 | #Restore saved model if the user requested it, Default = True. 
105 | try: 106 | checkpoint_state = tf.train.get_checkpoint_state(save_dir) 107 | except tf.errors.OutOfRangeError as e: 108 | log('Cannot restore checkpoint: {}'.format(e)) 109 | 110 | if (checkpoint_state and checkpoint_state.model_checkpoint_path): 111 | log('Loading checkpoint {}'.format(checkpoint_state.model_checkpoint_path)) 112 | saver.restore(sess, checkpoint_state.model_checkpoint_path) 113 | 114 | else: 115 | if not args.restore: 116 | log('Starting new training!') 117 | else: 118 | log('No model to load at {}'.format(save_dir)) 119 | 120 | #initializing feeder 121 | feeder.start_in_session(sess) 122 | 123 | #Training loop 124 | while not coord.should_stop(): 125 | start_time = time.time() 126 | step, loss, opt = sess.run([global_step, model.loss, model.optimize]) 127 | time_window.append(time.time() - start_time) 128 | loss_window.append(loss) 129 | message = 'Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]'.format( 130 | step, time_window.average, loss, loss_window.average) 131 | log(message, end='\r') 132 | 133 | if loss > 100 or np.isnan(loss): 134 | log('Loss exploded to {:.5f} at step {}'.format(loss, step)) 135 | raise Exception('Loss exploded') 136 | 137 | if step % args.summary_interval == 0: 138 | log('\nWriting summary at step: {}'.format(step)) 139 | summary_writer.add_summary(sess.run(stats), step) 140 | 141 | if step % args.checkpoint_interval == 0: 142 | with open(os.path.join(log_dir,'step_counter.txt'), 'w') as file: 143 | file.write(str(step)) 144 | log('Saving checkpoint to: {}-{}'.format(checkpoint_path, step)) 145 | saver.save(sess, checkpoint_path, global_step=step) 146 | 147 | log('Saving alignment, Mel-Spectrograms and griffin-lim inverted waveform..') 148 | if hparams.predict_linear: 149 | input_seq, mel_prediction, linear_prediction, alignment, target = sess.run([ 150 | model.inputs[0], 151 | model.mel_outputs[0], 152 | model.linear_outputs[0], 153 | model.alignments[0], 154 | model.mel_targets[0], 155 | ]) 156 | 157 | #save predicted linear spectrogram to disk (debug) 158 | linear_filename = 'linear-prediction-step-{}.npy'.format(step) 159 | np.save(os.path.join(linear_dir, linear_filename), linear_prediction.T, allow_pickle=False) 160 | 161 | #save griffin lim inverted wav for debug (linear -> wav) 162 | wav = audio.inv_linear_spectrogram(linear_prediction.T) 163 | audio.save_wav(wav, os.path.join(wav_dir, 'step-{}-waveform-linear.wav'.format(step))) 164 | 165 | else: 166 | input_seq, mel_prediction, alignment, target = sess.run([model.inputs[0], 167 | model.mel_outputs[0], 168 | model.alignments[0], 169 | model.mel_targets[0], 170 | ]) 171 | 172 | #save predicted mel spectrogram to disk (debug) 173 | mel_filename = 'mel-prediction-step-{}.npy'.format(step) 174 | np.save(os.path.join(mel_dir, mel_filename), mel_prediction.T, allow_pickle=False) 175 | 176 | #save griffin lim inverted wav for debug (mel -> wav) 177 | wav = audio.inv_mel_spectrogram(mel_prediction.T) 178 | audio.save_wav(wav, os.path.join(wav_dir, 'step-{}-waveform-mel.wav'.format(step))) 179 | 180 | #save alignment plot to disk (control purposes) 181 | plot.plot_alignment(alignment, os.path.join(plot_dir, 'step-{}-align.png'.format(step)), 182 | info='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, loss)) 183 | #save real mel-spectrogram plot to disk (control purposes) 184 | plot.plot_spectrogram(target, os.path.join(plot_dir, 'step-{}-real-mel-spectrogram.png'.format(step)), 185 | info='{}, {}, step={}, Real'.format(args.model, time_string(), step, 
loss)) 186 | #save predicted mel-spectrogram plot to disk (control purposes) 187 | plot.plot_spectrogram(mel_prediction, os.path.join(plot_dir, 'step-{}-pred-mel-spectrogram.png'.format(step)), 188 | info='{}, {}, step={}, loss={:.5}'.format(args.model, time_string(), step, loss)) 189 | log('Input at step {}: {}'.format(step, sequence_to_text(input_seq))) 190 | 191 | except Exception as e: 192 | log('Exiting due to exception: {}'.format(e), slack=True) 193 | traceback.print_exc() 194 | coord.request_stop(e) 195 | 196 | def tacotron_train(args): 197 | hparams.parse(args.hparams) 198 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = str(args.tf_log_level) 199 | run_name = args.name or args.model 200 | log_dir = os.path.join(args.base_dir, 'logs-{}'.format(run_name)) 201 | os.makedirs(log_dir, exist_ok=True) 202 | infolog.init(os.path.join(log_dir, 'Terminal_train_log'), run_name) 203 | train(log_dir, args) 204 | -------------------------------------------------------------------------------- /tacotron/utils/__init__.py: -------------------------------------------------------------------------------- 1 | class ValueWindow(): 2 | def __init__(self, window_size=100): 3 | self._window_size = window_size 4 | self._values = [] 5 | 6 | def append(self, x): 7 | self._values = self._values[-(self._window_size - 1):] + [x] 8 | 9 | @property 10 | def sum(self): 11 | return sum(self._values) 12 | 13 | @property 14 | def count(self): 15 | return len(self._values) 16 | 17 | @property 18 | def average(self): 19 | return self.sum / max(1, self.count) 20 | 21 | def reset(self): 22 | self._values = [] 23 | -------------------------------------------------------------------------------- /tacotron/utils/audio.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import librosa.filters 3 | import numpy as np 4 | from scipy import signal 5 | from hparams import hparams 6 | import tensorflow as tf 7 | 8 | 9 | def load_wav(path): 10 | return librosa.core.load(path, sr=hparams.sample_rate)[0] 11 | 12 | def save_wav(wav, path): 13 | wav *= 32767 / max(0.01, np.max(np.abs(wav))) 14 | librosa.output.write_wav(path, wav.astype(np.int16), hparams.sample_rate) 15 | 16 | def trim_silence(wav): 17 | '''Trim leading and trailing silence 18 | 19 | Useful for M-AILABS dataset if we choose to trim the extra 0.5 silences. 
20 | ''' 21 | return librosa.effects.trim(wav)[0] 22 | 23 | def preemphasis(x): 24 | return signal.lfilter([1, -hparams.preemphasis], [1], x) 25 | 26 | def inv_preemphasis(x): 27 | return signal.lfilter([1], [1, -hparams.preemphasis], x) 28 | 29 | def get_hop_size(): 30 | hop_size = hparams.hop_size 31 | if hop_size is None: 32 | assert hparams.frame_shift_ms is not None 33 | hop_size = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate) 34 | return hop_size 35 | 36 | def melspectrogram(wav): 37 | D = _stft(wav) 38 | S = _amp_to_db(_linear_to_mel(np.abs(D))) - hparams.ref_level_db 39 | 40 | if hparams.mel_normalization: 41 | return _normalize(S) 42 | return S 43 | 44 | 45 | def inv_mel_spectrogram(mel_spectrogram): 46 | '''Converts mel spectrogram to waveform using librosa''' 47 | if hparams.mel_normalization: 48 | D = _denormalize(mel_spectrogram) 49 | else: 50 | D = mel_spectrogram 51 | 52 | S = _mel_to_linear(_db_to_amp(D + hparams.ref_level_db)) # Convert back to linear 53 | 54 | return _griffin_lim(S ** hparams.power) 55 | 56 | def _griffin_lim(S): 57 | '''librosa implementation of Griffin-Lim 58 | Based on https://github.com/librosa/librosa/issues/434 59 | ''' 60 | angles = np.exp(2j * np.pi * np.random.rand(*S.shape)) 61 | S_complex = np.abs(S).astype(np.complex) 62 | y = _istft(S_complex * angles) 63 | for i in range(hparams.griffin_lim_iters): 64 | angles = np.exp(1j * np.angle(_stft(y))) 65 | y = _istft(S_complex * angles) 66 | return y 67 | 68 | def _stft(y): 69 | return librosa.stft(y=y, n_fft=hparams.fft_size, hop_length=get_hop_size()) 70 | 71 | def _istft(y): 72 | return librosa.istft(y, hop_length=get_hop_size()) 73 | 74 | 75 | # Conversions 76 | _mel_basis = None 77 | _inv_mel_basis = None 78 | 79 | def _linear_to_mel(spectogram): 80 | global _mel_basis 81 | if _mel_basis is None: 82 | _mel_basis = _build_mel_basis() 83 | return np.dot(_mel_basis, spectogram) 84 | 85 | def _mel_to_linear(mel_spectrogram): 86 | global _inv_mel_basis 87 | if _inv_mel_basis is None: 88 | _inv_mel_basis = np.linalg.pinv(_build_mel_basis()) 89 | return np.maximum(1e-10, np.dot(_inv_mel_basis, mel_spectrogram)) 90 | 91 | def _build_mel_basis(): 92 | assert hparams.fmax <= hparams.sample_rate // 2 93 | return librosa.filters.mel(hparams.sample_rate, hparams.fft_size, n_mels=hparams.num_mels, 94 | fmin=hparams.fmin, fmax=hparams.fmax) 95 | 96 | def _amp_to_db(x): 97 | min_level = np.exp(hparams.min_level_db / 20 * np.log(10)) 98 | return 20 * np.log10(np.maximum(min_level, x)) 99 | 100 | def _db_to_amp(x): 101 | return np.power(10.0, (x) * 0.05) 102 | 103 | def _normalize(S): 104 | if hparams.allow_clipping_in_normalization: 105 | if hparams.symmetric_mels: 106 | return np.clip((2 * hparams.max_abs_value) * ((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value, 107 | -hparams.max_abs_value, hparams.max_abs_value) 108 | else: 109 | return np.clip(hparams.max_abs_value * ((S - hparams.min_level_db) / (-hparams.min_level_db)), 0, hparams.max_abs_value) 110 | 111 | assert S.max() <= 0 and S.min() - hparams.min_level_db >= 0 112 | if hparams.symmetric_mels: 113 | return (2 * hparams.max_abs_value) * ((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value 114 | else: 115 | return hparams.max_abs_value * ((S - hparams.min_level_db) / (-hparams.min_level_db)) 116 | 117 | def _denormalize(D): 118 | if hparams.allow_clipping_in_normalization: 119 | if hparams.symmetric_mels: 120 | return (((np.clip(D, -hparams.max_abs_value, 121 | hparams.max_abs_value) + 
hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value)) 122 | + hparams.min_level_db) 123 | else: 124 | return ((np.clip(D, 0, hparams.max_abs_value) * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db) 125 | 126 | if hparams.symmetric_mels: 127 | return (((D + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value)) + hparams.min_level_db) 128 | else: 129 | return ((D * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db) 130 | -------------------------------------------------------------------------------- /tacotron/utils/cleaners.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Cleaners are transformations that run over the input text at both training and eval time. 3 | 4 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" 5 | hyperparameter. Some cleaners are English-specific. You'll typically want to use: 6 | 1. "english_cleaners" for English text 7 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using 8 | the Unidecode library (https://pypi.python.org/pypi/Unidecode) 9 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update 10 | the symbols in symbols.py to match your data). 11 | ''' 12 | 13 | import re 14 | from unidecode import unidecode 15 | from .numbers import normalize_numbers 16 | 17 | 18 | # Regular expression matching whitespace: 19 | _whitespace_re = re.compile(r'\s+') 20 | 21 | # List of (regular expression, replacement) pairs for abbreviations: 22 | _abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [ 23 | ('mrs', 'misess'), 24 | ('mr', 'mister'), 25 | ('dr', 'doctor'), 26 | ('st', 'saint'), 27 | ('co', 'company'), 28 | ('jr', 'junior'), 29 | ('maj', 'major'), 30 | ('gen', 'general'), 31 | ('drs', 'doctors'), 32 | ('rev', 'reverend'), 33 | ('lt', 'lieutenant'), 34 | ('hon', 'honorable'), 35 | ('sgt', 'sergeant'), 36 | ('capt', 'captain'), 37 | ('esq', 'esquire'), 38 | ('ltd', 'limited'), 39 | ('col', 'colonel'), 40 | ('ft', 'fort'), 41 | ]] 42 | 43 | 44 | def expand_abbreviations(text): 45 | for regex, replacement in _abbreviations: 46 | text = re.sub(regex, replacement, text) 47 | return text 48 | 49 | 50 | def expand_numbers(text): 51 | return normalize_numbers(text) 52 | 53 | 54 | def lowercase(text): 55 | '''lowercase input tokens. 
56 | ''' 57 | return text.lower() 58 | 59 | 60 | def collapse_whitespace(text): 61 | return re.sub(_whitespace_re, ' ', text) 62 | 63 | 64 | def convert_to_ascii(text): 65 | return unidecode(text) 66 | 67 | 68 | def basic_cleaners(text): 69 | '''Basic pipeline that lowercases and collapses whitespace without transliteration.''' 70 | text = lowercase(text) 71 | text = collapse_whitespace(text) 72 | return text 73 | 74 | 75 | def transliteration_cleaners(text): 76 | '''Pipeline for non-English text that transliterates to ASCII.''' 77 | text = convert_to_ascii(text) 78 | text = lowercase(text) 79 | text = collapse_whitespace(text) 80 | return text 81 | 82 | 83 | def english_cleaners(text): 84 | '''Pipeline for English text, including number and abbreviation expansion.''' 85 | text = convert_to_ascii(text) 86 | text = expand_numbers(text) 87 | text = expand_abbreviations(text) 88 | text = collapse_whitespace(text) 89 | return text 90 | -------------------------------------------------------------------------------- /tacotron/utils/cmudict.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | valid_symbols = [ 5 | 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2', 6 | 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2', 7 | 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY', 8 | 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1', 9 | 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 10 | 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 11 | 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH' 12 | ] 13 | 14 | _valid_symbol_set = set(valid_symbols) 15 | 16 | 17 | class CMUDict: 18 | '''Thin wrapper around CMUDict data. 
http://www.speech.cs.cmu.edu/cgi-bin/cmudict''' 19 | def __init__(self, file_or_path, keep_ambiguous=True): 20 | if isinstance(file_or_path, str): 21 | with open(file_or_path, encoding='latin-1') as f: 22 | entries = _parse_cmudict(f) 23 | else: 24 | entries = _parse_cmudict(file_or_path) 25 | if not keep_ambiguous: 26 | entries = {word: pron for word, pron in entries.items() if len(pron) == 1} 27 | self._entries = entries 28 | 29 | 30 | def __len__(self): 31 | return len(self._entries) 32 | 33 | 34 | def lookup(self, word): 35 | '''Returns list of ARPAbet pronunciations of the given word.''' 36 | return self._entries.get(word.upper()) 37 | 38 | 39 | 40 | _alt_re = re.compile(r'\([0-9]+\)') 41 | 42 | 43 | def _parse_cmudict(file): 44 | cmudict = {} 45 | for line in file: 46 | if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"): 47 | parts = line.split(' ') 48 | word = re.sub(_alt_re, '', parts[0]) 49 | pronunciation = _get_pronunciation(parts[1]) 50 | if pronunciation: 51 | if word in cmudict: 52 | cmudict[word].append(pronunciation) 53 | else: 54 | cmudict[word] = [pronunciation] 55 | return cmudict 56 | 57 | 58 | def _get_pronunciation(s): 59 | parts = s.strip().split(' ') 60 | for part in parts: 61 | if part not in _valid_symbol_set: 62 | return None 63 | return ' '.join(parts) 64 | -------------------------------------------------------------------------------- /tacotron/utils/infolog.py: -------------------------------------------------------------------------------- 1 | import atexit 2 | from datetime import datetime 3 | import json 4 | from threading import Thread 5 | from urllib.request import Request, urlopen 6 | 7 | 8 | _format = '%Y-%m-%d %H:%M:%S.%f' 9 | _file = None 10 | _run_name = None 11 | _slack_url = None 12 | 13 | 14 | def init(filename, run_name, slack_url=None): 15 | global _file, _run_name, _slack_url 16 | _close_logfile() 17 | _file = open(filename, 'a') 18 | _file = open(filename, 'a') 19 | _file.write('\n-----------------------------------------------------------------\n') 20 | _file.write('Starting new training run\n') 21 | _file.write('-----------------------------------------------------------------\n') 22 | _run_name = run_name 23 | _slack_url = slack_url 24 | 25 | 26 | def log(msg, end='\n', slack=False): 27 | print(msg, end=end) 28 | if _file is not None: 29 | _file.write('[%s] %s\n' % (datetime.now().strftime(_format)[:-3], msg)) 30 | if slack and _slack_url is not None: 31 | Thread(target=_send_slack, args=(msg,)).start() 32 | 33 | 34 | def _close_logfile(): 35 | global _file 36 | if _file is not None: 37 | _file.close() 38 | _file = None 39 | 40 | 41 | def _send_slack(msg): 42 | req = Request(_slack_url) 43 | req.add_header('Content-Type', 'application/json') 44 | urlopen(req, json.dumps({ 45 | 'username': 'tacotron', 46 | 'icon_emoji': ':taco:', 47 | 'text': '*%s*: %s' % (_run_name, msg) 48 | }).encode()) 49 | 50 | 51 | atexit.register(_close_logfile) -------------------------------------------------------------------------------- /tacotron/utils/numbers.py: -------------------------------------------------------------------------------- 1 | import inflect 2 | import re 3 | 4 | 5 | _inflect = inflect.engine() 6 | _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') 7 | _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') 8 | _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') 9 | _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') 10 | _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') 11 | _number_re = re.compile(r'[0-9]+') 12 | 13 | 14 | def 
_remove_commas(m): 15 | return m.group(1).replace(',', '') 16 | 17 | 18 | def _expand_decimal_point(m): 19 | return m.group(1).replace('.', ' point ') 20 | 21 | 22 | def _expand_dollars(m): 23 | match = m.group(1) 24 | parts = match.split('.') 25 | if len(parts) > 2: 26 | return match + ' dollars' # Unexpected format 27 | dollars = int(parts[0]) if parts[0] else 0 28 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 29 | if dollars and cents: 30 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 31 | cent_unit = 'cent' if cents == 1 else 'cents' 32 | return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) 33 | elif dollars: 34 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 35 | return '%s %s' % (dollars, dollar_unit) 36 | elif cents: 37 | cent_unit = 'cent' if cents == 1 else 'cents' 38 | return '%s %s' % (cents, cent_unit) 39 | else: 40 | return 'zero dollars' 41 | 42 | 43 | def _expand_ordinal(m): 44 | return _inflect.number_to_words(m.group(0)) 45 | 46 | 47 | def _expand_number(m): 48 | num = int(m.group(0)) 49 | if num > 1000 and num < 3000: 50 | if num == 2000: 51 | return 'two thousand' 52 | elif num > 2000 and num < 2010: 53 | return 'two thousand ' + _inflect.number_to_words(num % 100) 54 | elif num % 100 == 0: 55 | return _inflect.number_to_words(num // 100) + ' hundred' 56 | else: 57 | return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') 58 | else: 59 | return _inflect.number_to_words(num, andword='') 60 | 61 | 62 | def normalize_numbers(text): 63 | text = re.sub(_comma_number_re, _remove_commas, text) 64 | text = re.sub(_pounds_re, r'\1 pounds', text) 65 | text = re.sub(_dollars_re, _expand_dollars, text) 66 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 67 | text = re.sub(_ordinal_re, _expand_ordinal, text) 68 | text = re.sub(_number_re, _expand_number, text) 69 | return text 70 | -------------------------------------------------------------------------------- /tacotron/utils/plot.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | matplotlib.use('Agg') 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | 6 | 7 | def split_title_line(title_text, max_words=5): 8 | """ 9 | A function that splits any string based on specific character 10 | (returning it with the string), with maximum number of words on it 11 | """ 12 | seq = title_text.split() 13 | return '\n'.join([' '.join(seq[i:i + max_words]) for i in range(0, len(seq), max_words)]) 14 | 15 | def plot_alignment(alignment, path, info=None, split_title=False): 16 | fig, ax = plt.subplots() 17 | im = ax.imshow( 18 | alignment, 19 | aspect='auto', 20 | origin='lower', 21 | interpolation='none') 22 | fig.colorbar(im, ax=ax) 23 | xlabel = 'Decoder timestep' 24 | if info is not None: 25 | if split_title: 26 | title = split_title_line(info) 27 | else: 28 | title = info 29 | plt.xlabel(xlabel) 30 | plt.title(title) 31 | plt.ylabel('Encoder timestep') 32 | plt.tight_layout() 33 | plt.savefig(path, format='png') 34 | 35 | 36 | def plot_spectrogram(spectrogram, path, info=None, split_title=False): 37 | plt.figure() 38 | plt.imshow(np.rot90(spectrogram)) 39 | plt.colorbar(shrink=0.65, orientation='horizontal') 40 | plt.ylabel('mels') 41 | xlabel = 'frames' 42 | if info is not None: 43 | if split_title: 44 | title = split_title_line(info) 45 | else: 46 | title = info 47 | plt.xlabel(xlabel) 48 | plt.title(title) 49 | plt.tight_layout() 50 | plt.savefig(path, format='png') 51 | 
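
The plotting helpers above are invoked from tacotron/train.py and tacotron/synthesizer.py with plain numpy arrays. The following is a minimal sketch of how they can be exercised on their own; the array shapes and output file names are invented for illustration and are not taken from the repository:

    import numpy as np
    from tacotron.utils import plot

    # Toy alignment matrix: rows map to encoder steps (y-axis), columns to decoder steps (x-axis).
    alignment = np.random.rand(45, 60)
    plot.plot_alignment(alignment, 'demo-align.png', info='demo sentence', split_title=True)

    # Toy mel spectrogram in the [frames, num_mels] layout that synthesizer.py and train.py pass in.
    mel = np.random.rand(200, 80)
    plot.plot_spectrogram(mel, 'demo-mel.png', info='demo sentence', split_title=True)
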
-------------------------------------------------------------------------------- /tacotron/utils/symbols.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Defines the set of symbols used in text input to the model. 3 | 4 | The default is a set of ASCII characters that works well for English or text that has been run 5 | through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. 6 | ''' 7 | from . import cmudict 8 | 9 | _pad = '_' 10 | _eos = '~' 11 | _characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? ' 12 | 13 | # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): 14 | _arpabet = ['@' + s for s in cmudict.valid_symbols] 15 | 16 | # Export all symbols: 17 | symbols = [_pad, _eos] + list(_characters) + _arpabet -------------------------------------------------------------------------------- /tacotron/utils/text.py: -------------------------------------------------------------------------------- 1 | import re 2 | from . import cleaners 3 | from .symbols import symbols 4 | 5 | 6 | # Mappings from symbol to numeric ID and vice versa: 7 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 8 | _id_to_symbol = {i: s for i, s in enumerate(symbols)} 9 | 10 | # Regular expression matching text enclosed in curly braces: 11 | _curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)') 12 | 13 | 14 | def text_to_sequence(text, cleaner_names): 15 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 16 | 17 | The text can optionally have ARPAbet sequences enclosed in curly braces embedded 18 | in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street." 19 | 20 | Args: 21 | text: string to convert to a sequence 22 | cleaner_names: names of the cleaner functions to run the text through 23 | 24 | Returns: 25 | List of integers corresponding to the symbols in the text 26 | ''' 27 | sequence = [] 28 | 29 | # Check for curly braces and treat their contents as ARPAbet: 30 | while len(text): 31 | m = _curly_re.match(text) 32 | if not m: 33 | sequence += _symbols_to_sequence(_clean_text(text, cleaner_names)) 34 | break 35 | sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names)) 36 | sequence += _arpabet_to_sequence(m.group(2)) 37 | text = m.group(3) 38 | 39 | # Append EOS token 40 | sequence.append(_symbol_to_id['~']) 41 | return sequence 42 | 43 | 44 | def sequence_to_text(sequence): 45 | '''Converts a sequence of IDs back to a string''' 46 | result = '' 47 | for symbol_id in sequence: 48 | if symbol_id in _id_to_symbol: 49 | s = _id_to_symbol[symbol_id] 50 | # Enclose ARPAbet back in curly braces: 51 | if len(s) > 1 and s[0] == '@': 52 | s = '{%s}' % s[1:] 53 | result += s 54 | return result.replace('}{', ' ') 55 | 56 | 57 | def _clean_text(text, cleaner_names): 58 | for name in cleaner_names: 59 | cleaner = getattr(cleaners, name) 60 | if not cleaner: 61 | raise Exception('Unknown cleaner: %s' % name) 62 | text = cleaner(text) 63 | return text 64 | 65 | 66 | def _symbols_to_sequence(symbols): 67 | return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)] 68 | 69 | 70 | def _arpabet_to_sequence(text): 71 | return _symbols_to_sequence(['@' + s for s in text.split()]) 72 | 73 | 74 | def _should_keep_symbol(s): 75 | return s in _symbol_to_id and s is not '_' and s is not '~' 76 | -------------------------------------------------------------------------------- /tacotron/utils/util.py: 
-------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | from hparams import hparams as hp 4 | 5 | def shape_list(x): 6 | """Return list of dims, statically where possible.""" 7 | x = tf.convert_to_tensor(x) 8 | 9 | # If unknown rank, return dynamic shape 10 | if x.get_shape().dims is None: 11 | return tf.shape(x) 12 | 13 | static = x.get_shape().as_list() 14 | shape = tf.shape(x) 15 | 16 | ret = [] 17 | for i in range(len(static)): 18 | dim = static[i] 19 | if dim is None: 20 | dim = shape[i] 21 | ret.append(dim) 22 | return ret 23 | 24 | def vae_weight(global_step): 25 | warm_up_step = hp.vae_warming_up 26 | w1 = tf.cond( 27 | global_step < warm_up_step, 28 | lambda: tf.cond( 29 | global_step % 100 < 1, 30 | lambda: tf.convert_to_tensor(hp.init_vae_weights) + tf.cast(global_step / 100 * hp.vae_weight_multiler, tf.float32), 31 | lambda: tf.cast(tf.convert_to_tensor(0), tf.float32) 32 | ), 33 | lambda: tf.cast(tf.convert_to_tensor(0), tf.float32) 34 | ) 35 | 36 | w2 = tf.cond( 37 | global_step > warm_up_step, 38 | lambda: tf.cond( 39 | global_step % 400 < 1, 40 | lambda: tf.convert_to_tensor(hp.init_vae_weights) + tf.cast((global_step - warm_up_step) / 400 * hp.vae_weight_multiler + warm_up_step / 100 * hp.vae_weight_multiler, tf.float32), 41 | lambda: tf.cast(tf.convert_to_tensor(0), tf.float32) 42 | ), 43 | lambda: tf.cast(tf.convert_to_tensor(0), tf.float32) 44 | ) 45 | return tf.maximum(w1, w2) 46 | 47 | 48 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from tacotron.train import tacotron_train 3 | 4 | 5 | def main(): 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument('--base_dir', default='.') 8 | parser.add_argument('--hparams', default='', 9 | help='Hyperparameter overrides as a comma-separated list of name=value pairs') 10 | parser.add_argument('--input', default='training_data/train.txt') 11 | parser.add_argument('--name', help='Name of logging directory.') 12 | parser.add_argument('--model', default='Tacotron') 13 | parser.add_argument('--restore', type=bool, default=True, help='Set this to False to do a fresh training') 14 | parser.add_argument('--summary_interval', type=int, default=100, 15 | help='Steps between running summary ops') 16 | parser.add_argument('--checkpoint_interval', type=int, default=500, 17 | help='Steps between writing checkpoints') 18 | parser.add_argument('--tf_log_level', type=int, default=1, help='Tensorflow C++ log level.') 19 | args = parser.parse_args() 20 | 21 | accepted_models = ['Tacotron', 'Wavenet'] 22 | 23 | if args.model not in accepted_models: 24 | raise ValueError('please enter a valid model to train: {}'.format(accepted_models)) 25 | 26 | if args.model == 'Tacotron': 27 | tacotron_train(args) 28 | elif args.model == 'Wavenet': 29 | raise NotImplementedError('Wavenet is still a work in progress, thank you for your patience!') 30 | 31 | 32 | if __name__ == '__main__': 33 | main() --------------------------------------------------------------------------------
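
One detail of vae_weight() in tacotron/utils/util.py worth spelling out: as written, it returns a non-zero KL weight only on steps where global_step % 100 < 1 (before hp.vae_warming_up) or global_step % 400 < 1 (after it), ramping the value by hp.vae_weight_multiler, and falls back to 0 on every other step. The sketch below simply feeds a few step values through the graph and prints the resulting weight; the actual numbers depend on hparams.py, which is not shown here:

    import tensorflow as tf
    from tacotron.utils.util import vae_weight

    # Probe the KL-weight schedule at a handful of global-step values (illustrative only).
    step_ph = tf.placeholder(tf.int32, [], name='global_step_probe')
    weight = vae_weight(step_ph)

    with tf.Session() as sess:
        for step in [0, 50, 100, 400, 10000]:
            print(step, sess.run(weight, feed_dict={step_ph: step}))
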