├── .gitignore ├── LICENSE ├── README.md ├── datasets ├── __init__.py ├── audio.py ├── preprocessor.py └── util.py ├── hparams.py ├── preprocess.py ├── requirements.txt ├── synthesize.py ├── tacotron ├── __init__.py ├── feeder.py ├── models │ ├── Architecture_wrappers.py │ ├── __init__.py │ ├── attention.py │ ├── custom_decoder.py │ ├── helpers.py │ ├── modules.py │ ├── tacotron.py │ └── zoneout_LSTM.py ├── synthesize.py ├── synthesizer.py ├── train.py └── utils │ ├── __init__.py │ ├── audio.py │ ├── cleaners.py │ ├── cmudict.py │ ├── infolog.py │ ├── numbers.py │ ├── plot.py │ ├── symbols.py │ ├── text.py │ └── util.py └── train.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | papers/ 6 | # C extensions 7 | *.so 8 | .idea/ 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # Tacotron 2 oddities 107 | logs-*/ 108 | training_data/ 109 | 110 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Rayhane Mama 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # VAE Tacotron-2: 2 | Unofficial Implementation of [Learning latent representations for style control and transfer in end-to-end speech synthesis](https://arxiv.org/pdf/1812.04342.pdf) 3 | 4 | 5 | # Repository Structure: 6 | Tacotron-2 7 | ├── datasets 8 | ├── LJSpeech-1.1 (0) 9 | │   └── wavs 10 | ├── logs-Tacotron (2) 11 | │   ├── mel-spectrograms 12 | │   ├── plots 13 | │   ├── pretrained 14 | │   └── wavs 15 | ├── papers 16 | ├── tacotron 17 | │   ├── models 18 | │   └── utils 19 | ├── tacotron_output (3) 20 | │   ├── eval 21 | │   ├── gta 22 | │   ├── logs-eval 23 | │   │   ├── plots 24 | │   │   └── wavs 25 | │   └── natural 26 | └── training_data (1) 27 |    ├── audio 28 |    └── mels 29 | 30 | 31 | 32 | 33 | 34 | The previous tree shows the current state of the repository. 35 | 36 | - Step **(0)**: Get your dataset; the example here uses **LJSpeech**. 37 | - Step **(1)**: Preprocess your data. This will give you the **training_data** folder. 38 | - Step **(2)**: Train your Tacotron model. Yields the **logs-Tacotron** folder. 39 | - Step **(3)**: Synthesize/Evaluate the Tacotron model. Gives the **tacotron_output** folder. 40 | 41 | 42 | # Requirements 43 | First, you need to have Python 3.5 installed along with [Tensorflow v1.6](https://www.tensorflow.org/install/). 44 | 45 | Next, you can install the requirements: 46 | 47 | > pip install -r requirements.txt 48 | 49 | or: 50 | 51 | > pip3 install -r requirements.txt 52 | 53 | # Dataset: 54 | This repo has been tested on the [LJSpeech dataset](https://keithito.com/LJ-Speech-Dataset/), which has almost 24 hours of labeled recordings of a single female speaker. 55 | 56 | # Preprocessing 57 | Before running the following steps, please make sure you are inside the **Tacotron-2** folder: 58 | 59 | > cd Tacotron-2 60 | 61 | Preprocessing can then be started using: 62 | 63 | > python preprocess.py 64 | 65 | or 66 | 67 | > python3 preprocess.py 68 | 69 | The dataset can be chosen using the **--dataset** argument. Default is **LJSpeech**. 70 | 71 | # Training: 72 | The feature prediction model can be **trained** using: 73 | 74 | > python train.py --model='Tacotron' 75 | 76 | or 77 | 78 | > python3 train.py --model='Tacotron' 79 | 80 | # Synthesis 81 | There are **three types** of mel spectrogram synthesis for the spectrogram prediction network (Tacotron): 82 | 83 | - **Evaluation** (synthesis on custom sentences). This is what we'll usually use after having a full end-to-end model. 84 | 85 | > python synthesize.py --model='Tacotron' --mode='eval' --reference_audio='ref_1.wav' 86 | 87 | or 88 | 89 | > python3 synthesize.py --model='Tacotron' --mode='eval' --reference_audio='ref_1.wav' 90 | 91 | **Note:** 92 | - This implementation is not completely tested for all scenarios, but training and synthesis with a reference audio are working.
93 | - So far it has only been tested on synthesis without GTA and in `eval` mode. 94 | - After training for 250k steps with a batch size of 32 on LJSpeech, the KL error settled down near zero (around 0.001), but style transfer and control are still not good. This may be because the model was trained on LJSpeech, which is not a very expressive dataset and has only 24 hours of data; it might produce better results on an expressive dataset such as the `Blizzard 2013 voice dataset` (the author of the paper used 105 hours of the Blizzard Challenge 2013 dataset). 95 | - In my testing I haven't gotten good results on the style transfer side so far; some more tweaking may be required. This implementation can easily be integrated with `wavenet` as well as `WaveRNN`. 96 | - Feel free to suggest changes or, even better, raise a PR. 97 | 98 | # Pretrained model and Samples: 99 | TODO 100 | Samples claimed by the research paper: http://home.ustc.edu.cn/~zyj008/ICASSP2019 101 | 102 | # References and Resources: 103 | - [Tensorflow original tacotron implementation](https://github.com/keithito/tacotron) 104 | - [Original tacotron paper](https://arxiv.org/pdf/1703.10135.pdf) 105 | - [Attention-Based Models for Speech Recognition](https://arxiv.org/pdf/1506.07503.pdf) 106 | - [Natural TTS synthesis by conditioning Wavenet on mel spectrogram predictions](https://arxiv.org/pdf/1712.05884.pdf) 107 | - [r9y9/Tacotron-2](https://github.com/r9y9/Tacotron-2) 108 | - [yanggeng1995/vae_tacotron](https://github.com/yanggeng1995/vae_tacotron) 109 | 110 | **Work in progress** 111 | -------------------------------------------------------------------------------- /datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------- /datasets/audio.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import librosa.filters 3 | import numpy as np 4 | from scipy import signal 5 | from hparams import hparams 6 | import tensorflow as tf 7 | from scipy.io import wavfile 8 | 9 | 10 | def load_wav(path): 11 | return librosa.core.load(path, sr=hparams.sample_rate)[0] 12 | 13 | def save_wav(wav, path): 14 | wav *= 32767 / max(0.01, np.max(np.abs(wav))) 15 | #proposed by @dsmiller 16 | wavfile.write(path, hparams.sample_rate, wav.astype(np.int16)) 17 | 18 | def start_and_end_indices(quantized, silence_threshold=2): 19 | for start in range(quantized.size): 20 | if abs(quantized[start] - 127) > silence_threshold: 21 | break 22 | for end in range(quantized.size - 1, 1, -1): 23 | if abs(quantized[end] - 127) > silence_threshold: 24 | break 25 | 26 | assert abs(quantized[start] - 127) > silence_threshold 27 | assert abs(quantized[end] - 127) > silence_threshold 28 | 29 | return start, end 30 | 31 | def trim_silence(wav): 32 | '''Trim leading and trailing silence 33 | 34 | Useful for the M-AILABS dataset if we choose to trim the extra 0.5 seconds of silence.
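	Illustrative usage sketch (added example, not part of the original docstring; the wav path is hypothetical):
	    >>> wav = load_wav('LJSpeech-1.1/wavs/LJ001-0001.wav')  # resampled to hparams.sample_rate
	    >>> trimmed = trim_silence(wav)                          # librosa.effects.trim under the hood
	    >>> len(trimmed) <= len(wav)
	    True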
35 | ''' 36 | return librosa.effects.trim(wav)[0] 37 | 38 | def preemphasis(x): 39 | return signal.lfilter([1, -hparams.preemphasis], [1], x) 40 | 41 | def inv_preemphasis(x): 42 | return signal.lfilter([1], [1, -hparams.preemphasis], x) 43 | 44 | def get_hop_size(): 45 | hop_size = hparams.hop_size 46 | if hop_size is None: 47 | assert hparams.frame_shift_ms is not None 48 | hop_size = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate) 49 | return hop_size 50 | 51 | def linearspectrogram(wav): 52 | D = _stft(wav) 53 | S = _amp_to_db(np.abs(D)) - hparams.ref_level_db 54 | 55 | if hparams.signal_normalization: 56 | return _normalize(S) 57 | return S 58 | 59 | def melspectrogram(wav): 60 | D = _stft(wav) 61 | S = _amp_to_db(_linear_to_mel(np.abs(D))) - hparams.ref_level_db 62 | 63 | if hparams.signal_normalization: 64 | return _normalize(S) 65 | return S 66 | 67 | def inv_linear_spectrogram(linear_spectrogram): 68 | '''Converts linear spectrogram to waveform using librosa''' 69 | if hparams.signal_normalization: 70 | D = _denormalize(linear_spectrogram) 71 | else: 72 | D = linear_spectrogram 73 | 74 | S = _db_to_amp(D + hparams.ref_level_db) #Convert back to linear 75 | 76 | return _griffin_lim(S ** hparams.power) 77 | 78 | 79 | def inv_mel_spectrogram(mel_spectrogram): 80 | '''Converts mel spectrogram to waveform using librosa''' 81 | if hparams.signal_normalization: 82 | D = _denormalize(mel_spectrogram) 83 | else: 84 | D = mel_spectrogram 85 | 86 | S = _mel_to_linear(_db_to_amp(D + hparams.ref_level_db)) # Convert back to linear 87 | 88 | return _griffin_lim(S ** hparams.power) 89 | 90 | def _griffin_lim(S): 91 | '''librosa implementation of Griffin-Lim 92 | Based on https://github.com/librosa/librosa/issues/434 93 | ''' 94 | angles = np.exp(2j * np.pi * np.random.rand(*S.shape)) 95 | S_complex = np.abs(S).astype(np.complex) 96 | y = _istft(S_complex * angles) 97 | for i in range(hparams.griffin_lim_iters): 98 | angles = np.exp(1j * np.angle(_stft(y))) 99 | y = _istft(S_complex * angles) 100 | return y 101 | 102 | def _stft(y): 103 | return librosa.stft(y=y, n_fft=hparams.fft_size, hop_length=get_hop_size()) 104 | 105 | def _istft(y): 106 | return librosa.istft(y, hop_length=get_hop_size()) 107 | 108 | def num_frames(length, fsize, fshift): 109 | """Compute number of time frames of spectrogram 110 | """ 111 | pad = (fsize - fshift) 112 | if length % fshift == 0: 113 | M = (length + pad * 2 - fsize) // fshift + 1 114 | else: 115 | M = (length + pad * 2 - fsize) // fshift + 2 116 | return M 117 | 118 | 119 | def pad_lr(x, fsize, fshift): 120 | """Compute left and right padding 121 | """ 122 | M = num_frames(len(x), fsize, fshift) 123 | pad = (fsize - fshift) 124 | T = len(x) + 2 * pad 125 | r = (M - 1) * fshift + fsize - T 126 | return pad, pad + r 127 | 128 | 129 | # Conversions 130 | _mel_basis = None 131 | _inv_mel_basis = None 132 | 133 | def _linear_to_mel(spectogram): 134 | global _mel_basis 135 | if _mel_basis is None: 136 | _mel_basis = _build_mel_basis() 137 | return np.dot(_mel_basis, spectogram) 138 | 139 | def _mel_to_linear(mel_spectrogram): 140 | global _inv_mel_basis 141 | if _inv_mel_basis is None: 142 | _inv_mel_basis = np.linalg.pinv(_build_mel_basis()) 143 | return np.maximum(1e-10, np.dot(_inv_mel_basis, mel_spectrogram)) 144 | 145 | def _build_mel_basis(): 146 | assert hparams.fmax <= hparams.sample_rate // 2 147 | return librosa.filters.mel(hparams.sample_rate, hparams.fft_size, n_mels=hparams.num_mels, 148 | fmin=hparams.fmin, fmax=hparams.fmax) 149 | 150 | def 
_amp_to_db(x): 151 | min_level = np.exp(hparams.min_level_db / 20 * np.log(10)) 152 | return 20 * np.log10(np.maximum(min_level, x)) 153 | 154 | def _db_to_amp(x): 155 | return np.power(10.0, (x) * 0.05) 156 | 157 | def _normalize(S): 158 | if hparams.allow_clipping_in_normalization: 159 | if hparams.symmetric_mels: 160 | return np.clip((2 * hparams.max_abs_value) * ((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value, 161 | -hparams.max_abs_value, hparams.max_abs_value) 162 | else: 163 | return np.clip(hparams.max_abs_value * ((S - hparams.min_level_db) / (-hparams.min_level_db)), 0, hparams.max_abs_value) 164 | 165 | assert S.max() <= 0 and S.min() - hparams.min_level_db >= 0 166 | if hparams.symmetric_mels: 167 | return (2 * hparams.max_abs_value) * ((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value 168 | else: 169 | return hparams.max_abs_value * ((S - hparams.min_level_db) / (-hparams.min_level_db)) 170 | 171 | def _denormalize(D): 172 | if hparams.allow_clipping_in_normalization: 173 | if hparams.symmetric_mels: 174 | return (((np.clip(D, -hparams.max_abs_value, 175 | hparams.max_abs_value) + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value)) 176 | + hparams.min_level_db) 177 | else: 178 | return ((np.clip(D, 0, hparams.max_abs_value) * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db) 179 | 180 | if hparams.symmetric_mels: 181 | return (((D + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value)) + hparams.min_level_db) 182 | else: 183 | return ((D * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db) -------------------------------------------------------------------------------- /datasets/preprocessor.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ProcessPoolExecutor 2 | from functools import partial 3 | from datasets import audio 4 | import os 5 | import numpy as np 6 | from hparams import hparams 7 | from datasets.util import mulaw_quantize, mulaw, is_mulaw, is_mulaw_quantize 8 | 9 | 10 | def build_from_path(input_dirs, mel_dir, linear_dir, wav_dir, n_jobs=12, tqdm=lambda x: x): 11 | """ 12 | Preprocesses the speech dataset from a gven input path to given output directories 13 | 14 | Args: 15 | - input_dir: input directory that contains the files to prerocess 16 | - mel_dir: output directory of the preprocessed speech mel-spectrogram dataset 17 | - linear_dir: output directory of the preprocessed speech linear-spectrogram dataset 18 | - wav_dir: output directory of the preprocessed speech audio dataset 19 | - n_jobs: Optional, number of worker process to parallelize across 20 | - tqdm: Optional, provides a nice progress bar 21 | 22 | Returns: 23 | - A list of tuple describing the train examples. 
this should be written to train.txt 24 | """ 25 | 26 | # We use ProcessPoolExecutor to parallelize across processes, this is just for 27 | # optimization purposes and it can be omited 28 | executor = ProcessPoolExecutor(max_workers=n_jobs) 29 | futures = [] 30 | index = 1 31 | for input_dir in input_dirs: 32 | with open(os.path.join(input_dir, 'metadata.csv'), encoding='utf-8') as f: 33 | for line in f: 34 | parts = line.strip().split('|') 35 | wav_path = os.path.join(input_dir, 'wavs', '{}.wav'.format(parts[0])) 36 | text = parts[2] 37 | futures.append(executor.submit(partial(_process_utterance, mel_dir, linear_dir, wav_dir, index, wav_path, text))) 38 | index += 1 39 | 40 | return [future.result() for future in tqdm(futures) if future.result() is not None] 41 | 42 | 43 | def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text): 44 | """ 45 | Preprocesses a single utterance wav/text pair 46 | 47 | this writes the mel scale spectogram to disk and return a tuple to write 48 | to the train.txt file 49 | 50 | Args: 51 | - mel_dir: the directory to write the mel spectograms into 52 | - linear_dir: the directory to write the linear spectrograms into 53 | - wav_dir: the directory to write the preprocessed wav into 54 | - index: the numeric index to use in the spectogram filename 55 | - wav_path: path to the audio file containing the speech input 56 | - text: text spoken in the input audio file 57 | 58 | Returns: 59 | - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text) 60 | """ 61 | 62 | try: 63 | # Load the audio as numpy array 64 | wav = audio.load_wav(wav_path) 65 | except FileNotFoundError: #catch missing wav exception 66 | print('file {} present in csv metadata is not present in wav folder. skipping!'.format( 67 | wav_path)) 68 | return None 69 | 70 | #rescale wav 71 | if hparams.rescale: 72 | wav = wav / np.abs(wav).max() * hparams.rescaling_max 73 | 74 | #M-AILABS extra silence specific 75 | if hparams.trim_silence: 76 | wav = audio.trim_silence(wav) 77 | 78 | #Mu-law quantize 79 | if is_mulaw_quantize(hparams.input_type): 80 | #[0, quantize_channels) 81 | out = mulaw_quantize(wav, hparams.quantize_channels) 82 | 83 | #Trim silences 84 | start, end = audio.start_and_end_indices(out, hparams.silence_threshold) 85 | wav = wav[start: end] 86 | out = out[start: end] 87 | 88 | constant_values = mulaw_quantize(0, hparams.quantize_channels) 89 | out_dtype = np.int16 90 | 91 | elif is_mulaw(hparams.input_type): 92 | #[-1, 1] 93 | out = mulaw(wav, hparams.quantize_channels) 94 | constant_values = mulaw(0., hparams.quantize_channels) 95 | out_dtype = np.float32 96 | 97 | else: 98 | #[-1, 1] 99 | out = wav 100 | constant_values = 0. 
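		#(added note, not in the original code: the `constant_values` chosen in the branches above is the
		# padding value used by np.pad further down — 0. for raw/mulaw float audio, and the mu-law code of
		# silence, mulaw_quantize(0, hparams.quantize_channels), for the quantized case)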
101 | out_dtype = np.float32 102 | 103 | # Compute the mel scale spectrogram from the wav 104 | mel_spectrogram = audio.melspectrogram(wav).astype(np.float32) 105 | mel_frames = mel_spectrogram.shape[1] 106 | 107 | #Compute the linear scale spectrogram from the wav 108 | linear_spectrogram = audio.linearspectrogram(wav).astype(np.float32) 109 | linear_frames = linear_spectrogram.shape[1] 110 | 111 | #sanity check 112 | assert linear_frames == mel_frames 113 | 114 | #Ensure time resolution adjustement between audio and mel-spectrogram 115 | l, r = audio.pad_lr(wav, hparams.fft_size, audio.get_hop_size()) 116 | 117 | #Zero pad for quantized signal 118 | out = np.pad(out, (l, r), mode='constant', constant_values=constant_values) 119 | time_steps = len(out) 120 | assert time_steps >= mel_frames * audio.get_hop_size() 121 | 122 | #time resolution adjustement 123 | #ensure length of raw audio is multiple of hop size so that we can use 124 | #transposed convolution to upsample 125 | out = out[:mel_frames * audio.get_hop_size()] 126 | assert time_steps % audio.get_hop_size() == 0 127 | 128 | # Write the spectrogram and audio to disk 129 | audio_filename = 'speech-audio-{:05d}.npy'.format(index) 130 | mel_filename = 'speech-mel-{:05d}.npy'.format(index) 131 | linear_filename = 'speech-linear-{:05d}.npy'.format(index) 132 | np.save(os.path.join(wav_dir, audio_filename), out.astype(out_dtype), allow_pickle=False) 133 | np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False) 134 | np.save(os.path.join(linear_dir, linear_filename), linear_spectrogram.T, allow_pickle=False) 135 | 136 | # Return a tuple describing this training example 137 | return (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text) 138 | -------------------------------------------------------------------------------- /datasets/util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | 5 | def _assert_valid_input_type(s): 6 | assert s == 'mulaw-quantize' or s == 'mulaw' or s == 'raw' 7 | 8 | def is_mulaw_quantize(s): 9 | _assert_valid_input_type(s) 10 | return s == 'mulaw-quantize' 11 | 12 | def is_mulaw(s): 13 | _assert_valid_input_type(s) 14 | return s == 'mulaw' 15 | 16 | def is_raw(s): 17 | _assert_valid_input_type(s) 18 | return s == 'raw' 19 | 20 | def is_scalar_input(s): 21 | return is_raw(s) or is_mulaw(s) 22 | 23 | 24 | #From https://github.com/r9y9/nnmnkwii/blob/master/nnmnkwii/preprocessing/generic.py 25 | def mulaw(x, mu=256): 26 | """Mu-Law companding 27 | Method described in paper [1]_. 28 | .. math:: 29 | f(x) = sign(x) \ln (1 + \mu |x|) / \ln (1 + \mu) 30 | Args: 31 | x (array-like): Input signal. Each value of input signal must be in 32 | range of [-1, 1]. 33 | mu (number): Compression parameter ``μ``. 34 | Returns: 35 | array-like: Compressed signal ([-1, 1]) 36 | See also: 37 | :func:`nnmnkwii.preprocessing.inv_mulaw` 38 | :func:`nnmnkwii.preprocessing.mulaw_quantize` 39 | :func:`nnmnkwii.preprocessing.inv_mulaw_quantize` 40 | .. [1] Brokish, Charles W., and Michele Lewis. "A-law and mu-law companding 41 | implementations using the tms320c54x." SPRA163 (1997). 42 | """ 43 | return _sign(x) * _log1p(mu * _abs(x)) / _log1p(mu) 44 | 45 | 46 | def inv_mulaw(y, mu=256): 47 | """Inverse of mu-law companding (mu-law expansion) 48 | .. math:: 49 | f^{-1}(x) = sign(y) (1 / \mu) (1 + \mu)^{|y|} - 1) 50 | Args: 51 | y (array-like): Compressed signal. Each value of input signal must be in 52 | range of [-1, 1]. 
53 | mu (number): Compression parameter ``μ``. 54 | Returns: 55 | array-like: Uncomprresed signal (-1 <= x <= 1) 56 | See also: 57 | :func:`nnmnkwii.preprocessing.inv_mulaw` 58 | :func:`nnmnkwii.preprocessing.mulaw_quantize` 59 | :func:`nnmnkwii.preprocessing.inv_mulaw_quantize` 60 | """ 61 | return _sign(y) * (1.0 / mu) * ((1.0 + mu)**_abs(y) - 1.0) 62 | 63 | 64 | def mulaw_quantize(x, mu=256): 65 | """Mu-Law companding + quantize 66 | Args: 67 | x (array-like): Input signal. Each value of input signal must be in 68 | range of [-1, 1]. 69 | mu (number): Compression parameter ``μ``. 70 | Returns: 71 | array-like: Quantized signal (dtype=int) 72 | - y ∈ [0, mu] if x ∈ [-1, 1] 73 | - y ∈ [0, mu) if x ∈ [-1, 1) 74 | .. note:: 75 | If you want to get quantized values of range [0, mu) (not [0, mu]), 76 | then you need to provide input signal of range [-1, 1). 77 | Examples: 78 | >>> from scipy.io import wavfile 79 | >>> import pysptk 80 | >>> import numpy as np 81 | >>> from nnmnkwii import preprocessing as P 82 | >>> fs, x = wavfile.read(pysptk.util.example_audio_file()) 83 | >>> x = (x / 32768.0).astype(np.float32) 84 | >>> y = P.mulaw_quantize(x) 85 | >>> print(y.min(), y.max(), y.dtype) 86 | 15 246 int64 87 | See also: 88 | :func:`nnmnkwii.preprocessing.mulaw` 89 | :func:`nnmnkwii.preprocessing.inv_mulaw` 90 | :func:`nnmnkwii.preprocessing.inv_mulaw_quantize` 91 | """ 92 | y = mulaw(x, mu) 93 | # scale [-1, 1] to [0, mu] 94 | return _asint((y + 1) / 2 * mu) 95 | 96 | 97 | def inv_mulaw_quantize(y, mu=256): 98 | """Inverse of mu-law companding + quantize 99 | Args: 100 | y (array-like): Quantized signal (∈ [0, mu]). 101 | mu (number): Compression parameter ``μ``. 102 | Returns: 103 | array-like: Uncompressed signal ([-1, 1]) 104 | Examples: 105 | >>> from scipy.io import wavfile 106 | >>> import pysptk 107 | >>> import numpy as np 108 | >>> from nnmnkwii import preprocessing as P 109 | >>> fs, x = wavfile.read(pysptk.util.example_audio_file()) 110 | >>> x = (x / 32768.0).astype(np.float32) 111 | >>> x_hat = P.inv_mulaw_quantize(P.mulaw_quantize(x)) 112 | >>> x_hat = (x_hat * 32768).astype(np.int16) 113 | See also: 114 | :func:`nnmnkwii.preprocessing.mulaw` 115 | :func:`nnmnkwii.preprocessing.inv_mulaw` 116 | :func:`nnmnkwii.preprocessing.mulaw_quantize` 117 | """ 118 | # [0, m) to [-1, 1] 119 | y = 2 * _asfloat(y) / mu - 1 120 | return inv_mulaw(y, mu) 121 | 122 | def _sign(x): 123 | isnumpy = isinstance(x, np.ndarray) 124 | isscalar = np.isscalar(x) 125 | return np.sign(x) if isnumpy or isscalar else x.sign() 126 | 127 | 128 | def _log1p(x): 129 | isnumpy = isinstance(x, np.ndarray) 130 | isscalar = np.isscalar(x) 131 | return np.log1p(x) if isnumpy or isscalar else x.log1p() 132 | 133 | 134 | def _abs(x): 135 | isnumpy = isinstance(x, np.ndarray) 136 | isscalar = np.isscalar(x) 137 | return np.abs(x) if isnumpy or isscalar else x.abs() 138 | 139 | 140 | def _asint(x): 141 | # ugly wrapper to support torch/numpy arrays 142 | isnumpy = isinstance(x, np.ndarray) 143 | isscalar = np.isscalar(x) 144 | return x.astype(np.int) if isnumpy else int(x) if isscalar else x.long() 145 | 146 | 147 | def _asfloat(x): 148 | # ugly wrapper to support torch/numpy arrays 149 | isnumpy = isinstance(x, np.ndarray) 150 | isscalar = np.isscalar(x) 151 | return x.astype(np.float32) if isnumpy else float(x) if isscalar else x.float() 152 | 153 | 154 | #From https://github.com/r9y9/wavenet_vocoder/blob/master/lrschedule.py 155 | def noam_learning_rate_decay(init_lr, global_step, warmup_steps=4000): 156 | # Noam 
scheme from tensor2tensor: 157 | warmup_steps = float(warmup_steps) 158 | step = global_step + 1. 159 | lr = init_lr * warmup_steps**0.5 * np.minimum( 160 | step * warmup_steps**-1.5, step**-0.5) 161 | return lr 162 | 163 | 164 | def step_learning_rate_decay(init_lr, global_step, 165 | anneal_rate=0.98, 166 | anneal_interval=30000): 167 | return init_lr * anneal_rate ** (global_step // anneal_interval) 168 | 169 | 170 | def cyclic_cosine_annealing(init_lr, global_step, T, M): 171 | """Cyclic cosine annealing 172 | 173 | https://arxiv.org/pdf/1704.00109.pdf 174 | 175 | Args: 176 | init_lr (float): Initial learning rate 177 | global_step (int): Current iteration number 178 | T (int): Total iteration number (i,e. nepoch) 179 | M (int): Number of ensembles we want 180 | 181 | Returns: 182 | float: Annealed learning rate 183 | """ 184 | TdivM = T // M 185 | return init_lr / 2.0 * (np.cos(np.pi * ((global_step - 1) % TdivM) / TdivM) + 1.0) -------------------------------------------------------------------------------- /hparams.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | 5 | # Default hyperparameters 6 | hparams = tf.contrib.training.HParams( 7 | # Comma-separated list of cleaners to run on text prior to training and eval. For non-English 8 | # text, you may want to use "basic_cleaners" or "transliteration_cleaners". 9 | cleaners='english_cleaners', 10 | 11 | 12 | #Audio 13 | num_mels = 80, 14 | num_freq = 513, #only used when adding linear spectrograms post processing network 15 | rescale = True, 16 | rescaling_max = 0.999, 17 | trim_silence = True, 18 | 19 | #Mel spectrogram 20 | fft_size = 1024, 21 | hop_size = 256, 22 | sample_rate = 22050, #22050 Hz (corresponding to ljspeech dataset) 23 | frame_shift_ms = None, 24 | 25 | #Mel and Linear spectrograms normalization/scaling and clipping 26 | mel_normalization = False, 27 | signal_normalization = True, 28 | allow_clipping_in_normalization = True, #Only relevant if mel_normalization = True 29 | symmetric_mels = True, #Whether to scale the data to be symmetric around 0 30 | max_abs_value = 4., #max absolute value of data. 
If symmetric, data will be [-max, max] else [0, max] 31 | 32 | #Limits 33 | min_level_db =- 100, 34 | ref_level_db = 20, 35 | fmin = 125, 36 | fmax = 7600, 37 | 38 | #Griffin Lim 39 | power = 1.55, 40 | griffin_lim_iters = 60, 41 | 42 | # VAE: 43 | use_vae=True, 44 | vae_dim=32, 45 | vae_warming_up=15000, 46 | init_vae_weights=0.001, 47 | vae_weight_multiler=0.002, 48 | filters=[32, 32, 64, 64, 128, 128], 49 | 50 | #Tacotron 51 | outputs_per_step = 1, #number of frames to generate at each decoding step (speeds up computation and allows for higher batch size) 52 | stop_at_any = True, #Determines whether the decoder should stop when predicting to any frame or to all of them 53 | 54 | embedding_dim = 512, #dimension of embedding space 55 | 56 | enc_conv_num_layers = 3, #number of encoder convolutional layers 57 | enc_conv_kernel_size = (5, ), #size of encoder convolution filters for each layer 58 | enc_conv_channels = 512, #number of encoder convolutions filters for each layer 59 | encoder_lstm_units = 256, #number of lstm units for each direction (forward and backward) 60 | encoder_depth=512, 61 | smoothing = False, #Whether to smooth the attention normalization function 62 | attention_dim = 128, #dimension of attention space 63 | attention_filters = 32, #number of attention convolution filters 64 | attention_kernel = (31, ), #kernel size of attention convolution 65 | cumulative_weights = True, #Whether to cumulate (sum) all previous attention weights or simply feed previous weights (Recommended: True) 66 | 67 | prenet_layers = [256, 256], #number of layers and number of units of prenet 68 | decoder_layers = 2, #number of decoder lstm layers 69 | decoder_lstm_units = 1024, #number of decoder lstm units on each layer 70 | max_iters = 2500, #Max decoder steps during inference (Just for safety from infinite loop cases) 71 | 72 | postnet_num_layers = 5, #number of postnet convolutional layers 73 | postnet_kernel_size = (5, ), #size of postnet convolution filters for each layer 74 | postnet_channels = 512, #number of postnet convolution filters for each layer 75 | 76 | mask_encoder = False, #whether to mask encoder padding while computing attention 77 | impute_finished = False, #Whether to use loss mask for padded sequences 78 | mask_finished = False, #Whether to mask alignments beyond the (False for debug, True for style) 79 | 80 | predict_linear = False, #Whether to add a post-processing network to the Tacotron to predict linear spectrograms (True mode Not tested!!) 81 | 82 | 83 | #Wavenet 84 | # Input type: 85 | # 1. raw [-1, 1] 86 | # 2. mulaw [-1, 1] 87 | # 3. mulaw-quantize [0, mu] 88 | # If input_type is raw or mulaw, network assumes scalar input and 89 | # discretized mixture of logistic distributions output, otherwise one-hot 90 | # input and softmax output are assumed. 91 | # **NOTE**: if you change the one of the two parameters below, you need to 92 | # re-run preprocessing before training. 93 | # **NOTE**: scaler input (raw or mulaw) is experimental. Use it your own risk. 
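	# Added reminder (not in the original comments): after changing input_type or quantize_channels,
	# re-run preprocessing so the saved audio targets match, e.g. (command taken from this repo's README):
	#   python preprocess.py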
94 | input_type="mulaw-quantize", 95 | quantize_channels=256, # 65536 or 256 96 | 97 | silence_threshold=2, 98 | 99 | # Mixture of logistic distributions: 100 | log_scale_min=float(np.log(1e-14)), 101 | 102 | #TODO model params 103 | 104 | 105 | #Tacotron Training 106 | tacotron_batch_size = 32, #number of training samples on each training steps 107 | tacotron_reg_weight = 1e-6, #regularization weight (for l2 regularization) 108 | tacotron_scale_regularization = True, #Whether to rescale regularization weight to adapt for outputs range (used when reg_weight is high and biasing the model) 109 | 110 | tacotron_decay_learning_rate = True, #boolean, determines if the learning rate will follow an exponential decay 111 | tacotron_start_decay = 50000, #Step at which learning decay starts 112 | tacotron_decay_steps = 50000, #starting point for learning rate decay (and determines the decay slope) (UNDER TEST) 113 | tacotron_decay_rate = 0.4, #learning rate decay rate (UNDER TEST) 114 | tacotron_initial_learning_rate = 1e-3, #starting learning rate 115 | tacotron_final_learning_rate = 1e-5, #minimal learning rate 116 | 117 | tacotron_adam_beta1 = 0.9, #AdamOptimizer beta1 parameter 118 | tacotron_adam_beta2 = 0.999, #AdamOptimizer beta2 parameter 119 | tacotron_adam_epsilon = 1e-6, #AdamOptimizer beta3 parameter 120 | 121 | tacotron_zoneout_rate = 0.1, #zoneout rate for all LSTM cells in the network 122 | tacotron_dropout_rate = 0.5, #dropout rate for all convolutional layers + prenet 123 | 124 | tacotron_teacher_forcing_ratio = 1., #Value from [0., 1.], 0.=0%, 1.=100%, determines the % of times we force next decoder inputs 125 | 126 | 127 | #Wavenet Training TODO 128 | 129 | 130 | 131 | #Eval sentences 132 | sentences = [ 133 | # From July 8, 2017 New York Times: 134 | 'Scientists at the CERN laboratory say they have discovered a new particle.', 135 | 'There\'s a way to measure the acute emotional intelligence that has never gone out of style.', 136 | 'President Trump met with other leaders at the Group of 20 conference.', 137 | 'The Senate\'s bill to repeal and replace the Affordable Care Act is now imperiled.', 138 | # From Google's Tacotron example page: 139 | 'Generative adversarial network or variational auto-encoder.', 140 | 'Basilar membrane and otolaryngology are not auto-correlations.', 141 | 'He has read the whole thing.', 142 | 'He reads books.', 143 | "Don't desert me here in the desert!", 144 | 'He thought it was time to present the present.', 145 | 'Thisss isrealy awhsome.', 146 | 'Punctuation sensitivity, is working.', 147 | 'Punctuation sensitivity is working.', 148 | "The buses aren't the problem, they actually provide a solution.", 149 | "The buses aren't the PROBLEM, they actually provide a SOLUTION.", 150 | "The quick brown fox jumps over the lazy dog.", 151 | "Does the quick brown fox jump over the lazy dog?", 152 | "Peter Piper picked a peck of pickled peppers. How many pickled peppers did Peter Piper pick?", 153 | "She sells sea-shells on the sea-shore. 
The shells she sells are sea-shells I'm sure.", 154 | "The blue lagoon is a nineteen eighty American romance adventure film.", 155 | "Tajima Airport serves Toyooka.", 156 | 'Talib Kweli confirmed to AllHipHop that he will be releasing an album in the next year.', 157 | #From Training data: 158 | 'the rest being provided with barrack beds, and in dimensions varying from thirty feet by fifteen to fifteen feet by ten.', 159 | 'in giltspur street compter, where he was first lodged.', 160 | 'a man named burnett came with his wife and took up his residence at whitchurch, hampshire, at no great distance from laverstock,', 161 | 'it appears that oswald had only one caller in response to all of his fpcc activities,', 162 | 'he relied on the absence of the strychnia.', 163 | 'scoggins thought it was lighter.', 164 | '''would, it is probable, have eventually overcome the reluctance of some of the prisoners at least, 165 | and would have possessed so much moral dignity''', 166 | '''the only purpose of this whole sentence is to evaluate the scalability of the model for very long sentences. 167 | This is not even a long sentence anymore, it has become an entire paragraph. 168 | Should I stop now? Let\'s add this last sentence in which we talk about nothing special.''', 169 | 'Thank you so much for your support!!' 170 | ] 171 | 172 | ) 173 | 174 | def hparams_debug_string(): 175 | values = hparams.values() 176 | hp = [' %s: %s' % (name, values[name]) for name in sorted(values) if name != 'sentences'] 177 | return 'Hyperparameters:\n' + '\n'.join(hp) 178 | -------------------------------------------------------------------------------- /preprocess.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from multiprocessing import cpu_count 3 | import os 4 | from tqdm import tqdm 5 | from datasets import preprocessor 6 | from hparams import hparams 7 | 8 | 9 | def preprocess(args, input_folders, out_dir): 10 | mel_dir = os.path.join(out_dir, 'mels') 11 | wav_dir = os.path.join(out_dir, 'audio') 12 | linear_dir = os.path.join(out_dir, 'linear') 13 | os.makedirs(mel_dir, exist_ok=True) 14 | os.makedirs(wav_dir, exist_ok=True) 15 | os.makedirs(linear_dir, exist_ok=True) 16 | metadata = preprocessor.build_from_path(input_folders, mel_dir, linear_dir, wav_dir, args.n_jobs, tqdm=tqdm) 17 | write_metadata(metadata, out_dir) 18 | 19 | def write_metadata(metadata, out_dir): 20 | with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f: 21 | for m in metadata: 22 | f.write('|'.join([str(x) for x in m]) + '\n') 23 | mel_frames = sum([int(m[4]) for m in metadata]) 24 | timesteps = sum([int(m[3]) for m in metadata]) 25 | sr = hparams.sample_rate 26 | hours = timesteps / sr / 3600 27 | print('Write {} utterances, {} mel frames, {} audio timesteps, ({:.2f} hours)'.format( 28 | len(metadata), mel_frames, timesteps, hours)) 29 | print('Max input length (text chars): {}'.format(max(len(m[5]) for m in metadata))) 30 | print('Max mel frames length: {}'.format(max(int(m[4]) for m in metadata))) 31 | print('Max audio timesteps length: {}'.format(max(m[3] for m in metadata))) 32 | 33 | def norm_data(args): 34 | print('Selecting data folders..') 35 | supported_datasets = ['LJSpeech-1.1', 'M-AILABS'] 36 | if args.dataset not in supported_datasets: 37 | raise ValueError('dataset value entered {} does not belong to supported datasets: {}'.format( 38 | args.dataset, supported_datasets)) 39 | 40 | if args.dataset == 'LJSpeech-1.1': 41 | return 
[os.path.join(args.base_dir, args.dataset)] 42 | 43 | 44 | if args.dataset == 'M-AILABS': 45 | supported_languages = ['en_US', 'en_UK', 'fr_FR', 'it_IT', 'de_DE', 'es_ES', 'ru_RU', 46 | 'uk_UK', 'pl_PL', 'nl_NL', 'pt_PT', 'fi_FI', 'se_SE', 'tr_TR', 'ar_SA'] 47 | if args.language not in supported_languages: 48 | raise ValueError('Please enter a supported language to use from M-AILABS dataset! \n{}'.format( 49 | supported_languages)) 50 | 51 | supported_voices = ['female', 'male', 'mix'] 52 | if args.voice not in supported_voices: 53 | raise ValueError('Please enter a supported voice option to use from M-AILABS dataset! \n{}'.format( 54 | supported_voices)) 55 | 56 | path = os.path.join(args.base_dir, args.language, 'by_book', args.voice) 57 | supported_readers = [e for e in os.listdir(path) if 'DS_Store' not in e] 58 | if args.reader not in supported_readers: 59 | raise ValueError('Please enter a valid reader for your language and voice settings! \n{}'.format( 60 | supported_readers)) 61 | 62 | path = os.path.join(path, args.reader) 63 | supported_books = [e for e in os.listdir(path) if e != '.DS_Store'] 64 | 65 | if args.merge_books: 66 | return [os.path.join(path, book) for book in supported_books] 67 | 68 | else: 69 | if args.book not in supported_books: 70 | raise ValueError('Please enter a valid book for your reader settings! \n{}'.format( 71 | supported_books)) 72 | 73 | return [os.path.join(path, args.book)] 74 | 75 | 76 | def run_preprocess(args): 77 | input_folders = norm_data(args) 78 | output_folder = os.path.join(args.base_dir, args.output) 79 | 80 | preprocess(args, input_folders, output_folder) 81 | 82 | 83 | def main(): 84 | print('initializing preprocessing..') 85 | parser = argparse.ArgumentParser() 86 | parser.add_argument('--base_dir', default='') 87 | parser.add_argument('--dataset', default='LJSpeech-1.1') 88 | parser.add_argument('--language', default='en_US') 89 | parser.add_argument('--voice', default='female') 90 | parser.add_argument('--reader', default='mary_ann') 91 | parser.add_argument('--merge_books', type=bool, default=False) 92 | parser.add_argument('--book', default='northandsouth') 93 | parser.add_argument('--output', default='training_data') 94 | parser.add_argument('--n_jobs', type=int, default=cpu_count()) 95 | args = parser.parse_args() 96 | 97 | run_preprocess(args) 98 | 99 | 100 | if __name__ == '__main__': 101 | main() -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | falcon==1.2.0 2 | inflect==0.2.5 3 | librosa==0.5.1 4 | matplotlib==2.0.2 5 | numpy==1.13.0 6 | scipy==1.0.0 7 | tqdm==4.11.2 8 | Unidecode==0.4.20 -------------------------------------------------------------------------------- /synthesize.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from tacotron.synthesize import tacotron_synthesize 3 | 4 | 5 | def main(): 6 | accepted_modes = ['eval', 'synthesis'] 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('--checkpoint', default='logs-Tacotron/pretrained/', help='Path to model checkpoint') 9 | parser.add_argument('--hparams', default='', 10 | help='Hyperparameter overrides as a comma-separated list of name=value pairs') 11 | parser.add_argument('--reference_audio', required=True) 12 | parser.add_argument('--model', default='Tacotron') 13 | parser.add_argument('--input_dir', default='training_data/', help='folder to contain inputs 
sentences/targets') 14 | parser.add_argument('--output_dir', default='output/', help='folder to contain synthesized mel spectrograms') 15 | parser.add_argument('--mode', default='synthesis', help='mode of run: can be one of {}'.format(accepted_modes)) 16 | parser.add_argument('--GTA', default=False, help='Ground truth aligned synthesis, defaults to True, only considered in synthesis mode') 17 | args = parser.parse_args() 18 | 19 | accepted_models = ['Tacotron', 'Wavenet'] 20 | 21 | if args.model not in accepted_models: 22 | raise ValueError('please enter a valid model to train: {}'.format(accepted_models)) 23 | 24 | if args.mode not in accepted_modes: 25 | raise ValueError('accepted modes are: {}, found {}'.format(accepted_modes, args.mode)) 26 | 27 | if args.model == 'Tacotron': 28 | tacotron_synthesize(args) 29 | elif args.model == 'Wavenet': 30 | raise NotImplementedError('Wavenet is still a work in progress, thank you for your patience!') 31 | 32 | 33 | if __name__ == '__main__': 34 | main() -------------------------------------------------------------------------------- /tacotron/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------- /tacotron/feeder.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import threading 4 | import time 5 | import traceback 6 | from tacotron.utils.text import text_to_sequence 7 | from tacotron.utils.infolog import log 8 | import tensorflow as tf 9 | from hparams import hparams 10 | 11 | 12 | _batches_per_group = 32 13 | #pad input sequences with the 0 ( _ ) 14 | _pad = 0 15 | #explicitely setting the padding to a value that doesn't originally exist in the spectogram 16 | #to avoid any possible conflicts, without affecting the output range of the model too much 17 | if hparams.symmetric_mels: 18 | _target_pad = -(hparams.max_abs_value + .1) 19 | else: 20 | _target_pad = -0.1 21 | #Mark finished sequences with 1s 22 | _token_pad = 1. 23 | 24 | class Feeder(threading.Thread): 25 | """ 26 | Feeds batches of data into queue on a background thread. 27 | """ 28 | 29 | def __init__(self, coordinator, metadata_filename, hparams): 30 | super(Feeder, self).__init__() 31 | self._coord = coordinator 32 | self._hparams = hparams 33 | self._cleaner_names = [x.strip() for x in hparams.cleaners.split(',')] 34 | self._offset = 0 35 | 36 | # Load metadata 37 | self._mel_dir = os.path.join(os.path.dirname(metadata_filename), 'mels') 38 | self._linear_dir = os.path.join(os.path.dirname(metadata_filename), 'linear') 39 | with open(metadata_filename, encoding='utf-8') as f: 40 | self._metadata = [line.strip().split('|') for line in f] 41 | frame_shift_ms = hparams.hop_size / hparams.sample_rate 42 | hours = sum([int(x[4]) for x in self._metadata]) * frame_shift_ms / (3600) 43 | log('Loaded metadata for {} examples ({:.2f} hours)'.format(len(self._metadata), hours)) 44 | 45 | # Create placeholders for inputs and targets. Don't specify batch size because we want 46 | # to be able to feed different batch sizes at eval time. 
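		# Added note (not in the original code): the order of these placeholders must line up with the
		# dtype list passed to tf.FIFOQueue below and with the unpacking order after queue.dequeue(),
		# because enqueue/dequeue are purely positional.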
47 | self._placeholders = [ 48 | tf.placeholder(tf.int32, shape=(None, None), name='inputs'), 49 | tf.placeholder(tf.int32, shape=(None, ), name='input_lengths'), 50 | tf.placeholder(tf.float32, shape=(None, None, hparams.num_mels), name='mel_targets'), 51 | tf.placeholder(tf.int32,[None],'mel_lengths'), 52 | tf.placeholder(tf.float32, shape=(None, None), name='token_targets'), 53 | tf.placeholder(tf.float32, shape=(None, None, hparams.num_freq), name='linear_targets'), 54 | ] 55 | 56 | # Create queue for buffering data 57 | queue = tf.FIFOQueue(8, [tf.int32, tf.int32, tf.float32, tf.int32, tf.float32, tf.float32], name='input_queue') 58 | self._enqueue_op = queue.enqueue(self._placeholders) 59 | self.inputs, self.input_lengths, self.mel_targets, self.mel_lengths, self.token_targets, self.linear_targets = queue.dequeue() 60 | self.inputs.set_shape(self._placeholders[0].shape) 61 | self.input_lengths.set_shape(self._placeholders[1].shape) 62 | self.mel_targets.set_shape(self._placeholders[2].shape) 63 | self.mel_lengths.set_shape(self._placeholders[3].shape) 64 | self.token_targets.set_shape(self._placeholders[4].shape) 65 | self.linear_targets.set_shape(self._placeholders[5].shape) 66 | 67 | def start_in_session(self, session): 68 | self._session = session 69 | self.start() 70 | 71 | def run(self): 72 | try: 73 | while not self._coord.should_stop(): 74 | self._enqueue_next_group() 75 | except Exception as e: 76 | traceback.print_exc() 77 | self._coord.request_stop(e) 78 | 79 | def _enqueue_next_group(self): 80 | start = time.time() 81 | 82 | # Read a group of examples 83 | n = self._hparams.tacotron_batch_size 84 | r = self._hparams.outputs_per_step 85 | examples = [self._get_next_example() for i in range(n * _batches_per_group)] 86 | 87 | # Bucket examples based on similar output sequence length for efficiency 88 | examples.sort(key=lambda x: x[-1]) 89 | batches = [examples[i: i+n] for i in range(0, len(examples), n)] 90 | np.random.shuffle(batches) 91 | 92 | log('\nGenerated {} batches of size {} in {:.3f} sec'.format(len(batches), n, time.time() - start)) 93 | for batch in batches: 94 | feed_dict = dict(zip(self._placeholders, _prepare_batch(batch, r))) 95 | self._session.run(self._enqueue_op, feed_dict=feed_dict) 96 | 97 | def _get_next_example(self): 98 | """ 99 | Gets a single example (input, mel_target, token_target, linear_target, mel_length) from_ disk 100 | """ 101 | if self._offset >= len(self._metadata): 102 | self._offset = 0 103 | np.random.shuffle(self._metadata) 104 | meta = self._metadata[self._offset] 105 | self._offset += 1 106 | 107 | text = meta[5] 108 | 109 | input_data = np.asarray(text_to_sequence(text, self._cleaner_names), dtype=np.int32) 110 | mel_target = np.load(os.path.join(self._mel_dir, meta[1])) 111 | #Create parallel sequences containing zeros to represent a non finished sequence 112 | token_target = np.asarray([0.] 
* len(mel_target)) 113 | linear_target = np.load(os.path.join(self._linear_dir, meta[2])) 114 | return (input_data, mel_target, token_target, linear_target, len(mel_target)) 115 | 116 | 117 | def _prepare_batch(batch, outputs_per_step): 118 | np.random.shuffle(batch) 119 | inputs = _prepare_inputs([x[0] for x in batch]) 120 | input_lengths = np.asarray([len(x[0]) for x in batch], dtype=np.int32) 121 | mel_targets = _prepare_targets([x[1] for x in batch], outputs_per_step) 122 | mel_lengths= [len(x[1]) for x in batch] 123 | #Pad sequences with 1 to infer that the sequence is done 124 | token_targets = _prepare_token_targets([x[2] for x in batch], outputs_per_step) 125 | linear_targets = _prepare_targets([x[3] for x in batch], outputs_per_step) 126 | return (inputs, input_lengths, mel_targets, mel_lengths, token_targets, linear_targets) 127 | 128 | def _prepare_inputs(inputs): 129 | max_len = max([len(x) for x in inputs]) 130 | return np.stack([_pad_input(x, max_len) for x in inputs]) 131 | 132 | def _prepare_targets(targets, alignment): 133 | max_len = max([len(t) for t in targets]) + 1 134 | return np.stack([_pad_target(t, _round_up(max_len, alignment)) for t in targets]) 135 | 136 | def _prepare_token_targets(targets, alignment): 137 | max_len = max([len(t) for t in targets]) + 1 138 | return np.stack([_pad_token_target(t, _round_up(max_len, alignment)) for t in targets]) 139 | 140 | def _pad_input(x, length): 141 | return np.pad(x, (0, length - x.shape[0]), mode='constant', constant_values=_pad) 142 | 143 | def _pad_target(t, length): 144 | return np.pad(t, [(0, length - t.shape[0]), (0, 0)], mode='constant', constant_values=_target_pad) 145 | 146 | def _pad_token_target(t, length): 147 | return np.pad(t, (0, length - t.shape[0]), mode='constant', constant_values=_token_pad) 148 | 149 | def _round_up(x, multiple): 150 | remainder = x % multiple 151 | return x if remainder == 0 else x + multiple - remainder 152 | -------------------------------------------------------------------------------- /tacotron/models/Architecture_wrappers.py: -------------------------------------------------------------------------------- 1 | """A set of wrappers usefull for tacotron 2 architecture 2 | All notations and variable names were used in concordance with originial tensorflow implementation 3 | """ 4 | import collections 5 | import numpy as np 6 | import tensorflow as tf 7 | from tensorflow.contrib.rnn import RNNCell 8 | from tensorflow.python.framework import ops 9 | from tensorflow.python.ops import rnn_cell_impl 10 | from tensorflow.python.ops import check_ops 11 | from tensorflow.python.util import nest 12 | from tensorflow.python.ops import array_ops 13 | from tensorflow.python.ops import tensor_array_ops 14 | from tensorflow.python.framework import tensor_shape 15 | from tacotron.models.attention import _compute_attention 16 | 17 | _zero_state_tensors = rnn_cell_impl._zero_state_tensors 18 | 19 | 20 | 21 | class TacotronEncoderCell(RNNCell): 22 | """Tacotron 2 Encoder Cell 23 | Passes inputs through a stack of convolutional layers then through a bidirectional LSTM 24 | layer to predict the hidden representation vector (or memory) 25 | """ 26 | 27 | def __init__(self, convolutional_layers, lstm_layer): 28 | """Initialize encoder parameters 29 | 30 | Args: 31 | convolutional_layers: Encoder convolutional block class 32 | lstm_layer: encoder bidirectional lstm layer class 33 | """ 34 | super(TacotronEncoderCell, self).__init__() 35 | #Initialize encoder layers 36 | self._convolutions = 
convolutional_layers 37 | self._cell = lstm_layer 38 | 39 | def __call__(self, inputs, input_lengths=None): 40 | #Pass input sequence through a stack of convolutional layers 41 | conv_output = self._convolutions(inputs) 42 | 43 | #Extract hidden representation from encoder lstm cells 44 | hidden_representation = self._cell(conv_output, input_lengths) 45 | 46 | #For shape visualization 47 | self.conv_output_shape = conv_output.shape 48 | return hidden_representation 49 | 50 | 51 | class TacotronDecoderCellState( 52 | collections.namedtuple("TacotronDecoderCellState", 53 | ("cell_state", "attention", "time", "alignments", 54 | "alignment_history", "finished"))): 55 | """`namedtuple` storing the state of a `TacotronDecoderCell`. 56 | Contains: 57 | - `cell_state`: The state of the wrapped `RNNCell` at the previous time 58 | step. 59 | - `attention`: The attention emitted at the previous time step. 60 | - `time`: int32 scalar containing the current time step. 61 | - `alignments`: A single or tuple of `Tensor`(s) containing the alignments 62 | emitted at the previous time step for each attention mechanism. 63 | - `alignment_history`: a single or tuple of `TensorArray`(s) 64 | containing alignment matrices from all time steps for each attention 65 | mechanism. Call `stack()` on each to convert to a `Tensor`. 66 | """ 67 | def replace(self, **kwargs): 68 | """Clones the current state while overwriting components provided by kwargs. 69 | """ 70 | return super(TacotronDecoderCellState, self)._replace(**kwargs) 71 | 72 | class TacotronDecoderCell(RNNCell): 73 | """Tactron 2 Decoder Cell 74 | Decodes encoder output and previous mel frames into next r frames 75 | 76 | Decoder Step i: 77 | 1) Prenet to compress last output information 78 | 2) Concat compressed inputs with previous context vector (input feeding) * 79 | 3) Decoder RNN (actual decoding) to predict current state s_{i} * 80 | 4) Compute new context vector c_{i} based on s_{i} and a cumulative sum of previous alignments * 81 | 5) Predict new output y_{i} using s_{i} and c_{i} (concatenated) 82 | 6) Predict output ys_{i} using s_{i} and c_{i} (concatenated) 83 | 84 | * : This is typically taking a vanilla LSTM, wrapping it using tensorflow's attention wrapper, 85 | and wrap that with the prenet before doing an input feeding, and with the prediction layer 86 | that uses RNN states to project on output space. Actions marked with (*) can be replaced with 87 | tensorflow's attention wrapper call if it was using cumulative alignments instead of previous alignments only. 
88 | """ 89 | 90 | def __init__(self, prenet, attention_mechanism, rnn_cell, frame_projection, stop_projection, mask_finished=False): 91 | """Initialize decoder parameters 92 | 93 | Args: 94 | prenet: A tensorflow fully connected layer acting as the decoder pre-net 95 | attention_mechanism: A _BaseAttentionMechanism instance, usefull to 96 | learn encoder-decoder alignments 97 | rnn_cell: Instance of RNNCell, main body of the decoder 98 | frame_projection: tensorflow fully connected layer with r * num_mels output units 99 | stop_projection: tensorflow fully connected layer, expected to project to a scalar 100 | and through a sigmoid activation 101 | mask_finished: Boolean, Whether to mask decoder frames after the 102 | """ 103 | super(TacotronDecoderCell, self).__init__() 104 | #Initialize decoder layers 105 | self._prenet = prenet 106 | self._attention_mechanism = attention_mechanism 107 | self._cell = rnn_cell 108 | self._frame_projection = frame_projection 109 | self._stop_projection = stop_projection 110 | 111 | self._mask_finished = mask_finished 112 | self._attention_layer_size = self._attention_mechanism.values.get_shape()[-1].value 113 | 114 | def _batch_size_checks(self, batch_size, error_message): 115 | return [check_ops.assert_equal(batch_size, 116 | self._attention_mechanism.batch_size, 117 | message=error_message)] 118 | 119 | @property 120 | def output_size(self): 121 | return self._frame_projection.shape 122 | 123 | @property 124 | def state_size(self): 125 | """The `state_size` property of `TacotronDecoderCell`. 126 | 127 | Returns: 128 | An `TacotronDecoderCell` tuple containing shapes used by this object. 129 | """ 130 | return TacotronDecoderCellState( 131 | cell_state=self._cell._cell.state_size, 132 | time=tensor_shape.TensorShape([]), 133 | attention=self._attention_layer_size, 134 | alignments=self._attention_mechanism.alignments_size, 135 | alignment_history=(), 136 | finished=()) 137 | 138 | def zero_state(self, batch_size, dtype): 139 | """Return an initial (zero) state tuple for this `AttentionWrapper`. 140 | 141 | Args: 142 | batch_size: `0D` integer tensor: the batch size. 143 | dtype: The internal state data type. 144 | Returns: 145 | An `TacotronDecoderCellState` tuple containing zeroed out tensors and, 146 | possibly, empty `TensorArray` objects. 147 | Raises: 148 | ValueError: (or, possibly at runtime, InvalidArgument), if 149 | `batch_size` does not match the output size of the encoder passed 150 | to the wrapper object at initialization time. 
151 | """ 152 | with ops.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]): 153 | cell_state = self._cell._cell.zero_state(batch_size, dtype) 154 | error_message = ( 155 | "When calling zero_state of TacotronDecoderCell %s: " % self._base_name + 156 | "Non-matching batch sizes between the memory " 157 | "(encoder output) and the requested batch size.") 158 | with ops.control_dependencies( 159 | self._batch_size_checks(batch_size, error_message)): 160 | cell_state = nest.map_structure( 161 | lambda s: array_ops.identity(s, name="checked_cell_state"), 162 | cell_state) 163 | return TacotronDecoderCellState( 164 | cell_state=cell_state, 165 | time=array_ops.zeros([], dtype=tf.int32), 166 | attention=_zero_state_tensors(self._attention_layer_size, batch_size, 167 | dtype), 168 | alignments=self._attention_mechanism.initial_alignments(batch_size, dtype), 169 | alignment_history=tensor_array_ops.TensorArray(dtype=dtype, size=0, 170 | dynamic_size=True), 171 | finished=tf.reshape(tf.tile([0.0], [batch_size]), [-1, 1])) 172 | 173 | def __call__(self, inputs, state): 174 | #Information bottleneck (essential for learning attention) 175 | prenet_output = self._prenet(inputs) 176 | 177 | #Concat context vector and prenet output to form LSTM cells input (input feeding) 178 | LSTM_input = tf.concat([prenet_output, state.attention], axis=-1) 179 | 180 | #Unidirectional LSTM layers 181 | LSTM_output, next_cell_state = self._cell(LSTM_input, state.cell_state) 182 | 183 | #Compute the attention (context) vector and alignments using 184 | #the new decoder cell hidden state as query vector 185 | #and cumulative alignments to extract location features 186 | #The choice of the new cell hidden state (s_{i}) of the last 187 | #decoder RNN Cell is based on Luong et Al. (2015): 188 | #https://arxiv.org/pdf/1508.04025.pdf 189 | previous_alignments = state.alignments 190 | previous_alignment_history = state.alignment_history 191 | context_vector, alignments, cumulated_alignments = _compute_attention(self._attention_mechanism, 192 | LSTM_output, 193 | previous_alignments, 194 | attention_layer=None) 195 | 196 | #Concat LSTM outputs and context vector to form projections inputs 197 | projections_input = tf.concat([LSTM_output, context_vector], axis=-1) 198 | 199 | #Compute predicted frames and predicted 200 | cell_outputs = self._frame_projection(projections_input) 201 | stop_tokens = self._stop_projection(projections_input) 202 | 203 | #mask attention computed for decoding steps where sequence is already finished 204 | #this is purely for visual purposes and will not affect the training of the model 205 | #we don't pay much attention to the alignments of the output paddings if we impute 206 | #the decoder outputs beyond the end of sequence. 
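		#(added clarification, not in the original comments: when an example's `state.finished` entry is 1.0,
		# the tf.where below replaces its alignment row with zeros; this only affects the alignment history
		# kept for plotting — the attention context and cumulative alignments used for decoding are untouched)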
207 | if self._mask_finished: 208 | finished = tf.cast(state.finished * tf.ones(tf.shape(alignments)), tf.bool) 209 | mask = tf.zeros(tf.shape(alignments)) 210 | masked_alignments = tf.where(finished, mask, alignments) 211 | else: 212 | masked_alignments = alignments 213 | 214 | #Save alignment history 215 | alignment_history = previous_alignment_history.write(state.time, masked_alignments) 216 | 217 | #Prepare next decoder state 218 | next_state = TacotronDecoderCellState( 219 | time=state.time + 1, 220 | cell_state=next_cell_state, 221 | attention=context_vector, 222 | alignments=cumulated_alignments, 223 | alignment_history=alignment_history, 224 | finished=state.finished) 225 | 226 | return (cell_outputs, stop_tokens), next_state 227 | -------------------------------------------------------------------------------- /tacotron/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .tacotron import Tacotron 2 | 3 | 4 | def create_model(name, hparams): 5 | if name == 'Tacotron': 6 | return Tacotron(hparams) 7 | else: 8 | raise Exception('Unknown model: ' + name) 9 | -------------------------------------------------------------------------------- /tacotron/models/attention.py: -------------------------------------------------------------------------------- 1 | """Attention file for location based attention (compatible with tensorflow attention wrapper)""" 2 | 3 | import tensorflow as tf 4 | from tensorflow.contrib.seq2seq.python.ops.attention_wrapper import BahdanauAttention 5 | from tensorflow.python.ops import nn_ops 6 | from tensorflow.python.layers import core as layers_core 7 | from tensorflow.python.ops import array_ops 8 | from tensorflow.python.ops import variable_scope 9 | from tensorflow.python.ops import math_ops 10 | from hparams import hparams 11 | 12 | 13 | #From https://github.com/tensorflow/tensorflow/blob/r1.7/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py 14 | def _compute_attention(attention_mechanism, cell_output, attention_state, 15 | attention_layer): 16 | """Computes the attention and alignments for a given attention_mechanism.""" 17 | alignments, next_attention_state = attention_mechanism( 18 | cell_output, state=attention_state) 19 | 20 | # Reshape from [batch_size, memory_time] to [batch_size, 1, memory_time] 21 | expanded_alignments = array_ops.expand_dims(alignments, 1) 22 | # Context is the inner product of alignments and values along the 23 | # memory time dimension. 24 | # alignments shape is 25 | # [batch_size, 1, memory_time] 26 | # attention_mechanism.values shape is 27 | # [batch_size, memory_time, memory_size] 28 | # the batched matmul is over memory_time, so the output shape is 29 | # [batch_size, 1, memory_size]. 30 | # we then squeeze out the singleton dim. 31 | context = math_ops.matmul(expanded_alignments, attention_mechanism.values) 32 | context = array_ops.squeeze(context, [1]) 33 | 34 | if attention_layer is not None: 35 | attention = attention_layer(array_ops.concat([cell_output, context], 1)) 36 | else: 37 | attention = context 38 | 39 | return attention, alignments, next_attention_state 40 | 41 | 42 | def _location_sensitive_score(W_query, W_fil, W_keys): 43 | """Impelements Bahdanau-style (cumulative) scoring function. 44 | This attention is described in: 45 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben- 46 | gio, “Attention-based models for speech recognition,” in Ad- 47 | vances in Neural Information Processing Systems, 2015, pp. 48 | 577–585. 
49 | 50 | ############################################################################# 51 | hybrid attention (content-based + location-based) 52 | f = F * α_{i-1} 53 | energy = dot(v_a, tanh(W_keys(h_enc) + W_query(h_dec) + W_fil(f) + b_a)) 54 | ############################################################################# 55 | 56 | Args: 57 | W_query: Tensor, shape '[batch_size, 1, attention_dim]' to compare to location features. 58 | W_location: processed previous alignments into location features, shape '[batch_size, max_time, attention_dim]' 59 | W_keys: Tensor, shape '[batch_size, max_time, attention_dim]', typically the encoder outputs. 60 | Returns: 61 | A '[batch_size, max_time]' attention score (energy) 62 | """ 63 | # Get the number of hidden units from the trailing dimension of keys 64 | dtype = W_query.dtype 65 | num_units = W_keys.shape[-1].value or array_ops.shape(W_keys)[-1] 66 | 67 | v_a = tf.get_variable( 68 | 'attention_variable', shape=[num_units], dtype=dtype, 69 | initializer=tf.contrib.layers.xavier_initializer()) 70 | b_a = tf.get_variable( 71 | 'attention_bias', shape=[num_units], dtype=dtype, 72 | initializer=tf.zeros_initializer()) 73 | 74 | return tf.reduce_sum(v_a * tf.tanh(W_keys + W_query + W_fil + b_a), [2]) 75 | 76 | def _smoothing_normalization(e): 77 | """Applies a smoothing normalization function instead of softmax 78 | Introduced in: 79 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben- 80 | gio, “Attention-based models for speech recognition,” in Ad- 81 | vances in Neural Information Processing Systems, 2015, pp. 82 | 577–585. 83 | 84 | ############################################################################ 85 | Smoothing normalization function 86 | a_{i, j} = sigmoid(e_{i, j}) / sum_j(sigmoid(e_{i, j})) 87 | ############################################################################ 88 | 89 | Args: 90 | e: matrix [batch_size, max_time(memory_time)]: expected to be energy (score) 91 | values of an attention mechanism 92 | Returns: 93 | matrix [batch_size, max_time]: [0, 1] normalized alignments with possible 94 | attendance to multiple memory time steps. 95 | """ 96 | return tf.nn.sigmoid(e) / tf.reduce_sum(tf.nn.sigmoid(e), axis=-1, keepdims=True) 97 | 98 | 99 | class LocationSensitiveAttention(BahdanauAttention): 100 | """Impelements Bahdanau-style (cumulative) scoring function. 101 | Usually referred to as "hybrid" attention (content-based + location-based) 102 | Extends the additive attention described in: 103 | "D. Bahdanau, K. Cho, and Y. Bengio, “Neural machine transla- 104 | tion by jointly learning to align and translate,” in Proceedings 105 | of ICLR, 2015." 106 | to use previous alignments as additional location features. 107 | 108 | This attention is described in: 109 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben- 110 | gio, “Attention-based models for speech recognition,” in Ad- 111 | vances in Neural Information Processing Systems, 2015, pp. 112 | 577–585. 113 | """ 114 | 115 | def __init__(self, 116 | num_units, 117 | memory, 118 | mask_encoder=True, 119 | memory_sequence_length=None, 120 | smoothing=False, 121 | cumulate_weights=True, 122 | name='LocationSensitiveAttention'): 123 | """Construct the Attention mechanism. 124 | Args: 125 | num_units: The depth of the query mechanism. 126 | memory: The memory to query; usually the output of an RNN encoder. This 127 | tensor should be shaped `[batch_size, max_time, ...]`. 128 | mask_encoder (optional): Boolean, whether to mask encoder paddings. 
129 | memory_sequence_length (optional): Sequence lengths for the batch entries 130 | in memory. If provided, the memory tensor rows are masked with zeros 131 | for values past the respective sequence lengths. Only relevant if mask_encoder = True. 132 | smoothing (optional): Boolean. Determines which normalization function to use. 133 | Default normalization function (probablity_fn) is softmax. If smoothing is 134 | enabled, we replace softmax with: 135 | a_{i, j} = sigmoid(e_{i, j}) / sum_j(sigmoid(e_{i, j})) 136 | Introduced in: 137 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben- 138 | gio, “Attention-based models for speech recognition,” in Ad- 139 | vances in Neural Information Processing Systems, 2015, pp. 140 | 577–585. 141 | This is mainly used if the model wants to attend to multiple inputs parts 142 | at the same decoding step. We probably won't be using it since multiple sound 143 | frames may depend from the same character, probably not the way around. 144 | Note: 145 | We still keep it implemented in case we want to test it. They used it in the 146 | paper in the context of speech recognition, where one phoneme may depend on 147 | multiple subsequent sound frames. 148 | name: Name to use when creating ops. 149 | """ 150 | #Create normalization function 151 | #Setting it to None defaults in using softmax 152 | normalization_function = _smoothing_normalization if (smoothing == True) else None 153 | memory_length = memory_sequence_length if (mask_encoder==True) else None 154 | super(LocationSensitiveAttention, self).__init__( 155 | num_units=num_units, 156 | memory=memory, 157 | memory_sequence_length=memory_length, 158 | probability_fn=normalization_function, 159 | name=name) 160 | 161 | self.location_convolution = tf.layers.Conv1D(filters=hparams.attention_filters, 162 | kernel_size=hparams.attention_kernel, padding='same', use_bias=False, 163 | name='location_features_convolution') 164 | self.location_layer = tf.layers.Dense(units=num_units, use_bias=False, 165 | dtype=tf.float32, name='location_features_layer') 166 | self._cumulate = cumulate_weights 167 | 168 | def __call__(self, query, state): 169 | """Score the query based on the keys and values. 170 | Args: 171 | query: Tensor of dtype matching `self.values` and shape 172 | `[batch_size, query_depth]`. 173 | state (previous alignments): Tensor of dtype matching `self.values` and shape 174 | `[batch_size, alignments_size]` 175 | (`alignments_size` is memory's `max_time`). 176 | Returns: 177 | alignments: Tensor of dtype matching `self.values` and shape 178 | `[batch_size, alignments_size]` (`alignments_size` is memory's 179 | `max_time`). 
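			next_state: cumulated alignments (previous alignments + current alignments)
				when cumulate_weights=True, otherwise the current alignments; fed back as
				`state` at the next decoding step, where the location features are
				computed from it.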
180 | """ 181 | previous_alignments = state 182 | with variable_scope.variable_scope(None, "Location_Sensitive_Attention", [query]): 183 | 184 | # processed_query shape [batch_size, query_depth] -> [batch_size, attention_dim] 185 | processed_query = self.query_layer(query) if self.query_layer else query 186 | # -> [batch_size, 1, attention_dim] 187 | processed_query = tf.expand_dims(processed_query, 1) 188 | 189 | # processed_location_features shape [batch_size, max_time, attention dimension] 190 | # [batch_size, max_time] -> [batch_size, max_time, 1] 191 | expanded_alignments = tf.expand_dims(previous_alignments, axis=2) 192 | # location features [batch_size, max_time, filters] 193 | f = self.location_convolution(expanded_alignments) 194 | # Projected location features [batch_size, max_time, attention_dim] 195 | processed_location_features = self.location_layer(f) 196 | 197 | # energy shape [batch_size, max_time] 198 | energy = _location_sensitive_score(processed_query, processed_location_features, self.keys) 199 | 200 | # alignments shape = energy shape = [batch_size, max_time] 201 | alignments = self._probability_fn(energy, previous_alignments) 202 | 203 | # Cumulate alignments 204 | if self._cumulate: 205 | next_state = alignments + previous_alignments 206 | else: 207 | next_state = alignments 208 | 209 | return alignments, next_state 210 | -------------------------------------------------------------------------------- /tacotron/models/custom_decoder.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import collections 6 | import tensorflow as tf 7 | 8 | from tensorflow.contrib.seq2seq.python.ops import decoder 9 | from tensorflow.contrib.seq2seq.python.ops import helper as helper_py 10 | from tensorflow.python.framework import ops 11 | from tensorflow.python.framework import tensor_shape 12 | from tensorflow.python.layers import base as layers_base 13 | from tensorflow.python.ops import rnn_cell_impl 14 | from tensorflow.python.util import nest 15 | from tacotron.models.helpers import TacoTrainingHelper, TacoTestHelper 16 | 17 | 18 | 19 | class CustomDecoderOutput( 20 | collections.namedtuple("CustomDecoderOutput", ("rnn_output", "token_output", "sample_id"))): 21 | pass 22 | 23 | 24 | class CustomDecoder(decoder.Decoder): 25 | """Custom sampling decoder. 26 | 27 | Allows for stop token prediction at inference time 28 | and returns equivalent loss in training time. 29 | 30 | Note: 31 | Only use this decoder with Tacotron 2 as it only accepts tacotron custom helpers 32 | """ 33 | 34 | def __init__(self, cell, helper, initial_state, output_layer=None): 35 | """Initialize CustomDecoder. 36 | Args: 37 | cell: An `RNNCell` instance. 38 | helper: A `Helper` instance. 39 | initial_state: A (possibly nested tuple of...) tensors and TensorArrays. 40 | The initial state of the RNNCell. 41 | output_layer: (Optional) An instance of `tf.layers.Layer`, i.e., 42 | `tf.layers.Dense`. Optional layer to apply to the RNN output prior 43 | to storing the result or sampling. 44 | Raises: 45 | TypeError: if `cell`, `helper` or `output_layer` have an incorrect type. 
46 | """ 47 | if not rnn_cell_impl._like_rnncell(cell): # pylint: disable=protected-access 48 | raise TypeError("cell must be an RNNCell, received: %s" % type(cell)) 49 | if not isinstance(helper, helper_py.Helper): 50 | raise TypeError("helper must be a Helper, received: %s" % type(helper)) 51 | if (output_layer is not None 52 | and not isinstance(output_layer, layers_base.Layer)): 53 | raise TypeError( 54 | "output_layer must be a Layer, received: %s" % type(output_layer)) 55 | self._cell = cell 56 | self._helper = helper 57 | self._initial_state = initial_state 58 | self._output_layer = output_layer 59 | 60 | @property 61 | def batch_size(self): 62 | return self._helper.batch_size 63 | 64 | def _rnn_output_size(self): 65 | size = self._cell.output_size 66 | if self._output_layer is None: 67 | return size 68 | else: 69 | # To use layer's compute_output_shape, we need to convert the 70 | # RNNCell's output_size entries into shapes with an unknown 71 | # batch size. We then pass this through the layer's 72 | # compute_output_shape and read off all but the first (batch) 73 | # dimensions to get the output size of the rnn with the layer 74 | # applied to the top. 75 | output_shape_with_unknown_batch = nest.map_structure( 76 | lambda s: tensor_shape.TensorShape([None]).concatenate(s), 77 | size) 78 | layer_output_shape = self._output_layer._compute_output_shape( # pylint: disable=protected-access 79 | output_shape_with_unknown_batch) 80 | return nest.map_structure(lambda s: s[1:], layer_output_shape) 81 | 82 | @property 83 | def output_size(self): 84 | # Return the cell output and the id 85 | return CustomDecoderOutput( 86 | rnn_output=self._rnn_output_size(), 87 | token_output=self._helper.token_output_size, 88 | sample_id=self._helper.sample_ids_shape) 89 | 90 | @property 91 | def output_dtype(self): 92 | # Assume the dtype of the cell is the output_size structure 93 | # containing the input_state's first component's dtype. 94 | # Return that structure and the sample_ids_dtype from the helper. 95 | dtype = nest.flatten(self._initial_state)[0].dtype 96 | return CustomDecoderOutput( 97 | nest.map_structure(lambda _: dtype, self._rnn_output_size()), 98 | tf.float32, 99 | self._helper.sample_ids_dtype) 100 | 101 | def initialize(self, name=None): 102 | """Initialize the decoder. 103 | Args: 104 | name: Name scope for any created operations. 105 | Returns: 106 | `(finished, first_inputs, initial_state)`. 107 | """ 108 | return self._helper.initialize() + (self._initial_state,) 109 | 110 | def step(self, time, inputs, state, name=None): 111 | """Perform a custom decoding step. 112 | Enables for dyanmic prediction 113 | Args: 114 | time: scalar `int32` tensor. 115 | inputs: A (structure of) input tensors. 116 | state: A (structure of) state tensors and TensorArrays. 117 | name: Name scope for any created operations. 118 | Returns: 119 | `(outputs, next_state, next_inputs, finished)`. 
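			`outputs` is a CustomDecoderOutput(rnn_output, token_output, sample_id)
			namedtuple, so the predicted frames and the stop-token prediction are
			returned together at every decoding step.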
120 | """ 121 | with ops.name_scope(name, "CustomDecoderStep", (time, inputs, state)): 122 | #Call outputprojection wrapper cell 123 | (cell_outputs, stop_token), cell_state = self._cell(inputs, state) 124 | 125 | #apply output_layer (if existant) 126 | if self._output_layer is not None: 127 | cell_outputs = self._output_layer(cell_outputs) 128 | sample_ids = self._helper.sample( 129 | time=time, outputs=cell_outputs, state=cell_state) 130 | 131 | (finished, next_inputs, next_state) = self._helper.next_inputs( 132 | time=time, 133 | outputs=cell_outputs, 134 | state=cell_state, 135 | sample_ids=sample_ids, 136 | stop_token_prediction=stop_token) 137 | 138 | outputs = CustomDecoderOutput(cell_outputs, stop_token, sample_ids) 139 | return (outputs, next_state, next_inputs, finished) -------------------------------------------------------------------------------- /tacotron/models/helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from tensorflow.contrib.seq2seq import Helper 4 | from hparams import hparams 5 | 6 | 7 | class TacoTestHelper(Helper): 8 | def __init__(self, batch_size, output_dim, r): 9 | with tf.name_scope('TacoTestHelper'): 10 | self._batch_size = batch_size 11 | self._output_dim = output_dim 12 | self._reduction_factor = r 13 | 14 | @property 15 | def batch_size(self): 16 | return self._batch_size 17 | 18 | @property 19 | def token_output_size(self): 20 | return self._reduction_factor 21 | 22 | @property 23 | def sample_ids_shape(self): 24 | return tf.TensorShape([]) 25 | 26 | @property 27 | def sample_ids_dtype(self): 28 | return np.int32 29 | 30 | def initialize(self, name=None): 31 | return (tf.tile([False], [self._batch_size]), _go_frames(self._batch_size, self._output_dim)) 32 | 33 | def sample(self, time, outputs, state, name=None): 34 | return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them 35 | 36 | def next_inputs(self, time, outputs, state, sample_ids, stop_token_prediction, name=None): 37 | '''Stop on EOS. Otherwise, pass the last output as the next input and pass through state.''' 38 | with tf.name_scope('TacoTestHelper'): 39 | #A sequence is finished when the output probability is > 0.5 40 | finished = tf.cast(tf.round(stop_token_prediction), tf.bool) 41 | 42 | #Since we are predicting r frames at each step, two modes are 43 | #then possible: 44 | # Stop when the model outputs a p > 0.5 for any frame between r frames (Recommended) 45 | # Stop when the model outputs a p > 0.5 for all r frames (Safer) 46 | #Note: 47 | # With enough training steps, the model should be able to predict when to stop correctly 48 | # and the use of stop_at_any = True would be recommended. If however the model didn't 49 | # learn to stop correctly yet, (stops too soon) one could choose to use the safer option 50 | # to get a correct synthesis 51 | if hparams.stop_at_any: 52 | finished = tf.reduce_any(finished) #Recommended 53 | else: 54 | finished = tf.reduce_all(finished) #Safer option 55 | 56 | # Feed last output frame as next input. 
outputs is [N, output_dim * r] 57 | next_inputs = outputs[:, -self._output_dim:] 58 | next_state = state 59 | return (finished, next_inputs, next_state) 60 | 61 | 62 | class TacoTrainingHelper(Helper): 63 | def __init__(self, batch_size, targets, stop_targets, output_dim, r, ratio, gta): 64 | # inputs is [N, T_in], targets is [N, T_out, D] 65 | with tf.name_scope('TacoTrainingHelper'): 66 | self._batch_size = batch_size 67 | self._output_dim = output_dim 68 | self._reduction_factor = r 69 | self._ratio = tf.convert_to_tensor(ratio) 70 | self.gta = gta 71 | 72 | # Feed every r-th target frame as input 73 | self._targets = targets[:, r-1::r, :] 74 | 75 | if not gta: 76 | # Detect finished sequence using stop_targets 77 | self._stop_targets = stop_targets[:, r-1::r] 78 | else: 79 | # GTA synthesis 80 | self._lengths = tf.tile([tf.shape(self._targets)[1]], [self._batch_size]) 81 | 82 | @property 83 | def batch_size(self): 84 | return self._batch_size 85 | 86 | @property 87 | def token_output_size(self): 88 | return self._reduction_factor 89 | 90 | @property 91 | def sample_ids_shape(self): 92 | return tf.TensorShape([]) 93 | 94 | @property 95 | def sample_ids_dtype(self): 96 | return np.int32 97 | 98 | def initialize(self, name=None): 99 | return (tf.tile([False], [self._batch_size]), _go_frames(self._batch_size, self._output_dim)) 100 | 101 | def sample(self, time, outputs, state, name=None): 102 | return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them 103 | 104 | def next_inputs(self, time, outputs, state, sample_ids, stop_token_prediction, name=None): 105 | with tf.name_scope(name or 'TacoTrainingHelper'): 106 | if not self.gta: 107 | #mark sequences where stop_target == 1 as finished (for case of imputation) 108 | finished = tf.equal(self._stop_targets[:, time], [1.]) 109 | else: 110 | #GTA synthesis stop 111 | finished = (time + 1 >= self._lengths) 112 | 113 | next_inputs = tf.cond( 114 | tf.less(tf.random_uniform([], minval=0, maxval=1, dtype=tf.float32), self._ratio), 115 | lambda: self._targets[:, time, :], #Teacher-forcing: return true frame 116 | lambda: outputs[:,-self._output_dim:]) 117 | 118 | #Update the finished state 119 | next_state = state.replace(finished=tf.cast(tf.reshape(finished, [-1, 1]), tf.float32)) 120 | return (finished, next_inputs, next_state) 121 | 122 | 123 | def _go_frames(batch_size, output_dim): 124 | '''Returns all-zero frames for a given batch size and output dimension''' 125 | return tf.tile([[0.0]], [batch_size, output_dim]) -------------------------------------------------------------------------------- /tacotron/models/modules.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tacotron.models.zoneout_LSTM import ZoneoutLSTMCell 3 | from tensorflow.contrib.rnn import LSTMBlockCell 4 | from hparams import hparams 5 | from tensorflow.contrib.rnn import GRUCell 6 | from tacotron.utils.util import shape_list 7 | 8 | def VAE(inputs, input_lengths, filters, kernel_size, strides, num_units, is_training, scope): 9 | with tf.variable_scope(scope): 10 | outputs = ReferenceEncoder( 11 | inputs=inputs, 12 | input_lengths=input_lengths, 13 | filters=filters, 14 | kernel_size=kernel_size, 15 | strides=strides, 16 | is_training=is_training) 17 | 18 | mu = tf.layers.dense(outputs, num_units, name='mean') 19 | log_var = tf.layers.dense(outputs, num_units, name='vari') 20 | std = tf.exp(log_var) 21 | z = tf.random_normal(shape=[tf.shape(mu)[0], num_units], mean=0.0, stddev=1.0) 22 | 
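		#Reparameterization trick: draw eps ~ N(0, I) and return mu + std * eps so that
		#gradients can flow back through mu and log_var. Note that std = tf.exp(log_var)
		#above treats log_var as a log standard deviation, while the KL term in
		#tacotron.py (1 + log_var - mu^2 - exp(log_var)) treats it as a log variance;
		#with that KL one would usually write std = tf.exp(0.5 * log_var) here.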
output = mu + z * std 23 | return output, mu, log_var 24 | 25 | def ReferenceEncoder(inputs, input_lengths, filters, kernel_size, strides, is_training, scope='reference_encoder'): 26 | with tf.variable_scope(scope): 27 | reference_output = tf.expand_dims(inputs, axis=-1) 28 | for i, channel in enumerate(filters): 29 | reference_output = conv2d(reference_output, channel, kernel_size, 30 | strides, tf.nn.relu, is_training, 'conv2d_{}'.format(i)) 31 | 32 | shape = shape_list(reference_output) 33 | reference_output = tf.reshape(reference_output, shape[:-2] + [shape[2] * shape[3]]) 34 | 35 | #GRU 36 | encoder_outputs, encoder_state = tf.nn.dynamic_rnn( 37 | cell=GRUCell(128), 38 | inputs=reference_output, 39 | sequence_length=input_lengths, 40 | dtype=tf.float32 41 | ) 42 | return encoder_state 43 | 44 | 45 | def conv1d(inputs, kernel_size, channels, activation, is_training, scope): 46 | drop_rate = hparams.tacotron_dropout_rate 47 | 48 | with tf.variable_scope(scope): 49 | conv1d_output = tf.layers.conv1d( 50 | inputs, 51 | filters=channels, 52 | kernel_size=kernel_size, 53 | activation=None, 54 | padding='same') 55 | batched = tf.layers.batch_normalization(conv1d_output, training=is_training) 56 | activated = activation(batched) 57 | return tf.layers.dropout(activated, rate=drop_rate, training=is_training, 58 | name='dropout_{}'.format(scope)) 59 | 60 | 61 | def conv2d(inputs, filters, kernel_size, strides, activation, is_training, scope): 62 | with tf.variable_scope(scope): 63 | conv2d_output = tf.layers.conv2d( 64 | inputs, filters=filters, kernel_size=kernel_size, strides=strides, padding='same') 65 | 66 | batch_norm_output = tf.layers.batch_normalization( 67 | conv2d_output, training=is_training, name='batch_norm') 68 | if activation is not None: 69 | conv2d_output = activation(batch_norm_output) 70 | 71 | return conv2d_output 72 | 73 | class EncoderConvolutions: 74 | """Encoder convolutional layers used to find local dependencies in inputs characters. 75 | """ 76 | def __init__(self, is_training, kernel_size=(5, ), channels=512, activation=tf.nn.relu, scope=None): 77 | """ 78 | Args: 79 | is_training: Boolean, determines if the model is training or in inference to control dropout 80 | kernel_size: tuple or integer, The size of convolution kernels 81 | channels: integer, number of convolutional kernels 82 | activation: callable, postnet activation function for each convolutional layer 83 | scope: Postnet scope. 84 | """ 85 | super(EncoderConvolutions, self).__init__() 86 | self.is_training = is_training 87 | 88 | self.kernel_size = kernel_size 89 | self.channels = channels 90 | self.activation = activation 91 | self.scope = 'enc_conv_layers' if scope is None else scope 92 | 93 | def __call__(self, inputs): 94 | with tf.variable_scope(self.scope): 95 | x = inputs 96 | for i in range(hparams.enc_conv_num_layers): 97 | x = conv1d(x, self.kernel_size, self.channels, self.activation, 98 | self.is_training, 'conv_layer_{}_'.format(i + 1)+self.scope) 99 | return x 100 | 101 | 102 | class EncoderRNN: 103 | """Encoder bidirectional one layer LSTM 104 | """ 105 | def __init__(self, is_training, size=256, zoneout=0.1, scope=None): 106 | """ 107 | Args: 108 | is_training: Boolean, determines if the model is training or in inference to control zoneout 109 | size: integer, the number of LSTM units for each direction 110 | zoneout: the zoneout factor 111 | scope: EncoderRNN scope. 
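		Note: __call__ concatenates the forward and backward outputs, so the returned
		tensor has depth 2 * size (e.g. 512 when size=256).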
112 | """ 113 | super(EncoderRNN, self).__init__() 114 | self.is_training = is_training 115 | 116 | self.size = size 117 | self.zoneout = zoneout 118 | self.scope = 'encoder_LSTM' if scope is None else scope 119 | 120 | #Create LSTM Cell 121 | self._cell = ZoneoutLSTMCell(size, is_training, 122 | zoneout_factor_cell=zoneout, 123 | zoneout_factor_output=zoneout) 124 | 125 | def __call__(self, inputs, input_lengths): 126 | with tf.variable_scope(self.scope): 127 | outputs, (fw_state, bw_state) = tf.nn.bidirectional_dynamic_rnn( 128 | self._cell, 129 | self._cell, 130 | inputs, 131 | sequence_length=input_lengths, 132 | dtype=tf.float32) 133 | 134 | return tf.concat(outputs, axis=2) # Concat and return forward + backward outputs 135 | 136 | 137 | class Prenet: 138 | """Two fully connected layers used as an information bottleneck for the attention. 139 | """ 140 | def __init__(self, is_training, layer_sizes=[256, 256], activation=tf.nn.relu, scope=None): 141 | """ 142 | Args: 143 | is_training: Boolean, determines if the model is in training or inference to control dropout 144 | layer_sizes: list of integers, the length of the list represents the number of pre-net 145 | layers and the list values represent the layers number of units 146 | activation: callable, activation functions of the prenet layers. 147 | scope: Prenet scope. 148 | """ 149 | super(Prenet, self).__init__() 150 | self.drop_rate = hparams.tacotron_dropout_rate 151 | 152 | self.layer_sizes = layer_sizes 153 | self.is_training = is_training 154 | self.activation = activation 155 | 156 | self.scope = 'prenet' if scope is None else scope 157 | 158 | def __call__(self, inputs): 159 | x = inputs 160 | 161 | with tf.variable_scope(self.scope): 162 | for i, size in enumerate(self.layer_sizes): 163 | dense = tf.layers.dense(x, units=size, activation=self.activation, 164 | name='dense_{}'.format(i + 1)) 165 | #The paper discussed introducing diversity in generation at inference time 166 | #by using a dropout of 0.5 only in prenet layers (in both training and inference). 
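				#Hence training=True in the dropout call below: prenet dropout stays active
				#at synthesis time as well, so two runs over the same text can produce
				#slightly different outputs.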
167 | x = tf.layers.dropout(dense, rate=self.drop_rate, training=True, 168 | name='dropout_{}'.format(i + 1) + self.scope) 169 | return x 170 | 171 | 172 | class DecoderRNN: 173 | """Decoder two uni directional LSTM Cells 174 | """ 175 | def __init__(self, is_training, layers=2, size=1024, zoneout=0.1, scope=None): 176 | """ 177 | Args: 178 | is_training: Boolean, determines if the model is in training or inference to control zoneout 179 | layers: integer, the number of LSTM layers in the decoder 180 | size: integer, the number of LSTM units in each layer 181 | zoneout: the zoneout factor 182 | """ 183 | super(DecoderRNN, self).__init__() 184 | self.is_training = is_training 185 | 186 | self.layers = layers 187 | self.size = size 188 | self.zoneout = zoneout 189 | self.scope = 'decoder_rnn' if scope is None else scope 190 | 191 | #Create a set of LSTM layers 192 | self.rnn_layers = [ZoneoutLSTMCell(size, is_training, 193 | zoneout_factor_cell=zoneout, 194 | zoneout_factor_output=zoneout) for i in range(layers)] 195 | 196 | self._cell = tf.contrib.rnn.MultiRNNCell(self.rnn_layers, state_is_tuple=True) 197 | 198 | def __call__(self, inputs, states): 199 | with tf.variable_scope(self.scope): 200 | return self._cell(inputs, states) 201 | 202 | 203 | class FrameProjection: 204 | """Projection layer to r * num_mels dimensions or num_mels dimensions 205 | """ 206 | def __init__(self, shape=80, activation=None, scope=None): 207 | """ 208 | Args: 209 | shape: integer, dimensionality of output space (r*n_mels for decoder or n_mels for postnet) 210 | activation: callable, activation function 211 | scope: FrameProjection scope. 212 | """ 213 | super(FrameProjection, self).__init__() 214 | 215 | self.shape = shape 216 | self.activation = activation 217 | 218 | self.scope = 'Linear_projection' if scope is None else scope 219 | 220 | def __call__(self, inputs): 221 | with tf.variable_scope(self.scope): 222 | #If activation==None, this returns a simple Linear projection 223 | #else the projection will be passed through an activation function 224 | output = tf.layers.dense(inputs, units=self.shape, activation=self.activation, 225 | name='projection_{}'.format(self.scope)) 226 | 227 | return output 228 | 229 | 230 | class StopProjection: 231 | """Projection to a scalar and through a sigmoid activation 232 | """ 233 | def __init__(self, is_training, shape=hparams.outputs_per_step, activation=tf.nn.sigmoid, scope=None): 234 | """ 235 | Args: 236 | is_training: Boolean, to control the use of sigmoid function as it is useless to use it 237 | during training since it is integrate inside the sigmoid_crossentropy loss 238 | shape: integer, dimensionality of output space. Defaults to 1 (scalar) 239 | activation: callable, activation function. only used during inference 240 | scope: StopProjection scope. 
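		Note: the constructor default is shape=hparams.outputs_per_step, i.e. one
			stop-token logit per frame of a reduction group, not a single scalar.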
241 | """ 242 | super(StopProjection, self).__init__() 243 | self.is_training = is_training 244 | 245 | self.shape = shape 246 | self.activation = activation 247 | self.scope = 'stop_token_projection' if scope is None else scope 248 | 249 | def __call__(self, inputs): 250 | with tf.variable_scope(self.scope): 251 | output = tf.layers.dense(inputs, units=self.shape, 252 | activation=None, name='projection_{}'.format(self.scope)) 253 | 254 | #During training, don't use activation as it is integrated inside the sigmoid_cross_entropy loss function 255 | if self.is_training: 256 | return output 257 | return self.activation(output) 258 | 259 | 260 | class Postnet: 261 | """Postnet that takes final decoder output and fine tunes it (using vision on past and future frames) 262 | """ 263 | def __init__(self, is_training, kernel_size=(5, ), channels=512, activation=tf.nn.tanh, scope=None): 264 | """ 265 | Args: 266 | is_training: Boolean, determines if the model is training or in inference to control dropout 267 | kernel_size: tuple or integer, The size of convolution kernels 268 | channels: integer, number of convolutional kernels 269 | activation: callable, postnet activation function for each convolutional layer 270 | scope: Postnet scope. 271 | """ 272 | super(Postnet, self).__init__() 273 | self.is_training = is_training 274 | 275 | self.kernel_size = kernel_size 276 | self.channels = channels 277 | self.activation = activation 278 | self.scope = 'postnet_convolutions' if scope is None else scope 279 | 280 | def __call__(self, inputs): 281 | with tf.variable_scope(self.scope): 282 | x = inputs 283 | for i in range(hparams.postnet_num_layers - 1): 284 | x = conv1d(x, self.kernel_size, self.channels, self.activation, 285 | self.is_training, 'conv_layer_{}_'.format(i + 1)+self.scope) 286 | x = conv1d(x, self.kernel_size, self.channels, lambda _: _, self.is_training, 'conv_layer_{}_'.format(5)+self.scope) 287 | return x -------------------------------------------------------------------------------- /tacotron/models/tacotron.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tacotron.utils.symbols import symbols 3 | from tacotron.utils.infolog import log 4 | from tacotron.models.helpers import TacoTrainingHelper, TacoTestHelper 5 | from tacotron.models.modules import * 6 | from tacotron.models.zoneout_LSTM import ZoneoutLSTMCell 7 | from tensorflow.contrib.seq2seq import dynamic_decode 8 | from tacotron.models.Architecture_wrappers import TacotronEncoderCell, TacotronDecoderCell 9 | from tacotron.models.custom_decoder import CustomDecoder 10 | from tacotron.models.attention import LocationSensitiveAttention 11 | from tacotron.utils.util import shape_list, vae_weight 12 | 13 | 14 | class Tacotron(): 15 | """vae_tacotron2 Feature prediction Model. 16 | """ 17 | def __init__(self, hparams): 18 | self._hparams = hparams 19 | 20 | def initialize(self, inputs, input_lengths, mel_targets=None, mel_lengths=None, stop_token_targets=None, linear_targets=None, gta=False, reference_mel=None): 21 | """ 22 | Initializes the model for inference 23 | 24 | sets "mel_outputs" and "alignments" fields. 25 | 26 | Args: 27 | - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of 28 | steps in the input time series, and values are character IDs 29 | - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths 30 | of each sequence in inputs. 
31 | - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number 32 | of steps in the output time series, M is num_mels, and values are entries in the mel 33 | spectrogram. Only needed for training. 34 | """ 35 | if mel_targets is None and stop_token_targets is not None: 36 | raise ValueError('no mel targets were provided but token_targets were given') 37 | if mel_targets is not None and stop_token_targets is None and not gta: 38 | raise ValueError('Mel targets are provided without corresponding token_targets') 39 | if gta==False and self._hparams.predict_linear==True and linear_targets is None: 40 | raise ValueError('Model is set to use post processing to predict linear spectrograms in training but no linear targets given!') 41 | if gta and linear_targets is not None: 42 | raise ValueError('Linear spectrogram prediction is not supported in GTA mode!') 43 | 44 | with tf.variable_scope('inference') as scope: 45 | is_training = mel_targets is not None and not gta 46 | batch_size = tf.shape(inputs)[0] 47 | hp = self._hparams 48 | #GTA is only used for predicting mels to train Wavenet vocoder, so we ommit post processing when doing GTA synthesis 49 | post_condition = hp.predict_linear and not gta 50 | 51 | # Embeddings ==> [batch_size, sequence_length, embedding_dim] 52 | embedding_table = tf.get_variable( 53 | 'inputs_embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32) 54 | embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs) 55 | 56 | 57 | #Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units] 58 | encoder_cell = TacotronEncoderCell( 59 | EncoderConvolutions(is_training, kernel_size=hp.enc_conv_kernel_size, 60 | channels=hp.enc_conv_channels, scope='encoder_convolutions'), 61 | EncoderRNN(is_training, size=hp.encoder_lstm_units, 62 | zoneout=hp.tacotron_zoneout_rate, scope='encoder_LSTM')) 63 | 64 | encoder_outputs = encoder_cell(embedded_inputs, input_lengths) 65 | if hp.use_vae: 66 | if is_training: 67 | reference_mel = mel_targets 68 | 69 | style_embeddings, mu, log_var = VAE( 70 | inputs=reference_mel, 71 | input_lengths=mel_lengths, 72 | filters=hp.filters, 73 | kernel_size=(3, 3), 74 | strides=(2, 2), 75 | num_units=hp.vae_dim, 76 | is_training=is_training, 77 | scope='vae') 78 | 79 | self.mu = mu 80 | self.log_var = log_var 81 | style_embeddings = tf.layers.dense(style_embeddings, hp.encoder_depth) 82 | style_embeddings = tf.expand_dims(style_embeddings, axis=1) 83 | style_embeddings = tf.tile(style_embeddings, [1, shape_list(encoder_outputs)[1], 1]) # [N, T_in, 256] 84 | encoder_outputs = encoder_outputs + style_embeddings 85 | 86 | #For shape visualization purpose 87 | enc_conv_output_shape = encoder_cell.conv_output_shape 88 | 89 | 90 | #Decoder Parts 91 | #Attention Decoder Prenet 92 | prenet = Prenet(is_training, layer_sizes=hp.prenet_layers, scope='decoder_prenet') 93 | #Attention Mechanism 94 | attention_mechanism = LocationSensitiveAttention(hp.attention_dim, encoder_outputs, 95 | mask_encoder=hp.mask_encoder, memory_sequence_length=input_lengths, smoothing=hp.smoothing, 96 | cumulate_weights=hp.cumulative_weights) 97 | #Decoder LSTM Cells 98 | decoder_lstm = DecoderRNN(is_training, layers=hp.decoder_layers, 99 | size=hp.decoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope='decoder_lstm') 100 | #Frames Projection layer 101 | frame_projection = FrameProjection(hp.num_mels * hp.outputs_per_step, scope='linear_transform') 102 | # projection layer 103 | stop_projection = StopProjection(is_training, 
scope='stop_token_projection') 104 | 105 | 106 | #Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding) 107 | decoder_cell = TacotronDecoderCell( 108 | prenet, 109 | attention_mechanism, 110 | decoder_lstm, 111 | frame_projection, 112 | stop_projection, 113 | mask_finished=hp.mask_finished) 114 | 115 | 116 | #Define the helper for our decoder 117 | if (is_training or gta) == True: 118 | self.helper = TacoTrainingHelper(batch_size, mel_targets, stop_token_targets, 119 | hp.num_mels, hp.outputs_per_step, hp.tacotron_teacher_forcing_ratio, gta) 120 | else: 121 | self.helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step) 122 | 123 | 124 | #initial decoder state 125 | decoder_init_state = decoder_cell.zero_state(batch_size=batch_size, dtype=tf.float32) 126 | 127 | #Only use max iterations at synthesis time 128 | max_iters = hp.max_iters if not is_training else None 129 | 130 | #Decode 131 | (frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode( 132 | CustomDecoder(decoder_cell, self.helper, decoder_init_state), 133 | impute_finished=hp.impute_finished, 134 | maximum_iterations=max_iters) 135 | 136 | 137 | # Reshape outputs to be one output per entry 138 | #==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels] 139 | decoder_output = tf.reshape(frames_prediction, [batch_size, -1, hp.num_mels]) 140 | stop_token_prediction = tf.reshape(stop_token_prediction, [batch_size, -1]) 141 | 142 | 143 | #Postnet 144 | postnet = Postnet(is_training, kernel_size=hp.postnet_kernel_size, 145 | channels=hp.postnet_channels, scope='postnet_convolutions') 146 | 147 | #Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels] 148 | residual = postnet(decoder_output) 149 | 150 | #Project residual to same dimension as mel spectrogram 151 | #==> [batch_size, decoder_steps * r, num_mels] 152 | residual_projection = FrameProjection(hp.num_mels, scope='postnet_projection') 153 | projected_residual = residual_projection(residual) 154 | 155 | 156 | #Compute the mel spectrogram 157 | mel_outputs = decoder_output + projected_residual 158 | 159 | 160 | if post_condition: 161 | #Based on https://github.com/keithito/tacotron/blob/tacotron2-work-in-progress/models/tacotron.py 162 | #Post-processing Network to map mels to linear spectrograms using same architecture as the encoder 163 | post_processing_cell = TacotronEncoderCell( 164 | EncoderConvolutions(is_training, kernel_size=hp.enc_conv_kernel_size, 165 | channels=hp.enc_conv_channels, scope='post_processing_convolutions'), 166 | EncoderRNN(is_training, size=hp.encoder_lstm_units, 167 | zoneout=hp.tacotron_zoneout_rate, scope='post_processing_LSTM')) 168 | 169 | expand_outputs = post_processing_cell(mel_outputs) 170 | linear_outputs = FrameProjection(hp.num_freq, scope='post_processing_projection')(expand_outputs) 171 | 172 | #Grab alignments from the final decoder state 173 | alignments = tf.transpose(final_decoder_state.alignment_history.stack(), [1, 2, 0]) 174 | 175 | self.inputs = inputs 176 | self.input_lengths = input_lengths 177 | self.decoder_output = decoder_output 178 | self.alignments = alignments 179 | self.stop_token_prediction = stop_token_prediction 180 | self.stop_token_targets = stop_token_targets 181 | self.mel_outputs = mel_outputs 182 | self.reference_mel = reference_mel 183 | if post_condition: 184 | self.linear_outputs = linear_outputs 185 | self.linear_targets = linear_targets 186 | self.mel_targets = mel_targets 187 | self.mel_lengths = 
mel_lengths 188 | log('Initialized Tacotron model. Dimensions (? = dynamic shape): ') 189 | log(' embedding: {}'.format(embedded_inputs.shape)) 190 | log(' enc conv out: {}'.format(enc_conv_output_shape)) 191 | log(' encoder out: {}'.format(encoder_outputs.shape)) 192 | log(' decoder out: {}'.format(decoder_output.shape)) 193 | log(' residual out: {}'.format(residual.shape)) 194 | log(' projected residual out: {}'.format(projected_residual.shape)) 195 | log(' mel out: {}'.format(mel_outputs.shape)) 196 | if post_condition: 197 | log(' linear out: {}'.format(linear_outputs.shape)) 198 | log(' out: {}'.format(stop_token_prediction.shape)) 199 | 200 | 201 | def add_loss(self, global_step): 202 | '''Adds loss to the model. Sets "loss" field. initialize must have been called.''' 203 | with tf.variable_scope('loss') as scope: 204 | hp = self._hparams 205 | 206 | # Compute loss of predictions before postnet 207 | before = tf.losses.mean_squared_error(self.mel_targets, self.decoder_output) 208 | # Compute loss after postnet 209 | after = tf.losses.mean_squared_error(self.mel_targets, self.mel_outputs) 210 | #Compute loss (for learning dynamic generation stop) 211 | stop_token_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits( 212 | labels=self.stop_token_targets, 213 | logits=self.stop_token_prediction)) 214 | 215 | if hp.predict_linear: 216 | #Compute linear loss 217 | #From https://github.com/keithito/tacotron/blob/tacotron2-work-in-progress/models/tacotron.py 218 | #Prioritize loss for frequencies under 2000 Hz. 219 | l1 = tf.abs(self.linear_targets - self.linear_outputs) 220 | n_priority_freq = int(2000 / (hp.sample_rate * 0.5) * hp.num_mels) 221 | linear_loss = 0.5 * tf.reduce_mean(l1) + 0.5 * tf.reduce_mean(l1[:,:,0:n_priority_freq]) 222 | else: 223 | linear_loss = 0. 224 | 225 | # Compute the regularization weight 226 | if hp.tacotron_scale_regularization: 227 | reg_weight_scaler = 1. / (2 * hp.max_abs_value) if hp.symmetric_mels else 1. / (hp.max_abs_value) 228 | reg_weight = hp.tacotron_reg_weight * reg_weight_scaler 229 | else: 230 | reg_weight = hp.tacotron_reg_weight 231 | 232 | # Get all trainable variables 233 | all_vars = tf.trainable_variables() 234 | regularization = tf.add_n([tf.nn.l2_loss(v) for v in all_vars 235 | if not('bias' in v.name or 'Bias' in v.name)]) * reg_weight 236 | 237 | # Compute final loss term 238 | self.before_loss = before 239 | self.after_loss = after 240 | self.stop_token_loss = stop_token_loss 241 | self.regularization_loss = regularization 242 | self.linear_loss = linear_loss 243 | 244 | self.loss = self.before_loss + self.after_loss + self.stop_token_loss + self.regularization_loss + self.linear_loss 245 | 246 | if hp.use_vae: 247 | self.ki_loss = -0.5 * tf.reduce_sum(1 + self.log_var - tf.pow(self.mu, 2) - tf.exp(self.log_var)) 248 | vae_loss_weight = vae_weight(global_step) 249 | self.loss += self.ki_loss * vae_loss_weight 250 | 251 | 252 | def add_optimizer(self, global_step): 253 | '''Adds optimizer. Sets "gradients" and "optimize" fields. add_loss must have been called. 
254 | 255 | Args: 256 | global_step: int32 scalar Tensor representing current global step in training 257 | ''' 258 | with tf.variable_scope('optimizer') as scope: 259 | hp = self._hparams 260 | if hp.tacotron_decay_learning_rate: 261 | self.decay_steps = hp.tacotron_decay_steps 262 | self.decay_rate = hp.tacotron_decay_rate 263 | self.learning_rate = self._learning_rate_decay(hp.tacotron_initial_learning_rate, global_step) 264 | else: 265 | self.learning_rate = tf.convert_to_tensor(hp.tacotron_initial_learning_rate) 266 | 267 | optimizer = tf.train.AdamOptimizer(self.learning_rate, hp.tacotron_adam_beta1, 268 | hp.tacotron_adam_beta2, hp.tacotron_adam_epsilon) 269 | gradients, variables = zip(*optimizer.compute_gradients(self.loss)) 270 | self.gradients = gradients 271 | #Just for causion 272 | #https://github.com/Rayhane-mamah/Tacotron-2/issues/11 273 | clipped_gradients, _ = tf.clip_by_global_norm(gradients, 0.5) 274 | 275 | # Add dependency on UPDATE_OPS; otherwise batchnorm won't work correctly. See: 276 | # https://github.com/tensorflow/tensorflow/issues/1122 277 | with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)): 278 | self.optimize = optimizer.apply_gradients(zip(clipped_gradients, variables), 279 | global_step=global_step) 280 | 281 | def _learning_rate_decay(self, init_lr, global_step): 282 | ################################################################# 283 | # Narrow Exponential Decay: 284 | 285 | # Phase 1: lr = 1e-3 286 | # We only start learning rate decay after 50k steps 287 | 288 | # Phase 2: lr in ]1e-3, 1e-5[ 289 | # decay reach minimal value at step 300k 290 | 291 | # Phase 3: lr = 1e-5 292 | # clip by minimal learning rate value (step > 300k) 293 | ################################################################# 294 | hp = self._hparams 295 | 296 | #Compute natural exponential decay 297 | lr = tf.train.exponential_decay(init_lr, 298 | global_step - hp.tacotron_start_decay, #lr = 1e-3 at step 50k 299 | self.decay_steps, 300 | self.decay_rate, #lr = 1e-5 around step 300k 301 | name='exponential_decay') 302 | 303 | 304 | #clip learning rate by max and min values (initial and final values) 305 | return tf.minimum(tf.maximum(lr, hp.tacotron_final_learning_rate), init_lr) 306 | -------------------------------------------------------------------------------- /tacotron/models/zoneout_LSTM.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from tensorflow.python.ops.rnn_cell import RNNCell 4 | 5 | 6 | # Thanks to 'initializers_enhanced.py' of Project RNN Enhancement: 7 | # https://github.com/nicolas-ivanov/Seq2Seq_Upgrade_TensorFlow/blob/master/rnn_enhancement/initializers_enhanced.py 8 | def orthogonal_initializer(scale=1.0): 9 | def _initializer(shape, dtype=tf.float32): 10 | flat_shape = (shape[0], np.prod(shape[1:])) 11 | a = np.random.normal(0.0, 1.0, flat_shape) 12 | u, _, v = np.linalg.svd(a, full_matrices=False) 13 | q = u if u.shape == flat_shape else v 14 | q = q.reshape(shape) 15 | return tf.constant(scale * q[:shape[0], :shape[1]], dtype=tf.float32) 16 | return _initializer 17 | 18 | 19 | class ZoneoutLSTMCell(RNNCell): 20 | """Zoneout Regularization for LSTM-RNN. 
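	Zoneout (Krueger et al., 2016, https://arxiv.org/abs/1606.01305) stochastically
	preserves the previous cell/hidden values instead of dropping activations:
		c_t = d_c * c_{t-1} + (1 - d_c) * c_t_candidate,  d_c ~ Bernoulli(zoneout_factor_cell)
	and likewise for h_t with zoneout_factor_output, matching the binary masks built
	in __call__ below.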
21 | """ 22 | 23 | def __init__(self, num_units, is_training, input_size=None, 24 | use_peepholes=False, cell_clip=None, 25 | #initializer=orthogonal_initializer(), 26 | initializer=tf.contrib.layers.xavier_initializer(), 27 | num_proj=None, proj_clip=None, ext_proj=None, 28 | forget_bias=1.0, 29 | state_is_tuple=True, 30 | activation=tf.tanh, 31 | zoneout_factor_cell=0.0, 32 | zoneout_factor_output=0.0, 33 | reuse=None): 34 | """Initialize the parameters for an LSTM cell. 35 | Args: 36 | num_units: int, The number of units in the LSTM cell. 37 | is_training: bool, set True when training. 38 | use_peepholes: bool, set True to enable diagonal/peephole 39 | connections. 40 | cell_clip: (optional) A float value, if provided the cell state 41 | is clipped by this value prior to the cell output activation. 42 | initializer: (optional) The initializer to use for the weight 43 | matrices. 44 | num_proj: (optional) int, The output dimensionality for 45 | the projection matrices. If None, no projection is performed. 46 | forget_bias: Biases of the forget gate are initialized by default 47 | to 1 in order to reduce the scale of forgetting at the beginning of 48 | the training. 49 | activation: Activation function of the inner states. 50 | """ 51 | if not state_is_tuple: 52 | tf.logging.warn( 53 | "%s: Using a concatenated state is slower and will soon be " 54 | "deprecated. Use state_is_tuple=True.", self) 55 | if input_size is not None: 56 | tf.logging.warn( 57 | "%s: The input_size parameter is deprecated.", self) 58 | 59 | if not (zoneout_factor_cell >= 0.0 and zoneout_factor_cell <= 1.0): 60 | raise ValueError( 61 | "Parameter zoneout_factor_cell must be in [0 1]") 62 | 63 | if not (zoneout_factor_output >= 0.0 and zoneout_factor_output <= 1.0): 64 | raise ValueError( 65 | "Parameter zoneout_factor_cell must be in [0 1]") 66 | 67 | self.num_units = num_units 68 | self.is_training = is_training 69 | self.use_peepholes = use_peepholes 70 | self.cell_clip = cell_clip 71 | self.num_proj = num_proj 72 | self.proj_clip = proj_clip 73 | self.initializer = initializer 74 | self.forget_bias = forget_bias 75 | self.state_is_tuple = state_is_tuple 76 | self.activation = activation 77 | self.zoneout_factor_cell = zoneout_factor_cell 78 | self.zoneout_factor_output = zoneout_factor_output 79 | 80 | if num_proj: 81 | self._state_size = ( 82 | tf.nn.rnn_cell.LSTMStateTuple(num_units, num_proj) 83 | if state_is_tuple else num_units + num_proj) 84 | self._output_size = num_proj 85 | else: 86 | self._state_size = ( 87 | tf.nn.rnn_cell.LSTMStateTuple(num_units, num_units) 88 | if state_is_tuple else 2 * num_units) 89 | self._output_size = num_units 90 | 91 | self._ext_proj = ext_proj 92 | 93 | @property 94 | def state_size(self): 95 | return self._state_size 96 | 97 | @property 98 | def output_size(self): 99 | if self._ext_proj is None: 100 | return self._output_size 101 | return self._ext_proj 102 | 103 | def __call__(self, inputs, state, scope=None): 104 | 105 | num_proj = self.num_units if self.num_proj is None else self.num_proj 106 | 107 | if self.state_is_tuple: 108 | (c_prev, h_prev) = state 109 | else: 110 | c_prev = tf.slice(state, [0, 0], [-1, self.num_units]) 111 | h_prev = tf.slice(state, [0, self.num_units], [-1, num_proj]) 112 | 113 | # c_prev : Tensor with the size of [batch_size, state_size] 114 | # h_prev : Tensor with the size of [batch_size, state_size/2] 115 | 116 | dtype = inputs.dtype 117 | input_size = inputs.get_shape().with_rank(2)[1] 118 | 119 | with tf.variable_scope(scope or 
type(self).__name__): 120 | if input_size.value is None: 121 | raise ValueError( 122 | "Could not infer input size from inputs.get_shape()[-1]") 123 | 124 | # i = input_gate, j = new_input, f = forget_gate, o = output_gate 125 | lstm_matrix = _linear([inputs, h_prev], 4 * self.num_units, True) 126 | i, j, f, o = tf.split(lstm_matrix, 4, 1) 127 | 128 | # diagonal connections 129 | if self.use_peepholes: 130 | w_f_diag = tf.get_variable( 131 | "W_F_diag", shape=[self.num_units], dtype=dtype) 132 | w_i_diag = tf.get_variable( 133 | "W_I_diag", shape=[self.num_units], dtype=dtype) 134 | w_o_diag = tf.get_variable( 135 | "W_O_diag", shape=[self.num_units], dtype=dtype) 136 | 137 | with tf.name_scope(None, "zoneout"): 138 | # make binary mask tensor for cell 139 | keep_prob_cell = tf.convert_to_tensor( 140 | self.zoneout_factor_cell, 141 | dtype=c_prev.dtype 142 | ) 143 | random_tensor_cell = keep_prob_cell 144 | random_tensor_cell += \ 145 | tf.random_uniform(tf.shape(c_prev), 146 | seed=None, dtype=c_prev.dtype) 147 | binary_mask_cell = tf.floor(random_tensor_cell) 148 | # 0 <-> 1 swap 149 | binary_mask_cell_complement = tf.ones(tf.shape(c_prev)) \ 150 | - binary_mask_cell 151 | 152 | # make binary mask tensor for output 153 | keep_prob_output = tf.convert_to_tensor( 154 | self.zoneout_factor_output, 155 | dtype=h_prev.dtype 156 | ) 157 | random_tensor_output = keep_prob_output 158 | random_tensor_output += \ 159 | tf.random_uniform(tf.shape(h_prev), 160 | seed=None, dtype=h_prev.dtype) 161 | binary_mask_output = tf.floor(random_tensor_output) 162 | # 0 <-> 1 swap 163 | binary_mask_output_complement = tf.ones(tf.shape(h_prev)) \ 164 | - binary_mask_output 165 | 166 | # apply zoneout for cell 167 | if self.use_peepholes: 168 | c_temp = c_prev * \ 169 | tf.sigmoid(f + self.forget_bias + 170 | w_f_diag * c_prev) + \ 171 | tf.sigmoid(i + w_i_diag * c_prev) * \ 172 | self.activation(j) 173 | if self.is_training and self.zoneout_factor_cell > 0.0: 174 | c = binary_mask_cell * c_prev + \ 175 | binary_mask_cell_complement * c_temp 176 | else: 177 | c = c_temp 178 | else: 179 | c_temp = c_prev * tf.sigmoid(f + self.forget_bias) + \ 180 | tf.sigmoid(i) * self.activation(j) 181 | if self.is_training and self.zoneout_factor_cell > 0.0: 182 | c = binary_mask_cell * c_prev + \ 183 | binary_mask_cell_complement * c_temp 184 | else: 185 | c = c_temp 186 | 187 | if self.cell_clip is not None: 188 | c = tf.clip_by_value(c, -self.cell_clip, self.cell_clip) 189 | 190 | # apply zoneout for output 191 | if self.use_peepholes: 192 | h_temp = tf.sigmoid(o + w_o_diag * c) * self.activation(c) 193 | if self.is_training and self.zoneout_factor_output > 0.0: 194 | h = binary_mask_output * h_prev + \ 195 | binary_mask_output_complement * h_temp 196 | else: 197 | h = h_temp 198 | else: 199 | h_temp = tf.sigmoid(o) * self.activation(c) 200 | if self.is_training and self.zoneout_factor_output > 0.0: 201 | h = binary_mask_output * h_prev + \ 202 | binary_mask_output_complement * h_temp 203 | else: 204 | h = h_temp 205 | 206 | # apply prejection 207 | if self.num_proj is not None: 208 | w_proj = tf.get_variable( 209 | "W_P", [self.num_units, num_proj], dtype=dtype) 210 | 211 | h = tf.matmul(h, w_proj) 212 | if self.proj_clip is not None: 213 | h = tf.clip_by_value(h, -self.proj_clip, self.proj_clip) 214 | 215 | new_state = (tf.nn.rnn_cell.LSTMStateTuple(c, h) 216 | if self.state_is_tuple else tf.concat(1, [c, h])) 217 | 218 | return h, new_state 219 | 220 | 221 | def _linear(args, output_size, bias, bias_start=0.0, 
scope=None): 222 | """Linear map: sum_i(args[i] * W[i]), where W[i] is a variable. 223 | Args: 224 | args: a 2D Tensor or a list of 2D, batch x n, Tensors. 225 | output_size: int, second dimension of W[i]. 226 | bias: boolean, whether to add a bias term or not. 227 | bias_start: starting value to initialize the bias; 0 by default. 228 | scope: VariableScope for the created subgraph; defaults to "Linear". 229 | Returns: 230 | A 2D Tensor with shape [batch x output_size] equal to 231 | sum_i(args[i] * W[i]), where W[i]s are newly created matrices. 232 | Raises: 233 | ValueError: if some of the arguments has unspecified or wrong shape. 234 | """ 235 | if args is None or (isinstance(args, (list, tuple)) and not args): 236 | raise ValueError("`args` must be specified") 237 | if not isinstance(args, (list, tuple)): 238 | args = [args] 239 | 240 | # Calculate the total size of arguments on dimension 1. 241 | total_arg_size = 0 242 | shapes = [a.get_shape().as_list() for a in args] 243 | for shape in shapes: 244 | if len(shape) != 2: 245 | raise ValueError( 246 | "Linear is expecting 2D arguments: %s" % str(shapes)) 247 | if not shape[1]: 248 | raise ValueError( 249 | "Linear expects shape[1] of arguments: %s" % str(shapes)) 250 | else: 251 | total_arg_size += shape[1] 252 | 253 | # Now the computation. 254 | with tf.variable_scope(scope or "Linear"): 255 | matrix = tf.get_variable("Matrix", [total_arg_size, output_size]) 256 | if len(args) == 1: 257 | res = tf.matmul(args[0], matrix) 258 | else: 259 | res = tf.matmul(tf.concat(args, 1), matrix) 260 | if not bias: 261 | return res 262 | bias_term = tf.get_variable( 263 | "Bias", [output_size], 264 | initializer=tf.constant_initializer(bias_start)) 265 | return res + bias_term -------------------------------------------------------------------------------- /tacotron/synthesize.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import re 4 | from hparams import hparams, hparams_debug_string 5 | from tacotron.synthesizer import Synthesizer 6 | import tensorflow as tf 7 | import time 8 | from tqdm import tqdm 9 | from tacotron.utils.audio import load_wav, melspectrogram 10 | 11 | def run_eval(args, checkpoint_path, output_dir): 12 | print(hparams_debug_string()) 13 | synth = Synthesizer() 14 | synth.load(checkpoint_path) 15 | eval_dir = os.path.join(output_dir, 'eval') 16 | log_dir = os.path.join(output_dir, 'logs-eval') 17 | wav = load_wav(args.reference_audio) 18 | reference_mel = melspectrogram(wav).transpose() 19 | #Create output path if it doesn't exist 20 | os.makedirs(eval_dir, exist_ok=True) 21 | os.makedirs(log_dir, exist_ok=True) 22 | os.makedirs(os.path.join(log_dir, 'wavs'), exist_ok=True) 23 | os.makedirs(os.path.join(log_dir, 'plots'), exist_ok=True) 24 | 25 | with open(os.path.join(eval_dir, 'map.txt'), 'w') as file: 26 | for i, text in enumerate(tqdm(hparams.sentences)): 27 | start = time.time() 28 | mel_filename = synth.synthesize(text, i+1, eval_dir, log_dir, None, reference_mel) 29 | 30 | file.write('{}|{}\n'.format(text, mel_filename)) 31 | print('synthesized mel spectrograms at {}'.format(eval_dir)) 32 | 33 | def run_synthesis(args, checkpoint_path, output_dir): 34 | metadata_filename = os.path.join(args.input_dir, 'train.txt') 35 | print(hparams_debug_string()) 36 | synth = Synthesizer() 37 | synth.load(checkpoint_path, gta=args.GTA) 38 | with open(metadata_filename, encoding='utf-8') as f: 39 | metadata = [line.strip().split('|') for line in f] 40 | 
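	#Each train.txt row is '|'-separated; as used below, field 0 is the audio filename,
	#field 1 the mel filename, field 4 the mel frame count and field 5 the input text.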
frame_shift_ms = hparams.hop_size / hparams.sample_rate 41 | hours = sum([int(x[4]) for x in metadata]) * frame_shift_ms / (3600) 42 | print('Loaded metadata for {} examples ({:.2f} hours)'.format(len(metadata), hours)) 43 | 44 | if args.GTA==True: 45 | synth_dir = os.path.join(output_dir, 'gta') 46 | else: 47 | synth_dir = os.path.join(output_dir, 'natural') 48 | 49 | #Create output path if it doesn't exist 50 | os.makedirs(synth_dir, exist_ok=True) 51 | 52 | print('starting synthesis') 53 | mel_dir = os.path.join(args.input_dir, 'mels') 54 | wav_dir = os.path.join(args.input_dir, 'audio') 55 | with open(os.path.join(synth_dir, 'map.txt'), 'w') as file: 56 | for i, meta in enumerate(tqdm(metadata)): 57 | text = meta[5] 58 | mel_filename = os.path.join(mel_dir, meta[1]) 59 | wav_filename = os.path.join(wav_dir, meta[0]) 60 | mel_output_filename = synth.synthesize(text, i+1, synth_dir, None, mel_filename, None) 61 | 62 | file.write('{}|{}|{}|{}\n'.format(text, mel_filename, mel_output_filename, wav_filename)) 63 | print('synthesized mel spectrograms at {}'.format(synth_dir)) 64 | 65 | def tacotron_synthesize(args): 66 | hparams.parse(args.hparams) 67 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 68 | output_dir = 'tacotron_' + args.output_dir 69 | 70 | try: 71 | checkpoint_path = tf.train.get_checkpoint_state(args.checkpoint).model_checkpoint_path 72 | print('loaded model at {}'.format(checkpoint_path)) 73 | except: 74 | raise AssertionError('Cannot restore checkpoint: {}, did you train a model?'.format(args.checkpoint)) 75 | 76 | if args.mode == 'eval': 77 | run_eval(args, checkpoint_path, output_dir) 78 | else: 79 | run_synthesis(args, checkpoint_path, output_dir) 80 | -------------------------------------------------------------------------------- /tacotron/synthesizer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import tensorflow as tf 4 | from hparams import hparams 5 | from librosa import effects 6 | from tacotron.models import create_model 7 | from tacotron.utils.text import text_to_sequence 8 | from tacotron.utils import plot 9 | from datasets import audio 10 | from datetime import datetime 11 | 12 | 13 | class Synthesizer: 14 | def load(self, checkpoint_path, gta=False, model_name='Tacotron'): 15 | print('Constructing model: %s' % model_name) 16 | inputs = tf.placeholder(tf.int32, [1, None], 'inputs') 17 | input_lengths = tf.placeholder(tf.int32, [1], 'input_lengths') 18 | 19 | with tf.variable_scope('model') as scope: 20 | self.model = create_model(model_name, hparams) 21 | if hparams.use_vae: 22 | ref_targets = tf.placeholder(tf.float32, [1, None, hparams.num_mels], 'ref_targets') 23 | if gta: 24 | targets = tf.placeholder(tf.float32, [1, None, hparams.num_mels], 'mel_targets') 25 | 26 | if hparams.use_vae: 27 | self.model.initialize(inputs, input_lengths, targets, gta=gta, reference_mel=ref_targets) 28 | else: 29 | self.model.initialize(inputs, input_lengths, targets, gta=gta) 30 | else: 31 | if hparams.use_vae: 32 | self.model.initialize(inputs, input_lengths, reference_mel=ref_targets) 33 | else: 34 | self.model.initialize(inputs, input_lengths) 35 | self.mel_outputs = self.model.mel_outputs 36 | self.alignment = self.model.alignments[0] 37 | 38 | self.gta = gta 39 | print('Loading checkpoint: %s' % checkpoint_path) 40 | self.session = tf.Session() 41 | self.session.run(tf.global_variables_initializer()) 42 | saver = tf.train.Saver() 43 | saver.restore(self.session, checkpoint_path) 44 | 45 | 46 |
def synthesize(self, text, index, out_dir, log_dir, mel_filename, reference_mel): 47 | cleaner_names = [x.strip() for x in hparams.cleaners.split(',')] 48 | seq = text_to_sequence(text, cleaner_names) 49 | feed_dict = { 50 | self.model.inputs: [np.asarray(seq, dtype=np.int32)], 51 | self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32) 52 | } 53 | 54 | if self.gta: 55 | feed_dict[self.model.mel_targets] = np.load(mel_filename).reshape(1, -1, 80) 56 | feed_dict[self.model.reference_mel] = np.load(mel_filename).reshape(1, -1, 80) 57 | elif hparams.use_vae: 58 | reference_mel = [np.asarray(reference_mel, dtype=np.float32)] 59 | feed_dict[self.model.reference_mel] = reference_mel 60 | 61 | 62 | if self.gta or not hparams.predict_linear: 63 | mels, alignment = self.session.run([self.mel_outputs, self.alignment], feed_dict=feed_dict) 64 | 65 | else: 66 | linear, mels, alignment = self.session.run([self.linear_outputs, self.mel_outputs, self.alignment], feed_dict=feed_dict) 67 | linear = linear.reshape(-1, hparams.num_freq) 68 | 69 | mels = mels.reshape(-1, hparams.num_mels) #Thanks to @imdatsolak for pointing this out 70 | 71 | # Write the spectrogram to disk 72 | # Note: outputs mel-spectrogram files and target ones have same names, just different folders 73 | mel_filename = os.path.join(out_dir, 'speech-mel-{:05d}.npy'.format(index)) 74 | np.save(mel_filename, mels, allow_pickle=False) 75 | 76 | if log_dir is not None: 77 | #save wav (mel -> wav) 78 | wav = audio.inv_mel_spectrogram(mels.T) 79 | audio.save_wav(wav, os.path.join(log_dir, 'wavs/speech-wav-{:05d}-mel.wav'.format(index))) 80 | 81 | if hparams.predict_linear: 82 | #save wav (linear -> wav) 83 | wav = audio.inv_linear_spectrogram(linear.T) 84 | audio.save_wav(wav, os.path.join(log_dir, 'wavs/speech-wav-{:05d}-linear.wav'.format(index))) 85 | 86 | #save alignments 87 | plot.plot_alignment(alignment, os.path.join(log_dir, 'plots/speech-alignment-{:05d}.png'.format(index)), 88 | info='{}'.format(text), split_title=True) 89 | 90 | #save mel spectrogram plot 91 | plot.plot_spectrogram(mels, os.path.join(log_dir, 'plots/speech-mel-{:05d}.png'.format(index)), 92 | info='{}'.format(text), split_title=True) 93 | 94 | return mel_filename 95 | -------------------------------------------------------------------------------- /tacotron/train.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from datetime import datetime 3 | import os 4 | import subprocess 5 | import time 6 | import tensorflow as tf 7 | import traceback 8 | import argparse 9 | 10 | from tacotron.feeder import Feeder 11 | from hparams import hparams, hparams_debug_string 12 | from tacotron.models import create_model 13 | from tacotron.utils.text import sequence_to_text 14 | from tacotron.utils import infolog, plot, ValueWindow 15 | from datasets import audio 16 | log = infolog.log 17 | 18 | 19 | def add_stats(model): 20 | with tf.variable_scope('stats') as scope: 21 | tf.summary.histogram('mel_outputs', model.mel_outputs) 22 | tf.summary.histogram('mel_targets', model.mel_targets) 23 | tf.summary.scalar('before_loss', model.before_loss) 24 | tf.summary.scalar('after_loss', model.after_loss) 25 | if hparams.predict_linear: 26 | tf.summary.scalar('linear loss', model.linear_loss) 27 | tf.summary.scalar('regularization_loss', model.regularization_loss) 28 | tf.summary.scalar('stop_token_loss', model.stop_token_loss) 29 | tf.summary.scalar('loss', model.loss) 30 | tf.summary.scalar('learning_rate', 
model.learning_rate) #control learning rate decay speed 31 | # gradient_norms = [tf.norm(grad) for grad in model.gradients] 32 | # tf.summary.histogram('gradient_norm', gradient_norms) 33 | # tf.summary.scalar('max_gradient_norm', tf.reduce_max(gradient_norms)) #visualize gradients (in case of explosion) 34 | if hparams.use_vae: 35 | tf.summary.scalar('ki_loss', model.ki_loss) 36 | return tf.summary.merge_all() 37 | 38 | def time_string(): 39 | return datetime.now().strftime('%Y-%m-%d %H:%M') 40 | 41 | def train(log_dir, args): 42 | save_dir = os.path.join(log_dir, 'pretrained/') 43 | checkpoint_path = os.path.join(save_dir, 'model.ckpt') 44 | input_path = os.path.join(args.base_dir, args.input) 45 | plot_dir = os.path.join(log_dir, 'plots') 46 | wav_dir = os.path.join(log_dir, 'wavs') 47 | mel_dir = os.path.join(log_dir, 'mel-spectrograms') 48 | os.makedirs(plot_dir, exist_ok=True) 49 | os.makedirs(wav_dir, exist_ok=True) 50 | os.makedirs(mel_dir, exist_ok=True) 51 | 52 | if hparams.predict_linear: 53 | linear_dir = os.path.join(log_dir, 'linear-spectrograms') 54 | os.makedirs(linear_dir, exist_ok=True) 55 | 56 | log('Checkpoint path: {}'.format(checkpoint_path)) 57 | log('Loading training data from: {}'.format(input_path)) 58 | log('Using model: {}'.format(args.model)) 59 | log(hparams_debug_string()) 60 | 61 | #Set up data feeder 62 | coord = tf.train.Coordinator() 63 | with tf.variable_scope('datafeeder') as scope: 64 | feeder = Feeder(coord, input_path, hparams) 65 | 66 | #Set up model: 67 | step_count = 0 68 | try: 69 | #simple text file to keep count of global step 70 | with open(os.path.join(log_dir, 'step_counter.txt'), 'r') as file: 71 | step_count = int(file.read()) 72 | except: 73 | print('no step_counter file found, assuming there is no saved checkpoint') 74 | 75 | global_step = tf.Variable(step_count, name='global_step', trainable=False) 76 | with tf.variable_scope('model') as scope: 77 | model = create_model(args.model, hparams) 78 | if hparams.predict_linear: 79 | model.initialize(feeder.inputs, feeder.input_lengths, feeder.mel_targets, feeder.mel_lengths, feeder.token_targets, feeder.linear_targets) 80 | else: 81 | model.initialize(feeder.inputs, feeder.input_lengths, feeder.mel_targets, feeder.mel_lengths, feeder.token_targets) 82 | model.add_loss(global_step) 83 | model.add_optimizer(global_step) 84 | stats = add_stats(model) 85 | 86 | #Book keeping 87 | step = 0 88 | time_window = ValueWindow(100) 89 | loss_window = ValueWindow(100) 90 | saver = tf.train.Saver(max_to_keep=5) 91 | 92 | #Memory allocation on the GPU as needed 93 | config = tf.ConfigProto() 94 | config.gpu_options.allow_growth = True 95 | 96 | #Train 97 | with tf.Session(config=config) as sess: 98 | try: 99 | summary_writer = tf.summary.FileWriter(log_dir, sess.graph) 100 | sess.run(tf.global_variables_initializer()) 101 | 102 | #saved model restoring 103 | if args.restore: 104 | #Restore saved model if the user requested it, Default = True. 
105 | try: 106 | checkpoint_state = tf.train.get_checkpoint_state(save_dir) 107 | except tf.errors.OutOfRangeError as e: 108 | log('Cannot restore checkpoint: {}'.format(e)) 109 | 110 | if (checkpoint_state and checkpoint_state.model_checkpoint_path): 111 | log('Loading checkpoint {}'.format(checkpoint_state.model_checkpoint_path)) 112 | saver.restore(sess, checkpoint_state.model_checkpoint_path) 113 | 114 | else: 115 | if not args.restore: 116 | log('Starting new training!') 117 | else: 118 | log('No model to load at {}'.format(save_dir)) 119 | 120 | #initializing feeder 121 | feeder.start_in_session(sess) 122 | 123 | #Training loop 124 | while not coord.should_stop(): 125 | start_time = time.time() 126 | step, loss, opt = sess.run([global_step, model.loss, model.optimize]) 127 | time_window.append(time.time() - start_time) 128 | loss_window.append(loss) 129 | message = 'Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]'.format( 130 | step, time_window.average, loss, loss_window.average) 131 | log(message, end='\r') 132 | 133 | if loss > 100 or np.isnan(loss): 134 | log('Loss exploded to {:.5f} at step {}'.format(loss, step)) 135 | raise Exception('Loss exploded') 136 | 137 | if step % args.summary_interval == 0: 138 | log('\nWriting summary at step: {}'.format(step)) 139 | summary_writer.add_summary(sess.run(stats), step) 140 | 141 | if step % args.checkpoint_interval == 0: 142 | with open(os.path.join(log_dir,'step_counter.txt'), 'w') as file: 143 | file.write(str(step)) 144 | log('Saving checkpoint to: {}-{}'.format(checkpoint_path, step)) 145 | saver.save(sess, checkpoint_path, global_step=step) 146 | 147 | log('Saving alignment, Mel-Spectrograms and griffin-lim inverted waveform..') 148 | if hparams.predict_linear: 149 | input_seq, mel_prediction, linear_prediction, alignment, target = sess.run([ 150 | model.inputs[0], 151 | model.mel_outputs[0], 152 | model.linear_outputs[0], 153 | model.alignments[0], 154 | model.mel_targets[0], 155 | ]) 156 | 157 | #save predicted linear spectrogram to disk (debug) 158 | linear_filename = 'linear-prediction-step-{}.npy'.format(step) 159 | np.save(os.path.join(linear_dir, linear_filename), linear_prediction.T, allow_pickle=False) 160 | 161 | #save griffin lim inverted wav for debug (linear -> wav) 162 | wav = audio.inv_linear_spectrogram(linear_prediction.T) 163 | audio.save_wav(wav, os.path.join(wav_dir, 'step-{}-waveform-linear.wav'.format(step))) 164 | 165 | else: 166 | input_seq, mel_prediction, alignment, target = sess.run([model.inputs[0], 167 | model.mel_outputs[0], 168 | model.alignments[0], 169 | model.mel_targets[0], 170 | ]) 171 | 172 | #save predicted mel spectrogram to disk (debug) 173 | mel_filename = 'mel-prediction-step-{}.npy'.format(step) 174 | np.save(os.path.join(mel_dir, mel_filename), mel_prediction.T, allow_pickle=False) 175 | 176 | #save griffin lim inverted wav for debug (mel -> wav) 177 | wav = audio.inv_mel_spectrogram(mel_prediction.T) 178 | audio.save_wav(wav, os.path.join(wav_dir, 'step-{}-waveform-mel.wav'.format(step))) 179 | 180 | #save alignment plot to disk (control purposes) 181 | plot.plot_alignment(alignment, os.path.join(plot_dir, 'step-{}-align.png'.format(step)), 182 | info='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, loss)) 183 | #save real mel-spectrogram plot to disk (control purposes) 184 | plot.plot_spectrogram(target, os.path.join(plot_dir, 'step-{}-real-mel-spectrogram.png'.format(step)), 185 | info='{}, {}, step={}, Real'.format(args.model, time_string(), step, 
loss)) 186 | #save predicted mel-spectrogram plot to disk (control purposes) 187 | plot.plot_spectrogram(mel_prediction, os.path.join(plot_dir, 'step-{}-pred-mel-spectrogram.png'.format(step)), 188 | info='{}, {}, step={}, loss={:.5}'.format(args.model, time_string(), step, loss)) 189 | log('Input at step {}: {}'.format(step, sequence_to_text(input_seq))) 190 | 191 | except Exception as e: 192 | log('Exiting due to exception: {}'.format(e), slack=True) 193 | traceback.print_exc() 194 | coord.request_stop(e) 195 | 196 | def tacotron_train(args): 197 | hparams.parse(args.hparams) 198 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = str(args.tf_log_level) 199 | run_name = args.name or args.model 200 | log_dir = os.path.join(args.base_dir, 'logs-{}'.format(run_name)) 201 | os.makedirs(log_dir, exist_ok=True) 202 | infolog.init(os.path.join(log_dir, 'Terminal_train_log'), run_name) 203 | train(log_dir, args) 204 | -------------------------------------------------------------------------------- /tacotron/utils/__init__.py: -------------------------------------------------------------------------------- 1 | class ValueWindow(): 2 | def __init__(self, window_size=100): 3 | self._window_size = window_size 4 | self._values = [] 5 | 6 | def append(self, x): 7 | self._values = self._values[-(self._window_size - 1):] + [x] 8 | 9 | @property 10 | def sum(self): 11 | return sum(self._values) 12 | 13 | @property 14 | def count(self): 15 | return len(self._values) 16 | 17 | @property 18 | def average(self): 19 | return self.sum / max(1, self.count) 20 | 21 | def reset(self): 22 | self._values = [] 23 | -------------------------------------------------------------------------------- /tacotron/utils/audio.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import librosa.filters 3 | import numpy as np 4 | from scipy import signal 5 | from hparams import hparams 6 | import tensorflow as tf 7 | 8 | 9 | def load_wav(path): 10 | return librosa.core.load(path, sr=hparams.sample_rate)[0] 11 | 12 | def save_wav(wav, path): 13 | wav *= 32767 / max(0.01, np.max(np.abs(wav))) 14 | librosa.output.write_wav(path, wav.astype(np.int16), hparams.sample_rate) 15 | 16 | def trim_silence(wav): 17 | '''Trim leading and trailing silence 18 | 19 | Useful for M-AILABS dataset if we choose to trim the extra 0.5 silences. 
20 | ''' 21 | return librosa.effects.trim(wav)[0] 22 | 23 | def preemphasis(x): 24 | return signal.lfilter([1, -hparams.preemphasis], [1], x) 25 | 26 | def inv_preemphasis(x): 27 | return signal.lfilter([1], [1, -hparams.preemphasis], x) 28 | 29 | def get_hop_size(): 30 | hop_size = hparams.hop_size 31 | if hop_size is None: 32 | assert hparams.frame_shift_ms is not None 33 | hop_size = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate) 34 | return hop_size 35 | 36 | def melspectrogram(wav): 37 | D = _stft(wav) 38 | S = _amp_to_db(_linear_to_mel(np.abs(D))) - hparams.ref_level_db 39 | 40 | if hparams.mel_normalization: 41 | return _normalize(S) 42 | return S 43 | 44 | 45 | def inv_mel_spectrogram(mel_spectrogram): 46 | '''Converts mel spectrogram to waveform using librosa''' 47 | if hparams.mel_normalization: 48 | D = _denormalize(mel_spectrogram) 49 | else: 50 | D = mel_spectrogram 51 | 52 | S = _mel_to_linear(_db_to_amp(D + hparams.ref_level_db)) # Convert back to linear 53 | 54 | return _griffin_lim(S ** hparams.power) 55 | 56 | def _griffin_lim(S): 57 | '''librosa implementation of Griffin-Lim 58 | Based on https://github.com/librosa/librosa/issues/434 59 | ''' 60 | angles = np.exp(2j * np.pi * np.random.rand(*S.shape)) 61 | S_complex = np.abs(S).astype(np.complex) 62 | y = _istft(S_complex * angles) 63 | for i in range(hparams.griffin_lim_iters): 64 | angles = np.exp(1j * np.angle(_stft(y))) 65 | y = _istft(S_complex * angles) 66 | return y 67 | 68 | def _stft(y): 69 | return librosa.stft(y=y, n_fft=hparams.fft_size, hop_length=get_hop_size()) 70 | 71 | def _istft(y): 72 | return librosa.istft(y, hop_length=get_hop_size()) 73 | 74 | 75 | # Conversions 76 | _mel_basis = None 77 | _inv_mel_basis = None 78 | 79 | def _linear_to_mel(spectogram): 80 | global _mel_basis 81 | if _mel_basis is None: 82 | _mel_basis = _build_mel_basis() 83 | return np.dot(_mel_basis, spectogram) 84 | 85 | def _mel_to_linear(mel_spectrogram): 86 | global _inv_mel_basis 87 | if _inv_mel_basis is None: 88 | _inv_mel_basis = np.linalg.pinv(_build_mel_basis()) 89 | return np.maximum(1e-10, np.dot(_inv_mel_basis, mel_spectrogram)) 90 | 91 | def _build_mel_basis(): 92 | assert hparams.fmax <= hparams.sample_rate // 2 93 | return librosa.filters.mel(hparams.sample_rate, hparams.fft_size, n_mels=hparams.num_mels, 94 | fmin=hparams.fmin, fmax=hparams.fmax) 95 | 96 | def _amp_to_db(x): 97 | min_level = np.exp(hparams.min_level_db / 20 * np.log(10)) 98 | return 20 * np.log10(np.maximum(min_level, x)) 99 | 100 | def _db_to_amp(x): 101 | return np.power(10.0, (x) * 0.05) 102 | 103 | def _normalize(S): 104 | if hparams.allow_clipping_in_normalization: 105 | if hparams.symmetric_mels: 106 | return np.clip((2 * hparams.max_abs_value) * ((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value, 107 | -hparams.max_abs_value, hparams.max_abs_value) 108 | else: 109 | return np.clip(hparams.max_abs_value * ((S - hparams.min_level_db) / (-hparams.min_level_db)), 0, hparams.max_abs_value) 110 | 111 | assert S.max() <= 0 and S.min() - hparams.min_level_db >= 0 112 | if hparams.symmetric_mels: 113 | return (2 * hparams.max_abs_value) * ((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value 114 | else: 115 | return hparams.max_abs_value * ((S - hparams.min_level_db) / (-hparams.min_level_db)) 116 | 117 | def _denormalize(D): 118 | if hparams.allow_clipping_in_normalization: 119 | if hparams.symmetric_mels: 120 | return (((np.clip(D, -hparams.max_abs_value, 121 | hparams.max_abs_value) + 
hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value)) 122 | + hparams.min_level_db) 123 | else: 124 | return ((np.clip(D, 0, hparams.max_abs_value) * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db) 125 | 126 | if hparams.symmetric_mels: 127 | return (((D + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value)) + hparams.min_level_db) 128 | else: 129 | return ((D * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db) 130 | -------------------------------------------------------------------------------- /tacotron/utils/cleaners.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Cleaners are transformations that run over the input text at both training and eval time. 3 | 4 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" 5 | hyperparameter. Some cleaners are English-specific. You'll typically want to use: 6 | 1. "english_cleaners" for English text 7 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using 8 | the Unidecode library (https://pypi.python.org/pypi/Unidecode) 9 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update 10 | the symbols in symbols.py to match your data). 11 | ''' 12 | 13 | import re 14 | from unidecode import unidecode 15 | from .numbers import normalize_numbers 16 | 17 | 18 | # Regular expression matching whitespace: 19 | _whitespace_re = re.compile(r'\s+') 20 | 21 | # List of (regular expression, replacement) pairs for abbreviations: 22 | _abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [ 23 | ('mrs', 'misess'), 24 | ('mr', 'mister'), 25 | ('dr', 'doctor'), 26 | ('st', 'saint'), 27 | ('co', 'company'), 28 | ('jr', 'junior'), 29 | ('maj', 'major'), 30 | ('gen', 'general'), 31 | ('drs', 'doctors'), 32 | ('rev', 'reverend'), 33 | ('lt', 'lieutenant'), 34 | ('hon', 'honorable'), 35 | ('sgt', 'sergeant'), 36 | ('capt', 'captain'), 37 | ('esq', 'esquire'), 38 | ('ltd', 'limited'), 39 | ('col', 'colonel'), 40 | ('ft', 'fort'), 41 | ]] 42 | 43 | 44 | def expand_abbreviations(text): 45 | for regex, replacement in _abbreviations: 46 | text = re.sub(regex, replacement, text) 47 | return text 48 | 49 | 50 | def expand_numbers(text): 51 | return normalize_numbers(text) 52 | 53 | 54 | def lowercase(text): 55 | '''lowercase input tokens. 
56 | ''' 57 | return text.lower() 58 | 59 | 60 | def collapse_whitespace(text): 61 | return re.sub(_whitespace_re, ' ', text) 62 | 63 | 64 | def convert_to_ascii(text): 65 | return unidecode(text) 66 | 67 | 68 | def basic_cleaners(text): 69 | '''Basic pipeline that lowercases and collapses whitespace without transliteration.''' 70 | text = lowercase(text) 71 | text = collapse_whitespace(text) 72 | return text 73 | 74 | 75 | def transliteration_cleaners(text): 76 | '''Pipeline for non-English text that transliterates to ASCII.''' 77 | text = convert_to_ascii(text) 78 | text = lowercase(text) 79 | text = collapse_whitespace(text) 80 | return text 81 | 82 | 83 | def english_cleaners(text): 84 | '''Pipeline for English text, including number and abbreviation expansion.''' 85 | text = convert_to_ascii(text) 86 | text = expand_numbers(text) 87 | text = expand_abbreviations(text) 88 | text = collapse_whitespace(text) 89 | return text 90 | -------------------------------------------------------------------------------- /tacotron/utils/cmudict.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | valid_symbols = [ 5 | 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2', 6 | 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2', 7 | 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY', 8 | 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1', 9 | 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 10 | 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 11 | 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH' 12 | ] 13 | 14 | _valid_symbol_set = set(valid_symbols) 15 | 16 | 17 | class CMUDict: 18 | '''Thin wrapper around CMUDict data. 
http://www.speech.cs.cmu.edu/cgi-bin/cmudict''' 19 | def __init__(self, file_or_path, keep_ambiguous=True): 20 | if isinstance(file_or_path, str): 21 | with open(file_or_path, encoding='latin-1') as f: 22 | entries = _parse_cmudict(f) 23 | else: 24 | entries = _parse_cmudict(file_or_path) 25 | if not keep_ambiguous: 26 | entries = {word: pron for word, pron in entries.items() if len(pron) == 1} 27 | self._entries = entries 28 | 29 | 30 | def __len__(self): 31 | return len(self._entries) 32 | 33 | 34 | def lookup(self, word): 35 | '''Returns list of ARPAbet pronunciations of the given word.''' 36 | return self._entries.get(word.upper()) 37 | 38 | 39 | 40 | _alt_re = re.compile(r'\([0-9]+\)') 41 | 42 | 43 | def _parse_cmudict(file): 44 | cmudict = {} 45 | for line in file: 46 | if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"): 47 | parts = line.split(' ') 48 | word = re.sub(_alt_re, '', parts[0]) 49 | pronunciation = _get_pronunciation(parts[1]) 50 | if pronunciation: 51 | if word in cmudict: 52 | cmudict[word].append(pronunciation) 53 | else: 54 | cmudict[word] = [pronunciation] 55 | return cmudict 56 | 57 | 58 | def _get_pronunciation(s): 59 | parts = s.strip().split(' ') 60 | for part in parts: 61 | if part not in _valid_symbol_set: 62 | return None 63 | return ' '.join(parts) 64 | -------------------------------------------------------------------------------- /tacotron/utils/infolog.py: -------------------------------------------------------------------------------- 1 | import atexit 2 | from datetime import datetime 3 | import json 4 | from threading import Thread 5 | from urllib.request import Request, urlopen 6 | 7 | 8 | _format = '%Y-%m-%d %H:%M:%S.%f' 9 | _file = None 10 | _run_name = None 11 | _slack_url = None 12 | 13 | 14 | def init(filename, run_name, slack_url=None): 15 | global _file, _run_name, _slack_url 16 | _close_logfile() 17 | _file = open(filename, 'a') 18 | _file = open(filename, 'a') 19 | _file.write('\n-----------------------------------------------------------------\n') 20 | _file.write('Starting new training run\n') 21 | _file.write('-----------------------------------------------------------------\n') 22 | _run_name = run_name 23 | _slack_url = slack_url 24 | 25 | 26 | def log(msg, end='\n', slack=False): 27 | print(msg, end=end) 28 | if _file is not None: 29 | _file.write('[%s] %s\n' % (datetime.now().strftime(_format)[:-3], msg)) 30 | if slack and _slack_url is not None: 31 | Thread(target=_send_slack, args=(msg,)).start() 32 | 33 | 34 | def _close_logfile(): 35 | global _file 36 | if _file is not None: 37 | _file.close() 38 | _file = None 39 | 40 | 41 | def _send_slack(msg): 42 | req = Request(_slack_url) 43 | req.add_header('Content-Type', 'application/json') 44 | urlopen(req, json.dumps({ 45 | 'username': 'tacotron', 46 | 'icon_emoji': ':taco:', 47 | 'text': '*%s*: %s' % (_run_name, msg) 48 | }).encode()) 49 | 50 | 51 | atexit.register(_close_logfile) -------------------------------------------------------------------------------- /tacotron/utils/numbers.py: -------------------------------------------------------------------------------- 1 | import inflect 2 | import re 3 | 4 | 5 | _inflect = inflect.engine() 6 | _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') 7 | _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') 8 | _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') 9 | _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') 10 | _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') 11 | _number_re = re.compile(r'[0-9]+') 12 | 13 | 14 | def 
_remove_commas(m): 15 | return m.group(1).replace(',', '') 16 | 17 | 18 | def _expand_decimal_point(m): 19 | return m.group(1).replace('.', ' point ') 20 | 21 | 22 | def _expand_dollars(m): 23 | match = m.group(1) 24 | parts = match.split('.') 25 | if len(parts) > 2: 26 | return match + ' dollars' # Unexpected format 27 | dollars = int(parts[0]) if parts[0] else 0 28 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 29 | if dollars and cents: 30 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 31 | cent_unit = 'cent' if cents == 1 else 'cents' 32 | return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) 33 | elif dollars: 34 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 35 | return '%s %s' % (dollars, dollar_unit) 36 | elif cents: 37 | cent_unit = 'cent' if cents == 1 else 'cents' 38 | return '%s %s' % (cents, cent_unit) 39 | else: 40 | return 'zero dollars' 41 | 42 | 43 | def _expand_ordinal(m): 44 | return _inflect.number_to_words(m.group(0)) 45 | 46 | 47 | def _expand_number(m): 48 | num = int(m.group(0)) 49 | if num > 1000 and num < 3000: 50 | if num == 2000: 51 | return 'two thousand' 52 | elif num > 2000 and num < 2010: 53 | return 'two thousand ' + _inflect.number_to_words(num % 100) 54 | elif num % 100 == 0: 55 | return _inflect.number_to_words(num // 100) + ' hundred' 56 | else: 57 | return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') 58 | else: 59 | return _inflect.number_to_words(num, andword='') 60 | 61 | 62 | def normalize_numbers(text): 63 | text = re.sub(_comma_number_re, _remove_commas, text) 64 | text = re.sub(_pounds_re, r'\1 pounds', text) 65 | text = re.sub(_dollars_re, _expand_dollars, text) 66 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 67 | text = re.sub(_ordinal_re, _expand_ordinal, text) 68 | text = re.sub(_number_re, _expand_number, text) 69 | return text 70 | -------------------------------------------------------------------------------- /tacotron/utils/plot.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | matplotlib.use('Agg') 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | 6 | 7 | def split_title_line(title_text, max_words=5): 8 | """ 9 | A function that splits any string based on specific character 10 | (returning it with the string), with maximum number of words on it 11 | """ 12 | seq = title_text.split() 13 | return '\n'.join([' '.join(seq[i:i + max_words]) for i in range(0, len(seq), max_words)]) 14 | 15 | def plot_alignment(alignment, path, info=None, split_title=False): 16 | fig, ax = plt.subplots() 17 | im = ax.imshow( 18 | alignment, 19 | aspect='auto', 20 | origin='lower', 21 | interpolation='none') 22 | fig.colorbar(im, ax=ax) 23 | xlabel = 'Decoder timestep' 24 | if info is not None: 25 | if split_title: 26 | title = split_title_line(info) 27 | else: 28 | title = info 29 | plt.xlabel(xlabel) 30 | plt.title(title) 31 | plt.ylabel('Encoder timestep') 32 | plt.tight_layout() 33 | plt.savefig(path, format='png') 34 | 35 | 36 | def plot_spectrogram(spectrogram, path, info=None, split_title=False): 37 | plt.figure() 38 | plt.imshow(np.rot90(spectrogram)) 39 | plt.colorbar(shrink=0.65, orientation='horizontal') 40 | plt.ylabel('mels') 41 | xlabel = 'frames' 42 | if info is not None: 43 | if split_title: 44 | title = split_title_line(info) 45 | else: 46 | title = info 47 | plt.xlabel(xlabel) 48 | plt.title(title) 49 | plt.tight_layout() 50 | plt.savefig(path, format='png') 51 | 
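
The plotting helpers above are invoked from tacotron/train.py and tacotron/synthesizer.py with plain numpy arrays. The following is a minimal sketch of how they can be exercised on their own; the array shapes and output file names are invented for illustration and are not taken from the repository:

    import numpy as np
    from tacotron.utils import plot

    # Toy alignment matrix: rows map to encoder steps (y-axis), columns to decoder steps (x-axis).
    alignment = np.random.rand(45, 60)
    plot.plot_alignment(alignment, 'demo-align.png', info='demo sentence', split_title=True)

    # Toy mel spectrogram in the [frames, num_mels] layout that synthesizer.py and train.py pass in.
    mel = np.random.rand(200, 80)
    plot.plot_spectrogram(mel, 'demo-mel.png', info='demo sentence', split_title=True)
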
-------------------------------------------------------------------------------- /tacotron/utils/symbols.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Defines the set of symbols used in text input to the model. 3 | 4 | The default is a set of ASCII characters that works well for English or text that has been run 5 | through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. 6 | ''' 7 | from . import cmudict 8 | 9 | _pad = '_' 10 | _eos = '~' 11 | _characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? ' 12 | 13 | # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): 14 | _arpabet = ['@' + s for s in cmudict.valid_symbols] 15 | 16 | # Export all symbols: 17 | symbols = [_pad, _eos] + list(_characters) + _arpabet -------------------------------------------------------------------------------- /tacotron/utils/text.py: -------------------------------------------------------------------------------- 1 | import re 2 | from . import cleaners 3 | from .symbols import symbols 4 | 5 | 6 | # Mappings from symbol to numeric ID and vice versa: 7 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 8 | _id_to_symbol = {i: s for i, s in enumerate(symbols)} 9 | 10 | # Regular expression matching text enclosed in curly braces: 11 | _curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)') 12 | 13 | 14 | def text_to_sequence(text, cleaner_names): 15 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 16 | 17 | The text can optionally have ARPAbet sequences enclosed in curly braces embedded 18 | in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street." 19 | 20 | Args: 21 | text: string to convert to a sequence 22 | cleaner_names: names of the cleaner functions to run the text through 23 | 24 | Returns: 25 | List of integers corresponding to the symbols in the text 26 | ''' 27 | sequence = [] 28 | 29 | # Check for curly braces and treat their contents as ARPAbet: 30 | while len(text): 31 | m = _curly_re.match(text) 32 | if not m: 33 | sequence += _symbols_to_sequence(_clean_text(text, cleaner_names)) 34 | break 35 | sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names)) 36 | sequence += _arpabet_to_sequence(m.group(2)) 37 | text = m.group(3) 38 | 39 | # Append EOS token 40 | sequence.append(_symbol_to_id['~']) 41 | return sequence 42 | 43 | 44 | def sequence_to_text(sequence): 45 | '''Converts a sequence of IDs back to a string''' 46 | result = '' 47 | for symbol_id in sequence: 48 | if symbol_id in _id_to_symbol: 49 | s = _id_to_symbol[symbol_id] 50 | # Enclose ARPAbet back in curly braces: 51 | if len(s) > 1 and s[0] == '@': 52 | s = '{%s}' % s[1:] 53 | result += s 54 | return result.replace('}{', ' ') 55 | 56 | 57 | def _clean_text(text, cleaner_names): 58 | for name in cleaner_names: 59 | cleaner = getattr(cleaners, name) 60 | if not cleaner: 61 | raise Exception('Unknown cleaner: %s' % name) 62 | text = cleaner(text) 63 | return text 64 | 65 | 66 | def _symbols_to_sequence(symbols): 67 | return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)] 68 | 69 | 70 | def _arpabet_to_sequence(text): 71 | return _symbols_to_sequence(['@' + s for s in text.split()]) 72 | 73 | 74 | def _should_keep_symbol(s): 75 | return s in _symbol_to_id and s is not '_' and s is not '~' 76 | -------------------------------------------------------------------------------- /tacotron/utils/util.py: 
-------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | from hparams import hparams as hp 4 | 5 | def shape_list(x): 6 | """Return list of dims, statically where possible.""" 7 | x = tf.convert_to_tensor(x) 8 | 9 | # If unknown rank, return dynamic shape 10 | if x.get_shape().dims is None: 11 | return tf.shape(x) 12 | 13 | static = x.get_shape().as_list() 14 | shape = tf.shape(x) 15 | 16 | ret = [] 17 | for i in range(len(static)): 18 | dim = static[i] 19 | if dim is None: 20 | dim = shape[i] 21 | ret.append(dim) 22 | return ret 23 | 24 | def vae_weight(global_step): 25 | warm_up_step = hp.vae_warming_up 26 | w1 = tf.cond( 27 | global_step < warm_up_step, 28 | lambda: tf.cond( 29 | global_step % 100 < 1, 30 | lambda: tf.convert_to_tensor(hp.init_vae_weights) + tf.cast(global_step / 100 * hp.vae_weight_multiler, tf.float32), 31 | lambda: tf.cast(tf.convert_to_tensor(0), tf.float32) 32 | ), 33 | lambda: tf.cast(tf.convert_to_tensor(0), tf.float32) 34 | ) 35 | 36 | w2 = tf.cond( 37 | global_step > warm_up_step, 38 | lambda: tf.cond( 39 | global_step % 400 < 1, 40 | lambda: tf.convert_to_tensor(hp.init_vae_weights) + tf.cast((global_step - warm_up_step) / 400 * hp.vae_weight_multiler + warm_up_step / 100 * hp.vae_weight_multiler, tf.float32), 41 | lambda: tf.cast(tf.convert_to_tensor(0), tf.float32) 42 | ), 43 | lambda: tf.cast(tf.convert_to_tensor(0), tf.float32) 44 | ) 45 | return tf.maximum(w1, w2) 46 | 47 | 48 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from tacotron.train import tacotron_train 3 | 4 | 5 | def main(): 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument('--base_dir', default='.') 8 | parser.add_argument('--hparams', default='', 9 | help='Hyperparameter overrides as a comma-separated list of name=value pairs') 10 | parser.add_argument('--input', default='training_data/train.txt') 11 | parser.add_argument('--name', help='Name of logging directory.') 12 | parser.add_argument('--model', default='Tacotron') 13 | parser.add_argument('--restore', type=bool, default=True, help='Set this to False to do a fresh training') 14 | parser.add_argument('--summary_interval', type=int, default=100, 15 | help='Steps between running summary ops') 16 | parser.add_argument('--checkpoint_interval', type=int, default=500, 17 | help='Steps between writing checkpoints') 18 | parser.add_argument('--tf_log_level', type=int, default=1, help='Tensorflow C++ log level.') 19 | args = parser.parse_args() 20 | 21 | accepted_models = ['Tacotron', 'Wavenet'] 22 | 23 | if args.model not in accepted_models: 24 | raise ValueError('please enter a valid model to train: {}'.format(accepted_models)) 25 | 26 | if args.model == 'Tacotron': 27 | tacotron_train(args) 28 | elif args.model == 'Wavenet': 29 | raise NotImplementedError('Wavenet is still a work in progress, thank you for your patience!') 30 | 31 | 32 | if __name__ == '__main__': 33 | main() --------------------------------------------------------------------------------
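
One detail of vae_weight() in tacotron/utils/util.py worth spelling out: as written, it returns a non-zero KL weight only on steps where global_step % 100 < 1 (before hp.vae_warming_up) or global_step % 400 < 1 (after it), ramping the value by hp.vae_weight_multiler, and falls back to 0 on every other step. The sketch below simply feeds a few step values through the graph and prints the resulting weight; the actual numbers depend on hparams.py, which is not shown here:

    import tensorflow as tf
    from tacotron.utils.util import vae_weight

    # Probe the KL-weight schedule at a handful of global-step values (illustrative only).
    step_ph = tf.placeholder(tf.int32, [], name='global_step_probe')
    weight = vae_weight(step_ph)

    with tf.Session() as sess:
        for step in [0, 50, 100, 400, 10000]:
            print(step, sess.run(weight, feed_dict={step_ph: step}))
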