├── datasets ├── __init__.py ├── preprocessor.py └── audio.py ├── tacotron ├── __init__.py ├── models │ ├── __init__.py │ ├── multihead_attention.py │ ├── custom_decoder.py │ ├── helpers.py │ ├── Architecture_wrappers.py │ ├── attention.py │ └── tacotron.py ├── utils │ ├── ops.py │ ├── __init__.py │ ├── symbols.py │ ├── cmudict.py │ ├── plot.py │ ├── numbers.py │ ├── text.py │ └── cleaners.py ├── synthesizer.py ├── synthesize.py ├── feeder.py └── train.py ├── wavenet_vocoder ├── __init__.py ├── models │ ├── __init__.py │ ├── mixture.py │ └── modules.py ├── synthesizer.py ├── synthesize.py ├── util.py ├── train.py └── feeder.py ├── requirements.txt ├── infolog.py ├── griffin_lim_synthesis_tool.ipynb ├── .gitignore ├── synthesize.py ├── preprocess.py ├── train.py ├── README.md └── hparams.py /datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------- /tacotron/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------- /wavenet_vocoder/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------- /tacotron/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .tacotron import Tacotron 2 | 3 | 4 | def create_model(name, hparams): 5 | if name == 'Tacotron': 6 | return Tacotron(hparams) 7 | else: 8 | raise Exception('Unknown model: ' + name) 9 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | falcon==1.2.0 2 | inflect==0.2.5 3 | librosa==0.5.1 4 | matplotlib==2.0.2 5 | numpy==1.14.0 6 | scipy==1.0.0 7 | tqdm==4.11.2 8 | Unidecode==0.4.20 9 | pyaudio==0.2.11 10 | sounddevice==0.3.10 11 | lws 12 | keras -------------------------------------------------------------------------------- /tacotron/utils/ops.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | def shape_list(x): 5 | """Return list of dims, statically where possible.""" 6 | x = tf.convert_to_tensor(x) 7 | 8 | # If unknown rank, return dynamic shape 9 | if x.get_shape().dims is None: 10 | return tf.shape(x) 11 | 12 | static = x.get_shape().as_list() 13 | shape = tf.shape(x) 14 | 15 | ret = [] 16 | for i in range(len(static)): 17 | dim = static[i] 18 | if dim is None: 19 | dim = shape[i] 20 | ret.append(dim) 21 | return ret -------------------------------------------------------------------------------- /tacotron/utils/__init__.py: -------------------------------------------------------------------------------- 1 | class ValueWindow(): 2 | def __init__(self, window_size=100): 3 | self._window_size = window_size 4 | self._values = [] 5 | 6 | def append(self, x): 7 | self._values = self._values[-(self._window_size - 1):] + [x] 8 | 9 | @property 10 | def sum(self): 11 | return sum(self._values) 12 | 13 | @property 14 | def count(self): 15 | return len(self._values) 16 | 17 | @property 18 | def average(self): 19 | return self.sum / max(1, self.count) 20 | 21 | def reset(self): 22 | self._values = [] 23 | -------------------------------------------------------------------------------- 
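A minimal usage sketch for the two helpers above (hypothetical snippet, not a file in the repository): shape_list returns known dimensions as Python ints and falls back to dynamic tensors for unknown ones, while ValueWindow keeps a running average, e.g. for smoothing losses in the training logs.

import tensorflow as tf
from tacotron.utils import ValueWindow
from tacotron.utils.ops import shape_list

x = tf.placeholder(tf.float32, [None, None, 80])  # e.g. a batch of mel-spectrograms (TF1 graph API, as used in this repo)
dims = shape_list(x)                              # unknown dims stay tensors, the known dim becomes the int 80

loss_window = ValueWindow(100)                    # running window over the last 100 values
for step, loss in enumerate([0.91, 0.87, 0.80]):  # stand-ins for real training losses
	loss_window.append(loss)
	print('Step {}: loss={:.3f}, avg_loss={:.3f}'.format(step, loss, loss_window.average))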
/tacotron/utils/symbols.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Defines the set of symbols used in text input to the model. 3 | 4 | The default is a set of ASCII characters that works well for English or text that has been run 5 | through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. 6 | ''' 7 | from . import cmudict 8 | 9 | _pad = '_' 10 | _eos = '~' 11 | _characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? ' 12 | 13 | # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): 14 | _arpabet = ['@' + s for s in cmudict.valid_symbols] 15 | 16 | # Export all symbols: 17 | symbols = [_pad, _eos] + list(_characters) + _arpabet -------------------------------------------------------------------------------- /wavenet_vocoder/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .wavenet import WaveNet 2 | from warnings import warn 3 | from wavenet_vocoder.util import is_mulaw_quantize 4 | 5 | def create_model(name, hparams): 6 | if is_mulaw_quantize(hparams.input_type): 7 | if hparams.out_channels != hparams.quantize_channels: 8 | raise RuntimeError( 9 | "out_channels must equal quantize_channels if input_type is 'mulaw-quantize'") 10 | if hparams.upsample_conditional_features and hparams.cin_channels < 0: 11 | s = "Upsample conv layers were specified while local conditioning disabled. " 12 | s += "Notice that upsample conv layers will never be used." 13 | warn(s) 14 | 15 | if name == 'WaveNet': 16 | return WaveNet(hparams) 17 | else: 18 | raise Exception('Unknown model: {}'.format(name)) -------------------------------------------------------------------------------- /infolog.py: -------------------------------------------------------------------------------- 1 | import atexit 2 | from datetime import datetime 3 | import json 4 | from threading import Thread 5 | from urllib.request import Request, urlopen 6 | 7 | 8 | _format = '%Y-%m-%d %H:%M:%S.%f' 9 | _file = None 10 | _run_name = None 11 | _slack_url = None 12 | 13 | 14 | def init(filename, run_name, slack_url=None): 15 | global _file, _run_name, _slack_url 16 | _close_logfile() 17 | _file = open(filename, 'a') 18 | _file.write('\n-----------------------------------------------------------------\n') 19 | _file.write('Starting new {} training run\n'.format(run_name)) 20 | _file.write('-----------------------------------------------------------------\n') 21 | _run_name = run_name 22 | _slack_url = slack_url 23 | 24 | 25 | def log(msg, end='\n', slack=False): 26 | print(msg, end=end) 27 | if _file is not None: 28 | _file.write('[%s] %s\n' % (datetime.now().strftime(_format)[:-3], msg)) 29 | if slack and _slack_url is not None: 30 | Thread(target=_send_slack, args=(msg,)).start() 31 | 32 | 33 | def _close_logfile(): 34 | global _file 35 | if _file is not None: 36 | _file.close() 37 | _file = None 38 | 39 | 40 | def _send_slack(msg): 41 | req = Request(_slack_url) 42 | req.add_header('Content-Type', 'application/json') 43 | urlopen(req, json.dumps({ 44 | 'username': 'tacotron', 45 | 'icon_emoji': ':taco:', 46 | 'text': '*%s*: %s' % (_run_name, msg) 47 | }).encode()) 48 | 49 | 50 | atexit.register(_close_logfile) -------------------------------------------------------------------------------- /griffin_lim_synthesis_tool.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "scrolled": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "from datasets.audio import *\n", 13 | "import os\n", 14 | "from hparams import hparams\n", 15 | "\n", 16 | "n_sample = 0 #Change n_steps here\n", 17 | "mel_folder = 'logs-Tacotron/mel-spectrograms' #Or change file path\n", 18 | "mel_file = 'mel-prediction-step-{}.npy'.format(n_sample) #Or file name (for other generated mels)\n", 19 | "out_dir = 'wav_out'\n", 20 | "\n", 21 | "os.makedirs(out_dir, exist_ok=True)\n", 22 | "\n", 23 | "mel_file = os.path.join(mel_folder, mel_file)\n", 24 | "mel_spectro = np.load(mel_file)\n", 25 | "mel_spectro.shape" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "wav = inv_mel_spectrogram(mel_spectro.T, hparams) \n", 35 | "#save the wav under test__\n", 36 | "save_wav(wav, os.path.join(out_dir, 'test_{}.wav'.format(mel_file.replace('/', '_').replace('\\\\', '_'))),\n", 37 | " sr=hparams.sample_rate)" 38 | ] 39 | } 40 | ], 41 | "metadata": { 42 | "kernelspec": { 43 | "display_name": "Python 3", 44 | "language": "python", 45 | "name": "python3" 46 | }, 47 | "language_info": { 48 | "codemirror_mode": { 49 | "name": "ipython", 50 | "version": 3 51 | }, 52 | "file_extension": ".py", 53 | "mimetype": "text/x-python", 54 | "name": "python", 55 | "nbconvert_exporter": "python", 56 | "pygments_lexer": "ipython3", 57 | "version": "3.6.4" 58 | } 59 | }, 60 | "nbformat": 4, 61 | "nbformat_minor": 2 62 | } 63 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # Tacotron 2 oddities 107 | logs-*/ 108 | training_data/ 109 | 110 | -------------------------------------------------------------------------------- /tacotron/utils/cmudict.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | valid_symbols = [ 5 | 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2', 6 | 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2', 7 | 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY', 8 | 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1', 9 | 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 10 | 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 11 | 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH' 12 | ] 13 | 14 | _valid_symbol_set = set(valid_symbols) 15 | 16 | 17 | class CMUDict: 18 | '''Thin wrapper around CMUDict data. 
http://www.speech.cs.cmu.edu/cgi-bin/cmudict''' 19 | def __init__(self, file_or_path, keep_ambiguous=True): 20 | if isinstance(file_or_path, str): 21 | with open(file_or_path, encoding='latin-1') as f: 22 | entries = _parse_cmudict(f) 23 | else: 24 | entries = _parse_cmudict(file_or_path) 25 | if not keep_ambiguous: 26 | entries = {word: pron for word, pron in entries.items() if len(pron) == 1} 27 | self._entries = entries 28 | 29 | 30 | def __len__(self): 31 | return len(self._entries) 32 | 33 | 34 | def lookup(self, word): 35 | '''Returns list of ARPAbet pronunciations of the given word.''' 36 | return self._entries.get(word.upper()) 37 | 38 | 39 | 40 | _alt_re = re.compile(r'\([0-9]+\)') 41 | 42 | 43 | def _parse_cmudict(file): 44 | cmudict = {} 45 | for line in file: 46 | if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"): 47 | parts = line.split(' ') 48 | word = re.sub(_alt_re, '', parts[0]) 49 | pronunciation = _get_pronunciation(parts[1]) 50 | if pronunciation: 51 | if word in cmudict: 52 | cmudict[word].append(pronunciation) 53 | else: 54 | cmudict[word] = [pronunciation] 55 | return cmudict 56 | 57 | 58 | def _get_pronunciation(s): 59 | parts = s.strip().split(' ') 60 | for part in parts: 61 | if part not in _valid_symbol_set: 62 | return None 63 | return ' '.join(parts) 64 | -------------------------------------------------------------------------------- /tacotron/utils/plot.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | matplotlib.use('Agg') 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | 6 | 7 | def split_title_line(title_text, max_words=5): 8 | """ 9 | A function that splits any string based on specific character 10 | (returning it with the string), with maximum number of words on it 11 | """ 12 | seq = title_text.split() 13 | return '\n'.join([' '.join(seq[i:i + max_words]) for i in range(0, len(seq), max_words)]) 14 | 15 | def plot_alignment(alignment, path, info=None, split_title=False, max_len=None): 16 | if max_len is not None: 17 | alignment = alignment[:, :max_len] 18 | 19 | fig = plt.figure(figsize=(8, 6)) 20 | ax = fig.add_subplot(111) 21 | 22 | im = ax.imshow( 23 | alignment, 24 | aspect='auto', 25 | origin='lower', 26 | interpolation='none') 27 | fig.colorbar(im, ax=ax) 28 | xlabel = 'Decoder timestep' 29 | if info is not None: 30 | if split_title: 31 | title = split_title_line(info) 32 | else: 33 | title = info 34 | plt.xlabel(xlabel) 35 | plt.title(title) 36 | plt.ylabel('Encoder timestep') 37 | plt.tight_layout() 38 | plt.savefig(path, format='png') 39 | plt.close() 40 | 41 | 42 | def plot_spectrogram(pred_spectrogram, path, info=None, split_title=False, target_spectrogram=None, max_len=None): 43 | if max_len is not None: 44 | target_spectrogram = target_spectrogram[:max_len] 45 | pred_spectrogram = pred_spectrogram[:max_len] 46 | 47 | if info is not None: 48 | if split_title: 49 | title = split_title_line(info) 50 | else: 51 | title = info 52 | 53 | fig = plt.figure(figsize=(10, 8)) 54 | # Set common labels 55 | fig.text(0.5, 0.18, title, horizontalalignment='center', fontsize=16) 56 | 57 | #target spectrogram subplot 58 | if target_spectrogram is not None: 59 | ax1 = fig.add_subplot(311) 60 | ax2 = fig.add_subplot(312) 61 | 62 | im = ax1.imshow(np.rot90(target_spectrogram), interpolation='none') 63 | ax1.set_title('Target Mel-Spectrogram') 64 | fig.colorbar(mappable=im, shrink=0.65, orientation='horizontal', ax=ax1) 65 | ax2.set_title('Predicted Mel-Spectrogram') 
66 | else: 67 | ax2 = fig.add_subplot(211) 68 | 69 | im = ax2.imshow(np.rot90(pred_spectrogram), interpolation='none') 70 | fig.colorbar(mappable=im, shrink=0.65, orientation='horizontal', ax=ax2) 71 | 72 | plt.tight_layout() 73 | plt.savefig(path, format='png') 74 | plt.close() 75 | -------------------------------------------------------------------------------- /tacotron/utils/numbers.py: -------------------------------------------------------------------------------- 1 | import inflect 2 | import re 3 | 4 | 5 | _inflect = inflect.engine() 6 | _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') 7 | _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') 8 | _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') 9 | _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') 10 | _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') 11 | _number_re = re.compile(r'[0-9]+') 12 | 13 | 14 | def _remove_commas(m): 15 | return m.group(1).replace(',', '') 16 | 17 | 18 | def _expand_decimal_point(m): 19 | return m.group(1).replace('.', ' point ') 20 | 21 | 22 | def _expand_dollars(m): 23 | match = m.group(1) 24 | parts = match.split('.') 25 | if len(parts) > 2: 26 | return match + ' dollars' # Unexpected format 27 | dollars = int(parts[0]) if parts[0] else 0 28 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 29 | if dollars and cents: 30 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 31 | cent_unit = 'cent' if cents == 1 else 'cents' 32 | return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) 33 | elif dollars: 34 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 35 | return '%s %s' % (dollars, dollar_unit) 36 | elif cents: 37 | cent_unit = 'cent' if cents == 1 else 'cents' 38 | return '%s %s' % (cents, cent_unit) 39 | else: 40 | return 'zero dollars' 41 | 42 | 43 | def _expand_ordinal(m): 44 | return _inflect.number_to_words(m.group(0)) 45 | 46 | 47 | def _expand_number(m): 48 | num = int(m.group(0)) 49 | if num > 1000 and num < 3000: 50 | if num == 2000: 51 | return 'two thousand' 52 | elif num > 2000 and num < 2010: 53 | return 'two thousand ' + _inflect.number_to_words(num % 100) 54 | elif num % 100 == 0: 55 | return _inflect.number_to_words(num // 100) + ' hundred' 56 | else: 57 | return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') 58 | else: 59 | return _inflect.number_to_words(num, andword='') 60 | 61 | 62 | def normalize_numbers(text): 63 | text = re.sub(_comma_number_re, _remove_commas, text) 64 | text = re.sub(_pounds_re, r'\1 pounds', text) 65 | text = re.sub(_dollars_re, _expand_dollars, text) 66 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 67 | text = re.sub(_ordinal_re, _expand_ordinal, text) 68 | text = re.sub(_number_re, _expand_number, text) 69 | return text 70 | -------------------------------------------------------------------------------- /tacotron/utils/text.py: -------------------------------------------------------------------------------- 1 | import re 2 | from . import cleaners 3 | from .symbols import symbols 4 | 5 | 6 | # Mappings from symbol to numeric ID and vice versa: 7 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 8 | _id_to_symbol = {i: s for i, s in enumerate(symbols)} 9 | 10 | # Regular expression matching text enclosed in curly braces: 11 | _curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)') 12 | 13 | 14 | def text_to_sequence(text, cleaner_names): 15 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 
16 | 17 | The text can optionally have ARPAbet sequences enclosed in curly braces embedded 18 | in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street." 19 | 20 | Args: 21 | text: string to convert to a sequence 22 | cleaner_names: names of the cleaner functions to run the text through 23 | 24 | Returns: 25 | List of integers corresponding to the symbols in the text 26 | ''' 27 | sequence = [] 28 | 29 | # Check for curly braces and treat their contents as ARPAbet: 30 | while len(text): 31 | m = _curly_re.match(text) 32 | if not m: 33 | sequence += _symbols_to_sequence(_clean_text(text, cleaner_names)) 34 | break 35 | sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names)) 36 | sequence += _arpabet_to_sequence(m.group(2)) 37 | text = m.group(3) 38 | 39 | # Append EOS token 40 | sequence.append(_symbol_to_id['~']) 41 | return sequence 42 | 43 | 44 | def sequence_to_text(sequence): 45 | '''Converts a sequence of IDs back to a string''' 46 | result = '' 47 | for symbol_id in sequence: 48 | if symbol_id in _id_to_symbol: 49 | s = _id_to_symbol[symbol_id] 50 | # Enclose ARPAbet back in curly braces: 51 | if len(s) > 1 and s[0] == '@': 52 | s = '{%s}' % s[1:] 53 | result += s 54 | return result.replace('}{', ' ') 55 | 56 | 57 | def _clean_text(text, cleaner_names): 58 | for name in cleaner_names: 59 | cleaner = getattr(cleaners, name) 60 | if not cleaner: 61 | raise Exception('Unknown cleaner: %s' % name) 62 | text = cleaner(text) 63 | return text 64 | 65 | 66 | def _symbols_to_sequence(symbols): 67 | return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)] 68 | 69 | 70 | def _arpabet_to_sequence(text): 71 | return _symbols_to_sequence(['@' + s for s in text.split()]) 72 | 73 | 74 | def _should_keep_symbol(s): 75 | return s in _symbol_to_id and s is not '_' and s is not '~' 76 | -------------------------------------------------------------------------------- /wavenet_vocoder/synthesizer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import os 4 | from infolog import log 5 | from datasets.audio import save_wav 6 | from wavenet_vocoder.models import create_model 7 | from wavenet_vocoder.train import create_shadow_saver, load_averaged_model 8 | from . 
import util 9 | 10 | 11 | class Synthesizer: 12 | def load(self, checkpoint_path, hparams, model_name='WaveNet'): 13 | log('Constructing model: {}'.format(model_name)) 14 | self._hparams = hparams 15 | local_cond, global_cond = self._check_conditions() 16 | 17 | self.local_conditions = tf.placeholder(tf.float32, shape=[1, None, hparams.num_mels], name='local_condition_features') if local_cond else None 18 | self.global_conditions = tf.placeholder(tf.int32, shape=(), name='global_condition_features') if global_cond else None 19 | self.synthesis_length = tf.placeholder(tf.int32, shape=(), name='synthesis_length') if not local_cond else None 20 | 21 | with tf.variable_scope('model') as scope: 22 | self.model = create_model(model_name, hparams) 23 | self.model.initialize(y=None, c=self.local_conditions, g=self.global_conditions, 24 | input_lengths=None, synthesis_length=self.synthesis_length) 25 | 26 | self._hparams = hparams 27 | sh_saver = create_shadow_saver(self.model) 28 | 29 | log('Loading checkpoint: {}'.format(checkpoint_path)) 30 | self.session = tf.Session() 31 | self.session.run(tf.global_variables_initializer()) 32 | load_averaged_model(self.session, sh_saver, checkpoint_path) 33 | 34 | def synthesize(self, mel_spectrogram, speaker_id, index, out_dir, log_dir): 35 | hparams = self._hparams 36 | local_cond, global_cond = self._check_conditions() 37 | 38 | c = mel_spectrogram 39 | g = speaker_id 40 | feed_dict = {} 41 | 42 | if local_cond: 43 | feed_dict[self.local_conditions] = [np.array(c, dtype=np.float32)] 44 | else: 45 | feed_dict[self.synthesis_length] = 100 46 | 47 | if global_cond: 48 | feed_dict[self.global_conditions] = [np.array(g, dtype=np.int32)] 49 | 50 | generated_wav = self.session.run(self.model.y_hat, feed_dict=feed_dict) 51 | 52 | #Save wav to disk 53 | audio_filename = os.path.join(out_dir, 'speech-audio-{:05d}.wav'.format(index)) 54 | save_wav(generated_wav, audio_filename, sr=hparams.sample_rate) 55 | 56 | #Save waveplot to disk 57 | if log_dir is not None: 58 | plot_filename = os.path.join(log_dir, 'speech-waveplot-{:05d}.png'.format(index)) 59 | util.waveplot(plot_filename, generated_wav, None, hparams) 60 | 61 | return audio_filename 62 | 63 | def _check_conditions(self): 64 | local_condition = self._hparams.cin_channels > 0 65 | global_condition = self._hparams.gin_channels > 0 66 | return local_condition, global_condition 67 | -------------------------------------------------------------------------------- /tacotron/utils/cleaners.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Cleaners are transformations that run over the input text at both training and eval time. 3 | 4 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" 5 | hyperparameter. Some cleaners are English-specific. You'll typically want to use: 6 | 1. "english_cleaners" for English text 7 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using 8 | the Unidecode library (https://pypi.python.org/pypi/Unidecode) 9 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update 10 | the symbols in symbols.py to match your data). 
11 | ''' 12 | 13 | import re 14 | from unidecode import unidecode 15 | from .numbers import normalize_numbers 16 | 17 | 18 | # Regular expression matching whitespace: 19 | _whitespace_re = re.compile(r'\s+') 20 | 21 | # List of (regular expression, replacement) pairs for abbreviations: 22 | _abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [ 23 | ('mrs', 'misess'), 24 | ('mr', 'mister'), 25 | ('dr', 'doctor'), 26 | ('st', 'saint'), 27 | ('co', 'company'), 28 | ('jr', 'junior'), 29 | ('maj', 'major'), 30 | ('gen', 'general'), 31 | ('drs', 'doctors'), 32 | ('rev', 'reverend'), 33 | ('lt', 'lieutenant'), 34 | ('hon', 'honorable'), 35 | ('sgt', 'sergeant'), 36 | ('capt', 'captain'), 37 | ('esq', 'esquire'), 38 | ('ltd', 'limited'), 39 | ('col', 'colonel'), 40 | ('ft', 'fort'), 41 | ]] 42 | 43 | 44 | def expand_abbreviations(text): 45 | for regex, replacement in _abbreviations: 46 | text = re.sub(regex, replacement, text) 47 | return text 48 | 49 | 50 | def expand_numbers(text): 51 | return normalize_numbers(text) 52 | 53 | 54 | def lowercase(text): 55 | '''lowercase input tokens. 56 | ''' 57 | return text.lower() 58 | 59 | 60 | def collapse_whitespace(text): 61 | return re.sub(_whitespace_re, ' ', text) 62 | 63 | 64 | def convert_to_ascii(text): 65 | return unidecode(text) 66 | 67 | 68 | def basic_cleaners(text): 69 | '''Basic pipeline that lowercases and collapses whitespace without transliteration.''' 70 | text = lowercase(text) 71 | text = collapse_whitespace(text) 72 | return text 73 | 74 | 75 | def transliteration_cleaners(text): 76 | '''Pipeline for non-English text that transliterates to ASCII.''' 77 | text = convert_to_ascii(text) 78 | text = lowercase(text) 79 | text = collapse_whitespace(text) 80 | return text 81 | 82 | 83 | def english_cleaners(text): 84 | '''Pipeline for English text, including number and abbreviation expansion.''' 85 | text = convert_to_ascii(text) 86 | text = expand_numbers(text) 87 | text = expand_abbreviations(text) 88 | text = collapse_whitespace(text) 89 | return text 90 | -------------------------------------------------------------------------------- /wavenet_vocoder/synthesize.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from hparams import hparams, hparams_debug_string 4 | from wavenet_vocoder.synthesizer import Synthesizer 5 | from tqdm import tqdm 6 | from infolog import log 7 | import numpy as np 8 | import tensorflow as tf 9 | 10 | 11 | 12 | def run_synthesis(args, checkpoint_path, output_dir, hparams): 13 | log_dir = os.path.join(output_dir, 'plots') 14 | wav_dir = os.path.join(output_dir, 'wavs') 15 | 16 | #We suppose user will provide correct folder depending on training method 17 | log(hparams_debug_string()) 18 | synth = Synthesizer() 19 | synth.load(checkpoint_path, hparams) 20 | 21 | if args.model in ('Both', 'Tacotron-2'): 22 | #If running all Tacotron-2, synthesize audio from evaluated mels 23 | metadata_filename = os.path.join(args.mels_dir, 'map.txt') 24 | with open(metadata_filename, encoding='utf-8') as f: 25 | metadata = [line.strip().split('|') for line in f] 26 | frame_shift_ms = hparams.hop_size / hparams.sample_rate 27 | hours = sum([int(x[-1]) for x in metadata]) * frame_shift_ms / (3600) 28 | log('Loaded metadata for {} examples ({:.2f} hours)'.format(len(metadata), hours)) 29 | 30 | metadata = np.array(metadata) 31 | mel_files = metadata[:, 1] 32 | texts = metadata[:, 0] 33 | else: 34 | #else Get all npy files in input_dir 
(supposing they are mels) 35 | mel_files = [os.path.join(args.mels_dir, f) for f in os.listdir(args.mels_dir) if f.split('.')[-1] == 'npy'] 36 | texts = None 37 | 38 | log('Starting synthesis! (this will take a while..)') 39 | os.makedirs(log_dir, exist_ok=True) 40 | os.makedirs(wav_dir, exist_ok=True) 41 | 42 | with open(os.path.join(wav_dir, 'map.txt'), 'w') as file: 43 | for i, mel_file in enumerate(tqdm(mel_files)): 44 | mel_spectro = np.load(mel_file) 45 | audio_file = synth.synthesize(mel_spectro, None, i+1, wav_dir, log_dir) 46 | 47 | if texts is None: 48 | file.write('{}|{}\n'.format(mel_file, audio_file)) 49 | else: 50 | file.write('{}|{}|{}\n'.format(texts[i], mel_file, audio_file)) 51 | 52 | log('synthesized audio waveforms at {}'.format(wav_dir)) 53 | 54 | 55 | 56 | def wavenet_synthesize(args, hparams, checkpoint): 57 | output_dir = 'wavenet_' + args.output_dir 58 | 59 | try: 60 | checkpoint_path = tf.train.get_checkpoint_state(checkpoint).model_checkpoint_path 61 | log('loaded model at {}'.format(checkpoint_path)) 62 | except AttributeError: 63 | #Swap logs dir name in case user used Tacotron-2 for train and Both for test (and vice versa) 64 | if 'Both' in checkpoint: 65 | checkpoint = checkpoint.replace('Both', 'Tacotron-2') 66 | elif 'Tacotron-2' in checkpoint: 67 | checkpoint = checkpoint.replace('Tacotron-2', 'Both') 68 | else: #Synthesizing separately 69 | raise AssertionError('Cannot restore checkpoint: {}, did you train a model?'.format(checkpoint)) 70 | 71 | try: 72 | #Try loading again 73 | checkpoint_path = tf.train.get_checkpoint_state(checkpoint).model_checkpoint_path 74 | log('loaded model at {}'.format(checkpoint_path)) 75 | except: 76 | raise RuntimeError('Failed to load checkpoint at {}'.format(checkpoint)) 77 | 78 | run_synthesis(args, checkpoint_path, output_dir, hparams) -------------------------------------------------------------------------------- /wavenet_vocoder/models/mixture.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | 5 | 6 | def log_sum_exp(x): 7 | """ numerically stable log_sum_exp implementation that prevents overflow """ 8 | axis = len(x.get_shape())-1 9 | m = tf.reduce_max(x, axis) 10 | m2 = tf.reduce_max(x, axis, keepdims=True) 11 | return m + tf.log(tf.reduce_sum(tf.exp(x-m2), axis)) 12 | 13 | def log_prob_from_logits(x): 14 | """ numerically stable log_softmax implementation that prevents overflow """ 15 | axis = len(x.get_shape())-1 16 | m = tf.reduce_max(x, axis, keepdims=True) 17 | return x - m - tf.log(tf.reduce_sum(tf.exp(x-m), axis, keepdims=True)) 18 | 19 | def discretized_mix_logistic_loss(y_hat, y, num_classes=256, 20 | log_scale_min=-7.0, reduce=True): 21 | '''Discretized mix of logistic distributions loss. 22 | 23 | Note that it is assumed that input is scaled to [-1, 1] 24 | 25 | Args: 26 | y_hat: Tensor [batch_size, channels, time_length], predicted output. 27 | y: Tensor [batch_size, time_length, 1], Target. 28 | Returns: 29 | Tensor loss 30 | ''' 31 | with tf.control_dependencies([tf.assert_equal(tf.mod(tf.shape(y_hat)[1], 3), 0), tf.assert_equal(tf.rank(y_hat), 3)]): 32 | nr_mix = tf.shape(y_hat)[1] // 3 33 | 34 | #[Batch_size, time_length, channels] 35 | y_hat = tf.transpose(y_hat, [0, 2, 1]) 36 | 37 | #unpack parameters. 
[batch_size, time_length, num_mixtures] x 3 38 | logit_probs = y_hat[:, :, :nr_mix] 39 | means = y_hat[:, :, nr_mix:2 * nr_mix] 40 | log_scales = tf.maximum(y_hat[:, :, 2* nr_mix: 3 * nr_mix], log_scale_min) 41 | 42 | #[batch_size, time_length, 1] -> [batch_size, time_length, num_mixtures] 43 | y = y * tf.ones(shape=[1, 1, nr_mix], dtype=tf.float32) 44 | 45 | centered_y = y - means 46 | inv_stdv = tf.exp(-log_scales) 47 | plus_in = inv_stdv * (centered_y + 1. / (num_classes - 1)) 48 | cdf_plus = tf.nn.sigmoid(plus_in) 49 | min_in = inv_stdv * (centered_y - 1. / (num_classes - 1)) 50 | cdf_min = tf.nn.sigmoid(min_in) 51 | 52 | log_cdf_plus = plus_in - tf.nn.softplus(plus_in) # log probability for edge case of 0 (before scaling) 53 | log_one_minus_cdf_min = -tf.nn.softplus(min_in) # log probability for edge case of 255 (before scaling) 54 | 55 | #probability for all other cases 56 | cdf_delta = cdf_plus - cdf_min 57 | 58 | mid_in = inv_stdv * centered_y 59 | #log probability in the center of the bin, to be used in extreme cases 60 | #(not actually used in this code) 61 | log_pdf_mid = mid_in - log_scales - 2. * tf.nn.softplus(mid_in) 62 | 63 | log_probs = tf.where(y < -0.999, log_cdf_plus, 64 | tf.where(y > 0.999, log_one_minus_cdf_min, 65 | tf.where(cdf_delta > 1e-5, 66 | tf.log(tf.maximum(cdf_delta, 1e-12)), 67 | log_pdf_mid - np.log((num_classes - 1) / 2)))) 68 | #log_probs = log_probs + tf.nn.log_softmax(logit_probs, -1) 69 | 70 | log_probs = log_probs + log_prob_from_logits(logit_probs) 71 | 72 | if reduce: 73 | return -tf.reduce_sum(log_sum_exp(log_probs)) 74 | else: 75 | return -tf.expand_dims(log_sum_exp(log_probs), [-1]) 76 | 77 | def sample_from_discretized_mix_logistic(y, log_scale_min=-7.): 78 | ''' 79 | Args: 80 | y: Tensor, [batch_size, channels, time_length] 81 | Returns: 82 | Tensor: sample in range of [-1, 1] 83 | ''' 84 | with tf.control_dependencies([tf.assert_equal(tf.mod(tf.shape(y)[1], 3), 0)]): 85 | nr_mix = tf.shape(y)[1] // 3 86 | 87 | #[batch_size, time_length, channels] 88 | y = tf.transpose(y, [0, 2, 1]) 89 | logit_probs = y[:, :, :nr_mix] 90 | 91 | #sample mixture indicator from softmax 92 | temp = tf.random_uniform(tf.shape(logit_probs), minval=1e-5, maxval=1. - 1e-5) 93 | temp = logit_probs - tf.log(-tf.log(temp)) 94 | argmax = tf.argmax(temp, -1) 95 | 96 | #[batch_size, time_length] -> [batch_size, time_length, nr_mix] 97 | one_hot = tf.one_hot(argmax, depth=nr_mix, dtype=tf.float32) 98 | #select logistic parameters 99 | means = tf.reduce_sum(y[:, :, nr_mix:2 * nr_mix] * one_hot, axis=-1) 100 | log_scales = tf.maximum(tf.reduce_sum( 101 | y[:, :, 2 * nr_mix:3 * nr_mix] * one_hot, axis=-1), log_scale_min) 102 | 103 | #sample from logistic & clip to interval 104 | #we don't actually round to the nearest 8-bit value when sampling 105 | u = tf.random_uniform(tf.shape(means), minval=1e-5, maxval=1. - 1e-5) 106 | x = means + tf.exp(log_scales) * (tf.log(u) - tf.log(1 -u)) 107 | 108 | return tf.minimum(tf.maximum(x, -1.), 1.) 
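# --- Hedged usage sketch (illustrative only, not part of the original mixture.py) ---
# discretized_mix_logistic_loss expects y_hat shaped [batch_size, 3 * nr_mix, time_length]
# (mixture logits, means and log-scales stacked along the channel axis) and y shaped
# [batch_size, time_length, 1] with values scaled to [-1, 1]. For example, with 10 mixtures
# on the TF1 graph API used throughout this repository:
#   y_hat = tf.random_normal([2, 30, 1000])        # batch=2, 3*10 channels, 1000 timesteps
#   y = tf.random_uniform([2, 1000, 1], -1., 1.)   # target waveform scaled to [-1, 1]
#   loss = discretized_mix_logistic_loss(y_hat, y, num_classes=256)
#   samples = sample_from_discretized_mix_logistic(y_hat)  # -> [2, 1000], values in [-1, 1]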
109 | -------------------------------------------------------------------------------- /synthesize.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from tacotron.synthesize import tacotron_synthesize 3 | from wavenet_vocoder.synthesize import wavenet_synthesize 4 | from infolog import log 5 | from hparams import hparams 6 | from warnings import warn 7 | import os 8 | 9 | 10 | def prepare_run(args): 11 | modified_hp = hparams.parse(args.hparams) 12 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 13 | 14 | run_name = args.name or args.tacotron_name or args.model 15 | taco_checkpoint = os.path.join('logs-' + run_name, 'taco_' + args.checkpoint) 16 | 17 | run_name = args.name or args.wavenet_name or args.model 18 | wave_checkpoint = os.path.join('logs-' + run_name, 'wave_' + args.checkpoint) 19 | return taco_checkpoint, wave_checkpoint, modified_hp 20 | 21 | def get_sentences(args): 22 | if args.text != '': 23 | sentences = (args.text.strip().split("."))[:-1] 24 | else: 25 | sentences = hparams.sentences 26 | return sentences 27 | 28 | def synthesize(args, hparams, taco_checkpoint, wave_checkpoint, sentences): 29 | log('Running End-to-End TTS Evaluation. Model: {}'.format(args.name or args.model)) 30 | log('Synthesizing mel-spectrograms from text..') 31 | wavenet_in_dir = tacotron_synthesize(args, hparams, taco_checkpoint, sentences) 32 | log('Synthesizing audio from mel-spectrograms.. (This may take a while)') 33 | wavenet_synthesize(args, hparams, wave_checkpoint) 34 | log('Tacotron-2 TTS synthesis complete!') 35 | 36 | 37 | 38 | def main(): 39 | accepted_modes = ['eval', 'synthesis', 'live'] 40 | parser = argparse.ArgumentParser() 41 | parser.add_argument('--checkpoint', default='pretrained/', help='Path to model checkpoint') 42 | parser.add_argument('--hparams', default='', 43 | help='Hyperparameter overrides as a comma-separated list of name=value pairs') 44 | parser.add_argument('--name', help='Name of logging directory if the two models were trained together.') 45 | parser.add_argument('--tacotron_name', help='Name of logging directory of Tacotron. If trained separately') 46 | parser.add_argument('--wavenet_name', help='Name of logging directory of WaveNet. If trained separately') 47 | parser.add_argument('--model', default='Tacotron-2') 48 | parser.add_argument('--input_dir', default='training_data/', help='folder to contain inputs sentences/targets') 49 | parser.add_argument('--mels_dir', default='tacotron_output/eval/', help='folder to contain mels to synthesize audio from using the Wavenet') 50 | parser.add_argument('--output_dir', default='output/', help='folder to contain synthesized mel spectrograms') 51 | parser.add_argument('--mode', default='eval', help='mode of run: can be one of {}'.format(accepted_modes)) 52 | parser.add_argument('--GTA', default='True', help='Ground truth aligned synthesis, defaults to True, only considered in synthesis mode') 53 | parser.add_argument('--text', default='', help='Text contains sentences to be synthesized. 
Valid if mode=eval') 54 | parser.add_argument('--reference_audio', default=None, help='Reference audio path') 55 | args = parser.parse_args() 56 | 57 | accepted_models = ['Tacotron', 'WaveNet', 'Both', 'Tacotron-2'] 58 | 59 | if args.model not in accepted_models: 60 | raise ValueError('please enter a valid model to synthesize with: {}'.format(accepted_models)) 61 | 62 | if args.mode not in accepted_modes: 63 | raise ValueError('accepted modes are: {}, found {}'.format(accepted_modes, args.mode)) 64 | 65 | if args.mode=='live' and args.model=='Wavenet': 66 | raise RuntimeError('Wavenet vocoder cannot be tested live due to its slow generation. Live only works with Tacotron!') 67 | 68 | if args.GTA not in ('True', 'False'): 69 | raise ValueError('GTA option must be either True or False') 70 | 71 | if args.model in ('Both', 'Tacotron-2'): 72 | if args.mode == 'live': 73 | warn('Requested a live evaluation with Tacotron-2, Wavenet will not be used!') 74 | if args.mode == 'synthesis': 75 | raise ValueError('I don\'t recommend running WaveNet on entire dataset.. The world might end before the synthesis :) (only eval allowed)') 76 | 77 | taco_checkpoint, wave_checkpoint, hparams = prepare_run(args) 78 | sentences = get_sentences(args) 79 | 80 | if args.model == 'Tacotron': 81 | _ = tacotron_synthesize(args, hparams, taco_checkpoint, sentences) 82 | elif args.model == 'WaveNet': 83 | wavenet_synthesize(args, hparams, wave_checkpoint) 84 | elif args.model in ('Both', 'Tacotron-2'): 85 | synthesize(args, hparams, taco_checkpoint, wave_checkpoint, sentences) 86 | else: 87 | raise ValueError('Model provided {} unknown! {}'.format(args.model, accepted_models)) 88 | 89 | 90 | if __name__ == '__main__': 91 | main() -------------------------------------------------------------------------------- /preprocess.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from multiprocessing import cpu_count 3 | import os 4 | from tqdm import tqdm 5 | from datasets import preprocessor 6 | from hparams import hparams 7 | 8 | 9 | def preprocess(args, input_folders, out_dir, hparams): 10 | mel_dir = os.path.join(out_dir, 'mels') 11 | wav_dir = os.path.join(out_dir, 'audio') 12 | linear_dir = os.path.join(out_dir, 'linear') 13 | os.makedirs(mel_dir, exist_ok=True) 14 | os.makedirs(wav_dir, exist_ok=True) 15 | os.makedirs(linear_dir, exist_ok=True) 16 | metadata = preprocessor.build_from_path(hparams, input_folders, mel_dir, linear_dir, wav_dir, args.n_jobs, tqdm=tqdm) 17 | write_metadata(metadata, out_dir) 18 | 19 | def write_metadata(metadata, out_dir): 20 | with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f: 21 | for m in metadata: 22 | f.write('|'.join([str(x) for x in m]) + '\n') 23 | mel_frames = sum([int(m[4]) for m in metadata]) 24 | timesteps = sum([int(m[3]) for m in metadata]) 25 | sr = hparams.sample_rate 26 | hours = timesteps / sr / 3600 27 | print('Write {} utterances, {} mel frames, {} audio timesteps, ({:.2f} hours)'.format( 28 | len(metadata), mel_frames, timesteps, hours)) 29 | print('Max input length (text chars): {}'.format(max(len(m[5]) for m in metadata))) 30 | print('Max mel frames length: {}'.format(max(int(m[4]) for m in metadata))) 31 | print('Max audio timesteps length: {}'.format(max(m[3] for m in metadata))) 32 | 33 | def norm_data(args): 34 | 35 | merge_books = (args.merge_books=='True') 36 | 37 | print('Selecting data folders..') 38 | supported_datasets = ['LJSpeech-1.0', 'LJSpeech-1.1', 'M-AILABS'] 39 | if 
args.dataset not in supported_datasets: 40 | raise ValueError('dataset value entered {} does not belong to supported datasets: {}'.format( 41 | args.dataset, supported_datasets)) 42 | 43 | if args.dataset.startswith('LJSpeech'): 44 | return [os.path.join(args.base_dir, args.dataset)] 45 | 46 | 47 | if args.dataset == 'M-AILABS': 48 | supported_languages = ['en_US', 'en_UK', 'fr_FR', 'it_IT', 'de_DE', 'es_ES', 'ru_RU', 49 | 'uk_UK', 'pl_PL', 'nl_NL', 'pt_PT', 'fi_FI', 'se_SE', 'tr_TR', 'ar_SA'] 50 | if args.language not in supported_languages: 51 | raise ValueError('Please enter a supported language to use from M-AILABS dataset! \n{}'.format( 52 | supported_languages)) 53 | 54 | supported_voices = ['female', 'male', 'mix'] 55 | if args.voice not in supported_voices: 56 | raise ValueError('Please enter a supported voice option to use from M-AILABS dataset! \n{}'.format( 57 | supported_voices)) 58 | 59 | path = os.path.join(args.base_dir, args.language, 'by_book', args.voice) 60 | supported_readers = [e for e in os.listdir(path) if os.path.isdir(os.path.join(path,e))] 61 | if args.reader not in supported_readers: 62 | raise ValueError('Please enter a valid reader for your language and voice settings! \n{}'.format( 63 | supported_readers)) 64 | 65 | path = os.path.join(path, args.reader) 66 | supported_books = [e for e in os.listdir(path) if os.path.isdir(os.path.join(path,e))] 67 | if merge_books: 68 | return [os.path.join(path, book) for book in supported_books] 69 | 70 | else: 71 | if args.book not in supported_books: 72 | raise ValueError('Please enter a valid book for your reader settings! \n{}'.format( 73 | supported_books)) 74 | 75 | return [os.path.join(path, args.book)] 76 | 77 | 78 | def run_preprocess(args, hparams): 79 | input_folders = norm_data(args) 80 | output_folder = os.path.join(args.base_dir, args.output) 81 | 82 | preprocess(args, input_folders, output_folder, hparams) 83 | 84 | 85 | def main(): 86 | print('initializing preprocessing..') 87 | parser = argparse.ArgumentParser() 88 | parser.add_argument('--base_dir', default='') 89 | parser.add_argument('--hparams', default='', 90 | help='Hyperparameter overrides as a comma-separated list of name=value pairs') 91 | parser.add_argument('--dataset', default='LJSpeech-1.1') 92 | parser.add_argument('--language', default='en_US') 93 | parser.add_argument('--voice', default='female') 94 | parser.add_argument('--reader', default='mary_ann') 95 | parser.add_argument('--merge_books', default='False') 96 | parser.add_argument('--book', default='northandsouth') 97 | parser.add_argument('--output', default='training_data') 98 | parser.add_argument('--n_jobs', type=int, default=cpu_count()) 99 | args = parser.parse_args() 100 | 101 | modified_hp = hparams.parse(args.hparams) 102 | 103 | assert args.merge_books in ('False', 'True') 104 | 105 | run_preprocess(args, modified_hp) 106 | 107 | 108 | if __name__ == '__main__': 109 | main() -------------------------------------------------------------------------------- /tacotron/synthesizer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import tensorflow as tf 4 | from librosa import effects 5 | from tacotron.models import create_model 6 | from tacotron.utils.text import text_to_sequence 7 | from tacotron.utils import plot 8 | from datasets import audio 9 | from datetime import datetime 10 | import sounddevice as sd 11 | import pyaudio 12 | import wave 13 | from infolog import log 14 | 15 | 16 | class Synthesizer: 
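	# Hedged usage sketch (illustrative only; the checkpoint path and output folders are hypothetical):
	#   synth = Synthesizer()
	#   synth.load('logs-Tacotron/taco_pretrained/model.ckpt-100000', hparams)
	#   mel_path = synth.synthesize('Hello world.', 1, 'tacotron_output/eval', 'tacotron_output/logs-eval', None)
	# load() builds the Tacotron graph and restores the checkpoint weights; synthesize() writes the
	# predicted mel-spectrogram (plus optional wavs/plots) to disk and returns its filename, or plays
	# the audio directly when index is None (live mode).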
17 | def load(self, checkpoint_path, hparams, gta=False, model_name='Tacotron', reference_mel = None): 18 | log('Constructing model: %s' % model_name) 19 | inputs = tf.placeholder(tf.int32, [1, None], 'inputs') 20 | input_lengths = tf.placeholder(tf.int32, [1], 'input_lengths') 21 | targets = tf.placeholder(tf.float32, [1, None, hparams.num_mels], 'mel_targets') 22 | 23 | if reference_mel is not None: 24 | reference_mel = tf.placeholder(tf.float32, [1, None, hparams.num_mels], 'reference_mel') 25 | with tf.variable_scope('model') as scope: 26 | self.model = create_model(model_name, hparams) 27 | if gta: 28 | self.model.initialize(inputs, input_lengths, targets, gta=gta, reference_mel=reference_mel) 29 | else: 30 | self.model.initialize(inputs, input_lengths, reference_mel=reference_mel) 31 | self.mel_outputs = self.model.mel_outputs 32 | self.alignment = self.model.alignments[0] 33 | 34 | self.gta = gta 35 | self._hparams = hparams 36 | 37 | log('Loading checkpoint: %s' % checkpoint_path) 38 | self.session = tf.Session() 39 | self.session.run(tf.global_variables_initializer()) 40 | saver = tf.train.Saver() 41 | saver.restore(self.session, checkpoint_path) 42 | 43 | 44 | def synthesize(self, text, index, out_dir, log_dir, mel_filename, reference_mel=None): 45 | hparams = self._hparams 46 | cleaner_names = [x.strip() for x in hparams.cleaners.split(',')] 47 | seq = text_to_sequence(text, cleaner_names) 48 | feed_dict = { 49 | self.model.inputs: [np.asarray(seq, dtype=np.int32)], 50 | self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32), 51 | } 52 | 53 | 54 | if reference_mel is not None: 55 | reference_mel = np.expand_dims(reference_mel, 0) 56 | feed_dict.update({self.model.reference_mel: np.asarray(reference_mel, dtype=np.float32)}) 57 | 58 | if self.gta: 59 | feed_dict[self.model.mel_targets] = np.load(mel_filename).reshape(1, -1, 80) 60 | 61 | if self.gta or not hparams.predict_linear: 62 | mels, alignment = self.session.run([self.mel_outputs, self.alignment], feed_dict=feed_dict) 63 | 64 | else: 65 | linear, mels, alignment = self.session.run([self.linear_outputs, self.mel_outputs, self.alignment], feed_dict=feed_dict) 66 | linear = linear.reshape(-1, hparams.num_freq) 67 | 68 | mels = mels.reshape(-1, hparams.num_mels) #Thanks to @imdatsolak for pointing this out 69 | 70 | 71 | if index is None: 72 | #Generate wav and read it 73 | wav = audio.inv_mel_spectrogram(mels.T, hparams) 74 | audio.save_wav(wav, 'temp.wav', sr=hparams.sample_rate) #Find a better way 75 | 76 | chunk = 512 77 | f = wave.open('temp.wav', 'rb') 78 | p = pyaudio.PyAudio() 79 | stream = p.open(format=p.get_format_from_width(f.getsampwidth()), 80 | channels=f.getnchannels(), 81 | rate=f.getframerate(), 82 | output=True) 83 | data = f.readframes(chunk) 84 | while data: 85 | stream.write(data) 86 | data=f.readframes(chunk) 87 | 88 | stream.stop_stream() 89 | stream.close() 90 | 91 | p.terminate() 92 | return 93 | 94 | 95 | # Write the spectrogram to disk 96 | # Note: outputs mel-spectrogram files and target ones have same names, just different folders 97 | mel_filename = os.path.join(out_dir, 'speech-mel-{:05d}.npy'.format(index)) 98 | np.save(mel_filename, mels, allow_pickle=False) 99 | 100 | if log_dir is not None: 101 | #save wav (mel -> wav) 102 | wav = audio.inv_mel_spectrogram(mels.T, hparams) 103 | audio.save_wav(wav, os.path.join(log_dir, 'wavs/speech-wav-{:05d}-mel.wav'.format(index)), sr=hparams.sample_rate) 104 | 105 | if hparams.predict_linear: 106 | #save wav (linear -> wav) 107 | wav = 
audio.inv_linear_spectrogram(linear.T, hparams) 108 | audio.save_wav(wav, os.path.join(log_dir, 'wavs/speech-wav-{:05d}-linear.wav'.format(index)), sr=hparams.sample_rate) 109 | 110 | #save alignments 111 | plot.plot_alignment(alignment, os.path.join(log_dir, 'plots/speech-alignment-{:05d}.png'.format(index)), 112 | info='{}'.format(text), split_title=True) 113 | 114 | #save mel spectrogram plot 115 | plot.plot_spectrogram(mels, os.path.join(log_dir, 'plots/speech-mel-{:05d}.png'.format(index)), 116 | info='{}'.format(text), split_title=True) 117 | 118 | return mel_filename -------------------------------------------------------------------------------- /tacotron/synthesize.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import re 4 | from hparams import hparams, hparams_debug_string 5 | from tacotron.synthesizer import Synthesizer 6 | import time 7 | from tqdm import tqdm 8 | from time import sleep 9 | from infolog import log 10 | import tensorflow as tf 11 | 12 | 13 | 14 | def generate_fast(model, text): 15 | model.synthesize(text, None, None, None, None) 16 | 17 | 18 | def run_live(args, checkpoint_path, hparams): 19 | #Log to Terminal without keeping any records in files 20 | log(hparams_debug_string()) 21 | synth = Synthesizer() 22 | synth.load(checkpoint_path, hparams) 23 | 24 | #Generate fast greeting message 25 | greetings = 'Hello, Welcome to the Live testing tool. Please type a message and I will try to read it!' 26 | log(greetings) 27 | generate_fast(synth, greetings) 28 | 29 | #Interaction loop 30 | while True: 31 | try: 32 | text = input() 33 | generate_fast(synth, text) 34 | 35 | except KeyboardInterrupt: 36 | leave = 'Thank you for testing our features. see you soon.' 37 | log(leave) 38 | generate_fast(synth, leave) 39 | sleep(2) 40 | break 41 | 42 | def run_eval(args, checkpoint_path, output_dir, hparams, sentences): 43 | eval_dir = os.path.join(output_dir, 'eval') 44 | log_dir = os.path.join(output_dir, 'logs-eval') 45 | 46 | 47 | #Create output path if it doesn't exist 48 | os.makedirs(eval_dir, exist_ok=True) 49 | os.makedirs(log_dir, exist_ok=True) 50 | os.makedirs(os.path.join(log_dir, 'wavs'), exist_ok=True) 51 | os.makedirs(os.path.join(log_dir, 'plots'), exist_ok=True) 52 | 53 | log(hparams_debug_string()) 54 | synth = Synthesizer() 55 | synth.load(checkpoint_path, hparams, reference_mel=args.reference_audio) 56 | if args.reference_audio is not None: 57 | ref_wav = audio.load_wav(args.reference_audio) 58 | reference_mel = audio.melspectrogram(ref_wav).astype(np.float32).T 59 | else: 60 | raise ValueError("Evaluation without reference audio. 
Please provide path to reference audio.") 61 | with open(os.path.join(eval_dir, 'map.txt'), 'w') as file: 62 | for i, text in enumerate(tqdm(sentences)): 63 | start = time.time() 64 | mel_filename = synth.synthesize(text, i+1, eval_dir, log_dir, None, reference_mel=reference_mel) 65 | file.write('{}|{}\n'.format(text, mel_filename)) 66 | 67 | log('synthesized mel spectrograms at {}'.format(eval_dir)) 68 | return eval_dir 69 | 70 | def run_synthesis(args, checkpoint_path, output_dir, hparams): 71 | GTA = (args.GTA == 'True') 72 | if GTA: 73 | synth_dir = os.path.join(output_dir, 'gta') 74 | 75 | else: 76 | synth_dir = os.path.join(output_dir, 'natural') 77 | 78 | os.makedirs(synth_dir, exist_ok=True) 79 | 80 | 81 | metadata_filename = os.path.join(args.input_dir, 'train.txt') 82 | log(hparams_debug_string()) 83 | synth = Synthesizer() 84 | synth.load(checkpoint_path, hparams, gta=GTA) 85 | with open(metadata_filename, encoding='utf-8') as f: 86 | metadata = [line.strip().split('|') for line in f] 87 | frame_shift_ms = hparams.hop_size / hparams.sample_rate 88 | hours = sum([int(x[4]) for x in metadata]) * frame_shift_ms / (3600) 89 | log('Loaded metadata for {} examples ({:.2f} hours)'.format(len(metadata), hours)) 90 | 91 | log('starting synthesis') 92 | mel_dir = os.path.join(args.input_dir, 'mels') 93 | wav_dir = os.path.join(args.input_dir, 'audio') 94 | with open(os.path.join(synth_dir, 'map.txt'), 'w') as file: 95 | for i, meta in enumerate(tqdm(metadata)): 96 | text = meta[5] 97 | mel_filename = os.path.join(mel_dir, meta[1]) 98 | wav_filename = os.path.join(wav_dir, meta[0]) 99 | mel_output_filename = synth.synthesize(text, i+1, synth_dir, None, mel_filename) 100 | file.write('{}|{}|{}|{}\n'.format(wav_filename, mel_filename, mel_output_filename, text)) 101 | log('synthesized mel spectrograms at {}'.format(synth_dir)) 102 | return os.path.join(synth_dir, 'map.txt') 103 | 104 | def tacotron_synthesize(args, hparams, checkpoint, sentences=None): 105 | output_dir = 'tacotron_' + args.output_dir 106 | 107 | try: 108 | checkpoint_path = tf.train.get_checkpoint_state(checkpoint).model_checkpoint_path 109 | log('loaded model at {}'.format(checkpoint_path)) 110 | except AttributeError: 111 | #Swap logs dir name in case user used Tacotron-2 for train and Both for test (and vice versa) 112 | if 'Both' in checkpoint: 113 | checkpoint = checkpoint.replace('Both', 'Tacotron-2') 114 | elif 'Tacotron-2' in checkpoint: 115 | checkpoint = checkpoint.replace('Tacotron-2', 'Both') 116 | else: 117 | raise AssertionError('Cannot restore checkpoint: {}, did you train a model?'.format(checkpoint)) 118 | 119 | try: 120 | #Try loading again 121 | checkpoint_path = tf.train.get_checkpoint_state(checkpoint).model_checkpoint_path 122 | log('loaded model at {}'.format(checkpoint_path)) 123 | except: 124 | raise RuntimeError('Failed to load checkpoint at {}'.format(checkpoint)) 125 | if args.mode == 'eval': 126 | return run_eval(args, checkpoint_path, output_dir, hparams, sentences) 127 | elif args.mode == 'synthesis': 128 | return run_synthesis(args, checkpoint_path, output_dir, hparams) 129 | else: 130 | run_live(args, checkpoint_path, hparams) 131 | -------------------------------------------------------------------------------- /tacotron/models/multihead_attention.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import math 4 | from tacotron.utils.ops import shape_list 5 | 6 | class MultiheadAttention(): 7 | 
'''Computes the multi-head attention as described in 8 | https://arxiv.org/abs/1706.03762. 9 | Args: 10 | num_heads: The number of attention heads. 11 | query: The sequence of queries. A tensor of shape :math:`[B, T_1, ...]`. 12 | value: The sequence to attend. A tensor of shape :math:`[B, T_2, ...]`. 13 | If ``None``, computes self-attention. 14 | num_units: The number of hidden units. If not set, it is set to the input 15 | dimension. 16 | attention_type: a string, either "dot_attention", "mlp_attention". 17 | Returns: 18 | The concatenated attention context of each head. 19 | ''' 20 | def __init__(self, 21 | query, 22 | value, 23 | num_heads=4, 24 | attention_type='mlp_attention', 25 | num_units=None, 26 | normalize=True): 27 | self.query = query 28 | self.value = value 29 | self.num_heads = num_heads 30 | self.attention_type = attention_type 31 | self.num_units = num_units or query.get_shape().as_list()[-1] 32 | self.normalize = normalize 33 | 34 | def multi_head_attention(self): 35 | if self.num_units % self.num_heads != 0: 36 | raise ValueError("Multi head attention requires that num_units is a" 37 | " multiple of {}".format(num_heads)) 38 | 39 | with tf.variable_scope("Multihead-attention"): 40 | q = tf.layers.conv1d(self.query, self.num_units, 1) 41 | k = tf.layers.conv1d(self.value, self.num_units, 1) 42 | v = self.value 43 | qs, ks, vs = self._split_heads(q, k, v) 44 | if self.attention_type == 'mlp_attention': 45 | style_embeddings = self._mlp_attention(qs, ks, vs) 46 | elif self.attention_type == 'dot_attention': 47 | style_embeddings = self._dot_product(qs, ks, vs) 48 | else: 49 | raise ValueError('Only mlp_attention and dot_attention are supported') 50 | 51 | return self._combine_heads(style_embeddings) 52 | 53 | def _split_heads(self, q, k, v): 54 | '''Split the channels into multiple heads 55 | 56 | Returns: 57 | Tensors with shape [batch, num_heads, length_x, dim_x/num_heads] 58 | ''' 59 | qs = tf.transpose(self._split_last_dimension(q, self.num_heads), [0, 2, 1, 3]) 60 | ks = tf.transpose(self._split_last_dimension(k, self.num_heads), [0, 2, 1, 3]) 61 | v_shape = shape_list(v) 62 | vs = tf.tile(tf.expand_dims(v, axis=1), [1, self.num_heads, 1, 1]) 63 | return qs, ks, vs 64 | 65 | def _split_last_dimension(self, x, num_heads): 66 | '''Reshape x to num_heads 67 | Returns: 68 | a Tensor with shape [batch, length_x, num_heads, dim_x/num_heads] 69 | ''' 70 | x_shape = shape_list(x) 71 | dim = x_shape[-1] 72 | assert dim % num_heads == 0 73 | return tf.reshape(x, x_shape[:-1] + [num_heads, dim // num_heads]) 74 | 75 | def _dot_product(self, qs, ks, vs): 76 | '''dot-product computation 77 | Returns: 78 | a context vector with shape [batch, num_heads, length_q, dim_vs] 79 | ''' 80 | qk = tf.matmul(qs, ks, transpose_b=True) 81 | scale_factor = (self.num_units // self.num_heads)**-0.5 82 | if self.normalize: 83 | qk *= scale_factor 84 | weights = tf.nn.softmax(qk, name="dot_attention_weights") 85 | context = tf.matmul(weights, vs) 86 | return context 87 | 88 | def _mlp_attention(self, qs, ks, vs): 89 | '''MLP computation modified from https://github.com/npuichigo 90 | Returns: 91 | a context vector with shape [batch, num_heads, length_q, dim_vs] 92 | ''' 93 | num_units = qs.get_shape()[-1].value 94 | dtype = qs.dtype 95 | 96 | v = tf.get_variable("attention_v", [num_units], dtype=dtype) 97 | if self.normalize: 98 | #https://github.com/tensorflow/tensorflow/blob/r1.7/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py#L470 99 | # Scalar used in weight normalization 100 | g = 
tf.get_variable( 101 | "attention_g", dtype=dtype, 102 | initializer=math.sqrt((1. / num_units))) 103 | # Bias added prior to the nonlinearity 104 | b = tf.get_variable( 105 | "attention_b", [num_units], dtype=dtype, 106 | initializer=tf.zeros_initializer()) 107 | # normed_v = g * v / ||v|| 108 | normed_v = g * v * tf.rsqrt( 109 | tf.reduce_sum(tf.square(v))) 110 | # Single layer multilayer perceptron. 111 | add = tf.reduce_sum(normed_v * tf.tanh(ks + qs + b), [-1], keep_dims=True) 112 | else: 113 | # Single layer multilayer perceptron. 114 | add = tf.reduce_sum(v * tf.tanh(ks + qs), [-1], keep_dims=True) 115 | 116 | # Compute attention weights. 117 | weights = tf.nn.softmax(tf.transpose(add, [0, 1, 3, 2]), name="mlp_attention_weights") 118 | # Compute attention context. 119 | context = tf.matmul(weights, vs) 120 | return context 121 | 122 | def _combine_heads(self, x): 123 | '''Combine all heads 124 | Returns: 125 | a Tensor with shape [batch, length_x, shape_x[-1] * shape_x[-3]] 126 | ''' 127 | x = tf.transpose(x, [0, 2, 1, 3]) 128 | x_shape = shape_list(x) 129 | return tf.reshape(x, x_shape[:-2] + [self.num_heads * x_shape[-1]]) 130 | -------------------------------------------------------------------------------- /tacotron/models/custom_decoder.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import collections 6 | import tensorflow as tf 7 | 8 | from tensorflow.contrib.seq2seq.python.ops import decoder 9 | from tensorflow.contrib.seq2seq.python.ops import helper as helper_py 10 | from tensorflow.python.framework import ops 11 | from tensorflow.python.framework import tensor_shape 12 | from tensorflow.python.layers import base as layers_base 13 | from tensorflow.python.ops import rnn_cell_impl 14 | from tensorflow.python.util import nest 15 | from tacotron.models.helpers import TacoTrainingHelper, TacoTestHelper 16 | 17 | 18 | 19 | class CustomDecoderOutput( 20 | collections.namedtuple("CustomDecoderOutput", ("rnn_output", "token_output", "sample_id"))): 21 | pass 22 | 23 | 24 | class CustomDecoder(decoder.Decoder): 25 | """Custom sampling decoder. 26 | 27 | Allows for stop token prediction at inference time 28 | and returns equivalent loss in training time. 29 | 30 | Note: 31 | Only use this decoder with Tacotron 2 as it only accepts tacotron custom helpers 32 | """ 33 | 34 | def __init__(self, cell, helper, initial_state, output_layer=None): 35 | """Initialize CustomDecoder. 36 | Args: 37 | cell: An `RNNCell` instance. 38 | helper: A `Helper` instance. 39 | initial_state: A (possibly nested tuple of...) tensors and TensorArrays. 40 | The initial state of the RNNCell. 41 | output_layer: (Optional) An instance of `tf.layers.Layer`, i.e., 42 | `tf.layers.Dense`. Optional layer to apply to the RNN output prior 43 | to storing the result or sampling. 44 | Raises: 45 | TypeError: if `cell`, `helper` or `output_layer` have an incorrect type. 
46 | """ 47 | if not rnn_cell_impl._like_rnncell(cell): # pylint: disable=protected-access 48 | raise TypeError("cell must be an RNNCell, received: %s" % type(cell)) 49 | if not isinstance(helper, helper_py.Helper): 50 | raise TypeError("helper must be a Helper, received: %s" % type(helper)) 51 | if (output_layer is not None 52 | and not isinstance(output_layer, layers_base.Layer)): 53 | raise TypeError( 54 | "output_layer must be a Layer, received: %s" % type(output_layer)) 55 | self._cell = cell 56 | self._helper = helper 57 | self._initial_state = initial_state 58 | self._output_layer = output_layer 59 | 60 | @property 61 | def batch_size(self): 62 | return self._helper.batch_size 63 | 64 | def _rnn_output_size(self): 65 | size = self._cell.output_size 66 | if self._output_layer is None: 67 | return size 68 | else: 69 | # To use layer's compute_output_shape, we need to convert the 70 | # RNNCell's output_size entries into shapes with an unknown 71 | # batch size. We then pass this through the layer's 72 | # compute_output_shape and read off all but the first (batch) 73 | # dimensions to get the output size of the rnn with the layer 74 | # applied to the top. 75 | output_shape_with_unknown_batch = nest.map_structure( 76 | lambda s: tensor_shape.TensorShape([None]).concatenate(s), 77 | size) 78 | layer_output_shape = self._output_layer._compute_output_shape( # pylint: disable=protected-access 79 | output_shape_with_unknown_batch) 80 | return nest.map_structure(lambda s: s[1:], layer_output_shape) 81 | 82 | @property 83 | def output_size(self): 84 | # Return the cell output and the id 85 | return CustomDecoderOutput( 86 | rnn_output=self._rnn_output_size(), 87 | token_output=self._helper.token_output_size, 88 | sample_id=self._helper.sample_ids_shape) 89 | 90 | @property 91 | def output_dtype(self): 92 | # Assume the dtype of the cell is the output_size structure 93 | # containing the input_state's first component's dtype. 94 | # Return that structure and the sample_ids_dtype from the helper. 95 | dtype = nest.flatten(self._initial_state)[0].dtype 96 | return CustomDecoderOutput( 97 | nest.map_structure(lambda _: dtype, self._rnn_output_size()), 98 | tf.float32, 99 | self._helper.sample_ids_dtype) 100 | 101 | def initialize(self, name=None): 102 | """Initialize the decoder. 103 | Args: 104 | name: Name scope for any created operations. 105 | Returns: 106 | `(finished, first_inputs, initial_state)`. 107 | """ 108 | return self._helper.initialize() + (self._initial_state,) 109 | 110 | def step(self, time, inputs, state, name=None): 111 | """Perform a custom decoding step. 112 | Enables for dyanmic prediction 113 | Args: 114 | time: scalar `int32` tensor. 115 | inputs: A (structure of) input tensors. 116 | state: A (structure of) state tensors and TensorArrays. 117 | name: Name scope for any created operations. 118 | Returns: 119 | `(outputs, next_state, next_inputs, finished)`. 
120 | """ 121 | with ops.name_scope(name, "CustomDecoderStep", (time, inputs, state)): 122 | #Call outputprojection wrapper cell 123 | (cell_outputs, stop_token), cell_state = self._cell(inputs, state) 124 | 125 | #apply output_layer (if existant) 126 | if self._output_layer is not None: 127 | cell_outputs = self._output_layer(cell_outputs) 128 | sample_ids = self._helper.sample( 129 | time=time, outputs=cell_outputs, state=cell_state) 130 | 131 | (finished, next_inputs, next_state) = self._helper.next_inputs( 132 | time=time, 133 | outputs=cell_outputs, 134 | state=cell_state, 135 | sample_ids=sample_ids, 136 | stop_token_prediction=stop_token) 137 | 138 | outputs = CustomDecoderOutput(cell_outputs, stop_token, sample_ids) 139 | return (outputs, next_state, next_inputs, finished) -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import tensorflow as tf 3 | from tacotron.train import tacotron_train 4 | from wavenet_vocoder.train import wavenet_train 5 | from tacotron.synthesize import tacotron_synthesize 6 | from infolog import log 7 | from hparams import hparams 8 | import os 9 | import infolog 10 | from time import sleep 11 | 12 | log = infolog.log 13 | 14 | 15 | def save_seq(file, sequence, input_path): 16 | '''Save Tacotron-2 training state to disk. (To skip for future runs) 17 | ''' 18 | sequence = [str(int(s)) for s in sequence] + [input_path] 19 | with open(file, 'w') as f: 20 | f.write('|'.join(sequence)) 21 | 22 | def read_seq(file, restore): 23 | '''Load Tacotron-2 training state from disk. (To skip if not first run) 24 | ''' 25 | if os.path.isfile(file) and restore == True: 26 | with open(file, 'r') as f: 27 | sequence = f.read().split('|') 28 | 29 | return [bool(int(s)) for s in sequence[:-1]], sequence[-1] 30 | else: 31 | return [0, 0, 0], '' 32 | 33 | def prepare_run(args): 34 | modified_hp = hparams.parse(args.hparams) 35 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = str(args.tf_log_level) 36 | run_name = args.name or args.model 37 | log_dir = os.path.join(args.base_dir, 'logs-{}'.format(run_name)) 38 | os.makedirs(log_dir, exist_ok=True) 39 | infolog.init(os.path.join(log_dir, 'Terminal_train_log'), run_name) 40 | return log_dir, modified_hp 41 | 42 | def train(args, log_dir, hparams): 43 | state_file = os.path.join(log_dir, 'state_log') 44 | #Get training states 45 | (taco_state, GTA_state, wave_state), input_path = read_seq(state_file, args.restore) 46 | 47 | if not taco_state: 48 | log('\n#############################################################\n') 49 | log('Tacotron Train\n') 50 | log('###########################################################\n') 51 | checkpoint = tacotron_train(args, log_dir, hparams) 52 | tf.reset_default_graph() 53 | #Sleep 1 second to let previous graph close and avoid error messages while synthesis 54 | sleep(1) 55 | if checkpoint is None: 56 | raise('Error occured while training Tacotron, Exiting!') 57 | taco_state = 1 58 | save_seq(state_file, [taco_state, GTA_state, wave_state], input_path) 59 | 60 | if not GTA_state: 61 | log('\n#############################################################\n') 62 | log('Tacotron GTA Synthesis\n') 63 | log('###########################################################\n') 64 | input_path = tacotron_synthesize(args, hparams, checkpoint) 65 | GTA_state = 1 66 | save_seq(state_file, [taco_state, GTA_state, wave_state], input_path) 67 | 68 | if input_path == '' or 
input_path is None: 69 | raise RuntimeError('input_path has an unpleasant value -> {}'.format(input_path)) 70 | 71 | if not wave_state: 72 | log('\n#############################################################\n') 73 | log('Wavenet Train\n') 74 | log('###########################################################\n') 75 | checkpoint = wavenet_train(args, log_dir, hparams, input_path) 76 | if checkpoint is None: 77 | raise ('Error occured while training Wavenet, Exiting!') 78 | wave_state = 1 79 | save_seq(state_file, [taco_state, GTA_state, wave_state], input_path) 80 | 81 | if wave_state and GTA_state and taco_state: 82 | log('TRAINING IS ALREADY COMPLETE!!') 83 | 84 | def main(): 85 | parser = argparse.ArgumentParser() 86 | parser.add_argument('--base_dir', default='') 87 | parser.add_argument('--hparams', default='', 88 | help='Hyperparameter overrides as a comma-separated list of name=value pairs') 89 | parser.add_argument('--tacotron_input', default='training_data/train.txt') 90 | parser.add_argument('--wavenet_input', default='tacotron_output/gta/map.txt') 91 | parser.add_argument('--name', help='Name of logging directory.') 92 | parser.add_argument('--model', default='Tacotron-2') 93 | parser.add_argument('--input_dir', default='training_data/', help='folder to contain inputs sentences/targets') 94 | parser.add_argument('--output_dir', default='output/', help='folder to contain synthesized mel spectrograms') 95 | parser.add_argument('--mode', default='synthesis', help='mode for synthesis of tacotron after training') 96 | parser.add_argument('--GTA', default='True', help='Ground truth aligned synthesis, defaults to True, only considered in Tacotron synthesis mode') 97 | parser.add_argument('--restore', type=bool, default=False, help='Set this to True to resume training') 98 | parser.add_argument('--summary_interval', type=int, default=250, 99 | help='Steps between running summary ops') 100 | parser.add_argument('--checkpoint_interval', type=int, default=500, 101 | help='Steps between writing checkpoints') 102 | parser.add_argument('--eval_interval', type=int, default=5000, 103 | help='Steps between eval on test data') 104 | parser.add_argument('--tacotron_train_steps', type=int, default=100000, help='total number of tacotron training steps') 105 | parser.add_argument('--wavenet_train_steps', type=int, default=100000, help='total number of wavenet training steps') 106 | parser.add_argument('--tf_log_level', type=int, default=1, help='Tensorflow C++ log level.') 107 | args = parser.parse_args() 108 | 109 | accepted_models = ['Tacotron', 'WaveNet', 'Both', 'Tacotron-2'] 110 | 111 | if args.model not in accepted_models: 112 | raise ValueError('please enter a valid model to train: {}'.format(accepted_models)) 113 | 114 | log_dir, hparams = prepare_run(args) 115 | 116 | if args.model == 'Tacotron': 117 | tacotron_train(args, log_dir, hparams) 118 | elif args.model == 'WaveNet': 119 | wavenet_train(args, log_dir, hparams, args.wavenet_input) 120 | elif args.model in ('Both', 'Tacotron-2'): 121 | train(args, log_dir, hparams) 122 | else: 123 | raise ValueError('Model provided {} unknown! 
{}'.format(args.model, accepted_models)) 124 | 125 | 126 | if __name__ == '__main__': 127 | main() -------------------------------------------------------------------------------- /datasets/preprocessor.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ProcessPoolExecutor 2 | from functools import partial 3 | from datasets import audio 4 | import os 5 | import numpy as np 6 | from wavenet_vocoder.util import mulaw_quantize, mulaw, is_mulaw, is_mulaw_quantize 7 | 8 | 9 | def build_from_path(hparams, input_dirs, mel_dir, linear_dir, wav_dir, n_jobs=12, tqdm=lambda x: x): 10 | """ 11 | Preprocesses the speech dataset from a gven input path to given output directories 12 | 13 | Args: 14 | - hparams: hyper parameters 15 | - input_dir: input directory that contains the files to prerocess 16 | - mel_dir: output directory of the preprocessed speech mel-spectrogram dataset 17 | - linear_dir: output directory of the preprocessed speech linear-spectrogram dataset 18 | - wav_dir: output directory of the preprocessed speech audio dataset 19 | - n_jobs: Optional, number of worker process to parallelize across 20 | - tqdm: Optional, provides a nice progress bar 21 | 22 | Returns: 23 | - A list of tuple describing the train examples. this should be written to train.txt 24 | """ 25 | 26 | # We use ProcessPoolExecutor to parallelize across processes, this is just for 27 | # optimization purposes and it can be omited 28 | executor = ProcessPoolExecutor(max_workers=n_jobs) 29 | futures = [] 30 | index = 1 31 | for input_dir in input_dirs: 32 | with open(os.path.join(input_dir, 'metadata.csv'), encoding='utf-8') as f: 33 | for line in f: 34 | parts = line.strip().split('|') 35 | wav_path = os.path.join(input_dir, 'wavs', '{}.wav'.format(parts[0])) 36 | text = parts[2] 37 | futures.append(executor.submit(partial(_process_utterance, mel_dir, linear_dir, wav_dir, index, wav_path, text, hparams))) 38 | index += 1 39 | 40 | return [future.result() for future in tqdm(futures) if future.result() is not None] 41 | 42 | 43 | def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text, hparams): 44 | """ 45 | Preprocesses a single utterance wav/text pair 46 | 47 | this writes the mel scale spectogram to disk and return a tuple to write 48 | to the train.txt file 49 | 50 | Args: 51 | - mel_dir: the directory to write the mel spectograms into 52 | - linear_dir: the directory to write the linear spectrograms into 53 | - wav_dir: the directory to write the preprocessed wav into 54 | - index: the numeric index to use in the spectogram filename 55 | - wav_path: path to the audio file containing the speech input 56 | - text: text spoken in the input audio file 57 | - hparams: hyper parameters 58 | 59 | Returns: 60 | - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text) 61 | """ 62 | try: 63 | # Load the audio as numpy array 64 | wav = audio.load_wav(wav_path, sr=hparams.sample_rate) 65 | except FileNotFoundError: #catch missing wav exception 66 | print('file {} present in csv metadata is not present in wav folder. 
skipping!'.format( 67 | wav_path)) 68 | return None 69 | 70 | #rescale wav 71 | if hparams.rescale: 72 | wav = wav / np.abs(wav).max() * hparams.rescaling_max 73 | 74 | #M-AILABS extra silence specific 75 | if hparams.trim_silence: 76 | wav = audio.trim_silence(wav, hparams) 77 | 78 | #Mu-law quantize 79 | if is_mulaw_quantize(hparams.input_type): 80 | #[0, quantize_channels) 81 | out = mulaw_quantize(wav, hparams.quantize_channels) 82 | 83 | #Trim silences 84 | start, end = audio.start_and_end_indices(out, hparams.silence_threshold) 85 | wav = wav[start: end] 86 | out = out[start: end] 87 | 88 | constant_values = mulaw_quantize(0, hparams.quantize_channels) 89 | out_dtype = np.int16 90 | 91 | elif is_mulaw(hparams.input_type): 92 | #[-1, 1] 93 | out = mulaw(wav, hparams.quantize_channels) 94 | constant_values = mulaw(0., hparams.quantize_channels) 95 | out_dtype = np.float32 96 | 97 | else: 98 | #[-1, 1] 99 | out = wav 100 | constant_values = 0. 101 | out_dtype = np.float32 102 | 103 | # Compute the mel scale spectrogram from the wav 104 | mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32) 105 | mel_frames = mel_spectrogram.shape[1] 106 | 107 | if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length: 108 | return None 109 | 110 | #Compute the linear scale spectrogram from the wav 111 | linear_spectrogram = audio.linearspectrogram(wav, hparams).astype(np.float32) 112 | linear_frames = linear_spectrogram.shape[1] 113 | 114 | #sanity check 115 | assert linear_frames == mel_frames 116 | 117 | #Ensure time resolution adjustement between audio and mel-spectrogram 118 | fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size 119 | l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams)) 120 | 121 | #Zero pad for quantized signal 122 | out = np.pad(out, (l, r), mode='constant', constant_values=constant_values) 123 | assert len(out) >= mel_frames * audio.get_hop_size(hparams) 124 | 125 | #time resolution adjustement 126 | #ensure length of raw audio is multiple of hop size so that we can use 127 | #transposed convolution to upsample 128 | out = out[:mel_frames * audio.get_hop_size(hparams)] 129 | assert len(out) % audio.get_hop_size(hparams) == 0 130 | time_steps = len(out) 131 | 132 | # Write the spectrogram and audio to disk 133 | audio_filename = 'speech-audio-{:05d}.npy'.format(index) 134 | mel_filename = 'speech-mel-{:05d}.npy'.format(index) 135 | linear_filename = 'speech-linear-{:05d}.npy'.format(index) 136 | np.save(os.path.join(wav_dir, audio_filename), out.astype(out_dtype), allow_pickle=False) 137 | np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False) 138 | np.save(os.path.join(linear_dir, linear_filename), linear_spectrogram.T, allow_pickle=False) 139 | 140 | # Return a tuple describing this training example 141 | return (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text) -------------------------------------------------------------------------------- /wavenet_vocoder/util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import librosa.display as dsp 4 | import matplotlib.pyplot as plt 5 | 6 | 7 | def _assert_valid_input_type(s): 8 | assert s == 'mulaw-quantize' or s == 'mulaw' or s == 'raw' 9 | 10 | def is_mulaw_quantize(s): 11 | _assert_valid_input_type(s) 12 | return s == 'mulaw-quantize' 13 | 14 | def is_mulaw(s): 15 | _assert_valid_input_type(s) 16 | return s == 'mulaw' 17 | 18 
| def is_raw(s): 19 | _assert_valid_input_type(s) 20 | return s == 'raw' 21 | 22 | def is_scalar_input(s): 23 | return is_raw(s) or is_mulaw(s) 24 | 25 | 26 | #From https://github.com/r9y9/nnmnkwii/blob/master/nnmnkwii/preprocessing/generic.py 27 | def mulaw(x, mu=256): 28 | """Mu-Law companding 29 | Method described in paper [1]_. 30 | .. math:: 31 | f(x) = sign(x) ln (1 + mu |x|) / ln (1 + mu) 32 | Args: 33 | x (array-like): Input signal. Each value of input signal must be in 34 | range of [-1, 1]. 35 | mu (number): Compression parameter ``μ``. 36 | Returns: 37 | array-like: Compressed signal ([-1, 1]) 38 | See also: 39 | :func:`nnmnkwii.preprocessing.inv_mulaw` 40 | :func:`nnmnkwii.preprocessing.mulaw_quantize` 41 | :func:`nnmnkwii.preprocessing.inv_mulaw_quantize` 42 | .. [1] Brokish, Charles W., and Michele Lewis. "A-law and mu-law companding 43 | implementations using the tms320c54x." SPRA163 (1997). 44 | """ 45 | mu -= 1 46 | return _sign(x) * _log1p(mu * _abs(x)) / _log1p(mu) 47 | 48 | 49 | def inv_mulaw(y, mu=256): 50 | """Inverse of mu-law companding (mu-law expansion) 51 | .. math:: 52 | f^{-1}(x) = sign(y) (1 / mu) (1 + mu)^{|y|} - 1) 53 | Args: 54 | y (array-like): Compressed signal. Each value of input signal must be in 55 | range of [-1, 1]. 56 | mu (number): Compression parameter ``μ``. 57 | Returns: 58 | array-like: Uncomprresed signal (-1 <= x <= 1) 59 | See also: 60 | :func:`nnmnkwii.preprocessing.inv_mulaw` 61 | :func:`nnmnkwii.preprocessing.mulaw_quantize` 62 | :func:`nnmnkwii.preprocessing.inv_mulaw_quantize` 63 | """ 64 | mu -= 1 65 | return _sign(y) * (1.0 / mu) * ((1.0 + mu)**_abs(y) - 1.0) 66 | 67 | 68 | def mulaw_quantize(x, mu=256): 69 | """Mu-Law companding + quantize 70 | Args: 71 | x (array-like): Input signal. Each value of input signal must be in 72 | range of [-1, 1]. 73 | mu (number): Compression parameter ``μ``. 74 | Returns: 75 | array-like: Quantized signal (dtype=int) 76 | - y ∈ [0, mu] if x ∈ [-1, 1] 77 | - y ∈ [0, mu) if x ∈ [-1, 1) 78 | .. note:: 79 | If you want to get quantized values of range [0, mu) (not [0, mu]), 80 | then you need to provide input signal of range [-1, 1). 81 | Examples: 82 | >>> from scipy.io import wavfile 83 | >>> import pysptk 84 | >>> import numpy as np 85 | >>> from nnmnkwii import preprocessing as P 86 | >>> fs, x = wavfile.read(pysptk.util.example_audio_file()) 87 | >>> x = (x / 32768.0).astype(np.float32) 88 | >>> y = P.mulaw_quantize(x) 89 | >>> print(y.min(), y.max(), y.dtype) 90 | 15 246 int64 91 | See also: 92 | :func:`nnmnkwii.preprocessing.mulaw` 93 | :func:`nnmnkwii.preprocessing.inv_mulaw` 94 | :func:`nnmnkwii.preprocessing.inv_mulaw_quantize` 95 | """ 96 | mu -= 1 97 | y = mulaw(x, mu) 98 | # scale [-1, 1] to [0, mu] 99 | return _asint((y + 1) / 2 * mu) 100 | 101 | 102 | def inv_mulaw_quantize(y, mu=255): 103 | """Inverse of mu-law companding + quantize 104 | Args: 105 | y (array-like): Quantized signal (∈ [0, mu]). 106 | mu (number): Compression parameter ``μ``. 
107 | Returns: 108 | array-like: Uncompressed signal ([-1, 1]) 109 | Examples: 110 | >>> from scipy.io import wavfile 111 | >>> import pysptk 112 | >>> import numpy as np 113 | >>> from nnmnkwii import preprocessing as P 114 | >>> fs, x = wavfile.read(pysptk.util.example_audio_file()) 115 | >>> x = (x / 32768.0).astype(np.float32) 116 | >>> x_hat = P.inv_mulaw_quantize(P.mulaw_quantize(x)) 117 | >>> x_hat = (x_hat * 32768).astype(np.int16) 118 | See also: 119 | :func:`nnmnkwii.preprocessing.mulaw` 120 | :func:`nnmnkwii.preprocessing.inv_mulaw` 121 | :func:`nnmnkwii.preprocessing.mulaw_quantize` 122 | """ 123 | # [0, m) to [-1, 1] 124 | mu -= 1 125 | y = 2 * _asfloat(y) / mu - 1 126 | return inv_mulaw(y, mu) 127 | 128 | def _sign(x): 129 | #wrapper to support tensorflow tensors/numpy arrays 130 | isnumpy = isinstance(x, np.ndarray) 131 | isscalar = np.isscalar(x) 132 | return np.sign(x) if (isnumpy or isscalar) else tf.sign(x) 133 | 134 | 135 | def _log1p(x): 136 | #wrapper to support tensorflow tensors/numpy arrays 137 | isnumpy = isinstance(x, np.ndarray) 138 | isscalar = np.isscalar(x) 139 | return np.log1p(x) if (isnumpy or isscalar) else tf.log1p(x) 140 | 141 | 142 | def _abs(x): 143 | #wrapper to support tensorflow tensors/numpy arrays 144 | isnumpy = isinstance(x, np.ndarray) 145 | isscalar = np.isscalar(x) 146 | return np.abs(x) if (isnumpy or isscalar) else tf.abs(x) 147 | 148 | 149 | def _asint(x): 150 | #wrapper to support tensorflow tensors/numpy arrays 151 | isnumpy = isinstance(x, np.ndarray) 152 | isscalar = np.isscalar(x) 153 | return x.astype(np.int) if isnumpy else int(x) if isscalar else tf.cast(x, tf.int32) 154 | 155 | 156 | def _asfloat(x): 157 | #wrapper to support tensorflow tensors/numpy arrays 158 | isnumpy = isinstance(x, np.ndarray) 159 | isscalar = np.isscalar(x) 160 | return x.astype(np.float32) if isnumpy else float(x) if isscalar else tf.cast(x, tf.float32) 161 | 162 | def sequence_mask(input_lengths, max_len=None, expand=True): 163 | if max_len is None: 164 | max_len = tf.reduce_max(input_lengths) 165 | 166 | if expand: 167 | return tf.expand_dims(tf.sequence_mask(input_lengths, max_len, dtype=tf.float32), axis=-1) 168 | return tf.sequence_mask(input_lengths, max_len, dtype=tf.float32) 169 | 170 | 171 | def waveplot(path, y_hat, y_target, hparams): 172 | sr = hparams.sample_rate 173 | 174 | plt.figure(figsize=(12, 4)) 175 | if y_target is not None: 176 | ax = plt.subplot(2, 1, 1) 177 | dsp.waveplot(y_target, sr=sr) 178 | ax.set_title('Target waveform') 179 | ax = plt.subplot(2, 1, 2) 180 | dsp.waveplot(y_hat, sr=sr) 181 | ax.set_title('Prediction waveform') 182 | else: 183 | ax = plt.subplot(1, 1, 1) 184 | dsp.waveplot(y_hat, sr=sr) 185 | ax.set_title('Generated waveform') 186 | 187 | plt.tight_layout() 188 | plt.savefig(path, format="png") 189 | plt.close() -------------------------------------------------------------------------------- /tacotron/models/helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from tensorflow.contrib.seq2seq import Helper 4 | 5 | 6 | class TacoTestHelper(Helper): 7 | def __init__(self, batch_size, hparams): 8 | with tf.name_scope('TacoTestHelper'): 9 | self._batch_size = batch_size 10 | self._output_dim = hparams.num_mels 11 | self._reduction_factor = hparams.outputs_per_step 12 | self.stop_at_any = hparams.stop_at_any 13 | 14 | @property 15 | def batch_size(self): 16 | return self._batch_size 17 | 18 | @property 19 | def 
token_output_size(self): 20 | return self._reduction_factor 21 | 22 | @property 23 | def sample_ids_shape(self): 24 | return tf.TensorShape([]) 25 | 26 | @property 27 | def sample_ids_dtype(self): 28 | return np.int32 29 | 30 | def initialize(self, name=None): 31 | return (tf.tile([False], [self._batch_size]), _go_frames(self._batch_size, self._output_dim)) 32 | 33 | def sample(self, time, outputs, state, name=None): 34 | return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them 35 | 36 | def next_inputs(self, time, outputs, state, sample_ids, stop_token_prediction, name=None): 37 | '''Stop on EOS. Otherwise, pass the last output as the next input and pass through state.''' 38 | with tf.name_scope('TacoTestHelper'): 39 | #A sequence is finished when the output probability is > 0.5 40 | finished = tf.cast(tf.round(stop_token_prediction), tf.bool) 41 | 42 | #Since we are predicting r frames at each step, two modes are 43 | #then possible: 44 | # Stop when the model outputs a p > 0.5 for any frame between r frames (Recommended) 45 | # Stop when the model outputs a p > 0.5 for all r frames (Safer) 46 | #Note: 47 | # With enough training steps, the model should be able to predict when to stop correctly 48 | # and the use of stop_at_any = True would be recommended. If however the model didn't 49 | # learn to stop correctly yet, (stops too soon) one could choose to use the safer option 50 | # to get a correct synthesis 51 | if self.stop_at_any: 52 | finished = tf.reduce_any(finished) #Recommended 53 | else: 54 | finished = tf.reduce_all(finished) #Safer option 55 | 56 | # Feed last output frame as next input. outputs is [N, output_dim * r] 57 | next_inputs = outputs[:, -self._output_dim:] 58 | next_state = state 59 | return (finished, next_inputs, next_state) 60 | 61 | 62 | class TacoTrainingHelper(Helper): 63 | def __init__(self, batch_size, targets, stop_targets, hparams, gta, evaluating, global_step): 64 | # inputs is [N, T_in], targets is [N, T_out, D] 65 | with tf.name_scope('TacoTrainingHelper'): 66 | self._batch_size = batch_size 67 | self._output_dim = hparams.num_mels 68 | self._reduction_factor = hparams.outputs_per_step 69 | self._ratio = tf.convert_to_tensor(hparams.tacotron_teacher_forcing_ratio) 70 | self.gta = gta 71 | self.eval = evaluating 72 | self._hparams = hparams 73 | self.global_step = global_step 74 | 75 | r = self._reduction_factor 76 | # Feed every r-th target frame as input 77 | self._targets = targets[:, r-1::r, :] 78 | 79 | #Maximal sequence length 80 | self._lengths = tf.tile([tf.shape(self._targets)[1]], [self._batch_size]) 81 | 82 | @property 83 | def batch_size(self): 84 | return self._batch_size 85 | 86 | @property 87 | def token_output_size(self): 88 | return self._reduction_factor 89 | 90 | @property 91 | def sample_ids_shape(self): 92 | return tf.TensorShape([]) 93 | 94 | @property 95 | def sample_ids_dtype(self): 96 | return np.int32 97 | 98 | def initialize(self, name=None): 99 | #Compute teacher forcing ratio for this global step. 100 | #In GTA mode, override teacher forcing scheme to work with full teacher forcing 101 | if self.gta: 102 | self._ratio = tf.convert_to_tensor(1.) #Force GTA model to always feed ground-truth 103 | elif self.eval and self._hparams.natural_eval: 104 | self._ratio = tf.convert_to_tensor(0.) 
#Force eval model to always feed predictions 105 | else: 106 | if self._hparams.tacotron_teacher_forcing_mode == 'scheduled': 107 | self._ratio = _teacher_forcing_ratio_decay(self._hparams.tacotron_teacher_forcing_init_ratio, 108 | self.global_step, self._hparams) 109 | 110 | return (tf.tile([False], [self._batch_size]), _go_frames(self._batch_size, self._output_dim)) 111 | 112 | def sample(self, time, outputs, state, name=None): 113 | return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them 114 | 115 | def next_inputs(self, time, outputs, state, sample_ids, stop_token_prediction, name=None): 116 | with tf.name_scope(name or 'TacoTrainingHelper'): 117 | #synthesis stop (we let the model see paddings as we mask them when computing loss functions) 118 | finished = (time + 1 >= self._lengths) 119 | 120 | #Pick previous outputs randomly with respect to teacher forcing ratio 121 | next_inputs = tf.cond( 122 | tf.less(tf.random_uniform([], minval=0, maxval=1, dtype=tf.float32), self._ratio), 123 | lambda: self._targets[:, time, :], #Teacher-forcing: return true frame 124 | lambda: outputs[:,-self._output_dim:]) 125 | 126 | #Pass on state 127 | next_state = state 128 | return (finished, next_inputs, next_state) 129 | 130 | 131 | def _go_frames(batch_size, output_dim): 132 | '''Returns all-zero frames for a given batch size and output dimension''' 133 | return tf.tile([[0.0]], [batch_size, output_dim]) 134 | 135 | def _teacher_forcing_ratio_decay(init_tfr, global_step, hparams): 136 | ################################################################# 137 | # Narrow Cosine Decay: 138 | 139 | # Phase 1: tfr = 1 140 | # We only start learning rate decay after 10k steps 141 | 142 | # Phase 2: tfr in ]0, 1[ 143 | # decay reach minimal value at step ~280k 144 | 145 | # Phase 3: tfr = 0 146 | # clip by minimal teacher forcing ratio value (step >~ 280k) 147 | ################################################################# 148 | #Compute natural cosine decay 149 | tfr = tf.train.cosine_decay(init_tfr, 150 | global_step=global_step - hparams.tacotron_teacher_forcing_start_decay, #tfr = 1 at step 10k 151 | decay_steps=hparams.tacotron_teacher_forcing_decay_steps, #tfr = 0 at step ~280k 152 | alpha=hparams.tacotron_teacher_forcing_decay_alpha, #tfr = 0% of init_tfr as final value 153 | name='tfr_cosine_decay') 154 | 155 | #force teacher forcing ratio to take initial value when global step < start decay step. 
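	#Illustrative sketch of the resulting schedule (assuming the values described in the
	#comments above: decay starting near 10k steps, minimum reached around 280k steps, small alpha):
	#  tfr(step) = init_tfr * ((1 - alpha) * 0.5 * (1 + cos(pi * (step - start_decay) / decay_steps)) + alpha)
	#  step <= start_decay                      -> tfr = init_tfr (forced by the tf.cond below)
	#  step at the decay midpoint               -> tfr ~= 0.5 * init_tfr
	#  step >= start_decay + decay_steps        -> tfr = alpha * init_tfr (clipped minimum)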
156 | narrow_tfr = tf.cond( 157 | tf.less(global_step, tf.convert_to_tensor(hparams.tacotron_teacher_forcing_start_decay)), 158 | lambda: tf.convert_to_tensor(init_tfr), 159 | lambda: tfr) 160 | 161 | return narrow_tfr -------------------------------------------------------------------------------- /datasets/audio.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import librosa.filters 3 | import numpy as np 4 | from scipy import signal 5 | import tensorflow as tf 6 | from scipy.io import wavfile 7 | 8 | 9 | def load_wav(path, sr): 10 | return librosa.core.load(path, sr=sr)[0] 11 | 12 | def save_wav(wav, path, sr): 13 | wav *= 32767 / max(0.01, np.max(np.abs(wav))) 14 | #proposed by @dsmiller 15 | wavfile.write(path, sr, wav.astype(np.int16)) 16 | 17 | #From https://github.com/r9y9/wavenet_vocoder/blob/master/audio.py 18 | def start_and_end_indices(quantized, silence_threshold=2): 19 | for start in range(quantized.size): 20 | if abs(quantized[start] - 127) > silence_threshold: 21 | break 22 | for end in range(quantized.size - 1, 1, -1): 23 | if abs(quantized[end] - 127) > silence_threshold: 24 | break 25 | 26 | assert abs(quantized[start] - 127) > silence_threshold 27 | assert abs(quantized[end] - 127) > silence_threshold 28 | 29 | return start, end 30 | 31 | def trim_silence(wav, hparams): 32 | '''Trim leading and trailing silence 33 | 34 | Useful for M-AILABS dataset if we choose to trim the extra 0.5 silence at beginning and end. 35 | ''' 36 | #Thanks @begeekmyfriend and @lautjy for pointing out the params contradiction. These params are separate and tunable per dataset. 37 | return librosa.effects.trim(wav, top_db= hparams.trim_top_db, frame_length=hparams.trim_fft_size, hop_length=hparams.trim_hop_size)[0] 38 | 39 | def get_hop_size(hparams): 40 | hop_size = hparams.hop_size 41 | if hop_size is None: 42 | assert hparams.frame_shift_ms is not None 43 | hop_size = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate) 44 | return hop_size 45 | 46 | def linearspectrogram(wav, hparams): 47 | D = _stft(wav, hparams) 48 | S = _amp_to_db(np.abs(D), hparams) - hparams.ref_level_db 49 | 50 | if hparams.signal_normalization: 51 | return _normalize(S, hparams) 52 | return S 53 | 54 | def melspectrogram(wav, hparams): 55 | D = _stft(wav, hparams) 56 | S = _amp_to_db(_linear_to_mel(np.abs(D), hparams), hparams) - hparams.ref_level_db 57 | 58 | if hparams.signal_normalization: 59 | return _normalize(S, hparams) 60 | return S 61 | 62 | def inv_linear_spectrogram(linear_spectrogram, hparams): 63 | '''Converts linear spectrogram to waveform using librosa''' 64 | if hparams.signal_normalization: 65 | D = _denormalize(linear_spectrogram, hparams) 66 | else: 67 | D = linear_spectrogram 68 | 69 | S = _db_to_amp(D + hparams.ref_level_db) #Convert back to linear 70 | 71 | if hparams.use_lws: 72 | processor = _lws_processor(hparams) 73 | D = processor.run_lws(S.astype(np.float64).T ** hparams.power) 74 | y = processor.istft(D).astype(np.float32) 75 | return y 76 | else: 77 | return _griffin_lim(S ** hparams.power, hparams) 78 | 79 | 80 | def inv_mel_spectrogram(mel_spectrogram, hparams): 81 | '''Converts mel spectrogram to waveform using librosa''' 82 | if hparams.signal_normalization: 83 | D = _denormalize(mel_spectrogram, hparams) 84 | else: 85 | D = mel_spectrogram 86 | 87 | S = _mel_to_linear(_db_to_amp(D + hparams.ref_level_db), hparams) # Convert back to linear 88 | 89 | if hparams.use_lws: 90 | processor = _lws_processor(hparams) 91 | D = 
processor.run_lws(S.astype(np.float64).T ** hparams.power) 92 | y = processor.istft(D).astype(np.float32) 93 | return y 94 | else: 95 | return _griffin_lim(S ** hparams.power, hparams) 96 | 97 | def _lws_processor(hparams): 98 | import lws 99 | return lws.lws(hparams.n_fft, get_hop_size(hparams), fftsize=hparams.win_size, mode="speech") 100 | 101 | def _griffin_lim(S, hparams): 102 | '''librosa implementation of Griffin-Lim 103 | Based on https://github.com/librosa/librosa/issues/434 104 | ''' 105 | angles = np.exp(2j * np.pi * np.random.rand(*S.shape)) 106 | S_complex = np.abs(S).astype(np.complex) 107 | y = _istft(S_complex * angles, hparams) 108 | for i in range(hparams.griffin_lim_iters): 109 | angles = np.exp(1j * np.angle(_stft(y, hparams))) 110 | y = _istft(S_complex * angles, hparams) 111 | return y 112 | 113 | def _stft(y, hparams): 114 | if hparams.use_lws: 115 | return _lws_processor(hparams).stft(y).T 116 | else: 117 | return librosa.stft(y=y, n_fft=hparams.n_fft, hop_length=get_hop_size(hparams), win_length=hparams.win_size) 118 | 119 | def _istft(y, hparams): 120 | return librosa.istft(y, hop_length=get_hop_size(hparams), win_length=hparams.win_size) 121 | 122 | def num_frames(length, fsize, fshift): 123 | """Compute number of time frames of spectrogram 124 | """ 125 | pad = (fsize - fshift) 126 | if length % fshift == 0: 127 | M = (length + pad * 2 - fsize) // fshift + 1 128 | else: 129 | M = (length + pad * 2 - fsize) // fshift + 2 130 | return M 131 | 132 | 133 | def pad_lr(x, fsize, fshift): 134 | """Compute left and right padding 135 | """ 136 | M = num_frames(len(x), fsize, fshift) 137 | pad = (fsize - fshift) 138 | T = len(x) + 2 * pad 139 | r = (M - 1) * fshift + fsize - T 140 | return pad, pad + r 141 | 142 | 143 | # Conversions 144 | _mel_basis = None 145 | _inv_mel_basis = None 146 | 147 | def _linear_to_mel(spectogram, hparams): 148 | global _mel_basis 149 | if _mel_basis is None: 150 | _mel_basis = _build_mel_basis(hparams) 151 | return np.dot(_mel_basis, spectogram) 152 | 153 | def _mel_to_linear(mel_spectrogram, hparams): 154 | global _inv_mel_basis 155 | if _inv_mel_basis is None: 156 | _inv_mel_basis = np.linalg.pinv(_build_mel_basis(hparams)) 157 | return np.maximum(1e-10, np.dot(_inv_mel_basis, mel_spectrogram)) 158 | 159 | def _build_mel_basis(hparams): 160 | assert hparams.fmax <= hparams.sample_rate // 2 161 | return librosa.filters.mel(hparams.sample_rate, hparams.n_fft, n_mels=hparams.num_mels, 162 | fmin=hparams.fmin, fmax=hparams.fmax) 163 | 164 | def _amp_to_db(x, hparams): 165 | min_level = np.exp(hparams.min_level_db / 20 * np.log(10)) 166 | return 20 * np.log10(np.maximum(min_level, x)) 167 | 168 | def _db_to_amp(x): 169 | return np.power(10.0, (x) * 0.05) 170 | 171 | def _normalize(S, hparams): 172 | if hparams.allow_clipping_in_normalization: 173 | if hparams.symmetric_mels: 174 | return np.clip((2 * hparams.max_abs_value) * ((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value, 175 | -hparams.max_abs_value, hparams.max_abs_value) 176 | else: 177 | return np.clip(hparams.max_abs_value * ((S - hparams.min_level_db) / (-hparams.min_level_db)), 0, hparams.max_abs_value) 178 | 179 | assert S.max() <= 0 and S.min() - hparams.min_level_db >= 0 180 | if hparams.symmetric_mels: 181 | return (2 * hparams.max_abs_value) * ((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value 182 | else: 183 | return hparams.max_abs_value * ((S - hparams.min_level_db) / (-hparams.min_level_db)) 184 | 185 | def 
_denormalize(D, hparams): 186 | if hparams.allow_clipping_in_normalization: 187 | if hparams.symmetric_mels: 188 | return (((np.clip(D, -hparams.max_abs_value, 189 | hparams.max_abs_value) + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value)) 190 | + hparams.min_level_db) 191 | else: 192 | return ((np.clip(D, 0, hparams.max_abs_value) * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db) 193 | 194 | if hparams.symmetric_mels: 195 | return (((D + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value)) + hparams.min_level_db) 196 | else: 197 | return ((D * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Tacotron-2: 2 | Tensorflow implementation of DeepMind's Tacotron-2. A deep neural network architecture described in this paper: [Natural TTS synthesis by conditioning Wavenet on MEL spectogram predictions](https://arxiv.org/pdf/1712.05884.pdf) 3 | 4 | 5 | # Repository Structure: 6 | Tacotron-2 7 | ├── datasets 8 | ├── en_UK (0) 9 | │   └── by_book 10 | │   └── female 11 | ├── en_US (0) 12 | │   └── by_book 13 | │   ├── female 14 | │   └── male 15 | ├── LJSpeech-1.1 (0) 16 | │   └── wavs 17 | ├── logs-Tacotron (2) 18 | │   ├── eval_-dir 19 | │   │  ├── plots 20 | │  │  └── wavs 21 | │   ├── mel-spectrograms 22 | │   ├── plots 23 | │   ├── pretrained 24 | │   └── wavs 25 | ├── logs-Wavenet (4) 26 | │   ├── eval-dir 27 | │   │  ├── plots 28 | │  │  └── wavs 29 | │   ├── plots 30 | │   ├── pretrained 31 | │   └── wavs 32 | ├── papers 33 | ├── tacotron 34 | │   ├── models 35 | │   └── utils 36 | ├── tacotron_output (3) 37 | │   ├── eval 38 | │   ├── gta 39 | │   ├── logs-eval 40 | │   │   ├── plots 41 | │   │   └── wavs 42 | │   └── natural 43 | ├── wavenet_output (5) 44 | │   ├── plots 45 | │   └── wavs 46 | ├── training_data (1) 47 | │   ├── audio 48 | │   ├── linear 49 | │ └── mels 50 | └── wavenet_vocoder 51 | └── models 52 | 53 | 54 | The previous tree shows the current state of the repository (separate training, one step at a time). 55 | 56 | - Step **(0)**: Get your dataset, here I have set the examples of **Ljspeech**, **en_US** and **en_UK** (from **M-AILABS**). 57 | - Step **(1)**: Preprocess your data. This will give you the **training_data** folder. 58 | - Step **(2)**: Train your Tacotron model. Yields the **logs-Tacotron** folder. 59 | - Step **(3)**: Synthesize/Evaluate the Tacotron model. Gives the **tacotron_output** folder. 60 | - Step **(4)**: Train your Wavenet model. Yield the **logs-Wavenet** folder. 61 | - Step **(5)**: Synthesize audio using the Wavenet model. Gives the **wavenet_output** folder. 62 | 63 | 64 | Note: 65 | - **Our preprocessing only supports Ljspeech and Ljspeech-like datasets (M-AILABS speech data)!** If running on datasets stored differently, you will probably need to make your own preprocessing script. 66 | - In the previous tree, files **were not represented** and **max depth was set to 3** for simplicity. 67 | - If you run training of both **models at the same time**, repository structure will be different. 68 | 69 | # Model Architecture: 70 |

71 | *(Tacotron-2 model architecture diagram)* 72 | 

73 | 74 | The model described by the authors can be divided in two parts: 75 | - Spectrogram prediction network 76 | - Wavenet vocoder 77 | 78 | To have an in-depth exploration of the model architecture, training procedure and preprocessing logic, refer to [our wiki](https://github.com/Rayhane-mamah/Tacotron-2/wiki) 79 | 80 | # Current state: 81 | 82 | To have an overview of our advance on this project, please refer to [this discussion](https://github.com/Rayhane-mamah/Tacotron-2/issues/4) 83 | 84 | since the two parts of the global model are trained separately, we can start by training the feature prediction model to use his predictions later during the wavenet training. 85 | 86 | # How to start 87 | first, you need to have python 3 installed along with [Tensorflow](https://www.tensorflow.org/install/). 88 | 89 | next you can install the requirements. If you are an Anaconda user: (else replace **pip** with **pip3** and **python** with **python3**) 90 | 91 | > pip install -r requirements.txt 92 | 93 | # Dataset: 94 | We tested the code above on the [ljspeech dataset](https://keithito.com/LJ-Speech-Dataset/), which has almost 24 hours of labeled single actress voice recording. (further info on the dataset are available in the README file when you download it) 95 | 96 | We are also running current tests on the [new M-AILABS speech dataset](http://www.m-ailabs.bayern/en/the-mailabs-speech-dataset/) which contains more than 700h of speech (more than 80 Gb of data) for more than 10 languages. 97 | 98 | After **downloading** the dataset, **extract** the compressed file, and **place the folder inside the cloned repository.** 99 | 100 | # Preprocessing 101 | Before running the following steps, please make sure you are inside **Tacotron-2 folder** 102 | 103 | > cd Tacotron-2 104 | 105 | Preprocessing can then be started using: 106 | 107 | > python preprocess.py 108 | 109 | dataset can be chosen using the **--dataset** argument. If using M-AILABS dataset, you need to provide the **language, voice, reader, merge_books and book arguments** for your custom need. Default is **Ljspeech**. 110 | 111 | Example M-AILABS: 112 | 113 | > python preprocess.py --dataset='M-AILABS' --language='en_US' --voice='female' --reader='mary_ann' --merge_books=False --book='northandsouth' 114 | 115 | or if you want to use all books for a single speaker: 116 | 117 | > python preprocess.py --dataset='M-AILABS' --language='en_US' --voice='female' --reader='mary_ann' --merge_books=True 118 | 119 | This should take no longer than a **few minutes.** 120 | 121 | # Training: 122 | To **train both models** sequentially (one after the other): 123 | 124 | > python train.py --model='Tacotron-2' 125 | 126 | or: 127 | 128 | > python train.py --model='Both' 129 | 130 | Feature prediction model can **separately** be **trained** using: 131 | 132 | > python train.py --model='Tacotron' 133 | 134 | checkpoints will be made each **250 steps** and stored under **logs-Tacotron folder.** 135 | 136 | Naturally, **training the wavenet separately** is done by: 137 | 138 | > python train.py --model='WaveNet' 139 | 140 | logs will be stored inside **logs-Wavenet**. 141 | 142 | **Note:** 143 | - If model argument is not provided, training will default to Tacotron-2 model training. (both models) 144 | - Please refer to train arguments under [train.py](https://github.com/Rayhane-mamah/Tacotron-2/blob/master/train.py) for a set of options you can use. 
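In addition to the arguments above, individual hyperparameters can be overridden from the command line (without editing **hparams.py**) through the **--hparams** flag, which [train.py](https://github.com/Rayhane-mamah/Tacotron-2/blob/master/train.py) passes to `hparams.parse` as a comma-separated list of name=value pairs. As a sketch (the names must match entries defined in **hparams.py**; `outputs_per_step` and `sample_rate` are two that the training code reads):

> python train.py --model='Tacotron' --hparams='outputs_per_step=2,sample_rate=22050'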
145 | 146 | # Synthesis 147 | To **synthesize audio** in an **End-to-End** (text to audio) manner (both models at work): 148 | 149 | > python synthesize.py --model='Tacotron-2' 150 | 151 | For the spectrogram prediction network (separately), there are **three types** of mel spectrograms synthesis: 152 | 153 | - **Evaluation** (synthesis on custom sentences). This is what we'll usually use after having a full end to end model. 154 | 155 | > python synthesize.py --model='Tacotron' --mode='eval' 156 | 157 | - **Natural synthesis** (let the model make predictions alone by feeding last decoder output to the next time step). 158 | 159 | > python synthesize.py --model='Tacotron' --GTA=False 160 | 161 | 162 | - **Ground Truth Aligned synthesis** (DEFAULT: the model is assisted by true labels in a teacher forcing manner). This synthesis method is used when predicting mel spectrograms used to train the wavenet vocoder. (yields better results as stated in the paper) 163 | 164 | > python synthesize.py --model='Tacotron' --GTA=True 165 | 166 | Synthesizing the **waveforms** conditionned on previously synthesized Mel-spectrograms (separately) can be done with: 167 | 168 | > python synthesize.py --model='WaveNet' 169 | 170 | **Note:** 171 | - If model argument is not provided, synthesis will default to Tacotron-2 model synthesis. (End-to-End TTS) 172 | - Please refer to synthesis arguments under [synthesize.py](https://github.com/Rayhane-mamah/Tacotron-2/blob/master/synthesize.py) for a set of options you can use. 173 | 174 | # Pretrained model and Samples: 175 | Pre-trained models and audio samples will be added at a later date. You can however check some primary insights of the model performance (at early stages of training) [here](https://github.com/Rayhane-mamah/Tacotron-2/issues/4#issuecomment-378741465). 
176 | 177 | 178 | # References and Resources: 179 | - [Natural TTS synthesis by conditioning Wavenet on MEL spectogram predictions](https://arxiv.org/pdf/1712.05884.pdf) 180 | - [Original tacotron paper](https://arxiv.org/pdf/1703.10135.pdf) 181 | - [Attention-Based Models for Speech Recognition](https://arxiv.org/pdf/1506.07503.pdf) 182 | - [Wavenet: A generative model for raw audio](https://arxiv.org/pdf/1609.03499.pdf) 183 | - [Fast Wavenet](https://arxiv.org/pdf/1611.09482.pdf) 184 | - [r9y9/wavenet_vocoder](https://github.com/r9y9/wavenet_vocoder) 185 | - [keithito/tacotron](https://github.com/keithito/tacotron) 186 | 187 | -------------------------------------------------------------------------------- /tacotron/models/Architecture_wrappers.py: -------------------------------------------------------------------------------- 1 | """A set of wrappers usefull for tacotron 2 architecture 2 | All notations and variable names were used in concordance with originial tensorflow implementation 3 | """ 4 | import collections 5 | import numpy as np 6 | import tensorflow as tf 7 | from tensorflow.contrib.rnn import RNNCell 8 | from tensorflow.python.framework import ops 9 | from tensorflow.python.ops import rnn_cell_impl 10 | from tensorflow.python.ops import check_ops 11 | from tensorflow.python.util import nest 12 | from tensorflow.python.ops import array_ops 13 | from tensorflow.python.ops import tensor_array_ops 14 | from tensorflow.python.framework import tensor_shape 15 | from tacotron.models.attention import _compute_attention 16 | 17 | _zero_state_tensors = rnn_cell_impl._zero_state_tensors 18 | 19 | 20 | 21 | class TacotronEncoderCell(RNNCell): 22 | """Tacotron 2 Encoder Cell 23 | Passes inputs through a stack of convolutional layers then through a bidirectional LSTM 24 | layer to predict the hidden representation vector (or memory) 25 | """ 26 | 27 | def __init__(self, convolutional_layers, lstm_layer): 28 | """Initialize encoder parameters 29 | 30 | Args: 31 | convolutional_layers: Encoder convolutional block class 32 | lstm_layer: encoder bidirectional lstm layer class 33 | """ 34 | super(TacotronEncoderCell, self).__init__() 35 | #Initialize encoder layers 36 | self._convolutions = convolutional_layers 37 | self._cell = lstm_layer 38 | 39 | def __call__(self, inputs, input_lengths=None): 40 | #Pass input sequence through a stack of convolutional layers 41 | conv_output = self._convolutions(inputs) 42 | 43 | #Extract hidden representation from encoder lstm cells 44 | hidden_representation = self._cell(conv_output, input_lengths) 45 | 46 | #For shape visualization 47 | self.conv_output_shape = conv_output.shape 48 | return hidden_representation 49 | 50 | 51 | class TacotronDecoderCellState( 52 | collections.namedtuple("TacotronDecoderCellState", 53 | ("cell_state", "attention", "time", "alignments", 54 | "alignment_history"))): 55 | """`namedtuple` storing the state of a `TacotronDecoderCell`. 56 | Contains: 57 | - `cell_state`: The state of the wrapped `RNNCell` at the previous time 58 | step. 59 | - `attention`: The attention emitted at the previous time step. 60 | - `time`: int32 scalar containing the current time step. 61 | - `alignments`: A single or tuple of `Tensor`(s) containing the alignments 62 | emitted at the previous time step for each attention mechanism. 63 | - `alignment_history`: a single or tuple of `TensorArray`(s) 64 | containing alignment matrices from all time steps for each attention 65 | mechanism. Call `stack()` on each to convert to a `Tensor`. 
66 | """ 67 | def replace(self, **kwargs): 68 | """Clones the current state while overwriting components provided by kwargs. 69 | """ 70 | return super(TacotronDecoderCellState, self)._replace(**kwargs) 71 | 72 | class TacotronDecoderCell(RNNCell): 73 | """Tactron 2 Decoder Cell 74 | Decodes encoder output and previous mel frames into next r frames 75 | 76 | Decoder Step i: 77 | 1) Prenet to compress last output information 78 | 2) Concat compressed inputs with previous context vector (input feeding) * 79 | 3) Decoder RNN (actual decoding) to predict current state s_{i} * 80 | 4) Compute new context vector c_{i} based on s_{i} and a cumulative sum of previous alignments * 81 | 5) Predict new output y_{i} using s_{i} and c_{i} (concatenated) 82 | 6) Predict output ys_{i} using s_{i} and c_{i} (concatenated) 83 | 84 | * : This is typically taking a vanilla LSTM, wrapping it using tensorflow's attention wrapper, 85 | and wrap that with the prenet before doing an input feeding, and with the prediction layer 86 | that uses RNN states to project on output space. Actions marked with (*) can be replaced with 87 | tensorflow's attention wrapper call if it was using cumulative alignments instead of previous alignments only. 88 | """ 89 | 90 | def __init__(self, prenet, attention_mechanism, rnn_cell, frame_projection, stop_projection): 91 | """Initialize decoder parameters 92 | 93 | Args: 94 | prenet: A tensorflow fully connected layer acting as the decoder pre-net 95 | attention_mechanism: A _BaseAttentionMechanism instance, usefull to 96 | learn encoder-decoder alignments 97 | rnn_cell: Instance of RNNCell, main body of the decoder 98 | frame_projection: tensorflow fully connected layer with r * num_mels output units 99 | stop_projection: tensorflow fully connected layer, expected to project to a scalar 100 | and through a sigmoid activation 101 | mask_finished: Boolean, Whether to mask decoder frames after the 102 | """ 103 | super(TacotronDecoderCell, self).__init__() 104 | #Initialize decoder layers 105 | self._prenet = prenet 106 | self._attention_mechanism = attention_mechanism 107 | self._cell = rnn_cell 108 | self._frame_projection = frame_projection 109 | self._stop_projection = stop_projection 110 | 111 | self._attention_layer_size = self._attention_mechanism.values.get_shape()[-1].value 112 | 113 | def _batch_size_checks(self, batch_size, error_message): 114 | return [check_ops.assert_equal(batch_size, 115 | self._attention_mechanism.batch_size, 116 | message=error_message)] 117 | 118 | @property 119 | def output_size(self): 120 | return self._frame_projection.shape 121 | 122 | @property 123 | def state_size(self): 124 | """The `state_size` property of `TacotronDecoderCell`. 125 | 126 | Returns: 127 | An `TacotronDecoderCell` tuple containing shapes used by this object. 128 | """ 129 | return TacotronDecoderCellState( 130 | cell_state=self._cell._cell.state_size, 131 | time=tensor_shape.TensorShape([]), 132 | attention=self._attention_layer_size, 133 | alignments=self._attention_mechanism.alignments_size, 134 | alignment_history=()) 135 | 136 | def zero_state(self, batch_size, dtype): 137 | """Return an initial (zero) state tuple for this `AttentionWrapper`. 138 | 139 | Args: 140 | batch_size: `0D` integer tensor: the batch size. 141 | dtype: The internal state data type. 142 | Returns: 143 | An `TacotronDecoderCellState` tuple containing zeroed out tensors and, 144 | possibly, empty `TensorArray` objects. 
145 | Raises: 146 | ValueError: (or, possibly at runtime, InvalidArgument), if 147 | `batch_size` does not match the output size of the encoder passed 148 | to the wrapper object at initialization time. 149 | """ 150 | with ops.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]): 151 | cell_state = self._cell._cell.zero_state(batch_size, dtype) 152 | error_message = ( 153 | "When calling zero_state of TacotronDecoderCell %s: " % self._base_name + 154 | "Non-matching batch sizes between the memory " 155 | "(encoder output) and the requested batch size.") 156 | with ops.control_dependencies( 157 | self._batch_size_checks(batch_size, error_message)): 158 | cell_state = nest.map_structure( 159 | lambda s: array_ops.identity(s, name="checked_cell_state"), 160 | cell_state) 161 | return TacotronDecoderCellState( 162 | cell_state=cell_state, 163 | time=array_ops.zeros([], dtype=tf.int32), 164 | attention=_zero_state_tensors(self._attention_layer_size, batch_size, 165 | dtype), 166 | alignments=self._attention_mechanism.initial_alignments(batch_size, dtype), 167 | alignment_history=tensor_array_ops.TensorArray(dtype=dtype, size=0, 168 | dynamic_size=True)) 169 | 170 | def __call__(self, inputs, state): 171 | #Information bottleneck (essential for learning attention) 172 | prenet_output = self._prenet(inputs) 173 | 174 | #Concat context vector and prenet output to form LSTM cells input (input feeding) 175 | LSTM_input = tf.concat([prenet_output, state.attention], axis=-1) 176 | 177 | #Unidirectional LSTM layers 178 | LSTM_output, next_cell_state = self._cell(LSTM_input, state.cell_state) 179 | 180 | 181 | #Compute the attention (context) vector and alignments using 182 | #the new decoder cell hidden state as query vector 183 | #and cumulative alignments to extract location features 184 | #The choice of the new cell hidden state (s_{i}) of the last 185 | #decoder RNN Cell is based on Luong et Al. 
(2015): 186 | #https://arxiv.org/pdf/1508.04025.pdf 187 | previous_alignments = state.alignments 188 | previous_alignment_history = state.alignment_history 189 | context_vector, alignments, cumulated_alignments = _compute_attention(self._attention_mechanism, 190 | LSTM_output, 191 | previous_alignments, 192 | attention_layer=None) 193 | 194 | #Concat LSTM outputs and context vector to form projections inputs 195 | projections_input = tf.concat([LSTM_output, context_vector], axis=-1) 196 | 197 | #Compute predicted frames and predicted 198 | cell_outputs = self._frame_projection(projections_input) 199 | stop_tokens = self._stop_projection(projections_input) 200 | 201 | #Save alignment history 202 | alignment_history = previous_alignment_history.write(state.time, alignments) 203 | 204 | #Prepare next decoder state 205 | next_state = TacotronDecoderCellState( 206 | time=state.time + 1, 207 | cell_state=next_cell_state, 208 | attention=context_vector, 209 | alignments=cumulated_alignments, 210 | alignment_history=alignment_history) 211 | 212 | return (cell_outputs, stop_tokens), next_state 213 | -------------------------------------------------------------------------------- /wavenet_vocoder/train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | import os 4 | from datetime import datetime 5 | import time 6 | import librosa 7 | 8 | from wavenet_vocoder.models import create_model 9 | from wavenet_vocoder.feeder import Feeder 10 | from tacotron.utils import ValueWindow 11 | import numpy as np 12 | from scipy.io import wavfile 13 | import tensorflow as tf 14 | from . import util 15 | 16 | from hparams import hparams_debug_string 17 | import infolog 18 | 19 | log = infolog.log 20 | 21 | 22 | def add_train_stats(model): 23 | with tf.variable_scope('stats') as scope: 24 | tf.summary.histogram('wav_outputs', model.y_hat) 25 | tf.summary.histogram('wav_targets', model.y) 26 | tf.summary.scalar('loss', model.loss) 27 | return tf.summary.merge_all() 28 | 29 | def add_test_stats(summary_writer, step, eval_loss): 30 | values = [ 31 | tf.Summary.Value(tag='eval_model/eval_stats/eval_loss'), 32 | ] 33 | test_summary = tf.Summary(value=values) 34 | summary_writer.add_summary(test_summary, step) 35 | 36 | 37 | def create_shadow_saver(model, global_step=None): 38 | '''Load shadow variables of saved model. 39 | 40 | Inspired by: https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage 41 | 42 | Can also use: shadow_dict = model.ema.variables_to_restore() 43 | ''' 44 | #Add global step to saved variables to save checkpoints correctly 45 | shadow_variables = [model.ema.average_name(v) for v in model.variables] 46 | variables = model.variables 47 | 48 | if global_step is not None: 49 | shadow_variables += ['global_step'] 50 | variables += [global_step] 51 | 52 | shadow_dict = dict(zip(shadow_variables, variables)) #dict(zip(keys, values)) -> {key1: value1, key2: value2, ...} 53 | return tf.train.Saver(shadow_dict, max_to_keep=5) 54 | 55 | def load_averaged_model(sess, sh_saver, checkpoint_path): 56 | sh_saver.restore(sess, checkpoint_path) 57 | 58 | 59 | def eval_step(sess, global_step, model, plot_dir, audio_dir, summary_writer, hparams): 60 | '''Evaluate model during training. 61 | Supposes that model variables are averaged. 
62 | ''' 63 | start_time = time.time() 64 | y_hat, y_target, loss = sess.run([model.y_hat, model.y_target, model.eval_loss]) 65 | duration = time.time() - start_time 66 | log('Time Evaluation: Generation of {} audio frames took {:.3f} sec ({:.3f} frames/sec)'.format( 67 | len(y_target), duration, len(y_target)/duration)) 68 | 69 | pred_wav_path = os.path.join(audio_dir, 'step-{}-pred.wav'.format(global_step)) 70 | target_wav_path = os.path.join(audio_dir, 'step-{}-real.wav'.format(global_step)) 71 | plot_path = os.path.join(plot_dir, 'step-{}-waveplot.png'.format(global_step)) 72 | 73 | #Save Audio 74 | wavfile.write(pred_wav_path, hparams.sample_rate, y_hat) 75 | wavfile.write(target_wav_path, hparams.sample_rate, y_target) 76 | 77 | #Save figure 78 | util.waveplot(plot_path, y_hat, y_target, model._hparams) 79 | log('Eval loss for global step {}: {:.3f}'.format(global_step, loss)) 80 | 81 | log('Writing eval summary!') 82 | add_test_stats(summary_writer, global_step, loss) 83 | 84 | def save_log(sess, global_step, model, plot_dir, audio_dir, hparams): 85 | log('\nSaving intermediate states at step {}'.format(global_step)) 86 | idx = 0 87 | y_hat, y, length = sess.run([model.y_hat_log[idx], model.y_log[idx], model.input_lengths[idx]]) 88 | 89 | #mask by length 90 | y_hat[length:] = 0 91 | y[length:] = 0 92 | 93 | #Make audio and plot paths 94 | pred_wav_path = os.path.join(audio_dir, 'step-{}-pred.wav'.format(global_step)) 95 | target_wav_path = os.path.join(audio_dir, 'step-{}-real.wav'.format(global_step)) 96 | plot_path = os.path.join(plot_dir, 'step-{}-waveplot.png'.format(global_step)) 97 | 98 | #Save audio 99 | librosa.output.write_wav(pred_wav_path, y_hat, sr=hparams.sample_rate) 100 | librosa.output.write_wav(target_wav_path, y, sr=hparams.sample_rate) 101 | 102 | #Save figure 103 | util.waveplot(plot_path, y_hat, y, hparams) 104 | 105 | def save_checkpoint(sess, saver, checkpoint_path, global_step): 106 | saver.save(sess, checkpoint_path, global_step=global_step) 107 | 108 | 109 | def model_train_mode(args, feeder, hparams, global_step): 110 | with tf.variable_scope('model', reuse=tf.AUTO_REUSE) as scope: 111 | model_name = None 112 | if args.model in ('Tacotron-2', 'Both'): 113 | model_name = 'WaveNet' 114 | model = create_model(model_name or args.model, hparams) 115 | #initialize model to train mode 116 | model.initialize(feeder.targets, feeder.local_condition_features, feeder.global_condition_features, 117 | feeder.input_lengths, x=feeder.inputs) 118 | model.add_loss() 119 | model.add_optimizer(global_step) 120 | stats = add_train_stats(model) 121 | return model, stats 122 | 123 | def model_test_mode(args, feeder, hparams, global_step): 124 | with tf.variable_scope('model', reuse=tf.AUTO_REUSE) as scope: 125 | model_name = None 126 | if args.model in ('Tacotron-2', 'Both'): 127 | model_name = 'WaveNet' 128 | model = create_model(model_name or args.model, hparams) 129 | #initialize model to test mode 130 | model.initialize(feeder.eval_targets, feeder.eval_local_condition_features, feeder.eval_global_condition_features, 131 | feeder.eval_input_lengths) 132 | model.add_loss() 133 | return model 134 | 135 | def train(log_dir, args, hparams, input_path): 136 | save_dir = os.path.join(log_dir, 'wave_pretrained/') 137 | eval_dir = os.path.join(log_dir, 'eval-dir') 138 | audio_dir = os.path.join(log_dir, 'wavs') 139 | plot_dir = os.path.join(log_dir, 'plots') 140 | wav_dir = os.path.join(log_dir, 'wavs') 141 | eval_audio_dir = os.path.join(eval_dir, 'wavs') 142 | eval_plot_dir = 
os.path.join(eval_dir, 'plots') 143 | checkpoint_path = os.path.join(save_dir, 'wavenet_model.ckpt') 144 | input_path = os.path.join(args.base_dir, input_path) 145 | os.makedirs(save_dir, exist_ok=True) 146 | os.makedirs(wav_dir, exist_ok=True) 147 | os.makedirs(audio_dir, exist_ok=True) 148 | os.makedirs(plot_dir, exist_ok=True) 149 | os.makedirs(eval_audio_dir, exist_ok=True) 150 | os.makedirs(eval_plot_dir, exist_ok=True) 151 | 152 | log('Checkpoint_path: {}'.format(checkpoint_path)) 153 | log('Loading training data from: {}'.format(input_path)) 154 | log('Using model: {}'.format(args.model)) 155 | log(hparams_debug_string()) 156 | 157 | #Start by setting a seed for repeatability 158 | tf.set_random_seed(hparams.wavenet_random_seed) 159 | 160 | #Set up data feeder 161 | coord = tf.train.Coordinator() 162 | with tf.variable_scope('datafeeder') as scope: 163 | feeder = Feeder(coord, input_path, args.base_dir, hparams) 164 | 165 | #Set up model 166 | global_step = tf.Variable(0, name='global_step', trainable=False) 167 | model, stats = model_train_mode(args, feeder, hparams, global_step) 168 | eval_model = model_test_mode(args, feeder, hparams, global_step) 169 | 170 | #book keeping 171 | step = 0 172 | time_window = ValueWindow(100) 173 | loss_window = ValueWindow(100) 174 | sh_saver = create_shadow_saver(model, global_step) 175 | 176 | log('Wavenet training set to a maximum of {} steps'.format(args.wavenet_train_steps)) 177 | 178 | #Memory allocation on the memory 179 | config = tf.ConfigProto() 180 | config.gpu_options.allow_growth = True 181 | 182 | #Train 183 | with tf.Session(config=config) as sess: 184 | try: 185 | summary_writer = tf.summary.FileWriter(log_dir, sess.graph) 186 | sess.run(tf.global_variables_initializer()) 187 | checkpoint_state=None 188 | #saved model restoring 189 | if args.restore: 190 | #Restore saved model if the user requested it, default = True 191 | try: 192 | checkpoint_state = tf.train.get_checkpoint_state(save_dir) 193 | except tf.errors.OutOfRangeError as e: 194 | log('Cannot restore checkpoint: {}'.format(e)) 195 | 196 | if (checkpoint_state and checkpoint_state.model_checkpoint_path): 197 | log('Loading checkpoint {}'.format(checkpoint_state.model_checkpoint_path)) 198 | load_averaged_model(sess, sh_saver, checkpoint_state.model_checkpoint_path) 199 | 200 | else: 201 | if not args.restore: 202 | log('Starting new training!') 203 | else: 204 | log('No model to load at {}'.format(save_dir)) 205 | 206 | #initializing feeder 207 | feeder.start_threads(sess) 208 | 209 | #Training loop 210 | while not coord.should_stop() and step < args.wavenet_train_steps: 211 | start_time = time.time() 212 | step, y_hat, loss, opt = sess.run([global_step, model.y_hat, model.loss, model.optimize]) 213 | time_window.append(time.time() - start_time) 214 | loss_window.append(loss) 215 | 216 | message = 'Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]'.format( 217 | step, time_window.average, loss, loss_window.average) 218 | log(message, end='\r') 219 | 220 | if loss > 100 or np.isnan(loss): 221 | log('Loss exploded to {:.5f} at step {}'.format(loss, step)) 222 | raise Exception('Loss exploded') 223 | 224 | if step % args.summary_interval == 0: 225 | log('\nWriting summary at step {}'.format(step)) 226 | summary_writer.add_summary(sess.run(stats), step) 227 | 228 | if step % args.checkpoint_interval == 0 or step == args.wavenet_train_steps: 229 | save_log(sess, step, model, plot_dir, audio_dir, hparams=hparams) 230 | save_checkpoint(sess, sh_saver, checkpoint_path, 
global_step) 231 | 232 | if step % args.eval_interval == 0: 233 | log('\nEvaluating at step {}'.format(step)) 234 | eval_step(sess, step, eval_model, eval_plot_dir, eval_audio_dir, summary_writer=summary_writer , hparams=model._hparams) 235 | 236 | log('Wavenet training complete after {} global steps'.format(args.wavenet_train_steps)) 237 | return save_dir 238 | 239 | except Exception as e: 240 | log('Exiting due to Exception: {}'.format(e)) 241 | 242 | 243 | def wavenet_train(args, log_dir, hparams, input_path): 244 | return train(log_dir, args, hparams, input_path) 245 | -------------------------------------------------------------------------------- /tacotron/models/attention.py: -------------------------------------------------------------------------------- 1 | """Attention file for location based attention (compatible with tensorflow attention wrapper)""" 2 | 3 | import tensorflow as tf 4 | from tensorflow.contrib.seq2seq.python.ops.attention_wrapper import BahdanauAttention 5 | from tensorflow.python.ops import nn_ops 6 | from tensorflow.python.layers import core as layers_core 7 | from tensorflow.python.ops import array_ops 8 | from tensorflow.python.ops import variable_scope 9 | from tensorflow.python.ops import math_ops 10 | 11 | 12 | #From https://github.com/tensorflow/tensorflow/blob/r1.7/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py 13 | def _compute_attention(attention_mechanism, cell_output, attention_state, 14 | attention_layer): 15 | """Computes the attention and alignments for a given attention_mechanism.""" 16 | alignments, next_attention_state = attention_mechanism( 17 | cell_output, state=attention_state) 18 | 19 | # Reshape from [batch_size, memory_time] to [batch_size, 1, memory_time] 20 | expanded_alignments = array_ops.expand_dims(alignments, 1) 21 | # Context is the inner product of alignments and values along the 22 | # memory time dimension. 23 | # alignments shape is 24 | # [batch_size, 1, memory_time] 25 | # attention_mechanism.values shape is 26 | # [batch_size, memory_time, memory_size] 27 | # the batched matmul is over memory_time, so the output shape is 28 | # [batch_size, 1, memory_size]. 29 | # we then squeeze out the singleton dim. 30 | context = math_ops.matmul(expanded_alignments, attention_mechanism.values) 31 | context = array_ops.squeeze(context, [1]) 32 | 33 | if attention_layer is not None: 34 | attention = attention_layer(array_ops.concat([cell_output, context], 1)) 35 | else: 36 | attention = context 37 | 38 | return attention, alignments, next_attention_state 39 | 40 | 41 | def _location_sensitive_score(W_query, W_fil, W_keys): 42 | """Impelements Bahdanau-style (cumulative) scoring function. 43 | This attention is described in: 44 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben- 45 | gio, “Attention-based models for speech recognition,” in Ad- 46 | vances in Neural Information Processing Systems, 2015, pp. 47 | 577–585. 48 | 49 | ############################################################################# 50 | hybrid attention (content-based + location-based) 51 | f = F * α_{i-1} 52 | energy = dot(v_a, tanh(W_keys(h_enc) + W_query(h_dec) + W_fil(f) + b_a)) 53 | ############################################################################# 54 | 55 | Args: 56 | W_query: Tensor, shape '[batch_size, 1, attention_dim]' to compare to location features. 
57 | W_fil: Tensor, previous alignments processed into location features, shape '[batch_size, max_time, attention_dim]' 58 | W_keys: Tensor, shape '[batch_size, max_time, attention_dim]', typically the encoder outputs. 59 | Returns: 60 | A '[batch_size, max_time]' attention score (energy) 61 | """ 62 | # Get the number of hidden units from the trailing dimension of keys 63 | dtype = W_query.dtype 64 | num_units = W_keys.shape[-1].value or array_ops.shape(W_keys)[-1] 65 | v_a = tf.get_variable( 66 | 'attention_variable', shape=[num_units], dtype=dtype, 67 | initializer=tf.contrib.layers.xavier_initializer()) 68 | b_a = tf.get_variable( 69 | 'attention_bias', shape=[num_units], dtype=dtype, 70 | initializer=tf.zeros_initializer()) 71 | 72 | return tf.reduce_sum(v_a * tf.tanh(W_keys + W_query + W_fil + b_a), [2]) 73 | 74 | def _smoothing_normalization(e): 75 | """Applies a smoothing normalization function instead of softmax 76 | Introduced in: 77 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben- 78 | gio, “Attention-based models for speech recognition,” in Ad- 79 | vances in Neural Information Processing Systems, 2015, pp. 80 | 577–585. 81 | 82 | ############################################################################ 83 | Smoothing normalization function 84 | a_{i, j} = sigmoid(e_{i, j}) / sum_j(sigmoid(e_{i, j})) 85 | ############################################################################ 86 | 87 | Args: 88 | e: matrix [batch_size, max_time(memory_time)]: expected to be energy (score) 89 | values of an attention mechanism 90 | Returns: 91 | matrix [batch_size, max_time]: [0, 1] normalized alignments with possible 92 | attendance to multiple memory time steps. 93 | """ 94 | return tf.nn.sigmoid(e) / tf.reduce_sum(tf.nn.sigmoid(e), axis=-1, keepdims=True) 95 | 96 | 97 | class LocationSensitiveAttention(BahdanauAttention): 98 | """Implements Bahdanau-style (cumulative) scoring function. 99 | Usually referred to as "hybrid" attention (content-based + location-based) 100 | Extends the additive attention described in: 101 | "D. Bahdanau, K. Cho, and Y. Bengio, “Neural machine transla- 102 | tion by jointly learning to align and translate,” in Proceedings 103 | of ICLR, 2015." 104 | to use previous alignments as additional location features. 105 | 106 | This attention is described in: 107 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben- 108 | gio, “Attention-based models for speech recognition,” in Ad- 109 | vances in Neural Information Processing Systems, 2015, pp. 110 | 577–585. 111 | """ 112 | 113 | def __init__(self, 114 | num_units, 115 | memory, 116 | hparams, 117 | mask_encoder=True, 118 | memory_sequence_length=None, 119 | smoothing=False, 120 | cumulate_weights=True, 121 | name='LocationSensitiveAttention'): 122 | """Construct the Attention mechanism. 123 | Args: 124 | num_units: The depth of the query mechanism. 125 | memory: The memory to query; usually the output of an RNN encoder. This 126 | tensor should be shaped `[batch_size, max_time, ...]`. 127 | mask_encoder (optional): Boolean, whether to mask encoder paddings. 128 | memory_sequence_length (optional): Sequence lengths for the batch entries 129 | in memory. If provided, the memory tensor rows are masked with zeros 130 | for values past the respective sequence lengths. Only relevant if mask_encoder = True. 131 | smoothing (optional): Boolean. Determines which normalization function to use. 132 | Default normalization function (probability_fn) is softmax.
If smoothing is 133 | enabled, we replace softmax with: 134 | a_{i, j} = sigmoid(e_{i, j}) / sum_j(sigmoid(e_{i, j})) 135 | Introduced in: 136 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben- 137 | gio, “Attention-based models for speech recognition,” in Ad- 138 | vances in Neural Information Processing Systems, 2015, pp. 139 | 577–585. 140 | This is mainly used if the model wants to attend to multiple inputs parts 141 | at the same decoding step. We probably won't be using it since multiple sound 142 | frames may depend from the same character, probably not the way around. 143 | Note: 144 | We still keep it implemented in case we want to test it. They used it in the 145 | paper in the context of speech recognition, where one phoneme may depend on 146 | multiple subsequent sound frames. 147 | name: Name to use when creating ops. 148 | """ 149 | #Create normalization function 150 | #Setting it to None defaults in using softmax 151 | normalization_function = _smoothing_normalization if (smoothing == True) else None 152 | memory_length = memory_sequence_length if (mask_encoder==True) else None 153 | super(LocationSensitiveAttention, self).__init__( 154 | num_units=num_units, 155 | memory=memory, 156 | memory_sequence_length=memory_length, 157 | probability_fn=normalization_function, 158 | name=name) 159 | self.location_convolution = tf.layers.Conv1D(filters=hparams.attention_filters, 160 | kernel_size=hparams.attention_kernel, padding='same', use_bias=True, 161 | bias_initializer=tf.zeros_initializer(), name='location_features_convolution') 162 | self.location_layer = tf.layers.Dense(units=num_units, use_bias=False, 163 | dtype=tf.float32, name='location_features_layer') 164 | self._cumulate = cumulate_weights 165 | 166 | def __call__(self, query, state): 167 | """Score the query based on the keys and values. 168 | Args: 169 | query: Tensor of dtype matching `self.values` and shape 170 | `[batch_size, query_depth]`. 171 | state (previous alignments): Tensor of dtype matching `self.values` and shape 172 | `[batch_size, alignments_size]` 173 | (`alignments_size` is memory's `max_time`). 174 | Returns: 175 | alignments: Tensor of dtype matching `self.values` and shape 176 | `[batch_size, alignments_size]` (`alignments_size` is memory's 177 | `max_time`). 
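Note: with cumulate_weights=True (the default), next_state is the running sum alignments + previous_alignments; this cumulative tensor is what the location convolution reads as location features on the following decoding step.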
178 | """ 179 | previous_alignments = state 180 | with variable_scope.variable_scope(None, "Location_Sensitive_Attention", [query]): 181 | 182 | # processed_query shape [batch_size, query_depth] -> [batch_size, attention_dim] 183 | processed_query = self.query_layer(query) if self.query_layer else query 184 | # -> [batch_size, 1, attention_dim] 185 | processed_query = tf.expand_dims(processed_query, 1) 186 | 187 | # processed_location_features shape [batch_size, max_time, attention dimension] 188 | # [batch_size, max_time] -> [batch_size, max_time, 1] 189 | expanded_alignments = tf.expand_dims(previous_alignments, axis=2) 190 | # location features [batch_size, max_time, filters] 191 | f = self.location_convolution(expanded_alignments) 192 | # Projected location features [batch_size, max_time, attention_dim] 193 | processed_location_features = self.location_layer(f) 194 | 195 | # energy shape [batch_size, max_time] 196 | energy = _location_sensitive_score(processed_query, processed_location_features, self.keys) 197 | 198 | # alignments shape = energy shape = [batch_size, max_time] 199 | alignments = self._probability_fn(energy, previous_alignments) 200 | # Cumulate alignments 201 | if self._cumulate: 202 | next_state = alignments + previous_alignments 203 | else: 204 | next_state = alignments 205 | 206 | return alignments, next_state 207 | -------------------------------------------------------------------------------- /tacotron/feeder.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import threading 4 | import time 5 | import traceback 6 | from tacotron.utils.text import text_to_sequence 7 | from infolog import log 8 | from sklearn.model_selection import train_test_split 9 | import tensorflow as tf 10 | 11 | 12 | _batches_per_group = 32 13 | 14 | class Feeder: 15 | """ 16 | Feeds batches of data into queue on a background thread. 
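Builds two FIFO queues fed from the same placeholders: a training queue filled from a shuffled train split and a single-element eval queue filled from a held-out test split, so the train and eval graphs can dequeue independently.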
17 | """ 18 | 19 | def __init__(self, coordinator, metadata_filename, hparams): 20 | super(Feeder, self).__init__() 21 | self._coord = coordinator 22 | self._hparams = hparams 23 | self._cleaner_names = [x.strip() for x in hparams.cleaners.split(',')] 24 | self._train_offset = 0 25 | self._test_offset = 0 26 | 27 | # Load metadata 28 | self._mel_dir = os.path.join(os.path.dirname(metadata_filename), 'mels') 29 | self._linear_dir = os.path.join(os.path.dirname(metadata_filename), 'linear') 30 | with open(metadata_filename, encoding='utf-8') as f: 31 | self._metadata = [line.strip().split('|') for line in f] 32 | frame_shift_ms = hparams.hop_size / hparams.sample_rate 33 | hours = sum([int(x[4]) for x in self._metadata]) * frame_shift_ms / (3600) 34 | log('Loaded metadata for {} examples ({:.2f} hours)'.format(len(self._metadata), hours)) 35 | 36 | #Train test split 37 | if hparams.tacotron_test_size is None: 38 | assert hparams.tacotron_test_batches is not None 39 | 40 | test_size = (hparams.tacotron_test_size if hparams.tacotron_test_size is not None 41 | else hparams.tacotron_test_batches * hparams.tacotron_batch_size) 42 | indices = np.arange(len(self._metadata)) 43 | train_indices, test_indices = train_test_split(indices, 44 | test_size=test_size, random_state=hparams.tacotron_data_random_state) 45 | 46 | #Make sure test_indices is a multiple of batch_size else round up 47 | len_test_indices = self._round_up(len(test_indices), hparams.tacotron_batch_size) 48 | extra_test = test_indices[len_test_indices:] 49 | test_indices = test_indices[:len_test_indices] 50 | train_indices = np.concatenate([train_indices, extra_test]) 51 | 52 | self._train_meta = list(np.array(self._metadata)[train_indices]) 53 | self._test_meta = list(np.array(self._metadata)[test_indices]) 54 | 55 | self.test_steps = len(self._test_meta) // hparams.tacotron_batch_size 56 | 57 | if hparams.tacotron_test_size is None: 58 | assert hparams.tacotron_test_batches == self.test_steps 59 | 60 | #pad input sequences with the 0 ( _ ) 61 | self._pad = 0 62 | #explicitely setting the padding to a value that doesn't originally exist in the spectogram 63 | #to avoid any possible conflicts, without affecting the output range of the model too much 64 | if hparams.symmetric_mels: 65 | self._target_pad = -(hparams.max_abs_value + .1) 66 | else: 67 | self._target_pad = -0.1 68 | #Mark finished sequences with 1s 69 | self._token_pad = 1. 70 | 71 | with tf.device('/cpu:0'): 72 | # Create placeholders for inputs and targets. Don't specify batch size because we want 73 | # to be able to feed different batch sizes at eval time. 
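# The placeholder order below matches the tuples produced by _prepare_batch:
# (inputs, input_lengths, mel_targets, token_targets, linear_targets, targets_lengths).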
74 | self._placeholders = [ 75 | tf.placeholder(tf.int32, shape=(None, None), name='inputs'), 76 | tf.placeholder(tf.int32, shape=(None, ), name='input_lengths'), 77 | tf.placeholder(tf.float32, shape=(None, None, hparams.num_mels), name='mel_targets'), 78 | tf.placeholder(tf.float32, shape=(None, None), name='token_targets'), 79 | tf.placeholder(tf.float32, shape=(None, None, hparams.num_freq), name='linear_targets'), 80 | tf.placeholder(tf.int32, shape=(None, ), name='targets_lengths'), 81 | ] 82 | 83 | # Create queue for buffering data 84 | queue = tf.FIFOQueue(8, [tf.int32, tf.int32, tf.float32, tf.float32, tf.float32, tf.int32], name='input_queue') 85 | self._enqueue_op = queue.enqueue(self._placeholders) 86 | self.inputs, self.input_lengths, self.mel_targets, self.token_targets, self.linear_targets, self.targets_lengths = queue.dequeue() 87 | 88 | self.inputs.set_shape(self._placeholders[0].shape) 89 | self.input_lengths.set_shape(self._placeholders[1].shape) 90 | self.mel_targets.set_shape(self._placeholders[2].shape) 91 | self.token_targets.set_shape(self._placeholders[3].shape) 92 | self.linear_targets.set_shape(self._placeholders[4].shape) 93 | self.targets_lengths.set_shape(self._placeholders[5].shape) 94 | 95 | # Create eval queue for buffering eval data 96 | eval_queue = tf.FIFOQueue(1, [tf.int32, tf.int32, tf.float32, tf.float32, tf.float32, tf.int32], name='eval_queue') 97 | self._eval_enqueue_op = eval_queue.enqueue(self._placeholders) 98 | self.eval_inputs, self.eval_input_lengths, self.eval_mel_targets, self.eval_token_targets, \ 99 | self.eval_linear_targets, self.eval_targets_lengths = eval_queue.dequeue() 100 | 101 | self.eval_inputs.set_shape(self._placeholders[0].shape) 102 | self.eval_input_lengths.set_shape(self._placeholders[1].shape) 103 | self.eval_mel_targets.set_shape(self._placeholders[2].shape) 104 | self.eval_token_targets.set_shape(self._placeholders[3].shape) 105 | self.eval_linear_targets.set_shape(self._placeholders[4].shape) 106 | self.eval_targets_lengths.set_shape(self._placeholders[5].shape) 107 | 108 | def start_threads(self, session): 109 | self._session = session 110 | thread = threading.Thread(name='background', target=self._enqueue_next_train_group) 111 | thread.daemon = True #Thread will close when parent quits 112 | thread.start() 113 | 114 | thread = threading.Thread(name='background', target=self._enqueue_next_test_group) 115 | thread.daemon = True #Thread will close when parent quits 116 | thread.start() 117 | 118 | def _get_test_groups(self): 119 | meta = self._test_meta[self._test_offset] 120 | self._test_offset += 1 121 | 122 | text = meta[5] 123 | 124 | input_data = np.asarray(text_to_sequence(text, self._cleaner_names), dtype=np.int32) 125 | mel_target = np.load(os.path.join(self._mel_dir, meta[1])) 126 | #Create parallel sequences containing zeros to represent a non finished sequence 127 | token_target = np.asarray([0.] 
* (len(mel_target) - 1)) 128 | linear_target = np.load(os.path.join(self._linear_dir, meta[2])) 129 | return (input_data, mel_target, token_target, linear_target, len(mel_target)) 130 | 131 | def make_test_batches(self): 132 | start = time.time() 133 | 134 | # Read a group of examples 135 | n = self._hparams.tacotron_batch_size 136 | r = self._hparams.outputs_per_step 137 | 138 | #Test on entire test set 139 | examples = [self._get_test_groups() for i in range(len(self._test_meta))] 140 | 141 | # Bucket examples based on similar output sequence length for efficiency 142 | examples.sort(key=lambda x: x[-1]) 143 | batches = [examples[i: i+n] for i in range(0, len(examples), n)] 144 | np.random.shuffle(batches) 145 | 146 | log('\nGenerated {} test batches of size {} in {:.3f} sec'.format(len(batches), n, time.time() - start)) 147 | return batches, r 148 | 149 | def _enqueue_next_train_group(self): 150 | while not self._coord.should_stop(): 151 | start = time.time() 152 | 153 | # Read a group of examples 154 | n = self._hparams.tacotron_batch_size 155 | r = self._hparams.outputs_per_step 156 | examples = [self._get_next_example() for i in range(n * _batches_per_group)] 157 | 158 | # Bucket examples based on similar output sequence length for efficiency 159 | examples.sort(key=lambda x: x[-1]) 160 | batches = [examples[i: i+n] for i in range(0, len(examples), n)] 161 | np.random.shuffle(batches) 162 | 163 | log('\nGenerated {} train batches of size {} in {:.3f} sec'.format(len(batches), n, time.time() - start)) 164 | for batch in batches: 165 | feed_dict = dict(zip(self._placeholders, self._prepare_batch(batch, r))) 166 | self._session.run(self._enqueue_op, feed_dict=feed_dict) 167 | 168 | def _enqueue_next_test_group(self): 169 | #Create test batches once and evaluate on them for all test steps 170 | test_batches, r = self.make_test_batches() 171 | while not self._coord.should_stop(): 172 | for batch in test_batches: 173 | feed_dict = dict(zip(self._placeholders, self._prepare_batch(batch, r))) 174 | self._session.run(self._eval_enqueue_op, feed_dict=feed_dict) 175 | 176 | def _get_next_example(self): 177 | """Gets a single example (input, mel_target, token_target, linear_target, mel_length) from_ disk 178 | """ 179 | if self._train_offset >= len(self._train_meta): 180 | self._train_offset = 0 181 | np.random.shuffle(self._train_meta) 182 | 183 | meta = self._train_meta[self._train_offset] 184 | self._train_offset += 1 185 | 186 | text = meta[5] 187 | 188 | input_data = np.asarray(text_to_sequence(text, self._cleaner_names), dtype=np.int32) 189 | mel_target = np.load(os.path.join(self._mel_dir, meta[1])) 190 | #Create parallel sequences containing zeros to represent a non finished sequence 191 | token_target = np.asarray([0.] 
* (len(mel_target) - 1)) 192 | linear_target = np.load(os.path.join(self._linear_dir, meta[2])) 193 | return (input_data, mel_target, token_target, linear_target, len(mel_target)) 194 | 195 | 196 | def _prepare_batch(self, batch, outputs_per_step): 197 | np.random.shuffle(batch) 198 | inputs = self._prepare_inputs([x[0] for x in batch]) 199 | input_lengths = np.asarray([len(x[0]) for x in batch], dtype=np.int32) 200 | mel_targets = self._prepare_targets([x[1] for x in batch], outputs_per_step) 201 | #Pad sequences with 1 to infer that the sequence is done 202 | token_targets = self._prepare_token_targets([x[2] for x in batch], outputs_per_step) 203 | linear_targets = self._prepare_targets([x[3] for x in batch], outputs_per_step) 204 | targets_lengths = np.asarray([x[-1] for x in batch], dtype=np.int32) #Used to mask loss 205 | return (inputs, input_lengths, mel_targets, token_targets, linear_targets, targets_lengths) 206 | 207 | def _prepare_inputs(self, inputs): 208 | max_len = max([len(x) for x in inputs]) 209 | return np.stack([self._pad_input(x, max_len) for x in inputs]) 210 | 211 | def _prepare_targets(self, targets, alignment): 212 | max_len = max([len(t) for t in targets]) 213 | return np.stack([self._pad_target(t, self._round_up(max_len, alignment)) for t in targets]) 214 | 215 | def _prepare_token_targets(self, targets, alignment): 216 | max_len = max([len(t) for t in targets]) + 1 217 | return np.stack([self._pad_token_target(t, self._round_up(max_len, alignment)) for t in targets]) 218 | 219 | def _pad_input(self, x, length): 220 | return np.pad(x, (0, length - x.shape[0]), mode='constant', constant_values=self._pad) 221 | 222 | def _pad_target(self, t, length): 223 | return np.pad(t, [(0, length - t.shape[0]), (0, 0)], mode='constant', constant_values=self._target_pad) 224 | 225 | def _pad_token_target(self, t, length): 226 | return np.pad(t, (0, length - t.shape[0]), mode='constant', constant_values=self._token_pad) 227 | 228 | def _round_up(self, x, multiple): 229 | remainder = x % multiple 230 | return x if remainder == 0 else x + multiple - remainder 231 | -------------------------------------------------------------------------------- /wavenet_vocoder/models/modules.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from wavenet_vocoder.util import sequence_mask 4 | from .mixture import discretized_mix_logistic_loss 5 | 6 | class Embedding: 7 | """Embedding class for global conditions. 8 | """ 9 | def __init__(self, num_embeddings, embedding_dim, std=0.1, name='gc_embedding'): 10 | #Create embedding table 11 | self.embedding_table = tf.get_variable(name, 12 | [num_embeddings, embedding_dim], dtype=tf.float32, 13 | initializer=tf.truncated_normal_initializer(mean=0., stddev=std)) 14 | 15 | def __call__(self, inputs): 16 | #Do the actual embedding 17 | return tf.nn.embedding_lookup(self.embedding_table, inputs) 18 | 19 | class ReluActivation: 20 | """Simple class to wrap relu activation function in classe for later call. 21 | """ 22 | def __init__(self, name=None): 23 | self.name = name 24 | 25 | def __call__(self, inputs): 26 | return tf.nn.relu(inputs, name=self.name) 27 | 28 | 29 | class Conv1d1x1(tf.layers.Conv1D): 30 | """Extend tf.layers.Conv1D for dilated layers convolutions. 
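Dilation is implemented by padding and folding rather than by a dilated conv op: _to_dilation left-pads the time axis, then reshapes a [batch_size, channels, time] input into a [batch_size * dilation, (time + padding) / dilation, channels] tensor, so a plain stride-1 convolution over the folded tensor plays the role of a dilated convolution; _from_dilation undoes the folding and crops the extra padding.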
31 | """ 32 | def __init__(self, in_channels, filters, kernel_size=1, padding=None, dilation=1, use_bias=True, name='Conv1d1x1'): 33 | with tf.variable_scope(name) as scope: 34 | #Create variables 35 | kernel_shape = (kernel_size, in_channels, filters) 36 | self.kernel = tf.get_variable( 37 | name='kernel_{}'.format(name), 38 | shape=kernel_shape, 39 | dtype=tf.float32 40 | ) 41 | 42 | if use_bias: 43 | self.bias = tf.get_variable( 44 | name='bias_{}'.format(name), 45 | shape=(filters, ), 46 | initializer=tf.zeros_initializer(), 47 | dtype=tf.float32) 48 | 49 | self.filters = filters 50 | self.in_channels = in_channels 51 | self.dilation_rate = dilation 52 | self.convolution_queue = None 53 | self._linearized_weight = None 54 | self.paddings = None 55 | self.use_bias = use_bias 56 | self.paddings = padding 57 | self.scope = scope 58 | 59 | def set_mode(self, is_training): 60 | self.training = is_training 61 | 62 | def _to_dilation(self, inputs): 63 | '''Pad and reshape inputs by dilation rate. 64 | 65 | Used to perfrom 1D dilation convolution. 66 | ''' 67 | if self.paddings is not None: #dilated conv 68 | assert isinstance(self.paddings, int) 69 | inputs_padded = tf.pad(inputs, [[0, 0], [0, 0], [self.paddings, 0]], "CONSTANT") 70 | 71 | #inputs are channels first 72 | inputs_shape = tf.shape(inputs_padded) 73 | channels = inputs_shape[1] 74 | width_pad = inputs_shape[-1] 75 | 76 | dilation_shape = (width_pad // self.dilation_rate, -1, channels) #-1 refers to batch_size * dilation_rate 77 | #[width_pad, batch_size, channels] 78 | inputs_transposed = tf.transpose(inputs_padded, [2, 0, 1]) 79 | #[width_pad / dilation_rate, batch_size * dilation_rate, channels] 80 | inputs_reshaped = tf.reshape(inputs_transposed, dilation_shape) 81 | #[batch_size * dilation_rate, width_pad / dilation_rate, channels] 82 | outputs = tf.transpose(inputs_reshaped, [1, 0, 2]) 83 | 84 | else: #Simple channels first convolution 85 | outputs = tf.transpose(inputs, [0, 2, 1]) 86 | 87 | return outputs 88 | 89 | def _from_dilation(self, inputs, crop): 90 | '''Remove paddings and reshape to 1d signal. 91 | 92 | Used after 1D dilation convolution. 
93 | ''' 94 | if self.paddings is not None: #dilated conv 95 | assert isinstance(self.paddings, int) 96 | #inputs: [batch_size * dilation_rate, width_pad / dilation_rate, channels] 97 | inputs_shape = tf.shape(inputs) 98 | batch_size = inputs_shape[0] / self.dilation_rate 99 | width_pad = inputs_shape[1] * self.dilation_rate 100 | channels = inputs_shape[-1] 101 | new_shape = (width_pad, -1, channels) #-1 refers to batch_size 102 | 103 | #[width_pad / dilation_rate, batch_size * dilation_rate, channels] 104 | inputs_transposed = tf.transpose(inputs, [1, 0, 2]) 105 | #[width_pad, batch_size, channels] 106 | inputs_reshaped = tf.reshape(inputs_transposed, new_shape) 107 | #[batch_size, channels, width_pad] 108 | outputs = tf.transpose(inputs_reshaped, [1, 2, 0]) 109 | #[batch_size, channels, width] 110 | cropped = tf.slice(outputs, [0, 0, crop], [-1, -1, -1]) 111 | 112 | else: #Simple channels first convolution 113 | cropped = tf.transpose(inputs, [0, 2, 1]) 114 | 115 | return cropped 116 | 117 | 118 | def __call__(self, inputs): 119 | '''During this call, we change to channel last scheme for a better generalization and easier bias computation 120 | ''' 121 | with tf.variable_scope(self.scope): 122 | #Reshape to dilated conv mode (if this instance is of a dilated convolution) 123 | inputs_ = self._to_dilation(inputs) 124 | 125 | outputs_ = tf.nn.conv1d(inputs_, self.kernel, 126 | stride=1, padding='VALID', data_format='NWC') 127 | 128 | if self.use_bias: 129 | outputs_ = tf.nn.bias_add(outputs_, self.bias) 130 | 131 | #Reshape back ((if this instance is of a dilated convolution)) 132 | diff = tf.shape(outputs_)[1] * self.dilation_rate - tf.shape(inputs)[-1] 133 | outputs = self._from_dilation(outputs_, crop=diff) 134 | 135 | #Make sure that outputs have same time steps as inputs 136 | #[batch_size, channels(filters), width] 137 | with tf.control_dependencies([tf.assert_equal(tf.shape(outputs)[-1], tf.shape(inputs)[-1])]): 138 | outputs = tf.identity(outputs, name='output_equal_input_time_assert') 139 | 140 | return outputs 141 | 142 | def incremental_step(self, inputs): 143 | '''At sequential inference times: 144 | we adopt fast wavenet convolution queues by saving precomputed states for faster generation 145 | 146 | inputs: [batch_size, time_length, channels] ('NWC')! Channels last! 
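Maintains self.convolution_queue, a rolling buffer holding the current frame plus the previous (kernel_size - 1) * dilation frames; each call shifts the buffer, appends the newest frame, subsamples it by the dilation rate, and applies the linearized kernel as a single matmul, producing one [batch_size, 1, filters] output step.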
147 | ''' 148 | with tf.variable_scope(self.scope): 149 | #input: [batch_size, time_length, channels] 150 | if self.training: 151 | raise RuntimeError('incremental_step only supports eval mode') 152 | 153 | #reshape weight 154 | weight = self._get_linearized_weight(inputs) 155 | kw = self.kernel.shape[0] 156 | dilation = self.dilation_rate 157 | 158 | batch_size = tf.shape(inputs)[0] 159 | #Fast dilation 160 | #Similar to using tf FIFOQueue to schedule states of dilated convolutions 161 | if kw > 1: 162 | if self.convolution_queue is None: 163 | self.convolution_queue = tf.zeros((batch_size, (kw - 1) + (kw - 1) * (dilation - 1), tf.shape(inputs)[2])) 164 | else: 165 | #shift queue 166 | self.convolution_queue = self.convolution_queue[:, 1:, :] 167 | 168 | #append next input 169 | self.convolution_queue = tf.concat([self.convolution_queue, tf.expand_dims(inputs[:, -1, :], axis=1)], axis=1) 170 | #self.convolution_queue[:, -1, :] = inputs[:, -1, :] 171 | inputs = self.convolution_queue 172 | if dilation > 1: 173 | inputs = inputs[:, 0::dilation, :] 174 | 175 | #Compute step prediction 176 | output = tf.matmul(tf.reshape(inputs, [batch_size, -1]), weight) 177 | if self.use_bias: 178 | output = tf.nn.bias_add(output, self.bias) 179 | 180 | #[batch_size, 1(time_step), channels(filters)] 181 | return tf.reshape(output, [batch_size, 1, self.filters]) 182 | 183 | def _get_linearized_weight(self, inputs): 184 | if self._linearized_weight is None: 185 | kw = self.kernel.shape[0] 186 | #layers.Conv1D 187 | if tf.shape(self.kernel) == (self.filters, self.in_channels, kw): 188 | #[filters, in, kw] 189 | weight = tf.transpose(self.kernel, [2, 1, 0]) 190 | else: 191 | #[kw, in, filters] 192 | weight = self.kernel 193 | 194 | #[kw, in, filters] 195 | assert weight.shape == (kw, self.in_channels, self.filters) 196 | self._linearized_weight = tf.cast(tf.reshape(weight, [-1, self.filters]), dtype=inputs.dtype) 197 | return self._linearized_weight 198 | 199 | def clear_queue(self): 200 | self.convolution_queue = None 201 | 202 | def _conv1x1_forward(conv, x, is_incremental): 203 | """conv1x1 step 204 | """ 205 | if is_incremental: 206 | return conv.incremental_step(x) 207 | else: 208 | return conv(x) 209 | 210 | class ResidualConv1dGLU(): 211 | '''Residual dilated conv1d + Gated Linear Unit 212 | ''' 213 | 214 | def __init__(self, residual_channels, gate_channels, kernel_size, 215 | skip_out_channels=None, cin_channels=-1, gin_channels=-1, 216 | dropout=1 - .95, padding=None, dilation=1, causal=True, 217 | use_bias=True, name='ResidualConv1dGLU'): 218 | self.dropout = dropout 219 | 220 | if skip_out_channels is None: 221 | skip_out_channels = residual_channels 222 | 223 | if padding is None: 224 | #No future time stamps available 225 | if causal: 226 | padding = (kernel_size - 1) * dilation 227 | else: 228 | padding = (kernel_size - 1) // 2 * dilation 229 | 230 | self.causal = causal 231 | 232 | self.conv = Conv1d1x1(residual_channels, gate_channels, kernel_size, 233 | padding=padding, dilation=dilation, use_bias=use_bias, name='residual_block_conv') 234 | 235 | #Local conditioning 236 | if cin_channels > 0: 237 | self.conv1x1c = Conv1d1x1(cin_channels, gate_channels, 238 | use_bias=use_bias, name='residual_block_cin_conv') 239 | else: 240 | self.conv1x1c = None 241 | 242 | #Global conditioning 243 | if gin_channels > 0: 244 | self.conv1x1g = Conv1d1x1(gin_channels, gate_channels, 245 | use_bias=use_bias, name='residual_block_gin_conv') 246 | else: 247 | self.conv1x1g = None 248 | 249 | gate_out_channels = 
gate_channels // 2 250 | self.conv1x1_out = Conv1d1x1(gate_out_channels, residual_channels, use_bias=use_bias, name='residual_block_out_conv') 251 | self.conv1x1_skip = Conv1d1x1(gate_out_channels, skip_out_channels, use_bias=use_bias, name='residual_block_skip_conv') 252 | 253 | def set_mode(self, is_training): 254 | for conv in [self.conv, self.conv1x1c, self.conv1x1g, self.conv1x1_out, self.conv1x1_skip]: 255 | try: 256 | conv.set_mode(is_training) 257 | except AttributeError: 258 | pass 259 | 260 | def __call__(self, x, c=None, g=None): 261 | return self.step(x, c, g, False) 262 | 263 | def incremental_step(self, x, c=None, g=None): 264 | return self.step(x, c, g, True) 265 | 266 | def step(self, x, c, g, is_incremental): 267 | '''One step of the residual dilated conv + GLU block (normal or incremental mode). 268 | 269 | Args: 270 | x: Tensor [batch_size, channels, time_length] 271 | c: Tensor [batch_size, c_channels, time_length]. Local conditioning features 272 | g: Tensor [batch_size, g_channels, time_length], global conditioning features 273 | is_incremental: Boolean, whether incremental mode is on 274 | Returns: 275 | Tensor output 276 | ''' 277 | residual = x 278 | x = tf.layers.dropout(x, rate=self.dropout, training=not is_incremental) 279 | if is_incremental: 280 | splitdim = -1 281 | x = self.conv.incremental_step(x) 282 | else: 283 | splitdim = 1 284 | x = self.conv(x) 285 | #Remove future time steps 286 | x = x[:, :, :tf.shape(residual)[-1]] if self.causal else x 287 | 288 | a, b = tf.split(x, num_or_size_splits=2, axis=splitdim) 289 | 290 | #local conditioning 291 | if c is not None: 292 | assert self.conv1x1c is not None 293 | c = _conv1x1_forward(self.conv1x1c, c, is_incremental) 294 | ca, cb = tf.split(c, num_or_size_splits=2, axis=splitdim) 295 | a, b = a + ca, b + cb 296 | 297 | #global conditioning 298 | if g is not None: 299 | assert self.conv1x1g is not None 300 | g = _conv1x1_forward(self.conv1x1g, g, is_incremental) 301 | ga, gb = tf.split(g, num_or_size_splits=2, axis=splitdim) 302 | a, b = a + ga, b + gb 303 | 304 | x = tf.nn.tanh(a) * tf.nn.sigmoid(b) 305 | #For Skip connection 306 | s = _conv1x1_forward(self.conv1x1_skip, x, is_incremental) 307 | 308 | #For Residual connection 309 | x = _conv1x1_forward(self.conv1x1_out, x, is_incremental) 310 | 311 | x = (x + residual) * tf.sqrt(0.5) 312 | return x, s 313 | 314 | def clear_queue(self): 315 | for conv in [self.conv, self.conv1x1_out, self.conv1x1_skip, 316 | self.conv1x1c, self.conv1x1g]: 317 | if conv is not None: 318 | conv.clear_queue() 319 | 320 | 321 | class ConvTranspose2d: 322 | def __init__(self, filters, kernel_size, freq_axis_kernel_size, padding, strides): 323 | self.convt = tf.layers.Conv2DTranspose( 324 | filters=filters, 325 | kernel_size=kernel_size, 326 | strides=strides, 327 | padding=padding, 328 | kernel_initializer=tf.constant_initializer(1 / freq_axis_kernel_size, dtype=tf.float32), 329 | bias_initializer=tf.zeros_initializer(), 330 | data_format='channels_first') 331 | 332 | def __call__(self, inputs): 333 | return self.convt(inputs) 334 | 335 | 336 | 337 | def MaskedCrossEntropyLoss(outputs, targets, lengths=None, mask=None, max_len=None): 338 | if lengths is None and mask is None: 339 | raise RuntimeError('Please provide either lengths or mask') 340 | 341 | #[batch_size, time_length] 342 | if mask is None: 343 | mask = sequence_mask(lengths, max_len, False) 344 | 345 | #One hot encode targets (outputs.shape[-1] = hparams.quantize_channels) 346 | targets_ = tf.one_hot(targets, depth=tf.shape(outputs)[-1]) 347 | 348 | with
tf.control_dependencies([tf.assert_equal(tf.shape(outputs), tf.shape(targets_))]): 349 | losses = tf.nn.softmax_cross_entropy_with_logits_v2(logits=outputs, labels=targets_) 350 | 351 | with tf.control_dependencies([tf.assert_equal(tf.shape(mask), tf.shape(losses))]): 352 | masked_loss = losses * mask 353 | 354 | return tf.reduce_sum(masked_loss) / tf.count_nonzero(masked_loss, dtype=tf.float32) 355 | 356 | def DiscretizedMixtureLogisticLoss(outputs, targets, hparams, lengths=None, mask=None, max_len=None): 357 | if lengths is None and mask is None: 358 | raise RuntimeError('Please provide either lengths or mask') 359 | 360 | #[batch_size, time_length, 1] 361 | if mask is None: 362 | mask = sequence_mask(lengths, max_len, True) 363 | 364 | #[batch_size, time_length, dimension] 365 | ones = tf.ones([tf.shape(mask)[0], tf.shape(mask)[1], tf.shape(targets)[-1]], tf.float32) 366 | mask_ = mask * ones 367 | 368 | losses = discretized_mix_logistic_loss( 369 | outputs, targets, num_classes=hparams.quantize_channels, 370 | log_scale_min=hparams.log_scale_min, reduce=False) 371 | 372 | with tf.control_dependencies([tf.assert_equal(tf.shape(losses), tf.shape(targets))]): 373 | return tf.reduce_sum(losses * mask_) / tf.reduce_sum(mask_) -------------------------------------------------------------------------------- /tacotron/train.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from datetime import datetime 3 | import os 4 | import subprocess 5 | import time 6 | import tensorflow as tf 7 | import traceback 8 | import argparse 9 | 10 | from tacotron.feeder import Feeder 11 | from hparams import hparams_debug_string 12 | from tacotron.models import create_model 13 | from tacotron.utils.text import sequence_to_text 14 | from tacotron.utils import plot, ValueWindow 15 | import infolog 16 | from datasets import audio 17 | from tqdm import tqdm 18 | 19 | log = infolog.log 20 | 21 | 22 | def add_train_stats(model, hparams): 23 | with tf.variable_scope('stats') as scope: 24 | tf.summary.histogram('mel_outputs', model.mel_outputs) 25 | tf.summary.histogram('mel_targets', model.mel_targets) 26 | tf.summary.scalar('before_loss', model.before_loss) 27 | tf.summary.scalar('after_loss', model.after_loss) 28 | if hparams.predict_linear: 29 | tf.summary.scalar('linear_loss', model.linear_loss) 30 | tf.summary.scalar('regularization_loss', model.regularization_loss) 31 | tf.summary.scalar('stop_token_loss', model.stop_token_loss) 32 | tf.summary.scalar('loss', model.loss) 33 | tf.summary.scalar('learning_rate', model.learning_rate) #Control learning rate decay speed 34 | if hparams.tacotron_teacher_forcing_mode == 'scheduled': 35 | tf.summary.scalar('teacher_forcing_ratio', model.ratio) #Control teacher forcing ratio decay when mode = 'scheduled' 36 | gradient_norms = [tf.norm(grad) for grad in model.gradients] 37 | tf.summary.histogram('gradient_norm', gradient_norms) 38 | tf.summary.scalar('max_gradient_norm', tf.reduce_max(gradient_norms)) #visualize gradients (in case of explosion) 39 | return tf.summary.merge_all() 40 | 41 | def add_eval_stats(summary_writer, step, linear_loss, before_loss, after_loss, stop_token_loss, loss): 42 | values = [ 43 | tf.Summary.Value(tag='eval_model/eval_stats/eval_before_loss', simple_value=before_loss), 44 | tf.Summary.Value(tag='eval_model/eval_stats/eval_after_loss', simple_value=after_loss), 45 | tf.Summary.Value(tag='eval_model/eval_stats/stop_token_loss', simple_value=stop_token_loss), 46 | 
tf.Summary.Value(tag='eval_model/eval_stats/eval_loss', simple_value=loss), 47 | ] 48 | if linear_loss is not None: 49 | values.append(tf.Summary.Value(tag='model/eval_stats/eval_linear_loss', simple_value=linear_loss)) 50 | test_summary = tf.Summary(value=values) 51 | summary_writer.add_summary(test_summary, step) 52 | 53 | def time_string(): 54 | return datetime.now().strftime('%Y-%m-%d %H:%M') 55 | 56 | def model_train_mode(args, feeder, hparams, global_step): 57 | with tf.variable_scope('model', reuse=tf.AUTO_REUSE) as scope: 58 | model_name = None 59 | if args.model in ('Tacotron-2', 'Both'): 60 | model_name = 'Tacotron' 61 | model = create_model(model_name or args.model, hparams) 62 | if hparams.predict_linear: 63 | model.initialize(feeder.inputs, feeder.input_lengths, feeder.mel_targets, feeder.token_targets, linear_targets=feeder.linear_targets, 64 | targets_lengths=feeder.targets_lengths, global_step=global_step, 65 | is_training=True) 66 | else: 67 | model.initialize(feeder.inputs, feeder.input_lengths, feeder.mel_targets, feeder.token_targets, 68 | targets_lengths=feeder.targets_lengths, global_step=global_step, 69 | is_training=True) 70 | model.add_loss() 71 | model.add_optimizer(global_step) 72 | stats = add_train_stats(model, hparams) 73 | return model, stats 74 | 75 | def model_test_mode(args, feeder, hparams, global_step): 76 | with tf.variable_scope('model', reuse=tf.AUTO_REUSE) as scope: 77 | model_name = None 78 | if args.model in ('Tacotron-2', 'Both'): 79 | model_name = 'Tacotron' 80 | model = create_model(model_name or args.model, hparams) 81 | if hparams.predict_linear: 82 | model.initialize(feeder.eval_inputs, feeder.eval_input_lengths, feeder.eval_mel_targets, feeder.eval_token_targets, 83 | linear_targets=feeder.eval_linear_targets, targets_lengths=feeder.eval_targets_lengths, global_step=global_step, 84 | is_training=False, is_evaluating=True) 85 | else: 86 | model.initialize(feeder.eval_inputs, feeder.eval_input_lengths, feeder.eval_mel_targets, feeder.eval_token_targets, 87 | targets_lengths=feeder.eval_targets_lengths, global_step=global_step, is_training=False, is_evaluating=True) 88 | model.add_loss() 89 | return model 90 | 91 | def train(log_dir, args, hparams): 92 | save_dir = os.path.join(log_dir, 'taco_pretrained/') 93 | checkpoint_path = os.path.join(save_dir, 'tacotron_model.ckpt') 94 | input_path = os.path.join(args.base_dir, args.tacotron_input) 95 | plot_dir = os.path.join(log_dir, 'plots') 96 | wav_dir = os.path.join(log_dir, 'wavs') 97 | mel_dir = os.path.join(log_dir, 'mel-spectrograms') 98 | eval_dir = os.path.join(log_dir, 'eval-dir') 99 | eval_plot_dir = os.path.join(eval_dir, 'plots') 100 | eval_wav_dir = os.path.join(eval_dir, 'wavs') 101 | os.makedirs(eval_dir, exist_ok=True) 102 | os.makedirs(plot_dir, exist_ok=True) 103 | os.makedirs(wav_dir, exist_ok=True) 104 | os.makedirs(mel_dir, exist_ok=True) 105 | os.makedirs(eval_plot_dir, exist_ok=True) 106 | os.makedirs(eval_wav_dir, exist_ok=True) 107 | 108 | if hparams.predict_linear: 109 | linear_dir = os.path.join(log_dir, 'linear-spectrograms') 110 | os.makedirs(linear_dir, exist_ok=True) 111 | 112 | log('Checkpoint path: {}'.format(checkpoint_path)) 113 | log('Loading training data from: {}'.format(input_path)) 114 | log('Using model: {}'.format(args.model)) 115 | log(hparams_debug_string()) 116 | 117 | #Start by setting a seed for repeatability 118 | tf.set_random_seed(hparams.tacotron_random_seed) 119 | 120 | #Set up data feeder 121 | coord = tf.train.Coordinator() 122 | with 
tf.variable_scope('datafeeder') as scope: 123 | feeder = Feeder(coord, input_path, hparams) 124 | 125 | #Set up model: 126 | global_step = tf.Variable(0, name='global_step', trainable=False) 127 | model, stats = model_train_mode(args, feeder, hparams, global_step) 128 | eval_model = model_test_mode(args, feeder, hparams, global_step) 129 | 130 | #Book keeping 131 | step = 0 132 | time_window = ValueWindow(100) 133 | loss_window = ValueWindow(100) 134 | saver = tf.train.Saver(max_to_keep=5) 135 | 136 | log('Tacotron training set to a maximum of {} steps'.format(args.tacotron_train_steps)) 137 | 138 | #Memory allocation on the GPU as needed 139 | config = tf.ConfigProto() 140 | config.gpu_options.allow_growth = True 141 | 142 | #Train 143 | with tf.Session(config=config) as sess: 144 | try: 145 | summary_writer = tf.summary.FileWriter(log_dir, sess.graph) 146 | sess.run(tf.global_variables_initializer()) 147 | checkpoint_state=None 148 | #saved model restoring 149 | if args.restore: 150 | #Restore saved model if the user requested it, Default = True. 151 | try: 152 | checkpoint_state = tf.train.get_checkpoint_state(save_dir) 153 | except tf.errors.OutOfRangeError as e: 154 | log('Cannot restore checkpoint: {}'.format(e)) 155 | 156 | if (checkpoint_state and checkpoint_state.model_checkpoint_path): 157 | log('Loading checkpoint {}'.format(checkpoint_state.model_checkpoint_path)) 158 | saver.restore(sess, checkpoint_state.model_checkpoint_path) 159 | 160 | else: 161 | if not args.restore: 162 | log('Starting new training!') 163 | else: 164 | log('No model to load at {}'.format(save_dir)) 165 | 166 | #initializing feeder 167 | feeder.start_threads(sess) 168 | 169 | #Training loop 170 | while not coord.should_stop() and step < args.tacotron_train_steps: 171 | start_time = time.time() 172 | step, loss, opt = sess.run([global_step, model.loss, model.optimize]) 173 | time_window.append(time.time() - start_time) 174 | loss_window.append(loss) 175 | message = 'Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]'.format( 176 | step, time_window.average, loss, loss_window.average) 177 | log(message, end='\r') 178 | 179 | if np.isnan(loss): 180 | log('Loss exploded to {:.5f} at step {}'.format(loss, step)) 181 | raise Exception('Loss exploded') 182 | 183 | if step % args.summary_interval == 0: 184 | log('\nWriting summary at step {}'.format(step)) 185 | summary_writer.add_summary(sess.run(stats), step) 186 | 187 | if step % args.eval_interval == 0: 188 | #Run eval and save eval stats 189 | log('\nRunning evaluation at step {}'.format(step)) 190 | 191 | eval_losses = [] 192 | before_losses = [] 193 | after_losses = [] 194 | stop_token_losses = [] 195 | linear_losses = [] 196 | linear_loss = None 197 | 198 | if hparams.predict_linear: 199 | for i in tqdm(range(feeder.test_steps)): 200 | eloss, before_loss, after_loss, stop_token_loss, linear_loss, mel_p, mel_t, t_len, align, lin_p = sess.run( 201 | [eval_model.loss, eval_model.before_loss, eval_model.after_loss, 202 | eval_model.stop_token_loss, eval_model.linear_loss, eval_model.mel_outputs[0], 203 | eval_model.mel_targets[0], eval_model.targets_lengths[0], 204 | eval_model.alignments[0], eval_model.linear_outputs[0]]) 205 | print(i) 206 | eval_losses.append(eloss) 207 | before_losses.append(before_loss) 208 | after_losses.append(after_loss) 209 | stop_token_losses.append(stop_token_loss) 210 | linear_losses.append(linear_loss) 211 | linear_loss = sum(linear_losses) / len(linear_losses) 212 | 213 | wav = audio.inv_linear_spectrogram(lin_p.T, 
hparams) 214 | audio.save_wav(wav, os.path.join(eval_wav_dir, 'step-{}-eval-waveform-linear.wav'.format(step)), sr=hparams.sample_rate) 215 | else: 216 | for i in tqdm(range(feeder.test_steps)): 217 | eloss, before_loss, after_loss, stop_token_loss, mel_p, mel_t, t_len, align = sess.run( 218 | [eval_model.loss, eval_model.before_loss, eval_model.after_loss, 219 | eval_model.stop_token_loss, eval_model.mel_outputs[0], eval_model.mel_targets[0], 220 | eval_model.targets_lengths[0], eval_model.alignments[0]]) 221 | eval_losses.append(eloss) 222 | before_losses.append(before_loss) 223 | after_losses.append(after_loss) 224 | stop_token_losses.append(stop_token_loss) 225 | 226 | eval_loss = sum(eval_losses) / len(eval_losses) 227 | before_loss = sum(before_losses) / len(before_losses) 228 | after_loss = sum(after_losses) / len(after_losses) 229 | stop_token_loss = sum(stop_token_losses) / len(stop_token_losses) 230 | 231 | log('Saving eval log to {}..'.format(eval_dir)) 232 | #Save some log to monitor model improvement on same unseen sequence 233 | wav = audio.inv_mel_spectrogram(mel_p.T, hparams) 234 | audio.save_wav(wav, os.path.join(eval_wav_dir, 'step-{}-eval-waveform-mel.wav'.format(step)), sr=hparams.sample_rate) 235 | 236 | plot.plot_alignment(align, os.path.join(eval_plot_dir, 'step-{}-eval-align.png'.format(step)), 237 | info='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, eval_loss), 238 | max_len=t_len // hparams.outputs_per_step) 239 | plot.plot_spectrogram(mel_p, os.path.join(eval_plot_dir, 'step-{}-eval-mel-spectrogram.png'.format(step)), 240 | info='{}, {}, step={}, loss={:.5}'.format(args.model, time_string(), step, eval_loss), target_spectrogram=mel_t, 241 | max_len=t_len) 242 | 243 | log('Eval loss for global step {}: {:.3f}'.format(step, eval_loss)) 244 | log('Writing eval summary!') 245 | add_eval_stats(summary_writer, step, linear_loss, before_loss, after_loss, stop_token_loss, eval_loss) 246 | 247 | 248 | if step % args.checkpoint_interval == 0 or step == args.tacotron_train_steps: 249 | #Save model and current global step 250 | saver.save(sess, checkpoint_path, global_step=global_step) 251 | 252 | log('\nSaving alignment, Mel-Spectrograms and griffin-lim inverted waveform..') 253 | if hparams.predict_linear: 254 | input_seq, mel_prediction, linear_prediction, alignment, target, target_length = sess.run([ 255 | model.inputs[0], 256 | model.mel_outputs[0], 257 | model.linear_outputs[0], 258 | model.alignments[0], 259 | model.mel_targets[0], 260 | model.targets_lengths[0], 261 | ]) 262 | 263 | #save predicted linear spectrogram to disk (debug) 264 | linear_filename = 'linear-prediction-step-{}.npy'.format(step) 265 | np.save(os.path.join(linear_dir, linear_filename), linear_prediction.T, allow_pickle=False) 266 | 267 | #save griffin lim inverted wav for debug (linear -> wav) 268 | wav = audio.inv_linear_spectrogram(linear_prediction.T, hparams) 269 | audio.save_wav(wav, os.path.join(wav_dir, 'step-{}-wave-from-linear.wav'.format(step)), sr=hparams.sample_rate) 270 | 271 | else: 272 | input_seq, mel_prediction, alignment, target, target_length = sess.run([model.inputs[0], 273 | model.mel_outputs[0], 274 | model.alignments[0], 275 | model.mel_targets[0], 276 | model.targets_lengths[0], 277 | ]) 278 | 279 | #save predicted mel spectrogram to disk (debug) 280 | mel_filename = 'mel-prediction-step-{}.npy'.format(step) 281 | np.save(os.path.join(mel_dir, mel_filename), mel_prediction.T, allow_pickle=False) 282 | 283 | #save griffin lim inverted wav for debug 
(mel -> wav) 284 | wav = audio.inv_mel_spectrogram(mel_prediction.T, hparams) 285 | audio.save_wav(wav, os.path.join(wav_dir, 'step-{}-wave-from-mel.wav'.format(step)), sr=hparams.sample_rate) 286 | 287 | #save alignment plot to disk (control purposes) 288 | plot.plot_alignment(alignment, os.path.join(plot_dir, 'step-{}-align.png'.format(step)), 289 | info='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, loss), 290 | max_len=target_length // hparams.outputs_per_step) 291 | #save real and predicted mel-spectrogram plot to disk (control purposes) 292 | plot.plot_spectrogram(mel_prediction, os.path.join(plot_dir, 'step-{}-mel-spectrogram.png'.format(step)), 293 | info='{}, {}, step={}, loss={:.5}'.format(args.model, time_string(), step, loss), target_spectrogram=target, 294 | max_len=target_length) 295 | log('Input at step {}: {}'.format(step, sequence_to_text(input_seq))) 296 | 297 | log('Tacotron training complete after {} global steps!'.format(args.tacotron_train_steps)) 298 | return save_dir 299 | 300 | except Exception as e: 301 | log('Exiting due to exception: {}'.format(e)) 302 | traceback.print_exc() 303 | coord.request_stop(e) 304 | 305 | def tacotron_train(args, log_dir, hparams): 306 | return train(log_dir, args, hparams) 307 | -------------------------------------------------------------------------------- /hparams.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | 5 | # Default hyperparameters 6 | hparams = tf.contrib.training.HParams( 7 | # Comma-separated list of cleaners to run on text prior to training and eval. For non-English 8 | # text, you may want to use "basic_cleaners" or "transliteration_cleaners". 9 | cleaners='english_cleaners', 10 | 11 | #Hardware setup (TODO: multi-GPU parallel tacotron training) 12 | use_all_gpus = False, #Whether to use all GPU resources. If True, total number of available gpus will override num_gpus. 13 | num_gpus = 1, #Determines the number of gpus in use 14 | ########################################################################################################################################### 15 | 16 | #Audio 17 | num_mels = 80, #Number of mel-spectrogram channels and local conditioning dimensionality 18 | num_freq = 513, # (= n_fft / 2 + 1) only used when adding linear spectrograms post processing network 19 | rescale = True, #Whether to rescale audio prior to preprocessing 20 | rescaling_max = 0.999, #Rescaling value 21 | trim_silence = True, #Whether to clip silence in Audio (at beginning and end of audio only, not the middle) 22 | clip_mels_length = True, #For cases of OOM (Not really recommended, working on a workaround) 23 | max_mel_frames = 900, #Only relevant when clip_mels_length = True 24 | 25 | # Use LWS (https://github.com/Jonathan-LeRoux/lws) for STFT and phase reconstruction 26 | # It's preferred to set True to use with https://github.com/r9y9/wavenet_vocoder 27 | # Does not work if n_ffit is not multiple of hop_size!! 
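# With the defaults below (n_fft = 1024, hop_size = 256), 1024 = 4 * 256, so n_fft is a multiple of hop_size and this constraint is satisfied.
# A quick sanity check (sketch, assuming this hparams object): assert hparams.n_fft % hparams.hop_size == 0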
28 | use_lws=True, 29 | silence_threshold=2, #silence threshold used for sound trimming for wavenet preprocessing 30 | 31 | #Mel spectrogram 32 | n_fft = 1024, #Extra window size is filled with 0 paddings to match this parameter 33 | hop_size = 256, #For 22050Hz, 275 ~= 12.5 ms 34 | win_size = None, #For 22050Hz, 1100 ~= 50 ms (If None, win_size = n_fft) 35 | sample_rate = 22050, #22050 Hz (corresponding to ljspeech dataset) 36 | frame_shift_ms = None, 37 | 38 | #M-AILABS (and other datasets) trim params 39 | trim_fft_size = 512, 40 | trim_hop_size = 128, 41 | trim_top_db = 60, 42 | 43 | #Mel and Linear spectrograms normalization/scaling and clipping 44 | signal_normalization = True, 45 | allow_clipping_in_normalization = True, #Only relevant if mel_normalization = True 46 | symmetric_mels = True, #Whether to scale the data to be symmetric around 0 47 | max_abs_value = 4., #max absolute value of data. If symmetric, data will be [-max, max] else [0, max] 48 | 49 | #Global style token 50 | use_gst=True, # When false, the scripit will do as the paper "Towards End-to-End Prosody Transfer for Expressive Speech Synthesis with Tacotron" 51 | num_gst=10, 52 | num_heads=4, # Head number for multi-head attention 53 | style_embed_depth=256, 54 | reference_filters=[32, 32, 64, 64, 128, 128], 55 | reference_depth=128, 56 | style_att_type="mlp_attention", # Attention type for style attention module (dot_attention, mlp_attention) 57 | style_att_dim=128, 58 | 59 | #Limits 60 | min_level_db = -100, 61 | ref_level_db = 20, 62 | fmin = 25, #Set this to 75 if your speaker is male! if female, 125 should help taking off noise. (To test depending on dataset) 63 | fmax = 7600, 64 | 65 | #Griffin Lim 66 | power = 1.2, 67 | griffin_lim_iters = 60, 68 | ########################################################################################################################################### 69 | 70 | #Tacotron 71 | outputs_per_step = 2, #number of frames to generate at each decoding step (speeds up computation and allows for higher batch size) 72 | stop_at_any = True, #Determines whether the decoder should stop when predicting to any frame or to all of them 73 | 74 | embedding_dim = 512, #dimension of embedding space 75 | 76 | enc_conv_num_layers = 3, #number of encoder convolutional layers 77 | enc_conv_kernel_size = (5, ), #size of encoder convolution filters for each layer 78 | enc_conv_channels = 512, # number of encoder convolutions filters for each layer 79 | encoder_lstm_units = 256, #number of lstm units for each direction (forward and backward) 80 | 81 | smoothing = False, #Whether to smooth the attention normalization function 82 | attention_dim = 128, #dimension of attention space 83 | attention_filters = 32, #number of attention convolution filters 84 | attention_kernel = (31, ), #kernel size of attention convolution 85 | cumulative_weights = True, #Whether to cumulate (sum) all previous attention weights or simply feed previous weights (Recommended: True) 86 | 87 | prenet_layers = [256, 256], #number of layers and number of units of prenet 88 | decoder_layers = 2, #number of decoder lstm layers 89 | decoder_lstm_units = 1024, #number of decoder lstm units on each layer 90 | max_iters = 2500, #Max decoder steps during inference (Just for safety from infinite loop cases) 91 | 92 | postnet_num_layers = 5, #number of postnet convolutional layers 93 | postnet_kernel_size = (5, ), #size of postnet convolution filters for each layer 94 | postnet_channels = 512, #number of postnet convolution filters for each 
layer 95 | 96 | mask_encoder = True, #whether to mask encoder padding while computing attention 97 | mask_decoder = True, #Whether to use loss mask for padded sequences (if False, loss function will not be weighted, else recommended pos_weight = 20) 98 | 99 | cross_entropy_pos_weight = 20, #Use class weights to reduce the stop token classes imbalance (by adding more penalty on False Negatives (FN)) (1 = disabled) 100 | predict_linear = False, #Whether to add a post-processing network to the Tacotron to predict linear spectrograms (True mode Not tested!!) 101 | ########################################################################################################################################### 102 | 103 | 104 | #Wavenet 105 | # Input type: 106 | # 1. raw [-1, 1] 107 | # 2. mulaw [-1, 1] 108 | # 3. mulaw-quantize [0, mu] 109 | # If input_type is raw or mulaw, network assumes scalar input and 110 | # discretized mixture of logistic distributions output, otherwise one-hot 111 | # input and softmax output are assumed. 112 | input_type="mulaw", 113 | quantize_channels=256, # 65536 (16-bit) (raw) or 256 (8-bit) (mulaw or mulaw-quantize) // number of classes = 256 <=> mu = 255 114 | 115 | log_scale_min=float(np.log(1e-14)), #Mixture of logistic distributions minimal log scale 116 | 117 | out_channels = 10 * 3, #This should be equal to quantize channels when input type is 'mulaw-quantize' else: num_distributions * 3 (prob, mean, log_scale) 118 | layers = 24, #Number of dilated convolutions (Default: Simplified Wavenet of Tacotron-2 paper) 119 | stacks = 4, #Number of dilated convolution stacks (Default: Simplified Wavenet of Tacotron-2 paper) 120 | residual_channels = 512, 121 | gate_channels = 512, #split in 2 in gated convolutions 122 | skip_out_channels = 256, 123 | kernel_size = 3, 124 | 125 | cin_channels = 80, #Set this to -1 to disable local conditioning, else it must be equal to num_mels!! 126 | upsample_conditional_features = True, #Whether to repeat conditional features or upsample them (The latter is recommended) 127 | upsample_scales = [16, 16], #prod(scales) should be equal to hop size 128 | freq_axis_kernel_size = 3, 129 | 130 | gin_channels = -1, #Set this to -1 to disable global conditioning, Only used for multi speaker dataset 131 | use_bias = True, #Whether to use bias in convolutional layers of the Wavenet 132 | 133 | max_time_sec = None, 134 | max_time_steps = 13000, #Max time steps in audio used to train wavenet (decrease to save memory) 135 | ########################################################################################################################################### 136 | 137 | #Tacotron Training 138 | tacotron_random_seed = 5339, #Determines initial graph and operations (i.e: model) random state for reproducibility 139 | tacotron_swap_with_cpu = False, #Whether to use cpu as support to gpu for decoder computation (Not recommended: may cause major slowdowns! Only use when critical!) 
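	# (Sketch of the consistency constraints called out in the WaveNet section above, useful as a quick check:
	#  - mulaw-quantize input       -> out_channels == quantize_channels
	#  - raw / mulaw input          -> out_channels == num_distributions * 3   (10 * 3 with the defaults)
	#  - upsampled local features   -> np.prod(upsample_scales) == hop_size    (16 * 16 == 256)
	#  - local conditioning enabled -> cin_channels == num_mels                (80))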
140 | 141 | tacotron_batch_size = 48,#number of training samples on each training steps 142 | tacotron_reg_weight = 1e-6, #regularization weight (for L2 regularization) 143 | tacotron_scale_regularization = True, #Whether to rescale regularization weight to adapt for outputs range (used when reg_weight is high and biasing the model) 144 | 145 | tacotron_test_size = None, #% of data to keep as test data, if None, tacotron_test_batches must be not None 146 | tacotron_test_batches = 48, #number of test batches (For Ljspeech: 10% ~= 41 batches of 32 samples) 147 | tacotron_data_random_state=1234, #random state for train test split repeatability 148 | 149 | tacotron_decay_learning_rate = True, #boolean, determines if the learning rate will follow an exponential decay 150 | tacotron_start_decay = 50000, #Step at which learning decay starts 151 | tacotron_decay_steps = 40000, #Determines the learning rate decay slope (UNDER TEST) 152 | tacotron_decay_rate = 0.2, #learning rate decay rate (UNDER TEST) 153 | tacotron_initial_learning_rate = 1e-3, #starting learning rate 154 | tacotron_final_learning_rate = 1e-5, #minimal learning rate 155 | 156 | tacotron_adam_beta1 = 0.9, #AdamOptimizer beta1 parameter 157 | tacotron_adam_beta2 = 0.999, #AdamOptimizer beta2 parameter 158 | tacotron_adam_epsilon = 1e-6, #AdamOptimizer beta3 parameter 159 | 160 | tacotron_zoneout_rate = 0.1, #zoneout rate for all LSTM cells in the network 161 | tacotron_dropout_rate = 0.5, #dropout rate for all convolutional layers + prenet 162 | 163 | natural_eval = False, #Whether to use 100% natural eval (to evaluate Curriculum Learning performance) or with same teacher-forcing ratio as in training (just for overfit) 164 | 165 | #Decoder RNN learning can take be done in one of two ways: 166 | # Teacher Forcing: vanilla teacher forcing (usually with ratio = 1). mode='constant' 167 | # Curriculum Learning Scheme: From Teacher-Forcing to sampling from previous outputs is function of global step. (teacher forcing ratio decay) mode='scheduled' 168 | #The second approach is inspired by: 169 | #Bengio et al. 2015: Scheduled Sampling for Sequence Prediction with Recurrent Neural Networks. 170 | #Can be found under: https://arxiv.org/pdf/1506.03099.pdf 171 | tacotron_teacher_forcing_mode = 'constant', #Can be ('constant' or 'scheduled'). 'scheduled' mode applies a cosine teacher forcing ratio decay. (Preference: scheduled) 172 | tacotron_teacher_forcing_ratio = 1., #Value from [0., 1.], 0.=0%, 1.=100%, determines the % of times we force next decoder inputs, Only relevant if mode='constant' 173 | tacotron_teacher_forcing_init_ratio = 1., #initial teacher forcing ratio. Relevant if mode='scheduled' 174 | tacotron_teacher_forcing_final_ratio = 0., #final teacher forcing ratio. Relevant if mode='scheduled' 175 | tacotron_teacher_forcing_start_decay = 10000, #starting point of teacher forcing ratio decay. Relevant if mode='scheduled' 176 | tacotron_teacher_forcing_decay_steps = 280000, #Determines the teacher forcing ratio decay slope. Relevant if mode='scheduled' 177 | tacotron_teacher_forcing_decay_alpha = 0., #teacher forcing ratio decay rate. Relevant if mode='scheduled' 178 | ########################################################################################################################################### 179 | 180 | #Wavenet Training 181 | wavenet_random_seed = 5339, # S=5, E=3, D=9 :) 182 | wavenet_swap_with_cpu = False, #Whether to use cpu as support to gpu for decoder computation (Not recommended: may cause major slowdowns! 
Only use when critical!) 183 | 184 | wavenet_batch_size = 4, #batch size used to train wavenet. 185 | wavenet_test_size = 0.0441, #% of data to keep as test data, if None, wavenet_test_batches must be not None 186 | wavenet_test_batches = None, #number of test batches. 187 | wavenet_data_random_state = 1234, #random state for train test split repeatability 188 | 189 | wavenet_learning_rate = 1e-4, 190 | wavenet_adam_beta1 = 0.9, 191 | wavenet_adam_beta2 = 0.999, 192 | wavenet_adam_epsilon = 1e-6, 193 | 194 | wavenet_ema_decay = 0.9999, #decay rate of exponential moving average 195 | 196 | wavenet_dropout = 0.05, #drop rate of wavenet layers 197 | train_with_GTA = False, #Whether to use GTA mels to train WaveNet instead of ground truth mels. 198 | ########################################################################################################################################### 199 | 200 | #Eval sentences (if no eval file was specified, these sentences are used for eval) 201 | sentences = [ 202 | # From July 8, 2017 New York Times: 203 | 'Scientists at the CERN laboratory say they have discovered a new particle.', 204 | 'There\'s a way to measure the acute emotional intelligence that has never gone out of style.', 205 | 'President Trump met with other leaders at the Group of 20 conference.', 206 | 'The Senate\'s bill to repeal and replace the Affordable Care Act is now imperiled.', 207 | # From Google's Tacotron example page: 208 | 'Generative adversarial network or variational auto-encoder.', 209 | 'Basilar membrane and otolaryngology are not auto-correlations.', 210 | 'He has read the whole thing.', 211 | 'He reads books.', 212 | "Don't desert me here in the desert!", 213 | 'He thought it was time to present the present.', 214 | 'Thisss isrealy awhsome.', 215 | 'Punctuation sensitivity, is working.', 216 | 'Punctuation sensitivity is working.', 217 | "The buses aren't the problem, they actually provide a solution.", 218 | "The buses aren't the PROBLEM, they actually provide a SOLUTION.", 219 | "The quick brown fox jumps over the lazy dog.", 220 | "does the quick brown fox jump over the lazy dog?", 221 | "Peter Piper picked a peck of pickled peppers. How many pickled peppers did Peter Piper pick?", 222 | "She sells sea-shells on the sea-shore. The shells she sells are sea-shells I'm sure.", 223 | "The blue lagoon is a nineteen eighty American romance adventure film.", 224 | "Tajima Airport serves Toyooka.", 225 | 'Talib Kweli confirmed to AllHipHop that he will be releasing an album in the next year.', 226 | #From Training data: 227 | 'the rest being provided with barrack beds, and in dimensions varying from thirty feet by fifteen to fifteen feet by ten.', 228 | 'in giltspur street compter, where he was first lodged.', 229 | 'a man named burnett came with his wife and took up his residence at whitchurch, hampshire, at no great distance from laverstock,', 230 | 'it appears that oswald had only one caller in response to all of his fpcc activities,', 231 | 'he relied on the absence of the strychnia.', 232 | 'scoggins thought it was lighter.', 233 | '''would, it is probable, have eventually overcome the reluctance of some of the prisoners at least, 234 | and would have possessed so much moral dignity''', 235 | '''Sequence to sequence models have enjoyed great success in a variety of tasks such as machine translation, speech recognition, and text summarization. 
236 | This project covers a sequence to sequence model trained to predict a speech representation from an input sequence of characters. We show that 237 | the adopted architecture is able to perform this task with wild success.''', 238 | 'Thank you so much for your support!', 239 | ] 240 | 241 | ) 242 | 243 | def hparams_debug_string(): 244 | values = hparams.values() 245 | hp = [' %s: %s' % (name, values[name]) for name in sorted(values) if name != 'sentences'] 246 | return 'Hyperparameters:\n' + '\n'.join(hp) -------------------------------------------------------------------------------- /wavenet_vocoder/feeder.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from sklearn.model_selection import train_test_split 4 | import time 5 | import threading 6 | import os 7 | from .util import is_scalar_input, is_mulaw_quantize 8 | from infolog import log 9 | from datasets import audio 10 | from keras.utils import np_utils 11 | 12 | _batches_per_group = 32 13 | _pad = 0 14 | 15 | 16 | class Feeder: 17 | """ 18 | Feeds batches of data into queue in a background thread. 19 | """ 20 | def __init__(self, coordinator, metadata_filename, base_dir, hparams): 21 | super(Feeder, self).__init__() 22 | 23 | if hparams.gin_channels > 0: 24 | raise NotImplementedError('Global conditioning preprocessing has not been added yet, it will be out soon. Thanks for your patience!') 25 | 26 | self._coord = coordinator 27 | self._hparams = hparams 28 | self._train_offset = 0 29 | self._test_offset = 0 30 | 31 | #Base directory of the project (to map files from different locations) 32 | self._base_dir = base_dir 33 | 34 | #Load metadata 35 | self._data_dir = os.path.dirname(metadata_filename) 36 | with open(metadata_filename, 'r') as f: 37 | self._metadata = [line.strip().split('|') for line in f] 38 | 39 | #Train test split 40 | if hparams.wavenet_test_size is None: 41 | assert hparams.wavenet_test_batches is not None 42 | 43 | test_size = (hparams.wavenet_test_size if hparams.wavenet_test_size is not None 44 | else hparams.wavenet_test_batches * hparams.wavenet_batch_size) 45 | indices = np.arange(len(self._metadata)) 46 | train_indices, test_indices = train_test_split(indices, 47 | test_size=test_size, random_state=hparams.wavenet_data_random_state) 48 | 49 | #Make sure test size is a multiple of batch size else round up 50 | len_test_indices = _round_up(len(test_indices), hparams.wavenet_batch_size) 51 | extra_test = test_indices[len_test_indices:] 52 | test_indices = test_indices[:len_test_indices] 53 | train_indices = np.concatenate([train_indices, extra_test]) 54 | 55 | self._train_meta = list(np.array(self._metadata)[train_indices]) 56 | self._test_meta = list(np.array(self._metadata)[test_indices]) 57 | 58 | self.test_steps = len(self._test_meta) // hparams.wavenet_batch_size 59 | 60 | if hparams.wavenet_test_size is None: 61 | assert hparams.wavenet_test_batches == self.test_steps 62 | 63 | #Get conditioning status 64 | self.local_condition, self.global_condition = self._check_conditions() 65 | 66 | with tf.device('/cpu:0'): 67 | # Create placeholders for inputs and targets. Don't specify batch size because we want 68 | # to be able to feed different batch sizes at eval time. 
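			# Resulting shapes (channels first on the inputs), as produced by _prepare_batch below:
			#   scalar input (raw / mulaw):      inputs [B, 1, T] float32,                 targets [B, T, 1] float32
			#   one-hot input (mulaw-quantize):  inputs [B, quantize_channels, T] float32, targets [B, T, 1] int32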
69 | if is_scalar_input(hparams.input_type): 70 | input_placeholder = tf.placeholder(tf.float32, shape=(None, 1, None), name='audio_inputs') 71 | target_placeholder = tf.placeholder(tf.float32, shape=(None, None, 1), name='audio_targets') 72 | target_type = tf.float32 73 | else: 74 | input_placeholder = tf.placeholder(tf.float32, shape=(None, hparams.quantize_channels, None), name='audio_inputs') 75 | target_placeholder = tf.placeholder(tf.int32, shape=(None, None, 1), name='audio_targets') 76 | target_type = tf.int32 77 | 78 | self._placeholders = [ 79 | input_placeholder, 80 | target_placeholder, 81 | tf.placeholder(tf.int32, shape=(None, ), name='input_lengths'), 82 | ] 83 | 84 | queue_types = [tf.float32, target_type, tf.int32] 85 | 86 | if self.local_condition: 87 | self._placeholders.append(tf.placeholder(tf.float32, shape=(None, hparams.num_mels, None), name='local_condition_features')) 88 | queue_types.append(tf.float32) 89 | if self.global_condition: 90 | self._placeholders.append(tf.placeholder(tf.int32, shape=(), name='global_condition_features')) 91 | queue_types.append(tf.int32) 92 | 93 | # Create queue for buffering data 94 | queue = tf.FIFOQueue(8, queue_types, name='intput_queue') 95 | self._enqueue_op = queue.enqueue(self._placeholders) 96 | variables = queue.dequeue() 97 | 98 | self.inputs = variables[0] 99 | self.inputs.set_shape(self._placeholders[0].shape) 100 | self.targets = variables[1] 101 | self.targets.set_shape(self._placeholders[1].shape) 102 | self.input_lengths = variables[2] 103 | self.input_lengths.set_shape(self._placeholders[2].shape) 104 | 105 | #If local conditioning disabled override c inputs with None 106 | if hparams.cin_channels < 0: 107 | self.local_condition_features = None 108 | else: 109 | self.local_condition_features = variables[3] 110 | self.local_condition_features.set_shape(self._placeholders[3].shape) 111 | 112 | #If global conditioning disabled override g inputs with None 113 | if hparams.gin_channels < 0: 114 | self.global_condition_features = None 115 | else: 116 | self.global_condition_features = variables[4] 117 | self.global_condition_features.set_shape(self._placeholders[4].shape) 118 | 119 | 120 | # Create queue for buffering eval data 121 | eval_queue = tf.FIFOQueue(1, queue_types, name='eval_queue') 122 | self._eval_enqueue_op = eval_queue.enqueue(self._placeholders) 123 | eval_variables = eval_queue.dequeue() 124 | 125 | self.eval_inputs = eval_variables[0] 126 | self.eval_inputs.set_shape(self._placeholders[0].shape) 127 | self.eval_targets = eval_variables[1] 128 | self.eval_targets.set_shape(self._placeholders[1].shape) 129 | self.eval_input_lengths = eval_variables[2] 130 | self.eval_input_lengths.set_shape(self._placeholders[2].shape) 131 | 132 | #If local conditioning disabled override c inputs with None 133 | if hparams.cin_channels < 0: 134 | self.eval_local_condition_features = None 135 | else: 136 | self.eval_local_condition_features = eval_variables[3] 137 | self.eval_local_condition_features.set_shape(self._placeholders[3].shape) 138 | 139 | #If global conditioning disabled override g inputs with None 140 | if hparams.gin_channels < 0: 141 | self.eval_global_condition_features = None 142 | else: 143 | self.eval_global_condition_features = eval_variables[4] 144 | self.eval_global_condition_features.set_shape(self._placeholders[4].shape) 145 | 146 | 147 | 148 | def start_threads(self, session): 149 | self._session = session 150 | thread = threading.Thread(name='background', target=self._enqueue_next_train_group) 151 | 
thread.daemon = True #Thread will close when parent quits 152 | thread.start() 153 | 154 | thread = threading.Thread(name='background', target=self._enqueue_next_test_group) 155 | thread.daemon = True #Thread will close when parent quits 156 | thread.start() 157 | 158 | def _get_test_groups(self): 159 | meta = self._test_meta[self._test_offset] 160 | self._test_offset += 1 161 | 162 | if self._hparams.train_with_GTA: 163 | mel_file = meta[2] 164 | else: 165 | mel_file = meta[1] 166 | audio_file = meta[0] 167 | 168 | input_data = np.load(os.path.join(self._base_dir, audio_file)) 169 | 170 | if self.local_condition: 171 | local_condition_features = np.load(os.path.join(self._base_dir, mel_file)) 172 | else: 173 | local_condition_features = None 174 | 175 | global_condition_features = None 176 | 177 | return (input_data, local_condition_features, global_condition_features, len(input_data)) 178 | 179 | def make_test_batches(self): 180 | start = time.time() 181 | 182 | #Read one example for evaluation 183 | n = 1 184 | 185 | #Test on entire test set (one sample at an evaluation step) 186 | examples = [self._get_test_groups() for i in range(len(self._test_meta))] 187 | batches = [examples[i: i+n] for i in range(0, len(examples), n)] 188 | np.random.shuffle(batches) 189 | 190 | log('\nGenerated {} test batches of size {} in {:.3f} sec'.format(len(batches), n, time.time() - start)) 191 | return batches 192 | 193 | def _enqueue_next_train_group(self): 194 | while not self._coord.should_stop(): 195 | start = time.time() 196 | 197 | # Read a group of examples 198 | n = self._hparams.wavenet_batch_size 199 | examples = [self._get_next_example() for i in range(n * _batches_per_group)] 200 | 201 | # Bucket examples base on similiar output length for efficiency 202 | examples.sort(key=lambda x: x[-1]) 203 | batches = [examples[i: i+n] for i in range(0, len(examples), n)] 204 | np.random.shuffle(batches) 205 | 206 | log('\nGenerated {} train batches of size {} in {:.3f} sec'.format(len(batches), n, time.time() - start)) 207 | for batch in batches: 208 | feed_dict = dict(zip(self._placeholders, self._prepare_batch(batch))) 209 | self._session.run(self._enqueue_op, feed_dict=feed_dict) 210 | 211 | def _enqueue_next_test_group(self): 212 | test_batches = self.make_test_batches() 213 | while not self._coord.should_stop(): 214 | for batch in test_batches: 215 | feed_dict = dict(zip(self._placeholders, self._prepare_batch(batch))) 216 | self._session.run(self._eval_enqueue_op, feed_dict=feed_dict) 217 | 218 | def _get_next_example(self): 219 | '''Get a single example (input, output, len_output) from disk 220 | ''' 221 | if self._train_offset >= len(self._train_meta): 222 | self._train_offset = 0 223 | np.random.shuffle(self._train_meta) 224 | meta = self._train_meta[self._train_offset] 225 | self._train_offset += 1 226 | 227 | if self._hparams.train_with_GTA: 228 | mel_file = meta[2] 229 | if 'linear' in mel_file: 230 | raise RuntimeError('Linear spectrogram files selected instead of GTA mels, did you specify the wrong metadata?') 231 | else: 232 | mel_file = meta[1] 233 | audio_file = meta[0] 234 | 235 | input_data = np.load(os.path.join(self._base_dir, audio_file)) 236 | 237 | if self.local_condition: 238 | local_condition_features = np.load(os.path.join(self._base_dir, mel_file)) 239 | else: 240 | local_condition_features = None 241 | 242 | global_condition_features = None 243 | 244 | return (input_data, local_condition_features, global_condition_features, len(input_data)) 245 | 246 | 247 | def 
_prepare_batch(self, batch): 248 | np.random.shuffle(batch) 249 | 250 | #Limit time steps to save GPU Memory usage 251 | max_time_steps = self._limit_time() 252 | #Adjust time resolution for upsampling 253 | batch = self._adjust_time_resolution(batch, self.local_condition, max_time_steps) 254 | 255 | #time lengths 256 | input_lengths = [len(x[0]) for x in batch] 257 | max_input_length = max(input_lengths) 258 | 259 | inputs = self._prepare_inputs([x[0] for x in batch], max_input_length) 260 | targets = self._prepare_targets([x[0] for x in batch], max_input_length) 261 | local_condition_features = self._prepare_local_conditions(self.local_condition, [x[1] for x in batch]) 262 | global_condition_features = self._prepare_global_conditions(self.global_condition, [x[2] for x in batch]) 263 | 264 | new_batch = (inputs, targets, input_lengths) 265 | if local_condition_features is not None: 266 | new_batch += (local_condition_features, ) 267 | if global_condition_features is not None: 268 | new_batch += (global_condition_features, ) 269 | 270 | return new_batch 271 | 272 | def _prepare_inputs(self, inputs, maxlen): 273 | if is_mulaw_quantize(self._hparams.input_type): 274 | #[batch_size, time_steps, quantize_channels] 275 | x_batch = np.stack([_pad_inputs(np_utils.to_categorical( 276 | x, num_classes=self._hparams.quantize_channels), maxlen) for x in inputs]).astype(np.float32) 277 | else: 278 | #[batch_size, time_steps, 1] 279 | x_batch = np.stack([_pad_inputs(x.reshape(-1, 1), maxlen) for x in inputs]).astype(np.float32) 280 | assert len(x_batch.shape) == 3 281 | #Convert to channels first [batch_size, quantize_channels (or 1), time_steps] 282 | x_batch = np.transpose(x_batch, (0, 2, 1)) 283 | return x_batch 284 | 285 | def _prepare_targets(self, targets, maxlen): 286 | #[batch_size, time_steps] 287 | if is_mulaw_quantize(self._hparams.input_type): 288 | y_batch = np.stack([_pad_targets(x, maxlen) for x in targets]).astype(np.int32) 289 | else: 290 | y_batch = np.stack([_pad_targets(x, maxlen) for x in targets]).astype(np.float32) 291 | assert len(y_batch.shape) == 2 292 | #Add extra axis (make 3 dimension) 293 | y_batch = np.expand_dims(y_batch, axis=-1) 294 | return y_batch 295 | 296 | def _prepare_local_conditions(self, local_condition, c_features): 297 | if local_condition: 298 | maxlen = max([len(x) for x in c_features]) 299 | c_batch = np.stack([_pad_inputs(x, maxlen) for x in c_features]).astype(np.float32) 300 | assert len(c_batch.shape) == 3 301 | #[batch_size, c_channels, time_steps] 302 | c_batch = np.transpose(c_batch, (0, 2, 1)) 303 | else: 304 | c_batch = None 305 | return c_batch 306 | 307 | def _prepare_global_conditions(self, global_condition, g_features): 308 | if global_condition: 309 | g_batch = g_features 310 | else: 311 | g_batch = None 312 | return g_batch 313 | 314 | def _check_conditions(self): 315 | local_condition = self._hparams.cin_channels > 0 316 | global_condition = self._hparams.gin_channels > 0 317 | return local_condition, global_condition 318 | 319 | def _limit_time(self): 320 | '''Limit time resolution to save GPU memory. 
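		If max_time_sec is set it takes precedence and is converted to a sample count
		(max_time_sec * sample_rate); otherwise max_time_steps is returned as-is, and if both
		are None no limit is applied. With the default hparams this yields 13000 samples
		(roughly 0.6 seconds at 22050 Hz).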
321 | ''' 322 | if self._hparams.max_time_sec is not None: 323 | return int(self._hparams.max_time_sec * self._hparams.sample_rate) 324 | elif self._hparams.max_time_steps is not None: 325 | return self._hparams.max_time_steps 326 | else: 327 | return None 328 | 329 | def _adjust_time_resolution(self, batch, local_condition, max_time_steps): 330 | '''Adjust time resolution between audio and local condition 331 | ''' 332 | if local_condition: 333 | new_batch = [] 334 | for b in batch: 335 | x, c, g, l = b 336 | self._assert_ready_for_upsample(x, c) 337 | if max_time_steps is not None: 338 | max_steps = _ensure_divisible(max_time_steps, audio.get_hop_size(self._hparams), True) 339 | if len(x) > max_time_steps: 340 | max_time_frames = max_steps // audio.get_hop_size(self._hparams) 341 | start = np.random.randint(0, len(c) - max_time_frames) 342 | time_start = start * audio.get_hop_size(self._hparams) 343 | x = x[time_start: time_start + max_time_frames * audio.get_hop_size(self._hparams)] 344 | c = c[start: start + max_time_frames, :] 345 | self._assert_ready_for_upsample(x, c) 346 | 347 | new_batch.append((x, c, g, l)) 348 | return new_batch 349 | else: 350 | new_batch = [] 351 | for b in batch: 352 | x, c, g, l = b 353 | x = audio.trim(x) 354 | if max_time_steps is not None and len(x) > max_time_steps: 355 | start = np.random.randint(0, len(c) - max_time_steps) 356 | x = x[start: start + max_time_steps] 357 | new_batch.append((x, c, g, l)) 358 | return new_batch 359 | 360 | def _assert_ready_for_upsample(self, x, c): 361 | assert len(x) % len(c) == 0 and len(x) // len(c) == audio.get_hop_size(self._hparams) 362 | 363 | 364 | def _pad_inputs(x, maxlen): 365 | return np.pad(x, [(0, maxlen - len(x)), (0, 0)], mode='constant', constant_values=_pad) 366 | 367 | def _pad_targets(x, maxlen): 368 | return np.pad(x, (0, maxlen - len(x)), mode='constant', constant_values=_pad) 369 | 370 | def _round_up(x, multiple): 371 | remainder = x % multiple 372 | return x if remainder == 0 else x + multiple - remainder 373 | 374 | def _ensure_divisible(length, divisible_by=256, lower=True): 375 | if length % divisible_by == 0: 376 | return length 377 | if lower: 378 | return length - length % divisible_by 379 | else: 380 | return length + (divisible_by - length % divisible_by) 381 | -------------------------------------------------------------------------------- /tacotron/models/tacotron.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.contrib.rnn import GRUCell 3 | from tacotron.utils.symbols import symbols 4 | from infolog import log 5 | from .modules import reference_encoder 6 | from tacotron.models.helpers import TacoTrainingHelper, TacoTestHelper 7 | from tacotron.models.modules import * 8 | from tensorflow.contrib.seq2seq import dynamic_decode 9 | from tacotron.models.Architecture_wrappers import TacotronEncoderCell, TacotronDecoderCell 10 | from tacotron.models.custom_decoder import CustomDecoder 11 | from tacotron.models.attention import LocationSensitiveAttention 12 | from .multihead_attention import MultiheadAttention 13 | 14 | 15 | 16 | class Tacotron(): 17 | """Tacotron-2 Feature prediction Model. 
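	Typical call sequence (a minimal sketch of how the model is driven, assuming the input
	tensors are provided by a feeder):
		model = Tacotron(hparams)
		model.initialize(inputs, input_lengths, mel_targets=mel_targets,
			stop_token_targets=stop_token_targets, targets_lengths=targets_lengths,
			global_step=global_step, is_training=True)
		model.add_loss()
		model.add_optimizer(global_step)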
18 | """ 19 | def __init__(self, hparams): 20 | self._hparams = hparams 21 | 22 | 23 | def initialize(self, inputs, input_lengths, mel_targets=None, stop_token_targets=None, linear_targets=None, targets_lengths=None, gta=False, 24 | global_step=None, is_training=False, is_evaluating=False, reference_mel=None): 25 | """ 26 | Initializes the model for inference 27 | 28 | sets "mel_outputs" and "alignments" fields. 29 | 30 | Args: 31 | - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of 32 | steps in the input time series, and values are character IDs 33 | - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths 34 | of each sequence in inputs. 35 | - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number 36 | of steps in the output time series, M is num_mels, and values are entries in the mel 37 | spectrogram. Only needed for training. 38 | """ 39 | if mel_targets is None and stop_token_targets is not None: 40 | raise ValueError('no mel targets were provided but token_targets were given') 41 | if mel_targets is not None and stop_token_targets is None and not gta: 42 | raise ValueError('Mel targets are provided without corresponding token_targets') 43 | if not gta and self._hparams.predict_linear==True and linear_targets is None and is_training: 44 | raise ValueError('Model is set to use post processing to predict linear spectrograms in training but no linear targets given!') 45 | if gta and linear_targets is not None: 46 | raise ValueError('Linear spectrogram prediction is not supported in GTA mode!') 47 | if is_training and self._hparams.mask_decoder and targets_lengths is None: 48 | raise RuntimeError('Model set to mask paddings but no targets lengths provided for the mask!') 49 | if is_training and is_evaluating: 50 | raise RuntimeError('Model can not be in training and evaluation modes at the same time!') 51 | 52 | with tf.variable_scope('inference') as scope: 53 | batch_size = tf.shape(inputs)[0] 54 | hp = self._hparams 55 | assert hp.tacotron_teacher_forcing_mode in ('constant', 'scheduled') 56 | if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training: 57 | assert global_step is not None 58 | 59 | #GTA is only used for predicting mels to train Wavenet vocoder, so we ommit post processing when doing GTA synthesis 60 | post_condition = hp.predict_linear and not gta 61 | 62 | # Embeddings ==> [batch_size, sequence_length, embedding_dim] 63 | embedding_table = tf.get_variable( 64 | 'inputs_embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32) 65 | embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs) 66 | 67 | 68 | if hp.use_gst: 69 | #Global style tokens (GST) 70 | gst_tokens = tf.get_variable('style_tokens', [hp.num_gst, hp.style_embed_depth // hp.num_heads], 71 | dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=0.5)) 72 | self.gst_tokens = gst_tokens 73 | 74 | 75 | #Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units] 76 | encoder_cell = TacotronEncoderCell( 77 | EncoderConvolutions(is_training, hparams=hp, scope='encoder_convolutions'), 78 | EncoderRNN(is_training, size=hp.encoder_lstm_units, 79 | zoneout=hp.tacotron_zoneout_rate, scope='encoder_LSTM')) 80 | 81 | encoder_outputs = encoder_cell(embedded_inputs, input_lengths) 82 | 83 | #For shape visualization purpose 84 | enc_conv_output_shape = encoder_cell.conv_output_shape 85 | 86 | if is_training: 87 | reference_mel = mel_targets 88 | 89 | if reference_mel is not 
None: 90 | # Reference encoder 91 | refnet_outputs = reference_encoder( 92 | reference_mel, 93 | filters=hp.reference_filters, 94 | kernel_size=(3,3), 95 | strides=(2,2), 96 | encoder_cell=GRUCell(hp.reference_depth), 97 | is_training=is_training) # [N, 128] 98 | self.refnet_outputs = refnet_outputs 99 | 100 | if hp.use_gst: 101 | # Style attention 102 | style_attention = MultiheadAttention( 103 | tf.expand_dims(refnet_outputs, axis=1), # [N, 1, 128] 104 | tf.tanh(tf.tile(tf.expand_dims(gst_tokens, axis=0), [batch_size,1,1])), # [N, hp.num_gst, 256/hp.num_heads] 105 | num_heads=hp.num_heads, 106 | num_units=hp.style_att_dim, 107 | attention_type=hp.style_att_type) 108 | 109 | style_embeddings = style_attention.multi_head_attention() 110 | else: 111 | style_embeddings = tf.expand_dims(refnet_outputs, axis=1) # [N, 1, 128] 112 | else: 113 | print("Use random weight for GST.") 114 | random_weights = tf.random_uniform([hp.num_heads, hp.num_gst], maxval=1.0, dtype=tf.float32) 115 | random_weights = tf.nn.softmax(random_weights, name="random_weights") 116 | style_embeddings = tf.matmul(random_weights, tf.nn.tanh(gst_tokens)) 117 | style_embeddings = tf.reshape(style_embeddings, [1, 1] + [hp.num_heads * gst_tokens.get_shape().as_list()[1]]) 118 | 119 | 120 | #Extend style embeddings to be compatible with encoder_outputs. 121 | #Make encoder_output's dimensions by concatenating style embeddings with a vector of all zeroes. 122 | #Preserves effect of both style and encoder_outputs. 123 | neg = tf.add(style_embeddings, tf.negative(style_embeddings)) 124 | style_embeddings = tf.concat([style_embeddings, neg], axis=-1) 125 | 126 | 127 | # Add style embedding to every text encoder state 128 | style_embeddings = tf.tile(style_embeddings, [1, shape_list(encoder_outputs)[1], 1]) # [N, T_in, 128] 129 | encoder_outputs = tf.add(encoder_outputs, style_embeddings) 130 | 131 | #Decoder Parts 132 | #Attention Decoder Prenet 133 | prenet = Prenet(is_training, layers_sizes=hp.prenet_layers, drop_rate=hp.tacotron_dropout_rate, scope='decoder_prenet') 134 | #Attention Mechanism 135 | attention_mechanism = LocationSensitiveAttention(hp.attention_dim, encoder_outputs, hparams=hp, 136 | mask_encoder=hp.mask_encoder, memory_sequence_length=input_lengths, smoothing=hp.smoothing, 137 | cumulate_weights=hp.cumulative_weights) 138 | #Decoder LSTM Cells 139 | 140 | decoder_lstm = DecoderRNN(is_training, layers=hp.decoder_layers, 141 | size=hp.decoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope='decoder_lstm') 142 | #Frames Projection layer 143 | frame_projection = FrameProjection(hp.num_mels * hp.outputs_per_step, scope='linear_transform') 144 | # projection layer 145 | stop_projection = StopProjection(is_training or is_evaluating, shape=hp.outputs_per_step, scope='stop_token_projection') 146 | 147 | 148 | #Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding) 149 | decoder_cell = TacotronDecoderCell( 150 | prenet, 151 | attention_mechanism, 152 | decoder_lstm, 153 | frame_projection, 154 | stop_projection) 155 | #Define the helper for our decoder 156 | if is_training or is_evaluating or gta: 157 | self.helper = TacoTrainingHelper(batch_size, mel_targets, stop_token_targets, hp, gta, is_evaluating, global_step) 158 | else: 159 | self.helper = TacoTestHelper(batch_size, hp) 160 | 161 | 162 | #initial decoder state 163 | decoder_init_state = decoder_cell.zero_state(batch_size=batch_size, dtype=tf.float32) 164 | 165 | #Only use max iterations at synthesis time 166 | max_iters = hp.max_iters if not 
(is_training or is_evaluating) else None 167 | 168 | #Decode 169 | (frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode( 170 | CustomDecoder(decoder_cell, self.helper, decoder_init_state), 171 | impute_finished=False, 172 | maximum_iterations=max_iters, 173 | swap_memory=hp.tacotron_swap_with_cpu) 174 | 175 | 176 | # Reshape outputs to be one output per entry 177 | #==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels] 178 | decoder_output = tf.reshape(frames_prediction, [batch_size, -1, hp.num_mels]) 179 | stop_token_prediction = tf.reshape(stop_token_prediction, [batch_size, -1]) 180 | 181 | 182 | #Postnet 183 | postnet = Postnet(is_training, hparams=hp, scope='postnet_convolutions') 184 | 185 | #Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels] 186 | residual = postnet(decoder_output) 187 | 188 | #Project residual to same dimension as mel spectrogram 189 | #==> [batch_size, decoder_steps * r, num_mels] 190 | residual_projection = FrameProjection(hp.num_mels, scope='postnet_projection') 191 | projected_residual = residual_projection(residual) 192 | 193 | 194 | #Compute the mel spectrogram 195 | mel_outputs = decoder_output + projected_residual 196 | 197 | 198 | if post_condition: 199 | #Based on https://github.com/keithito/tacotron/blob/tacotron2-work-in-progress/models/tacotron.py 200 | #Post-processing Network to map mels to linear spectrograms using same architecture as the encoder 201 | post_processing_cell = TacotronEncoderCell( 202 | EncoderConvolutions(is_training, hparams=hp, scope='post_processing_convolutions'), 203 | EncoderRNN(is_training, size=hp.encoder_lstm_units, 204 | zoneout=hp.tacotron_zoneout_rate, scope='post_processing_LSTM')) 205 | 206 | expand_outputs = post_processing_cell(mel_outputs) 207 | linear_outputs = FrameProjection(hp.num_freq, scope='post_processing_projection')(expand_outputs) 208 | 209 | #Grab alignments from the final decoder state 210 | alignments = tf.transpose(final_decoder_state.alignment_history.stack(), [1, 2, 0]) 211 | 212 | if is_training: 213 | self.ratio = self.helper._ratio 214 | self.inputs = inputs 215 | self.input_lengths = input_lengths 216 | self.decoder_output = decoder_output 217 | self.alignments = alignments 218 | self.style_embeddings = style_embeddings 219 | self.stop_token_prediction = stop_token_prediction 220 | self.stop_token_targets = stop_token_targets 221 | self.mel_outputs = mel_outputs 222 | if post_condition: 223 | self.linear_outputs = linear_outputs 224 | self.linear_targets = linear_targets 225 | self.mel_targets = mel_targets 226 | self.targets_lengths = targets_lengths 227 | log('Initialized Tacotron model. Dimensions (? 
= dynamic shape): ') 228 | log(' Train mode: {}'.format(is_training)) 229 | log(' Eval mode: {}'.format(is_evaluating)) 230 | log(' GTA mode: {}'.format(gta)) 231 | log(' Synthesis mode: {}'.format(not (is_training or is_evaluating))) 232 | log(' embedding: {}'.format(embedded_inputs.shape)) 233 | log(' enc conv out: {}'.format(enc_conv_output_shape)) 234 | log(' encoder out: {}'.format(encoder_outputs.shape)) 235 | log(' decoder out: {}'.format(decoder_output.shape)) 236 | log(' residual out: {}'.format(residual.shape)) 237 | log(' projected residual out: {}'.format(projected_residual.shape)) 238 | log(' style embedding: %d' % style_embeddings.shape[-1]) 239 | log(' mel out: {}'.format(mel_outputs.shape)) 240 | if post_condition: 241 | log(' linear out: {}'.format(linear_outputs.shape)) 242 | log(' out: {}'.format(stop_token_prediction.shape)) 243 | 244 | 245 | def add_loss(self): 246 | '''Adds loss to the model. Sets "loss" field. initialize must have been called.''' 247 | with tf.variable_scope('loss') as scope: 248 | hp = self._hparams 249 | 250 | if hp.mask_decoder: 251 | # Compute loss of predictions before postnet 252 | before = MaskedMSE(self.mel_targets, self.decoder_output, self.targets_lengths, 253 | hparams=self._hparams) 254 | # Compute loss after postnet 255 | after = MaskedMSE(self.mel_targets, self.mel_outputs, self.targets_lengths, 256 | hparams=self._hparams) 257 | #Compute loss (for learning dynamic generation stop) 258 | stop_token_loss = MaskedSigmoidCrossEntropy(self.stop_token_targets, 259 | self.stop_token_prediction, self.targets_lengths, hparams=self._hparams) 260 | else: 261 | # Compute loss of predictions before postnet 262 | before = tf.losses.mean_squared_error(self.mel_targets, self.decoder_output) 263 | # Compute loss after postnet 264 | after = tf.losses.mean_squared_error(self.mel_targets, self.mel_outputs) 265 | #Compute loss (for learning dynamic generation stop) 266 | stop_token_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits( 267 | labels=self.stop_token_targets, 268 | logits=self.stop_token_prediction)) 269 | 270 | if hp.predict_linear: 271 | #Compute linear loss 272 | #From https://github.com/keithito/tacotron/blob/tacotron2-work-in-progress/models/tacotron.py 273 | #Prioritize loss for frequencies under 2000 Hz. 274 | l1 = tf.abs(self.linear_targets - self.linear_outputs) 275 | n_priority_freq = int(2000 / (hp.sample_rate * 0.5) * hp.num_mels) 276 | linear_loss = 0.5 * tf.reduce_mean(l1) + 0.5 * tf.reduce_mean(l1[:,:,0:n_priority_freq]) 277 | else: 278 | linear_loss = 0. 279 | 280 | # Compute the regularization weight 281 | if hp.tacotron_scale_regularization: 282 | reg_weight_scaler = 1. / (2 * hp.max_abs_value) if hp.symmetric_mels else 1. / (hp.max_abs_value) 283 | reg_weight = hp.tacotron_reg_weight * reg_weight_scaler 284 | else: 285 | reg_weight = hp.tacotron_reg_weight 286 | 287 | # Get all trainable variables 288 | all_vars = tf.trainable_variables() 289 | regularization = tf.add_n([tf.nn.l2_loss(v) for v in all_vars 290 | if not('bias' in v.name or 'Bias' in v.name)]) * reg_weight 291 | 292 | # Compute final loss term 293 | self.before_loss = before 294 | self.after_loss = after 295 | self.stop_token_loss = stop_token_loss 296 | self.regularization_loss = regularization 297 | self.linear_loss = linear_loss 298 | 299 | self.loss = self.before_loss + self.after_loss + self.stop_token_loss + self.regularization_loss + self.linear_loss 300 | 301 | def add_optimizer(self, global_step): 302 | '''Adds optimizer. 
Sets "gradients" and "optimize" fields. add_loss must have been called. 303 | 304 | Args: 305 | global_step: int32 scalar Tensor representing current global step in training 306 | ''' 307 | with tf.variable_scope('optimizer') as scope: 308 | hp = self._hparams 309 | if hp.tacotron_decay_learning_rate: 310 | self.decay_steps = hp.tacotron_decay_steps 311 | self.decay_rate = hp.tacotron_decay_rate 312 | self.learning_rate = self._learning_rate_decay(hp.tacotron_initial_learning_rate, global_step) 313 | else: 314 | self.learning_rate = tf.convert_to_tensor(hp.tacotron_initial_learning_rate) 315 | 316 | optimizer = tf.train.AdamOptimizer(self.learning_rate, hp.tacotron_adam_beta1, 317 | hp.tacotron_adam_beta2, hp.tacotron_adam_epsilon) 318 | gradients, variables = zip(*optimizer.compute_gradients(self.loss)) 319 | self.gradients = gradients 320 | #Just for causion 321 | #https://github.com/Rayhane-mamah/Tacotron-2/issues/11 322 | clipped_gradients, _ = tf.clip_by_global_norm(gradients, 1.) 323 | 324 | # Add dependency on UPDATE_OPS; otherwise batchnorm won't work correctly. See: 325 | # https://github.com/tensorflow/tensorflow/issues/1122 326 | with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)): 327 | self.optimize = optimizer.apply_gradients(zip(clipped_gradients, variables), 328 | global_step=global_step) 329 | 330 | def _learning_rate_decay(self, init_lr, global_step): 331 | ################################################################# 332 | # Narrow Exponential Decay: 333 | 334 | # Phase 1: lr = 1e-3 335 | # We only start learning rate decay after 50k steps 336 | 337 | # Phase 2: lr in ]1e-5, 1e-3[ 338 | # decay reach minimal value at step 310k 339 | 340 | # Phase 3: lr = 1e-5 341 | # clip by minimal learning rate value (step > 310k) 342 | ################################################################# 343 | hp = self._hparams 344 | 345 | #Compute natural exponential decay 346 | lr = tf.train.exponential_decay(init_lr, 347 | global_step - hp.tacotron_start_decay, #lr = 1e-3 at step 50k 348 | self.decay_steps, 349 | self.decay_rate, #lr = 1e-5 around step 310k 350 | name='lr_exponential_decay') 351 | 352 | 353 | #clip learning rate by max and min values (initial and final values) 354 | return tf.minimum(tf.maximum(lr, hp.tacotron_final_learning_rate), init_lr) 355 | --------------------------------------------------------------------------------