├── datasets ├── __init__.py ├── preprocessor.py └── audio.py ├── tacotron ├── __init__.py ├── models │ ├── __init__.py │ ├── multihead_attention.py │ ├── custom_decoder.py │ ├── helpers.py │ ├── Architecture_wrappers.py │ ├── attention.py │ └── tacotron.py ├── utils │ ├── ops.py │ ├── __init__.py │ ├── symbols.py │ ├── cmudict.py │ ├── plot.py │ ├── numbers.py │ ├── text.py │ └── cleaners.py ├── synthesizer.py ├── synthesize.py ├── feeder.py └── train.py ├── wavenet_vocoder ├── __init__.py ├── models │ ├── __init__.py │ ├── mixture.py │ └── modules.py ├── synthesizer.py ├── synthesize.py ├── util.py ├── train.py └── feeder.py ├── requirements.txt ├── infolog.py ├── griffin_lim_synthesis_tool.ipynb ├── .gitignore ├── synthesize.py ├── preprocess.py ├── train.py ├── README.md └── hparams.py /datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------- /tacotron/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------- /wavenet_vocoder/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------- /tacotron/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .tacotron import Tacotron 2 | 3 | 4 | def create_model(name, hparams): 5 | if name == 'Tacotron': 6 | return Tacotron(hparams) 7 | else: 8 | raise Exception('Unknown model: ' + name) 9 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | falcon==1.2.0 2 | inflect==0.2.5 3 | librosa==0.5.1 4 | matplotlib==2.0.2 5 | numpy==1.14.0 6 | scipy==1.0.0 7 | tqdm==4.11.2 8 | Unidecode==0.4.20 9 | pyaudio==0.2.11 10 | sounddevice==0.3.10 11 | lws 12 | keras -------------------------------------------------------------------------------- /tacotron/utils/ops.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | def shape_list(x): 5 | """Return list of dims, statically where possible.""" 6 | x = tf.convert_to_tensor(x) 7 | 8 | # If unknown rank, return dynamic shape 9 | if x.get_shape().dims is None: 10 | return tf.shape(x) 11 | 12 | static = x.get_shape().as_list() 13 | shape = tf.shape(x) 14 | 15 | ret = [] 16 | for i in range(len(static)): 17 | dim = static[i] 18 | if dim is None: 19 | dim = shape[i] 20 | ret.append(dim) 21 | return ret -------------------------------------------------------------------------------- /tacotron/utils/__init__.py: -------------------------------------------------------------------------------- 1 | class ValueWindow(): 2 | def __init__(self, window_size=100): 3 | self._window_size = window_size 4 | self._values = [] 5 | 6 | def append(self, x): 7 | self._values = self._values[-(self._window_size - 1):] + [x] 8 | 9 | @property 10 | def sum(self): 11 | return sum(self._values) 12 | 13 | @property 14 | def count(self): 15 | return len(self._values) 16 | 17 | @property 18 | def average(self): 19 | return self.sum / max(1, self.count) 20 | 21 | def reset(self): 22 | self._values = [] 23 | -------------------------------------------------------------------------------- 
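A minimal usage sketch for the two helpers above (hypothetical snippet, not a file in the repository): shape_list returns known dimensions as Python ints and falls back to dynamic tensors for unknown ones, while ValueWindow keeps a running average, e.g. for smoothing losses in the training logs.

import tensorflow as tf
from tacotron.utils import ValueWindow
from tacotron.utils.ops import shape_list

x = tf.placeholder(tf.float32, [None, None, 80])  # e.g. a batch of mel-spectrograms (TF1 graph API, as used in this repo)
dims = shape_list(x)                              # unknown dims stay tensors, the known dim becomes the int 80

loss_window = ValueWindow(100)                    # running window over the last 100 values
for step, loss in enumerate([0.91, 0.87, 0.80]):  # stand-ins for real training losses
	loss_window.append(loss)
	print('Step {}: loss={:.3f}, avg_loss={:.3f}'.format(step, loss, loss_window.average))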
/tacotron/utils/symbols.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Defines the set of symbols used in text input to the model. 3 | 4 | The default is a set of ASCII characters that works well for English or text that has been run 5 | through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. 6 | ''' 7 | from . import cmudict 8 | 9 | _pad = '_' 10 | _eos = '~' 11 | _characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? ' 12 | 13 | # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): 14 | _arpabet = ['@' + s for s in cmudict.valid_symbols] 15 | 16 | # Export all symbols: 17 | symbols = [_pad, _eos] + list(_characters) + _arpabet -------------------------------------------------------------------------------- /wavenet_vocoder/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .wavenet import WaveNet 2 | from warnings import warn 3 | from wavenet_vocoder.util import is_mulaw_quantize 4 | 5 | def create_model(name, hparams): 6 | if is_mulaw_quantize(hparams.input_type): 7 | if hparams.out_channels != hparams.quantize_channels: 8 | raise RuntimeError( 9 | "out_channels must equal quantize_channels if input_type is 'mulaw-quantize'") 10 | if hparams.upsample_conditional_features and hparams.cin_channels < 0: 11 | s = "Upsample conv layers were specified while local conditioning disabled. " 12 | s += "Notice that upsample conv layers will never be used." 13 | warn(s) 14 | 15 | if name == 'WaveNet': 16 | return WaveNet(hparams) 17 | else: 18 | raise Exception('Unknown model: {}'.format(name)) -------------------------------------------------------------------------------- /infolog.py: -------------------------------------------------------------------------------- 1 | import atexit 2 | from datetime import datetime 3 | import json 4 | from threading import Thread 5 | from urllib.request import Request, urlopen 6 | 7 | 8 | _format = '%Y-%m-%d %H:%M:%S.%f' 9 | _file = None 10 | _run_name = None 11 | _slack_url = None 12 | 13 | 14 | def init(filename, run_name, slack_url=None): 15 | global _file, _run_name, _slack_url 16 | _close_logfile() 17 | _file = open(filename, 'a') 18 | _file.write('\n-----------------------------------------------------------------\n') 19 | _file.write('Starting new {} training run\n'.format(run_name)) 20 | _file.write('-----------------------------------------------------------------\n') 21 | _run_name = run_name 22 | _slack_url = slack_url 23 | 24 | 25 | def log(msg, end='\n', slack=False): 26 | print(msg, end=end) 27 | if _file is not None: 28 | _file.write('[%s] %s\n' % (datetime.now().strftime(_format)[:-3], msg)) 29 | if slack and _slack_url is not None: 30 | Thread(target=_send_slack, args=(msg,)).start() 31 | 32 | 33 | def _close_logfile(): 34 | global _file 35 | if _file is not None: 36 | _file.close() 37 | _file = None 38 | 39 | 40 | def _send_slack(msg): 41 | req = Request(_slack_url) 42 | req.add_header('Content-Type', 'application/json') 43 | urlopen(req, json.dumps({ 44 | 'username': 'tacotron', 45 | 'icon_emoji': ':taco:', 46 | 'text': '*%s*: %s' % (_run_name, msg) 47 | }).encode()) 48 | 49 | 50 | atexit.register(_close_logfile) -------------------------------------------------------------------------------- /griffin_lim_synthesis_tool.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "scrolled": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "from datasets.audio import *\n", 13 | "import os\n", 14 | "from hparams import hparams\n", 15 | "\n", 16 | "n_sample = 0 #Change n_steps here\n", 17 | "mel_folder = 'logs-Tacotron/mel-spectrograms' #Or change file path\n", 18 | "mel_file = 'mel-prediction-step-{}.npy'.format(n_sample) #Or file name (for other generated mels)\n", 19 | "out_dir = 'wav_out'\n", 20 | "\n", 21 | "os.makedirs(out_dir, exist_ok=True)\n", 22 | "\n", 23 | "mel_file = os.path.join(mel_folder, mel_file)\n", 24 | "mel_spectro = np.load(mel_file)\n", 25 | "mel_spectro.shape" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "wav = inv_mel_spectrogram(mel_spectro.T, hparams) \n", 35 | "#save the wav under test__\n", 36 | "save_wav(wav, os.path.join(out_dir, 'test_{}.wav'.format(mel_file.replace('/', '_').replace('\\\\', '_'))),\n", 37 | " sr=hparams.sample_rate)" 38 | ] 39 | } 40 | ], 41 | "metadata": { 42 | "kernelspec": { 43 | "display_name": "Python 3", 44 | "language": "python", 45 | "name": "python3" 46 | }, 47 | "language_info": { 48 | "codemirror_mode": { 49 | "name": "ipython", 50 | "version": 3 51 | }, 52 | "file_extension": ".py", 53 | "mimetype": "text/x-python", 54 | "name": "python", 55 | "nbconvert_exporter": "python", 56 | "pygments_lexer": "ipython3", 57 | "version": "3.6.4" 58 | } 59 | }, 60 | "nbformat": 4, 61 | "nbformat_minor": 2 62 | } 63 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # Tacotron 2 oddities 107 | logs-*/ 108 | training_data/ 109 | 110 | -------------------------------------------------------------------------------- /tacotron/utils/cmudict.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | valid_symbols = [ 5 | 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2', 6 | 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2', 7 | 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY', 8 | 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1', 9 | 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 10 | 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 11 | 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH' 12 | ] 13 | 14 | _valid_symbol_set = set(valid_symbols) 15 | 16 | 17 | class CMUDict: 18 | '''Thin wrapper around CMUDict data. 
http://www.speech.cs.cmu.edu/cgi-bin/cmudict''' 19 | def __init__(self, file_or_path, keep_ambiguous=True): 20 | if isinstance(file_or_path, str): 21 | with open(file_or_path, encoding='latin-1') as f: 22 | entries = _parse_cmudict(f) 23 | else: 24 | entries = _parse_cmudict(file_or_path) 25 | if not keep_ambiguous: 26 | entries = {word: pron for word, pron in entries.items() if len(pron) == 1} 27 | self._entries = entries 28 | 29 | 30 | def __len__(self): 31 | return len(self._entries) 32 | 33 | 34 | def lookup(self, word): 35 | '''Returns list of ARPAbet pronunciations of the given word.''' 36 | return self._entries.get(word.upper()) 37 | 38 | 39 | 40 | _alt_re = re.compile(r'\([0-9]+\)') 41 | 42 | 43 | def _parse_cmudict(file): 44 | cmudict = {} 45 | for line in file: 46 | if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"): 47 | parts = line.split(' ') 48 | word = re.sub(_alt_re, '', parts[0]) 49 | pronunciation = _get_pronunciation(parts[1]) 50 | if pronunciation: 51 | if word in cmudict: 52 | cmudict[word].append(pronunciation) 53 | else: 54 | cmudict[word] = [pronunciation] 55 | return cmudict 56 | 57 | 58 | def _get_pronunciation(s): 59 | parts = s.strip().split(' ') 60 | for part in parts: 61 | if part not in _valid_symbol_set: 62 | return None 63 | return ' '.join(parts) 64 | -------------------------------------------------------------------------------- /tacotron/utils/plot.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | matplotlib.use('Agg') 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | 6 | 7 | def split_title_line(title_text, max_words=5): 8 | """ 9 | A function that splits any string based on specific character 10 | (returning it with the string), with maximum number of words on it 11 | """ 12 | seq = title_text.split() 13 | return '\n'.join([' '.join(seq[i:i + max_words]) for i in range(0, len(seq), max_words)]) 14 | 15 | def plot_alignment(alignment, path, info=None, split_title=False, max_len=None): 16 | if max_len is not None: 17 | alignment = alignment[:, :max_len] 18 | 19 | fig = plt.figure(figsize=(8, 6)) 20 | ax = fig.add_subplot(111) 21 | 22 | im = ax.imshow( 23 | alignment, 24 | aspect='auto', 25 | origin='lower', 26 | interpolation='none') 27 | fig.colorbar(im, ax=ax) 28 | xlabel = 'Decoder timestep' 29 | if info is not None: 30 | if split_title: 31 | title = split_title_line(info) 32 | else: 33 | title = info 34 | plt.xlabel(xlabel) 35 | plt.title(title) 36 | plt.ylabel('Encoder timestep') 37 | plt.tight_layout() 38 | plt.savefig(path, format='png') 39 | plt.close() 40 | 41 | 42 | def plot_spectrogram(pred_spectrogram, path, info=None, split_title=False, target_spectrogram=None, max_len=None): 43 | if max_len is not None: 44 | target_spectrogram = target_spectrogram[:max_len] 45 | pred_spectrogram = pred_spectrogram[:max_len] 46 | 47 | if info is not None: 48 | if split_title: 49 | title = split_title_line(info) 50 | else: 51 | title = info 52 | 53 | fig = plt.figure(figsize=(10, 8)) 54 | # Set common labels 55 | fig.text(0.5, 0.18, title, horizontalalignment='center', fontsize=16) 56 | 57 | #target spectrogram subplot 58 | if target_spectrogram is not None: 59 | ax1 = fig.add_subplot(311) 60 | ax2 = fig.add_subplot(312) 61 | 62 | im = ax1.imshow(np.rot90(target_spectrogram), interpolation='none') 63 | ax1.set_title('Target Mel-Spectrogram') 64 | fig.colorbar(mappable=im, shrink=0.65, orientation='horizontal', ax=ax1) 65 | ax2.set_title('Predicted Mel-Spectrogram') 
66 | else: 67 | ax2 = fig.add_subplot(211) 68 | 69 | im = ax2.imshow(np.rot90(pred_spectrogram), interpolation='none') 70 | fig.colorbar(mappable=im, shrink=0.65, orientation='horizontal', ax=ax2) 71 | 72 | plt.tight_layout() 73 | plt.savefig(path, format='png') 74 | plt.close() 75 | -------------------------------------------------------------------------------- /tacotron/utils/numbers.py: -------------------------------------------------------------------------------- 1 | import inflect 2 | import re 3 | 4 | 5 | _inflect = inflect.engine() 6 | _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') 7 | _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') 8 | _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') 9 | _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') 10 | _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') 11 | _number_re = re.compile(r'[0-9]+') 12 | 13 | 14 | def _remove_commas(m): 15 | return m.group(1).replace(',', '') 16 | 17 | 18 | def _expand_decimal_point(m): 19 | return m.group(1).replace('.', ' point ') 20 | 21 | 22 | def _expand_dollars(m): 23 | match = m.group(1) 24 | parts = match.split('.') 25 | if len(parts) > 2: 26 | return match + ' dollars' # Unexpected format 27 | dollars = int(parts[0]) if parts[0] else 0 28 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 29 | if dollars and cents: 30 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 31 | cent_unit = 'cent' if cents == 1 else 'cents' 32 | return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) 33 | elif dollars: 34 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 35 | return '%s %s' % (dollars, dollar_unit) 36 | elif cents: 37 | cent_unit = 'cent' if cents == 1 else 'cents' 38 | return '%s %s' % (cents, cent_unit) 39 | else: 40 | return 'zero dollars' 41 | 42 | 43 | def _expand_ordinal(m): 44 | return _inflect.number_to_words(m.group(0)) 45 | 46 | 47 | def _expand_number(m): 48 | num = int(m.group(0)) 49 | if num > 1000 and num < 3000: 50 | if num == 2000: 51 | return 'two thousand' 52 | elif num > 2000 and num < 2010: 53 | return 'two thousand ' + _inflect.number_to_words(num % 100) 54 | elif num % 100 == 0: 55 | return _inflect.number_to_words(num // 100) + ' hundred' 56 | else: 57 | return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') 58 | else: 59 | return _inflect.number_to_words(num, andword='') 60 | 61 | 62 | def normalize_numbers(text): 63 | text = re.sub(_comma_number_re, _remove_commas, text) 64 | text = re.sub(_pounds_re, r'\1 pounds', text) 65 | text = re.sub(_dollars_re, _expand_dollars, text) 66 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 67 | text = re.sub(_ordinal_re, _expand_ordinal, text) 68 | text = re.sub(_number_re, _expand_number, text) 69 | return text 70 | -------------------------------------------------------------------------------- /tacotron/utils/text.py: -------------------------------------------------------------------------------- 1 | import re 2 | from . import cleaners 3 | from .symbols import symbols 4 | 5 | 6 | # Mappings from symbol to numeric ID and vice versa: 7 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 8 | _id_to_symbol = {i: s for i, s in enumerate(symbols)} 9 | 10 | # Regular expression matching text enclosed in curly braces: 11 | _curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)') 12 | 13 | 14 | def text_to_sequence(text, cleaner_names): 15 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 
16 | 17 | The text can optionally have ARPAbet sequences enclosed in curly braces embedded 18 | in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street." 19 | 20 | Args: 21 | text: string to convert to a sequence 22 | cleaner_names: names of the cleaner functions to run the text through 23 | 24 | Returns: 25 | List of integers corresponding to the symbols in the text 26 | ''' 27 | sequence = [] 28 | 29 | # Check for curly braces and treat their contents as ARPAbet: 30 | while len(text): 31 | m = _curly_re.match(text) 32 | if not m: 33 | sequence += _symbols_to_sequence(_clean_text(text, cleaner_names)) 34 | break 35 | sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names)) 36 | sequence += _arpabet_to_sequence(m.group(2)) 37 | text = m.group(3) 38 | 39 | # Append EOS token 40 | sequence.append(_symbol_to_id['~']) 41 | return sequence 42 | 43 | 44 | def sequence_to_text(sequence): 45 | '''Converts a sequence of IDs back to a string''' 46 | result = '' 47 | for symbol_id in sequence: 48 | if symbol_id in _id_to_symbol: 49 | s = _id_to_symbol[symbol_id] 50 | # Enclose ARPAbet back in curly braces: 51 | if len(s) > 1 and s[0] == '@': 52 | s = '{%s}' % s[1:] 53 | result += s 54 | return result.replace('}{', ' ') 55 | 56 | 57 | def _clean_text(text, cleaner_names): 58 | for name in cleaner_names: 59 | cleaner = getattr(cleaners, name) 60 | if not cleaner: 61 | raise Exception('Unknown cleaner: %s' % name) 62 | text = cleaner(text) 63 | return text 64 | 65 | 66 | def _symbols_to_sequence(symbols): 67 | return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)] 68 | 69 | 70 | def _arpabet_to_sequence(text): 71 | return _symbols_to_sequence(['@' + s for s in text.split()]) 72 | 73 | 74 | def _should_keep_symbol(s): 75 | return s in _symbol_to_id and s is not '_' and s is not '~' 76 | -------------------------------------------------------------------------------- /wavenet_vocoder/synthesizer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import os 4 | from infolog import log 5 | from datasets.audio import save_wav 6 | from wavenet_vocoder.models import create_model 7 | from wavenet_vocoder.train import create_shadow_saver, load_averaged_model 8 | from . 
import util 9 | 10 | 11 | class Synthesizer: 12 | def load(self, checkpoint_path, hparams, model_name='WaveNet'): 13 | log('Constructing model: {}'.format(model_name)) 14 | self._hparams = hparams 15 | local_cond, global_cond = self._check_conditions() 16 | 17 | self.local_conditions = tf.placeholder(tf.float32, shape=[1, None, hparams.num_mels], name='local_condition_features') if local_cond else None 18 | self.global_conditions = tf.placeholder(tf.int32, shape=(), name='global_condition_features') if global_cond else None 19 | self.synthesis_length = tf.placeholder(tf.int32, shape=(), name='synthesis_length') if not local_cond else None 20 | 21 | with tf.variable_scope('model') as scope: 22 | self.model = create_model(model_name, hparams) 23 | self.model.initialize(y=None, c=self.local_conditions, g=self.global_conditions, 24 | input_lengths=None, synthesis_length=self.synthesis_length) 25 | 26 | self._hparams = hparams 27 | sh_saver = create_shadow_saver(self.model) 28 | 29 | log('Loading checkpoint: {}'.format(checkpoint_path)) 30 | self.session = tf.Session() 31 | self.session.run(tf.global_variables_initializer()) 32 | load_averaged_model(self.session, sh_saver, checkpoint_path) 33 | 34 | def synthesize(self, mel_spectrogram, speaker_id, index, out_dir, log_dir): 35 | hparams = self._hparams 36 | local_cond, global_cond = self._check_conditions() 37 | 38 | c = mel_spectrogram 39 | g = speaker_id 40 | feed_dict = {} 41 | 42 | if local_cond: 43 | feed_dict[self.local_conditions] = [np.array(c, dtype=np.float32)] 44 | else: 45 | feed_dict[self.synthesis_length] = 100 46 | 47 | if global_cond: 48 | feed_dict[self.global_conditions] = [np.array(g, dtype=np.int32)] 49 | 50 | generated_wav = self.session.run(self.model.y_hat, feed_dict=feed_dict) 51 | 52 | #Save wav to disk 53 | audio_filename = os.path.join(out_dir, 'speech-audio-{:05d}.wav'.format(index)) 54 | save_wav(generated_wav, audio_filename, sr=hparams.sample_rate) 55 | 56 | #Save waveplot to disk 57 | if log_dir is not None: 58 | plot_filename = os.path.join(log_dir, 'speech-waveplot-{:05d}.png'.format(index)) 59 | util.waveplot(plot_filename, generated_wav, None, hparams) 60 | 61 | return audio_filename 62 | 63 | def _check_conditions(self): 64 | local_condition = self._hparams.cin_channels > 0 65 | global_condition = self._hparams.gin_channels > 0 66 | return local_condition, global_condition 67 | -------------------------------------------------------------------------------- /tacotron/utils/cleaners.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Cleaners are transformations that run over the input text at both training and eval time. 3 | 4 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" 5 | hyperparameter. Some cleaners are English-specific. You'll typically want to use: 6 | 1. "english_cleaners" for English text 7 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using 8 | the Unidecode library (https://pypi.python.org/pypi/Unidecode) 9 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update 10 | the symbols in symbols.py to match your data). 
11 | ''' 12 | 13 | import re 14 | from unidecode import unidecode 15 | from .numbers import normalize_numbers 16 | 17 | 18 | # Regular expression matching whitespace: 19 | _whitespace_re = re.compile(r'\s+') 20 | 21 | # List of (regular expression, replacement) pairs for abbreviations: 22 | _abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [ 23 | ('mrs', 'misess'), 24 | ('mr', 'mister'), 25 | ('dr', 'doctor'), 26 | ('st', 'saint'), 27 | ('co', 'company'), 28 | ('jr', 'junior'), 29 | ('maj', 'major'), 30 | ('gen', 'general'), 31 | ('drs', 'doctors'), 32 | ('rev', 'reverend'), 33 | ('lt', 'lieutenant'), 34 | ('hon', 'honorable'), 35 | ('sgt', 'sergeant'), 36 | ('capt', 'captain'), 37 | ('esq', 'esquire'), 38 | ('ltd', 'limited'), 39 | ('col', 'colonel'), 40 | ('ft', 'fort'), 41 | ]] 42 | 43 | 44 | def expand_abbreviations(text): 45 | for regex, replacement in _abbreviations: 46 | text = re.sub(regex, replacement, text) 47 | return text 48 | 49 | 50 | def expand_numbers(text): 51 | return normalize_numbers(text) 52 | 53 | 54 | def lowercase(text): 55 | '''lowercase input tokens. 56 | ''' 57 | return text.lower() 58 | 59 | 60 | def collapse_whitespace(text): 61 | return re.sub(_whitespace_re, ' ', text) 62 | 63 | 64 | def convert_to_ascii(text): 65 | return unidecode(text) 66 | 67 | 68 | def basic_cleaners(text): 69 | '''Basic pipeline that lowercases and collapses whitespace without transliteration.''' 70 | text = lowercase(text) 71 | text = collapse_whitespace(text) 72 | return text 73 | 74 | 75 | def transliteration_cleaners(text): 76 | '''Pipeline for non-English text that transliterates to ASCII.''' 77 | text = convert_to_ascii(text) 78 | text = lowercase(text) 79 | text = collapse_whitespace(text) 80 | return text 81 | 82 | 83 | def english_cleaners(text): 84 | '''Pipeline for English text, including number and abbreviation expansion.''' 85 | text = convert_to_ascii(text) 86 | text = expand_numbers(text) 87 | text = expand_abbreviations(text) 88 | text = collapse_whitespace(text) 89 | return text 90 | -------------------------------------------------------------------------------- /wavenet_vocoder/synthesize.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from hparams import hparams, hparams_debug_string 4 | from wavenet_vocoder.synthesizer import Synthesizer 5 | from tqdm import tqdm 6 | from infolog import log 7 | import numpy as np 8 | import tensorflow as tf 9 | 10 | 11 | 12 | def run_synthesis(args, checkpoint_path, output_dir, hparams): 13 | log_dir = os.path.join(output_dir, 'plots') 14 | wav_dir = os.path.join(output_dir, 'wavs') 15 | 16 | #We suppose user will provide correct folder depending on training method 17 | log(hparams_debug_string()) 18 | synth = Synthesizer() 19 | synth.load(checkpoint_path, hparams) 20 | 21 | if args.model in ('Both', 'Tacotron-2'): 22 | #If running all Tacotron-2, synthesize audio from evaluated mels 23 | metadata_filename = os.path.join(args.mels_dir, 'map.txt') 24 | with open(metadata_filename, encoding='utf-8') as f: 25 | metadata = [line.strip().split('|') for line in f] 26 | frame_shift_ms = hparams.hop_size / hparams.sample_rate 27 | hours = sum([int(x[-1]) for x in metadata]) * frame_shift_ms / (3600) 28 | log('Loaded metadata for {} examples ({:.2f} hours)'.format(len(metadata), hours)) 29 | 30 | metadata = np.array(metadata) 31 | mel_files = metadata[:, 1] 32 | texts = metadata[:, 0] 33 | else: 34 | #else Get all npy files in input_dir 
(supposing they are mels) 35 | mel_files = [os.path.join(args.mels_dir, f) for f in os.listdir(args.mels_dir) if f.split('.')[-1] == 'npy'] 36 | texts = None 37 | 38 | log('Starting synthesis! (this will take a while..)') 39 | os.makedirs(log_dir, exist_ok=True) 40 | os.makedirs(wav_dir, exist_ok=True) 41 | 42 | with open(os.path.join(wav_dir, 'map.txt'), 'w') as file: 43 | for i, mel_file in enumerate(tqdm(mel_files)): 44 | mel_spectro = np.load(mel_file) 45 | audio_file = synth.synthesize(mel_spectro, None, i+1, wav_dir, log_dir) 46 | 47 | if texts is None: 48 | file.write('{}|{}\n'.format(mel_file, audio_file)) 49 | else: 50 | file.write('{}|{}|{}\n'.format(texts[i], mel_file, audio_file)) 51 | 52 | log('synthesized audio waveforms at {}'.format(wav_dir)) 53 | 54 | 55 | 56 | def wavenet_synthesize(args, hparams, checkpoint): 57 | output_dir = 'wavenet_' + args.output_dir 58 | 59 | try: 60 | checkpoint_path = tf.train.get_checkpoint_state(checkpoint).model_checkpoint_path 61 | log('loaded model at {}'.format(checkpoint_path)) 62 | except AttributeError: 63 | #Swap logs dir name in case user used Tacotron-2 for train and Both for test (and vice versa) 64 | if 'Both' in checkpoint: 65 | checkpoint = checkpoint.replace('Both', 'Tacotron-2') 66 | elif 'Tacotron-2' in checkpoint: 67 | checkpoint = checkpoint.replace('Tacotron-2', 'Both') 68 | else: #Synthesizing separately 69 | raise AssertionError('Cannot restore checkpoint: {}, did you train a model?'.format(checkpoint)) 70 | 71 | try: 72 | #Try loading again 73 | checkpoint_path = tf.train.get_checkpoint_state(checkpoint).model_checkpoint_path 74 | log('loaded model at {}'.format(checkpoint_path)) 75 | except: 76 | raise RuntimeError('Failed to load checkpoint at {}'.format(checkpoint)) 77 | 78 | run_synthesis(args, checkpoint_path, output_dir, hparams) -------------------------------------------------------------------------------- /wavenet_vocoder/models/mixture.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | 5 | 6 | def log_sum_exp(x): 7 | """ numerically stable log_sum_exp implementation that prevents overflow """ 8 | axis = len(x.get_shape())-1 9 | m = tf.reduce_max(x, axis) 10 | m2 = tf.reduce_max(x, axis, keepdims=True) 11 | return m + tf.log(tf.reduce_sum(tf.exp(x-m2), axis)) 12 | 13 | def log_prob_from_logits(x): 14 | """ numerically stable log_softmax implementation that prevents overflow """ 15 | axis = len(x.get_shape())-1 16 | m = tf.reduce_max(x, axis, keepdims=True) 17 | return x - m - tf.log(tf.reduce_sum(tf.exp(x-m), axis, keepdims=True)) 18 | 19 | def discretized_mix_logistic_loss(y_hat, y, num_classes=256, 20 | log_scale_min=-7.0, reduce=True): 21 | '''Discretized mix of logistic distributions loss. 22 | 23 | Note that it is assumed that input is scaled to [-1, 1] 24 | 25 | Args: 26 | y_hat: Tensor [batch_size, channels, time_length], predicted output. 27 | y: Tensor [batch_size, time_length, 1], Target. 28 | Returns: 29 | Tensor loss 30 | ''' 31 | with tf.control_dependencies([tf.assert_equal(tf.mod(tf.shape(y_hat)[1], 3), 0), tf.assert_equal(tf.rank(y_hat), 3)]): 32 | nr_mix = tf.shape(y_hat)[1] // 3 33 | 34 | #[Batch_size, time_length, channels] 35 | y_hat = tf.transpose(y_hat, [0, 2, 1]) 36 | 37 | #unpack parameters. 
[batch_size, time_length, num_mixtures] x 3 38 | logit_probs = y_hat[:, :, :nr_mix] 39 | means = y_hat[:, :, nr_mix:2 * nr_mix] 40 | log_scales = tf.maximum(y_hat[:, :, 2* nr_mix: 3 * nr_mix], log_scale_min) 41 | 42 | #[batch_size, time_length, 1] -> [batch_size, time_length, num_mixtures] 43 | y = y * tf.ones(shape=[1, 1, nr_mix], dtype=tf.float32) 44 | 45 | centered_y = y - means 46 | inv_stdv = tf.exp(-log_scales) 47 | plus_in = inv_stdv * (centered_y + 1. / (num_classes - 1)) 48 | cdf_plus = tf.nn.sigmoid(plus_in) 49 | min_in = inv_stdv * (centered_y - 1. / (num_classes - 1)) 50 | cdf_min = tf.nn.sigmoid(min_in) 51 | 52 | log_cdf_plus = plus_in - tf.nn.softplus(plus_in) # log probability for edge case of 0 (before scaling) 53 | log_one_minus_cdf_min = -tf.nn.softplus(min_in) # log probability for edge case of 255 (before scaling) 54 | 55 | #probability for all other cases 56 | cdf_delta = cdf_plus - cdf_min 57 | 58 | mid_in = inv_stdv * centered_y 59 | #log probability in the center of the bin, to be used in extreme cases 60 | #(not actually used in this code) 61 | log_pdf_mid = mid_in - log_scales - 2. * tf.nn.softplus(mid_in) 62 | 63 | log_probs = tf.where(y < -0.999, log_cdf_plus, 64 | tf.where(y > 0.999, log_one_minus_cdf_min, 65 | tf.where(cdf_delta > 1e-5, 66 | tf.log(tf.maximum(cdf_delta, 1e-12)), 67 | log_pdf_mid - np.log((num_classes - 1) / 2)))) 68 | #log_probs = log_probs + tf.nn.log_softmax(logit_probs, -1) 69 | 70 | log_probs = log_probs + log_prob_from_logits(logit_probs) 71 | 72 | if reduce: 73 | return -tf.reduce_sum(log_sum_exp(log_probs)) 74 | else: 75 | return -tf.expand_dims(log_sum_exp(log_probs), [-1]) 76 | 77 | def sample_from_discretized_mix_logistic(y, log_scale_min=-7.): 78 | ''' 79 | Args: 80 | y: Tensor, [batch_size, channels, time_length] 81 | Returns: 82 | Tensor: sample in range of [-1, 1] 83 | ''' 84 | with tf.control_dependencies([tf.assert_equal(tf.mod(tf.shape(y)[1], 3), 0)]): 85 | nr_mix = tf.shape(y)[1] // 3 86 | 87 | #[batch_size, time_length, channels] 88 | y = tf.transpose(y, [0, 2, 1]) 89 | logit_probs = y[:, :, :nr_mix] 90 | 91 | #sample mixture indicator from softmax 92 | temp = tf.random_uniform(tf.shape(logit_probs), minval=1e-5, maxval=1. - 1e-5) 93 | temp = logit_probs - tf.log(-tf.log(temp)) 94 | argmax = tf.argmax(temp, -1) 95 | 96 | #[batch_size, time_length] -> [batch_size, time_length, nr_mix] 97 | one_hot = tf.one_hot(argmax, depth=nr_mix, dtype=tf.float32) 98 | #select logistic parameters 99 | means = tf.reduce_sum(y[:, :, nr_mix:2 * nr_mix] * one_hot, axis=-1) 100 | log_scales = tf.maximum(tf.reduce_sum( 101 | y[:, :, 2 * nr_mix:3 * nr_mix] * one_hot, axis=-1), log_scale_min) 102 | 103 | #sample from logistic & clip to interval 104 | #we don't actually round to the nearest 8-bit value when sampling 105 | u = tf.random_uniform(tf.shape(means), minval=1e-5, maxval=1. - 1e-5) 106 | x = means + tf.exp(log_scales) * (tf.log(u) - tf.log(1 -u)) 107 | 108 | return tf.minimum(tf.maximum(x, -1.), 1.) 
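# --- Hedged usage sketch (illustrative only, not part of the original mixture.py) ---
# discretized_mix_logistic_loss expects y_hat shaped [batch_size, 3 * nr_mix, time_length]
# (mixture logits, means and log-scales stacked along the channel axis) and y shaped
# [batch_size, time_length, 1] with values scaled to [-1, 1]. For example, with 10 mixtures
# on the TF1 graph API used throughout this repository:
#   y_hat = tf.random_normal([2, 30, 1000])        # batch=2, 3*10 channels, 1000 timesteps
#   y = tf.random_uniform([2, 1000, 1], -1., 1.)   # target waveform scaled to [-1, 1]
#   loss = discretized_mix_logistic_loss(y_hat, y, num_classes=256)
#   samples = sample_from_discretized_mix_logistic(y_hat)  # -> [2, 1000], values in [-1, 1]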
109 | -------------------------------------------------------------------------------- /synthesize.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from tacotron.synthesize import tacotron_synthesize 3 | from wavenet_vocoder.synthesize import wavenet_synthesize 4 | from infolog import log 5 | from hparams import hparams 6 | from warnings import warn 7 | import os 8 | 9 | 10 | def prepare_run(args): 11 | modified_hp = hparams.parse(args.hparams) 12 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 13 | 14 | run_name = args.name or args.tacotron_name or args.model 15 | taco_checkpoint = os.path.join('logs-' + run_name, 'taco_' + args.checkpoint) 16 | 17 | run_name = args.name or args.wavenet_name or args.model 18 | wave_checkpoint = os.path.join('logs-' + run_name, 'wave_' + args.checkpoint) 19 | return taco_checkpoint, wave_checkpoint, modified_hp 20 | 21 | def get_sentences(args): 22 | if args.text != '': 23 | sentences = (args.text.strip().split("."))[:-1] 24 | else: 25 | sentences = hparams.sentences 26 | return sentences 27 | 28 | def synthesize(args, hparams, taco_checkpoint, wave_checkpoint, sentences): 29 | log('Running End-to-End TTS Evaluation. Model: {}'.format(args.name or args.model)) 30 | log('Synthesizing mel-spectrograms from text..') 31 | wavenet_in_dir = tacotron_synthesize(args, hparams, taco_checkpoint, sentences) 32 | log('Synthesizing audio from mel-spectrograms.. (This may take a while)') 33 | wavenet_synthesize(args, hparams, wave_checkpoint) 34 | log('Tacotron-2 TTS synthesis complete!') 35 | 36 | 37 | 38 | def main(): 39 | accepted_modes = ['eval', 'synthesis', 'live'] 40 | parser = argparse.ArgumentParser() 41 | parser.add_argument('--checkpoint', default='pretrained/', help='Path to model checkpoint') 42 | parser.add_argument('--hparams', default='', 43 | help='Hyperparameter overrides as a comma-separated list of name=value pairs') 44 | parser.add_argument('--name', help='Name of logging directory if the two models were trained together.') 45 | parser.add_argument('--tacotron_name', help='Name of logging directory of Tacotron. If trained separately') 46 | parser.add_argument('--wavenet_name', help='Name of logging directory of WaveNet. If trained separately') 47 | parser.add_argument('--model', default='Tacotron-2') 48 | parser.add_argument('--input_dir', default='training_data/', help='folder to contain inputs sentences/targets') 49 | parser.add_argument('--mels_dir', default='tacotron_output/eval/', help='folder to contain mels to synthesize audio from using the Wavenet') 50 | parser.add_argument('--output_dir', default='output/', help='folder to contain synthesized mel spectrograms') 51 | parser.add_argument('--mode', default='eval', help='mode of run: can be one of {}'.format(accepted_modes)) 52 | parser.add_argument('--GTA', default='True', help='Ground truth aligned synthesis, defaults to True, only considered in synthesis mode') 53 | parser.add_argument('--text', default='', help='Text contains sentences to be synthesized. 
Valid if mode=eval') 54 | parser.add_argument('--reference_audio', default=None, help='Reference audio path') 55 | args = parser.parse_args() 56 | 57 | accepted_models = ['Tacotron', 'WaveNet', 'Both', 'Tacotron-2'] 58 | 59 | if args.model not in accepted_models: 60 | raise ValueError('please enter a valid model to synthesize with: {}'.format(accepted_models)) 61 | 62 | if args.mode not in accepted_modes: 63 | raise ValueError('accepted modes are: {}, found {}'.format(accepted_modes, args.mode)) 64 | 65 | if args.mode=='live' and args.model=='Wavenet': 66 | raise RuntimeError('Wavenet vocoder cannot be tested live due to its slow generation. Live only works with Tacotron!') 67 | 68 | if args.GTA not in ('True', 'False'): 69 | raise ValueError('GTA option must be either True or False') 70 | 71 | if args.model in ('Both', 'Tacotron-2'): 72 | if args.mode == 'live': 73 | warn('Requested a live evaluation with Tacotron-2, Wavenet will not be used!') 74 | if args.mode == 'synthesis': 75 | raise ValueError('I don\'t recommend running WaveNet on entire dataset.. The world might end before the synthesis :) (only eval allowed)') 76 | 77 | taco_checkpoint, wave_checkpoint, hparams = prepare_run(args) 78 | sentences = get_sentences(args) 79 | 80 | if args.model == 'Tacotron': 81 | _ = tacotron_synthesize(args, hparams, taco_checkpoint, sentences) 82 | elif args.model == 'WaveNet': 83 | wavenet_synthesize(args, hparams, wave_checkpoint) 84 | elif args.model in ('Both', 'Tacotron-2'): 85 | synthesize(args, hparams, taco_checkpoint, wave_checkpoint, sentences) 86 | else: 87 | raise ValueError('Model provided {} unknown! {}'.format(args.model, accepted_models)) 88 | 89 | 90 | if __name__ == '__main__': 91 | main() -------------------------------------------------------------------------------- /preprocess.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from multiprocessing import cpu_count 3 | import os 4 | from tqdm import tqdm 5 | from datasets import preprocessor 6 | from hparams import hparams 7 | 8 | 9 | def preprocess(args, input_folders, out_dir, hparams): 10 | mel_dir = os.path.join(out_dir, 'mels') 11 | wav_dir = os.path.join(out_dir, 'audio') 12 | linear_dir = os.path.join(out_dir, 'linear') 13 | os.makedirs(mel_dir, exist_ok=True) 14 | os.makedirs(wav_dir, exist_ok=True) 15 | os.makedirs(linear_dir, exist_ok=True) 16 | metadata = preprocessor.build_from_path(hparams, input_folders, mel_dir, linear_dir, wav_dir, args.n_jobs, tqdm=tqdm) 17 | write_metadata(metadata, out_dir) 18 | 19 | def write_metadata(metadata, out_dir): 20 | with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f: 21 | for m in metadata: 22 | f.write('|'.join([str(x) for x in m]) + '\n') 23 | mel_frames = sum([int(m[4]) for m in metadata]) 24 | timesteps = sum([int(m[3]) for m in metadata]) 25 | sr = hparams.sample_rate 26 | hours = timesteps / sr / 3600 27 | print('Write {} utterances, {} mel frames, {} audio timesteps, ({:.2f} hours)'.format( 28 | len(metadata), mel_frames, timesteps, hours)) 29 | print('Max input length (text chars): {}'.format(max(len(m[5]) for m in metadata))) 30 | print('Max mel frames length: {}'.format(max(int(m[4]) for m in metadata))) 31 | print('Max audio timesteps length: {}'.format(max(m[3] for m in metadata))) 32 | 33 | def norm_data(args): 34 | 35 | merge_books = (args.merge_books=='True') 36 | 37 | print('Selecting data folders..') 38 | supported_datasets = ['LJSpeech-1.0', 'LJSpeech-1.1', 'M-AILABS'] 39 | if 
args.dataset not in supported_datasets: 40 | raise ValueError('dataset value entered {} does not belong to supported datasets: {}'.format( 41 | args.dataset, supported_datasets)) 42 | 43 | if args.dataset.startswith('LJSpeech'): 44 | return [os.path.join(args.base_dir, args.dataset)] 45 | 46 | 47 | if args.dataset == 'M-AILABS': 48 | supported_languages = ['en_US', 'en_UK', 'fr_FR', 'it_IT', 'de_DE', 'es_ES', 'ru_RU', 49 | 'uk_UK', 'pl_PL', 'nl_NL', 'pt_PT', 'fi_FI', 'se_SE', 'tr_TR', 'ar_SA'] 50 | if args.language not in supported_languages: 51 | raise ValueError('Please enter a supported language to use from M-AILABS dataset! \n{}'.format( 52 | supported_languages)) 53 | 54 | supported_voices = ['female', 'male', 'mix'] 55 | if args.voice not in supported_voices: 56 | raise ValueError('Please enter a supported voice option to use from M-AILABS dataset! \n{}'.format( 57 | supported_voices)) 58 | 59 | path = os.path.join(args.base_dir, args.language, 'by_book', args.voice) 60 | supported_readers = [e for e in os.listdir(path) if os.path.isdir(os.path.join(path,e))] 61 | if args.reader not in supported_readers: 62 | raise ValueError('Please enter a valid reader for your language and voice settings! \n{}'.format( 63 | supported_readers)) 64 | 65 | path = os.path.join(path, args.reader) 66 | supported_books = [e for e in os.listdir(path) if os.path.isdir(os.path.join(path,e))] 67 | if merge_books: 68 | return [os.path.join(path, book) for book in supported_books] 69 | 70 | else: 71 | if args.book not in supported_books: 72 | raise ValueError('Please enter a valid book for your reader settings! \n{}'.format( 73 | supported_books)) 74 | 75 | return [os.path.join(path, args.book)] 76 | 77 | 78 | def run_preprocess(args, hparams): 79 | input_folders = norm_data(args) 80 | output_folder = os.path.join(args.base_dir, args.output) 81 | 82 | preprocess(args, input_folders, output_folder, hparams) 83 | 84 | 85 | def main(): 86 | print('initializing preprocessing..') 87 | parser = argparse.ArgumentParser() 88 | parser.add_argument('--base_dir', default='') 89 | parser.add_argument('--hparams', default='', 90 | help='Hyperparameter overrides as a comma-separated list of name=value pairs') 91 | parser.add_argument('--dataset', default='LJSpeech-1.1') 92 | parser.add_argument('--language', default='en_US') 93 | parser.add_argument('--voice', default='female') 94 | parser.add_argument('--reader', default='mary_ann') 95 | parser.add_argument('--merge_books', default='False') 96 | parser.add_argument('--book', default='northandsouth') 97 | parser.add_argument('--output', default='training_data') 98 | parser.add_argument('--n_jobs', type=int, default=cpu_count()) 99 | args = parser.parse_args() 100 | 101 | modified_hp = hparams.parse(args.hparams) 102 | 103 | assert args.merge_books in ('False', 'True') 104 | 105 | run_preprocess(args, modified_hp) 106 | 107 | 108 | if __name__ == '__main__': 109 | main() -------------------------------------------------------------------------------- /tacotron/synthesizer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import tensorflow as tf 4 | from librosa import effects 5 | from tacotron.models import create_model 6 | from tacotron.utils.text import text_to_sequence 7 | from tacotron.utils import plot 8 | from datasets import audio 9 | from datetime import datetime 10 | import sounddevice as sd 11 | import pyaudio 12 | import wave 13 | from infolog import log 14 | 15 | 16 | class Synthesizer: 
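	# Hedged usage sketch (illustrative only; the checkpoint path and output folders are hypothetical):
	#   synth = Synthesizer()
	#   synth.load('logs-Tacotron/taco_pretrained/model.ckpt-100000', hparams)
	#   mel_path = synth.synthesize('Hello world.', 1, 'tacotron_output/eval', 'tacotron_output/logs-eval', None)
	# load() builds the Tacotron graph and restores the checkpoint weights; synthesize() writes the
	# predicted mel-spectrogram (plus optional wavs/plots) to disk and returns its filename, or plays
	# the audio directly when index is None (live mode).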
17 | def load(self, checkpoint_path, hparams, gta=False, model_name='Tacotron', reference_mel = None): 18 | log('Constructing model: %s' % model_name) 19 | inputs = tf.placeholder(tf.int32, [1, None], 'inputs') 20 | input_lengths = tf.placeholder(tf.int32, [1], 'input_lengths') 21 | targets = tf.placeholder(tf.float32, [1, None, hparams.num_mels], 'mel_targets') 22 | 23 | if reference_mel is not None: 24 | reference_mel = tf.placeholder(tf.float32, [1, None, hparams.num_mels], 'reference_mel') 25 | with tf.variable_scope('model') as scope: 26 | self.model = create_model(model_name, hparams) 27 | if gta: 28 | self.model.initialize(inputs, input_lengths, targets, gta=gta, reference_mel=reference_mel) 29 | else: 30 | self.model.initialize(inputs, input_lengths, reference_mel=reference_mel) 31 | self.mel_outputs = self.model.mel_outputs 32 | self.alignment = self.model.alignments[0] 33 | 34 | self.gta = gta 35 | self._hparams = hparams 36 | 37 | log('Loading checkpoint: %s' % checkpoint_path) 38 | self.session = tf.Session() 39 | self.session.run(tf.global_variables_initializer()) 40 | saver = tf.train.Saver() 41 | saver.restore(self.session, checkpoint_path) 42 | 43 | 44 | def synthesize(self, text, index, out_dir, log_dir, mel_filename, reference_mel=None): 45 | hparams = self._hparams 46 | cleaner_names = [x.strip() for x in hparams.cleaners.split(',')] 47 | seq = text_to_sequence(text, cleaner_names) 48 | feed_dict = { 49 | self.model.inputs: [np.asarray(seq, dtype=np.int32)], 50 | self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32), 51 | } 52 | 53 | 54 | if reference_mel is not None: 55 | reference_mel = np.expand_dims(reference_mel, 0) 56 | feed_dict.update({self.model.reference_mel: np.asarray(reference_mel, dtype=np.float32)}) 57 | 58 | if self.gta: 59 | feed_dict[self.model.mel_targets] = np.load(mel_filename).reshape(1, -1, 80) 60 | 61 | if self.gta or not hparams.predict_linear: 62 | mels, alignment = self.session.run([self.mel_outputs, self.alignment], feed_dict=feed_dict) 63 | 64 | else: 65 | linear, mels, alignment = self.session.run([self.linear_outputs, self.mel_outputs, self.alignment], feed_dict=feed_dict) 66 | linear = linear.reshape(-1, hparams.num_freq) 67 | 68 | mels = mels.reshape(-1, hparams.num_mels) #Thanks to @imdatsolak for pointing this out 69 | 70 | 71 | if index is None: 72 | #Generate wav and read it 73 | wav = audio.inv_mel_spectrogram(mels.T, hparams) 74 | audio.save_wav(wav, 'temp.wav', sr=hparams.sample_rate) #Find a better way 75 | 76 | chunk = 512 77 | f = wave.open('temp.wav', 'rb') 78 | p = pyaudio.PyAudio() 79 | stream = p.open(format=p.get_format_from_width(f.getsampwidth()), 80 | channels=f.getnchannels(), 81 | rate=f.getframerate(), 82 | output=True) 83 | data = f.readframes(chunk) 84 | while data: 85 | stream.write(data) 86 | data=f.readframes(chunk) 87 | 88 | stream.stop_stream() 89 | stream.close() 90 | 91 | p.terminate() 92 | return 93 | 94 | 95 | # Write the spectrogram to disk 96 | # Note: outputs mel-spectrogram files and target ones have same names, just different folders 97 | mel_filename = os.path.join(out_dir, 'speech-mel-{:05d}.npy'.format(index)) 98 | np.save(mel_filename, mels, allow_pickle=False) 99 | 100 | if log_dir is not None: 101 | #save wav (mel -> wav) 102 | wav = audio.inv_mel_spectrogram(mels.T, hparams) 103 | audio.save_wav(wav, os.path.join(log_dir, 'wavs/speech-wav-{:05d}-mel.wav'.format(index)), sr=hparams.sample_rate) 104 | 105 | if hparams.predict_linear: 106 | #save wav (linear -> wav) 107 | wav = 
audio.inv_linear_spectrogram(linear.T, hparams) 108 | audio.save_wav(wav, os.path.join(log_dir, 'wavs/speech-wav-{:05d}-linear.wav'.format(index)), sr=hparams.sample_rate) 109 | 110 | #save alignments 111 | plot.plot_alignment(alignment, os.path.join(log_dir, 'plots/speech-alignment-{:05d}.png'.format(index)), 112 | info='{}'.format(text), split_title=True) 113 | 114 | #save mel spectrogram plot 115 | plot.plot_spectrogram(mels, os.path.join(log_dir, 'plots/speech-mel-{:05d}.png'.format(index)), 116 | info='{}'.format(text), split_title=True) 117 | 118 | return mel_filename -------------------------------------------------------------------------------- /tacotron/synthesize.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import re 4 | from hparams import hparams, hparams_debug_string 5 | from tacotron.synthesizer import Synthesizer 6 | import time 7 | from tqdm import tqdm 8 | from time import sleep 9 | from infolog import log 10 | import tensorflow as tf 11 | 12 | 13 | 14 | def generate_fast(model, text): 15 | model.synthesize(text, None, None, None, None) 16 | 17 | 18 | def run_live(args, checkpoint_path, hparams): 19 | #Log to Terminal without keeping any records in files 20 | log(hparams_debug_string()) 21 | synth = Synthesizer() 22 | synth.load(checkpoint_path, hparams) 23 | 24 | #Generate fast greeting message 25 | greetings = 'Hello, Welcome to the Live testing tool. Please type a message and I will try to read it!' 26 | log(greetings) 27 | generate_fast(synth, greetings) 28 | 29 | #Interaction loop 30 | while True: 31 | try: 32 | text = input() 33 | generate_fast(synth, text) 34 | 35 | except KeyboardInterrupt: 36 | leave = 'Thank you for testing our features. see you soon.' 37 | log(leave) 38 | generate_fast(synth, leave) 39 | sleep(2) 40 | break 41 | 42 | def run_eval(args, checkpoint_path, output_dir, hparams, sentences): 43 | eval_dir = os.path.join(output_dir, 'eval') 44 | log_dir = os.path.join(output_dir, 'logs-eval') 45 | 46 | 47 | #Create output path if it doesn't exist 48 | os.makedirs(eval_dir, exist_ok=True) 49 | os.makedirs(log_dir, exist_ok=True) 50 | os.makedirs(os.path.join(log_dir, 'wavs'), exist_ok=True) 51 | os.makedirs(os.path.join(log_dir, 'plots'), exist_ok=True) 52 | 53 | log(hparams_debug_string()) 54 | synth = Synthesizer() 55 | synth.load(checkpoint_path, hparams, reference_mel=args.reference_audio) 56 | if args.reference_audio is not None: 57 | ref_wav = audio.load_wav(args.reference_audio) 58 | reference_mel = audio.melspectrogram(ref_wav).astype(np.float32).T 59 | else: 60 | raise ValueError("Evaluation without reference audio. 
Please provide path to reference audio.") 61 | with open(os.path.join(eval_dir, 'map.txt'), 'w') as file: 62 | for i, text in enumerate(tqdm(sentences)): 63 | start = time.time() 64 | mel_filename = synth.synthesize(text, i+1, eval_dir, log_dir, None, reference_mel=reference_mel) 65 | file.write('{}|{}\n'.format(text, mel_filename)) 66 | 67 | log('synthesized mel spectrograms at {}'.format(eval_dir)) 68 | return eval_dir 69 | 70 | def run_synthesis(args, checkpoint_path, output_dir, hparams): 71 | GTA = (args.GTA == 'True') 72 | if GTA: 73 | synth_dir = os.path.join(output_dir, 'gta') 74 | 75 | else: 76 | synth_dir = os.path.join(output_dir, 'natural') 77 | 78 | os.makedirs(synth_dir, exist_ok=True) 79 | 80 | 81 | metadata_filename = os.path.join(args.input_dir, 'train.txt') 82 | log(hparams_debug_string()) 83 | synth = Synthesizer() 84 | synth.load(checkpoint_path, hparams, gta=GTA) 85 | with open(metadata_filename, encoding='utf-8') as f: 86 | metadata = [line.strip().split('|') for line in f] 87 | frame_shift_ms = hparams.hop_size / hparams.sample_rate 88 | hours = sum([int(x[4]) for x in metadata]) * frame_shift_ms / (3600) 89 | log('Loaded metadata for {} examples ({:.2f} hours)'.format(len(metadata), hours)) 90 | 91 | log('starting synthesis') 92 | mel_dir = os.path.join(args.input_dir, 'mels') 93 | wav_dir = os.path.join(args.input_dir, 'audio') 94 | with open(os.path.join(synth_dir, 'map.txt'), 'w') as file: 95 | for i, meta in enumerate(tqdm(metadata)): 96 | text = meta[5] 97 | mel_filename = os.path.join(mel_dir, meta[1]) 98 | wav_filename = os.path.join(wav_dir, meta[0]) 99 | mel_output_filename = synth.synthesize(text, i+1, synth_dir, None, mel_filename) 100 | file.write('{}|{}|{}|{}\n'.format(wav_filename, mel_filename, mel_output_filename, text)) 101 | log('synthesized mel spectrograms at {}'.format(synth_dir)) 102 | return os.path.join(synth_dir, 'map.txt') 103 | 104 | def tacotron_synthesize(args, hparams, checkpoint, sentences=None): 105 | output_dir = 'tacotron_' + args.output_dir 106 | 107 | try: 108 | checkpoint_path = tf.train.get_checkpoint_state(checkpoint).model_checkpoint_path 109 | log('loaded model at {}'.format(checkpoint_path)) 110 | except AttributeError: 111 | #Swap logs dir name in case user used Tacotron-2 for train and Both for test (and vice versa) 112 | if 'Both' in checkpoint: 113 | checkpoint = checkpoint.replace('Both', 'Tacotron-2') 114 | elif 'Tacotron-2' in checkpoint: 115 | checkpoint = checkpoint.replace('Tacotron-2', 'Both') 116 | else: 117 | raise AssertionError('Cannot restore checkpoint: {}, did you train a model?'.format(checkpoint)) 118 | 119 | try: 120 | #Try loading again 121 | checkpoint_path = tf.train.get_checkpoint_state(checkpoint).model_checkpoint_path 122 | log('loaded model at {}'.format(checkpoint_path)) 123 | except: 124 | raise RuntimeError('Failed to load checkpoint at {}'.format(checkpoint)) 125 | if args.mode == 'eval': 126 | return run_eval(args, checkpoint_path, output_dir, hparams, sentences) 127 | elif args.mode == 'synthesis': 128 | return run_synthesis(args, checkpoint_path, output_dir, hparams) 129 | else: 130 | run_live(args, checkpoint_path, hparams) 131 | -------------------------------------------------------------------------------- /tacotron/models/multihead_attention.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import math 4 | from tacotron.utils.ops import shape_list 5 | 6 | class MultiheadAttention(): 7 | 
'''Computes the multi-head attention as described in 8 | https://arxiv.org/abs/1706.03762. 9 | Args: 10 | num_heads: The number of attention heads. 11 | query: The sequence of queries. A tensor of shape :math:`[B, T_1, ...]`. 12 | value: The sequence to attend. A tensor of shape :math:`[B, T_2, ...]`. 13 | If ``None``, computes self-attention. 14 | num_units: The number of hidden units. If not set, it is set to the input 15 | dimension. 16 | attention_type: a string, either "dot_attention", "mlp_attention". 17 | Returns: 18 | The concatenated attention context of each head. 19 | ''' 20 | def __init__(self, 21 | query, 22 | value, 23 | num_heads=4, 24 | attention_type='mlp_attention', 25 | num_units=None, 26 | normalize=True): 27 | self.query = query 28 | self.value = value 29 | self.num_heads = num_heads 30 | self.attention_type = attention_type 31 | self.num_units = num_units or query.get_shape().as_list()[-1] 32 | self.normalize = normalize 33 | 34 | def multi_head_attention(self): 35 | if self.num_units % self.num_heads != 0: 36 | raise ValueError("Multi head attention requires that num_units is a" 37 | " multiple of {}".format(num_heads)) 38 | 39 | with tf.variable_scope("Multihead-attention"): 40 | q = tf.layers.conv1d(self.query, self.num_units, 1) 41 | k = tf.layers.conv1d(self.value, self.num_units, 1) 42 | v = self.value 43 | qs, ks, vs = self._split_heads(q, k, v) 44 | if self.attention_type == 'mlp_attention': 45 | style_embeddings = self._mlp_attention(qs, ks, vs) 46 | elif self.attention_type == 'dot_attention': 47 | style_embeddings = self._dot_product(qs, ks, vs) 48 | else: 49 | raise ValueError('Only mlp_attention and dot_attention are supported') 50 | 51 | return self._combine_heads(style_embeddings) 52 | 53 | def _split_heads(self, q, k, v): 54 | '''Split the channels into multiple heads 55 | 56 | Returns: 57 | Tensors with shape [batch, num_heads, length_x, dim_x/num_heads] 58 | ''' 59 | qs = tf.transpose(self._split_last_dimension(q, self.num_heads), [0, 2, 1, 3]) 60 | ks = tf.transpose(self._split_last_dimension(k, self.num_heads), [0, 2, 1, 3]) 61 | v_shape = shape_list(v) 62 | vs = tf.tile(tf.expand_dims(v, axis=1), [1, self.num_heads, 1, 1]) 63 | return qs, ks, vs 64 | 65 | def _split_last_dimension(self, x, num_heads): 66 | '''Reshape x to num_heads 67 | Returns: 68 | a Tensor with shape [batch, length_x, num_heads, dim_x/num_heads] 69 | ''' 70 | x_shape = shape_list(x) 71 | dim = x_shape[-1] 72 | assert dim % num_heads == 0 73 | return tf.reshape(x, x_shape[:-1] + [num_heads, dim // num_heads]) 74 | 75 | def _dot_product(self, qs, ks, vs): 76 | '''dot-product computation 77 | Returns: 78 | a context vector with shape [batch, num_heads, length_q, dim_vs] 79 | ''' 80 | qk = tf.matmul(qs, ks, transpose_b=True) 81 | scale_factor = (self.num_units // self.num_heads)**-0.5 82 | if self.normalize: 83 | qk *= scale_factor 84 | weights = tf.nn.softmax(qk, name="dot_attention_weights") 85 | context = tf.matmul(weights, vs) 86 | return context 87 | 88 | def _mlp_attention(self, qs, ks, vs): 89 | '''MLP computation modified from https://github.com/npuichigo 90 | Returns: 91 | a context vector with shape [batch, num_heads, length_q, dim_vs] 92 | ''' 93 | num_units = qs.get_shape()[-1].value 94 | dtype = qs.dtype 95 | 96 | v = tf.get_variable("attention_v", [num_units], dtype=dtype) 97 | if self.normalize: 98 | #https://github.com/tensorflow/tensorflow/blob/r1.7/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py#L470 99 | # Scalar used in weight normalization 100 | g = 
tf.get_variable( 101 | "attention_g", dtype=dtype, 102 | initializer=math.sqrt((1. / num_units))) 103 | # Bias added prior to the nonlinearity 104 | b = tf.get_variable( 105 | "attention_b", [num_units], dtype=dtype, 106 | initializer=tf.zeros_initializer()) 107 | # normed_v = g * v / ||v|| 108 | normed_v = g * v * tf.rsqrt( 109 | tf.reduce_sum(tf.square(v))) 110 | # Single layer multilayer perceptron. 111 | add = tf.reduce_sum(normed_v * tf.tanh(ks + qs + b), [-1], keep_dims=True) 112 | else: 113 | # Single layer multilayer perceptron. 114 | add = tf.reduce_sum(v * tf.tanh(ks + qs), [-1], keep_dims=True) 115 | 116 | # Compute attention weights. 117 | weights = tf.nn.softmax(tf.transpose(add, [0, 1, 3, 2]), name="mlp_attention_weights") 118 | # Compute attention context. 119 | context = tf.matmul(weights, vs) 120 | return context 121 | 122 | def _combine_heads(self, x): 123 | '''Combine all heads 124 | Returns: 125 | a Tensor with shape [batch, length_x, shape_x[-1] * shape_x[-3]] 126 | ''' 127 | x = tf.transpose(x, [0, 2, 1, 3]) 128 | x_shape = shape_list(x) 129 | return tf.reshape(x, x_shape[:-2] + [self.num_heads * x_shape[-1]]) 130 | -------------------------------------------------------------------------------- /tacotron/models/custom_decoder.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import collections 6 | import tensorflow as tf 7 | 8 | from tensorflow.contrib.seq2seq.python.ops import decoder 9 | from tensorflow.contrib.seq2seq.python.ops import helper as helper_py 10 | from tensorflow.python.framework import ops 11 | from tensorflow.python.framework import tensor_shape 12 | from tensorflow.python.layers import base as layers_base 13 | from tensorflow.python.ops import rnn_cell_impl 14 | from tensorflow.python.util import nest 15 | from tacotron.models.helpers import TacoTrainingHelper, TacoTestHelper 16 | 17 | 18 | 19 | class CustomDecoderOutput( 20 | collections.namedtuple("CustomDecoderOutput", ("rnn_output", "token_output", "sample_id"))): 21 | pass 22 | 23 | 24 | class CustomDecoder(decoder.Decoder): 25 | """Custom sampling decoder. 26 | 27 | Allows for stop token prediction at inference time 28 | and returns equivalent loss in training time. 29 | 30 | Note: 31 | Only use this decoder with Tacotron 2 as it only accepts tacotron custom helpers 32 | """ 33 | 34 | def __init__(self, cell, helper, initial_state, output_layer=None): 35 | """Initialize CustomDecoder. 36 | Args: 37 | cell: An `RNNCell` instance. 38 | helper: A `Helper` instance. 39 | initial_state: A (possibly nested tuple of...) tensors and TensorArrays. 40 | The initial state of the RNNCell. 41 | output_layer: (Optional) An instance of `tf.layers.Layer`, i.e., 42 | `tf.layers.Dense`. Optional layer to apply to the RNN output prior 43 | to storing the result or sampling. 44 | Raises: 45 | TypeError: if `cell`, `helper` or `output_layer` have an incorrect type. 
46 | """ 47 | if not rnn_cell_impl._like_rnncell(cell): # pylint: disable=protected-access 48 | raise TypeError("cell must be an RNNCell, received: %s" % type(cell)) 49 | if not isinstance(helper, helper_py.Helper): 50 | raise TypeError("helper must be a Helper, received: %s" % type(helper)) 51 | if (output_layer is not None 52 | and not isinstance(output_layer, layers_base.Layer)): 53 | raise TypeError( 54 | "output_layer must be a Layer, received: %s" % type(output_layer)) 55 | self._cell = cell 56 | self._helper = helper 57 | self._initial_state = initial_state 58 | self._output_layer = output_layer 59 | 60 | @property 61 | def batch_size(self): 62 | return self._helper.batch_size 63 | 64 | def _rnn_output_size(self): 65 | size = self._cell.output_size 66 | if self._output_layer is None: 67 | return size 68 | else: 69 | # To use layer's compute_output_shape, we need to convert the 70 | # RNNCell's output_size entries into shapes with an unknown 71 | # batch size. We then pass this through the layer's 72 | # compute_output_shape and read off all but the first (batch) 73 | # dimensions to get the output size of the rnn with the layer 74 | # applied to the top. 75 | output_shape_with_unknown_batch = nest.map_structure( 76 | lambda s: tensor_shape.TensorShape([None]).concatenate(s), 77 | size) 78 | layer_output_shape = self._output_layer._compute_output_shape( # pylint: disable=protected-access 79 | output_shape_with_unknown_batch) 80 | return nest.map_structure(lambda s: s[1:], layer_output_shape) 81 | 82 | @property 83 | def output_size(self): 84 | # Return the cell output and the id 85 | return CustomDecoderOutput( 86 | rnn_output=self._rnn_output_size(), 87 | token_output=self._helper.token_output_size, 88 | sample_id=self._helper.sample_ids_shape) 89 | 90 | @property 91 | def output_dtype(self): 92 | # Assume the dtype of the cell is the output_size structure 93 | # containing the input_state's first component's dtype. 94 | # Return that structure and the sample_ids_dtype from the helper. 95 | dtype = nest.flatten(self._initial_state)[0].dtype 96 | return CustomDecoderOutput( 97 | nest.map_structure(lambda _: dtype, self._rnn_output_size()), 98 | tf.float32, 99 | self._helper.sample_ids_dtype) 100 | 101 | def initialize(self, name=None): 102 | """Initialize the decoder. 103 | Args: 104 | name: Name scope for any created operations. 105 | Returns: 106 | `(finished, first_inputs, initial_state)`. 107 | """ 108 | return self._helper.initialize() + (self._initial_state,) 109 | 110 | def step(self, time, inputs, state, name=None): 111 | """Perform a custom decoding step. 112 | Enables for dyanmic prediction 113 | Args: 114 | time: scalar `int32` tensor. 115 | inputs: A (structure of) input tensors. 116 | state: A (structure of) state tensors and TensorArrays. 117 | name: Name scope for any created operations. 118 | Returns: 119 | `(outputs, next_state, next_inputs, finished)`. 
120 | """ 121 | with ops.name_scope(name, "CustomDecoderStep", (time, inputs, state)): 122 | #Call outputprojection wrapper cell 123 | (cell_outputs, stop_token), cell_state = self._cell(inputs, state) 124 | 125 | #apply output_layer (if existant) 126 | if self._output_layer is not None: 127 | cell_outputs = self._output_layer(cell_outputs) 128 | sample_ids = self._helper.sample( 129 | time=time, outputs=cell_outputs, state=cell_state) 130 | 131 | (finished, next_inputs, next_state) = self._helper.next_inputs( 132 | time=time, 133 | outputs=cell_outputs, 134 | state=cell_state, 135 | sample_ids=sample_ids, 136 | stop_token_prediction=stop_token) 137 | 138 | outputs = CustomDecoderOutput(cell_outputs, stop_token, sample_ids) 139 | return (outputs, next_state, next_inputs, finished) -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import tensorflow as tf 3 | from tacotron.train import tacotron_train 4 | from wavenet_vocoder.train import wavenet_train 5 | from tacotron.synthesize import tacotron_synthesize 6 | from infolog import log 7 | from hparams import hparams 8 | import os 9 | import infolog 10 | from time import sleep 11 | 12 | log = infolog.log 13 | 14 | 15 | def save_seq(file, sequence, input_path): 16 | '''Save Tacotron-2 training state to disk. (To skip for future runs) 17 | ''' 18 | sequence = [str(int(s)) for s in sequence] + [input_path] 19 | with open(file, 'w') as f: 20 | f.write('|'.join(sequence)) 21 | 22 | def read_seq(file, restore): 23 | '''Load Tacotron-2 training state from disk. (To skip if not first run) 24 | ''' 25 | if os.path.isfile(file) and restore == True: 26 | with open(file, 'r') as f: 27 | sequence = f.read().split('|') 28 | 29 | return [bool(int(s)) for s in sequence[:-1]], sequence[-1] 30 | else: 31 | return [0, 0, 0], '' 32 | 33 | def prepare_run(args): 34 | modified_hp = hparams.parse(args.hparams) 35 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = str(args.tf_log_level) 36 | run_name = args.name or args.model 37 | log_dir = os.path.join(args.base_dir, 'logs-{}'.format(run_name)) 38 | os.makedirs(log_dir, exist_ok=True) 39 | infolog.init(os.path.join(log_dir, 'Terminal_train_log'), run_name) 40 | return log_dir, modified_hp 41 | 42 | def train(args, log_dir, hparams): 43 | state_file = os.path.join(log_dir, 'state_log') 44 | #Get training states 45 | (taco_state, GTA_state, wave_state), input_path = read_seq(state_file, args.restore) 46 | 47 | if not taco_state: 48 | log('\n#############################################################\n') 49 | log('Tacotron Train\n') 50 | log('###########################################################\n') 51 | checkpoint = tacotron_train(args, log_dir, hparams) 52 | tf.reset_default_graph() 53 | #Sleep 1 second to let previous graph close and avoid error messages while synthesis 54 | sleep(1) 55 | if checkpoint is None: 56 | raise('Error occured while training Tacotron, Exiting!') 57 | taco_state = 1 58 | save_seq(state_file, [taco_state, GTA_state, wave_state], input_path) 59 | 60 | if not GTA_state: 61 | log('\n#############################################################\n') 62 | log('Tacotron GTA Synthesis\n') 63 | log('###########################################################\n') 64 | input_path = tacotron_synthesize(args, hparams, checkpoint) 65 | GTA_state = 1 66 | save_seq(state_file, [taco_state, GTA_state, wave_state], input_path) 67 | 68 | if input_path == '' or 
input_path is None: 69 | raise RuntimeError('input_path has an unpleasant value -> {}'.format(input_path)) 70 | 71 | if not wave_state: 72 | log('\n#############################################################\n') 73 | log('Wavenet Train\n') 74 | log('###########################################################\n') 75 | checkpoint = wavenet_train(args, log_dir, hparams, input_path) 76 | if checkpoint is None: 77 | raise ('Error occured while training Wavenet, Exiting!') 78 | wave_state = 1 79 | save_seq(state_file, [taco_state, GTA_state, wave_state], input_path) 80 | 81 | if wave_state and GTA_state and taco_state: 82 | log('TRAINING IS ALREADY COMPLETE!!') 83 | 84 | def main(): 85 | parser = argparse.ArgumentParser() 86 | parser.add_argument('--base_dir', default='') 87 | parser.add_argument('--hparams', default='', 88 | help='Hyperparameter overrides as a comma-separated list of name=value pairs') 89 | parser.add_argument('--tacotron_input', default='training_data/train.txt') 90 | parser.add_argument('--wavenet_input', default='tacotron_output/gta/map.txt') 91 | parser.add_argument('--name', help='Name of logging directory.') 92 | parser.add_argument('--model', default='Tacotron-2') 93 | parser.add_argument('--input_dir', default='training_data/', help='folder to contain inputs sentences/targets') 94 | parser.add_argument('--output_dir', default='output/', help='folder to contain synthesized mel spectrograms') 95 | parser.add_argument('--mode', default='synthesis', help='mode for synthesis of tacotron after training') 96 | parser.add_argument('--GTA', default='True', help='Ground truth aligned synthesis, defaults to True, only considered in Tacotron synthesis mode') 97 | parser.add_argument('--restore', type=bool, default=False, help='Set this to True to resume training') 98 | parser.add_argument('--summary_interval', type=int, default=250, 99 | help='Steps between running summary ops') 100 | parser.add_argument('--checkpoint_interval', type=int, default=500, 101 | help='Steps between writing checkpoints') 102 | parser.add_argument('--eval_interval', type=int, default=5000, 103 | help='Steps between eval on test data') 104 | parser.add_argument('--tacotron_train_steps', type=int, default=100000, help='total number of tacotron training steps') 105 | parser.add_argument('--wavenet_train_steps', type=int, default=100000, help='total number of wavenet training steps') 106 | parser.add_argument('--tf_log_level', type=int, default=1, help='Tensorflow C++ log level.') 107 | args = parser.parse_args() 108 | 109 | accepted_models = ['Tacotron', 'WaveNet', 'Both', 'Tacotron-2'] 110 | 111 | if args.model not in accepted_models: 112 | raise ValueError('please enter a valid model to train: {}'.format(accepted_models)) 113 | 114 | log_dir, hparams = prepare_run(args) 115 | 116 | if args.model == 'Tacotron': 117 | tacotron_train(args, log_dir, hparams) 118 | elif args.model == 'WaveNet': 119 | wavenet_train(args, log_dir, hparams, args.wavenet_input) 120 | elif args.model in ('Both', 'Tacotron-2'): 121 | train(args, log_dir, hparams) 122 | else: 123 | raise ValueError('Model provided {} unknown! 
{}'.format(args.model, accepted_models)) 124 | 125 | 126 | if __name__ == '__main__': 127 | main() -------------------------------------------------------------------------------- /datasets/preprocessor.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ProcessPoolExecutor 2 | from functools import partial 3 | from datasets import audio 4 | import os 5 | import numpy as np 6 | from wavenet_vocoder.util import mulaw_quantize, mulaw, is_mulaw, is_mulaw_quantize 7 | 8 | 9 | def build_from_path(hparams, input_dirs, mel_dir, linear_dir, wav_dir, n_jobs=12, tqdm=lambda x: x): 10 | """ 11 | Preprocesses the speech dataset from a gven input path to given output directories 12 | 13 | Args: 14 | - hparams: hyper parameters 15 | - input_dir: input directory that contains the files to prerocess 16 | - mel_dir: output directory of the preprocessed speech mel-spectrogram dataset 17 | - linear_dir: output directory of the preprocessed speech linear-spectrogram dataset 18 | - wav_dir: output directory of the preprocessed speech audio dataset 19 | - n_jobs: Optional, number of worker process to parallelize across 20 | - tqdm: Optional, provides a nice progress bar 21 | 22 | Returns: 23 | - A list of tuple describing the train examples. this should be written to train.txt 24 | """ 25 | 26 | # We use ProcessPoolExecutor to parallelize across processes, this is just for 27 | # optimization purposes and it can be omited 28 | executor = ProcessPoolExecutor(max_workers=n_jobs) 29 | futures = [] 30 | index = 1 31 | for input_dir in input_dirs: 32 | with open(os.path.join(input_dir, 'metadata.csv'), encoding='utf-8') as f: 33 | for line in f: 34 | parts = line.strip().split('|') 35 | wav_path = os.path.join(input_dir, 'wavs', '{}.wav'.format(parts[0])) 36 | text = parts[2] 37 | futures.append(executor.submit(partial(_process_utterance, mel_dir, linear_dir, wav_dir, index, wav_path, text, hparams))) 38 | index += 1 39 | 40 | return [future.result() for future in tqdm(futures) if future.result() is not None] 41 | 42 | 43 | def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text, hparams): 44 | """ 45 | Preprocesses a single utterance wav/text pair 46 | 47 | this writes the mel scale spectogram to disk and return a tuple to write 48 | to the train.txt file 49 | 50 | Args: 51 | - mel_dir: the directory to write the mel spectograms into 52 | - linear_dir: the directory to write the linear spectrograms into 53 | - wav_dir: the directory to write the preprocessed wav into 54 | - index: the numeric index to use in the spectogram filename 55 | - wav_path: path to the audio file containing the speech input 56 | - text: text spoken in the input audio file 57 | - hparams: hyper parameters 58 | 59 | Returns: 60 | - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text) 61 | """ 62 | try: 63 | # Load the audio as numpy array 64 | wav = audio.load_wav(wav_path, sr=hparams.sample_rate) 65 | except FileNotFoundError: #catch missing wav exception 66 | print('file {} present in csv metadata is not present in wav folder. 
skipping!'.format( 67 | wav_path)) 68 | return None 69 | 70 | #rescale wav 71 | if hparams.rescale: 72 | wav = wav / np.abs(wav).max() * hparams.rescaling_max 73 | 74 | #M-AILABS extra silence specific 75 | if hparams.trim_silence: 76 | wav = audio.trim_silence(wav, hparams) 77 | 78 | #Mu-law quantize 79 | if is_mulaw_quantize(hparams.input_type): 80 | #[0, quantize_channels) 81 | out = mulaw_quantize(wav, hparams.quantize_channels) 82 | 83 | #Trim silences 84 | start, end = audio.start_and_end_indices(out, hparams.silence_threshold) 85 | wav = wav[start: end] 86 | out = out[start: end] 87 | 88 | constant_values = mulaw_quantize(0, hparams.quantize_channels) 89 | out_dtype = np.int16 90 | 91 | elif is_mulaw(hparams.input_type): 92 | #[-1, 1] 93 | out = mulaw(wav, hparams.quantize_channels) 94 | constant_values = mulaw(0., hparams.quantize_channels) 95 | out_dtype = np.float32 96 | 97 | else: 98 | #[-1, 1] 99 | out = wav 100 | constant_values = 0. 101 | out_dtype = np.float32 102 | 103 | # Compute the mel scale spectrogram from the wav 104 | mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32) 105 | mel_frames = mel_spectrogram.shape[1] 106 | 107 | if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length: 108 | return None 109 | 110 | #Compute the linear scale spectrogram from the wav 111 | linear_spectrogram = audio.linearspectrogram(wav, hparams).astype(np.float32) 112 | linear_frames = linear_spectrogram.shape[1] 113 | 114 | #sanity check 115 | assert linear_frames == mel_frames 116 | 117 | #Ensure time resolution adjustement between audio and mel-spectrogram 118 | fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size 119 | l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams)) 120 | 121 | #Zero pad for quantized signal 122 | out = np.pad(out, (l, r), mode='constant', constant_values=constant_values) 123 | assert len(out) >= mel_frames * audio.get_hop_size(hparams) 124 | 125 | #time resolution adjustement 126 | #ensure length of raw audio is multiple of hop size so that we can use 127 | #transposed convolution to upsample 128 | out = out[:mel_frames * audio.get_hop_size(hparams)] 129 | assert len(out) % audio.get_hop_size(hparams) == 0 130 | time_steps = len(out) 131 | 132 | # Write the spectrogram and audio to disk 133 | audio_filename = 'speech-audio-{:05d}.npy'.format(index) 134 | mel_filename = 'speech-mel-{:05d}.npy'.format(index) 135 | linear_filename = 'speech-linear-{:05d}.npy'.format(index) 136 | np.save(os.path.join(wav_dir, audio_filename), out.astype(out_dtype), allow_pickle=False) 137 | np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False) 138 | np.save(os.path.join(linear_dir, linear_filename), linear_spectrogram.T, allow_pickle=False) 139 | 140 | # Return a tuple describing this training example 141 | return (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text) -------------------------------------------------------------------------------- /wavenet_vocoder/util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import librosa.display as dsp 4 | import matplotlib.pyplot as plt 5 | 6 | 7 | def _assert_valid_input_type(s): 8 | assert s == 'mulaw-quantize' or s == 'mulaw' or s == 'raw' 9 | 10 | def is_mulaw_quantize(s): 11 | _assert_valid_input_type(s) 12 | return s == 'mulaw-quantize' 13 | 14 | def is_mulaw(s): 15 | _assert_valid_input_type(s) 16 | return s == 'mulaw' 17 | 18 
| def is_raw(s): 19 | _assert_valid_input_type(s) 20 | return s == 'raw' 21 | 22 | def is_scalar_input(s): 23 | return is_raw(s) or is_mulaw(s) 24 | 25 | 26 | #From https://github.com/r9y9/nnmnkwii/blob/master/nnmnkwii/preprocessing/generic.py 27 | def mulaw(x, mu=256): 28 | """Mu-Law companding 29 | Method described in paper [1]_. 30 | .. math:: 31 | f(x) = sign(x) ln (1 + mu |x|) / ln (1 + mu) 32 | Args: 33 | x (array-like): Input signal. Each value of input signal must be in 34 | range of [-1, 1]. 35 | mu (number): Compression parameter ``μ``. 36 | Returns: 37 | array-like: Compressed signal ([-1, 1]) 38 | See also: 39 | :func:`nnmnkwii.preprocessing.inv_mulaw` 40 | :func:`nnmnkwii.preprocessing.mulaw_quantize` 41 | :func:`nnmnkwii.preprocessing.inv_mulaw_quantize` 42 | .. [1] Brokish, Charles W., and Michele Lewis. "A-law and mu-law companding 43 | implementations using the tms320c54x." SPRA163 (1997). 44 | """ 45 | mu -= 1 46 | return _sign(x) * _log1p(mu * _abs(x)) / _log1p(mu) 47 | 48 | 49 | def inv_mulaw(y, mu=256): 50 | """Inverse of mu-law companding (mu-law expansion) 51 | .. math:: 52 | f^{-1}(x) = sign(y) (1 / mu) (1 + mu)^{|y|} - 1) 53 | Args: 54 | y (array-like): Compressed signal. Each value of input signal must be in 55 | range of [-1, 1]. 56 | mu (number): Compression parameter ``μ``. 57 | Returns: 58 | array-like: Uncomprresed signal (-1 <= x <= 1) 59 | See also: 60 | :func:`nnmnkwii.preprocessing.inv_mulaw` 61 | :func:`nnmnkwii.preprocessing.mulaw_quantize` 62 | :func:`nnmnkwii.preprocessing.inv_mulaw_quantize` 63 | """ 64 | mu -= 1 65 | return _sign(y) * (1.0 / mu) * ((1.0 + mu)**_abs(y) - 1.0) 66 | 67 | 68 | def mulaw_quantize(x, mu=256): 69 | """Mu-Law companding + quantize 70 | Args: 71 | x (array-like): Input signal. Each value of input signal must be in 72 | range of [-1, 1]. 73 | mu (number): Compression parameter ``μ``. 74 | Returns: 75 | array-like: Quantized signal (dtype=int) 76 | - y ∈ [0, mu] if x ∈ [-1, 1] 77 | - y ∈ [0, mu) if x ∈ [-1, 1) 78 | .. note:: 79 | If you want to get quantized values of range [0, mu) (not [0, mu]), 80 | then you need to provide input signal of range [-1, 1). 81 | Examples: 82 | >>> from scipy.io import wavfile 83 | >>> import pysptk 84 | >>> import numpy as np 85 | >>> from nnmnkwii import preprocessing as P 86 | >>> fs, x = wavfile.read(pysptk.util.example_audio_file()) 87 | >>> x = (x / 32768.0).astype(np.float32) 88 | >>> y = P.mulaw_quantize(x) 89 | >>> print(y.min(), y.max(), y.dtype) 90 | 15 246 int64 91 | See also: 92 | :func:`nnmnkwii.preprocessing.mulaw` 93 | :func:`nnmnkwii.preprocessing.inv_mulaw` 94 | :func:`nnmnkwii.preprocessing.inv_mulaw_quantize` 95 | """ 96 | mu -= 1 97 | y = mulaw(x, mu) 98 | # scale [-1, 1] to [0, mu] 99 | return _asint((y + 1) / 2 * mu) 100 | 101 | 102 | def inv_mulaw_quantize(y, mu=255): 103 | """Inverse of mu-law companding + quantize 104 | Args: 105 | y (array-like): Quantized signal (∈ [0, mu]). 106 | mu (number): Compression parameter ``μ``. 
107 | Returns: 108 | array-like: Uncompressed signal ([-1, 1]) 109 | Examples: 110 | >>> from scipy.io import wavfile 111 | >>> import pysptk 112 | >>> import numpy as np 113 | >>> from nnmnkwii import preprocessing as P 114 | >>> fs, x = wavfile.read(pysptk.util.example_audio_file()) 115 | >>> x = (x / 32768.0).astype(np.float32) 116 | >>> x_hat = P.inv_mulaw_quantize(P.mulaw_quantize(x)) 117 | >>> x_hat = (x_hat * 32768).astype(np.int16) 118 | See also: 119 | :func:`nnmnkwii.preprocessing.mulaw` 120 | :func:`nnmnkwii.preprocessing.inv_mulaw` 121 | :func:`nnmnkwii.preprocessing.mulaw_quantize` 122 | """ 123 | # [0, m) to [-1, 1] 124 | mu -= 1 125 | y = 2 * _asfloat(y) / mu - 1 126 | return inv_mulaw(y, mu) 127 | 128 | def _sign(x): 129 | #wrapper to support tensorflow tensors/numpy arrays 130 | isnumpy = isinstance(x, np.ndarray) 131 | isscalar = np.isscalar(x) 132 | return np.sign(x) if (isnumpy or isscalar) else tf.sign(x) 133 | 134 | 135 | def _log1p(x): 136 | #wrapper to support tensorflow tensors/numpy arrays 137 | isnumpy = isinstance(x, np.ndarray) 138 | isscalar = np.isscalar(x) 139 | return np.log1p(x) if (isnumpy or isscalar) else tf.log1p(x) 140 | 141 | 142 | def _abs(x): 143 | #wrapper to support tensorflow tensors/numpy arrays 144 | isnumpy = isinstance(x, np.ndarray) 145 | isscalar = np.isscalar(x) 146 | return np.abs(x) if (isnumpy or isscalar) else tf.abs(x) 147 | 148 | 149 | def _asint(x): 150 | #wrapper to support tensorflow tensors/numpy arrays 151 | isnumpy = isinstance(x, np.ndarray) 152 | isscalar = np.isscalar(x) 153 | return x.astype(np.int) if isnumpy else int(x) if isscalar else tf.cast(x, tf.int32) 154 | 155 | 156 | def _asfloat(x): 157 | #wrapper to support tensorflow tensors/numpy arrays 158 | isnumpy = isinstance(x, np.ndarray) 159 | isscalar = np.isscalar(x) 160 | return x.astype(np.float32) if isnumpy else float(x) if isscalar else tf.cast(x, tf.float32) 161 | 162 | def sequence_mask(input_lengths, max_len=None, expand=True): 163 | if max_len is None: 164 | max_len = tf.reduce_max(input_lengths) 165 | 166 | if expand: 167 | return tf.expand_dims(tf.sequence_mask(input_lengths, max_len, dtype=tf.float32), axis=-1) 168 | return tf.sequence_mask(input_lengths, max_len, dtype=tf.float32) 169 | 170 | 171 | def waveplot(path, y_hat, y_target, hparams): 172 | sr = hparams.sample_rate 173 | 174 | plt.figure(figsize=(12, 4)) 175 | if y_target is not None: 176 | ax = plt.subplot(2, 1, 1) 177 | dsp.waveplot(y_target, sr=sr) 178 | ax.set_title('Target waveform') 179 | ax = plt.subplot(2, 1, 2) 180 | dsp.waveplot(y_hat, sr=sr) 181 | ax.set_title('Prediction waveform') 182 | else: 183 | ax = plt.subplot(1, 1, 1) 184 | dsp.waveplot(y_hat, sr=sr) 185 | ax.set_title('Generated waveform') 186 | 187 | plt.tight_layout() 188 | plt.savefig(path, format="png") 189 | plt.close() -------------------------------------------------------------------------------- /tacotron/models/helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from tensorflow.contrib.seq2seq import Helper 4 | 5 | 6 | class TacoTestHelper(Helper): 7 | def __init__(self, batch_size, hparams): 8 | with tf.name_scope('TacoTestHelper'): 9 | self._batch_size = batch_size 10 | self._output_dim = hparams.num_mels 11 | self._reduction_factor = hparams.outputs_per_step 12 | self.stop_at_any = hparams.stop_at_any 13 | 14 | @property 15 | def batch_size(self): 16 | return self._batch_size 17 | 18 | @property 19 | def 
token_output_size(self): 20 | return self._reduction_factor 21 | 22 | @property 23 | def sample_ids_shape(self): 24 | return tf.TensorShape([]) 25 | 26 | @property 27 | def sample_ids_dtype(self): 28 | return np.int32 29 | 30 | def initialize(self, name=None): 31 | return (tf.tile([False], [self._batch_size]), _go_frames(self._batch_size, self._output_dim)) 32 | 33 | def sample(self, time, outputs, state, name=None): 34 | return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them 35 | 36 | def next_inputs(self, time, outputs, state, sample_ids, stop_token_prediction, name=None): 37 | '''Stop on EOS. Otherwise, pass the last output as the next input and pass through state.''' 38 | with tf.name_scope('TacoTestHelper'): 39 | #A sequence is finished when the output probability is > 0.5 40 | finished = tf.cast(tf.round(stop_token_prediction), tf.bool) 41 | 42 | #Since we are predicting r frames at each step, two modes are 43 | #then possible: 44 | # Stop when the model outputs a p > 0.5 for any frame between r frames (Recommended) 45 | # Stop when the model outputs a p > 0.5 for all r frames (Safer) 46 | #Note: 47 | # With enough training steps, the model should be able to predict when to stop correctly 48 | # and the use of stop_at_any = True would be recommended. If however the model didn't 49 | # learn to stop correctly yet, (stops too soon) one could choose to use the safer option 50 | # to get a correct synthesis 51 | if self.stop_at_any: 52 | finished = tf.reduce_any(finished) #Recommended 53 | else: 54 | finished = tf.reduce_all(finished) #Safer option 55 | 56 | # Feed last output frame as next input. outputs is [N, output_dim * r] 57 | next_inputs = outputs[:, -self._output_dim:] 58 | next_state = state 59 | return (finished, next_inputs, next_state) 60 | 61 | 62 | class TacoTrainingHelper(Helper): 63 | def __init__(self, batch_size, targets, stop_targets, hparams, gta, evaluating, global_step): 64 | # inputs is [N, T_in], targets is [N, T_out, D] 65 | with tf.name_scope('TacoTrainingHelper'): 66 | self._batch_size = batch_size 67 | self._output_dim = hparams.num_mels 68 | self._reduction_factor = hparams.outputs_per_step 69 | self._ratio = tf.convert_to_tensor(hparams.tacotron_teacher_forcing_ratio) 70 | self.gta = gta 71 | self.eval = evaluating 72 | self._hparams = hparams 73 | self.global_step = global_step 74 | 75 | r = self._reduction_factor 76 | # Feed every r-th target frame as input 77 | self._targets = targets[:, r-1::r, :] 78 | 79 | #Maximal sequence length 80 | self._lengths = tf.tile([tf.shape(self._targets)[1]], [self._batch_size]) 81 | 82 | @property 83 | def batch_size(self): 84 | return self._batch_size 85 | 86 | @property 87 | def token_output_size(self): 88 | return self._reduction_factor 89 | 90 | @property 91 | def sample_ids_shape(self): 92 | return tf.TensorShape([]) 93 | 94 | @property 95 | def sample_ids_dtype(self): 96 | return np.int32 97 | 98 | def initialize(self, name=None): 99 | #Compute teacher forcing ratio for this global step. 100 | #In GTA mode, override teacher forcing scheme to work with full teacher forcing 101 | if self.gta: 102 | self._ratio = tf.convert_to_tensor(1.) #Force GTA model to always feed ground-truth 103 | elif self.eval and self._hparams.natural_eval: 104 | self._ratio = tf.convert_to_tensor(0.) 
#Force eval model to always feed predictions 105 | else: 106 | if self._hparams.tacotron_teacher_forcing_mode == 'scheduled': 107 | self._ratio = _teacher_forcing_ratio_decay(self._hparams.tacotron_teacher_forcing_init_ratio, 108 | self.global_step, self._hparams) 109 | 110 | return (tf.tile([False], [self._batch_size]), _go_frames(self._batch_size, self._output_dim)) 111 | 112 | def sample(self, time, outputs, state, name=None): 113 | return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them 114 | 115 | def next_inputs(self, time, outputs, state, sample_ids, stop_token_prediction, name=None): 116 | with tf.name_scope(name or 'TacoTrainingHelper'): 117 | #synthesis stop (we let the model see paddings as we mask them when computing loss functions) 118 | finished = (time + 1 >= self._lengths) 119 | 120 | #Pick previous outputs randomly with respect to teacher forcing ratio 121 | next_inputs = tf.cond( 122 | tf.less(tf.random_uniform([], minval=0, maxval=1, dtype=tf.float32), self._ratio), 123 | lambda: self._targets[:, time, :], #Teacher-forcing: return true frame 124 | lambda: outputs[:,-self._output_dim:]) 125 | 126 | #Pass on state 127 | next_state = state 128 | return (finished, next_inputs, next_state) 129 | 130 | 131 | def _go_frames(batch_size, output_dim): 132 | '''Returns all-zero frames for a given batch size and output dimension''' 133 | return tf.tile([[0.0]], [batch_size, output_dim]) 134 | 135 | def _teacher_forcing_ratio_decay(init_tfr, global_step, hparams): 136 | ################################################################# 137 | # Narrow Cosine Decay: 138 | 139 | # Phase 1: tfr = 1 140 | # We only start learning rate decay after 10k steps 141 | 142 | # Phase 2: tfr in ]0, 1[ 143 | # decay reach minimal value at step ~280k 144 | 145 | # Phase 3: tfr = 0 146 | # clip by minimal teacher forcing ratio value (step >~ 280k) 147 | ################################################################# 148 | #Compute natural cosine decay 149 | tfr = tf.train.cosine_decay(init_tfr, 150 | global_step=global_step - hparams.tacotron_teacher_forcing_start_decay, #tfr = 1 at step 10k 151 | decay_steps=hparams.tacotron_teacher_forcing_decay_steps, #tfr = 0 at step ~280k 152 | alpha=hparams.tacotron_teacher_forcing_decay_alpha, #tfr = 0% of init_tfr as final value 153 | name='tfr_cosine_decay') 154 | 155 | #force teacher forcing ratio to take initial value when global step < start decay step. 
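	#Illustrative sketch of the resulting schedule (assuming the values described in the
	#comments above: decay starting near 10k steps, minimum reached around 280k steps, small alpha):
	#  tfr(step) = init_tfr * ((1 - alpha) * 0.5 * (1 + cos(pi * (step - start_decay) / decay_steps)) + alpha)
	#  step <= start_decay                      -> tfr = init_tfr (forced by the tf.cond below)
	#  step at the decay midpoint               -> tfr ~= 0.5 * init_tfr
	#  step >= start_decay + decay_steps        -> tfr = alpha * init_tfr (clipped minimum)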
156 | narrow_tfr = tf.cond( 157 | tf.less(global_step, tf.convert_to_tensor(hparams.tacotron_teacher_forcing_start_decay)), 158 | lambda: tf.convert_to_tensor(init_tfr), 159 | lambda: tfr) 160 | 161 | return narrow_tfr -------------------------------------------------------------------------------- /datasets/audio.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import librosa.filters 3 | import numpy as np 4 | from scipy import signal 5 | import tensorflow as tf 6 | from scipy.io import wavfile 7 | 8 | 9 | def load_wav(path, sr): 10 | return librosa.core.load(path, sr=sr)[0] 11 | 12 | def save_wav(wav, path, sr): 13 | wav *= 32767 / max(0.01, np.max(np.abs(wav))) 14 | #proposed by @dsmiller 15 | wavfile.write(path, sr, wav.astype(np.int16)) 16 | 17 | #From https://github.com/r9y9/wavenet_vocoder/blob/master/audio.py 18 | def start_and_end_indices(quantized, silence_threshold=2): 19 | for start in range(quantized.size): 20 | if abs(quantized[start] - 127) > silence_threshold: 21 | break 22 | for end in range(quantized.size - 1, 1, -1): 23 | if abs(quantized[end] - 127) > silence_threshold: 24 | break 25 | 26 | assert abs(quantized[start] - 127) > silence_threshold 27 | assert abs(quantized[end] - 127) > silence_threshold 28 | 29 | return start, end 30 | 31 | def trim_silence(wav, hparams): 32 | '''Trim leading and trailing silence 33 | 34 | Useful for M-AILABS dataset if we choose to trim the extra 0.5 silence at beginning and end. 35 | ''' 36 | #Thanks @begeekmyfriend and @lautjy for pointing out the params contradiction. These params are separate and tunable per dataset. 37 | return librosa.effects.trim(wav, top_db= hparams.trim_top_db, frame_length=hparams.trim_fft_size, hop_length=hparams.trim_hop_size)[0] 38 | 39 | def get_hop_size(hparams): 40 | hop_size = hparams.hop_size 41 | if hop_size is None: 42 | assert hparams.frame_shift_ms is not None 43 | hop_size = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate) 44 | return hop_size 45 | 46 | def linearspectrogram(wav, hparams): 47 | D = _stft(wav, hparams) 48 | S = _amp_to_db(np.abs(D), hparams) - hparams.ref_level_db 49 | 50 | if hparams.signal_normalization: 51 | return _normalize(S, hparams) 52 | return S 53 | 54 | def melspectrogram(wav, hparams): 55 | D = _stft(wav, hparams) 56 | S = _amp_to_db(_linear_to_mel(np.abs(D), hparams), hparams) - hparams.ref_level_db 57 | 58 | if hparams.signal_normalization: 59 | return _normalize(S, hparams) 60 | return S 61 | 62 | def inv_linear_spectrogram(linear_spectrogram, hparams): 63 | '''Converts linear spectrogram to waveform using librosa''' 64 | if hparams.signal_normalization: 65 | D = _denormalize(linear_spectrogram, hparams) 66 | else: 67 | D = linear_spectrogram 68 | 69 | S = _db_to_amp(D + hparams.ref_level_db) #Convert back to linear 70 | 71 | if hparams.use_lws: 72 | processor = _lws_processor(hparams) 73 | D = processor.run_lws(S.astype(np.float64).T ** hparams.power) 74 | y = processor.istft(D).astype(np.float32) 75 | return y 76 | else: 77 | return _griffin_lim(S ** hparams.power, hparams) 78 | 79 | 80 | def inv_mel_spectrogram(mel_spectrogram, hparams): 81 | '''Converts mel spectrogram to waveform using librosa''' 82 | if hparams.signal_normalization: 83 | D = _denormalize(mel_spectrogram, hparams) 84 | else: 85 | D = mel_spectrogram 86 | 87 | S = _mel_to_linear(_db_to_amp(D + hparams.ref_level_db), hparams) # Convert back to linear 88 | 89 | if hparams.use_lws: 90 | processor = _lws_processor(hparams) 91 | D = 
processor.run_lws(S.astype(np.float64).T ** hparams.power) 92 | y = processor.istft(D).astype(np.float32) 93 | return y 94 | else: 95 | return _griffin_lim(S ** hparams.power, hparams) 96 | 97 | def _lws_processor(hparams): 98 | import lws 99 | return lws.lws(hparams.n_fft, get_hop_size(hparams), fftsize=hparams.win_size, mode="speech") 100 | 101 | def _griffin_lim(S, hparams): 102 | '''librosa implementation of Griffin-Lim 103 | Based on https://github.com/librosa/librosa/issues/434 104 | ''' 105 | angles = np.exp(2j * np.pi * np.random.rand(*S.shape)) 106 | S_complex = np.abs(S).astype(np.complex) 107 | y = _istft(S_complex * angles, hparams) 108 | for i in range(hparams.griffin_lim_iters): 109 | angles = np.exp(1j * np.angle(_stft(y, hparams))) 110 | y = _istft(S_complex * angles, hparams) 111 | return y 112 | 113 | def _stft(y, hparams): 114 | if hparams.use_lws: 115 | return _lws_processor(hparams).stft(y).T 116 | else: 117 | return librosa.stft(y=y, n_fft=hparams.n_fft, hop_length=get_hop_size(hparams), win_length=hparams.win_size) 118 | 119 | def _istft(y, hparams): 120 | return librosa.istft(y, hop_length=get_hop_size(hparams), win_length=hparams.win_size) 121 | 122 | def num_frames(length, fsize, fshift): 123 | """Compute number of time frames of spectrogram 124 | """ 125 | pad = (fsize - fshift) 126 | if length % fshift == 0: 127 | M = (length + pad * 2 - fsize) // fshift + 1 128 | else: 129 | M = (length + pad * 2 - fsize) // fshift + 2 130 | return M 131 | 132 | 133 | def pad_lr(x, fsize, fshift): 134 | """Compute left and right padding 135 | """ 136 | M = num_frames(len(x), fsize, fshift) 137 | pad = (fsize - fshift) 138 | T = len(x) + 2 * pad 139 | r = (M - 1) * fshift + fsize - T 140 | return pad, pad + r 141 | 142 | 143 | # Conversions 144 | _mel_basis = None 145 | _inv_mel_basis = None 146 | 147 | def _linear_to_mel(spectogram, hparams): 148 | global _mel_basis 149 | if _mel_basis is None: 150 | _mel_basis = _build_mel_basis(hparams) 151 | return np.dot(_mel_basis, spectogram) 152 | 153 | def _mel_to_linear(mel_spectrogram, hparams): 154 | global _inv_mel_basis 155 | if _inv_mel_basis is None: 156 | _inv_mel_basis = np.linalg.pinv(_build_mel_basis(hparams)) 157 | return np.maximum(1e-10, np.dot(_inv_mel_basis, mel_spectrogram)) 158 | 159 | def _build_mel_basis(hparams): 160 | assert hparams.fmax <= hparams.sample_rate // 2 161 | return librosa.filters.mel(hparams.sample_rate, hparams.n_fft, n_mels=hparams.num_mels, 162 | fmin=hparams.fmin, fmax=hparams.fmax) 163 | 164 | def _amp_to_db(x, hparams): 165 | min_level = np.exp(hparams.min_level_db / 20 * np.log(10)) 166 | return 20 * np.log10(np.maximum(min_level, x)) 167 | 168 | def _db_to_amp(x): 169 | return np.power(10.0, (x) * 0.05) 170 | 171 | def _normalize(S, hparams): 172 | if hparams.allow_clipping_in_normalization: 173 | if hparams.symmetric_mels: 174 | return np.clip((2 * hparams.max_abs_value) * ((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value, 175 | -hparams.max_abs_value, hparams.max_abs_value) 176 | else: 177 | return np.clip(hparams.max_abs_value * ((S - hparams.min_level_db) / (-hparams.min_level_db)), 0, hparams.max_abs_value) 178 | 179 | assert S.max() <= 0 and S.min() - hparams.min_level_db >= 0 180 | if hparams.symmetric_mels: 181 | return (2 * hparams.max_abs_value) * ((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value 182 | else: 183 | return hparams.max_abs_value * ((S - hparams.min_level_db) / (-hparams.min_level_db)) 184 | 185 | def 
_denormalize(D, hparams): 186 | if hparams.allow_clipping_in_normalization: 187 | if hparams.symmetric_mels: 188 | return (((np.clip(D, -hparams.max_abs_value, 189 | hparams.max_abs_value) + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value)) 190 | + hparams.min_level_db) 191 | else: 192 | return ((np.clip(D, 0, hparams.max_abs_value) * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db) 193 | 194 | if hparams.symmetric_mels: 195 | return (((D + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value)) + hparams.min_level_db) 196 | else: 197 | return ((D * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Tacotron-2: 2 | Tensorflow implementation of DeepMind's Tacotron-2. A deep neural network architecture described in this paper: [Natural TTS synthesis by conditioning Wavenet on MEL spectogram predictions](https://arxiv.org/pdf/1712.05884.pdf) 3 | 4 | 5 | # Repository Structure: 6 | Tacotron-2 7 | ├── datasets 8 | ├── en_UK (0) 9 | │   └── by_book 10 | │   └── female 11 | ├── en_US (0) 12 | │   └── by_book 13 | │   ├── female 14 | │   └── male 15 | ├── LJSpeech-1.1 (0) 16 | │   └── wavs 17 | ├── logs-Tacotron (2) 18 | │   ├── eval_-dir 19 | │   │  ├── plots 20 | │  │  └── wavs 21 | │   ├── mel-spectrograms 22 | │   ├── plots 23 | │   ├── pretrained 24 | │   └── wavs 25 | ├── logs-Wavenet (4) 26 | │   ├── eval-dir 27 | │   │  ├── plots 28 | │  │  └── wavs 29 | │   ├── plots 30 | │   ├── pretrained 31 | │   └── wavs 32 | ├── papers 33 | ├── tacotron 34 | │   ├── models 35 | │   └── utils 36 | ├── tacotron_output (3) 37 | │   ├── eval 38 | │   ├── gta 39 | │   ├── logs-eval 40 | │   │   ├── plots 41 | │   │   └── wavs 42 | │   └── natural 43 | ├── wavenet_output (5) 44 | │   ├── plots 45 | │   └── wavs 46 | ├── training_data (1) 47 | │   ├── audio 48 | │   ├── linear 49 | │ └── mels 50 | └── wavenet_vocoder 51 | └── models 52 | 53 | 54 | The previous tree shows the current state of the repository (separate training, one step at a time). 55 | 56 | - Step **(0)**: Get your dataset, here I have set the examples of **Ljspeech**, **en_US** and **en_UK** (from **M-AILABS**). 57 | - Step **(1)**: Preprocess your data. This will give you the **training_data** folder. 58 | - Step **(2)**: Train your Tacotron model. Yields the **logs-Tacotron** folder. 59 | - Step **(3)**: Synthesize/Evaluate the Tacotron model. Gives the **tacotron_output** folder. 60 | - Step **(4)**: Train your Wavenet model. Yield the **logs-Wavenet** folder. 61 | - Step **(5)**: Synthesize audio using the Wavenet model. Gives the **wavenet_output** folder. 62 | 63 | 64 | Note: 65 | - **Our preprocessing only supports Ljspeech and Ljspeech-like datasets (M-AILABS speech data)!** If running on datasets stored differently, you will probably need to make your own preprocessing script. 66 | - In the previous tree, files **were not represented** and **max depth was set to 3** for simplicity. 67 | - If you run training of both **models at the same time**, repository structure will be different. 68 | 69 | # Model Architecture: 70 |

71 | *(Tacotron-2 model architecture diagram)* 72 | 

73 | 74 | The model described by the authors can be divided in two parts: 75 | - Spectrogram prediction network 76 | - Wavenet vocoder 77 | 78 | To have an in-depth exploration of the model architecture, training procedure and preprocessing logic, refer to [our wiki](https://github.com/Rayhane-mamah/Tacotron-2/wiki) 79 | 80 | # Current state: 81 | 82 | To have an overview of our advance on this project, please refer to [this discussion](https://github.com/Rayhane-mamah/Tacotron-2/issues/4) 83 | 84 | since the two parts of the global model are trained separately, we can start by training the feature prediction model to use his predictions later during the wavenet training. 85 | 86 | # How to start 87 | first, you need to have python 3 installed along with [Tensorflow](https://www.tensorflow.org/install/). 88 | 89 | next you can install the requirements. If you are an Anaconda user: (else replace **pip** with **pip3** and **python** with **python3**) 90 | 91 | > pip install -r requirements.txt 92 | 93 | # Dataset: 94 | We tested the code above on the [ljspeech dataset](https://keithito.com/LJ-Speech-Dataset/), which has almost 24 hours of labeled single actress voice recording. (further info on the dataset are available in the README file when you download it) 95 | 96 | We are also running current tests on the [new M-AILABS speech dataset](http://www.m-ailabs.bayern/en/the-mailabs-speech-dataset/) which contains more than 700h of speech (more than 80 Gb of data) for more than 10 languages. 97 | 98 | After **downloading** the dataset, **extract** the compressed file, and **place the folder inside the cloned repository.** 99 | 100 | # Preprocessing 101 | Before running the following steps, please make sure you are inside **Tacotron-2 folder** 102 | 103 | > cd Tacotron-2 104 | 105 | Preprocessing can then be started using: 106 | 107 | > python preprocess.py 108 | 109 | dataset can be chosen using the **--dataset** argument. If using M-AILABS dataset, you need to provide the **language, voice, reader, merge_books and book arguments** for your custom need. Default is **Ljspeech**. 110 | 111 | Example M-AILABS: 112 | 113 | > python preprocess.py --dataset='M-AILABS' --language='en_US' --voice='female' --reader='mary_ann' --merge_books=False --book='northandsouth' 114 | 115 | or if you want to use all books for a single speaker: 116 | 117 | > python preprocess.py --dataset='M-AILABS' --language='en_US' --voice='female' --reader='mary_ann' --merge_books=True 118 | 119 | This should take no longer than a **few minutes.** 120 | 121 | # Training: 122 | To **train both models** sequentially (one after the other): 123 | 124 | > python train.py --model='Tacotron-2' 125 | 126 | or: 127 | 128 | > python train.py --model='Both' 129 | 130 | Feature prediction model can **separately** be **trained** using: 131 | 132 | > python train.py --model='Tacotron' 133 | 134 | checkpoints will be made each **250 steps** and stored under **logs-Tacotron folder.** 135 | 136 | Naturally, **training the wavenet separately** is done by: 137 | 138 | > python train.py --model='WaveNet' 139 | 140 | logs will be stored inside **logs-Wavenet**. 141 | 142 | **Note:** 143 | - If model argument is not provided, training will default to Tacotron-2 model training. (both models) 144 | - Please refer to train arguments under [train.py](https://github.com/Rayhane-mamah/Tacotron-2/blob/master/train.py) for a set of options you can use. 
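In addition to the arguments above, individual hyperparameters can be overridden from the command line (without editing **hparams.py**) through the **--hparams** flag, which [train.py](https://github.com/Rayhane-mamah/Tacotron-2/blob/master/train.py) passes to `hparams.parse` as a comma-separated list of name=value pairs. As a sketch (the names must match entries defined in **hparams.py**; `outputs_per_step` and `sample_rate` are two that the training code reads):

> python train.py --model='Tacotron' --hparams='outputs_per_step=2,sample_rate=22050'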
145 | 146 | # Synthesis 147 | To **synthesize audio** in an **End-to-End** (text to audio) manner (both models at work): 148 | 149 | > python synthesize.py --model='Tacotron-2' 150 | 151 | For the spectrogram prediction network (separately), there are **three types** of mel spectrograms synthesis: 152 | 153 | - **Evaluation** (synthesis on custom sentences). This is what we'll usually use after having a full end to end model. 154 | 155 | > python synthesize.py --model='Tacotron' --mode='eval' 156 | 157 | - **Natural synthesis** (let the model make predictions alone by feeding last decoder output to the next time step). 158 | 159 | > python synthesize.py --model='Tacotron' --GTA=False 160 | 161 | 162 | - **Ground Truth Aligned synthesis** (DEFAULT: the model is assisted by true labels in a teacher forcing manner). This synthesis method is used when predicting mel spectrograms used to train the wavenet vocoder. (yields better results as stated in the paper) 163 | 164 | > python synthesize.py --model='Tacotron' --GTA=True 165 | 166 | Synthesizing the **waveforms** conditionned on previously synthesized Mel-spectrograms (separately) can be done with: 167 | 168 | > python synthesize.py --model='WaveNet' 169 | 170 | **Note:** 171 | - If model argument is not provided, synthesis will default to Tacotron-2 model synthesis. (End-to-End TTS) 172 | - Please refer to synthesis arguments under [synthesize.py](https://github.com/Rayhane-mamah/Tacotron-2/blob/master/synthesize.py) for a set of options you can use. 173 | 174 | # Pretrained model and Samples: 175 | Pre-trained models and audio samples will be added at a later date. You can however check some primary insights of the model performance (at early stages of training) [here](https://github.com/Rayhane-mamah/Tacotron-2/issues/4#issuecomment-378741465). 
176 | 177 | 178 | # References and Resources: 179 | - [Natural TTS synthesis by conditioning Wavenet on MEL spectogram predictions](https://arxiv.org/pdf/1712.05884.pdf) 180 | - [Original tacotron paper](https://arxiv.org/pdf/1703.10135.pdf) 181 | - [Attention-Based Models for Speech Recognition](https://arxiv.org/pdf/1506.07503.pdf) 182 | - [Wavenet: A generative model for raw audio](https://arxiv.org/pdf/1609.03499.pdf) 183 | - [Fast Wavenet](https://arxiv.org/pdf/1611.09482.pdf) 184 | - [r9y9/wavenet_vocoder](https://github.com/r9y9/wavenet_vocoder) 185 | - [keithito/tacotron](https://github.com/keithito/tacotron) 186 | 187 | -------------------------------------------------------------------------------- /tacotron/models/Architecture_wrappers.py: -------------------------------------------------------------------------------- 1 | """A set of wrappers usefull for tacotron 2 architecture 2 | All notations and variable names were used in concordance with originial tensorflow implementation 3 | """ 4 | import collections 5 | import numpy as np 6 | import tensorflow as tf 7 | from tensorflow.contrib.rnn import RNNCell 8 | from tensorflow.python.framework import ops 9 | from tensorflow.python.ops import rnn_cell_impl 10 | from tensorflow.python.ops import check_ops 11 | from tensorflow.python.util import nest 12 | from tensorflow.python.ops import array_ops 13 | from tensorflow.python.ops import tensor_array_ops 14 | from tensorflow.python.framework import tensor_shape 15 | from tacotron.models.attention import _compute_attention 16 | 17 | _zero_state_tensors = rnn_cell_impl._zero_state_tensors 18 | 19 | 20 | 21 | class TacotronEncoderCell(RNNCell): 22 | """Tacotron 2 Encoder Cell 23 | Passes inputs through a stack of convolutional layers then through a bidirectional LSTM 24 | layer to predict the hidden representation vector (or memory) 25 | """ 26 | 27 | def __init__(self, convolutional_layers, lstm_layer): 28 | """Initialize encoder parameters 29 | 30 | Args: 31 | convolutional_layers: Encoder convolutional block class 32 | lstm_layer: encoder bidirectional lstm layer class 33 | """ 34 | super(TacotronEncoderCell, self).__init__() 35 | #Initialize encoder layers 36 | self._convolutions = convolutional_layers 37 | self._cell = lstm_layer 38 | 39 | def __call__(self, inputs, input_lengths=None): 40 | #Pass input sequence through a stack of convolutional layers 41 | conv_output = self._convolutions(inputs) 42 | 43 | #Extract hidden representation from encoder lstm cells 44 | hidden_representation = self._cell(conv_output, input_lengths) 45 | 46 | #For shape visualization 47 | self.conv_output_shape = conv_output.shape 48 | return hidden_representation 49 | 50 | 51 | class TacotronDecoderCellState( 52 | collections.namedtuple("TacotronDecoderCellState", 53 | ("cell_state", "attention", "time", "alignments", 54 | "alignment_history"))): 55 | """`namedtuple` storing the state of a `TacotronDecoderCell`. 56 | Contains: 57 | - `cell_state`: The state of the wrapped `RNNCell` at the previous time 58 | step. 59 | - `attention`: The attention emitted at the previous time step. 60 | - `time`: int32 scalar containing the current time step. 61 | - `alignments`: A single or tuple of `Tensor`(s) containing the alignments 62 | emitted at the previous time step for each attention mechanism. 63 | - `alignment_history`: a single or tuple of `TensorArray`(s) 64 | containing alignment matrices from all time steps for each attention 65 | mechanism. Call `stack()` on each to convert to a `Tensor`. 
66 | """ 67 | def replace(self, **kwargs): 68 | """Clones the current state while overwriting components provided by kwargs. 69 | """ 70 | return super(TacotronDecoderCellState, self)._replace(**kwargs) 71 | 72 | class TacotronDecoderCell(RNNCell): 73 | """Tactron 2 Decoder Cell 74 | Decodes encoder output and previous mel frames into next r frames 75 | 76 | Decoder Step i: 77 | 1) Prenet to compress last output information 78 | 2) Concat compressed inputs with previous context vector (input feeding) * 79 | 3) Decoder RNN (actual decoding) to predict current state s_{i} * 80 | 4) Compute new context vector c_{i} based on s_{i} and a cumulative sum of previous alignments * 81 | 5) Predict new output y_{i} using s_{i} and c_{i} (concatenated) 82 | 6) Predict output ys_{i} using s_{i} and c_{i} (concatenated) 83 | 84 | * : This is typically taking a vanilla LSTM, wrapping it using tensorflow's attention wrapper, 85 | and wrap that with the prenet before doing an input feeding, and with the prediction layer 86 | that uses RNN states to project on output space. Actions marked with (*) can be replaced with 87 | tensorflow's attention wrapper call if it was using cumulative alignments instead of previous alignments only. 88 | """ 89 | 90 | def __init__(self, prenet, attention_mechanism, rnn_cell, frame_projection, stop_projection): 91 | """Initialize decoder parameters 92 | 93 | Args: 94 | prenet: A tensorflow fully connected layer acting as the decoder pre-net 95 | attention_mechanism: A _BaseAttentionMechanism instance, usefull to 96 | learn encoder-decoder alignments 97 | rnn_cell: Instance of RNNCell, main body of the decoder 98 | frame_projection: tensorflow fully connected layer with r * num_mels output units 99 | stop_projection: tensorflow fully connected layer, expected to project to a scalar 100 | and through a sigmoid activation 101 | mask_finished: Boolean, Whether to mask decoder frames after the 102 | """ 103 | super(TacotronDecoderCell, self).__init__() 104 | #Initialize decoder layers 105 | self._prenet = prenet 106 | self._attention_mechanism = attention_mechanism 107 | self._cell = rnn_cell 108 | self._frame_projection = frame_projection 109 | self._stop_projection = stop_projection 110 | 111 | self._attention_layer_size = self._attention_mechanism.values.get_shape()[-1].value 112 | 113 | def _batch_size_checks(self, batch_size, error_message): 114 | return [check_ops.assert_equal(batch_size, 115 | self._attention_mechanism.batch_size, 116 | message=error_message)] 117 | 118 | @property 119 | def output_size(self): 120 | return self._frame_projection.shape 121 | 122 | @property 123 | def state_size(self): 124 | """The `state_size` property of `TacotronDecoderCell`. 125 | 126 | Returns: 127 | An `TacotronDecoderCell` tuple containing shapes used by this object. 128 | """ 129 | return TacotronDecoderCellState( 130 | cell_state=self._cell._cell.state_size, 131 | time=tensor_shape.TensorShape([]), 132 | attention=self._attention_layer_size, 133 | alignments=self._attention_mechanism.alignments_size, 134 | alignment_history=()) 135 | 136 | def zero_state(self, batch_size, dtype): 137 | """Return an initial (zero) state tuple for this `AttentionWrapper`. 138 | 139 | Args: 140 | batch_size: `0D` integer tensor: the batch size. 141 | dtype: The internal state data type. 142 | Returns: 143 | An `TacotronDecoderCellState` tuple containing zeroed out tensors and, 144 | possibly, empty `TensorArray` objects. 
145 | Raises: 146 | ValueError: (or, possibly at runtime, InvalidArgument), if 147 | `batch_size` does not match the output size of the encoder passed 148 | to the wrapper object at initialization time. 149 | """ 150 | with ops.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]): 151 | cell_state = self._cell._cell.zero_state(batch_size, dtype) 152 | error_message = ( 153 | "When calling zero_state of TacotronDecoderCell %s: " % self._base_name + 154 | "Non-matching batch sizes between the memory " 155 | "(encoder output) and the requested batch size.") 156 | with ops.control_dependencies( 157 | self._batch_size_checks(batch_size, error_message)): 158 | cell_state = nest.map_structure( 159 | lambda s: array_ops.identity(s, name="checked_cell_state"), 160 | cell_state) 161 | return TacotronDecoderCellState( 162 | cell_state=cell_state, 163 | time=array_ops.zeros([], dtype=tf.int32), 164 | attention=_zero_state_tensors(self._attention_layer_size, batch_size, 165 | dtype), 166 | alignments=self._attention_mechanism.initial_alignments(batch_size, dtype), 167 | alignment_history=tensor_array_ops.TensorArray(dtype=dtype, size=0, 168 | dynamic_size=True)) 169 | 170 | def __call__(self, inputs, state): 171 | #Information bottleneck (essential for learning attention) 172 | prenet_output = self._prenet(inputs) 173 | 174 | #Concat context vector and prenet output to form LSTM cells input (input feeding) 175 | LSTM_input = tf.concat([prenet_output, state.attention], axis=-1) 176 | 177 | #Unidirectional LSTM layers 178 | LSTM_output, next_cell_state = self._cell(LSTM_input, state.cell_state) 179 | 180 | 181 | #Compute the attention (context) vector and alignments using 182 | #the new decoder cell hidden state as query vector 183 | #and cumulative alignments to extract location features 184 | #The choice of the new cell hidden state (s_{i}) of the last 185 | #decoder RNN Cell is based on Luong et Al. 
(2015): 186 | #https://arxiv.org/pdf/1508.04025.pdf 187 | previous_alignments = state.alignments 188 | previous_alignment_history = state.alignment_history 189 | context_vector, alignments, cumulated_alignments = _compute_attention(self._attention_mechanism, 190 | LSTM_output, 191 | previous_alignments, 192 | attention_layer=None) 193 | 194 | #Concat LSTM outputs and context vector to form projections inputs 195 | projections_input = tf.concat([LSTM_output, context_vector], axis=-1) 196 | 197 | #Compute predicted frames and predicted 198 | cell_outputs = self._frame_projection(projections_input) 199 | stop_tokens = self._stop_projection(projections_input) 200 | 201 | #Save alignment history 202 | alignment_history = previous_alignment_history.write(state.time, alignments) 203 | 204 | #Prepare next decoder state 205 | next_state = TacotronDecoderCellState( 206 | time=state.time + 1, 207 | cell_state=next_cell_state, 208 | attention=context_vector, 209 | alignments=cumulated_alignments, 210 | alignment_history=alignment_history) 211 | 212 | return (cell_outputs, stop_tokens), next_state 213 | -------------------------------------------------------------------------------- /wavenet_vocoder/train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | import os 4 | from datetime import datetime 5 | import time 6 | import librosa 7 | 8 | from wavenet_vocoder.models import create_model 9 | from wavenet_vocoder.feeder import Feeder 10 | from tacotron.utils import ValueWindow 11 | import numpy as np 12 | from scipy.io import wavfile 13 | import tensorflow as tf 14 | from . import util 15 | 16 | from hparams import hparams_debug_string 17 | import infolog 18 | 19 | log = infolog.log 20 | 21 | 22 | def add_train_stats(model): 23 | with tf.variable_scope('stats') as scope: 24 | tf.summary.histogram('wav_outputs', model.y_hat) 25 | tf.summary.histogram('wav_targets', model.y) 26 | tf.summary.scalar('loss', model.loss) 27 | return tf.summary.merge_all() 28 | 29 | def add_test_stats(summary_writer, step, eval_loss): 30 | values = [ 31 | tf.Summary.Value(tag='eval_model/eval_stats/eval_loss'), 32 | ] 33 | test_summary = tf.Summary(value=values) 34 | summary_writer.add_summary(test_summary, step) 35 | 36 | 37 | def create_shadow_saver(model, global_step=None): 38 | '''Load shadow variables of saved model. 39 | 40 | Inspired by: https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage 41 | 42 | Can also use: shadow_dict = model.ema.variables_to_restore() 43 | ''' 44 | #Add global step to saved variables to save checkpoints correctly 45 | shadow_variables = [model.ema.average_name(v) for v in model.variables] 46 | variables = model.variables 47 | 48 | if global_step is not None: 49 | shadow_variables += ['global_step'] 50 | variables += [global_step] 51 | 52 | shadow_dict = dict(zip(shadow_variables, variables)) #dict(zip(keys, values)) -> {key1: value1, key2: value2, ...} 53 | return tf.train.Saver(shadow_dict, max_to_keep=5) 54 | 55 | def load_averaged_model(sess, sh_saver, checkpoint_path): 56 | sh_saver.restore(sess, checkpoint_path) 57 | 58 | 59 | def eval_step(sess, global_step, model, plot_dir, audio_dir, summary_writer, hparams): 60 | '''Evaluate model during training. 61 | Supposes that model variables are averaged. 
62 | ''' 63 | start_time = time.time() 64 | y_hat, y_target, loss = sess.run([model.y_hat, model.y_target, model.eval_loss]) 65 | duration = time.time() - start_time 66 | log('Time Evaluation: Generation of {} audio frames took {:.3f} sec ({:.3f} frames/sec)'.format( 67 | len(y_target), duration, len(y_target)/duration)) 68 | 69 | pred_wav_path = os.path.join(audio_dir, 'step-{}-pred.wav'.format(global_step)) 70 | target_wav_path = os.path.join(audio_dir, 'step-{}-real.wav'.format(global_step)) 71 | plot_path = os.path.join(plot_dir, 'step-{}-waveplot.png'.format(global_step)) 72 | 73 | #Save Audio 74 | wavfile.write(pred_wav_path, hparams.sample_rate, y_hat) 75 | wavfile.write(target_wav_path, hparams.sample_rate, y_target) 76 | 77 | #Save figure 78 | util.waveplot(plot_path, y_hat, y_target, model._hparams) 79 | log('Eval loss for global step {}: {:.3f}'.format(global_step, loss)) 80 | 81 | log('Writing eval summary!') 82 | add_test_stats(summary_writer, global_step, loss) 83 | 84 | def save_log(sess, global_step, model, plot_dir, audio_dir, hparams): 85 | log('\nSaving intermediate states at step {}'.format(global_step)) 86 | idx = 0 87 | y_hat, y, length = sess.run([model.y_hat_log[idx], model.y_log[idx], model.input_lengths[idx]]) 88 | 89 | #mask by length 90 | y_hat[length:] = 0 91 | y[length:] = 0 92 | 93 | #Make audio and plot paths 94 | pred_wav_path = os.path.join(audio_dir, 'step-{}-pred.wav'.format(global_step)) 95 | target_wav_path = os.path.join(audio_dir, 'step-{}-real.wav'.format(global_step)) 96 | plot_path = os.path.join(plot_dir, 'step-{}-waveplot.png'.format(global_step)) 97 | 98 | #Save audio 99 | librosa.output.write_wav(pred_wav_path, y_hat, sr=hparams.sample_rate) 100 | librosa.output.write_wav(target_wav_path, y, sr=hparams.sample_rate) 101 | 102 | #Save figure 103 | util.waveplot(plot_path, y_hat, y, hparams) 104 | 105 | def save_checkpoint(sess, saver, checkpoint_path, global_step): 106 | saver.save(sess, checkpoint_path, global_step=global_step) 107 | 108 | 109 | def model_train_mode(args, feeder, hparams, global_step): 110 | with tf.variable_scope('model', reuse=tf.AUTO_REUSE) as scope: 111 | model_name = None 112 | if args.model in ('Tacotron-2', 'Both'): 113 | model_name = 'WaveNet' 114 | model = create_model(model_name or args.model, hparams) 115 | #initialize model to train mode 116 | model.initialize(feeder.targets, feeder.local_condition_features, feeder.global_condition_features, 117 | feeder.input_lengths, x=feeder.inputs) 118 | model.add_loss() 119 | model.add_optimizer(global_step) 120 | stats = add_train_stats(model) 121 | return model, stats 122 | 123 | def model_test_mode(args, feeder, hparams, global_step): 124 | with tf.variable_scope('model', reuse=tf.AUTO_REUSE) as scope: 125 | model_name = None 126 | if args.model in ('Tacotron-2', 'Both'): 127 | model_name = 'WaveNet' 128 | model = create_model(model_name or args.model, hparams) 129 | #initialize model to test mode 130 | model.initialize(feeder.eval_targets, feeder.eval_local_condition_features, feeder.eval_global_condition_features, 131 | feeder.eval_input_lengths) 132 | model.add_loss() 133 | return model 134 | 135 | def train(log_dir, args, hparams, input_path): 136 | save_dir = os.path.join(log_dir, 'wave_pretrained/') 137 | eval_dir = os.path.join(log_dir, 'eval-dir') 138 | audio_dir = os.path.join(log_dir, 'wavs') 139 | plot_dir = os.path.join(log_dir, 'plots') 140 | wav_dir = os.path.join(log_dir, 'wavs') 141 | eval_audio_dir = os.path.join(eval_dir, 'wavs') 142 | eval_plot_dir = 
os.path.join(eval_dir, 'plots') 143 | checkpoint_path = os.path.join(save_dir, 'wavenet_model.ckpt') 144 | input_path = os.path.join(args.base_dir, input_path) 145 | os.makedirs(save_dir, exist_ok=True) 146 | os.makedirs(wav_dir, exist_ok=True) 147 | os.makedirs(audio_dir, exist_ok=True) 148 | os.makedirs(plot_dir, exist_ok=True) 149 | os.makedirs(eval_audio_dir, exist_ok=True) 150 | os.makedirs(eval_plot_dir, exist_ok=True) 151 | 152 | log('Checkpoint_path: {}'.format(checkpoint_path)) 153 | log('Loading training data from: {}'.format(input_path)) 154 | log('Using model: {}'.format(args.model)) 155 | log(hparams_debug_string()) 156 | 157 | #Start by setting a seed for repeatability 158 | tf.set_random_seed(hparams.wavenet_random_seed) 159 | 160 | #Set up data feeder 161 | coord = tf.train.Coordinator() 162 | with tf.variable_scope('datafeeder') as scope: 163 | feeder = Feeder(coord, input_path, args.base_dir, hparams) 164 | 165 | #Set up model 166 | global_step = tf.Variable(0, name='global_step', trainable=False) 167 | model, stats = model_train_mode(args, feeder, hparams, global_step) 168 | eval_model = model_test_mode(args, feeder, hparams, global_step) 169 | 170 | #book keeping 171 | step = 0 172 | time_window = ValueWindow(100) 173 | loss_window = ValueWindow(100) 174 | sh_saver = create_shadow_saver(model, global_step) 175 | 176 | log('Wavenet training set to a maximum of {} steps'.format(args.wavenet_train_steps)) 177 | 178 | #Memory allocation on the memory 179 | config = tf.ConfigProto() 180 | config.gpu_options.allow_growth = True 181 | 182 | #Train 183 | with tf.Session(config=config) as sess: 184 | try: 185 | summary_writer = tf.summary.FileWriter(log_dir, sess.graph) 186 | sess.run(tf.global_variables_initializer()) 187 | checkpoint_state=None 188 | #saved model restoring 189 | if args.restore: 190 | #Restore saved model if the user requested it, default = True 191 | try: 192 | checkpoint_state = tf.train.get_checkpoint_state(save_dir) 193 | except tf.errors.OutOfRangeError as e: 194 | log('Cannot restore checkpoint: {}'.format(e)) 195 | 196 | if (checkpoint_state and checkpoint_state.model_checkpoint_path): 197 | log('Loading checkpoint {}'.format(checkpoint_state.model_checkpoint_path)) 198 | load_averaged_model(sess, sh_saver, checkpoint_state.model_checkpoint_path) 199 | 200 | else: 201 | if not args.restore: 202 | log('Starting new training!') 203 | else: 204 | log('No model to load at {}'.format(save_dir)) 205 | 206 | #initializing feeder 207 | feeder.start_threads(sess) 208 | 209 | #Training loop 210 | while not coord.should_stop() and step < args.wavenet_train_steps: 211 | start_time = time.time() 212 | step, y_hat, loss, opt = sess.run([global_step, model.y_hat, model.loss, model.optimize]) 213 | time_window.append(time.time() - start_time) 214 | loss_window.append(loss) 215 | 216 | message = 'Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]'.format( 217 | step, time_window.average, loss, loss_window.average) 218 | log(message, end='\r') 219 | 220 | if loss > 100 or np.isnan(loss): 221 | log('Loss exploded to {:.5f} at step {}'.format(loss, step)) 222 | raise Exception('Loss exploded') 223 | 224 | if step % args.summary_interval == 0: 225 | log('\nWriting summary at step {}'.format(step)) 226 | summary_writer.add_summary(sess.run(stats), step) 227 | 228 | if step % args.checkpoint_interval == 0 or step == args.wavenet_train_steps: 229 | save_log(sess, step, model, plot_dir, audio_dir, hparams=hparams) 230 | save_checkpoint(sess, sh_saver, checkpoint_path, 
global_step) 231 | 232 | if step % args.eval_interval == 0: 233 | log('\nEvaluating at step {}'.format(step)) 234 | eval_step(sess, step, eval_model, eval_plot_dir, eval_audio_dir, summary_writer=summary_writer , hparams=model._hparams) 235 | 236 | log('Wavenet training complete after {} global steps'.format(args.wavenet_train_steps)) 237 | return save_dir 238 | 239 | except Exception as e: 240 | log('Exiting due to Exception: {}'.format(e)) 241 | 242 | 243 | def wavenet_train(args, log_dir, hparams, input_path): 244 | return train(log_dir, args, hparams, input_path) 245 | -------------------------------------------------------------------------------- /tacotron/models/attention.py: -------------------------------------------------------------------------------- 1 | """Attention file for location based attention (compatible with tensorflow attention wrapper)""" 2 | 3 | import tensorflow as tf 4 | from tensorflow.contrib.seq2seq.python.ops.attention_wrapper import BahdanauAttention 5 | from tensorflow.python.ops import nn_ops 6 | from tensorflow.python.layers import core as layers_core 7 | from tensorflow.python.ops import array_ops 8 | from tensorflow.python.ops import variable_scope 9 | from tensorflow.python.ops import math_ops 10 | 11 | 12 | #From https://github.com/tensorflow/tensorflow/blob/r1.7/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py 13 | def _compute_attention(attention_mechanism, cell_output, attention_state, 14 | attention_layer): 15 | """Computes the attention and alignments for a given attention_mechanism.""" 16 | alignments, next_attention_state = attention_mechanism( 17 | cell_output, state=attention_state) 18 | 19 | # Reshape from [batch_size, memory_time] to [batch_size, 1, memory_time] 20 | expanded_alignments = array_ops.expand_dims(alignments, 1) 21 | # Context is the inner product of alignments and values along the 22 | # memory time dimension. 23 | # alignments shape is 24 | # [batch_size, 1, memory_time] 25 | # attention_mechanism.values shape is 26 | # [batch_size, memory_time, memory_size] 27 | # the batched matmul is over memory_time, so the output shape is 28 | # [batch_size, 1, memory_size]. 29 | # we then squeeze out the singleton dim. 30 | context = math_ops.matmul(expanded_alignments, attention_mechanism.values) 31 | context = array_ops.squeeze(context, [1]) 32 | 33 | if attention_layer is not None: 34 | attention = attention_layer(array_ops.concat([cell_output, context], 1)) 35 | else: 36 | attention = context 37 | 38 | return attention, alignments, next_attention_state 39 | 40 | 41 | def _location_sensitive_score(W_query, W_fil, W_keys): 42 | """Impelements Bahdanau-style (cumulative) scoring function. 43 | This attention is described in: 44 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben- 45 | gio, “Attention-based models for speech recognition,” in Ad- 46 | vances in Neural Information Processing Systems, 2015, pp. 47 | 577–585. 48 | 49 | ############################################################################# 50 | hybrid attention (content-based + location-based) 51 | f = F * α_{i-1} 52 | energy = dot(v_a, tanh(W_keys(h_enc) + W_query(h_dec) + W_fil(f) + b_a)) 53 | ############################################################################# 54 | 55 | Args: 56 | W_query: Tensor, shape '[batch_size, 1, attention_dim]' to compare to location features. 
57 | W_fil: Tensor, previous alignments processed into location features, shape '[batch_size, max_time, attention_dim]' 58 | W_keys: Tensor, shape '[batch_size, max_time, attention_dim]', typically the encoder outputs. 59 | Returns: 60 | A '[batch_size, max_time]' attention score (energy) 61 | """ 62 | # Get the number of hidden units from the trailing dimension of keys 63 | dtype = W_query.dtype 64 | num_units = W_keys.shape[-1].value or array_ops.shape(W_keys)[-1] 65 | v_a = tf.get_variable( 66 | 'attention_variable', shape=[num_units], dtype=dtype, 67 | initializer=tf.contrib.layers.xavier_initializer()) 68 | b_a = tf.get_variable( 69 | 'attention_bias', shape=[num_units], dtype=dtype, 70 | initializer=tf.zeros_initializer()) 71 | 72 | return tf.reduce_sum(v_a * tf.tanh(W_keys + W_query + W_fil + b_a), [2]) 73 | 74 | def _smoothing_normalization(e): 75 | """Applies a smoothing normalization function instead of softmax 76 | Introduced in: 77 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben- 78 | gio, “Attention-based models for speech recognition,” in Ad- 79 | vances in Neural Information Processing Systems, 2015, pp. 80 | 577–585. 81 | 82 | ############################################################################ 83 | Smoothing normalization function 84 | a_{i, j} = sigmoid(e_{i, j}) / sum_j(sigmoid(e_{i, j})) 85 | ############################################################################ 86 | 87 | Args: 88 | e: matrix [batch_size, max_time(memory_time)]: expected to be energy (score) 89 | values of an attention mechanism 90 | Returns: 91 | matrix [batch_size, max_time]: [0, 1] normalized alignments with possible 92 | attendance to multiple memory time steps. 93 | """ 94 | return tf.nn.sigmoid(e) / tf.reduce_sum(tf.nn.sigmoid(e), axis=-1, keepdims=True) 95 | 96 | 97 | class LocationSensitiveAttention(BahdanauAttention): 98 | """Implements Bahdanau-style (cumulative) scoring function. 99 | Usually referred to as "hybrid" attention (content-based + location-based) 100 | Extends the additive attention described in: 101 | "D. Bahdanau, K. Cho, and Y. Bengio, “Neural machine transla- 102 | tion by jointly learning to align and translate,” in Proceedings 103 | of ICLR, 2015." 104 | to use previous alignments as additional location features. 105 | 106 | This attention is described in: 107 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben- 108 | gio, “Attention-based models for speech recognition,” in Ad- 109 | vances in Neural Information Processing Systems, 2015, pp. 110 | 577–585. 111 | """ 112 | 113 | def __init__(self, 114 | num_units, 115 | memory, 116 | hparams, 117 | mask_encoder=True, 118 | memory_sequence_length=None, 119 | smoothing=False, 120 | cumulate_weights=True, 121 | name='LocationSensitiveAttention'): 122 | """Construct the Attention mechanism. 123 | Args: 124 | num_units: The depth of the query mechanism. 125 | memory: The memory to query; usually the output of an RNN encoder. This 126 | tensor should be shaped `[batch_size, max_time, ...]`. 127 | mask_encoder (optional): Boolean, whether to mask encoder paddings. 128 | memory_sequence_length (optional): Sequence lengths for the batch entries 129 | in memory. If provided, the memory tensor rows are masked with zeros 130 | for values past the respective sequence lengths. Only relevant if mask_encoder = True. 131 | smoothing (optional): Boolean. Determines which normalization function to use. 132 | Default normalization function (probability_fn) is softmax.
If smoothing is 133 | enabled, we replace softmax with: 134 | a_{i, j} = sigmoid(e_{i, j}) / sum_j(sigmoid(e_{i, j})) 135 | Introduced in: 136 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben- 137 | gio, “Attention-based models for speech recognition,” in Ad- 138 | vances in Neural Information Processing Systems, 2015, pp. 139 | 577–585. 140 | This is mainly used if the model wants to attend to multiple inputs parts 141 | at the same decoding step. We probably won't be using it since multiple sound 142 | frames may depend from the same character, probably not the way around. 143 | Note: 144 | We still keep it implemented in case we want to test it. They used it in the 145 | paper in the context of speech recognition, where one phoneme may depend on 146 | multiple subsequent sound frames. 147 | name: Name to use when creating ops. 148 | """ 149 | #Create normalization function 150 | #Setting it to None defaults in using softmax 151 | normalization_function = _smoothing_normalization if (smoothing == True) else None 152 | memory_length = memory_sequence_length if (mask_encoder==True) else None 153 | super(LocationSensitiveAttention, self).__init__( 154 | num_units=num_units, 155 | memory=memory, 156 | memory_sequence_length=memory_length, 157 | probability_fn=normalization_function, 158 | name=name) 159 | self.location_convolution = tf.layers.Conv1D(filters=hparams.attention_filters, 160 | kernel_size=hparams.attention_kernel, padding='same', use_bias=True, 161 | bias_initializer=tf.zeros_initializer(), name='location_features_convolution') 162 | self.location_layer = tf.layers.Dense(units=num_units, use_bias=False, 163 | dtype=tf.float32, name='location_features_layer') 164 | self._cumulate = cumulate_weights 165 | 166 | def __call__(self, query, state): 167 | """Score the query based on the keys and values. 168 | Args: 169 | query: Tensor of dtype matching `self.values` and shape 170 | `[batch_size, query_depth]`. 171 | state (previous alignments): Tensor of dtype matching `self.values` and shape 172 | `[batch_size, alignments_size]` 173 | (`alignments_size` is memory's `max_time`). 174 | Returns: 175 | alignments: Tensor of dtype matching `self.values` and shape 176 | `[batch_size, alignments_size]` (`alignments_size` is memory's 177 | `max_time`). 
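Note: with cumulate_weights=True (the default), next_state is the running sum alignments + previous_alignments; this cumulative tensor is what the location convolution reads as location features on the following decoding step.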
178 | """ 179 | previous_alignments = state 180 | with variable_scope.variable_scope(None, "Location_Sensitive_Attention", [query]): 181 | 182 | # processed_query shape [batch_size, query_depth] -> [batch_size, attention_dim] 183 | processed_query = self.query_layer(query) if self.query_layer else query 184 | # -> [batch_size, 1, attention_dim] 185 | processed_query = tf.expand_dims(processed_query, 1) 186 | 187 | # processed_location_features shape [batch_size, max_time, attention dimension] 188 | # [batch_size, max_time] -> [batch_size, max_time, 1] 189 | expanded_alignments = tf.expand_dims(previous_alignments, axis=2) 190 | # location features [batch_size, max_time, filters] 191 | f = self.location_convolution(expanded_alignments) 192 | # Projected location features [batch_size, max_time, attention_dim] 193 | processed_location_features = self.location_layer(f) 194 | 195 | # energy shape [batch_size, max_time] 196 | energy = _location_sensitive_score(processed_query, processed_location_features, self.keys) 197 | 198 | # alignments shape = energy shape = [batch_size, max_time] 199 | alignments = self._probability_fn(energy, previous_alignments) 200 | # Cumulate alignments 201 | if self._cumulate: 202 | next_state = alignments + previous_alignments 203 | else: 204 | next_state = alignments 205 | 206 | return alignments, next_state 207 | -------------------------------------------------------------------------------- /tacotron/feeder.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import threading 4 | import time 5 | import traceback 6 | from tacotron.utils.text import text_to_sequence 7 | from infolog import log 8 | from sklearn.model_selection import train_test_split 9 | import tensorflow as tf 10 | 11 | 12 | _batches_per_group = 32 13 | 14 | class Feeder: 15 | """ 16 | Feeds batches of data into queue on a background thread. 
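Builds two FIFO queues fed from the same placeholders: a training queue filled from a shuffled train split and a single-element eval queue filled from a held-out test split, so the train and eval graphs can dequeue independently.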
17 | """ 18 | 19 | def __init__(self, coordinator, metadata_filename, hparams): 20 | super(Feeder, self).__init__() 21 | self._coord = coordinator 22 | self._hparams = hparams 23 | self._cleaner_names = [x.strip() for x in hparams.cleaners.split(',')] 24 | self._train_offset = 0 25 | self._test_offset = 0 26 | 27 | # Load metadata 28 | self._mel_dir = os.path.join(os.path.dirname(metadata_filename), 'mels') 29 | self._linear_dir = os.path.join(os.path.dirname(metadata_filename), 'linear') 30 | with open(metadata_filename, encoding='utf-8') as f: 31 | self._metadata = [line.strip().split('|') for line in f] 32 | frame_shift_ms = hparams.hop_size / hparams.sample_rate 33 | hours = sum([int(x[4]) for x in self._metadata]) * frame_shift_ms / (3600) 34 | log('Loaded metadata for {} examples ({:.2f} hours)'.format(len(self._metadata), hours)) 35 | 36 | #Train test split 37 | if hparams.tacotron_test_size is None: 38 | assert hparams.tacotron_test_batches is not None 39 | 40 | test_size = (hparams.tacotron_test_size if hparams.tacotron_test_size is not None 41 | else hparams.tacotron_test_batches * hparams.tacotron_batch_size) 42 | indices = np.arange(len(self._metadata)) 43 | train_indices, test_indices = train_test_split(indices, 44 | test_size=test_size, random_state=hparams.tacotron_data_random_state) 45 | 46 | #Make sure test_indices is a multiple of batch_size else round up 47 | len_test_indices = self._round_up(len(test_indices), hparams.tacotron_batch_size) 48 | extra_test = test_indices[len_test_indices:] 49 | test_indices = test_indices[:len_test_indices] 50 | train_indices = np.concatenate([train_indices, extra_test]) 51 | 52 | self._train_meta = list(np.array(self._metadata)[train_indices]) 53 | self._test_meta = list(np.array(self._metadata)[test_indices]) 54 | 55 | self.test_steps = len(self._test_meta) // hparams.tacotron_batch_size 56 | 57 | if hparams.tacotron_test_size is None: 58 | assert hparams.tacotron_test_batches == self.test_steps 59 | 60 | #pad input sequences with the 0 ( _ ) 61 | self._pad = 0 62 | #explicitely setting the padding to a value that doesn't originally exist in the spectogram 63 | #to avoid any possible conflicts, without affecting the output range of the model too much 64 | if hparams.symmetric_mels: 65 | self._target_pad = -(hparams.max_abs_value + .1) 66 | else: 67 | self._target_pad = -0.1 68 | #Mark finished sequences with 1s 69 | self._token_pad = 1. 70 | 71 | with tf.device('/cpu:0'): 72 | # Create placeholders for inputs and targets. Don't specify batch size because we want 73 | # to be able to feed different batch sizes at eval time. 
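# The placeholder order below matches the tuples produced by _prepare_batch:
# (inputs, input_lengths, mel_targets, token_targets, linear_targets, targets_lengths).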
74 | self._placeholders = [ 75 | tf.placeholder(tf.int32, shape=(None, None), name='inputs'), 76 | tf.placeholder(tf.int32, shape=(None, ), name='input_lengths'), 77 | tf.placeholder(tf.float32, shape=(None, None, hparams.num_mels), name='mel_targets'), 78 | tf.placeholder(tf.float32, shape=(None, None), name='token_targets'), 79 | tf.placeholder(tf.float32, shape=(None, None, hparams.num_freq), name='linear_targets'), 80 | tf.placeholder(tf.int32, shape=(None, ), name='targets_lengths'), 81 | ] 82 | 83 | # Create queue for buffering data 84 | queue = tf.FIFOQueue(8, [tf.int32, tf.int32, tf.float32, tf.float32, tf.float32, tf.int32], name='input_queue') 85 | self._enqueue_op = queue.enqueue(self._placeholders) 86 | self.inputs, self.input_lengths, self.mel_targets, self.token_targets, self.linear_targets, self.targets_lengths = queue.dequeue() 87 | 88 | self.inputs.set_shape(self._placeholders[0].shape) 89 | self.input_lengths.set_shape(self._placeholders[1].shape) 90 | self.mel_targets.set_shape(self._placeholders[2].shape) 91 | self.token_targets.set_shape(self._placeholders[3].shape) 92 | self.linear_targets.set_shape(self._placeholders[4].shape) 93 | self.targets_lengths.set_shape(self._placeholders[5].shape) 94 | 95 | # Create eval queue for buffering eval data 96 | eval_queue = tf.FIFOQueue(1, [tf.int32, tf.int32, tf.float32, tf.float32, tf.float32, tf.int32], name='eval_queue') 97 | self._eval_enqueue_op = eval_queue.enqueue(self._placeholders) 98 | self.eval_inputs, self.eval_input_lengths, self.eval_mel_targets, self.eval_token_targets, \ 99 | self.eval_linear_targets, self.eval_targets_lengths = eval_queue.dequeue() 100 | 101 | self.eval_inputs.set_shape(self._placeholders[0].shape) 102 | self.eval_input_lengths.set_shape(self._placeholders[1].shape) 103 | self.eval_mel_targets.set_shape(self._placeholders[2].shape) 104 | self.eval_token_targets.set_shape(self._placeholders[3].shape) 105 | self.eval_linear_targets.set_shape(self._placeholders[4].shape) 106 | self.eval_targets_lengths.set_shape(self._placeholders[5].shape) 107 | 108 | def start_threads(self, session): 109 | self._session = session 110 | thread = threading.Thread(name='background', target=self._enqueue_next_train_group) 111 | thread.daemon = True #Thread will close when parent quits 112 | thread.start() 113 | 114 | thread = threading.Thread(name='background', target=self._enqueue_next_test_group) 115 | thread.daemon = True #Thread will close when parent quits 116 | thread.start() 117 | 118 | def _get_test_groups(self): 119 | meta = self._test_meta[self._test_offset] 120 | self._test_offset += 1 121 | 122 | text = meta[5] 123 | 124 | input_data = np.asarray(text_to_sequence(text, self._cleaner_names), dtype=np.int32) 125 | mel_target = np.load(os.path.join(self._mel_dir, meta[1])) 126 | #Create parallel sequences containing zeros to represent a non finished sequence 127 | token_target = np.asarray([0.] 
* (len(mel_target) - 1)) 128 | linear_target = np.load(os.path.join(self._linear_dir, meta[2])) 129 | return (input_data, mel_target, token_target, linear_target, len(mel_target)) 130 | 131 | def make_test_batches(self): 132 | start = time.time() 133 | 134 | # Read a group of examples 135 | n = self._hparams.tacotron_batch_size 136 | r = self._hparams.outputs_per_step 137 | 138 | #Test on entire test set 139 | examples = [self._get_test_groups() for i in range(len(self._test_meta))] 140 | 141 | # Bucket examples based on similar output sequence length for efficiency 142 | examples.sort(key=lambda x: x[-1]) 143 | batches = [examples[i: i+n] for i in range(0, len(examples), n)] 144 | np.random.shuffle(batches) 145 | 146 | log('\nGenerated {} test batches of size {} in {:.3f} sec'.format(len(batches), n, time.time() - start)) 147 | return batches, r 148 | 149 | def _enqueue_next_train_group(self): 150 | while not self._coord.should_stop(): 151 | start = time.time() 152 | 153 | # Read a group of examples 154 | n = self._hparams.tacotron_batch_size 155 | r = self._hparams.outputs_per_step 156 | examples = [self._get_next_example() for i in range(n * _batches_per_group)] 157 | 158 | # Bucket examples based on similar output sequence length for efficiency 159 | examples.sort(key=lambda x: x[-1]) 160 | batches = [examples[i: i+n] for i in range(0, len(examples), n)] 161 | np.random.shuffle(batches) 162 | 163 | log('\nGenerated {} train batches of size {} in {:.3f} sec'.format(len(batches), n, time.time() - start)) 164 | for batch in batches: 165 | feed_dict = dict(zip(self._placeholders, self._prepare_batch(batch, r))) 166 | self._session.run(self._enqueue_op, feed_dict=feed_dict) 167 | 168 | def _enqueue_next_test_group(self): 169 | #Create test batches once and evaluate on them for all test steps 170 | test_batches, r = self.make_test_batches() 171 | while not self._coord.should_stop(): 172 | for batch in test_batches: 173 | feed_dict = dict(zip(self._placeholders, self._prepare_batch(batch, r))) 174 | self._session.run(self._eval_enqueue_op, feed_dict=feed_dict) 175 | 176 | def _get_next_example(self): 177 | """Gets a single example (input, mel_target, token_target, linear_target, mel_length) from_ disk 178 | """ 179 | if self._train_offset >= len(self._train_meta): 180 | self._train_offset = 0 181 | np.random.shuffle(self._train_meta) 182 | 183 | meta = self._train_meta[self._train_offset] 184 | self._train_offset += 1 185 | 186 | text = meta[5] 187 | 188 | input_data = np.asarray(text_to_sequence(text, self._cleaner_names), dtype=np.int32) 189 | mel_target = np.load(os.path.join(self._mel_dir, meta[1])) 190 | #Create parallel sequences containing zeros to represent a non finished sequence 191 | token_target = np.asarray([0.] 
* (len(mel_target) - 1)) 192 | linear_target = np.load(os.path.join(self._linear_dir, meta[2])) 193 | return (input_data, mel_target, token_target, linear_target, len(mel_target)) 194 | 195 | 196 | def _prepare_batch(self, batch, outputs_per_step): 197 | np.random.shuffle(batch) 198 | inputs = self._prepare_inputs([x[0] for x in batch]) 199 | input_lengths = np.asarray([len(x[0]) for x in batch], dtype=np.int32) 200 | mel_targets = self._prepare_targets([x[1] for x in batch], outputs_per_step) 201 | #Pad sequences with 1 to infer that the sequence is done 202 | token_targets = self._prepare_token_targets([x[2] for x in batch], outputs_per_step) 203 | linear_targets = self._prepare_targets([x[3] for x in batch], outputs_per_step) 204 | targets_lengths = np.asarray([x[-1] for x in batch], dtype=np.int32) #Used to mask loss 205 | return (inputs, input_lengths, mel_targets, token_targets, linear_targets, targets_lengths) 206 | 207 | def _prepare_inputs(self, inputs): 208 | max_len = max([len(x) for x in inputs]) 209 | return np.stack([self._pad_input(x, max_len) for x in inputs]) 210 | 211 | def _prepare_targets(self, targets, alignment): 212 | max_len = max([len(t) for t in targets]) 213 | return np.stack([self._pad_target(t, self._round_up(max_len, alignment)) for t in targets]) 214 | 215 | def _prepare_token_targets(self, targets, alignment): 216 | max_len = max([len(t) for t in targets]) + 1 217 | return np.stack([self._pad_token_target(t, self._round_up(max_len, alignment)) for t in targets]) 218 | 219 | def _pad_input(self, x, length): 220 | return np.pad(x, (0, length - x.shape[0]), mode='constant', constant_values=self._pad) 221 | 222 | def _pad_target(self, t, length): 223 | return np.pad(t, [(0, length - t.shape[0]), (0, 0)], mode='constant', constant_values=self._target_pad) 224 | 225 | def _pad_token_target(self, t, length): 226 | return np.pad(t, (0, length - t.shape[0]), mode='constant', constant_values=self._token_pad) 227 | 228 | def _round_up(self, x, multiple): 229 | remainder = x % multiple 230 | return x if remainder == 0 else x + multiple - remainder 231 | -------------------------------------------------------------------------------- /wavenet_vocoder/models/modules.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from wavenet_vocoder.util import sequence_mask 4 | from .mixture import discretized_mix_logistic_loss 5 | 6 | class Embedding: 7 | """Embedding class for global conditions. 8 | """ 9 | def __init__(self, num_embeddings, embedding_dim, std=0.1, name='gc_embedding'): 10 | #Create embedding table 11 | self.embedding_table = tf.get_variable(name, 12 | [num_embeddings, embedding_dim], dtype=tf.float32, 13 | initializer=tf.truncated_normal_initializer(mean=0., stddev=std)) 14 | 15 | def __call__(self, inputs): 16 | #Do the actual embedding 17 | return tf.nn.embedding_lookup(self.embedding_table, inputs) 18 | 19 | class ReluActivation: 20 | """Simple class to wrap relu activation function in classe for later call. 21 | """ 22 | def __init__(self, name=None): 23 | self.name = name 24 | 25 | def __call__(self, inputs): 26 | return tf.nn.relu(inputs, name=self.name) 27 | 28 | 29 | class Conv1d1x1(tf.layers.Conv1D): 30 | """Extend tf.layers.Conv1D for dilated layers convolutions. 
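Dilation is implemented by padding and folding rather than by a dilated conv op: _to_dilation left-pads the time axis, then reshapes a [batch_size, channels, time] input into a [batch_size * dilation, (time + padding) / dilation, channels] tensor, so a plain stride-1 convolution over the folded tensor plays the role of a dilated convolution; _from_dilation undoes the folding and crops the extra padding.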
31 | """ 32 | def __init__(self, in_channels, filters, kernel_size=1, padding=None, dilation=1, use_bias=True, name='Conv1d1x1'): 33 | with tf.variable_scope(name) as scope: 34 | #Create variables 35 | kernel_shape = (kernel_size, in_channels, filters) 36 | self.kernel = tf.get_variable( 37 | name='kernel_{}'.format(name), 38 | shape=kernel_shape, 39 | dtype=tf.float32 40 | ) 41 | 42 | if use_bias: 43 | self.bias = tf.get_variable( 44 | name='bias_{}'.format(name), 45 | shape=(filters, ), 46 | initializer=tf.zeros_initializer(), 47 | dtype=tf.float32) 48 | 49 | self.filters = filters 50 | self.in_channels = in_channels 51 | self.dilation_rate = dilation 52 | self.convolution_queue = None 53 | self._linearized_weight = None 54 | self.paddings = None 55 | self.use_bias = use_bias 56 | self.paddings = padding 57 | self.scope = scope 58 | 59 | def set_mode(self, is_training): 60 | self.training = is_training 61 | 62 | def _to_dilation(self, inputs): 63 | '''Pad and reshape inputs by dilation rate. 64 | 65 | Used to perfrom 1D dilation convolution. 66 | ''' 67 | if self.paddings is not None: #dilated conv 68 | assert isinstance(self.paddings, int) 69 | inputs_padded = tf.pad(inputs, [[0, 0], [0, 0], [self.paddings, 0]], "CONSTANT") 70 | 71 | #inputs are channels first 72 | inputs_shape = tf.shape(inputs_padded) 73 | channels = inputs_shape[1] 74 | width_pad = inputs_shape[-1] 75 | 76 | dilation_shape = (width_pad // self.dilation_rate, -1, channels) #-1 refers to batch_size * dilation_rate 77 | #[width_pad, batch_size, channels] 78 | inputs_transposed = tf.transpose(inputs_padded, [2, 0, 1]) 79 | #[width_pad / dilation_rate, batch_size * dilation_rate, channels] 80 | inputs_reshaped = tf.reshape(inputs_transposed, dilation_shape) 81 | #[batch_size * dilation_rate, width_pad / dilation_rate, channels] 82 | outputs = tf.transpose(inputs_reshaped, [1, 0, 2]) 83 | 84 | else: #Simple channels first convolution 85 | outputs = tf.transpose(inputs, [0, 2, 1]) 86 | 87 | return outputs 88 | 89 | def _from_dilation(self, inputs, crop): 90 | '''Remove paddings and reshape to 1d signal. 91 | 92 | Used after 1D dilation convolution. 
93 | ''' 94 | if self.paddings is not None: #dilated conv 95 | assert isinstance(self.paddings, int) 96 | #inputs: [batch_size * dilation_rate, width_pad / dilation_rate, channels] 97 | inputs_shape = tf.shape(inputs) 98 | batch_size = inputs_shape[0] / self.dilation_rate 99 | width_pad = inputs_shape[1] * self.dilation_rate 100 | channels = inputs_shape[-1] 101 | new_shape = (width_pad, -1, channels) #-1 refers to batch_size 102 | 103 | #[width_pad / dilation_rate, batch_size * dilation_rate, channels] 104 | inputs_transposed = tf.transpose(inputs, [1, 0, 2]) 105 | #[width_pad, batch_size, channels] 106 | inputs_reshaped = tf.reshape(inputs_transposed, new_shape) 107 | #[batch_size, channels, width_pad] 108 | outputs = tf.transpose(inputs_reshaped, [1, 2, 0]) 109 | #[batch_size, channels, width] 110 | cropped = tf.slice(outputs, [0, 0, crop], [-1, -1, -1]) 111 | 112 | else: #Simple channels first convolution 113 | cropped = tf.transpose(inputs, [0, 2, 1]) 114 | 115 | return cropped 116 | 117 | 118 | def __call__(self, inputs): 119 | '''During this call, we change to channel last scheme for a better generalization and easier bias computation 120 | ''' 121 | with tf.variable_scope(self.scope): 122 | #Reshape to dilated conv mode (if this instance is of a dilated convolution) 123 | inputs_ = self._to_dilation(inputs) 124 | 125 | outputs_ = tf.nn.conv1d(inputs_, self.kernel, 126 | stride=1, padding='VALID', data_format='NWC') 127 | 128 | if self.use_bias: 129 | outputs_ = tf.nn.bias_add(outputs_, self.bias) 130 | 131 | #Reshape back ((if this instance is of a dilated convolution)) 132 | diff = tf.shape(outputs_)[1] * self.dilation_rate - tf.shape(inputs)[-1] 133 | outputs = self._from_dilation(outputs_, crop=diff) 134 | 135 | #Make sure that outputs have same time steps as inputs 136 | #[batch_size, channels(filters), width] 137 | with tf.control_dependencies([tf.assert_equal(tf.shape(outputs)[-1], tf.shape(inputs)[-1])]): 138 | outputs = tf.identity(outputs, name='output_equal_input_time_assert') 139 | 140 | return outputs 141 | 142 | def incremental_step(self, inputs): 143 | '''At sequential inference times: 144 | we adopt fast wavenet convolution queues by saving precomputed states for faster generation 145 | 146 | inputs: [batch_size, time_length, channels] ('NWC')! Channels last! 
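Maintains self.convolution_queue, a rolling buffer holding the current frame plus the previous (kernel_size - 1) * dilation frames; each call shifts the buffer, appends the newest frame, subsamples it by the dilation rate, and applies the linearized kernel as a single matmul, producing one [batch_size, 1, filters] output step.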
147 | ''' 148 | with tf.variable_scope(self.scope): 149 | #input: [batch_size, time_length, channels] 150 | if self.training: 151 | raise RuntimeError('incremental_step only supports eval mode') 152 | 153 | #reshape weight 154 | weight = self._get_linearized_weight(inputs) 155 | kw = self.kernel.shape[0] 156 | dilation = self.dilation_rate 157 | 158 | batch_size = tf.shape(inputs)[0] 159 | #Fast dilation 160 | #Similar to using tf FIFOQueue to schedule states of dilated convolutions 161 | if kw > 1: 162 | if self.convolution_queue is None: 163 | self.convolution_queue = tf.zeros((batch_size, (kw - 1) + (kw - 1) * (dilation - 1), tf.shape(inputs)[2])) 164 | else: 165 | #shift queue 166 | self.convolution_queue = self.convolution_queue[:, 1:, :] 167 | 168 | #append next input 169 | self.convolution_queue = tf.concat([self.convolution_queue, tf.expand_dims(inputs[:, -1, :], axis=1)], axis=1) 170 | #self.convolution_queue[:, -1, :] = inputs[:, -1, :] 171 | inputs = self.convolution_queue 172 | if dilation > 1: 173 | inputs = inputs[:, 0::dilation, :] 174 | 175 | #Compute step prediction 176 | output = tf.matmul(tf.reshape(inputs, [batch_size, -1]), weight) 177 | if self.use_bias: 178 | output = tf.nn.bias_add(output, self.bias) 179 | 180 | #[batch_size, 1(time_step), channels(filters)] 181 | return tf.reshape(output, [batch_size, 1, self.filters]) 182 | 183 | def _get_linearized_weight(self, inputs): 184 | if self._linearized_weight is None: 185 | kw = self.kernel.shape[0] 186 | #layers.Conv1D 187 | if tf.shape(self.kernel) == (self.filters, self.in_channels, kw): 188 | #[filters, in, kw] 189 | weight = tf.transpose(self.kernel, [2, 1, 0]) 190 | else: 191 | #[kw, in, filters] 192 | weight = self.kernel 193 | 194 | #[kw, in, filters] 195 | assert weight.shape == (kw, self.in_channels, self.filters) 196 | self._linearized_weight = tf.cast(tf.reshape(weight, [-1, self.filters]), dtype=inputs.dtype) 197 | return self._linearized_weight 198 | 199 | def clear_queue(self): 200 | self.convolution_queue = None 201 | 202 | def _conv1x1_forward(conv, x, is_incremental): 203 | """conv1x1 step 204 | """ 205 | if is_incremental: 206 | return conv.incremental_step(x) 207 | else: 208 | return conv(x) 209 | 210 | class ResidualConv1dGLU(): 211 | '''Residual dilated conv1d + Gated Linear Unit 212 | ''' 213 | 214 | def __init__(self, residual_channels, gate_channels, kernel_size, 215 | skip_out_channels=None, cin_channels=-1, gin_channels=-1, 216 | dropout=1 - .95, padding=None, dilation=1, causal=True, 217 | use_bias=True, name='ResidualConv1dGLU'): 218 | self.dropout = dropout 219 | 220 | if skip_out_channels is None: 221 | skip_out_channels = residual_channels 222 | 223 | if padding is None: 224 | #No future time stamps available 225 | if causal: 226 | padding = (kernel_size - 1) * dilation 227 | else: 228 | padding = (kernel_size - 1) // 2 * dilation 229 | 230 | self.causal = causal 231 | 232 | self.conv = Conv1d1x1(residual_channels, gate_channels, kernel_size, 233 | padding=padding, dilation=dilation, use_bias=use_bias, name='residual_block_conv') 234 | 235 | #Local conditioning 236 | if cin_channels > 0: 237 | self.conv1x1c = Conv1d1x1(cin_channels, gate_channels, 238 | use_bias=use_bias, name='residual_block_cin_conv') 239 | else: 240 | self.conv1x1c = None 241 | 242 | #Global conditioning 243 | if gin_channels > 0: 244 | self.conv1x1g = Conv1d1x1(gin_channels, gate_channels, 245 | use_bias=use_bias, name='residual_block_gin_conv') 246 | else: 247 | self.conv1x1g = None 248 | 249 | gate_out_channels = 
gate_channels // 2 250 | self.conv1x1_out = Conv1d1x1(gate_out_channels, residual_channels, use_bias=use_bias, name='residual_block_out_conv') 251 | self.conv1x1_skip = Conv1d1x1(gate_out_channels, skip_out_channels, use_bias=use_bias, name='residual_block_skip_conv') 252 | 253 | def set_mode(self, is_training): 254 | for conv in [self.conv, self.conv1x1c, self.conv1x1g, self.conv1x1_out, self.conv1x1_skip]: 255 | try: 256 | conv.set_mode(is_training) 257 | except AttributeError: 258 | pass 259 | 260 | def __call__(self, x, c=None, g=None): 261 | return self.step(x, c, g, False) 262 | 263 | def incremental_step(self, x, c=None, g=None): 264 | return self.step(x, c, g, True) 265 | 266 | def step(self, x, c, g, is_incremental): 267 | '''One step of the residual dilated conv + GLU block (normal or incremental mode). 268 | 269 | Args: 270 | x: Tensor [batch_size, channels, time_length] 271 | c: Tensor [batch_size, c_channels, time_length]. Local conditioning features 272 | g: Tensor [batch_size, g_channels, time_length], global conditioning features 273 | is_incremental: Boolean, whether incremental mode is on 274 | Returns: 275 | Tensor output 276 | ''' 277 | residual = x 278 | x = tf.layers.dropout(x, rate=self.dropout, training=not is_incremental) 279 | if is_incremental: 280 | splitdim = -1 281 | x = self.conv.incremental_step(x) 282 | else: 283 | splitdim = 1 284 | x = self.conv(x) 285 | #Remove future time steps 286 | x = x[:, :, :tf.shape(residual)[-1]] if self.causal else x 287 | 288 | a, b = tf.split(x, num_or_size_splits=2, axis=splitdim) 289 | 290 | #local conditioning 291 | if c is not None: 292 | assert self.conv1x1c is not None 293 | c = _conv1x1_forward(self.conv1x1c, c, is_incremental) 294 | ca, cb = tf.split(c, num_or_size_splits=2, axis=splitdim) 295 | a, b = a + ca, b + cb 296 | 297 | #global conditioning 298 | if g is not None: 299 | assert self.conv1x1g is not None 300 | g = _conv1x1_forward(self.conv1x1g, g, is_incremental) 301 | ga, gb = tf.split(g, num_or_size_splits=2, axis=splitdim) 302 | a, b = a + ga, b + gb 303 | 304 | x = tf.nn.tanh(a) * tf.nn.sigmoid(b) 305 | #For Skip connection 306 | s = _conv1x1_forward(self.conv1x1_skip, x, is_incremental) 307 | 308 | #For Residual connection 309 | x = _conv1x1_forward(self.conv1x1_out, x, is_incremental) 310 | 311 | x = (x + residual) * tf.sqrt(0.5) 312 | return x, s 313 | 314 | def clear_queue(self): 315 | for conv in [self.conv, self.conv1x1_out, self.conv1x1_skip, 316 | self.conv1x1c, self.conv1x1g]: 317 | if conv is not None: 318 | conv.clear_queue() 319 | 320 | 321 | class ConvTranspose2d: 322 | def __init__(self, filters, kernel_size, freq_axis_kernel_size, padding, strides): 323 | self.convt = tf.layers.Conv2DTranspose( 324 | filters=filters, 325 | kernel_size=kernel_size, 326 | strides=strides, 327 | padding=padding, 328 | kernel_initializer=tf.constant_initializer(1 / freq_axis_kernel_size, dtype=tf.float32), 329 | bias_initializer=tf.zeros_initializer(), 330 | data_format='channels_first') 331 | 332 | def __call__(self, inputs): 333 | return self.convt(inputs) 334 | 335 | 336 | 337 | def MaskedCrossEntropyLoss(outputs, targets, lengths=None, mask=None, max_len=None): 338 | if lengths is None and mask is None: 339 | raise RuntimeError('Please provide either lengths or mask') 340 | 341 | #[batch_size, time_length] 342 | if mask is None: 343 | mask = sequence_mask(lengths, max_len, False) 344 | 345 | #One hot encode targets (outputs.shape[-1] = hparams.quantize_channels) 346 | targets_ = tf.one_hot(targets, depth=tf.shape(outputs)[-1]) 347 | 348 | with
tf.control_dependencies([tf.assert_equal(tf.shape(outputs), tf.shape(targets_))]): 349 | losses = tf.nn.softmax_cross_entropy_with_logits_v2(logits=outputs, labels=targets_) 350 | 351 | with tf.control_dependencies([tf.assert_equal(tf.shape(mask), tf.shape(losses))]): 352 | masked_loss = losses * mask 353 | 354 | return tf.reduce_sum(masked_loss) / tf.count_nonzero(masked_loss, dtype=tf.float32) 355 | 356 | def DiscretizedMixtureLogisticLoss(outputs, targets, hparams, lengths=None, mask=None, max_len=None): 357 | if lengths is None and mask is None: 358 | raise RuntimeError('Please provide either lengths or mask') 359 | 360 | #[batch_size, time_length, 1] 361 | if mask is None: 362 | mask = sequence_mask(lengths, max_len, True) 363 | 364 | #[batch_size, time_length, dimension] 365 | ones = tf.ones([tf.shape(mask)[0], tf.shape(mask)[1], tf.shape(targets)[-1]], tf.float32) 366 | mask_ = mask * ones 367 | 368 | losses = discretized_mix_logistic_loss( 369 | outputs, targets, num_classes=hparams.quantize_channels, 370 | log_scale_min=hparams.log_scale_min, reduce=False) 371 | 372 | with tf.control_dependencies([tf.assert_equal(tf.shape(losses), tf.shape(targets))]): 373 | return tf.reduce_sum(losses * mask_) / tf.reduce_sum(mask_) -------------------------------------------------------------------------------- /tacotron/train.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from datetime import datetime 3 | import os 4 | import subprocess 5 | import time 6 | import tensorflow as tf 7 | import traceback 8 | import argparse 9 | 10 | from tacotron.feeder import Feeder 11 | from hparams import hparams_debug_string 12 | from tacotron.models import create_model 13 | from tacotron.utils.text import sequence_to_text 14 | from tacotron.utils import plot, ValueWindow 15 | import infolog 16 | from datasets import audio 17 | from tqdm import tqdm 18 | 19 | log = infolog.log 20 | 21 | 22 | def add_train_stats(model, hparams): 23 | with tf.variable_scope('stats') as scope: 24 | tf.summary.histogram('mel_outputs', model.mel_outputs) 25 | tf.summary.histogram('mel_targets', model.mel_targets) 26 | tf.summary.scalar('before_loss', model.before_loss) 27 | tf.summary.scalar('after_loss', model.after_loss) 28 | if hparams.predict_linear: 29 | tf.summary.scalar('linear_loss', model.linear_loss) 30 | tf.summary.scalar('regularization_loss', model.regularization_loss) 31 | tf.summary.scalar('stop_token_loss', model.stop_token_loss) 32 | tf.summary.scalar('loss', model.loss) 33 | tf.summary.scalar('learning_rate', model.learning_rate) #Control learning rate decay speed 34 | if hparams.tacotron_teacher_forcing_mode == 'scheduled': 35 | tf.summary.scalar('teacher_forcing_ratio', model.ratio) #Control teacher forcing ratio decay when mode = 'scheduled' 36 | gradient_norms = [tf.norm(grad) for grad in model.gradients] 37 | tf.summary.histogram('gradient_norm', gradient_norms) 38 | tf.summary.scalar('max_gradient_norm', tf.reduce_max(gradient_norms)) #visualize gradients (in case of explosion) 39 | return tf.summary.merge_all() 40 | 41 | def add_eval_stats(summary_writer, step, linear_loss, before_loss, after_loss, stop_token_loss, loss): 42 | values = [ 43 | tf.Summary.Value(tag='eval_model/eval_stats/eval_before_loss', simple_value=before_loss), 44 | tf.Summary.Value(tag='eval_model/eval_stats/eval_after_loss', simple_value=after_loss), 45 | tf.Summary.Value(tag='eval_model/eval_stats/stop_token_loss', simple_value=stop_token_loss), 46 | 
tf.Summary.Value(tag='eval_model/eval_stats/eval_loss', simple_value=loss), 47 | ] 48 | if linear_loss is not None: 49 | values.append(tf.Summary.Value(tag='model/eval_stats/eval_linear_loss', simple_value=linear_loss)) 50 | test_summary = tf.Summary(value=values) 51 | summary_writer.add_summary(test_summary, step) 52 | 53 | def time_string(): 54 | return datetime.now().strftime('%Y-%m-%d %H:%M') 55 | 56 | def model_train_mode(args, feeder, hparams, global_step): 57 | with tf.variable_scope('model', reuse=tf.AUTO_REUSE) as scope: 58 | model_name = None 59 | if args.model in ('Tacotron-2', 'Both'): 60 | model_name = 'Tacotron' 61 | model = create_model(model_name or args.model, hparams) 62 | if hparams.predict_linear: 63 | model.initialize(feeder.inputs, feeder.input_lengths, feeder.mel_targets, feeder.token_targets, linear_targets=feeder.linear_targets, 64 | targets_lengths=feeder.targets_lengths, global_step=global_step, 65 | is_training=True) 66 | else: 67 | model.initialize(feeder.inputs, feeder.input_lengths, feeder.mel_targets, feeder.token_targets, 68 | targets_lengths=feeder.targets_lengths, global_step=global_step, 69 | is_training=True) 70 | model.add_loss() 71 | model.add_optimizer(global_step) 72 | stats = add_train_stats(model, hparams) 73 | return model, stats 74 | 75 | def model_test_mode(args, feeder, hparams, global_step): 76 | with tf.variable_scope('model', reuse=tf.AUTO_REUSE) as scope: 77 | model_name = None 78 | if args.model in ('Tacotron-2', 'Both'): 79 | model_name = 'Tacotron' 80 | model = create_model(model_name or args.model, hparams) 81 | if hparams.predict_linear: 82 | model.initialize(feeder.eval_inputs, feeder.eval_input_lengths, feeder.eval_mel_targets, feeder.eval_token_targets, 83 | linear_targets=feeder.eval_linear_targets, targets_lengths=feeder.eval_targets_lengths, global_step=global_step, 84 | is_training=False, is_evaluating=True) 85 | else: 86 | model.initialize(feeder.eval_inputs, feeder.eval_input_lengths, feeder.eval_mel_targets, feeder.eval_token_targets, 87 | targets_lengths=feeder.eval_targets_lengths, global_step=global_step, is_training=False, is_evaluating=True) 88 | model.add_loss() 89 | return model 90 | 91 | def train(log_dir, args, hparams): 92 | save_dir = os.path.join(log_dir, 'taco_pretrained/') 93 | checkpoint_path = os.path.join(save_dir, 'tacotron_model.ckpt') 94 | input_path = os.path.join(args.base_dir, args.tacotron_input) 95 | plot_dir = os.path.join(log_dir, 'plots') 96 | wav_dir = os.path.join(log_dir, 'wavs') 97 | mel_dir = os.path.join(log_dir, 'mel-spectrograms') 98 | eval_dir = os.path.join(log_dir, 'eval-dir') 99 | eval_plot_dir = os.path.join(eval_dir, 'plots') 100 | eval_wav_dir = os.path.join(eval_dir, 'wavs') 101 | os.makedirs(eval_dir, exist_ok=True) 102 | os.makedirs(plot_dir, exist_ok=True) 103 | os.makedirs(wav_dir, exist_ok=True) 104 | os.makedirs(mel_dir, exist_ok=True) 105 | os.makedirs(eval_plot_dir, exist_ok=True) 106 | os.makedirs(eval_wav_dir, exist_ok=True) 107 | 108 | if hparams.predict_linear: 109 | linear_dir = os.path.join(log_dir, 'linear-spectrograms') 110 | os.makedirs(linear_dir, exist_ok=True) 111 | 112 | log('Checkpoint path: {}'.format(checkpoint_path)) 113 | log('Loading training data from: {}'.format(input_path)) 114 | log('Using model: {}'.format(args.model)) 115 | log(hparams_debug_string()) 116 | 117 | #Start by setting a seed for repeatability 118 | tf.set_random_seed(hparams.tacotron_random_seed) 119 | 120 | #Set up data feeder 121 | coord = tf.train.Coordinator() 122 | with 
tf.variable_scope('datafeeder') as scope: 123 | feeder = Feeder(coord, input_path, hparams) 124 | 125 | #Set up model: 126 | global_step = tf.Variable(0, name='global_step', trainable=False) 127 | model, stats = model_train_mode(args, feeder, hparams, global_step) 128 | eval_model = model_test_mode(args, feeder, hparams, global_step) 129 | 130 | #Book keeping 131 | step = 0 132 | time_window = ValueWindow(100) 133 | loss_window = ValueWindow(100) 134 | saver = tf.train.Saver(max_to_keep=5) 135 | 136 | log('Tacotron training set to a maximum of {} steps'.format(args.tacotron_train_steps)) 137 | 138 | #Memory allocation on the GPU as needed 139 | config = tf.ConfigProto() 140 | config.gpu_options.allow_growth = True 141 | 142 | #Train 143 | with tf.Session(config=config) as sess: 144 | try: 145 | summary_writer = tf.summary.FileWriter(log_dir, sess.graph) 146 | sess.run(tf.global_variables_initializer()) 147 | checkpoint_state=None 148 | #saved model restoring 149 | if args.restore: 150 | #Restore saved model if the user requested it, Default = True. 151 | try: 152 | checkpoint_state = tf.train.get_checkpoint_state(save_dir) 153 | except tf.errors.OutOfRangeError as e: 154 | log('Cannot restore checkpoint: {}'.format(e)) 155 | 156 | if (checkpoint_state and checkpoint_state.model_checkpoint_path): 157 | log('Loading checkpoint {}'.format(checkpoint_state.model_checkpoint_path)) 158 | saver.restore(sess, checkpoint_state.model_checkpoint_path) 159 | 160 | else: 161 | if not args.restore: 162 | log('Starting new training!') 163 | else: 164 | log('No model to load at {}'.format(save_dir)) 165 | 166 | #initializing feeder 167 | feeder.start_threads(sess) 168 | 169 | #Training loop 170 | while not coord.should_stop() and step < args.tacotron_train_steps: 171 | start_time = time.time() 172 | step, loss, opt = sess.run([global_step, model.loss, model.optimize]) 173 | time_window.append(time.time() - start_time) 174 | loss_window.append(loss) 175 | message = 'Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]'.format( 176 | step, time_window.average, loss, loss_window.average) 177 | log(message, end='\r') 178 | 179 | if np.isnan(loss): 180 | log('Loss exploded to {:.5f} at step {}'.format(loss, step)) 181 | raise Exception('Loss exploded') 182 | 183 | if step % args.summary_interval == 0: 184 | log('\nWriting summary at step {}'.format(step)) 185 | summary_writer.add_summary(sess.run(stats), step) 186 | 187 | if step % args.eval_interval == 0: 188 | #Run eval and save eval stats 189 | log('\nRunning evaluation at step {}'.format(step)) 190 | 191 | eval_losses = [] 192 | before_losses = [] 193 | after_losses = [] 194 | stop_token_losses = [] 195 | linear_losses = [] 196 | linear_loss = None 197 | 198 | if hparams.predict_linear: 199 | for i in tqdm(range(feeder.test_steps)): 200 | eloss, before_loss, after_loss, stop_token_loss, linear_loss, mel_p, mel_t, t_len, align, lin_p = sess.run( 201 | [eval_model.loss, eval_model.before_loss, eval_model.after_loss, 202 | eval_model.stop_token_loss, eval_model.linear_loss, eval_model.mel_outputs[0], 203 | eval_model.mel_targets[0], eval_model.targets_lengths[0], 204 | eval_model.alignments[0], eval_model.linear_outputs[0]]) 205 | print(i) 206 | eval_losses.append(eloss) 207 | before_losses.append(before_loss) 208 | after_losses.append(after_loss) 209 | stop_token_losses.append(stop_token_loss) 210 | linear_losses.append(linear_loss) 211 | linear_loss = sum(linear_losses) / len(linear_losses) 212 | 213 | wav = audio.inv_linear_spectrogram(lin_p.T, 
hparams) 214 | audio.save_wav(wav, os.path.join(eval_wav_dir, 'step-{}-eval-waveform-linear.wav'.format(step)), sr=hparams.sample_rate) 215 | else: 216 | for i in tqdm(range(feeder.test_steps)): 217 | eloss, before_loss, after_loss, stop_token_loss, mel_p, mel_t, t_len, align = sess.run( 218 | [eval_model.loss, eval_model.before_loss, eval_model.after_loss, 219 | eval_model.stop_token_loss, eval_model.mel_outputs[0], eval_model.mel_targets[0], 220 | eval_model.targets_lengths[0], eval_model.alignments[0]]) 221 | eval_losses.append(eloss) 222 | before_losses.append(before_loss) 223 | after_losses.append(after_loss) 224 | stop_token_losses.append(stop_token_loss) 225 | 226 | eval_loss = sum(eval_losses) / len(eval_losses) 227 | before_loss = sum(before_losses) / len(before_losses) 228 | after_loss = sum(after_losses) / len(after_losses) 229 | stop_token_loss = sum(stop_token_losses) / len(stop_token_losses) 230 | 231 | log('Saving eval log to {}..'.format(eval_dir)) 232 | #Save some log to monitor model improvement on same unseen sequence 233 | wav = audio.inv_mel_spectrogram(mel_p.T, hparams) 234 | audio.save_wav(wav, os.path.join(eval_wav_dir, 'step-{}-eval-waveform-mel.wav'.format(step)), sr=hparams.sample_rate) 235 | 236 | plot.plot_alignment(align, os.path.join(eval_plot_dir, 'step-{}-eval-align.png'.format(step)), 237 | info='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, eval_loss), 238 | max_len=t_len // hparams.outputs_per_step) 239 | plot.plot_spectrogram(mel_p, os.path.join(eval_plot_dir, 'step-{}-eval-mel-spectrogram.png'.format(step)), 240 | info='{}, {}, step={}, loss={:.5}'.format(args.model, time_string(), step, eval_loss), target_spectrogram=mel_t, 241 | max_len=t_len) 242 | 243 | log('Eval loss for global step {}: {:.3f}'.format(step, eval_loss)) 244 | log('Writing eval summary!') 245 | add_eval_stats(summary_writer, step, linear_loss, before_loss, after_loss, stop_token_loss, eval_loss) 246 | 247 | 248 | if step % args.checkpoint_interval == 0 or step == args.tacotron_train_steps: 249 | #Save model and current global step 250 | saver.save(sess, checkpoint_path, global_step=global_step) 251 | 252 | log('\nSaving alignment, Mel-Spectrograms and griffin-lim inverted waveform..') 253 | if hparams.predict_linear: 254 | input_seq, mel_prediction, linear_prediction, alignment, target, target_length = sess.run([ 255 | model.inputs[0], 256 | model.mel_outputs[0], 257 | model.linear_outputs[0], 258 | model.alignments[0], 259 | model.mel_targets[0], 260 | model.targets_lengths[0], 261 | ]) 262 | 263 | #save predicted linear spectrogram to disk (debug) 264 | linear_filename = 'linear-prediction-step-{}.npy'.format(step) 265 | np.save(os.path.join(linear_dir, linear_filename), linear_prediction.T, allow_pickle=False) 266 | 267 | #save griffin lim inverted wav for debug (linear -> wav) 268 | wav = audio.inv_linear_spectrogram(linear_prediction.T, hparams) 269 | audio.save_wav(wav, os.path.join(wav_dir, 'step-{}-wave-from-linear.wav'.format(step)), sr=hparams.sample_rate) 270 | 271 | else: 272 | input_seq, mel_prediction, alignment, target, target_length = sess.run([model.inputs[0], 273 | model.mel_outputs[0], 274 | model.alignments[0], 275 | model.mel_targets[0], 276 | model.targets_lengths[0], 277 | ]) 278 | 279 | #save predicted mel spectrogram to disk (debug) 280 | mel_filename = 'mel-prediction-step-{}.npy'.format(step) 281 | np.save(os.path.join(mel_dir, mel_filename), mel_prediction.T, allow_pickle=False) 282 | 283 | #save griffin lim inverted wav for debug 
(mel -> wav) 284 | wav = audio.inv_mel_spectrogram(mel_prediction.T, hparams) 285 | audio.save_wav(wav, os.path.join(wav_dir, 'step-{}-wave-from-mel.wav'.format(step)), sr=hparams.sample_rate) 286 | 287 | #save alignment plot to disk (control purposes) 288 | plot.plot_alignment(alignment, os.path.join(plot_dir, 'step-{}-align.png'.format(step)), 289 | info='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, loss), 290 | max_len=target_length // hparams.outputs_per_step) 291 | #save real and predicted mel-spectrogram plot to disk (control purposes) 292 | plot.plot_spectrogram(mel_prediction, os.path.join(plot_dir, 'step-{}-mel-spectrogram.png'.format(step)), 293 | info='{}, {}, step={}, loss={:.5}'.format(args.model, time_string(), step, loss), target_spectrogram=target, 294 | max_len=target_length) 295 | log('Input at step {}: {}'.format(step, sequence_to_text(input_seq))) 296 | 297 | log('Tacotron training complete after {} global steps!'.format(args.tacotron_train_steps)) 298 | return save_dir 299 | 300 | except Exception as e: 301 | log('Exiting due to exception: {}'.format(e)) 302 | traceback.print_exc() 303 | coord.request_stop(e) 304 | 305 | def tacotron_train(args, log_dir, hparams): 306 | return train(log_dir, args, hparams) 307 | -------------------------------------------------------------------------------- /hparams.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | 5 | # Default hyperparameters 6 | hparams = tf.contrib.training.HParams( 7 | # Comma-separated list of cleaners to run on text prior to training and eval. For non-English 8 | # text, you may want to use "basic_cleaners" or "transliteration_cleaners". 9 | cleaners='english_cleaners', 10 | 11 | #Hardware setup (TODO: multi-GPU parallel tacotron training) 12 | use_all_gpus = False, #Whether to use all GPU resources. If True, total number of available gpus will override num_gpus. 13 | num_gpus = 1, #Determines the number of gpus in use 14 | ########################################################################################################################################### 15 | 16 | #Audio 17 | num_mels = 80, #Number of mel-spectrogram channels and local conditioning dimensionality 18 | num_freq = 513, # (= n_fft / 2 + 1) only used when adding linear spectrograms post processing network 19 | rescale = True, #Whether to rescale audio prior to preprocessing 20 | rescaling_max = 0.999, #Rescaling value 21 | trim_silence = True, #Whether to clip silence in Audio (at beginning and end of audio only, not the middle) 22 | clip_mels_length = True, #For cases of OOM (Not really recommended, working on a workaround) 23 | max_mel_frames = 900, #Only relevant when clip_mels_length = True 24 | 25 | # Use LWS (https://github.com/Jonathan-LeRoux/lws) for STFT and phase reconstruction 26 | # It's preferred to set True to use with https://github.com/r9y9/wavenet_vocoder 27 | # Does not work if n_ffit is not multiple of hop_size!! 
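# With the defaults below (n_fft = 1024, hop_size = 256), 1024 = 4 * 256, so n_fft is a multiple of hop_size and this constraint is satisfied.
# A quick sanity check (sketch, assuming this hparams object): assert hparams.n_fft % hparams.hop_size == 0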
28 | use_lws=True, 29 | silence_threshold=2, #silence threshold used for sound trimming for wavenet preprocessing 30 | 31 | #Mel spectrogram 32 | n_fft = 1024, #Extra window size is filled with 0 paddings to match this parameter 33 | hop_size = 256, #For 22050Hz, 275 ~= 12.5 ms 34 | win_size = None, #For 22050Hz, 1100 ~= 50 ms (If None, win_size = n_fft) 35 | sample_rate = 22050, #22050 Hz (corresponding to ljspeech dataset) 36 | frame_shift_ms = None, 37 | 38 | #M-AILABS (and other datasets) trim params 39 | trim_fft_size = 512, 40 | trim_hop_size = 128, 41 | trim_top_db = 60, 42 | 43 | #Mel and Linear spectrograms normalization/scaling and clipping 44 | signal_normalization = True, 45 | allow_clipping_in_normalization = True, #Only relevant if mel_normalization = True 46 | symmetric_mels = True, #Whether to scale the data to be symmetric around 0 47 | max_abs_value = 4., #max absolute value of data. If symmetric, data will be [-max, max] else [0, max] 48 | 49 | #Global style token 50 | use_gst=True, # When false, the scripit will do as the paper "Towards End-to-End Prosody Transfer for Expressive Speech Synthesis with Tacotron" 51 | num_gst=10, 52 | num_heads=4, # Head number for multi-head attention 53 | style_embed_depth=256, 54 | reference_filters=[32, 32, 64, 64, 128, 128], 55 | reference_depth=128, 56 | style_att_type="mlp_attention", # Attention type for style attention module (dot_attention, mlp_attention) 57 | style_att_dim=128, 58 | 59 | #Limits 60 | min_level_db = -100, 61 | ref_level_db = 20, 62 | fmin = 25, #Set this to 75 if your speaker is male! if female, 125 should help taking off noise. (To test depending on dataset) 63 | fmax = 7600, 64 | 65 | #Griffin Lim 66 | power = 1.2, 67 | griffin_lim_iters = 60, 68 | ########################################################################################################################################### 69 | 70 | #Tacotron 71 | outputs_per_step = 2, #number of frames to generate at each decoding step (speeds up computation and allows for higher batch size) 72 | stop_at_any = True, #Determines whether the decoder should stop when predicting to any frame or to all of them 73 | 74 | embedding_dim = 512, #dimension of embedding space 75 | 76 | enc_conv_num_layers = 3, #number of encoder convolutional layers 77 | enc_conv_kernel_size = (5, ), #size of encoder convolution filters for each layer 78 | enc_conv_channels = 512, # number of encoder convolutions filters for each layer 79 | encoder_lstm_units = 256, #number of lstm units for each direction (forward and backward) 80 | 81 | smoothing = False, #Whether to smooth the attention normalization function 82 | attention_dim = 128, #dimension of attention space 83 | attention_filters = 32, #number of attention convolution filters 84 | attention_kernel = (31, ), #kernel size of attention convolution 85 | cumulative_weights = True, #Whether to cumulate (sum) all previous attention weights or simply feed previous weights (Recommended: True) 86 | 87 | prenet_layers = [256, 256], #number of layers and number of units of prenet 88 | decoder_layers = 2, #number of decoder lstm layers 89 | decoder_lstm_units = 1024, #number of decoder lstm units on each layer 90 | max_iters = 2500, #Max decoder steps during inference (Just for safety from infinite loop cases) 91 | 92 | postnet_num_layers = 5, #number of postnet convolutional layers 93 | postnet_kernel_size = (5, ), #size of postnet convolution filters for each layer 94 | postnet_channels = 512, #number of postnet convolution filters for each 
layer 95 | 96 | mask_encoder = True, #whether to mask encoder padding while computing attention 97 | mask_decoder = True, #Whether to use loss mask for padded sequences (if False, loss function will not be weighted, else recommended pos_weight = 20) 98 | 99 | cross_entropy_pos_weight = 20, #Use class weights to reduce the stop token classes imbalance (by adding more penalty on False Negatives (FN)) (1 = disabled) 100 | predict_linear = False, #Whether to add a post-processing network to the Tacotron to predict linear spectrograms (True mode Not tested!!) 101 | ########################################################################################################################################### 102 | 103 | 104 | #Wavenet 105 | # Input type: 106 | # 1. raw [-1, 1] 107 | # 2. mulaw [-1, 1] 108 | # 3. mulaw-quantize [0, mu] 109 | # If input_type is raw or mulaw, network assumes scalar input and 110 | # discretized mixture of logistic distributions output, otherwise one-hot 111 | # input and softmax output are assumed. 112 | input_type="mulaw", 113 | quantize_channels=256, # 65536 (16-bit) (raw) or 256 (8-bit) (mulaw or mulaw-quantize) // number of classes = 256 <=> mu = 255 114 | 115 | log_scale_min=float(np.log(1e-14)), #Mixture of logistic distributions minimal log scale 116 | 117 | out_channels = 10 * 3, #This should be equal to quantize channels when input type is 'mulaw-quantize' else: num_distributions * 3 (prob, mean, log_scale) 118 | layers = 24, #Number of dilated convolutions (Default: Simplified Wavenet of Tacotron-2 paper) 119 | stacks = 4, #Number of dilated convolution stacks (Default: Simplified Wavenet of Tacotron-2 paper) 120 | residual_channels = 512, 121 | gate_channels = 512, #split in 2 in gated convolutions 122 | skip_out_channels = 256, 123 | kernel_size = 3, 124 | 125 | cin_channels = 80, #Set this to -1 to disable local conditioning, else it must be equal to num_mels!! 126 | upsample_conditional_features = True, #Whether to repeat conditional features or upsample them (The latter is recommended) 127 | upsample_scales = [16, 16], #prod(scales) should be equal to hop size 128 | freq_axis_kernel_size = 3, 129 | 130 | gin_channels = -1, #Set this to -1 to disable global conditioning, Only used for multi speaker dataset 131 | use_bias = True, #Whether to use bias in convolutional layers of the Wavenet 132 | 133 | max_time_sec = None, 134 | max_time_steps = 13000, #Max time steps in audio used to train wavenet (decrease to save memory) 135 | ########################################################################################################################################### 136 | 137 | #Tacotron Training 138 | tacotron_random_seed = 5339, #Determines initial graph and operations (i.e: model) random state for reproducibility 139 | tacotron_swap_with_cpu = False, #Whether to use cpu as support to gpu for decoder computation (Not recommended: may cause major slowdowns! Only use when critical!) 
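	# (Sketch of the consistency constraints called out in the WaveNet section above, useful as a quick check:
	#  - mulaw-quantize input       -> out_channels == quantize_channels
	#  - raw / mulaw input          -> out_channels == num_distributions * 3   (10 * 3 with the defaults)
	#  - upsampled local features   -> np.prod(upsample_scales) == hop_size    (16 * 16 == 256)
	#  - local conditioning enabled -> cin_channels == num_mels                (80))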
140 | 141 | tacotron_batch_size = 48,#number of training samples on each training steps 142 | tacotron_reg_weight = 1e-6, #regularization weight (for L2 regularization) 143 | tacotron_scale_regularization = True, #Whether to rescale regularization weight to adapt for outputs range (used when reg_weight is high and biasing the model) 144 | 145 | tacotron_test_size = None, #% of data to keep as test data, if None, tacotron_test_batches must be not None 146 | tacotron_test_batches = 48, #number of test batches (For Ljspeech: 10% ~= 41 batches of 32 samples) 147 | tacotron_data_random_state=1234, #random state for train test split repeatability 148 | 149 | tacotron_decay_learning_rate = True, #boolean, determines if the learning rate will follow an exponential decay 150 | tacotron_start_decay = 50000, #Step at which learning decay starts 151 | tacotron_decay_steps = 40000, #Determines the learning rate decay slope (UNDER TEST) 152 | tacotron_decay_rate = 0.2, #learning rate decay rate (UNDER TEST) 153 | tacotron_initial_learning_rate = 1e-3, #starting learning rate 154 | tacotron_final_learning_rate = 1e-5, #minimal learning rate 155 | 156 | tacotron_adam_beta1 = 0.9, #AdamOptimizer beta1 parameter 157 | tacotron_adam_beta2 = 0.999, #AdamOptimizer beta2 parameter 158 | tacotron_adam_epsilon = 1e-6, #AdamOptimizer beta3 parameter 159 | 160 | tacotron_zoneout_rate = 0.1, #zoneout rate for all LSTM cells in the network 161 | tacotron_dropout_rate = 0.5, #dropout rate for all convolutional layers + prenet 162 | 163 | natural_eval = False, #Whether to use 100% natural eval (to evaluate Curriculum Learning performance) or with same teacher-forcing ratio as in training (just for overfit) 164 | 165 | #Decoder RNN learning can take be done in one of two ways: 166 | # Teacher Forcing: vanilla teacher forcing (usually with ratio = 1). mode='constant' 167 | # Curriculum Learning Scheme: From Teacher-Forcing to sampling from previous outputs is function of global step. (teacher forcing ratio decay) mode='scheduled' 168 | #The second approach is inspired by: 169 | #Bengio et al. 2015: Scheduled Sampling for Sequence Prediction with Recurrent Neural Networks. 170 | #Can be found under: https://arxiv.org/pdf/1506.03099.pdf 171 | tacotron_teacher_forcing_mode = 'constant', #Can be ('constant' or 'scheduled'). 'scheduled' mode applies a cosine teacher forcing ratio decay. (Preference: scheduled) 172 | tacotron_teacher_forcing_ratio = 1., #Value from [0., 1.], 0.=0%, 1.=100%, determines the % of times we force next decoder inputs, Only relevant if mode='constant' 173 | tacotron_teacher_forcing_init_ratio = 1., #initial teacher forcing ratio. Relevant if mode='scheduled' 174 | tacotron_teacher_forcing_final_ratio = 0., #final teacher forcing ratio. Relevant if mode='scheduled' 175 | tacotron_teacher_forcing_start_decay = 10000, #starting point of teacher forcing ratio decay. Relevant if mode='scheduled' 176 | tacotron_teacher_forcing_decay_steps = 280000, #Determines the teacher forcing ratio decay slope. Relevant if mode='scheduled' 177 | tacotron_teacher_forcing_decay_alpha = 0., #teacher forcing ratio decay rate. Relevant if mode='scheduled' 178 | ########################################################################################################################################### 179 | 180 | #Wavenet Training 181 | wavenet_random_seed = 5339, # S=5, E=3, D=9 :) 182 | wavenet_swap_with_cpu = False, #Whether to use cpu as support to gpu for decoder computation (Not recommended: may cause major slowdowns! 
Only use when critical!) 183 | 184 | wavenet_batch_size = 4, #batch size used to train wavenet. 185 | wavenet_test_size = 0.0441, #% of data to keep as test data, if None, wavenet_test_batches must be not None 186 | wavenet_test_batches = None, #number of test batches. 187 | wavenet_data_random_state = 1234, #random state for train test split repeatability 188 | 189 | wavenet_learning_rate = 1e-4, 190 | wavenet_adam_beta1 = 0.9, 191 | wavenet_adam_beta2 = 0.999, 192 | wavenet_adam_epsilon = 1e-6, 193 | 194 | wavenet_ema_decay = 0.9999, #decay rate of exponential moving average 195 | 196 | wavenet_dropout = 0.05, #drop rate of wavenet layers 197 | train_with_GTA = False, #Whether to use GTA mels to train WaveNet instead of ground truth mels. 198 | ########################################################################################################################################### 199 | 200 | #Eval sentences (if no eval file was specified, these sentences are used for eval) 201 | sentences = [ 202 | # From July 8, 2017 New York Times: 203 | 'Scientists at the CERN laboratory say they have discovered a new particle.', 204 | 'There\'s a way to measure the acute emotional intelligence that has never gone out of style.', 205 | 'President Trump met with other leaders at the Group of 20 conference.', 206 | 'The Senate\'s bill to repeal and replace the Affordable Care Act is now imperiled.', 207 | # From Google's Tacotron example page: 208 | 'Generative adversarial network or variational auto-encoder.', 209 | 'Basilar membrane and otolaryngology are not auto-correlations.', 210 | 'He has read the whole thing.', 211 | 'He reads books.', 212 | "Don't desert me here in the desert!", 213 | 'He thought it was time to present the present.', 214 | 'Thisss isrealy awhsome.', 215 | 'Punctuation sensitivity, is working.', 216 | 'Punctuation sensitivity is working.', 217 | "The buses aren't the problem, they actually provide a solution.", 218 | "The buses aren't the PROBLEM, they actually provide a SOLUTION.", 219 | "The quick brown fox jumps over the lazy dog.", 220 | "does the quick brown fox jump over the lazy dog?", 221 | "Peter Piper picked a peck of pickled peppers. How many pickled peppers did Peter Piper pick?", 222 | "She sells sea-shells on the sea-shore. The shells she sells are sea-shells I'm sure.", 223 | "The blue lagoon is a nineteen eighty American romance adventure film.", 224 | "Tajima Airport serves Toyooka.", 225 | 'Talib Kweli confirmed to AllHipHop that he will be releasing an album in the next year.', 226 | #From Training data: 227 | 'the rest being provided with barrack beds, and in dimensions varying from thirty feet by fifteen to fifteen feet by ten.', 228 | 'in giltspur street compter, where he was first lodged.', 229 | 'a man named burnett came with his wife and took up his residence at whitchurch, hampshire, at no great distance from laverstock,', 230 | 'it appears that oswald had only one caller in response to all of his fpcc activities,', 231 | 'he relied on the absence of the strychnia.', 232 | 'scoggins thought it was lighter.', 233 | '''would, it is probable, have eventually overcome the reluctance of some of the prisoners at least, 234 | and would have possessed so much moral dignity''', 235 | '''Sequence to sequence models have enjoyed great success in a variety of tasks such as machine translation, speech recognition, and text summarization. 
236 | This project covers a sequence to sequence model trained to predict a speech representation from an input sequence of characters. We show that 237 | the adopted architecture is able to perform this task with wild success.''', 238 | 'Thank you so much for your support!', 239 | ] 240 | 241 | ) 242 | 243 | def hparams_debug_string(): 244 | values = hparams.values() 245 | hp = [' %s: %s' % (name, values[name]) for name in sorted(values) if name != 'sentences'] 246 | return 'Hyperparameters:\n' + '\n'.join(hp) -------------------------------------------------------------------------------- /wavenet_vocoder/feeder.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from sklearn.model_selection import train_test_split 4 | import time 5 | import threading 6 | import os 7 | from .util import is_scalar_input, is_mulaw_quantize 8 | from infolog import log 9 | from datasets import audio 10 | from keras.utils import np_utils 11 | 12 | _batches_per_group = 32 13 | _pad = 0 14 | 15 | 16 | class Feeder: 17 | """ 18 | Feeds batches of data into queue in a background thread. 19 | """ 20 | def __init__(self, coordinator, metadata_filename, base_dir, hparams): 21 | super(Feeder, self).__init__() 22 | 23 | if hparams.gin_channels > 0: 24 | raise NotImplementedError('Global conditioning preprocessing has not been added yet, it will be out soon. Thanks for your patience!') 25 | 26 | self._coord = coordinator 27 | self._hparams = hparams 28 | self._train_offset = 0 29 | self._test_offset = 0 30 | 31 | #Base directory of the project (to map files from different locations) 32 | self._base_dir = base_dir 33 | 34 | #Load metadata 35 | self._data_dir = os.path.dirname(metadata_filename) 36 | with open(metadata_filename, 'r') as f: 37 | self._metadata = [line.strip().split('|') for line in f] 38 | 39 | #Train test split 40 | if hparams.wavenet_test_size is None: 41 | assert hparams.wavenet_test_batches is not None 42 | 43 | test_size = (hparams.wavenet_test_size if hparams.wavenet_test_size is not None 44 | else hparams.wavenet_test_batches * hparams.wavenet_batch_size) 45 | indices = np.arange(len(self._metadata)) 46 | train_indices, test_indices = train_test_split(indices, 47 | test_size=test_size, random_state=hparams.wavenet_data_random_state) 48 | 49 | #Make sure test size is a multiple of batch size else round up 50 | len_test_indices = _round_up(len(test_indices), hparams.wavenet_batch_size) 51 | extra_test = test_indices[len_test_indices:] 52 | test_indices = test_indices[:len_test_indices] 53 | train_indices = np.concatenate([train_indices, extra_test]) 54 | 55 | self._train_meta = list(np.array(self._metadata)[train_indices]) 56 | self._test_meta = list(np.array(self._metadata)[test_indices]) 57 | 58 | self.test_steps = len(self._test_meta) // hparams.wavenet_batch_size 59 | 60 | if hparams.wavenet_test_size is None: 61 | assert hparams.wavenet_test_batches == self.test_steps 62 | 63 | #Get conditioning status 64 | self.local_condition, self.global_condition = self._check_conditions() 65 | 66 | with tf.device('/cpu:0'): 67 | # Create placeholders for inputs and targets. Don't specify batch size because we want 68 | # to be able to feed different batch sizes at eval time. 
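			# Resulting shapes (channels first on the inputs), as produced by _prepare_batch below:
			#   scalar input (raw / mulaw):      inputs [B, 1, T] float32,                 targets [B, T, 1] float32
			#   one-hot input (mulaw-quantize):  inputs [B, quantize_channels, T] float32, targets [B, T, 1] int32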
69 | if is_scalar_input(hparams.input_type): 70 | input_placeholder = tf.placeholder(tf.float32, shape=(None, 1, None), name='audio_inputs') 71 | target_placeholder = tf.placeholder(tf.float32, shape=(None, None, 1), name='audio_targets') 72 | target_type = tf.float32 73 | else: 74 | input_placeholder = tf.placeholder(tf.float32, shape=(None, hparams.quantize_channels, None), name='audio_inputs') 75 | target_placeholder = tf.placeholder(tf.int32, shape=(None, None, 1), name='audio_targets') 76 | target_type = tf.int32 77 | 78 | self._placeholders = [ 79 | input_placeholder, 80 | target_placeholder, 81 | tf.placeholder(tf.int32, shape=(None, ), name='input_lengths'), 82 | ] 83 | 84 | queue_types = [tf.float32, target_type, tf.int32] 85 | 86 | if self.local_condition: 87 | self._placeholders.append(tf.placeholder(tf.float32, shape=(None, hparams.num_mels, None), name='local_condition_features')) 88 | queue_types.append(tf.float32) 89 | if self.global_condition: 90 | self._placeholders.append(tf.placeholder(tf.int32, shape=(), name='global_condition_features')) 91 | queue_types.append(tf.int32) 92 | 93 | # Create queue for buffering data 94 | queue = tf.FIFOQueue(8, queue_types, name='intput_queue') 95 | self._enqueue_op = queue.enqueue(self._placeholders) 96 | variables = queue.dequeue() 97 | 98 | self.inputs = variables[0] 99 | self.inputs.set_shape(self._placeholders[0].shape) 100 | self.targets = variables[1] 101 | self.targets.set_shape(self._placeholders[1].shape) 102 | self.input_lengths = variables[2] 103 | self.input_lengths.set_shape(self._placeholders[2].shape) 104 | 105 | #If local conditioning disabled override c inputs with None 106 | if hparams.cin_channels < 0: 107 | self.local_condition_features = None 108 | else: 109 | self.local_condition_features = variables[3] 110 | self.local_condition_features.set_shape(self._placeholders[3].shape) 111 | 112 | #If global conditioning disabled override g inputs with None 113 | if hparams.gin_channels < 0: 114 | self.global_condition_features = None 115 | else: 116 | self.global_condition_features = variables[4] 117 | self.global_condition_features.set_shape(self._placeholders[4].shape) 118 | 119 | 120 | # Create queue for buffering eval data 121 | eval_queue = tf.FIFOQueue(1, queue_types, name='eval_queue') 122 | self._eval_enqueue_op = eval_queue.enqueue(self._placeholders) 123 | eval_variables = eval_queue.dequeue() 124 | 125 | self.eval_inputs = eval_variables[0] 126 | self.eval_inputs.set_shape(self._placeholders[0].shape) 127 | self.eval_targets = eval_variables[1] 128 | self.eval_targets.set_shape(self._placeholders[1].shape) 129 | self.eval_input_lengths = eval_variables[2] 130 | self.eval_input_lengths.set_shape(self._placeholders[2].shape) 131 | 132 | #If local conditioning disabled override c inputs with None 133 | if hparams.cin_channels < 0: 134 | self.eval_local_condition_features = None 135 | else: 136 | self.eval_local_condition_features = eval_variables[3] 137 | self.eval_local_condition_features.set_shape(self._placeholders[3].shape) 138 | 139 | #If global conditioning disabled override g inputs with None 140 | if hparams.gin_channels < 0: 141 | self.eval_global_condition_features = None 142 | else: 143 | self.eval_global_condition_features = eval_variables[4] 144 | self.eval_global_condition_features.set_shape(self._placeholders[4].shape) 145 | 146 | 147 | 148 | def start_threads(self, session): 149 | self._session = session 150 | thread = threading.Thread(name='background', target=self._enqueue_next_train_group) 151 | 
thread.daemon = True #Thread will close when parent quits 152 | thread.start() 153 | 154 | thread = threading.Thread(name='background', target=self._enqueue_next_test_group) 155 | thread.daemon = True #Thread will close when parent quits 156 | thread.start() 157 | 158 | def _get_test_groups(self): 159 | meta = self._test_meta[self._test_offset] 160 | self._test_offset += 1 161 | 162 | if self._hparams.train_with_GTA: 163 | mel_file = meta[2] 164 | else: 165 | mel_file = meta[1] 166 | audio_file = meta[0] 167 | 168 | input_data = np.load(os.path.join(self._base_dir, audio_file)) 169 | 170 | if self.local_condition: 171 | local_condition_features = np.load(os.path.join(self._base_dir, mel_file)) 172 | else: 173 | local_condition_features = None 174 | 175 | global_condition_features = None 176 | 177 | return (input_data, local_condition_features, global_condition_features, len(input_data)) 178 | 179 | def make_test_batches(self): 180 | start = time.time() 181 | 182 | #Read one example for evaluation 183 | n = 1 184 | 185 | #Test on entire test set (one sample at an evaluation step) 186 | examples = [self._get_test_groups() for i in range(len(self._test_meta))] 187 | batches = [examples[i: i+n] for i in range(0, len(examples), n)] 188 | np.random.shuffle(batches) 189 | 190 | log('\nGenerated {} test batches of size {} in {:.3f} sec'.format(len(batches), n, time.time() - start)) 191 | return batches 192 | 193 | def _enqueue_next_train_group(self): 194 | while not self._coord.should_stop(): 195 | start = time.time() 196 | 197 | # Read a group of examples 198 | n = self._hparams.wavenet_batch_size 199 | examples = [self._get_next_example() for i in range(n * _batches_per_group)] 200 | 201 | # Bucket examples base on similiar output length for efficiency 202 | examples.sort(key=lambda x: x[-1]) 203 | batches = [examples[i: i+n] for i in range(0, len(examples), n)] 204 | np.random.shuffle(batches) 205 | 206 | log('\nGenerated {} train batches of size {} in {:.3f} sec'.format(len(batches), n, time.time() - start)) 207 | for batch in batches: 208 | feed_dict = dict(zip(self._placeholders, self._prepare_batch(batch))) 209 | self._session.run(self._enqueue_op, feed_dict=feed_dict) 210 | 211 | def _enqueue_next_test_group(self): 212 | test_batches = self.make_test_batches() 213 | while not self._coord.should_stop(): 214 | for batch in test_batches: 215 | feed_dict = dict(zip(self._placeholders, self._prepare_batch(batch))) 216 | self._session.run(self._eval_enqueue_op, feed_dict=feed_dict) 217 | 218 | def _get_next_example(self): 219 | '''Get a single example (input, output, len_output) from disk 220 | ''' 221 | if self._train_offset >= len(self._train_meta): 222 | self._train_offset = 0 223 | np.random.shuffle(self._train_meta) 224 | meta = self._train_meta[self._train_offset] 225 | self._train_offset += 1 226 | 227 | if self._hparams.train_with_GTA: 228 | mel_file = meta[2] 229 | if 'linear' in mel_file: 230 | raise RuntimeError('Linear spectrogram files selected instead of GTA mels, did you specify the wrong metadata?') 231 | else: 232 | mel_file = meta[1] 233 | audio_file = meta[0] 234 | 235 | input_data = np.load(os.path.join(self._base_dir, audio_file)) 236 | 237 | if self.local_condition: 238 | local_condition_features = np.load(os.path.join(self._base_dir, mel_file)) 239 | else: 240 | local_condition_features = None 241 | 242 | global_condition_features = None 243 | 244 | return (input_data, local_condition_features, global_condition_features, len(input_data)) 245 | 246 | 247 | def 
_prepare_batch(self, batch): 248 | np.random.shuffle(batch) 249 | 250 | #Limit time steps to save GPU Memory usage 251 | max_time_steps = self._limit_time() 252 | #Adjust time resolution for upsampling 253 | batch = self._adjust_time_resolution(batch, self.local_condition, max_time_steps) 254 | 255 | #time lengths 256 | input_lengths = [len(x[0]) for x in batch] 257 | max_input_length = max(input_lengths) 258 | 259 | inputs = self._prepare_inputs([x[0] for x in batch], max_input_length) 260 | targets = self._prepare_targets([x[0] for x in batch], max_input_length) 261 | local_condition_features = self._prepare_local_conditions(self.local_condition, [x[1] for x in batch]) 262 | global_condition_features = self._prepare_global_conditions(self.global_condition, [x[2] for x in batch]) 263 | 264 | new_batch = (inputs, targets, input_lengths) 265 | if local_condition_features is not None: 266 | new_batch += (local_condition_features, ) 267 | if global_condition_features is not None: 268 | new_batch += (global_condition_features, ) 269 | 270 | return new_batch 271 | 272 | def _prepare_inputs(self, inputs, maxlen): 273 | if is_mulaw_quantize(self._hparams.input_type): 274 | #[batch_size, time_steps, quantize_channels] 275 | x_batch = np.stack([_pad_inputs(np_utils.to_categorical( 276 | x, num_classes=self._hparams.quantize_channels), maxlen) for x in inputs]).astype(np.float32) 277 | else: 278 | #[batch_size, time_steps, 1] 279 | x_batch = np.stack([_pad_inputs(x.reshape(-1, 1), maxlen) for x in inputs]).astype(np.float32) 280 | assert len(x_batch.shape) == 3 281 | #Convert to channels first [batch_size, quantize_channels (or 1), time_steps] 282 | x_batch = np.transpose(x_batch, (0, 2, 1)) 283 | return x_batch 284 | 285 | def _prepare_targets(self, targets, maxlen): 286 | #[batch_size, time_steps] 287 | if is_mulaw_quantize(self._hparams.input_type): 288 | y_batch = np.stack([_pad_targets(x, maxlen) for x in targets]).astype(np.int32) 289 | else: 290 | y_batch = np.stack([_pad_targets(x, maxlen) for x in targets]).astype(np.float32) 291 | assert len(y_batch.shape) == 2 292 | #Add extra axis (make 3 dimension) 293 | y_batch = np.expand_dims(y_batch, axis=-1) 294 | return y_batch 295 | 296 | def _prepare_local_conditions(self, local_condition, c_features): 297 | if local_condition: 298 | maxlen = max([len(x) for x in c_features]) 299 | c_batch = np.stack([_pad_inputs(x, maxlen) for x in c_features]).astype(np.float32) 300 | assert len(c_batch.shape) == 3 301 | #[batch_size, c_channels, time_steps] 302 | c_batch = np.transpose(c_batch, (0, 2, 1)) 303 | else: 304 | c_batch = None 305 | return c_batch 306 | 307 | def _prepare_global_conditions(self, global_condition, g_features): 308 | if global_condition: 309 | g_batch = g_features 310 | else: 311 | g_batch = None 312 | return g_batch 313 | 314 | def _check_conditions(self): 315 | local_condition = self._hparams.cin_channels > 0 316 | global_condition = self._hparams.gin_channels > 0 317 | return local_condition, global_condition 318 | 319 | def _limit_time(self): 320 | '''Limit time resolution to save GPU memory. 
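		If max_time_sec is set it takes precedence and is converted to a sample count
		(max_time_sec * sample_rate); otherwise max_time_steps is returned as-is, and if both
		are None no limit is applied. With the default hparams this yields 13000 samples
		(roughly 0.6 seconds at 22050 Hz).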
321 | ''' 322 | if self._hparams.max_time_sec is not None: 323 | return int(self._hparams.max_time_sec * self._hparams.sample_rate) 324 | elif self._hparams.max_time_steps is not None: 325 | return self._hparams.max_time_steps 326 | else: 327 | return None 328 | 329 | def _adjust_time_resolution(self, batch, local_condition, max_time_steps): 330 | '''Adjust time resolution between audio and local condition 331 | ''' 332 | if local_condition: 333 | new_batch = [] 334 | for b in batch: 335 | x, c, g, l = b 336 | self._assert_ready_for_upsample(x, c) 337 | if max_time_steps is not None: 338 | max_steps = _ensure_divisible(max_time_steps, audio.get_hop_size(self._hparams), True) 339 | if len(x) > max_time_steps: 340 | max_time_frames = max_steps // audio.get_hop_size(self._hparams) 341 | start = np.random.randint(0, len(c) - max_time_frames) 342 | time_start = start * audio.get_hop_size(self._hparams) 343 | x = x[time_start: time_start + max_time_frames * audio.get_hop_size(self._hparams)] 344 | c = c[start: start + max_time_frames, :] 345 | self._assert_ready_for_upsample(x, c) 346 | 347 | new_batch.append((x, c, g, l)) 348 | return new_batch 349 | else: 350 | new_batch = [] 351 | for b in batch: 352 | x, c, g, l = b 353 | x = audio.trim(x) 354 | if max_time_steps is not None and len(x) > max_time_steps: 355 | start = np.random.randint(0, len(c) - max_time_steps) 356 | x = x[start: start + max_time_steps] 357 | new_batch.append((x, c, g, l)) 358 | return new_batch 359 | 360 | def _assert_ready_for_upsample(self, x, c): 361 | assert len(x) % len(c) == 0 and len(x) // len(c) == audio.get_hop_size(self._hparams) 362 | 363 | 364 | def _pad_inputs(x, maxlen): 365 | return np.pad(x, [(0, maxlen - len(x)), (0, 0)], mode='constant', constant_values=_pad) 366 | 367 | def _pad_targets(x, maxlen): 368 | return np.pad(x, (0, maxlen - len(x)), mode='constant', constant_values=_pad) 369 | 370 | def _round_up(x, multiple): 371 | remainder = x % multiple 372 | return x if remainder == 0 else x + multiple - remainder 373 | 374 | def _ensure_divisible(length, divisible_by=256, lower=True): 375 | if length % divisible_by == 0: 376 | return length 377 | if lower: 378 | return length - length % divisible_by 379 | else: 380 | return length + (divisible_by - length % divisible_by) 381 | -------------------------------------------------------------------------------- /tacotron/models/tacotron.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.contrib.rnn import GRUCell 3 | from tacotron.utils.symbols import symbols 4 | from infolog import log 5 | from .modules import reference_encoder 6 | from tacotron.models.helpers import TacoTrainingHelper, TacoTestHelper 7 | from tacotron.models.modules import * 8 | from tensorflow.contrib.seq2seq import dynamic_decode 9 | from tacotron.models.Architecture_wrappers import TacotronEncoderCell, TacotronDecoderCell 10 | from tacotron.models.custom_decoder import CustomDecoder 11 | from tacotron.models.attention import LocationSensitiveAttention 12 | from .multihead_attention import MultiheadAttention 13 | 14 | 15 | 16 | class Tacotron(): 17 | """Tacotron-2 Feature prediction Model. 
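	Typical call sequence (a minimal sketch of how the model is driven, assuming the input
	tensors are provided by a feeder):
		model = Tacotron(hparams)
		model.initialize(inputs, input_lengths, mel_targets=mel_targets,
			stop_token_targets=stop_token_targets, targets_lengths=targets_lengths,
			global_step=global_step, is_training=True)
		model.add_loss()
		model.add_optimizer(global_step)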
18 | """ 19 | def __init__(self, hparams): 20 | self._hparams = hparams 21 | 22 | 23 | def initialize(self, inputs, input_lengths, mel_targets=None, stop_token_targets=None, linear_targets=None, targets_lengths=None, gta=False, 24 | global_step=None, is_training=False, is_evaluating=False, reference_mel=None): 25 | """ 26 | Initializes the model for inference 27 | 28 | sets "mel_outputs" and "alignments" fields. 29 | 30 | Args: 31 | - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of 32 | steps in the input time series, and values are character IDs 33 | - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths 34 | of each sequence in inputs. 35 | - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number 36 | of steps in the output time series, M is num_mels, and values are entries in the mel 37 | spectrogram. Only needed for training. 38 | """ 39 | if mel_targets is None and stop_token_targets is not None: 40 | raise ValueError('no mel targets were provided but token_targets were given') 41 | if mel_targets is not None and stop_token_targets is None and not gta: 42 | raise ValueError('Mel targets are provided without corresponding token_targets') 43 | if not gta and self._hparams.predict_linear==True and linear_targets is None and is_training: 44 | raise ValueError('Model is set to use post processing to predict linear spectrograms in training but no linear targets given!') 45 | if gta and linear_targets is not None: 46 | raise ValueError('Linear spectrogram prediction is not supported in GTA mode!') 47 | if is_training and self._hparams.mask_decoder and targets_lengths is None: 48 | raise RuntimeError('Model set to mask paddings but no targets lengths provided for the mask!') 49 | if is_training and is_evaluating: 50 | raise RuntimeError('Model can not be in training and evaluation modes at the same time!') 51 | 52 | with tf.variable_scope('inference') as scope: 53 | batch_size = tf.shape(inputs)[0] 54 | hp = self._hparams 55 | assert hp.tacotron_teacher_forcing_mode in ('constant', 'scheduled') 56 | if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training: 57 | assert global_step is not None 58 | 59 | #GTA is only used for predicting mels to train Wavenet vocoder, so we ommit post processing when doing GTA synthesis 60 | post_condition = hp.predict_linear and not gta 61 | 62 | # Embeddings ==> [batch_size, sequence_length, embedding_dim] 63 | embedding_table = tf.get_variable( 64 | 'inputs_embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32) 65 | embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs) 66 | 67 | 68 | if hp.use_gst: 69 | #Global style tokens (GST) 70 | gst_tokens = tf.get_variable('style_tokens', [hp.num_gst, hp.style_embed_depth // hp.num_heads], 71 | dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=0.5)) 72 | self.gst_tokens = gst_tokens 73 | 74 | 75 | #Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units] 76 | encoder_cell = TacotronEncoderCell( 77 | EncoderConvolutions(is_training, hparams=hp, scope='encoder_convolutions'), 78 | EncoderRNN(is_training, size=hp.encoder_lstm_units, 79 | zoneout=hp.tacotron_zoneout_rate, scope='encoder_LSTM')) 80 | 81 | encoder_outputs = encoder_cell(embedded_inputs, input_lengths) 82 | 83 | #For shape visualization purpose 84 | enc_conv_output_shape = encoder_cell.conv_output_shape 85 | 86 | if is_training: 87 | reference_mel = mel_targets 88 | 89 | if reference_mel is not 
None: 90 | # Reference encoder 91 | refnet_outputs = reference_encoder( 92 | reference_mel, 93 | filters=hp.reference_filters, 94 | kernel_size=(3,3), 95 | strides=(2,2), 96 | encoder_cell=GRUCell(hp.reference_depth), 97 | is_training=is_training) # [N, 128] 98 | self.refnet_outputs = refnet_outputs 99 | 100 | if hp.use_gst: 101 | # Style attention 102 | style_attention = MultiheadAttention( 103 | tf.expand_dims(refnet_outputs, axis=1), # [N, 1, 128] 104 | tf.tanh(tf.tile(tf.expand_dims(gst_tokens, axis=0), [batch_size,1,1])), # [N, hp.num_gst, 256/hp.num_heads] 105 | num_heads=hp.num_heads, 106 | num_units=hp.style_att_dim, 107 | attention_type=hp.style_att_type) 108 | 109 | style_embeddings = style_attention.multi_head_attention() 110 | else: 111 | style_embeddings = tf.expand_dims(refnet_outputs, axis=1) # [N, 1, 128] 112 | else: 113 | print("Use random weight for GST.") 114 | random_weights = tf.random_uniform([hp.num_heads, hp.num_gst], maxval=1.0, dtype=tf.float32) 115 | random_weights = tf.nn.softmax(random_weights, name="random_weights") 116 | style_embeddings = tf.matmul(random_weights, tf.nn.tanh(gst_tokens)) 117 | style_embeddings = tf.reshape(style_embeddings, [1, 1] + [hp.num_heads * gst_tokens.get_shape().as_list()[1]]) 118 | 119 | 120 | #Extend style embeddings to be compatible with encoder_outputs. 121 | #Make encoder_output's dimensions by concatenating style embeddings with a vector of all zeroes. 122 | #Preserves effect of both style and encoder_outputs. 123 | neg = tf.add(style_embeddings, tf.negative(style_embeddings)) 124 | style_embeddings = tf.concat([style_embeddings, neg], axis=-1) 125 | 126 | 127 | # Add style embedding to every text encoder state 128 | style_embeddings = tf.tile(style_embeddings, [1, shape_list(encoder_outputs)[1], 1]) # [N, T_in, 128] 129 | encoder_outputs = tf.add(encoder_outputs, style_embeddings) 130 | 131 | #Decoder Parts 132 | #Attention Decoder Prenet 133 | prenet = Prenet(is_training, layers_sizes=hp.prenet_layers, drop_rate=hp.tacotron_dropout_rate, scope='decoder_prenet') 134 | #Attention Mechanism 135 | attention_mechanism = LocationSensitiveAttention(hp.attention_dim, encoder_outputs, hparams=hp, 136 | mask_encoder=hp.mask_encoder, memory_sequence_length=input_lengths, smoothing=hp.smoothing, 137 | cumulate_weights=hp.cumulative_weights) 138 | #Decoder LSTM Cells 139 | 140 | decoder_lstm = DecoderRNN(is_training, layers=hp.decoder_layers, 141 | size=hp.decoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope='decoder_lstm') 142 | #Frames Projection layer 143 | frame_projection = FrameProjection(hp.num_mels * hp.outputs_per_step, scope='linear_transform') 144 | # projection layer 145 | stop_projection = StopProjection(is_training or is_evaluating, shape=hp.outputs_per_step, scope='stop_token_projection') 146 | 147 | 148 | #Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding) 149 | decoder_cell = TacotronDecoderCell( 150 | prenet, 151 | attention_mechanism, 152 | decoder_lstm, 153 | frame_projection, 154 | stop_projection) 155 | #Define the helper for our decoder 156 | if is_training or is_evaluating or gta: 157 | self.helper = TacoTrainingHelper(batch_size, mel_targets, stop_token_targets, hp, gta, is_evaluating, global_step) 158 | else: 159 | self.helper = TacoTestHelper(batch_size, hp) 160 | 161 | 162 | #initial decoder state 163 | decoder_init_state = decoder_cell.zero_state(batch_size=batch_size, dtype=tf.float32) 164 | 165 | #Only use max iterations at synthesis time 166 | max_iters = hp.max_iters if not 
(is_training or is_evaluating) else None 167 | 168 | #Decode 169 | (frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode( 170 | CustomDecoder(decoder_cell, self.helper, decoder_init_state), 171 | impute_finished=False, 172 | maximum_iterations=max_iters, 173 | swap_memory=hp.tacotron_swap_with_cpu) 174 | 175 | 176 | # Reshape outputs to be one output per entry 177 | #==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels] 178 | decoder_output = tf.reshape(frames_prediction, [batch_size, -1, hp.num_mels]) 179 | stop_token_prediction = tf.reshape(stop_token_prediction, [batch_size, -1]) 180 | 181 | 182 | #Postnet 183 | postnet = Postnet(is_training, hparams=hp, scope='postnet_convolutions') 184 | 185 | #Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels] 186 | residual = postnet(decoder_output) 187 | 188 | #Project residual to same dimension as mel spectrogram 189 | #==> [batch_size, decoder_steps * r, num_mels] 190 | residual_projection = FrameProjection(hp.num_mels, scope='postnet_projection') 191 | projected_residual = residual_projection(residual) 192 | 193 | 194 | #Compute the mel spectrogram 195 | mel_outputs = decoder_output + projected_residual 196 | 197 | 198 | if post_condition: 199 | #Based on https://github.com/keithito/tacotron/blob/tacotron2-work-in-progress/models/tacotron.py 200 | #Post-processing Network to map mels to linear spectrograms using same architecture as the encoder 201 | post_processing_cell = TacotronEncoderCell( 202 | EncoderConvolutions(is_training, hparams=hp, scope='post_processing_convolutions'), 203 | EncoderRNN(is_training, size=hp.encoder_lstm_units, 204 | zoneout=hp.tacotron_zoneout_rate, scope='post_processing_LSTM')) 205 | 206 | expand_outputs = post_processing_cell(mel_outputs) 207 | linear_outputs = FrameProjection(hp.num_freq, scope='post_processing_projection')(expand_outputs) 208 | 209 | #Grab alignments from the final decoder state 210 | alignments = tf.transpose(final_decoder_state.alignment_history.stack(), [1, 2, 0]) 211 | 212 | if is_training: 213 | self.ratio = self.helper._ratio 214 | self.inputs = inputs 215 | self.input_lengths = input_lengths 216 | self.decoder_output = decoder_output 217 | self.alignments = alignments 218 | self.style_embeddings = style_embeddings 219 | self.stop_token_prediction = stop_token_prediction 220 | self.stop_token_targets = stop_token_targets 221 | self.mel_outputs = mel_outputs 222 | if post_condition: 223 | self.linear_outputs = linear_outputs 224 | self.linear_targets = linear_targets 225 | self.mel_targets = mel_targets 226 | self.targets_lengths = targets_lengths 227 | log('Initialized Tacotron model. Dimensions (? 
= dynamic shape): ') 228 | log(' Train mode: {}'.format(is_training)) 229 | log(' Eval mode: {}'.format(is_evaluating)) 230 | log(' GTA mode: {}'.format(gta)) 231 | log(' Synthesis mode: {}'.format(not (is_training or is_evaluating))) 232 | log(' embedding: {}'.format(embedded_inputs.shape)) 233 | log(' enc conv out: {}'.format(enc_conv_output_shape)) 234 | log(' encoder out: {}'.format(encoder_outputs.shape)) 235 | log(' decoder out: {}'.format(decoder_output.shape)) 236 | log(' residual out: {}'.format(residual.shape)) 237 | log(' projected residual out: {}'.format(projected_residual.shape)) 238 | log(' style embedding: %d' % style_embeddings.shape[-1]) 239 | log(' mel out: {}'.format(mel_outputs.shape)) 240 | if post_condition: 241 | log(' linear out: {}'.format(linear_outputs.shape)) 242 | log(' out: {}'.format(stop_token_prediction.shape)) 243 | 244 | 245 | def add_loss(self): 246 | '''Adds loss to the model. Sets "loss" field. initialize must have been called.''' 247 | with tf.variable_scope('loss') as scope: 248 | hp = self._hparams 249 | 250 | if hp.mask_decoder: 251 | # Compute loss of predictions before postnet 252 | before = MaskedMSE(self.mel_targets, self.decoder_output, self.targets_lengths, 253 | hparams=self._hparams) 254 | # Compute loss after postnet 255 | after = MaskedMSE(self.mel_targets, self.mel_outputs, self.targets_lengths, 256 | hparams=self._hparams) 257 | #Compute loss (for learning dynamic generation stop) 258 | stop_token_loss = MaskedSigmoidCrossEntropy(self.stop_token_targets, 259 | self.stop_token_prediction, self.targets_lengths, hparams=self._hparams) 260 | else: 261 | # Compute loss of predictions before postnet 262 | before = tf.losses.mean_squared_error(self.mel_targets, self.decoder_output) 263 | # Compute loss after postnet 264 | after = tf.losses.mean_squared_error(self.mel_targets, self.mel_outputs) 265 | #Compute loss (for learning dynamic generation stop) 266 | stop_token_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits( 267 | labels=self.stop_token_targets, 268 | logits=self.stop_token_prediction)) 269 | 270 | if hp.predict_linear: 271 | #Compute linear loss 272 | #From https://github.com/keithito/tacotron/blob/tacotron2-work-in-progress/models/tacotron.py 273 | #Prioritize loss for frequencies under 2000 Hz. 274 | l1 = tf.abs(self.linear_targets - self.linear_outputs) 275 | n_priority_freq = int(2000 / (hp.sample_rate * 0.5) * hp.num_mels) 276 | linear_loss = 0.5 * tf.reduce_mean(l1) + 0.5 * tf.reduce_mean(l1[:,:,0:n_priority_freq]) 277 | else: 278 | linear_loss = 0. 279 | 280 | # Compute the regularization weight 281 | if hp.tacotron_scale_regularization: 282 | reg_weight_scaler = 1. / (2 * hp.max_abs_value) if hp.symmetric_mels else 1. / (hp.max_abs_value) 283 | reg_weight = hp.tacotron_reg_weight * reg_weight_scaler 284 | else: 285 | reg_weight = hp.tacotron_reg_weight 286 | 287 | # Get all trainable variables 288 | all_vars = tf.trainable_variables() 289 | regularization = tf.add_n([tf.nn.l2_loss(v) for v in all_vars 290 | if not('bias' in v.name or 'Bias' in v.name)]) * reg_weight 291 | 292 | # Compute final loss term 293 | self.before_loss = before 294 | self.after_loss = after 295 | self.stop_token_loss = stop_token_loss 296 | self.regularization_loss = regularization 297 | self.linear_loss = linear_loss 298 | 299 | self.loss = self.before_loss + self.after_loss + self.stop_token_loss + self.regularization_loss + self.linear_loss 300 | 301 | def add_optimizer(self, global_step): 302 | '''Adds optimizer. 
Sets "gradients" and "optimize" fields. add_loss must have been called. 303 | 304 | Args: 305 | global_step: int32 scalar Tensor representing current global step in training 306 | ''' 307 | with tf.variable_scope('optimizer') as scope: 308 | hp = self._hparams 309 | if hp.tacotron_decay_learning_rate: 310 | self.decay_steps = hp.tacotron_decay_steps 311 | self.decay_rate = hp.tacotron_decay_rate 312 | self.learning_rate = self._learning_rate_decay(hp.tacotron_initial_learning_rate, global_step) 313 | else: 314 | self.learning_rate = tf.convert_to_tensor(hp.tacotron_initial_learning_rate) 315 | 316 | optimizer = tf.train.AdamOptimizer(self.learning_rate, hp.tacotron_adam_beta1, 317 | hp.tacotron_adam_beta2, hp.tacotron_adam_epsilon) 318 | gradients, variables = zip(*optimizer.compute_gradients(self.loss)) 319 | self.gradients = gradients 320 | #Just for causion 321 | #https://github.com/Rayhane-mamah/Tacotron-2/issues/11 322 | clipped_gradients, _ = tf.clip_by_global_norm(gradients, 1.) 323 | 324 | # Add dependency on UPDATE_OPS; otherwise batchnorm won't work correctly. See: 325 | # https://github.com/tensorflow/tensorflow/issues/1122 326 | with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)): 327 | self.optimize = optimizer.apply_gradients(zip(clipped_gradients, variables), 328 | global_step=global_step) 329 | 330 | def _learning_rate_decay(self, init_lr, global_step): 331 | ################################################################# 332 | # Narrow Exponential Decay: 333 | 334 | # Phase 1: lr = 1e-3 335 | # We only start learning rate decay after 50k steps 336 | 337 | # Phase 2: lr in ]1e-5, 1e-3[ 338 | # decay reach minimal value at step 310k 339 | 340 | # Phase 3: lr = 1e-5 341 | # clip by minimal learning rate value (step > 310k) 342 | ################################################################# 343 | hp = self._hparams 344 | 345 | #Compute natural exponential decay 346 | lr = tf.train.exponential_decay(init_lr, 347 | global_step - hp.tacotron_start_decay, #lr = 1e-3 at step 50k 348 | self.decay_steps, 349 | self.decay_rate, #lr = 1e-5 around step 310k 350 | name='lr_exponential_decay') 351 | 352 | 353 | #clip learning rate by max and min values (initial and final values) 354 | return tf.minimum(tf.maximum(lr, hp.tacotron_final_learning_rate), init_lr) 355 | --------------------------------------------------------------------------------