├── tests ├── __init__.py ├── cmudict_test.py ├── text_test.py └── numbers_test.py ├── datasets ├── __init__.py ├── blizzard.py ├── ljspeech.py ├── thchs30.py └── datafeeder.py ├── util ├── test_fun.py ├── __pycache__ │ ├── __init__.cpython-36.pyc │ └── txt2pinyin.cpython-36.pyc ├── __init__.py ├── plot.py ├── infolog.py ├── audio.py └── txt2pinyin.py ├── example └── TTS.mp3 ├── models ├── __init__.py ├── modules.py ├── helpers.py ├── custom_decoder.py ├── tacotron.py ├── attention.py └── rnn_wrappers.py ├── requirements.txt ├── text ├── symbols.py ├── cmudict.py ├── numbers.py ├── __init__.py └── cleaners.py ├── LICENSE ├── hparams.py ├── synthesizer.py ├── preprocess.py ├── TRAINING_DATA.md ├── eval.py ├── demo_server.py ├── train.py └── README.md /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /datasets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /util/test_fun.py: -------------------------------------------------------------------------------- 1 | import os 2 | print(os.path.expanduser('.')) -------------------------------------------------------------------------------- /example/TTS.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/X-CCS/mandarin_tacotron_GL/HEAD/example/TTS.mp3 -------------------------------------------------------------------------------- /util/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/X-CCS/mandarin_tacotron_GL/HEAD/util/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /util/__pycache__/txt2pinyin.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/X-CCS/mandarin_tacotron_GL/HEAD/util/__pycache__/txt2pinyin.cpython-36.pyc -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- 1 | from .tacotron import Tacotron 2 | 3 | 4 | def create_model(name, hparams): 5 | if name == 'tacotron': 6 | return Tacotron(hparams) 7 | else: 8 | raise Exception('Unknown model: ' + name) 9 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # Note: this doesn't include tensorflow or tensorflow-gpu because the package you need to install 2 | # depends on your platform. It is assumed you have already installed tensorflow. 
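# For example (hedged note: the exact version below is an assumption, not pinned by this repo —
# any TensorFlow 1.x release that still ships tf.contrib should work with this code):
#   pip install tensorflow-gpu==1.8.0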
3 | falcon==1.2.0 4 | inflect==0.2.5 5 | librosa==0.5.1 6 | matplotlib==2.0.2 7 | numpy==1.14.3 8 | scipy==0.19.0 9 | tqdm==4.11.2 10 | Unidecode==0.4.20 11 | -------------------------------------------------------------------------------- /util/__init__.py: -------------------------------------------------------------------------------- 1 | class ValueWindow(): 2 | def __init__(self, window_size=100): 3 | self._window_size = window_size 4 | self._values = [] 5 | 6 | def append(self, x): 7 | self._values = self._values[-(self._window_size - 1):] + [x] 8 | 9 | @property 10 | def sum(self): 11 | return sum(self._values) 12 | 13 | @property 14 | def count(self): 15 | return len(self._values) 16 | 17 | @property 18 | def average(self): 19 | return self.sum / max(1, self.count) 20 | 21 | def reset(self): 22 | self._values = [] 23 | -------------------------------------------------------------------------------- /util/plot.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | matplotlib.use('Agg') 3 | import matplotlib.pyplot as plt 4 | 5 | 6 | def plot_alignment(alignment, path, info=None): 7 | fig, ax = plt.subplots() 8 | im = ax.imshow( 9 | alignment, 10 | aspect='auto', 11 | origin='lower', 12 | interpolation='none') 13 | fig.colorbar(im, ax=ax) 14 | xlabel = 'Decoder timestep' 15 | if info is not None: 16 | xlabel += '\n\n' + info 17 | plt.xlabel(xlabel) 18 | plt.ylabel('Encoder timestep') 19 | plt.tight_layout() 20 | plt.savefig(path, format='png') 21 | -------------------------------------------------------------------------------- /text/symbols.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Defines the set of symbols used in text input to the model. 3 | 4 | The default is a set of ASCII characters that works well for English or text that has been run 5 | through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. 6 | ''' 7 | import os 8 | from text import cmudict 9 | 10 | _pad = '_' 11 | _eos = '~' 12 | _characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890!\'(),-.:;? ' 13 | 14 | # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): 15 | # _arpabet = ['@' + s for s in cmudict.valid_symbols] 16 | 17 | # Export all symbols: 18 | # symbols = [_pad, _eos] + list(_characters) + _arpabet 19 | symbols = [_pad, _eos] + list(_characters)# + _arpabet 20 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017 Keith Ito 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /tests/cmudict_test.py: -------------------------------------------------------------------------------- 1 | import io 2 | from text import cmudict 3 | 4 | 5 | test_data = ''' 6 | ;;; # CMUdict -- Major Version: 0.07 7 | )PAREN P ER EH N 8 | 'TIS T IH Z 9 | ADVERSE AE0 D V ER1 S 10 | ADVERSE(1) AE1 D V ER2 S 11 | ADVERSE(2) AE2 D V ER1 S 12 | ADVERSELY AE0 D V ER1 S L IY0 13 | ADVERSITY AE0 D V ER1 S IH0 T IY2 14 | BARBERSHOP B AA1 R B ER0 SH AA2 P 15 | YOU'LL Y UW1 L 16 | ''' 17 | 18 | 19 | def test_cmudict(): 20 | c = cmudict.CMUDict(io.StringIO(test_data)) 21 | assert len(c) == 6 22 | assert len(cmudict.valid_symbols) == 84 23 | assert c.lookup('ADVERSITY') == ['AE0 D V ER1 S IH0 T IY2'] 24 | assert c.lookup('BarberShop') == ['B AA1 R B ER0 SH AA2 P'] 25 | assert c.lookup("You'll") == ['Y UW1 L'] 26 | assert c.lookup("'tis") == ['T IH Z'] 27 | assert c.lookup('adverse') == [ 28 | 'AE0 D V ER1 S', 29 | 'AE1 D V ER2 S', 30 | 'AE2 D V ER1 S', 31 | ] 32 | assert c.lookup('') == None 33 | assert c.lookup('foo') == None 34 | assert c.lookup(')paren') == None 35 | 36 | 37 | def test_cmudict_no_keep_ambiguous(): 38 | c = cmudict.CMUDict(io.StringIO(test_data), keep_ambiguous=False) 39 | assert len(c) == 5 40 | assert c.lookup('adversity') == ['AE0 D V ER1 S IH0 T IY2'] 41 | assert c.lookup('adverse') == None 42 | -------------------------------------------------------------------------------- /util/infolog.py: -------------------------------------------------------------------------------- 1 | import atexit 2 | from datetime import datetime 3 | import json 4 | from threading import Thread 5 | from urllib.request import Request, urlopen 6 | 7 | 8 | _format = '%Y-%m-%d %H:%M:%S.%f' 9 | _file = None 10 | _run_name = None 11 | _slack_url = None 12 | 13 | 14 | def init(filename, run_name, slack_url=None): 15 | global _file, _run_name, _slack_url 16 | _close_logfile() 17 | _file = open(filename, 'a') 18 | _file.write('\n-----------------------------------------------------------------\n') 19 | _file.write('Starting new training run\n') 20 | _file.write('-----------------------------------------------------------------\n') 21 | _run_name = run_name 22 | _slack_url = slack_url 23 | 24 | 25 | def log(msg, slack=False): 26 | print(msg) 27 | if _file is not None: 28 | _file.write('[%s] %s\n' % (datetime.now().strftime(_format)[:-3], msg)) 29 | if slack and _slack_url is not None: 30 | Thread(target=_send_slack, args=(msg,)).start() 31 | 32 | 33 | def _close_logfile(): 34 | global _file 35 | if _file is not None: 36 | _file.close() 37 | _file = None 38 | 39 | 40 | def _send_slack(msg): 41 | req = Request(_slack_url) 42 | req.add_header('Content-Type', 'application/json') 43 | urlopen(req, json.dumps({ 44 | 'username': 'tacotron', 45 | 'icon_emoji': ':taco:', 46 | 'text': '*%s*: %s' % (_run_name, msg) 47 | }).encode()) 48 | 49 | 50 | atexit.register(_close_logfile) 51 | -------------------------------------------------------------------------------- /hparams.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | # Default hyperparameters: 5 | hparams = tf.contrib.training.HParams( 
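  # Note: these defaults can be overridden at run time instead of editing this file — eval.py and
  # demo_server.py expose a --hparams flag and call hparams.parse() on a comma-separated list of
  # name=value pairs, e.g. --hparams="outputs_per_step=2,batch_size=32" (example values only).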
6 | # Comma-separated list of cleaners to run on text prior to training and eval. For non-English 7 | # text, you may want to use "basic_cleaners" or "transliteration_cleaners" See TRAINING_DATA.md. 8 | cleaners='basic_cleaners', 9 | 10 | # Audio: 11 | num_mels=80, 12 | num_freq=2049, 13 | sample_rate=48000, 14 | frame_length_ms=50, 15 | frame_shift_ms=12.5, 16 | preemphasis=0.97, 17 | min_level_db=-100, 18 | ref_level_db=20, 19 | max_frame_num=1000, 20 | max_abs_value = 4, 21 | fmin = 125, # for male, set 55 22 | fmax = 7600, # for male, set 3600 23 | 24 | # Model: 25 | outputs_per_step=5, 26 | embed_depth=512, 27 | prenet_depths=[256, 256], 28 | encoder_depth=256, 29 | postnet_depth=512, 30 | attention_depth=128, 31 | decoder_depth=1024, 32 | 33 | # Training: 34 | batch_size=64, 35 | adam_beta1=0.9, 36 | adam_beta2=0.999, 37 | initial_learning_rate=0.001, 38 | decay_learning_rate=True, 39 | use_cmudict=False, # Use CMUDict during training to learn pronunciation of ARPAbet phonemes 40 | 41 | # Eval: 42 | max_iters=300, 43 | griffin_lim_iters=60, 44 | power=1.2, # Power to raise magnitudes to prior to Griffin-Lim 45 | ) 46 | 47 | 48 | def hparams_debug_string(): 49 | values = hparams.values() 50 | hp = [' %s: %s' % (name, values[name]) for name in sorted(values)] 51 | return 'Hyperparameters:\n' + '\n'.join(hp) 52 | -------------------------------------------------------------------------------- /synthesizer.py: -------------------------------------------------------------------------------- 1 | import io 2 | import numpy as np 3 | import tensorflow as tf 4 | from hparams import hparams 5 | from librosa import effects 6 | from models import create_model 7 | from text import text_to_sequence 8 | from util import audio 9 | 10 | 11 | class Synthesizer: 12 | def load(self, checkpoint_path, model_name='tacotron'): 13 | print('Constructing model: %s' % model_name) 14 | inputs = tf.placeholder(tf.int32, [1, None], 'inputs') 15 | input_lengths = tf.placeholder(tf.int32, [1], 'input_lengths') 16 | with tf.variable_scope('model') as scope: 17 | self.model = create_model(model_name, hparams) 18 | self.model.initialize(inputs, input_lengths) 19 | self.wav_output = audio.inv_spectrogram_tensorflow(self.model.linear_outputs[0]) 20 | 21 | print('Loading checkpoint: %s' % checkpoint_path) 22 | self.session = tf.Session() 23 | self.session.run(tf.global_variables_initializer()) 24 | saver = tf.train.Saver() 25 | saver.restore(self.session, checkpoint_path) 26 | 27 | 28 | def synthesize(self, text): 29 | cleaner_names = [x.strip() for x in hparams.cleaners.split(',')] 30 | seq = text_to_sequence(text, cleaner_names) 31 | feed_dict = { 32 | self.model.inputs: [np.asarray(seq, dtype=np.int32)], 33 | self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32) 34 | } 35 | wav = self.session.run(self.wav_output, feed_dict=feed_dict) 36 | wav = audio.inv_preemphasis(wav) 37 | out = io.BytesIO() 38 | audio.save_wav(wav, out) 39 | return out.getvalue() 40 | -------------------------------------------------------------------------------- /text/cmudict.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | valid_symbols = [ 5 | 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2', 6 | 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2', 7 | 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY', 8 | 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 
'IY1', 9 | 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 10 | 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 11 | 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH' 12 | ] 13 | 14 | _valid_symbol_set = set(valid_symbols) 15 | 16 | 17 | class CMUDict: 18 | '''Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict''' 19 | def __init__(self, file_or_path, keep_ambiguous=True): 20 | if isinstance(file_or_path, str): 21 | with open(file_or_path, encoding='latin-1') as f: 22 | entries = _parse_cmudict(f) 23 | else: 24 | entries = _parse_cmudict(file_or_path) 25 | if not keep_ambiguous: 26 | entries = {word: pron for word, pron in entries.items() if len(pron) == 1} 27 | self._entries = entries 28 | 29 | 30 | def __len__(self): 31 | return len(self._entries) 32 | 33 | 34 | def lookup(self, word): 35 | '''Returns list of ARPAbet pronunciations of the given word.''' 36 | return self._entries.get(word.upper()) 37 | 38 | 39 | 40 | _alt_re = re.compile(r'\([0-9]+\)') 41 | 42 | 43 | def _parse_cmudict(file): 44 | cmudict = {} 45 | for line in file: 46 | if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"): 47 | parts = line.split(' ') 48 | word = re.sub(_alt_re, '', parts[0]) 49 | pronunciation = _get_pronunciation(parts[1]) 50 | if pronunciation: 51 | if word in cmudict: 52 | cmudict[word].append(pronunciation) 53 | else: 54 | cmudict[word] = [pronunciation] 55 | return cmudict 56 | 57 | 58 | def _get_pronunciation(s): 59 | parts = s.strip().split(' ') 60 | for part in parts: 61 | if part not in _valid_symbol_set: 62 | return None 63 | return ' '.join(parts) 64 | -------------------------------------------------------------------------------- /tests/text_test.py: -------------------------------------------------------------------------------- 1 | from text import cleaners, symbols, text_to_sequence, sequence_to_text 2 | from unidecode import unidecode 3 | 4 | 5 | def test_symbols(): 6 | assert len(symbols) >= 3 7 | assert symbols[0] == '_' 8 | assert symbols[1] == '~' 9 | 10 | 11 | def test_text_to_sequence(): 12 | assert text_to_sequence('', []) == [1] 13 | assert text_to_sequence('Hi!', []) == [9, 36, 54, 1] 14 | assert text_to_sequence('"A"_B', []) == [2, 3, 1] 15 | assert text_to_sequence('A {AW1 S} B', []) == [2, 64, 83, 132, 64, 3, 1] 16 | assert text_to_sequence('Hi', ['lowercase']) == [35, 36, 1] 17 | assert text_to_sequence('A {AW1 S} B', ['english_cleaners']) == [28, 64, 83, 132, 64, 29, 1] 18 | 19 | 20 | def test_sequence_to_text(): 21 | assert sequence_to_text([]) == '' 22 | assert sequence_to_text([1]) == '~' 23 | assert sequence_to_text([9, 36, 54, 1]) == 'Hi!~' 24 | assert sequence_to_text([2, 64, 83, 132, 64, 3]) == 'A {AW1 S} B' 25 | 26 | 27 | def test_collapse_whitespace(): 28 | assert cleaners.collapse_whitespace('') == '' 29 | assert cleaners.collapse_whitespace(' ') == ' ' 30 | assert cleaners.collapse_whitespace('x') == 'x' 31 | assert cleaners.collapse_whitespace(' x. y, \tz') == ' x. y, z' 32 | 33 | 34 | def test_convert_to_ascii(): 35 | assert cleaners.convert_to_ascii("raison d'être") == "raison d'etre" 36 | assert cleaners.convert_to_ascii('grüß gott') == 'gruss gott' 37 | assert cleaners.convert_to_ascii('안녕') == 'annyeong' 38 | assert cleaners.convert_to_ascii('Здравствуйте') == 'Zdravstvuite' 39 | 40 | 41 | def test_lowercase(): 42 | assert cleaners.lowercase('Happy Birthday!') == 'happy birthday!' 
43 | assert cleaners.lowercase('CAFÉ') == 'café' 44 | 45 | 46 | def test_expand_abbreviations(): 47 | assert cleaners.expand_abbreviations('mr. and mrs. smith') == 'mister and misess smith' 48 | 49 | 50 | def test_expand_numbers(): 51 | assert cleaners.expand_numbers('3 apples and 44 pears') == 'three apples and forty-four pears' 52 | assert cleaners.expand_numbers('$3.50 for gas.') == 'three dollars, fifty cents for gas.' 53 | 54 | 55 | def test_cleaner_pipelines(): 56 | text = 'Mr. Müller ate 2 Apples' 57 | assert cleaners.english_cleaners(text) == 'mister muller ate two apples' 58 | assert cleaners.transliteration_cleaners(text) == 'mr. muller ate 2 apples' 59 | assert cleaners.basic_cleaners(text) == 'mr. müller ate 2 apples' 60 | 61 | -------------------------------------------------------------------------------- /text/numbers.py: -------------------------------------------------------------------------------- 1 | import inflect 2 | import re 3 | 4 | 5 | _inflect = inflect.engine() 6 | _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') 7 | _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') 8 | _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') 9 | _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') 10 | _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') 11 | _number_re = re.compile(r'[0-9]+') 12 | 13 | 14 | def _remove_commas(m): 15 | return m.group(1).replace(',', '') 16 | 17 | 18 | def _expand_decimal_point(m): 19 | return m.group(1).replace('.', ' point ') 20 | 21 | 22 | def _expand_dollars(m): 23 | match = m.group(1) 24 | parts = match.split('.') 25 | if len(parts) > 2: 26 | return match + ' dollars' # Unexpected format 27 | dollars = int(parts[0]) if parts[0] else 0 28 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 29 | if dollars and cents: 30 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 31 | cent_unit = 'cent' if cents == 1 else 'cents' 32 | return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) 33 | elif dollars: 34 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 35 | return '%s %s' % (dollars, dollar_unit) 36 | elif cents: 37 | cent_unit = 'cent' if cents == 1 else 'cents' 38 | return '%s %s' % (cents, cent_unit) 39 | else: 40 | return 'zero dollars' 41 | 42 | 43 | def _expand_ordinal(m): 44 | return _inflect.number_to_words(m.group(0)) 45 | 46 | 47 | def _expand_number(m): 48 | num = int(m.group(0)) 49 | if num > 1000 and num < 3000: 50 | if num == 2000: 51 | return 'two thousand' 52 | elif num > 2000 and num < 2010: 53 | return 'two thousand ' + _inflect.number_to_words(num % 100) 54 | elif num % 100 == 0: 55 | return _inflect.number_to_words(num // 100) + ' hundred' 56 | else: 57 | return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') 58 | else: 59 | return _inflect.number_to_words(num, andword='') 60 | 61 | 62 | def normalize_numbers(text): 63 | text = re.sub(_comma_number_re, _remove_commas, text) 64 | text = re.sub(_pounds_re, r'\1 pounds', text) 65 | text = re.sub(_dollars_re, _expand_dollars, text) 66 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 67 | text = re.sub(_ordinal_re, _expand_ordinal, text) 68 | text = re.sub(_number_re, _expand_number, text) 69 | return text 70 | -------------------------------------------------------------------------------- /preprocess.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from multiprocessing import cpu_count 4 | from tqdm import tqdm 5 | from datasets import 
blizzard, ljspeech, thchs30 6 | from hparams import hparams 7 | 8 | 9 | def preprocess_blizzard(args): 10 | in_dir = os.path.join(args.base_dir, 'Blizzard2012') 11 | out_dir = os.path.join(args.base_dir, args.output) 12 | os.makedirs(out_dir, exist_ok=True) 13 | metadata = blizzard.build_from_path(in_dir, out_dir, args.num_workers, tqdm=tqdm) 14 | write_metadata(metadata, out_dir) 15 | 16 | 17 | def preprocess_ljspeech(args): 18 | in_dir = os.path.join(args.base_dir, 'LJSpeech-1.1') 19 | out_dir = os.path.join(args.base_dir, args.output) 20 | os.makedirs(out_dir, exist_ok=True) 21 | metadata = ljspeech.build_from_path(in_dir, out_dir, args.num_workers, tqdm=tqdm) 22 | write_metadata(metadata, out_dir) 23 | 24 | 25 | def preprocess_thchs30(args): 26 | in_dir = os.path.join(args.base_dir, 'data_thchs30') 27 | out_dir = os.path.join(args.base_dir, args.output) 28 | os.makedirs(out_dir, exist_ok=True) 29 | metadata = thchs30.build_from_path(in_dir, out_dir, args.num_workers, tqdm=tqdm) 30 | write_metadata(metadata, out_dir) 31 | 32 | 33 | def write_metadata(metadata, out_dir): 34 | with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f: 35 | for m in metadata: 36 | f.write('|'.join([str(x) for x in m]) + '\n') 37 | frames = sum([m[2] for m in metadata]) 38 | hours = frames * hparams.frame_shift_ms / (3600 * 1000) 39 | print('Wrote %d utterances, %d frames (%.2f hours)' % (len(metadata), frames, hours)) 40 | print('Max input length: %d' % max(len(m[3]) for m in metadata)) 41 | print('Max output length: %d' % max(m[2] for m in metadata)) 42 | 43 | 44 | def main(): 45 | parser = argparse.ArgumentParser() 46 | parser.add_argument('--base_dir', default=os.path.expanduser('.')) 47 | parser.add_argument('--output', default='training') 48 | parser.add_argument('--dataset', default='thchs30', choices=['blizzard', 'ljspeech', 'thchs30']) 49 | parser.add_argument('--num_workers', type=int, default=cpu_count()) 50 | args = parser.parse_args() 51 | if args.dataset == 'blizzard': 52 | preprocess_blizzard(args) 53 | elif args.dataset == 'ljspeech': 54 | preprocess_ljspeech(args) 55 | elif args.dataset == 'thchs30': 56 | preprocess_thchs30(args) 57 | 58 | 59 | if __name__ == "__main__": 60 | main() 61 | -------------------------------------------------------------------------------- /text/__init__.py: -------------------------------------------------------------------------------- 1 | import re 2 | from text import cleaners 3 | from text.symbols import symbols 4 | 5 | 6 | # Mappings from symbol to numeric ID and vice versa: 7 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 8 | _id_to_symbol = {i: s for i, s in enumerate(symbols)} 9 | 10 | # Regular expression matching text enclosed in curly braces: 11 | _curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)') 12 | 13 | 14 | def text_to_sequence(text, cleaner_names): 15 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 16 | 17 | The text can optionally have ARPAbet sequences enclosed in curly braces embedded 18 | in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street." 
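  Concretely (values taken from tests/text_test.py, using the default symbol set):
  text_to_sequence('Hi!', []) == [9, 36, 54, 1], where the trailing 1 is the appended EOS symbol '~'.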
19 | 20 | Args: 21 | text: string to convert to a sequence 22 | cleaner_names: names of the cleaner functions to run the text through 23 | 24 | Returns: 25 | List of integers corresponding to the symbols in the text 26 | ''' 27 | sequence = [] 28 | 29 | # Check for curly braces and treat their contents as ARPAbet: 30 | while len(text): 31 | m = _curly_re.match(text) 32 | if not m: 33 | sequence += _symbols_to_sequence(_clean_text(text, cleaner_names)) 34 | break 35 | sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names)) 36 | sequence += _arpabet_to_sequence(m.group(2)) 37 | text = m.group(3) 38 | 39 | # Append EOS token 40 | sequence.append(_symbol_to_id['~']) 41 | return sequence 42 | 43 | 44 | def sequence_to_text(sequence): 45 | '''Converts a sequence of IDs back to a string''' 46 | result = '' 47 | for symbol_id in sequence: 48 | if symbol_id in _id_to_symbol: 49 | s = _id_to_symbol[symbol_id] 50 | # Enclose ARPAbet back in curly braces: 51 | if len(s) > 1 and s[0] == '@': 52 | s = '{%s}' % s[1:] 53 | result += s 54 | return result.replace('}{', ' ') 55 | 56 | 57 | def _clean_text(text, cleaner_names): 58 | for name in cleaner_names: 59 | cleaner = getattr(cleaners, name) 60 | if not cleaner: 61 | raise Exception('Unknown cleaner: %s' % name) 62 | text = cleaner(text) 63 | return text 64 | 65 | 66 | def _symbols_to_sequence(symbols): 67 | return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)] 68 | 69 | 70 | def _arpabet_to_sequence(text): 71 | return _symbols_to_sequence(['@' + s for s in text.split()]) 72 | 73 | 74 | def _should_keep_symbol(s): 75 | return s in _symbol_to_id and s is not '_' and s is not '~' 76 | -------------------------------------------------------------------------------- /tests/numbers_test.py: -------------------------------------------------------------------------------- 1 | from text.numbers import normalize_numbers 2 | 3 | 4 | def test_normalize_numbers(): 5 | assert normalize_numbers('1') == 'one' 6 | assert normalize_numbers('15') == 'fifteen' 7 | assert normalize_numbers('24') == 'twenty-four' 8 | assert normalize_numbers('100') == 'one hundred' 9 | assert normalize_numbers('101') == 'one hundred one' 10 | assert normalize_numbers('456') == 'four hundred fifty-six' 11 | assert normalize_numbers('1000') == 'one thousand' 12 | assert normalize_numbers('1800') == 'eighteen hundred' 13 | assert normalize_numbers('2,000') == 'two thousand' 14 | assert normalize_numbers('3000') == 'three thousand' 15 | assert normalize_numbers('18000') == 'eighteen thousand' 16 | assert normalize_numbers('24,000') == 'twenty-four thousand' 17 | assert normalize_numbers('124,001') == 'one hundred twenty-four thousand one' 18 | assert normalize_numbers('6.4 sec') == 'six point four sec' 19 | 20 | 21 | def test_normalize_ordinals(): 22 | assert normalize_numbers('1st') == 'first' 23 | assert normalize_numbers('2nd') == 'second' 24 | assert normalize_numbers('9th') == 'ninth' 25 | assert normalize_numbers('243rd place') == 'two hundred and forty-third place' 26 | 27 | 28 | def test_normalize_dates(): 29 | assert normalize_numbers('1400') == 'fourteen hundred' 30 | assert normalize_numbers('1901') == 'nineteen oh one' 31 | assert normalize_numbers('1999') == 'nineteen ninety-nine' 32 | assert normalize_numbers('2000') == 'two thousand' 33 | assert normalize_numbers('2004') == 'two thousand four' 34 | assert normalize_numbers('2010') == 'twenty ten' 35 | assert normalize_numbers('2012') == 'twenty twelve' 36 | assert normalize_numbers('2025') == 
'twenty twenty-five' 37 | assert normalize_numbers('September 11, 2001') == 'September eleven, two thousand one' 38 | assert normalize_numbers('July 26, 1984.') == 'July twenty-six, nineteen eighty-four.' 39 | 40 | 41 | def test_normalize_money(): 42 | assert normalize_numbers('$0.00') == 'zero dollars' 43 | assert normalize_numbers('$1') == 'one dollar' 44 | assert normalize_numbers('$10') == 'ten dollars' 45 | assert normalize_numbers('$.01') == 'one cent' 46 | assert normalize_numbers('$0.25') == 'twenty-five cents' 47 | assert normalize_numbers('$5.00') == 'five dollars' 48 | assert normalize_numbers('$5.01') == 'five dollars, one cent' 49 | assert normalize_numbers('$135.99.') == 'one hundred thirty-five dollars, ninety-nine cents.' 50 | assert normalize_numbers('$40,000') == 'forty thousand dollars' 51 | assert normalize_numbers('for £2500!') == 'for twenty-five hundred pounds!' 52 | -------------------------------------------------------------------------------- /text/cleaners.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Cleaners are transformations that run over the input text at both training and eval time. 3 | 4 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" 5 | hyperparameter. Some cleaners are English-specific. You'll typically want to use: 6 | 1. "english_cleaners" for English text 7 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using 8 | the Unidecode library (https://pypi.python.org/pypi/Unidecode) 9 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update 10 | the symbols in symbols.py to match your data). 11 | ''' 12 | 13 | import re 14 | from unidecode import unidecode 15 | from .numbers import normalize_numbers 16 | 17 | 18 | # Regular expression matching whitespace: 19 | _whitespace_re = re.compile(r'\s+') 20 | 21 | # List of (regular expression, replacement) pairs for abbreviations: 22 | _abbreviations = [(re.compile('\\b%s\\.' 
% x[0], re.IGNORECASE), x[1]) for x in [ 23 | ('mrs', 'misess'), 24 | ('mr', 'mister'), 25 | ('dr', 'doctor'), 26 | ('st', 'saint'), 27 | ('co', 'company'), 28 | ('jr', 'junior'), 29 | ('maj', 'major'), 30 | ('gen', 'general'), 31 | ('drs', 'doctors'), 32 | ('rev', 'reverend'), 33 | ('lt', 'lieutenant'), 34 | ('hon', 'honorable'), 35 | ('sgt', 'sergeant'), 36 | ('capt', 'captain'), 37 | ('esq', 'esquire'), 38 | ('ltd', 'limited'), 39 | ('col', 'colonel'), 40 | ('ft', 'fort'), 41 | ]] 42 | 43 | 44 | def expand_abbreviations(text): 45 | for regex, replacement in _abbreviations: 46 | text = re.sub(regex, replacement, text) 47 | return text 48 | 49 | 50 | def expand_numbers(text): 51 | return normalize_numbers(text) 52 | 53 | 54 | def lowercase(text): 55 | return text.lower() 56 | 57 | 58 | def collapse_whitespace(text): 59 | return re.sub(_whitespace_re, ' ', text) 60 | 61 | 62 | def convert_to_ascii(text): 63 | return unidecode(text) 64 | 65 | 66 | def basic_cleaners(text): 67 | '''Basic pipeline that lowercases and collapses whitespace without transliteration.''' 68 | text = lowercase(text) 69 | text = collapse_whitespace(text) 70 | return text 71 | 72 | 73 | def transliteration_cleaners(text): 74 | '''Pipeline for non-English text that transliterates to ASCII.''' 75 | text = convert_to_ascii(text) 76 | text = lowercase(text) 77 | text = collapse_whitespace(text) 78 | return text 79 | 80 | 81 | def english_cleaners(text): 82 | '''Pipeline for English text, including number and abbreviation expansion.''' 83 | text = convert_to_ascii(text) 84 | text = lowercase(text) 85 | text = expand_numbers(text) 86 | text = expand_abbreviations(text) 87 | text = collapse_whitespace(text) 88 | return text 89 | -------------------------------------------------------------------------------- /datasets/blizzard.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ProcessPoolExecutor 2 | from functools import partial 3 | import numpy as np 4 | import os 5 | from hparams import hparams 6 | from util import audio 7 | 8 | 9 | _max_out_length = 700 10 | _end_buffer = 0.05 11 | _min_confidence = 90 12 | 13 | # Note: "A Tramp Abroad" & "The Man That Corrupted Hadleyburg" are higher quality than the others. 
14 | books = [ 15 | 'ATrampAbroad', 16 | 'TheManThatCorruptedHadleyburg', 17 | # 'LifeOnTheMississippi', 18 | # 'TheAdventuresOfTomSawyer', 19 | ] 20 | 21 | def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x): 22 | executor = ProcessPoolExecutor(max_workers=num_workers) 23 | futures = [] 24 | index = 1 25 | for book in books: 26 | with open(os.path.join(in_dir, book, 'sentence_index.txt')) as f: 27 | for line in f: 28 | parts = line.strip().split('\t') 29 | if line[0] is not '#' and len(parts) == 8 and float(parts[3]) > _min_confidence: 30 | wav_path = os.path.join(in_dir, book, 'wav', '%s.wav' % parts[0]) 31 | labels_path = os.path.join(in_dir, book, 'lab', '%s.lab' % parts[0]) 32 | text = parts[5] 33 | task = partial(_process_utterance, out_dir, index, wav_path, labels_path, text) 34 | futures.append(executor.submit(task)) 35 | index += 1 36 | results = [future.result() for future in tqdm(futures)] 37 | return [r for r in results if r is not None] 38 | 39 | 40 | def _process_utterance(out_dir, index, wav_path, labels_path, text): 41 | # Load the wav file and trim silence from the ends: 42 | wav = audio.load_wav(wav_path) 43 | start_offset, end_offset = _parse_labels(labels_path) 44 | start = int(start_offset * hparams.sample_rate) 45 | end = int(end_offset * hparams.sample_rate) if end_offset is not None else -1 46 | wav = wav[start:end] 47 | max_samples = _max_out_length * hparams.frame_shift_ms / 1000 * hparams.sample_rate 48 | if len(wav) > max_samples: 49 | return None 50 | spectrogram = audio.spectrogram(wav).astype(np.float32) 51 | n_frames = spectrogram.shape[1] 52 | mel_spectrogram = audio.melspectrogram(wav).astype(np.float32) 53 | spectrogram_filename = 'blizzard-spec-%05d.npy' % index 54 | mel_filename = 'blizzard-mel-%05d.npy' % index 55 | np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False) 56 | np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False) 57 | return (spectrogram_filename, mel_filename, n_frames, text) 58 | 59 | 60 | def _parse_labels(path): 61 | labels = [] 62 | with open(os.path.join(path)) as f: 63 | for line in f: 64 | parts = line.strip().split(' ') 65 | if len(parts) >= 3: 66 | labels.append((float(parts[0]), ' '.join(parts[2:]))) 67 | start = 0 68 | end = None 69 | if labels[0][1] == 'sil': 70 | start = labels[0][0] 71 | if labels[-1][1] == 'sil': 72 | end = labels[-2][0] + _end_buffer 73 | return (start, end) 74 | -------------------------------------------------------------------------------- /datasets/ljspeech.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ProcessPoolExecutor 2 | from functools import partial 3 | import numpy as np 4 | import os 5 | from util import audio 6 | 7 | 8 | def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x): 9 | '''Preprocesses the LJ Speech dataset from a given input path into a given output directory. 10 | 11 | Args: 12 | in_dir: The directory where you have downloaded the LJ Speech dataset 13 | out_dir: The directory to write the output into 14 | num_workers: Optional number of worker processes to parallelize across 15 | tqdm: You can optionally pass tqdm to get a nice progress bar 16 | 17 | Returns: 18 | A list of tuples describing the training examples. This should be written to train.txt 19 | ''' 20 | 21 | # We use ProcessPoolExecutor to parallize across processes. 
This is just an optimization and you 22 | # can omit it and just call _process_utterance on each input if you want. 23 | executor = ProcessPoolExecutor(max_workers=num_workers) 24 | futures = [] 25 | index = 1 26 | with open(os.path.join(in_dir, 'metadata.csv'), encoding='utf-8') as f: 27 | for line in f: 28 | parts = line.strip().split('|') 29 | wav_path = os.path.join(in_dir, 'wavs', '%s.wav' % parts[0]) 30 | text = parts[2] 31 | futures.append(executor.submit(partial(_process_utterance, out_dir, index, wav_path, text))) 32 | index += 1 33 | return [future.result() for future in tqdm(futures)] 34 | 35 | 36 | def _process_utterance(out_dir, index, wav_path, text): 37 | '''Preprocesses a single utterance audio/text pair. 38 | 39 | This writes the mel and linear scale spectrograms to disk and returns a tuple to write 40 | to the train.txt file. 41 | 42 | Args: 43 | out_dir: The directory to write the spectrograms into 44 | index: The numeric index to use in the spectrogram filenames. 45 | wav_path: Path to the audio file containing the speech input 46 | text: The text spoken in the input audio file 47 | 48 | Returns: 49 | A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt 50 | ''' 51 | 52 | # Load the audio to a numpy array: 53 | wav = audio.load_wav(wav_path) 54 | 55 | # Compute the linear-scale spectrogram from the wav: 56 | spectrogram = audio.spectrogram(wav).astype(np.float32) 57 | n_frames = spectrogram.shape[1] 58 | 59 | # Compute a mel-scale spectrogram from the wav: 60 | mel_spectrogram = audio.melspectrogram(wav).astype(np.float32) 61 | 62 | # Write the spectrograms to disk: 63 | spectrogram_filename = 'ljspeech-spec-%05d.npy' % index 64 | mel_filename = 'ljspeech-mel-%05d.npy' % index 65 | np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False) 66 | np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False) 67 | 68 | # Return a tuple describing this training example: 69 | return (spectrogram_filename, mel_filename, n_frames, text) 70 | -------------------------------------------------------------------------------- /TRAINING_DATA.md: -------------------------------------------------------------------------------- 1 | # Training Data 2 | 3 | 4 | This repo supports the following speech datasets: 5 | * [LJ Speech](https://keithito.com/LJ-Speech-Dataset/) (Public Domain) 6 | * [Blizzard 2012](http://www.cstr.ed.ac.uk/projects/blizzard/2012/phase_one) (Creative Commons Attribution Share-Alike) 7 | 8 | You can use any other dataset if you write a preprocessor for it. 9 | 10 | 11 | ### Writing a Preprocessor 12 | 13 | Each training example consists of: 14 | 1. The text that was spoken 15 | 2. A mel-scale spectrogram of the audio 16 | 3. A linear-scale spectrogram of the audio 17 | 18 | The preprocessor is responsible for generating these. See [ljspeech.py](datasets/ljspeech.py) for a 19 | commented example. 20 | 21 | For each training example, a preprocessor should: 22 | 23 | 1. Load the audio file: 24 | ```python 25 | wav = audio.load_wav(wav_path) 26 | ``` 27 | 28 | 2. Compute linear-scale and mel-scale spectrograms (float32 numpy arrays): 29 | ```python 30 | spectrogram = audio.spectrogram(wav).astype(np.float32) 31 | mel_spectrogram = audio.melspectrogram(wav).astype(np.float32) 32 | ``` 33 | 34 | 3. 
Save the spectrograms to disk: 35 | ```python 36 | np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False) 37 | np.save(os.path.join(out_dir, mel_spectrogram_filename), mel_spectrogram.T, allow_pickle=False) 38 | ``` 39 | Note that the transpose of the matrix returned by `audio.spectrogram` is saved so that it's 40 | in time-major format. 41 | 42 | 4. Generate a tuple `(spectrogram_filename, mel_spectrogram_filename, n_frames, text)` to 43 | write to train.txt. n_frames is just the length of the time axis of the spectrogram. 44 | 45 | 46 | After you've written your preprocessor, you can add it to [preprocess.py](preprocess.py) by 47 | following the example of the other preprocessors in that file. 48 | 49 | 50 | ### Non-English Data 51 | 52 | If your training data is in a language other than English, you will probably want to change the 53 | text cleaners by setting the `cleaners` hyperparameter. 54 | 55 | * If your text is in a Latin script or can be transliterated to ASCII using the 56 | [Unidecode](https://pypi.python.org/pypi/Unidecode) library, you can use the transliteration 57 | cleaners by setting the hyperparameter `cleaners=transliteration_cleaners`. 58 | 59 | * If you don't want to transliterate, you can define a custom character set. 60 | This allows you to train directly on the character set used in your data. 61 | 62 | To do so, edit [symbols.py](text/symbols.py) and change the `_characters` variable to be a 63 | string containing the UTF-8 characters in your data. Then set the hyperparameter `cleaners=basic_cleaners`. 64 | 65 | * If you're not sure which option to use, you can evaluate the transliteration cleaners like this: 66 | 67 | ```python 68 | from text import cleaners 69 | cleaners.transliteration_cleaners('Здравствуйте') # Replace with the text you want to try 70 | ``` 71 | -------------------------------------------------------------------------------- /eval.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import re 4 | import tensorflow as tf 5 | from hparams import hparams, hparams_debug_string 6 | from synthesizer import Synthesizer 7 | from util.txt2pinyin import text_to_pinyin 8 | 9 | # sentences = [ 10 | # # From July 8, 2017 New York Times: 11 | # 'Scientists at the CERN laboratory say they have discovered a new particle.', 12 | # 'There’s a way to measure the acute emotional intelligence that has never gone out of style.', 13 | # 'President Trump met with other leaders at the Group of 20 conference.', 14 | # 'The Senate\'s bill to repeal and replace the Affordable Care Act is now imperiled.', 15 | # # From Google's Tacotron example page: 16 | # 'Generative adversarial network or variational auto-encoder.', 17 | # 'The buses aren\'t the problem, they actually provide a solution.', 18 | # 'Does the quick brown fox jump over the lazy dog?', 19 | # 'Talib Kweli confirmed to AllHipHop that he will be releasing an album in the next year.', 20 | # ] 21 | sentence_1 = '小明硕士毕业于中国科学院计算所,后在日本京都大学深造' 22 | sentence_2 = text_to_pinyin(sentence_1) 23 | print(sentence_2) 24 | 25 | # sentences = [ 26 | # 'ta1 jing3 ti4 de5 xia4 le5 chuang2 gei3 liang3 ge5 sun1 zi5 ye4 hao3 bei4 zi5 you4 na2 guo4 yi1 ba3 da4 yi3 zi5 ba3 jie3 mei4 lia3 dang3 zhu4 gang1 zou3 dao4 ke4 ting1 jiu4 bei4 ren2 lan2 yao1 bao4 zhu4 le5', 27 | # 'wei1 xin4 zhi1 fu4 zhang1 xiao3 long2 han3 jian4 lou4 mian4 cheng1 wei1 xin4 bu4 hui4 cha2 kan4 yong4 hu4 liao2 tian1 ji4 lu4 yi4 si an4 feng4 zhi1 fu4 bao3 
, ben3 wen2 lai2 zi4 teng2 xun4 ke1 ji4 .', 28 | # 'da4 hui4 zhi3 re4 nao5 tou2 liang3 tian1 yue4 hou4 yue4 song1 kua3 zui4 zhong1 chu1 ben3 lun4 wen2 ji2 jiu4 suan4 yuan2 man3 wan2 cheng2 ren4 wu5', 29 | # 'lian2 dui4 zhi3 liu2 xia4 yi4 ming2 zhi2 ban1 yuan2 chui1 shi4 yuan2 si4 yang3 yuan2 wei4 sheng1 yuan2 deng3 ye3 lie4 dui4 pao3 bu4 gan2 wang3 zai1 qu1', 30 | # 'yi1 jiu3 wu3 ling2 nian2 ba1 yue4 zhong1 yang1 ren2 min2 zheng4 fu3 zheng4 wu4 yuan4 ban1 bu4 le5 bao3 zhang4 fa1 ming2 quan2 yu3 zhuan1 li4 quan2 zan4 xing2 tiao2 li4', 31 | # ] 32 | sentences = [sentence_2] 33 | 34 | 35 | def get_output_base_path(checkpoint_path): 36 | base_dir = os.path.dirname(checkpoint_path) 37 | m = re.compile(r'.*?\.ckpt\-([0-9]+)').match(checkpoint_path) 38 | name = 'eval-%d' % int(m.group(1)) if m else 'eval' 39 | return os.path.join(base_dir, name) 40 | 41 | 42 | def run_eval(args): 43 | print(hparams_debug_string()) 44 | synth = Synthesizer() 45 | synth.load(args.checkpoint) 46 | base_path = get_output_base_path(args.checkpoint) 47 | for i, text in enumerate(sentences): 48 | path = '%s-%03d.wav' % (base_path, i) 49 | print('Synthesizing: %s' % path) 50 | with open(path, 'wb') as f: 51 | f.write(synth.synthesize(text)) 52 | 53 | 54 | def main(): 55 | parser = argparse.ArgumentParser() 56 | parser.add_argument('--checkpoint', required=True, help='Path to model checkpoint') 57 | parser.add_argument('--hparams', default='', 58 | help='Hyperparameter overrides as a comma-separated list of name=value pairs') 59 | args = parser.parse_args() 60 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 61 | os.environ['CUDA_VISIBLE_DEVICES'] = '1' 62 | hparams.parse(args.hparams) 63 | run_eval(args) 64 | 65 | 66 | if __name__ == '__main__': 67 | main() 68 | -------------------------------------------------------------------------------- /datasets/thchs30.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ProcessPoolExecutor 2 | from functools import partial 3 | import numpy as np 4 | import os 5 | import glob 6 | from util import audio 7 | from hparams import hparams as hp 8 | 9 | 10 | def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x): 11 | '''Preprocesses the THCHS30 dataset from a given input path into a given output directory. 12 | 13 | Args: 14 | in_dir: The directory where you have downloaded the THCHS30 dataset 15 | out_dir: The directory to write the output into 16 | num_workers: Optional number of worker processes to parallelize across 17 | tqdm: You can optionally pass tqdm to get a nice progress bar 18 | 19 | Returns: 20 | A list of tuples describing the training examples. This should be written to train.txt 21 | ''' 22 | 23 | # We use ProcessPoolExecutor to parallize across processes. This is just an optimization and you 24 | # can omit it and just call _process_utterance on each input if you want. 
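  # A rough serial equivalent of the loop below would be (hedged sketch, not part of this repo):
  #   metadata = []
  #   for i, trn in enumerate(glob.glob(os.path.join(in_dir, 'data', '*.trn')), 1):
  #       with open(trn) as f:
  #           metadata.append(_process_utterance(out_dir, i, trn[:-4], f.readline().strip('\n')))
  #   return [m for m in metadata if m is not None]
  # (Translation of the Chinese comments below: "标贝数据集" = "Biaobei dataset";
  #  "若是单独训练train文件则data改成train" = "if training on the train/ split alone, change 'data' to 'train'".)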
25 | executor = ProcessPoolExecutor(max_workers=num_workers) 26 | futures = [] 27 | index = 1 28 | 29 | # trn_files = glob.glob(os.path.join(in_dir, 'biaobei_48000', '*.trn')) # 标贝数据集 30 | trn_files = glob.glob(os.path.join(in_dir, 'data', '*.trn')) # 若是单独训练train文件则data改成train 31 | print("trn_files:",trn_files) 32 | for trn in trn_files: 33 | # print("trn:",trn) 34 | with open(trn) as f: 35 | pinyin = f.readline().strip('\n') 36 | # wav_file = trn[:-4] + '.wav' # 标贝数据集 37 | wav_file = trn[:-4] 38 | print("wav_file:",wav_file) 39 | task = partial(_process_utterance, out_dir, index, wav_file, pinyin) 40 | futures.append(executor.submit(task)) 41 | index += 1 42 | return [future.result() for future in tqdm(futures) if future.result() is not None] 43 | 44 | 45 | def _process_utterance(out_dir, index, wav_path, pinyin): 46 | '''Preprocesses a single utterance audio/text pair. 47 | 48 | This writes the mel and linear scale spectrograms to disk and returns a tuple to write 49 | to the train.txt file. 50 | 51 | Args: 52 | out_dir: The directory to write the spectrograms into 53 | index: The numeric index to use in the spectrogram filenames. 54 | wav_path: Path to the audio file containing the speech input 55 | pinyin: The pinyin of Chinese spoken in the input audio file 56 | 57 | Returns: 58 | A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt 59 | ''' 60 | 61 | # Load the audio to a numpy array: 62 | wav = audio.load_wav(wav_path) 63 | 64 | # rescale wav for unified measure for all clips 65 | wav = wav / np.abs(wav).max() * 0.999 66 | 67 | # trim silence 68 | wav = audio.trim_silence(wav) 69 | 70 | # Compute the linear-scale spectrogram from the wav: 71 | spectrogram = audio.spectrogram(wav).astype(np.float32) 72 | n_frames = spectrogram.shape[1] 73 | if n_frames > hp.max_frame_num: 74 | return None 75 | 76 | # Compute a mel-scale spectrogram from the wav: 77 | mel_spectrogram = audio.melspectrogram(wav).astype(np.float32) 78 | 79 | # Write the spectrograms to disk: 80 | spectrogram_filename = 'thchs30-spec-%05d.npy' % index 81 | mel_filename = 'thchs30-mel-%05d.npy' % index 82 | np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False) 83 | np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False) 84 | 85 | # Return a tuple describing this training example: 86 | return (spectrogram_filename, mel_filename, n_frames, pinyin) 87 | -------------------------------------------------------------------------------- /models/modules.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.contrib.rnn import GRUCell 3 | 4 | 5 | def prenet(inputs, is_training, layer_sizes, scope=None): 6 | x = inputs 7 | drop_rate = 0.5 if is_training else 0.0 8 | with tf.variable_scope(scope or 'prenet'): 9 | for i, size in enumerate(layer_sizes): 10 | dense = tf.layers.dense(x, units=size, activation=tf.nn.relu, name='dense_%d' % (i+1)) 11 | x = tf.layers.dropout(dense, rate=drop_rate, training=is_training, name='dropout_%d' % (i+1)) 12 | return x 13 | 14 | 15 | def encoder_cbhg(inputs, input_lengths, is_training, depth): 16 | input_channels = inputs.get_shape()[2] 17 | return cbhg( 18 | inputs, 19 | input_lengths, 20 | is_training, 21 | scope='encoder_cbhg', 22 | K=16, 23 | projections=[128, input_channels], 24 | depth=depth) 25 | 26 | 27 | def post_cbhg(inputs, input_dim, is_training, depth): 28 | return cbhg( 29 | inputs, 30 | None, 31 | is_training, 32 | 
scope='post_cbhg', 33 | K=8, 34 | projections=[256, input_dim], 35 | depth=depth) 36 | 37 | 38 | def cbhg(inputs, input_lengths, is_training, scope, K, projections, depth): 39 | with tf.variable_scope(scope): 40 | with tf.variable_scope('conv_bank'): 41 | # Convolution bank: concatenate on the last axis to stack channels from all convolutions 42 | conv_outputs = tf.concat( 43 | [conv1d(inputs, k, 128, tf.nn.relu, is_training, 'conv1d_%d' % k) for k in range(1, K+1)], 44 | axis=-1 45 | ) 46 | 47 | # Maxpooling: 48 | maxpool_output = tf.layers.max_pooling1d( 49 | conv_outputs, 50 | pool_size=2, 51 | strides=1, 52 | padding='same') 53 | 54 | # Two projection layers: 55 | proj1_output = conv1d(maxpool_output, 3, projections[0], tf.nn.relu, is_training, 'proj_1') 56 | proj2_output = conv1d(proj1_output, 3, projections[1], lambda _:_, is_training, 'proj_2') 57 | 58 | # Residual connection: 59 | highway_input = proj2_output + inputs 60 | 61 | half_depth = depth // 2 62 | assert half_depth*2 == depth, 'encoder and postnet depths must be even.' 63 | 64 | # Handle dimensionality mismatch: 65 | if highway_input.shape[2] != half_depth: 66 | highway_input = tf.layers.dense(highway_input, half_depth) 67 | 68 | # 4-layer HighwayNet: 69 | for i in range(4): 70 | highway_input = highwaynet(highway_input, 'highway_%d' % (i+1), half_depth) 71 | rnn_input = highway_input 72 | 73 | # Bidirectional RNN 74 | outputs, states = tf.nn.bidirectional_dynamic_rnn( 75 | GRUCell(half_depth), 76 | GRUCell(half_depth), 77 | rnn_input, 78 | sequence_length=input_lengths, 79 | dtype=tf.float32) 80 | return tf.concat(outputs, axis=2) # Concat forward and backward 81 | 82 | 83 | def highwaynet(inputs, scope, depth): 84 | with tf.variable_scope(scope): 85 | H = tf.layers.dense( 86 | inputs, 87 | units=depth, 88 | activation=tf.nn.relu, 89 | name='H') 90 | T = tf.layers.dense( 91 | inputs, 92 | units=depth, 93 | activation=tf.nn.sigmoid, 94 | name='T', 95 | bias_initializer=tf.constant_initializer(-1.0)) 96 | return H * T + inputs * (1.0 - T) 97 | 98 | 99 | def conv1d(inputs, kernel_size, channels, activation, is_training, scope): 100 | with tf.variable_scope(scope): 101 | conv1d_output = tf.layers.conv1d( 102 | inputs, 103 | filters=channels, 104 | kernel_size=kernel_size, 105 | activation=None, 106 | padding='same') 107 | batched = tf.layers.batch_normalization(conv1d_output, training=is_training) 108 | return activation(batched) 109 | -------------------------------------------------------------------------------- /demo_server.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import falcon 3 | from hparams import hparams, hparams_debug_string 4 | import os 5 | from synthesizer import Synthesizer 6 | from util.txt2pinyin import text_to_pinyin 7 | 8 | 9 | html_body = '''mandarin_tacotron Demo 10 | 21 | 22 |
[The HTML/CSS/JavaScript markup of this demo page (original demo_server.py lines 10-58) was stripped during extraction; only stray line numbers remain. Judging from the SynthesisResource handler below, the page is a simple text-input form that issues GET /synthesize?text=... and plays back the returned WAV audio.]
27 | 28 | 58 | ''' 59 | 60 | 61 | class UIResource: 62 | def on_get(self, req, res): 63 | res.content_type = 'text/html' 64 | res.body = html_body 65 | 66 | 67 | class SynthesisResource: 68 | def on_get(self, req, res): 69 | if not req.params.get('text'): 70 | raise falcon.HTTPBadRequest() 71 | get_text = req.params.get('text') 72 | print("get_text:", get_text) 73 | print("get_text类型:", type(get_text)) 74 | sentence = text_to_pinyin(get_text) 75 | print("sentence:", sentence) 76 | print("sentence类型:", type(sentence)) 77 | # res.data = synthesizer.synthesize(req.params.get('text')) 78 | res.data = synthesizer.synthesize(sentence) 79 | res.content_type = 'audio/wav' 80 | 81 | 82 | synthesizer = Synthesizer() 83 | api = falcon.API() 84 | api.add_route('/synthesize', SynthesisResource()) 85 | api.add_route('/', UIResource()) 86 | 87 | 88 | if __name__ == '__main__': 89 | from wsgiref import simple_server 90 | parser = argparse.ArgumentParser() 91 | parser.add_argument('--checkpoint', required=True, help='Full path to model checkpoint') 92 | parser.add_argument('--port', type=int, default=9000) 93 | parser.add_argument('--hparams', default='', 94 | help='Hyperparameter overrides as a comma-separated list of name=value pairs') 95 | args = parser.parse_args() 96 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 97 | hparams.parse(args.hparams) 98 | print(hparams_debug_string()) 99 | synthesizer.load(args.checkpoint) 100 | print('Serving on port %d' % args.port) 101 | simple_server.make_server('0.0.0.0', args.port, api).serve_forever() 102 | else: 103 | synthesizer.load(os.environ['CHECKPOINT']) 104 | -------------------------------------------------------------------------------- /models/helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from tensorflow.contrib.seq2seq import Helper 4 | 5 | 6 | # Adapted from tf.contrib.seq2seq.GreedyEmbeddingHelper 7 | class TacoTestHelper(Helper): 8 | def __init__(self, batch_size, output_dim, r): 9 | with tf.name_scope('TacoTestHelper'): 10 | self._batch_size = batch_size 11 | self._output_dim = output_dim 12 | self._reduction_factor = r 13 | 14 | @property 15 | def batch_size(self): 16 | return self._batch_size 17 | 18 | @property 19 | def token_output_size(self): 20 | return self._reduction_factor 21 | 22 | @property 23 | def sample_ids_shape(self): 24 | return tf.TensorShape([]) 25 | 26 | @property 27 | def sample_ids_dtype(self): 28 | return np.int32 29 | 30 | def initialize(self, name=None): 31 | return (tf.tile([False], [self._batch_size]), _go_frames(self._batch_size, self._output_dim)) 32 | 33 | def sample(self, time, outputs, state, name=None): 34 | return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them 35 | 36 | def next_inputs(self, time, outputs, state, sample_ids, stop_token_preds, name=None): 37 | '''Stop on EOS. Otherwise, pass the last output as the next input and pass through state.''' 38 | with tf.name_scope('TacoTestHelper'): 39 | # A sequence is finished when the stop token probability is > 0.5 40 | # With enough training steps, the model should be able to predict when to stop correctly 41 | # and the use of stop_at_any = True would be recommended. If however the model didn't 42 | # learn to stop correctly yet, (stops too soon) one could choose to use the safer option 43 | # to get a correct synthesis 44 | finished = tf.reduce_any(tf.cast(tf.round(stop_token_preds), tf.bool)) 45 | 46 | # Feed last output frame as next input. 
outputs is [N, output_dim * r] 47 | next_inputs = outputs[:, -self._output_dim:] 48 | return (finished, next_inputs, state) 49 | 50 | 51 | class TacoTrainingHelper(Helper): 52 | def __init__(self, inputs, targets, output_dim, r, global_step): 53 | # inputs is [N, T_in], targets is [N, T_out, D] 54 | with tf.name_scope('TacoTrainingHelper'): 55 | self._batch_size = tf.shape(inputs)[0] 56 | self._output_dim = output_dim 57 | self._reduction_factor = r 58 | self._ratio = tf.convert_to_tensor(1.) 59 | self.global_step = global_step 60 | 61 | # Feed every r-th target frame as input 62 | self._targets = targets[:, r-1::r, :] 63 | 64 | # Use full length for every target because we don't want to mask the padding frames 65 | num_steps = tf.shape(self._targets)[1] 66 | self._lengths = tf.tile([num_steps], [self._batch_size]) 67 | 68 | @property 69 | def batch_size(self): 70 | return self._batch_size 71 | 72 | @property 73 | def token_output_size(self): 74 | return self._reduction_factor 75 | 76 | @property 77 | def sample_ids_shape(self): 78 | return tf.TensorShape([]) 79 | 80 | @property 81 | def sample_ids_dtype(self): 82 | return np.int32 83 | 84 | def initialize(self, name=None): 85 | self._ratio = _teacher_forcing_ratio_decay(1., self.global_step) 86 | return (tf.tile([False], [self._batch_size]), _go_frames(self._batch_size, self._output_dim)) 87 | 88 | def sample(self, time, outputs, state, name=None): 89 | return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them 90 | 91 | def next_inputs(self, time, outputs, state, sample_ids, stop_token_preds, name=None): 92 | with tf.name_scope(name or 'TacoTrainingHelper'): 93 | finished = (time + 1 >= self._lengths) 94 | 95 | #Pick previous outputs randomly with respect to teacher forcing ratio 96 | next_inputs = tf.cond(tf.less(tf.random_uniform([], minval=0, maxval=1, dtype=tf.float32), self._ratio), 97 | lambda: self._targets[:, time, :], #Teacher-forcing: return true frame 98 | lambda: outputs[:,-self._output_dim:]) 99 | 100 | # next_inputs = self._targets[:, time, :] # Teacher forcing: feed the true frame 101 | return (finished, next_inputs, state) 102 | 103 | 104 | def _go_frames(batch_size, output_dim): 105 | '''Returns all-zero frames for a given batch size and output dimension''' 106 | return tf.tile([[0.0]], [batch_size, output_dim]) 107 | 108 | def _teacher_forcing_ratio_decay(init_tfr, global_step): 109 | ################################################################# 110 | # Narrow Cosine Decay: 111 | 112 | # Phase 1: tfr = 1 113 | # We only start learning rate decay after 10k steps 114 | 115 | # Phase 2: tfr in ]0, 1[ 116 | # decay reach minimal value at step ~280k 117 | 118 | # Phase 3: tfr = 0 119 | # clip by minimal teacher forcing ratio value (step >~ 280k) 120 | ################################################################# 121 | #Compute natural cosine decay 122 | tfr = tf.train.cosine_decay(init_tfr, 123 | global_step=global_step - 20000, #tfr = 1 at step 10k 124 | decay_steps=200000, #tfr = 0 at step ~280k 125 | alpha=0., #tfr = 0% of init_tfr as final value 126 | name='tfr_cosine_decay') 127 | 128 | #force teacher forcing ratio to take initial value when global step < start decay step. 
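  # (With the constants above, the ratio stays at init_tfr until step 20k and reaches 0 around
  # step 220k (20k offset + 200k decay steps); the 10k / ~280k figures in the phase comments are
  # approximate.)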
129 | narrow_tfr = tf.cond( 130 | tf.less(global_step, tf.convert_to_tensor(20000)), 131 | lambda: tf.convert_to_tensor(init_tfr), 132 | lambda: tfr) 133 | 134 | return narrow_tfr 135 | -------------------------------------------------------------------------------- /models/custom_decoder.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import collections 6 | import tensorflow as tf 7 | 8 | from tensorflow.contrib.seq2seq.python.ops import decoder 9 | from tensorflow.contrib.seq2seq.python.ops import helper as helper_py 10 | from tensorflow.python.framework import ops 11 | from tensorflow.python.framework import tensor_shape 12 | from tensorflow.python.layers import base as layers_base 13 | from tensorflow.python.ops import rnn_cell_impl 14 | from tensorflow.python.util import nest 15 | from .helpers import TacoTrainingHelper, TacoTestHelper 16 | 17 | 18 | class CustomDecoderOutput( 19 | collections.namedtuple("CustomDecoderOutput", ("rnn_output", "token_output", "sample_id"))): 20 | pass 21 | 22 | 23 | class CustomDecoder(decoder.Decoder): 24 | """Custom sampling decoder. 25 | 26 | Allows for stop token prediction at inference time 27 | and returns equivalent loss in training time. 28 | 29 | Note: 30 | Only use this decoder with Tacotron 2 as it only accepts tacotron custom helpers 31 | """ 32 | 33 | def __init__(self, cell, helper, initial_state, output_layer=None): 34 | """Initialize CustomDecoder. 35 | Args: 36 | cell: An `RNNCell` instance. 37 | helper: A `Helper` instance. 38 | initial_state: A (possibly nested tuple of...) tensors and TensorArrays. 39 | The initial state of the RNNCell. 40 | output_layer: (Optional) An instance of `tf.layers.Layer`, i.e., 41 | `tf.layers.Dense`. Optional layer to apply to the RNN output prior 42 | to storing the result or sampling. 43 | Raises: 44 | TypeError: if `cell`, `helper` or `output_layer` have an incorrect type. 45 | """ 46 | rnn_cell_impl.assert_like_rnncell(type(cell), cell) 47 | if not isinstance(helper, helper_py.Helper): 48 | raise TypeError("helper must be a Helper, received: %s" % type(helper)) 49 | if (output_layer is not None 50 | and not isinstance(output_layer, layers_base.Layer)): 51 | raise TypeError( 52 | "output_layer must be a Layer, received: %s" % type(output_layer)) 53 | self._cell = cell 54 | self._helper = helper 55 | self._initial_state = initial_state 56 | self._output_layer = output_layer 57 | 58 | @property 59 | def batch_size(self): 60 | return self._helper.batch_size 61 | 62 | def _rnn_output_size(self): 63 | size = self._cell.output_size 64 | if self._output_layer is None: 65 | return size 66 | else: 67 | # To use layer's compute_output_shape, we need to convert the 68 | # RNNCell's output_size entries into shapes with an unknown 69 | # batch size. We then pass this through the layer's 70 | # compute_output_shape and read off all but the first (batch) 71 | # dimensions to get the output size of the rnn with the layer 72 | # applied to the top. 
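      # Note (assumption): newer TensorFlow 1.x releases expose this functionality as the
      # public compute_output_shape() method; if the private _compute_output_shape call
      # below raises an AttributeError, switching to the public name should be equivalent.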
73 | output_shape_with_unknown_batch = nest.map_structure( 74 | lambda s: tensor_shape.TensorShape([None]).concatenate(s), 75 | size) 76 | layer_output_shape = self._output_layer._compute_output_shape( # pylint: disable=protected-access 77 | output_shape_with_unknown_batch) 78 | return nest.map_structure(lambda s: s[1:], layer_output_shape) 79 | 80 | @property 81 | def output_size(self): 82 | # Return the cell output and the id 83 | return CustomDecoderOutput( 84 | rnn_output=self._rnn_output_size(), 85 | token_output=self._helper.token_output_size, 86 | sample_id=self._helper.sample_ids_shape) 87 | 88 | @property 89 | def output_dtype(self): 90 | # Assume the dtype of the cell is the output_size structure 91 | # containing the input_state's first component's dtype. 92 | # Return that structure and the sample_ids_dtype from the helper. 93 | dtype = nest.flatten(self._initial_state)[0].dtype 94 | return CustomDecoderOutput( 95 | nest.map_structure(lambda _: dtype, self._rnn_output_size()), 96 | tf.float32, 97 | self._helper.sample_ids_dtype) 98 | 99 | def initialize(self, name=None): 100 | """Initialize the decoder. 101 | Args: 102 | name: Name scope for any created operations. 103 | Returns: 104 | `(finished, first_inputs, initial_state)`. 105 | """ 106 | return self._helper.initialize() + (self._initial_state,) 107 | 108 | def step(self, time, inputs, state, name=None): 109 | """Perform a custom decoding step. 110 | Enables for dyanmic prediction 111 | Args: 112 | time: scalar `int32` tensor. 113 | inputs: A (structure of) input tensors. 114 | state: A (structure of) state tensors and TensorArrays. 115 | name: Name scope for any created operations. 116 | Returns: 117 | `(outputs, next_state, next_inputs, finished)`. 118 | """ 119 | with ops.name_scope(name, "CustomDecoderStep", (time, inputs, state)): 120 | #Call outputprojection wrapper cell 121 | (cell_outputs, stop_token), cell_state = self._cell(inputs, state) 122 | 123 | #apply output_layer (if existant) 124 | if self._output_layer is not None: 125 | cell_outputs = self._output_layer(cell_outputs) 126 | sample_ids = self._helper.sample( 127 | time=time, outputs=cell_outputs, state=cell_state) 128 | 129 | (finished, next_inputs, next_state) = self._helper.next_inputs( 130 | time=time, 131 | outputs=cell_outputs, 132 | state=cell_state, 133 | sample_ids=sample_ids, 134 | stop_token_preds=stop_token) 135 | 136 | outputs = CustomDecoderOutput(cell_outputs, stop_token, sample_ids) 137 | return (outputs, next_state, next_inputs, finished) 138 | -------------------------------------------------------------------------------- /util/audio.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import librosa.filters 3 | import math 4 | import numpy as np 5 | import tensorflow as tf 6 | from scipy import signal 7 | from scipy.io import wavfile 8 | from hparams import hparams 9 | 10 | 11 | def load_wav(path): 12 | return librosa.core.load(path, sr=hparams.sample_rate)[0] 13 | 14 | 15 | def save_wav(wav, path): 16 | # rescaling for unified measure for all clips 17 | wav = wav / np.abs(wav).max() * 0.999 18 | # factor 0.5 in case of overflow for int16 19 | f1 = 0.5 * 32767 / max(0.01, np.max(np.abs(wav))) 20 | # sublinear scaling as Y ~ X ^ k (k < 1) 21 | f2 = np.sign(wav) * np.power(np.abs(wav), 0.8) 22 | wav = f1 * f2 23 | # bandpass for less noises 24 | firwin = signal.firwin(hparams.num_freq, [hparams.fmin, hparams.fmax], pass_zero=False, fs=hparams.sample_rate) 25 | wav = 
signal.convolve(wav, firwin) 26 | 27 | wavfile.write(path, hparams.sample_rate, wav.astype(np.int16)) 28 | 29 | 30 | def trim_silence(wav): 31 | return librosa.effects.trim(wav, top_db= 60, frame_length=512, hop_length=128)[0] 32 | 33 | 34 | def preemphasis(x): 35 | return signal.lfilter([1, -hparams.preemphasis], [1], x) 36 | 37 | 38 | def inv_preemphasis(x): 39 | return signal.lfilter([1], [1, -hparams.preemphasis], x) 40 | 41 | 42 | def spectrogram(y): 43 | D = _stft(preemphasis(y)) 44 | S = _amp_to_db(np.abs(D)) - hparams.ref_level_db 45 | return _normalize(S) 46 | 47 | 48 | def inv_spectrogram(spectrogram): 49 | '''Converts spectrogram to waveform using librosa''' 50 | S = _db_to_amp(_denormalize(spectrogram) + hparams.ref_level_db) # Convert back to linear 51 | return inv_preemphasis(_griffin_lim(S ** hparams.power)) # Reconstruct phase 52 | 53 | 54 | def inv_spectrogram_tensorflow(spectrogram): 55 | '''Builds computational graph to convert spectrogram to waveform using TensorFlow. 56 | 57 | Unlike inv_spectrogram, this does NOT invert the preemphasis. The caller should call 58 | inv_preemphasis on the output after running the graph. 59 | ''' 60 | S = _db_to_amp_tensorflow(_denormalize_tensorflow(spectrogram) + hparams.ref_level_db) 61 | return _griffin_lim_tensorflow(tf.pow(S, hparams.power)) 62 | 63 | 64 | def melspectrogram(y): 65 | D = _stft(preemphasis(y)) 66 | S = _amp_to_db(_linear_to_mel(np.abs(D))) - hparams.ref_level_db 67 | return _normalize(S) 68 | 69 | 70 | def find_endpoint(wav, threshold_db=-40, min_silence_sec=0.8): 71 | window_length = int(hparams.sample_rate * min_silence_sec) 72 | hop_length = int(window_length / 4) 73 | threshold = _db_to_amp(threshold_db) 74 | for x in range(hop_length, len(wav) - window_length, hop_length): 75 | if np.max(wav[x:x+window_length]) < threshold: 76 | return x + hop_length 77 | return len(wav) 78 | 79 | 80 | def _griffin_lim(S): 81 | '''librosa implementation of Griffin-Lim 82 | Based on https://github.com/librosa/librosa/issues/434 83 | ''' 84 | angles = np.exp(2j * np.pi * np.random.rand(*S.shape)) 85 | S_complex = np.abs(S).astype(np.complex) 86 | y = _istft(S_complex * angles) 87 | for i in range(hparams.griffin_lim_iters): 88 | angles = np.exp(1j * np.angle(_stft(y))) 89 | y = _istft(S_complex * angles) 90 | return y 91 | 92 | 93 | def _griffin_lim_tensorflow(S): 94 | '''TensorFlow implementation of Griffin-Lim 95 | Based on https://github.com/Kyubyong/tensorflow-exercises/blob/master/Audio_Processing.ipynb 96 | ''' 97 | with tf.variable_scope('griffinlim'): 98 | # TensorFlow's stft and istft operate on a batch of spectrograms; create batch of size 1 99 | S = tf.expand_dims(S, 0) 100 | S_complex = tf.identity(tf.cast(S, dtype=tf.complex64)) 101 | y = _istft_tensorflow(S_complex) 102 | for i in range(hparams.griffin_lim_iters): 103 | est = _stft_tensorflow(y) 104 | angles = est / tf.cast(tf.maximum(1e-8, tf.abs(est)), tf.complex64) 105 | y = _istft_tensorflow(S_complex * angles) 106 | return tf.squeeze(y, 0) 107 | 108 | 109 | def _stft(y): 110 | n_fft, hop_length, win_length = _stft_parameters() 111 | return librosa.stft(y=y, n_fft=n_fft, hop_length=hop_length, win_length=win_length) 112 | 113 | 114 | def _istft(y): 115 | _, hop_length, win_length = _stft_parameters() 116 | return librosa.istft(y, hop_length=hop_length, win_length=win_length) 117 | 118 | 119 | def _stft_tensorflow(signals): 120 | n_fft, hop_length, win_length = _stft_parameters() 121 | return tf.contrib.signal.stft(signals, win_length, hop_length, n_fft, 
pad_end=False) 122 | 123 | 124 | def _istft_tensorflow(stfts): 125 | n_fft, hop_length, win_length = _stft_parameters() 126 | return tf.contrib.signal.inverse_stft(stfts, win_length, hop_length, n_fft) 127 | 128 | 129 | def _stft_parameters(): 130 | n_fft = (hparams.num_freq - 1) * 2 131 | hop_length = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate) 132 | win_length = int(hparams.frame_length_ms / 1000 * hparams.sample_rate) 133 | return n_fft, hop_length, win_length 134 | 135 | 136 | # Conversions: 137 | 138 | _mel_basis = None 139 | 140 | def _linear_to_mel(spectrogram): 141 | global _mel_basis 142 | if _mel_basis is None: 143 | _mel_basis = _build_mel_basis() 144 | return np.dot(_mel_basis, spectrogram) 145 | 146 | def _build_mel_basis(): 147 | n_fft = (hparams.num_freq - 1) * 2 148 | assert hparams.fmax < hparams.sample_rate // 2 149 | return librosa.filters.mel(hparams.sample_rate, n_fft, n_mels=hparams.num_mels, fmin=hparams.fmin, fmax=hparams.fmax) 150 | 151 | def _amp_to_db(x): 152 | return 20 * np.log10(np.maximum(1e-5, x)) 153 | 154 | def _db_to_amp(x): 155 | return np.power(10.0, x * 0.05) 156 | 157 | def _db_to_amp_tensorflow(x): 158 | return tf.pow(tf.ones(tf.shape(x)) * 10.0, x * 0.05) 159 | 160 | def _normalize(S): 161 | # symmetric mels 162 | return 2 * hparams.max_abs_value * ((S - hparams.min_level_db) / -hparams.min_level_db) - hparams.max_abs_value 163 | 164 | def _denormalize(S): 165 | # symmetric mels 166 | return ((S + hparams.max_abs_value) * -hparams.min_level_db) / (2 * hparams.max_abs_value) + hparams.min_level_db 167 | 168 | def _denormalize_tensorflow(S): 169 | # symmetric mels 170 | return ((S + hparams.max_abs_value) * -hparams.min_level_db) / (2 * hparams.max_abs_value) + hparams.min_level_db 171 | -------------------------------------------------------------------------------- /util/txt2pinyin.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: UTF-8 -*- 2 | from __future__ import unicode_literals 3 | import sys 4 | import re 5 | from pypinyin import pinyin, Style, load_phrases_dict 6 | import jieba 7 | 8 | consonant_list = ['b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 9 | 'h', 'j', 'q', 'x', 'zh', 'ch', 'sh', 'r', 'z', 10 | 'c', 's', 'y', 'w'] 11 | 12 | TRANSFORM_DICT = {'ju':'jv', 'qu':'qv', 'xu':'xv', 'zi':'zic', 13 | 'ci':'cic', 'si':'sic', 'zhi':'zhih', 14 | 'chi':'chih', 'shi':'shih', 'ri':'rih', 15 | 'yuan':'yvan', 'yue':'yve', 'yun':'yvn', 16 | 'quan':'qvan','xuan':'xvan','juan':'jvan', 17 | 'qun':'qvn','xun':'xvn', 'jun':'jvn', 18 | 'iu':'iou', 'ui':'uei', 'un':'uen', 19 | 'ya':'yia', 'ye':'yie', 'yao':'yiao', 20 | 'you':'yiou', 'yan':'yian', 'yin':'yin', 21 | 'yang':'yiang', 'ying':'ying', 'yong':'yiong', 22 | 'wa':'wua', 'wo':'wuo', 'wai':'wuai', 23 | 'wei':'wuei', 'wan':'wuan', 'wen':'wuen', 24 | 'weng':'wueng', 'wang':'wuang'} 25 | 26 | translate_dict = {'ju':'jv', 'qu':'qv', 'xu':'xv', 'zi':'zic', 27 | 'ci':'cic', 'si':'sic', 'zhi':'zhih', 28 | 'chi':'chih', 'shi':'shih', 'ri':'rih', 29 | 'yuan':'yvan', 'yue':'yve', 'yun':'yvn', 30 | 'quan':'qvan','xuan':'xvan','juan':'jvan', 31 | 'qun':'qvn','xun':'xvn', 'jun':'jvn', 32 | 'iu':'iou', 'ui':'uei', 'un':'uen'} 33 | # phone-set with y w, this is the default phone set 34 | translate_dict_more = {'ya':'yia', 'ye':'yie', 'yao':'yiao', 35 | 'you':'yiou', 'yan':'yian', 'yin':'yin', 36 | 'yang':'yiang', 'ying':'ying', 'yong':'yiong', 37 | 'wa':'wua', 'wo':'wuo', 'wai':'wuai', 38 | 'wei':'wuei', 'wan':'wuan', 'wen':'wuen', 39 | 
'weng':'wueng', 'wang':'wuang'} 40 | # phone-set without y w 41 | translate_dict_less = {'ya':'ia', 'ye':'ie', 'yao':'iao', 42 | 'you':'iou', 'yan':'ian', 'yin':'in', 43 | 'yang':'iang', 'ying':'ing', 'yong':'iong', 44 | 'yvan':'van', 'yve':'ve', 'yvn':'vn', 45 | 'wa':'ua', 'wo':'uo', 'wai':'uai', 46 | 'wei':'uei', 'wan':'uan', 'wen':'uen', 47 | 'weng':'ueng', 'wang':'uang'} 48 | 49 | def _pre_pinyin_setting(): 50 | ''' fix pinyin error''' 51 | load_phrases_dict({'嗯':[['ēn']]}) 52 | 53 | _pre_pinyin_setting() 54 | 55 | def pinyinformat(syllable): 56 | '''format pinyin to mtts's format''' 57 | if not syllable[-1].isdigit(): 58 | syllable = syllable + '5' 59 | assert syllable[-1].isdigit() 60 | syl_no_tone = syllable[:-1] 61 | if syl_no_tone in TRANSFORM_DICT: 62 | syllable = syllable.replace(syl_no_tone, TRANSFORM_DICT[syl_no_tone]) 63 | return syllable 64 | 65 | """ 66 | for key, value in translate_dict.items(): 67 | syllable = syllable.replace(key, value) 68 | for key, value in translate_dict_more.items(): 69 | syllable = syllable.replace(key, value) 70 | if not syllable[-1].isdigit(): 71 | syllable = syllable + '5' 72 | return syllable 73 | """ 74 | def seprate_syllable(syllable): 75 | '''seprate syllable to consonant + ' ' + vowel ''' 76 | assert syllable[-1].isdigit() 77 | if syllable[0:2] in consonant_list: 78 | #return syllable[0:2].encode('utf-8'),syllable[2:].encode('utf-8') 79 | return syllable[0:2], syllable[2:] 80 | elif syllable[0] in consonant_list: 81 | #return syllable[0].encode('utf-8'),syllable[1:].encode('utf-8') 82 | return syllable[0], syllable[1:] 83 | else: 84 | #return (syllable.encode('utf-8'),) 85 | return (syllable,) 86 | 87 | 88 | def txt2pinyin(txt): 89 | phone_list = [] 90 | ''' 91 | if isinstance(txt, str): 92 | pinyin_list = pinyin(unicode(txt,'utf-8'), style = Style.TONE3) 93 | elif isinstance(txt, unicode): 94 | pinyin_list = pinyin(txt, style = Style.TONE3) 95 | else: 96 | print('error: unsupport coding form') 97 | ''' 98 | 99 | pinyin_list = pinyin(txt, style = Style.TONE3) 100 | for item in pinyin_list: 101 | phone_list.append(seprate_syllable(pinyinformat(item[0]))) 102 | return phone_list 103 | 104 | """ 105 | objective: 去除句子中的标点符号 106 | input: 107 | text:输入有标点符号的句子。例如:"想做/ 兼_职/学生_/ 的 、加,我Q: 1 5. 8 0. !!?? 8 6 。0. 2。 3 有,惊,喜,哦" 108 | output: 转换为去除标点顾浩的字符串。例如:"想做兼职学生的加我Q:158086023有惊喜哦" 109 | status: done 110 | author: changshu 111 | """ 112 | def removal_punctuation(text): 113 | # text = "想做/ 兼_职/学生_/ 的 、加,我Q: 1 5. 8 0. !!?? 8 6 。0. 
2。 3 有,惊,喜,哦" 114 | # temp = temp.encode() 115 | string = re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*()]+","", text) 116 | # print(string) 117 | return string 118 | 119 | """ 120 | objective: 将文字转化为拼音加韵律的的形式 121 | input: 122 | text:输入的中文文本 123 | output: 转换为拼音加音律的字符串。例如:xiao3 ming2 shuo4 shi4 124 | status: done 125 | author: changshu 126 | """ 127 | def text_to_pinyin(text): 128 | text=removal_punctuation(text) 129 | # print("text:",text) 130 | # seg_list = jieba.cut(txt, cut_all=True) # 会切出重复的部分 131 | # print("Full Mode: " + " ".join(seg_list)) # 全模式 132 | # print("Full Mode: " + " ".join(seg_list)) # 全模式 133 | seg_list = jieba.cut(text, cut_all=False) # 无重复的部分 134 | # print("Default Mode: " + " ".join(seg_list)) # 精确模式 135 | seg_list = " ".join(seg_list) 136 | result = pinyin(seg_list, style=Style.TONE3) 137 | result = [i for lst in result for i in lst] 138 | # print("result的结果",result) 139 | pinyin_str = [x.strip() for x in result] 140 | # print("x的结果", pinyin_str) 141 | pinyin_str = ' '.join(pinyin_str) 142 | r = '[’!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~”“。!,、…—~﹏¥]+' 143 | pinyin_str = re.sub(r, '', pinyin_str) 144 | return pinyin_str 145 | 146 | if __name__ == '__main__': 147 | # txt='你好看啊' 148 | # txt='中华人民共和国论居然' 149 | txt='小明硕士毕业于中国科学院计算所,后在日本京都大学深造' 150 | # print(txt2pinyin(txt)) 151 | print(text_to_pinyin(txt)) 152 | 153 | 154 | 155 | ''' 156 | 用法举例 157 | print(txt2pinyin('中华人民共和国论居然')) 158 | ['zh ong1', 'h ua2', 'r en2', 'm in2', 'g ong4', 'h e2', 'g uo2', 'l uen4', 'j 159 | v1', 'r an2'] 160 | ''' 161 | ''' 162 | seg_list = jieba.cut("我来到北京清华大学", cut_all=True) 163 | print("Full Mode: " + "/ ".join(seg_list)) # 全模式 164 | 165 | seg_list = jieba.cut("我来到北京清华大学", cut_all=False) 166 | print("Default Mode: " + "/ ".join(seg_list)) # 精确模式 167 | 168 | seg_list = jieba.cut("他来到了网易杭研大厦") # 默认是精确模式 169 | print(", ".join(seg_list)) 170 | 171 | seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式 172 | print(", ".join(seg_list)) 173 | ''' 174 | 175 | -------------------------------------------------------------------------------- /datasets/datafeeder.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import random 4 | import tensorflow as tf 5 | import threading 6 | import time 7 | import traceback 8 | from hparams import hparams 9 | from text import cmudict, text_to_sequence 10 | from util.infolog import log 11 | 12 | 13 | _batches_per_group = 32 14 | _p_cmudict = 0.5 15 | _pad = 0 16 | _stop_token_pad = 1 17 | 18 | 19 | class DataFeeder(threading.Thread): 20 | '''Feeds batches of data into a queue on a background thread.''' 21 | 22 | def __init__(self, coordinator, metadata_filename, hparams): 23 | super(DataFeeder, self).__init__() 24 | self._coord = coordinator 25 | self._hparams = hparams 26 | self._cleaner_names = [x.strip() for x in hparams.cleaners.split(',')] 27 | self._offset = 0 28 | 29 | # Load metadata: 30 | self._datadir = os.path.dirname(metadata_filename) 31 | with open(metadata_filename, encoding='utf-8') as f: 32 | self._metadata = [line.strip().split('|') for line in f] 33 | hours = sum((int(x[2]) for x in self._metadata)) * hparams.frame_shift_ms / (3600 * 1000) 34 | log('Loaded metadata for %d examples (%.2f hours)' % (len(self._metadata), hours)) 35 | 36 | # Create placeholders for inputs and targets. Don't specify batch size because we want to 37 | # be able to feed different sized batches at eval time. 
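    # Note: the order of these placeholders must match the tuple returned by _prepare_batch
    # below, since the feed_dict is built by zipping the two together.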
38 | self._placeholders = [ 39 | tf.placeholder(tf.int32, [None, None], 'inputs'), 40 | tf.placeholder(tf.int32, [None], 'input_lengths'), 41 | tf.placeholder(tf.float32, [None, None, hparams.num_mels], 'mel_targets'), 42 | tf.placeholder(tf.float32, [None, None, hparams.num_freq], 'linear_targets'), 43 | tf.placeholder(tf.float32, [None, None], 'stop_token_targets') 44 | ] 45 | 46 | # Create queue for buffering data: 47 | queue = tf.FIFOQueue(8, [tf.int32, tf.int32, tf.float32, tf.float32, tf.float32], name='input_queue') 48 | self._enqueue_op = queue.enqueue(self._placeholders) 49 | self.inputs, self.input_lengths, self.mel_targets, self.linear_targets, self.stop_token_targets = queue.dequeue() 50 | self.inputs.set_shape(self._placeholders[0].shape) 51 | self.input_lengths.set_shape(self._placeholders[1].shape) 52 | self.mel_targets.set_shape(self._placeholders[2].shape) 53 | self.linear_targets.set_shape(self._placeholders[3].shape) 54 | self.stop_token_targets.set_shape(self._placeholders[4].shape) 55 | 56 | # Load CMUDict: If enabled, this will randomly substitute some words in the training data with 57 | # their ARPABet equivalents, which will allow you to also pass ARPABet to the model for 58 | # synthesis (useful for proper nouns, etc.) 59 | if hparams.use_cmudict: 60 | cmudict_path = os.path.join(self._datadir, 'cmudict-0.7b') 61 | if not os.path.isfile(cmudict_path): 62 | raise Exception('If use_cmudict=True, you must download ' + 63 | 'http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b to %s' % cmudict_path) 64 | self._cmudict = cmudict.CMUDict(cmudict_path, keep_ambiguous=False) 65 | log('Loaded CMUDict with %d unambiguous entries' % len(self._cmudict)) 66 | else: 67 | self._cmudict = None 68 | 69 | 70 | def start_in_session(self, session): 71 | self._session = session 72 | self.start() 73 | 74 | 75 | def run(self): 76 | try: 77 | while not self._coord.should_stop(): 78 | self._enqueue_next_group() 79 | except Exception as e: 80 | traceback.print_exc() 81 | self._coord.request_stop(e) 82 | 83 | 84 | def _enqueue_next_group(self): 85 | start = time.time() 86 | 87 | # Read a group of examples: 88 | n = self._hparams.batch_size 89 | r = self._hparams.outputs_per_step 90 | examples = [self._get_next_example() for i in range(n * _batches_per_group)] 91 | 92 | # Bucket examples based on similar output sequence length for efficiency: 93 | examples.sort(key=lambda x: x[-1]) 94 | batches = [examples[i:i+n] for i in range(0, len(examples), n)] 95 | random.shuffle(batches) 96 | 97 | log('Generated %d batches of size %d in %.03f sec' % (len(batches), n, time.time() - start)) 98 | for batch in batches: 99 | feed_dict = dict(zip(self._placeholders, _prepare_batch(batch, r))) 100 | self._session.run(self._enqueue_op, feed_dict=feed_dict) 101 | 102 | 103 | def _get_next_example(self): 104 | '''Loads a single example (input, mel_target, linear_target, stop_token_target) from disk''' 105 | if self._offset >= len(self._metadata): 106 | self._offset = 0 107 | random.shuffle(self._metadata) 108 | meta = self._metadata[self._offset] 109 | self._offset += 1 110 | 111 | text = meta[3] 112 | if self._cmudict and random.random() < _p_cmudict: 113 | text = ' '.join([self._maybe_get_arpabet(word) for word in text.split(' ')]) 114 | 115 | input_data = np.asarray(text_to_sequence(text, self._cleaner_names), dtype=np.int32) 116 | linear_target = np.load(os.path.join(self._datadir, meta[0])) 117 | mel_target = np.load(os.path.join(self._datadir, meta[1])) 118 | stop_token_target = 
np.asarray([0.] * len(mel_target)) 119 | return (input_data, mel_target, linear_target, stop_token_target, len(linear_target)) 120 | 121 | 122 | def _maybe_get_arpabet(self, word): 123 | arpabet = self._cmudict.lookup(word) 124 | return '{%s}' % arpabet[0] if arpabet is not None and random.random() < 0.5 else word 125 | 126 | 127 | def _prepare_batch(batch, outputs_per_step): 128 | random.shuffle(batch) 129 | inputs = _prepare_inputs([x[0] for x in batch]) 130 | input_lengths = np.asarray([len(x[0]) for x in batch], dtype=np.int32) 131 | mel_targets = _prepare_targets([x[1] for x in batch], outputs_per_step) 132 | linear_targets = _prepare_targets([x[2] for x in batch], outputs_per_step) 133 | stop_token_targets = _prepare_stop_token_targets([x[3] for x in batch], outputs_per_step) 134 | return (inputs, input_lengths, mel_targets, linear_targets, stop_token_targets) 135 | 136 | 137 | def _prepare_inputs(inputs): 138 | max_len = max((len(x) for x in inputs)) 139 | return np.stack([_pad_input(x, max_len) for x in inputs]) 140 | 141 | 142 | def _prepare_targets(targets, alignment): 143 | max_len = max((len(t) for t in targets)) + 1 144 | return np.stack([_pad_target(t, _round_up(max_len, alignment)) for t in targets]) 145 | 146 | 147 | def _prepare_stop_token_targets(targets, alignment): 148 | max_len = max((len(t) for t in targets)) + 1 149 | return np.stack([_pad_stop_token_target(t, _round_up(max_len, alignment)) for t in targets]) 150 | 151 | 152 | def _pad_input(x, length): 153 | return np.pad(x, (0, length - x.shape[0]), mode='constant', constant_values=_pad) 154 | 155 | 156 | def _pad_target(t, length): 157 | return np.pad(t, [(0, length - t.shape[0]), (0,0)], mode='constant', constant_values=_pad) 158 | 159 | 160 | def _pad_stop_token_target(t, length): 161 | return np.pad(t, (0, length - t.shape[0]), mode='constant', constant_values=_stop_token_pad) 162 | 163 | 164 | def _round_up(x, multiple): 165 | remainder = x % multiple 166 | return x if remainder == 0 else x + multiple - remainder 167 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from datetime import datetime 3 | import math 4 | import numpy as np 5 | import os 6 | import subprocess 7 | import time 8 | import tensorflow as tf 9 | import traceback 10 | 11 | from datasets.datafeeder import DataFeeder 12 | from hparams import hparams, hparams_debug_string 13 | from models import create_model 14 | from text import sequence_to_text 15 | from util import audio, infolog, plot, ValueWindow 16 | log = infolog.log 17 | 18 | 19 | def get_git_commit(): 20 | subprocess.check_output(['git', 'diff-index', '--quiet', 'HEAD']) # Verify client is clean 21 | commit = subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode().strip()[:10] 22 | log('Git commit: %s' % commit) 23 | return commit 24 | 25 | 26 | def add_stats(model): 27 | with tf.variable_scope('stats') as scope: 28 | tf.summary.histogram('linear_outputs', model.linear_outputs) 29 | tf.summary.histogram('linear_targets', model.linear_targets) 30 | tf.summary.histogram('mel_outputs', model.mel_outputs) 31 | tf.summary.histogram('mel_targets', model.mel_targets) 32 | tf.summary.scalar('loss_mel', model.mel_loss) 33 | tf.summary.scalar('loss_linear', model.linear_loss) 34 | tf.summary.scalar('regularization_loss', model.regularization_loss) 35 | tf.summary.scalar('stop_token_loss', model.stop_token_loss) 36 | 
tf.summary.scalar('learning_rate', model.learning_rate) 37 | tf.summary.scalar('loss', model.loss) 38 | gradient_norms = [tf.norm(grad) for grad in model.gradients] 39 | tf.summary.histogram('gradient_norm', gradient_norms) 40 | tf.summary.scalar('max_gradient_norm', tf.reduce_max(gradient_norms)) 41 | return tf.summary.merge_all() 42 | 43 | 44 | def time_string(): 45 | return datetime.now().strftime('%Y-%m-%d %H:%M') 46 | 47 | 48 | def train(log_dir, args): 49 | commit = get_git_commit() if args.git else 'None' 50 | checkpoint_path = os.path.join(log_dir, 'model.ckpt') 51 | input_path = os.path.join(args.base_dir, args.input) 52 | log('Checkpoint path: %s' % checkpoint_path) 53 | log('Loading training data from: %s' % input_path) 54 | log('Using model: %s' % args.model) 55 | log(hparams_debug_string()) 56 | 57 | # Set up DataFeeder: 58 | coord = tf.train.Coordinator() 59 | with tf.variable_scope('datafeeder') as scope: 60 | feeder = DataFeeder(coord, input_path, hparams) 61 | 62 | # Set up model: 63 | global_step = tf.Variable(0, name='global_step', trainable=False) 64 | with tf.variable_scope('model') as scope: 65 | model = create_model(args.model, hparams) 66 | model.initialize(feeder.inputs, feeder.input_lengths, feeder.mel_targets, feeder.linear_targets, feeder.stop_token_targets, global_step) 67 | model.add_loss() 68 | model.add_optimizer(global_step) 69 | stats = add_stats(model) 70 | 71 | # Bookkeeping: 72 | step = 0 73 | time_window = ValueWindow(100) 74 | loss_window = ValueWindow(100) 75 | saver = tf.train.Saver(max_to_keep=1) 76 | 77 | # Train! 78 | with tf.Session() as sess: 79 | try: 80 | summary_writer = tf.summary.FileWriter(log_dir, sess.graph) 81 | sess.run(tf.global_variables_initializer()) 82 | 83 | if args.restore_step: 84 | # Restore from a checkpoint if the user requested it. 85 | checkpoint_state = tf.train.get_checkpoint_state(log_dir) 86 | restore_path = '%s-%d' % (checkpoint_path, args.restore_step) 87 | if checkpoint_state is not None: 88 | saver.restore(sess, checkpoint_state.model_checkpoint_path) 89 | log('Resuming from checkpoint: %s at commit: %s' % (checkpoint_state.model_checkpoint_path, commit), slack=True) 90 | else: 91 | log('Starting new training run at commit: %s' % commit, slack=True) 92 | 93 | feeder.start_in_session(sess) 94 | 95 | while not coord.should_stop(): 96 | start_time = time.time() 97 | step, loss, opt = sess.run([global_step, model.loss, model.optimize]) 98 | time_window.append(time.time() - start_time) 99 | loss_window.append(loss) 100 | message = 'Step %-7d [%.03f sec/step, loss=%.05f, avg_loss=%.05f]' % ( 101 | step, time_window.average, loss, loss_window.average) 102 | log(message, slack=(step % args.checkpoint_interval == 0)) 103 | 104 | if loss > 100 or math.isnan(loss): 105 | log('Loss exploded to %.05f at step %d!' 
% (loss, step), slack=True) 106 | raise Exception('Loss Exploded') 107 | 108 | if step % args.summary_interval == 0: 109 | log('Writing summary at step: %d' % step) 110 | summary_writer.add_summary(sess.run(stats), step) 111 | 112 | if step % args.checkpoint_interval == 0: 113 | log('Saving checkpoint to: %s-%d' % (checkpoint_path, step)) 114 | saver.save(sess, checkpoint_path, global_step=step) 115 | log('Saving audio and alignment...') 116 | input_seq, spectrogram, alignment = sess.run([ 117 | model.inputs[0], model.linear_outputs[0], model.alignments[0]]) 118 | waveform = audio.inv_spectrogram(spectrogram.T) 119 | audio.save_wav(waveform, os.path.join(log_dir, 'step-%d-audio.wav' % step)) 120 | plot.plot_alignment(alignment, os.path.join(log_dir, 'step-%d-align.png' % step), 121 | info='%s, %s, %s, step=%d, loss=%.5f' % (args.model, commit, time_string(), step, loss)) 122 | log('Input: %s' % sequence_to_text(input_seq)) 123 | 124 | except Exception as e: 125 | log('Exiting due to exception: %s' % e, slack=True) 126 | traceback.print_exc() 127 | coord.request_stop(e) 128 | 129 | 130 | def main(): 131 | parser = argparse.ArgumentParser() 132 | parser.add_argument('--base_dir', default=os.path.expanduser('.')) 133 | parser.add_argument('--input', default='training/train.txt') 134 | parser.add_argument('--model', default='tacotron') 135 | parser.add_argument('--name', help='Name of the run. Used for logging. Defaults to model name.') 136 | parser.add_argument('--hparams', default='', 137 | help='Hyperparameter overrides as a comma-separated list of name=value pairs') 138 | parser.add_argument('--restore_step', type=bool, default=True, help='Global step to restore from checkpoint.') 139 | parser.add_argument('--summary_interval', type=int, default=100, 140 | help='Steps between running summary ops.') 141 | parser.add_argument('--checkpoint_interval', type=int, default=1000, 142 | help='Steps between writing checkpoints.') 143 | parser.add_argument('--slack_url', help='Slack webhook URL to get periodic reports.') 144 | parser.add_argument('--tf_log_level', type=int, default=1, help='Tensorflow C++ log level.') 145 | parser.add_argument('--git', action='store_true', help='If set, verify that the client is clean.') 146 | args = parser.parse_args() 147 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = str(args.tf_log_level) 148 | os.environ['CUDA_VISIBLE_DEVICES'] = '0' 149 | run_name = args.name or args.model 150 | log_dir = os.path.join(args.base_dir, 'logs-%s' % run_name) 151 | os.makedirs(log_dir, exist_ok=True) 152 | infolog.init(os.path.join(log_dir, 'train.log'), run_name, args.slack_url) 153 | hparams.parse(args.hparams) 154 | train(log_dir, args) 155 | 156 | 157 | if __name__ == '__main__': 158 | main() 159 | 160 | 161 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Tacotron 2 | 3 | An implementation of Tacotron speech synthesis in TensorFlow. 4 | 5 | 6 | ### Audio Samples 7 | 8 | * **[Audio Samples](https://keithito.github.io/audio-samples/)** from models trained using this repo. 9 | * The first set was trained for 877K steps on the [LJ Speech Dataset](https://keithito.com/LJ-Speech-Dataset/) 10 | * Speech started to become intelligble around 20K steps. 11 | * Although loss continued to decrease, there wasn't much noticable improvement after ~250K steps. 
12 | * The second set was trained by [@MXGray](https://github.com/MXGray) for 140K steps on the [Nancy Corpus](http://www.cstr.ed.ac.uk/projects/blizzard/2011/lessac_blizzard2011/). 13 | 14 | 15 | 16 | ## Background 17 | 18 | In April 2017, Google published a paper, [Tacotron: Towards End-to-End Speech Synthesis](https://arxiv.org/pdf/1703.10135.pdf), 19 | where they present a neural text-to-speech model that learns to synthesize speech directly from 20 | (text, audio) pairs. However, they didn't release their source code or training data. This is an 21 | independent attempt to provide an open-source implementation of the model described in their paper. 22 | 23 | The quality isn't as good as Google's demo yet, but hopefully it will get there someday :-). 24 | Pull requests are welcome! 25 | 26 | 27 | 28 | ## Quick Start 29 | 30 | ### Installing dependencies 31 | 32 | 1. Install Python 3. 33 | 34 | 2. Install the latest version of [TensorFlow](https://www.tensorflow.org/install/) for your platform. For better 35 | performance, install with GPU support if it's available. This code works with TensorFlow 1.3 and later. 36 | 37 | 3. Install requirements: 38 | ``` 39 | pip install -r requirements.txt 40 | ``` 41 | 42 | 43 | ### Using a pre-trained model 44 | 45 | 1. **Download and unpack a model**: 46 | ``` 47 | curl http://data.keithito.com/data/speech/tacotron-20170720.tar.bz2 | tar xjC /tmp 48 | ``` 49 | 50 | 2. **Run the demo server**: 51 | ``` 52 | python3 demo_server.py --checkpoint /tmp/tacotron-20170720/model.ckpt 53 | ``` 54 | 55 | 3. **Point your browser at localhost:9000** 56 | * Type what you want to synthesize 57 | 58 | 59 | 60 | ### Training 61 | 62 | *Note: you need at least 40GB of free disk space to train a model.* 63 | 64 | 1. **Download a speech dataset.** 65 | 66 | The following are supported out of the box: 67 | * [LJ Speech](https://keithito.com/LJ-Speech-Dataset/) (Public Domain) 68 | * [Blizzard 2012](http://www.cstr.ed.ac.uk/projects/blizzard/2012/phase_one) (Creative Commons Attribution Share-Alike) 69 | 70 | You can use other datasets if you convert them to the right format. See [TRAINING_DATA.md](TRAINING_DATA.md) for more info. 71 | 72 | 73 | 2. **Unpack the dataset into `~/tacotron`** 74 | 75 | After unpacking, your tree should look like this for LJ Speech: 76 | ``` 77 | tacotron 78 | |- LJSpeech-1.1 79 | |- metadata.csv 80 | |- wavs 81 | ``` 82 | 83 | or like this for Blizzard 2012: 84 | ``` 85 | tacotron 86 | |- Blizzard2012 87 | |- ATrampAbroad 88 | | |- sentence_index.txt 89 | | |- lab 90 | | |- wav 91 | |- TheManThatCorruptedHadleyburg 92 | |- sentence_index.txt 93 | |- lab 94 | |- wav 95 | ``` 96 | 97 | 3. **Preprocess the data** 98 | ``` 99 | python3 preprocess.py --dataset ljspeech 100 | ``` 101 | * Use `--dataset blizzard` for Blizzard data 102 | 103 | 4. **Train a model** 104 | ``` 105 | python3 train.py 106 | ``` 107 | 108 | Tunable hyperparameters are found in [hparams.py](hparams.py). You can adjust these at the command 109 | line using the `--hparams` flag, for example `--hparams="batch_size=16,outputs_per_step=2"`. 110 | Hyperparameters should generally be set to the same values at both training and eval time. 111 | The default hyperparameters are recommended for LJ Speech and other English-language data. 112 | See [TRAINING_DATA.md](TRAINING_DATA.md) for other languages. 113 | 114 | 115 | 5. 
**Monitor with Tensorboard** (optional) 116 | ``` 117 | tensorboard --logdir ~/tacotron/logs-tacotron 118 | ``` 119 | 120 | The trainer dumps audio and alignments every 1000 steps. You can find these in 121 | `~/tacotron/logs-tacotron`. 122 | 123 | 6. **Synthesize from a checkpoint** 124 | ``` 125 | python3 demo_server.py --checkpoint ~/tacotron/logs-tacotron/model.ckpt-185000 126 | ``` 127 | Replace "185000" with the checkpoint number that you want to use, then open a browser 128 | to `localhost:9000` and type what you want to speak. Alternately, you can 129 | run [eval.py](eval.py) at the command line: 130 | ``` 131 | python3 eval.py --checkpoint ~/tacotron/logs-tacotron/model.ckpt-185000 132 | ``` 133 | If you set the `--hparams` flag when training, set the same value here. 134 | 135 | 136 | ## Notes and Common Issues 137 | 138 | * [TCMalloc](http://goog-perftools.sourceforge.net/doc/tcmalloc.html) seems to improve 139 | training speed and avoids occasional slowdowns seen with the default allocator. You 140 | can enable it by installing it and setting `LD_PRELOAD=/usr/lib/libtcmalloc.so`. With TCMalloc, 141 | you can get around 1.1 sec/step on a GTX 1080Ti. 142 | 143 | * You can train with [CMUDict](http://www.speech.cs.cmu.edu/cgi-bin/cmudict) by downloading the 144 | dictionary to ~/tacotron/training and then passing the flag `--hparams="use_cmudict=True"` to 145 | train.py. This will allow you to pass ARPAbet phonemes enclosed in curly braces at eval 146 | time to force a particular pronunciation, e.g. `Turn left on {HH AW1 S S T AH0 N} Street.` 147 | 148 | * If you pass a Slack incoming webhook URL as the `--slack_url` flag to train.py, it will send 149 | you progress updates every 1000 steps. 150 | 151 | * Occasionally, you may see a spike in loss and the model will forget how to attend (the 152 | alignments will no longer make sense). Although it will recover eventually, it may 153 | save time to restart at a checkpoint prior to the spike by passing the 154 | `--restore_step=150000` flag to train.py (replacing 150000 with a step number prior to the 155 | spike). **Update**: a recent [fix](https://github.com/keithito/tacotron/pull/7) to gradient 156 | clipping by @candlewill may have fixed this. 157 | 158 | * During eval and training, audio length is limited to `max_iters * outputs_per_step * frame_shift_ms` 159 | milliseconds. With the defaults (max_iters=200, outputs_per_step=5, frame_shift_ms=12.5), this is 160 | 12.5 seconds. 161 | 162 | If your training examples are longer, you will see an error like this: 163 | `Incompatible shapes: [32,1340,80] vs. [32,1000,80]` 164 | 165 | To fix this, you can set a larger value of `max_iters` by passing `--hparams="max_iters=300"` to 166 | train.py (replace "300" with a value based on how long your audio is and the formula above). 167 | 168 | * Here is the expected loss curve when training on LJ Speech with the default hyperparameters: 169 | ![Loss curve](https://user-images.githubusercontent.com/1945356/36077599-c0513e4a-0f21-11e8-8525-07347847720c.png) 170 | 171 | 172 | ## Other Implementations 173 | * By Alex Barron: https://github.com/barronalex/Tacotron 174 | * By Kyubyong Park: https://github.com/Kyubyong/tacotron 175 | 176 | 177 | 178 |
179 | 180 |
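## Synthesizing over HTTP

The demo server's `/synthesize` endpoint (see [demo_server.py](demo_server.py)) accepts a `text` query parameter and returns WAV audio, so you can also synthesize without a browser. A minimal example, assuming the server is running with the default `--port 9000`; in this Mandarin fork the input text is converted to pinyin internally before synthesis:
```
curl -G http://localhost:9000/synthesize --data-urlencode "text=你好,世界" -o output.wav
```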
-------------------------------------------------------------------------------- /models/tacotron.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.contrib.rnn import GRUCell, MultiRNNCell, OutputProjectionWrapper, ResidualWrapper 3 | from tensorflow.contrib.seq2seq import BasicDecoder 4 | from text.symbols import symbols 5 | from util.infolog import log 6 | from .helpers import TacoTestHelper, TacoTrainingHelper 7 | from .modules import encoder_cbhg, post_cbhg, prenet 8 | from .rnn_wrappers import FrameProjection, StopProjection, TacotronDecoderWrapper 9 | from .attention import LocationSensitiveAttention 10 | from .custom_decoder import CustomDecoder 11 | 12 | 13 | class Tacotron(): 14 | def __init__(self, hparams): 15 | self._hparams = hparams 16 | 17 | 18 | def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None, stop_token_targets=None, global_step=None): 19 | '''Initializes the model for inference. 20 | 21 | Sets "mel_outputs", "linear_outputs", and "alignments" fields. 22 | 23 | Args: 24 | inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of 25 | steps in the input time series, and values are character IDs 26 | input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths 27 | of each sequence in inputs. 28 | mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number 29 | of steps in the output time series, M is num_mels, and values are entries in the mel 30 | spectrogram. Only needed for training. 31 | linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number 32 | of steps in the output time series, F is num_freq, and values are entries in the linear 33 | spectrogram. Only needed for training. 
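      stop_token_targets: float32 Tensor with shape [N, T_out] where N is batch size and T_out is
        number of steps in the output time series; entries are 0 for real frames and 1 for padding
        frames past the end of the utterance. Only needed for training.
      global_step: int32 scalar Tensor holding the current training step; used to schedule the
        teacher forcing ratio decay in the training helper. Only needed for training.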
34 | ''' 35 | with tf.variable_scope('inference') as scope: 36 | is_training = linear_targets is not None 37 | batch_size = tf.shape(inputs)[0] 38 | hp = self._hparams 39 | 40 | # Embeddings 41 | embedding_table = tf.get_variable( 42 | 'embedding', [len(symbols), hp.embed_depth], dtype=tf.float32, 43 | initializer=tf.truncated_normal_initializer(stddev=0.5)) 44 | embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs) # [N, T_in, embed_depth=256] 45 | 46 | # Encoder 47 | prenet_outputs = prenet(embedded_inputs, is_training, hp.prenet_depths) # [N, T_in, prenet_depths[-1]=128] 48 | encoder_outputs = encoder_cbhg(prenet_outputs, input_lengths, is_training, hp.encoder_depth) # [N, T_in, encoder_depth=256] 49 | 50 | # Location sensitive attention 51 | attention_mechanism = LocationSensitiveAttention(hp.attention_depth, encoder_outputs) # [N, T_in, attention_depth=256] 52 | 53 | # Decoder (layers specified bottom to top): 54 | multi_rnn_cell = MultiRNNCell([ 55 | ResidualWrapper(GRUCell(hp.decoder_depth)), 56 | ResidualWrapper(GRUCell(hp.decoder_depth)) 57 | ], state_is_tuple=True) # [N, T_in, decoder_depth=256] 58 | 59 | # Frames Projection layer 60 | frame_projection = FrameProjection(hp.num_mels * hp.outputs_per_step) # [N, T_out/r, M*r] 61 | 62 | # projection layer 63 | stop_projection = StopProjection(is_training, shape=hp.outputs_per_step) # [N, T_out/r, r] 64 | 65 | # Project onto r mel spectrograms (predict r outputs at each RNN step): 66 | decoder_cell = TacotronDecoderWrapper(is_training, attention_mechanism, multi_rnn_cell, 67 | frame_projection, stop_projection) 68 | 69 | if is_training: 70 | helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels, hp.outputs_per_step, global_step) 71 | else: 72 | helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step) 73 | 74 | decoder_init_state = decoder_cell.zero_state(batch_size=batch_size, dtype=tf.float32) 75 | 76 | (decoder_outputs, stop_token_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode( 77 | CustomDecoder(decoder_cell, helper, decoder_init_state), 78 | maximum_iterations=hp.max_iters) # [N, T_out/r, M*r] 79 | 80 | # Reshape outputs to be one output per entry 81 | mel_outputs = tf.reshape(decoder_outputs, [batch_size, -1, hp.num_mels]) # [N, T_out, M] 82 | stop_token_outputs = tf.reshape(stop_token_outputs, [batch_size, -1]) # [N, T_out, M] 83 | 84 | # Add post-processing CBHG: 85 | post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training, hp.postnet_depth) # [N, T_out, postnet_depth=256] 86 | linear_outputs = tf.layers.dense(post_outputs, hp.num_freq) # [N, T_out, F] 87 | 88 | # Grab alignments from the final decoder state: 89 | alignments = tf.transpose(final_decoder_state.alignment_history.stack(), [1, 2, 0]) 90 | 91 | self.inputs = inputs 92 | self.input_lengths = input_lengths 93 | self.mel_outputs = mel_outputs 94 | self.linear_outputs = linear_outputs 95 | self.stop_token_outputs = stop_token_outputs 96 | self.alignments = alignments 97 | self.mel_targets = mel_targets 98 | self.linear_targets = linear_targets 99 | self.stop_token_targets = stop_token_targets 100 | log('Initialized Tacotron model. 
Dimensions: ') 101 | log(' embedding: {}'.format(embedded_inputs.shape)) 102 | log(' prenet out: {}'.format(prenet_outputs.shape)) 103 | log(' encoder out: {}'.format(encoder_outputs.shape)) 104 | log(' decoder out (r frames): {}'.format(decoder_outputs.shape)) 105 | log(' decoder out (1 frame): {}'.format(mel_outputs.shape)) 106 | log(' postnet out: {}'.format(post_outputs.shape)) 107 | log(' linear out: {}'.format(linear_outputs.shape)) 108 | log(' stop token: {}'.format(stop_token_outputs.shape)) 109 | 110 | 111 | def add_loss(self): 112 | '''Adds loss to the model. Sets "loss" field. initialize must have been called.''' 113 | with tf.variable_scope('loss') as scope: 114 | hp = self._hparams 115 | self.mel_loss = tf.reduce_mean(tf.abs(self.mel_targets - self.mel_outputs)) 116 | self.linear_loss = tf.reduce_mean(tf.abs(self.linear_targets - self.linear_outputs)) 117 | self.stop_token_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits( 118 | labels=self.stop_token_targets, 119 | logits=self.stop_token_outputs)) 120 | 121 | # Compute the regularization weights 122 | reg_weight = 1e-6 123 | all_vars = tf.trainable_variables() 124 | self.regularization_loss = tf.add_n([tf.nn.l2_loss(v) for v in all_vars 125 | if not('bias' in v.name or 'Bias' in v.name)]) * reg_weight 126 | 127 | self.loss = self.mel_loss + self.linear_loss + self.stop_token_loss + self.regularization_loss 128 | 129 | 130 | def add_optimizer(self, global_step): 131 | '''Adds optimizer. Sets "gradients" and "optimize" fields. add_loss must have been called. 132 | 133 | Args: 134 | global_step: int32 scalar Tensor representing current global step in training 135 | ''' 136 | with tf.variable_scope('optimizer') as scope: 137 | hp = self._hparams 138 | if hp.decay_learning_rate: 139 | self.learning_rate = _learning_rate_decay(hp.initial_learning_rate, global_step) 140 | else: 141 | self.learning_rate = tf.convert_to_tensor(hp.initial_learning_rate) 142 | optimizer = tf.train.AdamOptimizer(self.learning_rate, hp.adam_beta1, hp.adam_beta2) 143 | gradients, variables = zip(*optimizer.compute_gradients(self.loss)) 144 | self.gradients = gradients 145 | clipped_gradients, _ = tf.clip_by_global_norm(gradients, 1.0) 146 | 147 | # Add dependency on UPDATE_OPS; otherwise batchnorm won't work correctly. 
See: 148 | # https://github.com/tensorflow/tensorflow/issues/1122 149 | with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)): 150 | self.optimize = optimizer.apply_gradients(zip(clipped_gradients, variables), 151 | global_step=global_step) 152 | 153 | 154 | def _learning_rate_decay(init_lr, global_step): 155 | # Noam scheme from tensor2tensor: 156 | warmup_steps = 4000.0 157 | step = tf.cast(global_step + 1, dtype=tf.float32) 158 | return init_lr * warmup_steps**0.5 * tf.minimum(step * warmup_steps**-1.5, step**-0.5) 159 | -------------------------------------------------------------------------------- /models/attention.py: -------------------------------------------------------------------------------- 1 | """Attention file for location based attention (compatible with tensorflow attention wrapper)""" 2 | 3 | import tensorflow as tf 4 | from tensorflow.contrib.seq2seq.python.ops.attention_wrapper import BahdanauAttention 5 | from tensorflow.python.layers import core as layers_core 6 | from tensorflow.python.ops import array_ops, math_ops, nn_ops, variable_scope 7 | 8 | 9 | #From https://github.com/tensorflow/tensorflow/blob/r1.7/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py 10 | def _compute_attention(attention_mechanism, cell_output, attention_state, attention_layer): 11 | """Computes the attention and alignments for a given attention_mechanism.""" 12 | alignments, next_attention_state = attention_mechanism( 13 | cell_output, state=attention_state) 14 | 15 | # Reshape from [batch_size, memory_time] to [batch_size, 1, memory_time] 16 | expanded_alignments = array_ops.expand_dims(alignments, 1) 17 | # Context is the inner product of alignments and values along the 18 | # memory time dimension. 19 | # alignments shape is 20 | # [batch_size, 1, memory_time] 21 | # attention_mechanism.values shape is 22 | # [batch_size, memory_time, memory_size] 23 | # the batched matmul is over memory_time, so the output shape is 24 | # [batch_size, 1, memory_size]. 25 | # we then squeeze out the singleton dim. 26 | context = math_ops.matmul(expanded_alignments, attention_mechanism.values) 27 | context = array_ops.squeeze(context, [1]) 28 | 29 | if attention_layer is not None: 30 | attention = attention_layer(array_ops.concat([cell_output, context], 1)) 31 | else: 32 | attention = context 33 | 34 | return attention, alignments, next_attention_state 35 | 36 | 37 | def _location_sensitive_score(W_query, W_fil, W_keys): 38 | """Impelements Bahdanau-style (cumulative) scoring function. 39 | This attention is described in: 40 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben- 41 | gio, “Attention-based models for speech recognition,” in Ad- 42 | vances in Neural Information Processing Systems, 2015, pp. 43 | 577–585. 44 | 45 | ############################################################################# 46 | hybrid attention (content-based + location-based) 47 | f = F * α_{i-1} 48 | energy = dot(v_a, tanh(W_keys(h_enc) + W_query(h_dec) + W_fil(f) + b_a)) 49 | ############################################################################# 50 | 51 | Args: 52 | W_query: Tensor, shape '[batch_size, 1, attention_dim]' to compare to location features. 53 | W_location: processed previous alignments into location features, shape '[batch_size, max_time, attention_dim]' 54 | W_keys: Tensor, shape '[batch_size, max_time, attention_dim]', typically the encoder outputs. 
55 | Returns: 56 | A '[batch_size, max_time]' attention score (energy) 57 | """ 58 | # Get the number of hidden units from the trailing dimension of keys 59 | dtype = W_query.dtype 60 | num_units = W_keys.shape[-1].value or array_ops.shape(W_keys)[-1] 61 | 62 | v_a = tf.get_variable( 63 | 'attention_variable', shape=[num_units], dtype=dtype, 64 | initializer=tf.contrib.layers.xavier_initializer()) 65 | b_a = tf.get_variable( 66 | 'attention_bias', shape=[num_units], dtype=dtype, 67 | initializer=tf.zeros_initializer()) 68 | 69 | return tf.reduce_sum(v_a * tf.tanh(W_keys + W_query + W_fil + b_a), [2]) 70 | 71 | def _smoothing_normalization(e): 72 | """Applies a smoothing normalization function instead of softmax 73 | Introduced in: 74 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben- 75 | gio, “Attention-based models for speech recognition,” in Ad- 76 | vances in Neural Information Processing Systems, 2015, pp. 77 | 577–585. 78 | 79 | ############################################################################ 80 | Smoothing normalization function 81 | a_{i, j} = sigmoid(e_{i, j}) / sum_j(sigmoid(e_{i, j})) 82 | ############################################################################ 83 | 84 | Args: 85 | e: matrix [batch_size, max_time(memory_time)]: expected to be energy (score) 86 | values of an attention mechanism 87 | Returns: 88 | matrix [batch_size, max_time]: [0, 1] normalized alignments with possible 89 | attendance to multiple memory time steps. 90 | """ 91 | return tf.nn.sigmoid(e) / tf.reduce_sum(tf.nn.sigmoid(e), axis=-1, keepdims=True) 92 | 93 | 94 | class LocationSensitiveAttention(BahdanauAttention): 95 | """Impelements Bahdanau-style (cumulative) scoring function. 96 | Usually referred to as "hybrid" attention (content-based + location-based) 97 | Extends the additive attention described in: 98 | "D. Bahdanau, K. Cho, and Y. Bengio, “Neural machine transla- 99 | tion by jointly learning to align and translate,” in Proceedings 100 | of ICLR, 2015." 101 | to use previous alignments as additional location features. 102 | 103 | This attention is described in: 104 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben- 105 | gio, “Attention-based models for speech recognition,” in Ad- 106 | vances in Neural Information Processing Systems, 2015, pp. 107 | 577–585. 108 | """ 109 | 110 | def __init__(self, 111 | num_units, 112 | memory, 113 | smoothing=False, 114 | cumulate_weights=True, 115 | name='LocationSensitiveAttention'): 116 | """Construct the Attention mechanism. 117 | Args: 118 | num_units: The depth of the query mechanism. 119 | memory: The memory to query; usually the output of an RNN encoder. This 120 | tensor should be shaped `[batch_size, max_time, ...]`. 121 | memory_sequence_length (optional): Sequence lengths for the batch entries 122 | in memory. If provided, the memory tensor rows are masked with zeros 123 | for values past the respective sequence lengths. Only relevant if mask_encoder = True. 124 | smoothing (optional): Boolean. Determines which normalization function to use. 125 | Default normalization function (probablity_fn) is softmax. If smoothing is 126 | enabled, we replace softmax with: 127 | a_{i, j} = sigmoid(e_{i, j}) / sum_j(sigmoid(e_{i, j})) 128 | Introduced in: 129 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben- 130 | gio, “Attention-based models for speech recognition,” in Ad- 131 | vances in Neural Information Processing Systems, 2015, pp. 132 | 577–585. 
133 | This is mainly used if the model wants to attend to multiple inputs parts 134 | at the same decoding step. We probably won't be using it since multiple sound 135 | frames may depend from the same character, probably not the way around. 136 | Note: 137 | We still keep it implemented in case we want to test it. They used it in the 138 | paper in the context of speech recognition, where one phoneme may depend on 139 | multiple subsequent sound frames. 140 | name: Name to use when creating ops. 141 | """ 142 | #Create normalization function 143 | #Setting it to None defaults in using softmax 144 | normalization_function = _smoothing_normalization if (smoothing == True) else None 145 | super(LocationSensitiveAttention, self).__init__( 146 | num_units=num_units, 147 | memory=memory, 148 | memory_sequence_length=None, 149 | probability_fn=normalization_function, 150 | name=name) 151 | 152 | self.location_convolution = tf.layers.Conv1D(filters=32, 153 | kernel_size=(31, ), padding='same', use_bias=True, 154 | bias_initializer=tf.zeros_initializer(), name='location_features_convolution') 155 | self.location_layer = tf.layers.Dense(units=num_units, use_bias=False, 156 | dtype=tf.float32, name='location_features_layer') 157 | self._cumulate = cumulate_weights 158 | 159 | def __call__(self, query, state): 160 | """Score the query based on the keys and values. 161 | Args: 162 | query: Tensor of dtype matching `self.values` and shape 163 | `[batch_size, query_depth]`. 164 | state (previous alignments): Tensor of dtype matching `self.values` and shape 165 | `[batch_size, alignments_size]` 166 | (`alignments_size` is memory's `max_time`). 167 | Returns: 168 | alignments: Tensor of dtype matching `self.values` and shape 169 | `[batch_size, alignments_size]` (`alignments_size` is memory's 170 | `max_time`). 
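      next_state: Tensor with the same shape as `alignments`, carrying the (optionally
        cumulated) alignments to use as `state` at the next decoding step; equals
        `alignments + state` when `cumulate_weights` is True, otherwise `alignments`.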
171 | """ 172 | previous_alignments = state 173 | with variable_scope.variable_scope(None, "Location_Sensitive_Attention", [query]): 174 | 175 | # processed_query shape [batch_size, query_depth] -> [batch_size, attention_dim] 176 | processed_query = self.query_layer(query) if self.query_layer else query 177 | # -> [batch_size, 1, attention_dim] 178 | processed_query = tf.expand_dims(processed_query, 1) 179 | 180 | # processed_location_features shape [batch_size, max_time, attention dimension] 181 | # [batch_size, max_time] -> [batch_size, max_time, 1] 182 | expanded_alignments = tf.expand_dims(previous_alignments, axis=2) 183 | # location features [batch_size, max_time, filters] 184 | f = self.location_convolution(expanded_alignments) 185 | # Projected location features [batch_size, max_time, attention_dim] 186 | processed_location_features = self.location_layer(f) 187 | 188 | # energy shape [batch_size, max_time] 189 | energy = _location_sensitive_score(processed_query, processed_location_features, self.keys) 190 | 191 | 192 | # alignments shape = energy shape = [batch_size, max_time] 193 | alignments = self._probability_fn(energy, previous_alignments) 194 | 195 | # Cumulate alignments 196 | if self._cumulate: 197 | next_state = alignments + previous_alignments 198 | else: 199 | next_state = alignments 200 | 201 | return alignments, next_state 202 | -------------------------------------------------------------------------------- /models/rnn_wrappers.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import numpy as np 3 | import tensorflow as tf 4 | from .modules import prenet 5 | from .attention import _compute_attention 6 | from tensorflow.contrib.rnn import RNNCell 7 | from tensorflow.python.framework import ops, tensor_shape 8 | from tensorflow.python.ops import array_ops, check_ops, rnn_cell_impl, tensor_array_ops 9 | from tensorflow.python.util import nest 10 | from hparams import hparams as hp 11 | 12 | 13 | class FrameProjection: 14 | """Projection layer to r * num_mels dimensions or num_mels dimensions 15 | """ 16 | def __init__(self, shape=hp.num_mels, activation=None, scope=None): 17 | """ 18 | Args: 19 | shape: integer, dimensionality of output space (r*n_mels for decoder or n_mels for postnet) 20 | activation: callable, activation function 21 | scope: FrameProjection scope. 22 | """ 23 | super(FrameProjection, self).__init__() 24 | 25 | self.shape = shape 26 | self.activation = activation 27 | self.scope = 'linear_projection' if scope is None else scope 28 | self.dense = tf.layers.Dense(units=shape, activation=activation, name='projection_{}'.format(self.scope)) 29 | 30 | def __call__(self, inputs): 31 | with tf.variable_scope(self.scope): 32 | # If activation==None, this returns a simple Linear projection 33 | # else the projection will be passed through an activation function 34 | # output = tf.layers.dense(inputs, units=self.shape, activation=self.activation, 35 | # name='projection_{}'.format(self.scope)) 36 | return self.dense(inputs) 37 | 38 | 39 | class StopProjection: 40 | """Projection to a scalar and through a sigmoid activation 41 | """ 42 | def __init__(self, is_training, shape=1, activation=tf.nn.sigmoid, scope=None): 43 | """ 44 | Args: 45 | is_training: Boolean, to control the use of sigmoid function as it is useless to use it 46 | during training since it is integrate inside the sigmoid_crossentropy loss 47 | shape: integer, dimensionality of output space. 
48 | activation: callable, activation function. Only used during inference.
49 | scope: StopProjection scope.
50 | """
51 | super(StopProjection, self).__init__()
52 | 
53 | self.is_training = is_training
54 | self.shape = shape
55 | self.activation = activation
56 | self.scope = 'stop_token_projection' if scope is None else scope
57 | 
58 | def __call__(self, inputs):
59 | with tf.variable_scope(self.scope):
60 | output = tf.layers.dense(inputs, units=self.shape, activation=None, name='projection_{}'.format(self.scope))
61 | #During training, don't apply the activation, as it is integrated inside the sigmoid_cross_entropy loss function
62 | return output if self.is_training else self.activation(output)
63 | 
64 | 
65 | class TacotronDecoderCellState(
66 | collections.namedtuple("TacotronDecoderCellState",
67 | ("cell_state", "attention", "time", "alignments",
68 | "alignment_history"))):
69 | """`namedtuple` storing the state of a `TacotronDecoderCell`.
70 | Contains:
71 | - `cell_state`: The state of the wrapped `RNNCell` at the previous time
72 | step.
73 | - `attention`: The attention emitted at the previous time step.
74 | - `time`: int32 scalar containing the current time step.
75 | - `alignments`: A single or tuple of `Tensor`(s) containing the alignments
76 | emitted at the previous time step for each attention mechanism.
77 | - `alignment_history`: a single or tuple of `TensorArray`(s)
78 | containing alignment matrices from all time steps for each attention
79 | mechanism. Call `stack()` on each to convert to a `Tensor`.
80 | """
81 | def replace(self, **kwargs):
82 | """Clones the current state while overwriting components provided by kwargs.
83 | """
84 | return super(TacotronDecoderCellState, self)._replace(**kwargs)
85 | 
86 | 
87 | class TacotronDecoderWrapper(RNNCell):
88 | """Tacotron 2 Decoder Cell
89 | Decodes encoder output and previous mel frames into the next r frames
90 | 
91 | Decoder Step i:
92 | 1) Prenet to compress last output information
93 | 2) Concat compressed inputs with previous context vector (input feeding) *
94 | 3) Decoder RNN (actual decoding) to predict current state s_{i} *
95 | 4) Compute new context vector c_{i} based on s_{i} and a cumulative sum of previous alignments *
96 | 5) Predict new output y_{i} using s_{i} and c_{i} (concatenated)
97 | 6) Predict stop token output ys_{i} using s_{i} and c_{i} (concatenated)
98 | 
99 | * : This typically means taking a vanilla LSTM, wrapping it with tensorflow's attention wrapper,
100 | and wrapping that with the prenet before doing input feeding, and with the prediction layer
101 | that uses the RNN states to project onto the output space. The steps marked with (*) could be replaced by
102 | tensorflow's attention wrapper call if it used cumulative alignments instead of only the previous alignments.
103 | """ 104 | 105 | def __init__(self, is_training, attention_mechanism, rnn_cell, frame_projection, stop_projection): 106 | """Initialize decoder parameters 107 | 108 | Args: 109 | prenet: A tensorflow fully connected layer acting as the decoder pre-net 110 | attention_mechanism: A _BaseAttentionMechanism instance, usefull to 111 | learn encoder-decoder alignments 112 | rnn_cell: Instance of RNNCell, main body of the decoder 113 | frame_projection: tensorflow fully connected layer with r * num_mels output units 114 | stop_projection: tensorflow fully connected layer, expected to project to a scalar 115 | and through a sigmoid activation 116 | mask_finished: Boolean, Whether to mask decoder frames after the 117 | """ 118 | super(TacotronDecoderWrapper, self).__init__() 119 | #Initialize decoder layers 120 | self._training = is_training 121 | self._attention_mechanism = attention_mechanism 122 | self._cell = rnn_cell 123 | self._frame_projection = frame_projection 124 | self._stop_projection = stop_projection 125 | self._attention_layer_size = self._attention_mechanism.values.get_shape()[-1].value 126 | 127 | def _batch_size_checks(self, batch_size, error_message): 128 | return [check_ops.assert_equal(batch_size, 129 | self._attention_mechanism.batch_size, 130 | message=error_message)] 131 | 132 | @property 133 | def output_size(self): 134 | return self._frame_projection.shape 135 | 136 | # @property 137 | def state_size(self): 138 | """The `state_size` property of `TacotronDecoderWrapper`. 139 | 140 | Returns: 141 | An `TacotronDecoderWrapper` tuple containing shapes used by this object. 142 | """ 143 | return TacotronDecoderCellState( 144 | cell_state=self._cell._cell.state_size, 145 | time=tensor_shape.TensorShape([]), 146 | attention=self._attention_layer_size, 147 | alignments=self._attention_mechanism.alignments_size, 148 | alignment_history=()) 149 | 150 | def zero_state(self, batch_size, dtype): 151 | """Return an initial (zero) state tuple for this `AttentionWrapper`. 152 | 153 | Args: 154 | batch_size: `0D` integer tensor: the batch size. 155 | dtype: The internal state data type. 156 | Returns: 157 | An `TacotronDecoderCellState` tuple containing zeroed out tensors and, 158 | possibly, empty `TensorArray` objects. 159 | Raises: 160 | ValueError: (or, possibly at runtime, InvalidArgument), if 161 | `batch_size` does not match the output size of the encoder passed 162 | to the wrapper object at initialization time. 
163 | """ 164 | with ops.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]): 165 | cell_state = self._cell.zero_state(batch_size, dtype) 166 | error_message = ( 167 | "When calling zero_state of TacotronDecoderCell %s: " % self._base_name + 168 | "Non-matching batch sizes between the memory " 169 | "(encoder output) and the requested batch size.") 170 | with ops.control_dependencies( 171 | self._batch_size_checks(batch_size, error_message)): 172 | cell_state = nest.map_structure( 173 | lambda s: array_ops.identity(s, name="checked_cell_state"), 174 | cell_state) 175 | return TacotronDecoderCellState( 176 | cell_state=cell_state, 177 | time=array_ops.zeros([], dtype=tf.int32), 178 | attention=rnn_cell_impl._zero_state_tensors(self._attention_layer_size, batch_size, dtype), 179 | alignments=self._attention_mechanism.initial_alignments(batch_size, dtype), 180 | alignment_history=tensor_array_ops.TensorArray(dtype=dtype, size=0, 181 | dynamic_size=True)) 182 | 183 | 184 | def __call__(self, inputs, state): 185 | #Information bottleneck (essential for learning attention) 186 | prenet_output = prenet(inputs, self._training, hp.prenet_depths, scope='decoder_prenet') 187 | 188 | #Concat context vector and prenet output to form RNN cells input (input feeding) 189 | rnn_input = tf.concat([prenet_output, state.attention], axis=-1) 190 | 191 | #Unidirectional RNN layers 192 | rnn_output, next_cell_state = self._cell(tf.layers.dense(rnn_input, hp.decoder_depth), state.cell_state) 193 | 194 | #Compute the attention (context) vector and alignments using 195 | #the new decoder cell hidden state as query vector 196 | #and cumulative alignments to extract location features 197 | #The choice of the new cell hidden state (s_{i}) of the last 198 | #decoder RNN Cell is based on Luong et Al. (2015): 199 | #https://arxiv.org/pdf/1508.04025.pdf 200 | previous_alignments = state.alignments 201 | previous_alignment_history = state.alignment_history 202 | context_vector, alignments, cumulated_alignments = _compute_attention(self._attention_mechanism, 203 | rnn_output, 204 | previous_alignments, 205 | attention_layer=None) 206 | 207 | #Concat RNN outputs and context vector to form projections inputs 208 | projections_input = tf.concat([rnn_output, context_vector], axis=-1) 209 | 210 | #Compute predicted frames and predicted 211 | cell_outputs = self._frame_projection(projections_input) 212 | stop_tokens = self._stop_projection(projections_input) 213 | 214 | #Save alignment history 215 | alignment_history = previous_alignment_history.write(state.time, alignments) 216 | 217 | #Prepare next decoder state 218 | next_state = TacotronDecoderCellState( 219 | time=state.time + 1, 220 | cell_state=next_cell_state, 221 | attention=context_vector, 222 | alignments=cumulated_alignments, 223 | alignment_history=alignment_history) 224 | 225 | return (cell_outputs, stop_tokens), next_state 226 | --------------------------------------------------------------------------------