├── tests ├── __init__.py ├── cmudict_test.py ├── text_test.py └── numbers_test.py ├── datasets ├── __init__.py ├── blizzard.py ├── ljspeech.py ├── thchs30.py └── datafeeder.py ├── util ├── test_fun.py ├── __pycache__ │ ├── __init__.cpython-36.pyc │ └── txt2pinyin.cpython-36.pyc ├── __init__.py ├── plot.py ├── infolog.py ├── audio.py └── txt2pinyin.py ├── example └── TTS.mp3 ├── models ├── __init__.py ├── modules.py ├── helpers.py ├── custom_decoder.py ├── tacotron.py ├── attention.py └── rnn_wrappers.py ├── requirements.txt ├── text ├── symbols.py ├── cmudict.py ├── numbers.py ├── __init__.py └── cleaners.py ├── LICENSE ├── hparams.py ├── synthesizer.py ├── preprocess.py ├── TRAINING_DATA.md ├── eval.py ├── demo_server.py ├── train.py └── README.md /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /datasets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /util/test_fun.py: -------------------------------------------------------------------------------- 1 | import os 2 | print(os.path.expanduser('.')) -------------------------------------------------------------------------------- /example/TTS.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/X-CCS/mandarin_tacotron_GL/HEAD/example/TTS.mp3 -------------------------------------------------------------------------------- /util/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/X-CCS/mandarin_tacotron_GL/HEAD/util/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /util/__pycache__/txt2pinyin.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/X-CCS/mandarin_tacotron_GL/HEAD/util/__pycache__/txt2pinyin.cpython-36.pyc -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- 1 | from .tacotron import Tacotron 2 | 3 | 4 | def create_model(name, hparams): 5 | if name == 'tacotron': 6 | return Tacotron(hparams) 7 | else: 8 | raise Exception('Unknown model: ' + name) 9 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # Note: this doesn't include tensorflow or tensorflow-gpu because the package you need to install 2 | # depends on your platform. It is assumed you have already installed tensorflow. 
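# For example (hedged note: the exact version below is an assumption, not pinned by this repo —
# any TensorFlow 1.x release that still ships tf.contrib should work with this code):
#   pip install tensorflow-gpu==1.8.0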
3 | falcon==1.2.0 4 | inflect==0.2.5 5 | librosa==0.5.1 6 | matplotlib==2.0.2 7 | numpy==1.14.3 8 | scipy==0.19.0 9 | tqdm==4.11.2 10 | Unidecode==0.4.20 11 | -------------------------------------------------------------------------------- /util/__init__.py: -------------------------------------------------------------------------------- 1 | class ValueWindow(): 2 | def __init__(self, window_size=100): 3 | self._window_size = window_size 4 | self._values = [] 5 | 6 | def append(self, x): 7 | self._values = self._values[-(self._window_size - 1):] + [x] 8 | 9 | @property 10 | def sum(self): 11 | return sum(self._values) 12 | 13 | @property 14 | def count(self): 15 | return len(self._values) 16 | 17 | @property 18 | def average(self): 19 | return self.sum / max(1, self.count) 20 | 21 | def reset(self): 22 | self._values = [] 23 | -------------------------------------------------------------------------------- /util/plot.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | matplotlib.use('Agg') 3 | import matplotlib.pyplot as plt 4 | 5 | 6 | def plot_alignment(alignment, path, info=None): 7 | fig, ax = plt.subplots() 8 | im = ax.imshow( 9 | alignment, 10 | aspect='auto', 11 | origin='lower', 12 | interpolation='none') 13 | fig.colorbar(im, ax=ax) 14 | xlabel = 'Decoder timestep' 15 | if info is not None: 16 | xlabel += '\n\n' + info 17 | plt.xlabel(xlabel) 18 | plt.ylabel('Encoder timestep') 19 | plt.tight_layout() 20 | plt.savefig(path, format='png') 21 | -------------------------------------------------------------------------------- /text/symbols.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Defines the set of symbols used in text input to the model. 3 | 4 | The default is a set of ASCII characters that works well for English or text that has been run 5 | through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. 6 | ''' 7 | import os 8 | from text import cmudict 9 | 10 | _pad = '_' 11 | _eos = '~' 12 | _characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890!\'(),-.:;? ' 13 | 14 | # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): 15 | # _arpabet = ['@' + s for s in cmudict.valid_symbols] 16 | 17 | # Export all symbols: 18 | # symbols = [_pad, _eos] + list(_characters) + _arpabet 19 | symbols = [_pad, _eos] + list(_characters)# + _arpabet 20 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017 Keith Ito 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /tests/cmudict_test.py: -------------------------------------------------------------------------------- 1 | import io 2 | from text import cmudict 3 | 4 | 5 | test_data = ''' 6 | ;;; # CMUdict -- Major Version: 0.07 7 | )PAREN P ER EH N 8 | 'TIS T IH Z 9 | ADVERSE AE0 D V ER1 S 10 | ADVERSE(1) AE1 D V ER2 S 11 | ADVERSE(2) AE2 D V ER1 S 12 | ADVERSELY AE0 D V ER1 S L IY0 13 | ADVERSITY AE0 D V ER1 S IH0 T IY2 14 | BARBERSHOP B AA1 R B ER0 SH AA2 P 15 | YOU'LL Y UW1 L 16 | ''' 17 | 18 | 19 | def test_cmudict(): 20 | c = cmudict.CMUDict(io.StringIO(test_data)) 21 | assert len(c) == 6 22 | assert len(cmudict.valid_symbols) == 84 23 | assert c.lookup('ADVERSITY') == ['AE0 D V ER1 S IH0 T IY2'] 24 | assert c.lookup('BarberShop') == ['B AA1 R B ER0 SH AA2 P'] 25 | assert c.lookup("You'll") == ['Y UW1 L'] 26 | assert c.lookup("'tis") == ['T IH Z'] 27 | assert c.lookup('adverse') == [ 28 | 'AE0 D V ER1 S', 29 | 'AE1 D V ER2 S', 30 | 'AE2 D V ER1 S', 31 | ] 32 | assert c.lookup('') == None 33 | assert c.lookup('foo') == None 34 | assert c.lookup(')paren') == None 35 | 36 | 37 | def test_cmudict_no_keep_ambiguous(): 38 | c = cmudict.CMUDict(io.StringIO(test_data), keep_ambiguous=False) 39 | assert len(c) == 5 40 | assert c.lookup('adversity') == ['AE0 D V ER1 S IH0 T IY2'] 41 | assert c.lookup('adverse') == None 42 | -------------------------------------------------------------------------------- /util/infolog.py: -------------------------------------------------------------------------------- 1 | import atexit 2 | from datetime import datetime 3 | import json 4 | from threading import Thread 5 | from urllib.request import Request, urlopen 6 | 7 | 8 | _format = '%Y-%m-%d %H:%M:%S.%f' 9 | _file = None 10 | _run_name = None 11 | _slack_url = None 12 | 13 | 14 | def init(filename, run_name, slack_url=None): 15 | global _file, _run_name, _slack_url 16 | _close_logfile() 17 | _file = open(filename, 'a') 18 | _file.write('\n-----------------------------------------------------------------\n') 19 | _file.write('Starting new training run\n') 20 | _file.write('-----------------------------------------------------------------\n') 21 | _run_name = run_name 22 | _slack_url = slack_url 23 | 24 | 25 | def log(msg, slack=False): 26 | print(msg) 27 | if _file is not None: 28 | _file.write('[%s] %s\n' % (datetime.now().strftime(_format)[:-3], msg)) 29 | if slack and _slack_url is not None: 30 | Thread(target=_send_slack, args=(msg,)).start() 31 | 32 | 33 | def _close_logfile(): 34 | global _file 35 | if _file is not None: 36 | _file.close() 37 | _file = None 38 | 39 | 40 | def _send_slack(msg): 41 | req = Request(_slack_url) 42 | req.add_header('Content-Type', 'application/json') 43 | urlopen(req, json.dumps({ 44 | 'username': 'tacotron', 45 | 'icon_emoji': ':taco:', 46 | 'text': '*%s*: %s' % (_run_name, msg) 47 | }).encode()) 48 | 49 | 50 | atexit.register(_close_logfile) 51 | -------------------------------------------------------------------------------- /hparams.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | # Default hyperparameters: 5 | hparams = tf.contrib.training.HParams( 
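  # Note: these defaults can be overridden at run time instead of editing this file — eval.py and
  # demo_server.py expose a --hparams flag and call hparams.parse() on a comma-separated list of
  # name=value pairs, e.g. --hparams="outputs_per_step=2,batch_size=32" (example values only).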
6 | # Comma-separated list of cleaners to run on text prior to training and eval. For non-English 7 | # text, you may want to use "basic_cleaners" or "transliteration_cleaners" See TRAINING_DATA.md. 8 | cleaners='basic_cleaners', 9 | 10 | # Audio: 11 | num_mels=80, 12 | num_freq=2049, 13 | sample_rate=48000, 14 | frame_length_ms=50, 15 | frame_shift_ms=12.5, 16 | preemphasis=0.97, 17 | min_level_db=-100, 18 | ref_level_db=20, 19 | max_frame_num=1000, 20 | max_abs_value = 4, 21 | fmin = 125, # for male, set 55 22 | fmax = 7600, # for male, set 3600 23 | 24 | # Model: 25 | outputs_per_step=5, 26 | embed_depth=512, 27 | prenet_depths=[256, 256], 28 | encoder_depth=256, 29 | postnet_depth=512, 30 | attention_depth=128, 31 | decoder_depth=1024, 32 | 33 | # Training: 34 | batch_size=64, 35 | adam_beta1=0.9, 36 | adam_beta2=0.999, 37 | initial_learning_rate=0.001, 38 | decay_learning_rate=True, 39 | use_cmudict=False, # Use CMUDict during training to learn pronunciation of ARPAbet phonemes 40 | 41 | # Eval: 42 | max_iters=300, 43 | griffin_lim_iters=60, 44 | power=1.2, # Power to raise magnitudes to prior to Griffin-Lim 45 | ) 46 | 47 | 48 | def hparams_debug_string(): 49 | values = hparams.values() 50 | hp = [' %s: %s' % (name, values[name]) for name in sorted(values)] 51 | return 'Hyperparameters:\n' + '\n'.join(hp) 52 | -------------------------------------------------------------------------------- /synthesizer.py: -------------------------------------------------------------------------------- 1 | import io 2 | import numpy as np 3 | import tensorflow as tf 4 | from hparams import hparams 5 | from librosa import effects 6 | from models import create_model 7 | from text import text_to_sequence 8 | from util import audio 9 | 10 | 11 | class Synthesizer: 12 | def load(self, checkpoint_path, model_name='tacotron'): 13 | print('Constructing model: %s' % model_name) 14 | inputs = tf.placeholder(tf.int32, [1, None], 'inputs') 15 | input_lengths = tf.placeholder(tf.int32, [1], 'input_lengths') 16 | with tf.variable_scope('model') as scope: 17 | self.model = create_model(model_name, hparams) 18 | self.model.initialize(inputs, input_lengths) 19 | self.wav_output = audio.inv_spectrogram_tensorflow(self.model.linear_outputs[0]) 20 | 21 | print('Loading checkpoint: %s' % checkpoint_path) 22 | self.session = tf.Session() 23 | self.session.run(tf.global_variables_initializer()) 24 | saver = tf.train.Saver() 25 | saver.restore(self.session, checkpoint_path) 26 | 27 | 28 | def synthesize(self, text): 29 | cleaner_names = [x.strip() for x in hparams.cleaners.split(',')] 30 | seq = text_to_sequence(text, cleaner_names) 31 | feed_dict = { 32 | self.model.inputs: [np.asarray(seq, dtype=np.int32)], 33 | self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32) 34 | } 35 | wav = self.session.run(self.wav_output, feed_dict=feed_dict) 36 | wav = audio.inv_preemphasis(wav) 37 | out = io.BytesIO() 38 | audio.save_wav(wav, out) 39 | return out.getvalue() 40 | -------------------------------------------------------------------------------- /text/cmudict.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | valid_symbols = [ 5 | 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2', 6 | 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2', 7 | 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY', 8 | 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 
'IY1', 9 | 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 10 | 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 11 | 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH' 12 | ] 13 | 14 | _valid_symbol_set = set(valid_symbols) 15 | 16 | 17 | class CMUDict: 18 | '''Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict''' 19 | def __init__(self, file_or_path, keep_ambiguous=True): 20 | if isinstance(file_or_path, str): 21 | with open(file_or_path, encoding='latin-1') as f: 22 | entries = _parse_cmudict(f) 23 | else: 24 | entries = _parse_cmudict(file_or_path) 25 | if not keep_ambiguous: 26 | entries = {word: pron for word, pron in entries.items() if len(pron) == 1} 27 | self._entries = entries 28 | 29 | 30 | def __len__(self): 31 | return len(self._entries) 32 | 33 | 34 | def lookup(self, word): 35 | '''Returns list of ARPAbet pronunciations of the given word.''' 36 | return self._entries.get(word.upper()) 37 | 38 | 39 | 40 | _alt_re = re.compile(r'\([0-9]+\)') 41 | 42 | 43 | def _parse_cmudict(file): 44 | cmudict = {} 45 | for line in file: 46 | if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"): 47 | parts = line.split(' ') 48 | word = re.sub(_alt_re, '', parts[0]) 49 | pronunciation = _get_pronunciation(parts[1]) 50 | if pronunciation: 51 | if word in cmudict: 52 | cmudict[word].append(pronunciation) 53 | else: 54 | cmudict[word] = [pronunciation] 55 | return cmudict 56 | 57 | 58 | def _get_pronunciation(s): 59 | parts = s.strip().split(' ') 60 | for part in parts: 61 | if part not in _valid_symbol_set: 62 | return None 63 | return ' '.join(parts) 64 | -------------------------------------------------------------------------------- /tests/text_test.py: -------------------------------------------------------------------------------- 1 | from text import cleaners, symbols, text_to_sequence, sequence_to_text 2 | from unidecode import unidecode 3 | 4 | 5 | def test_symbols(): 6 | assert len(symbols) >= 3 7 | assert symbols[0] == '_' 8 | assert symbols[1] == '~' 9 | 10 | 11 | def test_text_to_sequence(): 12 | assert text_to_sequence('', []) == [1] 13 | assert text_to_sequence('Hi!', []) == [9, 36, 54, 1] 14 | assert text_to_sequence('"A"_B', []) == [2, 3, 1] 15 | assert text_to_sequence('A {AW1 S} B', []) == [2, 64, 83, 132, 64, 3, 1] 16 | assert text_to_sequence('Hi', ['lowercase']) == [35, 36, 1] 17 | assert text_to_sequence('A {AW1 S} B', ['english_cleaners']) == [28, 64, 83, 132, 64, 29, 1] 18 | 19 | 20 | def test_sequence_to_text(): 21 | assert sequence_to_text([]) == '' 22 | assert sequence_to_text([1]) == '~' 23 | assert sequence_to_text([9, 36, 54, 1]) == 'Hi!~' 24 | assert sequence_to_text([2, 64, 83, 132, 64, 3]) == 'A {AW1 S} B' 25 | 26 | 27 | def test_collapse_whitespace(): 28 | assert cleaners.collapse_whitespace('') == '' 29 | assert cleaners.collapse_whitespace(' ') == ' ' 30 | assert cleaners.collapse_whitespace('x') == 'x' 31 | assert cleaners.collapse_whitespace(' x. y, \tz') == ' x. y, z' 32 | 33 | 34 | def test_convert_to_ascii(): 35 | assert cleaners.convert_to_ascii("raison d'être") == "raison d'etre" 36 | assert cleaners.convert_to_ascii('grüß gott') == 'gruss gott' 37 | assert cleaners.convert_to_ascii('안녕') == 'annyeong' 38 | assert cleaners.convert_to_ascii('Здравствуйте') == 'Zdravstvuite' 39 | 40 | 41 | def test_lowercase(): 42 | assert cleaners.lowercase('Happy Birthday!') == 'happy birthday!' 
43 | assert cleaners.lowercase('CAFÉ') == 'café' 44 | 45 | 46 | def test_expand_abbreviations(): 47 | assert cleaners.expand_abbreviations('mr. and mrs. smith') == 'mister and misess smith' 48 | 49 | 50 | def test_expand_numbers(): 51 | assert cleaners.expand_numbers('3 apples and 44 pears') == 'three apples and forty-four pears' 52 | assert cleaners.expand_numbers('$3.50 for gas.') == 'three dollars, fifty cents for gas.' 53 | 54 | 55 | def test_cleaner_pipelines(): 56 | text = 'Mr. Müller ate 2 Apples' 57 | assert cleaners.english_cleaners(text) == 'mister muller ate two apples' 58 | assert cleaners.transliteration_cleaners(text) == 'mr. muller ate 2 apples' 59 | assert cleaners.basic_cleaners(text) == 'mr. müller ate 2 apples' 60 | 61 | -------------------------------------------------------------------------------- /text/numbers.py: -------------------------------------------------------------------------------- 1 | import inflect 2 | import re 3 | 4 | 5 | _inflect = inflect.engine() 6 | _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') 7 | _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') 8 | _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') 9 | _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') 10 | _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') 11 | _number_re = re.compile(r'[0-9]+') 12 | 13 | 14 | def _remove_commas(m): 15 | return m.group(1).replace(',', '') 16 | 17 | 18 | def _expand_decimal_point(m): 19 | return m.group(1).replace('.', ' point ') 20 | 21 | 22 | def _expand_dollars(m): 23 | match = m.group(1) 24 | parts = match.split('.') 25 | if len(parts) > 2: 26 | return match + ' dollars' # Unexpected format 27 | dollars = int(parts[0]) if parts[0] else 0 28 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 29 | if dollars and cents: 30 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 31 | cent_unit = 'cent' if cents == 1 else 'cents' 32 | return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) 33 | elif dollars: 34 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 35 | return '%s %s' % (dollars, dollar_unit) 36 | elif cents: 37 | cent_unit = 'cent' if cents == 1 else 'cents' 38 | return '%s %s' % (cents, cent_unit) 39 | else: 40 | return 'zero dollars' 41 | 42 | 43 | def _expand_ordinal(m): 44 | return _inflect.number_to_words(m.group(0)) 45 | 46 | 47 | def _expand_number(m): 48 | num = int(m.group(0)) 49 | if num > 1000 and num < 3000: 50 | if num == 2000: 51 | return 'two thousand' 52 | elif num > 2000 and num < 2010: 53 | return 'two thousand ' + _inflect.number_to_words(num % 100) 54 | elif num % 100 == 0: 55 | return _inflect.number_to_words(num // 100) + ' hundred' 56 | else: 57 | return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') 58 | else: 59 | return _inflect.number_to_words(num, andword='') 60 | 61 | 62 | def normalize_numbers(text): 63 | text = re.sub(_comma_number_re, _remove_commas, text) 64 | text = re.sub(_pounds_re, r'\1 pounds', text) 65 | text = re.sub(_dollars_re, _expand_dollars, text) 66 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 67 | text = re.sub(_ordinal_re, _expand_ordinal, text) 68 | text = re.sub(_number_re, _expand_number, text) 69 | return text 70 | -------------------------------------------------------------------------------- /preprocess.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from multiprocessing import cpu_count 4 | from tqdm import tqdm 5 | from datasets import 
blizzard, ljspeech, thchs30 6 | from hparams import hparams 7 | 8 | 9 | def preprocess_blizzard(args): 10 | in_dir = os.path.join(args.base_dir, 'Blizzard2012') 11 | out_dir = os.path.join(args.base_dir, args.output) 12 | os.makedirs(out_dir, exist_ok=True) 13 | metadata = blizzard.build_from_path(in_dir, out_dir, args.num_workers, tqdm=tqdm) 14 | write_metadata(metadata, out_dir) 15 | 16 | 17 | def preprocess_ljspeech(args): 18 | in_dir = os.path.join(args.base_dir, 'LJSpeech-1.1') 19 | out_dir = os.path.join(args.base_dir, args.output) 20 | os.makedirs(out_dir, exist_ok=True) 21 | metadata = ljspeech.build_from_path(in_dir, out_dir, args.num_workers, tqdm=tqdm) 22 | write_metadata(metadata, out_dir) 23 | 24 | 25 | def preprocess_thchs30(args): 26 | in_dir = os.path.join(args.base_dir, 'data_thchs30') 27 | out_dir = os.path.join(args.base_dir, args.output) 28 | os.makedirs(out_dir, exist_ok=True) 29 | metadata = thchs30.build_from_path(in_dir, out_dir, args.num_workers, tqdm=tqdm) 30 | write_metadata(metadata, out_dir) 31 | 32 | 33 | def write_metadata(metadata, out_dir): 34 | with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f: 35 | for m in metadata: 36 | f.write('|'.join([str(x) for x in m]) + '\n') 37 | frames = sum([m[2] for m in metadata]) 38 | hours = frames * hparams.frame_shift_ms / (3600 * 1000) 39 | print('Wrote %d utterances, %d frames (%.2f hours)' % (len(metadata), frames, hours)) 40 | print('Max input length: %d' % max(len(m[3]) for m in metadata)) 41 | print('Max output length: %d' % max(m[2] for m in metadata)) 42 | 43 | 44 | def main(): 45 | parser = argparse.ArgumentParser() 46 | parser.add_argument('--base_dir', default=os.path.expanduser('.')) 47 | parser.add_argument('--output', default='training') 48 | parser.add_argument('--dataset', default='thchs30', choices=['blizzard', 'ljspeech', 'thchs30']) 49 | parser.add_argument('--num_workers', type=int, default=cpu_count()) 50 | args = parser.parse_args() 51 | if args.dataset == 'blizzard': 52 | preprocess_blizzard(args) 53 | elif args.dataset == 'ljspeech': 54 | preprocess_ljspeech(args) 55 | elif args.dataset == 'thchs30': 56 | preprocess_thchs30(args) 57 | 58 | 59 | if __name__ == "__main__": 60 | main() 61 | -------------------------------------------------------------------------------- /text/__init__.py: -------------------------------------------------------------------------------- 1 | import re 2 | from text import cleaners 3 | from text.symbols import symbols 4 | 5 | 6 | # Mappings from symbol to numeric ID and vice versa: 7 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 8 | _id_to_symbol = {i: s for i, s in enumerate(symbols)} 9 | 10 | # Regular expression matching text enclosed in curly braces: 11 | _curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)') 12 | 13 | 14 | def text_to_sequence(text, cleaner_names): 15 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 16 | 17 | The text can optionally have ARPAbet sequences enclosed in curly braces embedded 18 | in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street." 
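  Concretely (values taken from tests/text_test.py, using the default symbol set):
  text_to_sequence('Hi!', []) == [9, 36, 54, 1], where the trailing 1 is the appended EOS symbol '~'.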
19 | 20 | Args: 21 | text: string to convert to a sequence 22 | cleaner_names: names of the cleaner functions to run the text through 23 | 24 | Returns: 25 | List of integers corresponding to the symbols in the text 26 | ''' 27 | sequence = [] 28 | 29 | # Check for curly braces and treat their contents as ARPAbet: 30 | while len(text): 31 | m = _curly_re.match(text) 32 | if not m: 33 | sequence += _symbols_to_sequence(_clean_text(text, cleaner_names)) 34 | break 35 | sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names)) 36 | sequence += _arpabet_to_sequence(m.group(2)) 37 | text = m.group(3) 38 | 39 | # Append EOS token 40 | sequence.append(_symbol_to_id['~']) 41 | return sequence 42 | 43 | 44 | def sequence_to_text(sequence): 45 | '''Converts a sequence of IDs back to a string''' 46 | result = '' 47 | for symbol_id in sequence: 48 | if symbol_id in _id_to_symbol: 49 | s = _id_to_symbol[symbol_id] 50 | # Enclose ARPAbet back in curly braces: 51 | if len(s) > 1 and s[0] == '@': 52 | s = '{%s}' % s[1:] 53 | result += s 54 | return result.replace('}{', ' ') 55 | 56 | 57 | def _clean_text(text, cleaner_names): 58 | for name in cleaner_names: 59 | cleaner = getattr(cleaners, name) 60 | if not cleaner: 61 | raise Exception('Unknown cleaner: %s' % name) 62 | text = cleaner(text) 63 | return text 64 | 65 | 66 | def _symbols_to_sequence(symbols): 67 | return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)] 68 | 69 | 70 | def _arpabet_to_sequence(text): 71 | return _symbols_to_sequence(['@' + s for s in text.split()]) 72 | 73 | 74 | def _should_keep_symbol(s): 75 | return s in _symbol_to_id and s is not '_' and s is not '~' 76 | -------------------------------------------------------------------------------- /tests/numbers_test.py: -------------------------------------------------------------------------------- 1 | from text.numbers import normalize_numbers 2 | 3 | 4 | def test_normalize_numbers(): 5 | assert normalize_numbers('1') == 'one' 6 | assert normalize_numbers('15') == 'fifteen' 7 | assert normalize_numbers('24') == 'twenty-four' 8 | assert normalize_numbers('100') == 'one hundred' 9 | assert normalize_numbers('101') == 'one hundred one' 10 | assert normalize_numbers('456') == 'four hundred fifty-six' 11 | assert normalize_numbers('1000') == 'one thousand' 12 | assert normalize_numbers('1800') == 'eighteen hundred' 13 | assert normalize_numbers('2,000') == 'two thousand' 14 | assert normalize_numbers('3000') == 'three thousand' 15 | assert normalize_numbers('18000') == 'eighteen thousand' 16 | assert normalize_numbers('24,000') == 'twenty-four thousand' 17 | assert normalize_numbers('124,001') == 'one hundred twenty-four thousand one' 18 | assert normalize_numbers('6.4 sec') == 'six point four sec' 19 | 20 | 21 | def test_normalize_ordinals(): 22 | assert normalize_numbers('1st') == 'first' 23 | assert normalize_numbers('2nd') == 'second' 24 | assert normalize_numbers('9th') == 'ninth' 25 | assert normalize_numbers('243rd place') == 'two hundred and forty-third place' 26 | 27 | 28 | def test_normalize_dates(): 29 | assert normalize_numbers('1400') == 'fourteen hundred' 30 | assert normalize_numbers('1901') == 'nineteen oh one' 31 | assert normalize_numbers('1999') == 'nineteen ninety-nine' 32 | assert normalize_numbers('2000') == 'two thousand' 33 | assert normalize_numbers('2004') == 'two thousand four' 34 | assert normalize_numbers('2010') == 'twenty ten' 35 | assert normalize_numbers('2012') == 'twenty twelve' 36 | assert normalize_numbers('2025') == 
'twenty twenty-five' 37 | assert normalize_numbers('September 11, 2001') == 'September eleven, two thousand one' 38 | assert normalize_numbers('July 26, 1984.') == 'July twenty-six, nineteen eighty-four.' 39 | 40 | 41 | def test_normalize_money(): 42 | assert normalize_numbers('$0.00') == 'zero dollars' 43 | assert normalize_numbers('$1') == 'one dollar' 44 | assert normalize_numbers('$10') == 'ten dollars' 45 | assert normalize_numbers('$.01') == 'one cent' 46 | assert normalize_numbers('$0.25') == 'twenty-five cents' 47 | assert normalize_numbers('$5.00') == 'five dollars' 48 | assert normalize_numbers('$5.01') == 'five dollars, one cent' 49 | assert normalize_numbers('$135.99.') == 'one hundred thirty-five dollars, ninety-nine cents.' 50 | assert normalize_numbers('$40,000') == 'forty thousand dollars' 51 | assert normalize_numbers('for £2500!') == 'for twenty-five hundred pounds!' 52 | -------------------------------------------------------------------------------- /text/cleaners.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Cleaners are transformations that run over the input text at both training and eval time. 3 | 4 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" 5 | hyperparameter. Some cleaners are English-specific. You'll typically want to use: 6 | 1. "english_cleaners" for English text 7 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using 8 | the Unidecode library (https://pypi.python.org/pypi/Unidecode) 9 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update 10 | the symbols in symbols.py to match your data). 11 | ''' 12 | 13 | import re 14 | from unidecode import unidecode 15 | from .numbers import normalize_numbers 16 | 17 | 18 | # Regular expression matching whitespace: 19 | _whitespace_re = re.compile(r'\s+') 20 | 21 | # List of (regular expression, replacement) pairs for abbreviations: 22 | _abbreviations = [(re.compile('\\b%s\\.' 
% x[0], re.IGNORECASE), x[1]) for x in [ 23 | ('mrs', 'misess'), 24 | ('mr', 'mister'), 25 | ('dr', 'doctor'), 26 | ('st', 'saint'), 27 | ('co', 'company'), 28 | ('jr', 'junior'), 29 | ('maj', 'major'), 30 | ('gen', 'general'), 31 | ('drs', 'doctors'), 32 | ('rev', 'reverend'), 33 | ('lt', 'lieutenant'), 34 | ('hon', 'honorable'), 35 | ('sgt', 'sergeant'), 36 | ('capt', 'captain'), 37 | ('esq', 'esquire'), 38 | ('ltd', 'limited'), 39 | ('col', 'colonel'), 40 | ('ft', 'fort'), 41 | ]] 42 | 43 | 44 | def expand_abbreviations(text): 45 | for regex, replacement in _abbreviations: 46 | text = re.sub(regex, replacement, text) 47 | return text 48 | 49 | 50 | def expand_numbers(text): 51 | return normalize_numbers(text) 52 | 53 | 54 | def lowercase(text): 55 | return text.lower() 56 | 57 | 58 | def collapse_whitespace(text): 59 | return re.sub(_whitespace_re, ' ', text) 60 | 61 | 62 | def convert_to_ascii(text): 63 | return unidecode(text) 64 | 65 | 66 | def basic_cleaners(text): 67 | '''Basic pipeline that lowercases and collapses whitespace without transliteration.''' 68 | text = lowercase(text) 69 | text = collapse_whitespace(text) 70 | return text 71 | 72 | 73 | def transliteration_cleaners(text): 74 | '''Pipeline for non-English text that transliterates to ASCII.''' 75 | text = convert_to_ascii(text) 76 | text = lowercase(text) 77 | text = collapse_whitespace(text) 78 | return text 79 | 80 | 81 | def english_cleaners(text): 82 | '''Pipeline for English text, including number and abbreviation expansion.''' 83 | text = convert_to_ascii(text) 84 | text = lowercase(text) 85 | text = expand_numbers(text) 86 | text = expand_abbreviations(text) 87 | text = collapse_whitespace(text) 88 | return text 89 | -------------------------------------------------------------------------------- /datasets/blizzard.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ProcessPoolExecutor 2 | from functools import partial 3 | import numpy as np 4 | import os 5 | from hparams import hparams 6 | from util import audio 7 | 8 | 9 | _max_out_length = 700 10 | _end_buffer = 0.05 11 | _min_confidence = 90 12 | 13 | # Note: "A Tramp Abroad" & "The Man That Corrupted Hadleyburg" are higher quality than the others. 
14 | books = [ 15 | 'ATrampAbroad', 16 | 'TheManThatCorruptedHadleyburg', 17 | # 'LifeOnTheMississippi', 18 | # 'TheAdventuresOfTomSawyer', 19 | ] 20 | 21 | def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x): 22 | executor = ProcessPoolExecutor(max_workers=num_workers) 23 | futures = [] 24 | index = 1 25 | for book in books: 26 | with open(os.path.join(in_dir, book, 'sentence_index.txt')) as f: 27 | for line in f: 28 | parts = line.strip().split('\t') 29 | if line[0] is not '#' and len(parts) == 8 and float(parts[3]) > _min_confidence: 30 | wav_path = os.path.join(in_dir, book, 'wav', '%s.wav' % parts[0]) 31 | labels_path = os.path.join(in_dir, book, 'lab', '%s.lab' % parts[0]) 32 | text = parts[5] 33 | task = partial(_process_utterance, out_dir, index, wav_path, labels_path, text) 34 | futures.append(executor.submit(task)) 35 | index += 1 36 | results = [future.result() for future in tqdm(futures)] 37 | return [r for r in results if r is not None] 38 | 39 | 40 | def _process_utterance(out_dir, index, wav_path, labels_path, text): 41 | # Load the wav file and trim silence from the ends: 42 | wav = audio.load_wav(wav_path) 43 | start_offset, end_offset = _parse_labels(labels_path) 44 | start = int(start_offset * hparams.sample_rate) 45 | end = int(end_offset * hparams.sample_rate) if end_offset is not None else -1 46 | wav = wav[start:end] 47 | max_samples = _max_out_length * hparams.frame_shift_ms / 1000 * hparams.sample_rate 48 | if len(wav) > max_samples: 49 | return None 50 | spectrogram = audio.spectrogram(wav).astype(np.float32) 51 | n_frames = spectrogram.shape[1] 52 | mel_spectrogram = audio.melspectrogram(wav).astype(np.float32) 53 | spectrogram_filename = 'blizzard-spec-%05d.npy' % index 54 | mel_filename = 'blizzard-mel-%05d.npy' % index 55 | np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False) 56 | np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False) 57 | return (spectrogram_filename, mel_filename, n_frames, text) 58 | 59 | 60 | def _parse_labels(path): 61 | labels = [] 62 | with open(os.path.join(path)) as f: 63 | for line in f: 64 | parts = line.strip().split(' ') 65 | if len(parts) >= 3: 66 | labels.append((float(parts[0]), ' '.join(parts[2:]))) 67 | start = 0 68 | end = None 69 | if labels[0][1] == 'sil': 70 | start = labels[0][0] 71 | if labels[-1][1] == 'sil': 72 | end = labels[-2][0] + _end_buffer 73 | return (start, end) 74 | -------------------------------------------------------------------------------- /datasets/ljspeech.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ProcessPoolExecutor 2 | from functools import partial 3 | import numpy as np 4 | import os 5 | from util import audio 6 | 7 | 8 | def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x): 9 | '''Preprocesses the LJ Speech dataset from a given input path into a given output directory. 10 | 11 | Args: 12 | in_dir: The directory where you have downloaded the LJ Speech dataset 13 | out_dir: The directory to write the output into 14 | num_workers: Optional number of worker processes to parallelize across 15 | tqdm: You can optionally pass tqdm to get a nice progress bar 16 | 17 | Returns: 18 | A list of tuples describing the training examples. This should be written to train.txt 19 | ''' 20 | 21 | # We use ProcessPoolExecutor to parallize across processes. 
This is just an optimization and you 22 | # can omit it and just call _process_utterance on each input if you want. 23 | executor = ProcessPoolExecutor(max_workers=num_workers) 24 | futures = [] 25 | index = 1 26 | with open(os.path.join(in_dir, 'metadata.csv'), encoding='utf-8') as f: 27 | for line in f: 28 | parts = line.strip().split('|') 29 | wav_path = os.path.join(in_dir, 'wavs', '%s.wav' % parts[0]) 30 | text = parts[2] 31 | futures.append(executor.submit(partial(_process_utterance, out_dir, index, wav_path, text))) 32 | index += 1 33 | return [future.result() for future in tqdm(futures)] 34 | 35 | 36 | def _process_utterance(out_dir, index, wav_path, text): 37 | '''Preprocesses a single utterance audio/text pair. 38 | 39 | This writes the mel and linear scale spectrograms to disk and returns a tuple to write 40 | to the train.txt file. 41 | 42 | Args: 43 | out_dir: The directory to write the spectrograms into 44 | index: The numeric index to use in the spectrogram filenames. 45 | wav_path: Path to the audio file containing the speech input 46 | text: The text spoken in the input audio file 47 | 48 | Returns: 49 | A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt 50 | ''' 51 | 52 | # Load the audio to a numpy array: 53 | wav = audio.load_wav(wav_path) 54 | 55 | # Compute the linear-scale spectrogram from the wav: 56 | spectrogram = audio.spectrogram(wav).astype(np.float32) 57 | n_frames = spectrogram.shape[1] 58 | 59 | # Compute a mel-scale spectrogram from the wav: 60 | mel_spectrogram = audio.melspectrogram(wav).astype(np.float32) 61 | 62 | # Write the spectrograms to disk: 63 | spectrogram_filename = 'ljspeech-spec-%05d.npy' % index 64 | mel_filename = 'ljspeech-mel-%05d.npy' % index 65 | np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False) 66 | np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False) 67 | 68 | # Return a tuple describing this training example: 69 | return (spectrogram_filename, mel_filename, n_frames, text) 70 | -------------------------------------------------------------------------------- /TRAINING_DATA.md: -------------------------------------------------------------------------------- 1 | # Training Data 2 | 3 | 4 | This repo supports the following speech datasets: 5 | * [LJ Speech](https://keithito.com/LJ-Speech-Dataset/) (Public Domain) 6 | * [Blizzard 2012](http://www.cstr.ed.ac.uk/projects/blizzard/2012/phase_one) (Creative Commons Attribution Share-Alike) 7 | 8 | You can use any other dataset if you write a preprocessor for it. 9 | 10 | 11 | ### Writing a Preprocessor 12 | 13 | Each training example consists of: 14 | 1. The text that was spoken 15 | 2. A mel-scale spectrogram of the audio 16 | 3. A linear-scale spectrogram of the audio 17 | 18 | The preprocessor is responsible for generating these. See [ljspeech.py](datasets/ljspeech.py) for a 19 | commented example. 20 | 21 | For each training example, a preprocessor should: 22 | 23 | 1. Load the audio file: 24 | ```python 25 | wav = audio.load_wav(wav_path) 26 | ``` 27 | 28 | 2. Compute linear-scale and mel-scale spectrograms (float32 numpy arrays): 29 | ```python 30 | spectrogram = audio.spectrogram(wav).astype(np.float32) 31 | mel_spectrogram = audio.melspectrogram(wav).astype(np.float32) 32 | ``` 33 | 34 | 3. 
Save the spectrograms to disk: 35 | ```python 36 | np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False) 37 | np.save(os.path.join(out_dir, mel_spectrogram_filename), mel_spectrogram.T, allow_pickle=False) 38 | ``` 39 | Note that the transpose of the matrix returned by `audio.spectrogram` is saved so that it's 40 | in time-major format. 41 | 42 | 4. Generate a tuple `(spectrogram_filename, mel_spectrogram_filename, n_frames, text)` to 43 | write to train.txt. n_frames is just the length of the time axis of the spectrogram. 44 | 45 | 46 | After you've written your preprocessor, you can add it to [preprocess.py](preprocess.py) by 47 | following the example of the other preprocessors in that file. 48 | 49 | 50 | ### Non-English Data 51 | 52 | If your training data is in a language other than English, you will probably want to change the 53 | text cleaners by setting the `cleaners` hyperparameter. 54 | 55 | * If your text is in a Latin script or can be transliterated to ASCII using the 56 | [Unidecode](https://pypi.python.org/pypi/Unidecode) library, you can use the transliteration 57 | cleaners by setting the hyperparameter `cleaners=transliteration_cleaners`. 58 | 59 | * If you don't want to transliterate, you can define a custom character set. 60 | This allows you to train directly on the character set used in your data. 61 | 62 | To do so, edit [symbols.py](text/symbols.py) and change the `_characters` variable to be a 63 | string containing the UTF-8 characters in your data. Then set the hyperparameter `cleaners=basic_cleaners`. 64 | 65 | * If you're not sure which option to use, you can evaluate the transliteration cleaners like this: 66 | 67 | ```python 68 | from text import cleaners 69 | cleaners.transliteration_cleaners('Здравствуйте') # Replace with the text you want to try 70 | ``` 71 | -------------------------------------------------------------------------------- /eval.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import re 4 | import tensorflow as tf 5 | from hparams import hparams, hparams_debug_string 6 | from synthesizer import Synthesizer 7 | from util.txt2pinyin import text_to_pinyin 8 | 9 | # sentences = [ 10 | # # From July 8, 2017 New York Times: 11 | # 'Scientists at the CERN laboratory say they have discovered a new particle.', 12 | # 'There’s a way to measure the acute emotional intelligence that has never gone out of style.', 13 | # 'President Trump met with other leaders at the Group of 20 conference.', 14 | # 'The Senate\'s bill to repeal and replace the Affordable Care Act is now imperiled.', 15 | # # From Google's Tacotron example page: 16 | # 'Generative adversarial network or variational auto-encoder.', 17 | # 'The buses aren\'t the problem, they actually provide a solution.', 18 | # 'Does the quick brown fox jump over the lazy dog?', 19 | # 'Talib Kweli confirmed to AllHipHop that he will be releasing an album in the next year.', 20 | # ] 21 | sentence_1 = '小明硕士毕业于中国科学院计算所,后在日本京都大学深造' 22 | sentence_2 = text_to_pinyin(sentence_1) 23 | print(sentence_2) 24 | 25 | # sentences = [ 26 | # 'ta1 jing3 ti4 de5 xia4 le5 chuang2 gei3 liang3 ge5 sun1 zi5 ye4 hao3 bei4 zi5 you4 na2 guo4 yi1 ba3 da4 yi3 zi5 ba3 jie3 mei4 lia3 dang3 zhu4 gang1 zou3 dao4 ke4 ting1 jiu4 bei4 ren2 lan2 yao1 bao4 zhu4 le5', 27 | # 'wei1 xin4 zhi1 fu4 zhang1 xiao3 long2 han3 jian4 lou4 mian4 cheng1 wei1 xin4 bu4 hui4 cha2 kan4 yong4 hu4 liao2 tian1 ji4 lu4 yi4 si an4 feng4 zhi1 fu4 bao3 
, ben3 wen2 lai2 zi4 teng2 xun4 ke1 ji4 .', 28 | # 'da4 hui4 zhi3 re4 nao5 tou2 liang3 tian1 yue4 hou4 yue4 song1 kua3 zui4 zhong1 chu1 ben3 lun4 wen2 ji2 jiu4 suan4 yuan2 man3 wan2 cheng2 ren4 wu5', 29 | # 'lian2 dui4 zhi3 liu2 xia4 yi4 ming2 zhi2 ban1 yuan2 chui1 shi4 yuan2 si4 yang3 yuan2 wei4 sheng1 yuan2 deng3 ye3 lie4 dui4 pao3 bu4 gan2 wang3 zai1 qu1', 30 | # 'yi1 jiu3 wu3 ling2 nian2 ba1 yue4 zhong1 yang1 ren2 min2 zheng4 fu3 zheng4 wu4 yuan4 ban1 bu4 le5 bao3 zhang4 fa1 ming2 quan2 yu3 zhuan1 li4 quan2 zan4 xing2 tiao2 li4', 31 | # ] 32 | sentences = [sentence_2] 33 | 34 | 35 | def get_output_base_path(checkpoint_path): 36 | base_dir = os.path.dirname(checkpoint_path) 37 | m = re.compile(r'.*?\.ckpt\-([0-9]+)').match(checkpoint_path) 38 | name = 'eval-%d' % int(m.group(1)) if m else 'eval' 39 | return os.path.join(base_dir, name) 40 | 41 | 42 | def run_eval(args): 43 | print(hparams_debug_string()) 44 | synth = Synthesizer() 45 | synth.load(args.checkpoint) 46 | base_path = get_output_base_path(args.checkpoint) 47 | for i, text in enumerate(sentences): 48 | path = '%s-%03d.wav' % (base_path, i) 49 | print('Synthesizing: %s' % path) 50 | with open(path, 'wb') as f: 51 | f.write(synth.synthesize(text)) 52 | 53 | 54 | def main(): 55 | parser = argparse.ArgumentParser() 56 | parser.add_argument('--checkpoint', required=True, help='Path to model checkpoint') 57 | parser.add_argument('--hparams', default='', 58 | help='Hyperparameter overrides as a comma-separated list of name=value pairs') 59 | args = parser.parse_args() 60 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 61 | os.environ['CUDA_VISIBLE_DEVICES'] = '1' 62 | hparams.parse(args.hparams) 63 | run_eval(args) 64 | 65 | 66 | if __name__ == '__main__': 67 | main() 68 | -------------------------------------------------------------------------------- /datasets/thchs30.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ProcessPoolExecutor 2 | from functools import partial 3 | import numpy as np 4 | import os 5 | import glob 6 | from util import audio 7 | from hparams import hparams as hp 8 | 9 | 10 | def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x): 11 | '''Preprocesses the THCHS30 dataset from a given input path into a given output directory. 12 | 13 | Args: 14 | in_dir: The directory where you have downloaded the THCHS30 dataset 15 | out_dir: The directory to write the output into 16 | num_workers: Optional number of worker processes to parallelize across 17 | tqdm: You can optionally pass tqdm to get a nice progress bar 18 | 19 | Returns: 20 | A list of tuples describing the training examples. This should be written to train.txt 21 | ''' 22 | 23 | # We use ProcessPoolExecutor to parallize across processes. This is just an optimization and you 24 | # can omit it and just call _process_utterance on each input if you want. 
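  # A rough serial equivalent of the loop below would be (hedged sketch, not part of this repo):
  #   metadata = []
  #   for i, trn in enumerate(glob.glob(os.path.join(in_dir, 'data', '*.trn')), 1):
  #       with open(trn) as f:
  #           metadata.append(_process_utterance(out_dir, i, trn[:-4], f.readline().strip('\n')))
  #   return [m for m in metadata if m is not None]
  # (Translation of the Chinese comments below: "标贝数据集" = "Biaobei dataset";
  #  "若是单独训练train文件则data改成train" = "if training on the train/ split alone, change 'data' to 'train'".)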
25 | executor = ProcessPoolExecutor(max_workers=num_workers) 26 | futures = [] 27 | index = 1 28 | 29 | # trn_files = glob.glob(os.path.join(in_dir, 'biaobei_48000', '*.trn')) # 标贝数据集 30 | trn_files = glob.glob(os.path.join(in_dir, 'data', '*.trn')) # 若是单独训练train文件则data改成train 31 | print("trn_files:",trn_files) 32 | for trn in trn_files: 33 | # print("trn:",trn) 34 | with open(trn) as f: 35 | pinyin = f.readline().strip('\n') 36 | # wav_file = trn[:-4] + '.wav' # 标贝数据集 37 | wav_file = trn[:-4] 38 | print("wav_file:",wav_file) 39 | task = partial(_process_utterance, out_dir, index, wav_file, pinyin) 40 | futures.append(executor.submit(task)) 41 | index += 1 42 | return [future.result() for future in tqdm(futures) if future.result() is not None] 43 | 44 | 45 | def _process_utterance(out_dir, index, wav_path, pinyin): 46 | '''Preprocesses a single utterance audio/text pair. 47 | 48 | This writes the mel and linear scale spectrograms to disk and returns a tuple to write 49 | to the train.txt file. 50 | 51 | Args: 52 | out_dir: The directory to write the spectrograms into 53 | index: The numeric index to use in the spectrogram filenames. 54 | wav_path: Path to the audio file containing the speech input 55 | pinyin: The pinyin of Chinese spoken in the input audio file 56 | 57 | Returns: 58 | A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt 59 | ''' 60 | 61 | # Load the audio to a numpy array: 62 | wav = audio.load_wav(wav_path) 63 | 64 | # rescale wav for unified measure for all clips 65 | wav = wav / np.abs(wav).max() * 0.999 66 | 67 | # trim silence 68 | wav = audio.trim_silence(wav) 69 | 70 | # Compute the linear-scale spectrogram from the wav: 71 | spectrogram = audio.spectrogram(wav).astype(np.float32) 72 | n_frames = spectrogram.shape[1] 73 | if n_frames > hp.max_frame_num: 74 | return None 75 | 76 | # Compute a mel-scale spectrogram from the wav: 77 | mel_spectrogram = audio.melspectrogram(wav).astype(np.float32) 78 | 79 | # Write the spectrograms to disk: 80 | spectrogram_filename = 'thchs30-spec-%05d.npy' % index 81 | mel_filename = 'thchs30-mel-%05d.npy' % index 82 | np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False) 83 | np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False) 84 | 85 | # Return a tuple describing this training example: 86 | return (spectrogram_filename, mel_filename, n_frames, pinyin) 87 | -------------------------------------------------------------------------------- /models/modules.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.contrib.rnn import GRUCell 3 | 4 | 5 | def prenet(inputs, is_training, layer_sizes, scope=None): 6 | x = inputs 7 | drop_rate = 0.5 if is_training else 0.0 8 | with tf.variable_scope(scope or 'prenet'): 9 | for i, size in enumerate(layer_sizes): 10 | dense = tf.layers.dense(x, units=size, activation=tf.nn.relu, name='dense_%d' % (i+1)) 11 | x = tf.layers.dropout(dense, rate=drop_rate, training=is_training, name='dropout_%d' % (i+1)) 12 | return x 13 | 14 | 15 | def encoder_cbhg(inputs, input_lengths, is_training, depth): 16 | input_channels = inputs.get_shape()[2] 17 | return cbhg( 18 | inputs, 19 | input_lengths, 20 | is_training, 21 | scope='encoder_cbhg', 22 | K=16, 23 | projections=[128, input_channels], 24 | depth=depth) 25 | 26 | 27 | def post_cbhg(inputs, input_dim, is_training, depth): 28 | return cbhg( 29 | inputs, 30 | None, 31 | is_training, 32 | 
scope='post_cbhg', 33 | K=8, 34 | projections=[256, input_dim], 35 | depth=depth) 36 | 37 | 38 | def cbhg(inputs, input_lengths, is_training, scope, K, projections, depth): 39 | with tf.variable_scope(scope): 40 | with tf.variable_scope('conv_bank'): 41 | # Convolution bank: concatenate on the last axis to stack channels from all convolutions 42 | conv_outputs = tf.concat( 43 | [conv1d(inputs, k, 128, tf.nn.relu, is_training, 'conv1d_%d' % k) for k in range(1, K+1)], 44 | axis=-1 45 | ) 46 | 47 | # Maxpooling: 48 | maxpool_output = tf.layers.max_pooling1d( 49 | conv_outputs, 50 | pool_size=2, 51 | strides=1, 52 | padding='same') 53 | 54 | # Two projection layers: 55 | proj1_output = conv1d(maxpool_output, 3, projections[0], tf.nn.relu, is_training, 'proj_1') 56 | proj2_output = conv1d(proj1_output, 3, projections[1], lambda _:_, is_training, 'proj_2') 57 | 58 | # Residual connection: 59 | highway_input = proj2_output + inputs 60 | 61 | half_depth = depth // 2 62 | assert half_depth*2 == depth, 'encoder and postnet depths must be even.' 63 | 64 | # Handle dimensionality mismatch: 65 | if highway_input.shape[2] != half_depth: 66 | highway_input = tf.layers.dense(highway_input, half_depth) 67 | 68 | # 4-layer HighwayNet: 69 | for i in range(4): 70 | highway_input = highwaynet(highway_input, 'highway_%d' % (i+1), half_depth) 71 | rnn_input = highway_input 72 | 73 | # Bidirectional RNN 74 | outputs, states = tf.nn.bidirectional_dynamic_rnn( 75 | GRUCell(half_depth), 76 | GRUCell(half_depth), 77 | rnn_input, 78 | sequence_length=input_lengths, 79 | dtype=tf.float32) 80 | return tf.concat(outputs, axis=2) # Concat forward and backward 81 | 82 | 83 | def highwaynet(inputs, scope, depth): 84 | with tf.variable_scope(scope): 85 | H = tf.layers.dense( 86 | inputs, 87 | units=depth, 88 | activation=tf.nn.relu, 89 | name='H') 90 | T = tf.layers.dense( 91 | inputs, 92 | units=depth, 93 | activation=tf.nn.sigmoid, 94 | name='T', 95 | bias_initializer=tf.constant_initializer(-1.0)) 96 | return H * T + inputs * (1.0 - T) 97 | 98 | 99 | def conv1d(inputs, kernel_size, channels, activation, is_training, scope): 100 | with tf.variable_scope(scope): 101 | conv1d_output = tf.layers.conv1d( 102 | inputs, 103 | filters=channels, 104 | kernel_size=kernel_size, 105 | activation=None, 106 | padding='same') 107 | batched = tf.layers.batch_normalization(conv1d_output, training=is_training) 108 | return activation(batched) 109 | -------------------------------------------------------------------------------- /demo_server.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import falcon 3 | from hparams import hparams, hparams_debug_string 4 | import os 5 | from synthesizer import Synthesizer 6 | from util.txt2pinyin import text_to_pinyin 7 | 8 | 9 | html_body = '''mandarin_tacotron Demo 10 | 21 | 22 |
[The HTML/CSS/JavaScript markup of this demo page (original demo_server.py lines 10-58) was stripped during extraction; only stray line numbers remain. Judging from the SynthesisResource handler below, the page is a simple text-input form that issues GET /synthesize?text=... and plays back the returned WAV audio.]
27 | 28 | 58 | ''' 59 | 60 | 61 | class UIResource: 62 | def on_get(self, req, res): 63 | res.content_type = 'text/html' 64 | res.body = html_body 65 | 66 | 67 | class SynthesisResource: 68 | def on_get(self, req, res): 69 | if not req.params.get('text'): 70 | raise falcon.HTTPBadRequest() 71 | get_text = req.params.get('text') 72 | print("get_text:", get_text) 73 | print("get_text类型:", type(get_text)) 74 | sentence = text_to_pinyin(get_text) 75 | print("sentence:", sentence) 76 | print("sentence类型:", type(sentence)) 77 | # res.data = synthesizer.synthesize(req.params.get('text')) 78 | res.data = synthesizer.synthesize(sentence) 79 | res.content_type = 'audio/wav' 80 | 81 | 82 | synthesizer = Synthesizer() 83 | api = falcon.API() 84 | api.add_route('/synthesize', SynthesisResource()) 85 | api.add_route('/', UIResource()) 86 | 87 | 88 | if __name__ == '__main__': 89 | from wsgiref import simple_server 90 | parser = argparse.ArgumentParser() 91 | parser.add_argument('--checkpoint', required=True, help='Full path to model checkpoint') 92 | parser.add_argument('--port', type=int, default=9000) 93 | parser.add_argument('--hparams', default='', 94 | help='Hyperparameter overrides as a comma-separated list of name=value pairs') 95 | args = parser.parse_args() 96 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 97 | hparams.parse(args.hparams) 98 | print(hparams_debug_string()) 99 | synthesizer.load(args.checkpoint) 100 | print('Serving on port %d' % args.port) 101 | simple_server.make_server('0.0.0.0', args.port, api).serve_forever() 102 | else: 103 | synthesizer.load(os.environ['CHECKPOINT']) 104 | -------------------------------------------------------------------------------- /models/helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from tensorflow.contrib.seq2seq import Helper 4 | 5 | 6 | # Adapted from tf.contrib.seq2seq.GreedyEmbeddingHelper 7 | class TacoTestHelper(Helper): 8 | def __init__(self, batch_size, output_dim, r): 9 | with tf.name_scope('TacoTestHelper'): 10 | self._batch_size = batch_size 11 | self._output_dim = output_dim 12 | self._reduction_factor = r 13 | 14 | @property 15 | def batch_size(self): 16 | return self._batch_size 17 | 18 | @property 19 | def token_output_size(self): 20 | return self._reduction_factor 21 | 22 | @property 23 | def sample_ids_shape(self): 24 | return tf.TensorShape([]) 25 | 26 | @property 27 | def sample_ids_dtype(self): 28 | return np.int32 29 | 30 | def initialize(self, name=None): 31 | return (tf.tile([False], [self._batch_size]), _go_frames(self._batch_size, self._output_dim)) 32 | 33 | def sample(self, time, outputs, state, name=None): 34 | return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them 35 | 36 | def next_inputs(self, time, outputs, state, sample_ids, stop_token_preds, name=None): 37 | '''Stop on EOS. Otherwise, pass the last output as the next input and pass through state.''' 38 | with tf.name_scope('TacoTestHelper'): 39 | # A sequence is finished when the stop token probability is > 0.5 40 | # With enough training steps, the model should be able to predict when to stop correctly 41 | # and the use of stop_at_any = True would be recommended. If however the model didn't 42 | # learn to stop correctly yet, (stops too soon) one could choose to use the safer option 43 | # to get a correct synthesis 44 | finished = tf.reduce_any(tf.cast(tf.round(stop_token_preds), tf.bool)) 45 | 46 | # Feed last output frame as next input. 
outputs is [N, output_dim * r] 47 | next_inputs = outputs[:, -self._output_dim:] 48 | return (finished, next_inputs, state) 49 | 50 | 51 | class TacoTrainingHelper(Helper): 52 | def __init__(self, inputs, targets, output_dim, r, global_step): 53 | # inputs is [N, T_in], targets is [N, T_out, D] 54 | with tf.name_scope('TacoTrainingHelper'): 55 | self._batch_size = tf.shape(inputs)[0] 56 | self._output_dim = output_dim 57 | self._reduction_factor = r 58 | self._ratio = tf.convert_to_tensor(1.) 59 | self.global_step = global_step 60 | 61 | # Feed every r-th target frame as input 62 | self._targets = targets[:, r-1::r, :] 63 | 64 | # Use full length for every target because we don't want to mask the padding frames 65 | num_steps = tf.shape(self._targets)[1] 66 | self._lengths = tf.tile([num_steps], [self._batch_size]) 67 | 68 | @property 69 | def batch_size(self): 70 | return self._batch_size 71 | 72 | @property 73 | def token_output_size(self): 74 | return self._reduction_factor 75 | 76 | @property 77 | def sample_ids_shape(self): 78 | return tf.TensorShape([]) 79 | 80 | @property 81 | def sample_ids_dtype(self): 82 | return np.int32 83 | 84 | def initialize(self, name=None): 85 | self._ratio = _teacher_forcing_ratio_decay(1., self.global_step) 86 | return (tf.tile([False], [self._batch_size]), _go_frames(self._batch_size, self._output_dim)) 87 | 88 | def sample(self, time, outputs, state, name=None): 89 | return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them 90 | 91 | def next_inputs(self, time, outputs, state, sample_ids, stop_token_preds, name=None): 92 | with tf.name_scope(name or 'TacoTrainingHelper'): 93 | finished = (time + 1 >= self._lengths) 94 | 95 | #Pick previous outputs randomly with respect to teacher forcing ratio 96 | next_inputs = tf.cond(tf.less(tf.random_uniform([], minval=0, maxval=1, dtype=tf.float32), self._ratio), 97 | lambda: self._targets[:, time, :], #Teacher-forcing: return true frame 98 | lambda: outputs[:,-self._output_dim:]) 99 | 100 | # next_inputs = self._targets[:, time, :] # Teacher forcing: feed the true frame 101 | return (finished, next_inputs, state) 102 | 103 | 104 | def _go_frames(batch_size, output_dim): 105 | '''Returns all-zero frames for a given batch size and output dimension''' 106 | return tf.tile([[0.0]], [batch_size, output_dim]) 107 | 108 | def _teacher_forcing_ratio_decay(init_tfr, global_step): 109 | ################################################################# 110 | # Narrow Cosine Decay: 111 | 112 | # Phase 1: tfr = 1 113 | # We only start learning rate decay after 10k steps 114 | 115 | # Phase 2: tfr in ]0, 1[ 116 | # decay reach minimal value at step ~280k 117 | 118 | # Phase 3: tfr = 0 119 | # clip by minimal teacher forcing ratio value (step >~ 280k) 120 | ################################################################# 121 | #Compute natural cosine decay 122 | tfr = tf.train.cosine_decay(init_tfr, 123 | global_step=global_step - 20000, #tfr = 1 at step 10k 124 | decay_steps=200000, #tfr = 0 at step ~280k 125 | alpha=0., #tfr = 0% of init_tfr as final value 126 | name='tfr_cosine_decay') 127 | 128 | #force teacher forcing ratio to take initial value when global step < start decay step. 
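  # (With the constants above, the ratio stays at init_tfr until step 20k and reaches 0 around
  # step 220k (20k offset + 200k decay steps); the 10k / ~280k figures in the phase comments are
  # approximate.)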
129 | narrow_tfr = tf.cond( 130 | tf.less(global_step, tf.convert_to_tensor(20000)), 131 | lambda: tf.convert_to_tensor(init_tfr), 132 | lambda: tfr) 133 | 134 | return narrow_tfr 135 | -------------------------------------------------------------------------------- /models/custom_decoder.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import collections 6 | import tensorflow as tf 7 | 8 | from tensorflow.contrib.seq2seq.python.ops import decoder 9 | from tensorflow.contrib.seq2seq.python.ops import helper as helper_py 10 | from tensorflow.python.framework import ops 11 | from tensorflow.python.framework import tensor_shape 12 | from tensorflow.python.layers import base as layers_base 13 | from tensorflow.python.ops import rnn_cell_impl 14 | from tensorflow.python.util import nest 15 | from .helpers import TacoTrainingHelper, TacoTestHelper 16 | 17 | 18 | class CustomDecoderOutput( 19 | collections.namedtuple("CustomDecoderOutput", ("rnn_output", "token_output", "sample_id"))): 20 | pass 21 | 22 | 23 | class CustomDecoder(decoder.Decoder): 24 | """Custom sampling decoder. 25 | 26 | Allows for stop token prediction at inference time 27 | and returns equivalent loss in training time. 28 | 29 | Note: 30 | Only use this decoder with Tacotron 2 as it only accepts tacotron custom helpers 31 | """ 32 | 33 | def __init__(self, cell, helper, initial_state, output_layer=None): 34 | """Initialize CustomDecoder. 35 | Args: 36 | cell: An `RNNCell` instance. 37 | helper: A `Helper` instance. 38 | initial_state: A (possibly nested tuple of...) tensors and TensorArrays. 39 | The initial state of the RNNCell. 40 | output_layer: (Optional) An instance of `tf.layers.Layer`, i.e., 41 | `tf.layers.Dense`. Optional layer to apply to the RNN output prior 42 | to storing the result or sampling. 43 | Raises: 44 | TypeError: if `cell`, `helper` or `output_layer` have an incorrect type. 45 | """ 46 | rnn_cell_impl.assert_like_rnncell(type(cell), cell) 47 | if not isinstance(helper, helper_py.Helper): 48 | raise TypeError("helper must be a Helper, received: %s" % type(helper)) 49 | if (output_layer is not None 50 | and not isinstance(output_layer, layers_base.Layer)): 51 | raise TypeError( 52 | "output_layer must be a Layer, received: %s" % type(output_layer)) 53 | self._cell = cell 54 | self._helper = helper 55 | self._initial_state = initial_state 56 | self._output_layer = output_layer 57 | 58 | @property 59 | def batch_size(self): 60 | return self._helper.batch_size 61 | 62 | def _rnn_output_size(self): 63 | size = self._cell.output_size 64 | if self._output_layer is None: 65 | return size 66 | else: 67 | # To use layer's compute_output_shape, we need to convert the 68 | # RNNCell's output_size entries into shapes with an unknown 69 | # batch size. We then pass this through the layer's 70 | # compute_output_shape and read off all but the first (batch) 71 | # dimensions to get the output size of the rnn with the layer 72 | # applied to the top. 
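      # Note (assumption): newer TensorFlow 1.x releases expose this functionality as the
      # public compute_output_shape() method; if the private _compute_output_shape call
      # below raises an AttributeError, switching to the public name should be equivalent.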
73 | output_shape_with_unknown_batch = nest.map_structure( 74 | lambda s: tensor_shape.TensorShape([None]).concatenate(s), 75 | size) 76 | layer_output_shape = self._output_layer._compute_output_shape( # pylint: disable=protected-access 77 | output_shape_with_unknown_batch) 78 | return nest.map_structure(lambda s: s[1:], layer_output_shape) 79 | 80 | @property 81 | def output_size(self): 82 | # Return the cell output and the id 83 | return CustomDecoderOutput( 84 | rnn_output=self._rnn_output_size(), 85 | token_output=self._helper.token_output_size, 86 | sample_id=self._helper.sample_ids_shape) 87 | 88 | @property 89 | def output_dtype(self): 90 | # Assume the dtype of the cell is the output_size structure 91 | # containing the input_state's first component's dtype. 92 | # Return that structure and the sample_ids_dtype from the helper. 93 | dtype = nest.flatten(self._initial_state)[0].dtype 94 | return CustomDecoderOutput( 95 | nest.map_structure(lambda _: dtype, self._rnn_output_size()), 96 | tf.float32, 97 | self._helper.sample_ids_dtype) 98 | 99 | def initialize(self, name=None): 100 | """Initialize the decoder. 101 | Args: 102 | name: Name scope for any created operations. 103 | Returns: 104 | `(finished, first_inputs, initial_state)`. 105 | """ 106 | return self._helper.initialize() + (self._initial_state,) 107 | 108 | def step(self, time, inputs, state, name=None): 109 | """Perform a custom decoding step. 110 | Enables for dyanmic prediction 111 | Args: 112 | time: scalar `int32` tensor. 113 | inputs: A (structure of) input tensors. 114 | state: A (structure of) state tensors and TensorArrays. 115 | name: Name scope for any created operations. 116 | Returns: 117 | `(outputs, next_state, next_inputs, finished)`. 118 | """ 119 | with ops.name_scope(name, "CustomDecoderStep", (time, inputs, state)): 120 | #Call outputprojection wrapper cell 121 | (cell_outputs, stop_token), cell_state = self._cell(inputs, state) 122 | 123 | #apply output_layer (if existant) 124 | if self._output_layer is not None: 125 | cell_outputs = self._output_layer(cell_outputs) 126 | sample_ids = self._helper.sample( 127 | time=time, outputs=cell_outputs, state=cell_state) 128 | 129 | (finished, next_inputs, next_state) = self._helper.next_inputs( 130 | time=time, 131 | outputs=cell_outputs, 132 | state=cell_state, 133 | sample_ids=sample_ids, 134 | stop_token_preds=stop_token) 135 | 136 | outputs = CustomDecoderOutput(cell_outputs, stop_token, sample_ids) 137 | return (outputs, next_state, next_inputs, finished) 138 | -------------------------------------------------------------------------------- /util/audio.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import librosa.filters 3 | import math 4 | import numpy as np 5 | import tensorflow as tf 6 | from scipy import signal 7 | from scipy.io import wavfile 8 | from hparams import hparams 9 | 10 | 11 | def load_wav(path): 12 | return librosa.core.load(path, sr=hparams.sample_rate)[0] 13 | 14 | 15 | def save_wav(wav, path): 16 | # rescaling for unified measure for all clips 17 | wav = wav / np.abs(wav).max() * 0.999 18 | # factor 0.5 in case of overflow for int16 19 | f1 = 0.5 * 32767 / max(0.01, np.max(np.abs(wav))) 20 | # sublinear scaling as Y ~ X ^ k (k < 1) 21 | f2 = np.sign(wav) * np.power(np.abs(wav), 0.8) 22 | wav = f1 * f2 23 | # bandpass for less noises 24 | firwin = signal.firwin(hparams.num_freq, [hparams.fmin, hparams.fmax], pass_zero=False, fs=hparams.sample_rate) 25 | wav = 
signal.convolve(wav, firwin) 26 | 27 | wavfile.write(path, hparams.sample_rate, wav.astype(np.int16)) 28 | 29 | 30 | def trim_silence(wav): 31 | return librosa.effects.trim(wav, top_db= 60, frame_length=512, hop_length=128)[0] 32 | 33 | 34 | def preemphasis(x): 35 | return signal.lfilter([1, -hparams.preemphasis], [1], x) 36 | 37 | 38 | def inv_preemphasis(x): 39 | return signal.lfilter([1], [1, -hparams.preemphasis], x) 40 | 41 | 42 | def spectrogram(y): 43 | D = _stft(preemphasis(y)) 44 | S = _amp_to_db(np.abs(D)) - hparams.ref_level_db 45 | return _normalize(S) 46 | 47 | 48 | def inv_spectrogram(spectrogram): 49 | '''Converts spectrogram to waveform using librosa''' 50 | S = _db_to_amp(_denormalize(spectrogram) + hparams.ref_level_db) # Convert back to linear 51 | return inv_preemphasis(_griffin_lim(S ** hparams.power)) # Reconstruct phase 52 | 53 | 54 | def inv_spectrogram_tensorflow(spectrogram): 55 | '''Builds computational graph to convert spectrogram to waveform using TensorFlow. 56 | 57 | Unlike inv_spectrogram, this does NOT invert the preemphasis. The caller should call 58 | inv_preemphasis on the output after running the graph. 59 | ''' 60 | S = _db_to_amp_tensorflow(_denormalize_tensorflow(spectrogram) + hparams.ref_level_db) 61 | return _griffin_lim_tensorflow(tf.pow(S, hparams.power)) 62 | 63 | 64 | def melspectrogram(y): 65 | D = _stft(preemphasis(y)) 66 | S = _amp_to_db(_linear_to_mel(np.abs(D))) - hparams.ref_level_db 67 | return _normalize(S) 68 | 69 | 70 | def find_endpoint(wav, threshold_db=-40, min_silence_sec=0.8): 71 | window_length = int(hparams.sample_rate * min_silence_sec) 72 | hop_length = int(window_length / 4) 73 | threshold = _db_to_amp(threshold_db) 74 | for x in range(hop_length, len(wav) - window_length, hop_length): 75 | if np.max(wav[x:x+window_length]) < threshold: 76 | return x + hop_length 77 | return len(wav) 78 | 79 | 80 | def _griffin_lim(S): 81 | '''librosa implementation of Griffin-Lim 82 | Based on https://github.com/librosa/librosa/issues/434 83 | ''' 84 | angles = np.exp(2j * np.pi * np.random.rand(*S.shape)) 85 | S_complex = np.abs(S).astype(np.complex) 86 | y = _istft(S_complex * angles) 87 | for i in range(hparams.griffin_lim_iters): 88 | angles = np.exp(1j * np.angle(_stft(y))) 89 | y = _istft(S_complex * angles) 90 | return y 91 | 92 | 93 | def _griffin_lim_tensorflow(S): 94 | '''TensorFlow implementation of Griffin-Lim 95 | Based on https://github.com/Kyubyong/tensorflow-exercises/blob/master/Audio_Processing.ipynb 96 | ''' 97 | with tf.variable_scope('griffinlim'): 98 | # TensorFlow's stft and istft operate on a batch of spectrograms; create batch of size 1 99 | S = tf.expand_dims(S, 0) 100 | S_complex = tf.identity(tf.cast(S, dtype=tf.complex64)) 101 | y = _istft_tensorflow(S_complex) 102 | for i in range(hparams.griffin_lim_iters): 103 | est = _stft_tensorflow(y) 104 | angles = est / tf.cast(tf.maximum(1e-8, tf.abs(est)), tf.complex64) 105 | y = _istft_tensorflow(S_complex * angles) 106 | return tf.squeeze(y, 0) 107 | 108 | 109 | def _stft(y): 110 | n_fft, hop_length, win_length = _stft_parameters() 111 | return librosa.stft(y=y, n_fft=n_fft, hop_length=hop_length, win_length=win_length) 112 | 113 | 114 | def _istft(y): 115 | _, hop_length, win_length = _stft_parameters() 116 | return librosa.istft(y, hop_length=hop_length, win_length=win_length) 117 | 118 | 119 | def _stft_tensorflow(signals): 120 | n_fft, hop_length, win_length = _stft_parameters() 121 | return tf.contrib.signal.stft(signals, win_length, hop_length, n_fft, 
pad_end=False) 122 | 123 | 124 | def _istft_tensorflow(stfts): 125 | n_fft, hop_length, win_length = _stft_parameters() 126 | return tf.contrib.signal.inverse_stft(stfts, win_length, hop_length, n_fft) 127 | 128 | 129 | def _stft_parameters(): 130 | n_fft = (hparams.num_freq - 1) * 2 131 | hop_length = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate) 132 | win_length = int(hparams.frame_length_ms / 1000 * hparams.sample_rate) 133 | return n_fft, hop_length, win_length 134 | 135 | 136 | # Conversions: 137 | 138 | _mel_basis = None 139 | 140 | def _linear_to_mel(spectrogram): 141 | global _mel_basis 142 | if _mel_basis is None: 143 | _mel_basis = _build_mel_basis() 144 | return np.dot(_mel_basis, spectrogram) 145 | 146 | def _build_mel_basis(): 147 | n_fft = (hparams.num_freq - 1) * 2 148 | assert hparams.fmax < hparams.sample_rate // 2 149 | return librosa.filters.mel(hparams.sample_rate, n_fft, n_mels=hparams.num_mels, fmin=hparams.fmin, fmax=hparams.fmax) 150 | 151 | def _amp_to_db(x): 152 | return 20 * np.log10(np.maximum(1e-5, x)) 153 | 154 | def _db_to_amp(x): 155 | return np.power(10.0, x * 0.05) 156 | 157 | def _db_to_amp_tensorflow(x): 158 | return tf.pow(tf.ones(tf.shape(x)) * 10.0, x * 0.05) 159 | 160 | def _normalize(S): 161 | # symmetric mels 162 | return 2 * hparams.max_abs_value * ((S - hparams.min_level_db) / -hparams.min_level_db) - hparams.max_abs_value 163 | 164 | def _denormalize(S): 165 | # symmetric mels 166 | return ((S + hparams.max_abs_value) * -hparams.min_level_db) / (2 * hparams.max_abs_value) + hparams.min_level_db 167 | 168 | def _denormalize_tensorflow(S): 169 | # symmetric mels 170 | return ((S + hparams.max_abs_value) * -hparams.min_level_db) / (2 * hparams.max_abs_value) + hparams.min_level_db 171 | -------------------------------------------------------------------------------- /util/txt2pinyin.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: UTF-8 -*- 2 | from __future__ import unicode_literals 3 | import sys 4 | import re 5 | from pypinyin import pinyin, Style, load_phrases_dict 6 | import jieba 7 | 8 | consonant_list = ['b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 9 | 'h', 'j', 'q', 'x', 'zh', 'ch', 'sh', 'r', 'z', 10 | 'c', 's', 'y', 'w'] 11 | 12 | TRANSFORM_DICT = {'ju':'jv', 'qu':'qv', 'xu':'xv', 'zi':'zic', 13 | 'ci':'cic', 'si':'sic', 'zhi':'zhih', 14 | 'chi':'chih', 'shi':'shih', 'ri':'rih', 15 | 'yuan':'yvan', 'yue':'yve', 'yun':'yvn', 16 | 'quan':'qvan','xuan':'xvan','juan':'jvan', 17 | 'qun':'qvn','xun':'xvn', 'jun':'jvn', 18 | 'iu':'iou', 'ui':'uei', 'un':'uen', 19 | 'ya':'yia', 'ye':'yie', 'yao':'yiao', 20 | 'you':'yiou', 'yan':'yian', 'yin':'yin', 21 | 'yang':'yiang', 'ying':'ying', 'yong':'yiong', 22 | 'wa':'wua', 'wo':'wuo', 'wai':'wuai', 23 | 'wei':'wuei', 'wan':'wuan', 'wen':'wuen', 24 | 'weng':'wueng', 'wang':'wuang'} 25 | 26 | translate_dict = {'ju':'jv', 'qu':'qv', 'xu':'xv', 'zi':'zic', 27 | 'ci':'cic', 'si':'sic', 'zhi':'zhih', 28 | 'chi':'chih', 'shi':'shih', 'ri':'rih', 29 | 'yuan':'yvan', 'yue':'yve', 'yun':'yvn', 30 | 'quan':'qvan','xuan':'xvan','juan':'jvan', 31 | 'qun':'qvn','xun':'xvn', 'jun':'jvn', 32 | 'iu':'iou', 'ui':'uei', 'un':'uen'} 33 | # phone-set with y w, this is the default phone set 34 | translate_dict_more = {'ya':'yia', 'ye':'yie', 'yao':'yiao', 35 | 'you':'yiou', 'yan':'yian', 'yin':'yin', 36 | 'yang':'yiang', 'ying':'ying', 'yong':'yiong', 37 | 'wa':'wua', 'wo':'wuo', 'wai':'wuai', 38 | 'wei':'wuei', 'wan':'wuan', 'wen':'wuen', 39 | 
'weng':'wueng', 'wang':'wuang'} 40 | # phone-set without y w 41 | translate_dict_less = {'ya':'ia', 'ye':'ie', 'yao':'iao', 42 | 'you':'iou', 'yan':'ian', 'yin':'in', 43 | 'yang':'iang', 'ying':'ing', 'yong':'iong', 44 | 'yvan':'van', 'yve':'ve', 'yvn':'vn', 45 | 'wa':'ua', 'wo':'uo', 'wai':'uai', 46 | 'wei':'uei', 'wan':'uan', 'wen':'uen', 47 | 'weng':'ueng', 'wang':'uang'} 48 | 49 | def _pre_pinyin_setting(): 50 | ''' fix pinyin error''' 51 | load_phrases_dict({'嗯':[['ēn']]}) 52 | 53 | _pre_pinyin_setting() 54 | 55 | def pinyinformat(syllable): 56 | '''format pinyin to mtts's format''' 57 | if not syllable[-1].isdigit(): 58 | syllable = syllable + '5' 59 | assert syllable[-1].isdigit() 60 | syl_no_tone = syllable[:-1] 61 | if syl_no_tone in TRANSFORM_DICT: 62 | syllable = syllable.replace(syl_no_tone, TRANSFORM_DICT[syl_no_tone]) 63 | return syllable 64 | 65 | """ 66 | for key, value in translate_dict.items(): 67 | syllable = syllable.replace(key, value) 68 | for key, value in translate_dict_more.items(): 69 | syllable = syllable.replace(key, value) 70 | if not syllable[-1].isdigit(): 71 | syllable = syllable + '5' 72 | return syllable 73 | """ 74 | def seprate_syllable(syllable): 75 | '''seprate syllable to consonant + ' ' + vowel ''' 76 | assert syllable[-1].isdigit() 77 | if syllable[0:2] in consonant_list: 78 | #return syllable[0:2].encode('utf-8'),syllable[2:].encode('utf-8') 79 | return syllable[0:2], syllable[2:] 80 | elif syllable[0] in consonant_list: 81 | #return syllable[0].encode('utf-8'),syllable[1:].encode('utf-8') 82 | return syllable[0], syllable[1:] 83 | else: 84 | #return (syllable.encode('utf-8'),) 85 | return (syllable,) 86 | 87 | 88 | def txt2pinyin(txt): 89 | phone_list = [] 90 | ''' 91 | if isinstance(txt, str): 92 | pinyin_list = pinyin(unicode(txt,'utf-8'), style = Style.TONE3) 93 | elif isinstance(txt, unicode): 94 | pinyin_list = pinyin(txt, style = Style.TONE3) 95 | else: 96 | print('error: unsupport coding form') 97 | ''' 98 | 99 | pinyin_list = pinyin(txt, style = Style.TONE3) 100 | for item in pinyin_list: 101 | phone_list.append(seprate_syllable(pinyinformat(item[0]))) 102 | return phone_list 103 | 104 | """ 105 | objective: 去除句子中的标点符号 106 | input: 107 | text:输入有标点符号的句子。例如:"想做/ 兼_职/学生_/ 的 、加,我Q: 1 5. 8 0. !!?? 8 6 。0. 2。 3 有,惊,喜,哦" 108 | output: 转换为去除标点顾浩的字符串。例如:"想做兼职学生的加我Q:158086023有惊喜哦" 109 | status: done 110 | author: changshu 111 | """ 112 | def removal_punctuation(text): 113 | # text = "想做/ 兼_职/学生_/ 的 、加,我Q: 1 5. 8 0. !!?? 8 6 。0. 
2。 3 有,惊,喜,哦" 114 | # temp = temp.encode() 115 | string = re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*()]+","", text) 116 | # print(string) 117 | return string 118 | 119 | """ 120 | objective: 将文字转化为拼音加韵律的的形式 121 | input: 122 | text:输入的中文文本 123 | output: 转换为拼音加音律的字符串。例如:xiao3 ming2 shuo4 shi4 124 | status: done 125 | author: changshu 126 | """ 127 | def text_to_pinyin(text): 128 | text=removal_punctuation(text) 129 | # print("text:",text) 130 | # seg_list = jieba.cut(txt, cut_all=True) # 会切出重复的部分 131 | # print("Full Mode: " + " ".join(seg_list)) # 全模式 132 | # print("Full Mode: " + " ".join(seg_list)) # 全模式 133 | seg_list = jieba.cut(text, cut_all=False) # 无重复的部分 134 | # print("Default Mode: " + " ".join(seg_list)) # 精确模式 135 | seg_list = " ".join(seg_list) 136 | result = pinyin(seg_list, style=Style.TONE3) 137 | result = [i for lst in result for i in lst] 138 | # print("result的结果",result) 139 | pinyin_str = [x.strip() for x in result] 140 | # print("x的结果", pinyin_str) 141 | pinyin_str = ' '.join(pinyin_str) 142 | r = '[’!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~”“。!,、…—~﹏¥]+' 143 | pinyin_str = re.sub(r, '', pinyin_str) 144 | return pinyin_str 145 | 146 | if __name__ == '__main__': 147 | # txt='你好看啊' 148 | # txt='中华人民共和国论居然' 149 | txt='小明硕士毕业于中国科学院计算所,后在日本京都大学深造' 150 | # print(txt2pinyin(txt)) 151 | print(text_to_pinyin(txt)) 152 | 153 | 154 | 155 | ''' 156 | 用法举例 157 | print(txt2pinyin('中华人民共和国论居然')) 158 | ['zh ong1', 'h ua2', 'r en2', 'm in2', 'g ong4', 'h e2', 'g uo2', 'l uen4', 'j 159 | v1', 'r an2'] 160 | ''' 161 | ''' 162 | seg_list = jieba.cut("我来到北京清华大学", cut_all=True) 163 | print("Full Mode: " + "/ ".join(seg_list)) # 全模式 164 | 165 | seg_list = jieba.cut("我来到北京清华大学", cut_all=False) 166 | print("Default Mode: " + "/ ".join(seg_list)) # 精确模式 167 | 168 | seg_list = jieba.cut("他来到了网易杭研大厦") # 默认是精确模式 169 | print(", ".join(seg_list)) 170 | 171 | seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式 172 | print(", ".join(seg_list)) 173 | ''' 174 | 175 | -------------------------------------------------------------------------------- /datasets/datafeeder.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import random 4 | import tensorflow as tf 5 | import threading 6 | import time 7 | import traceback 8 | from hparams import hparams 9 | from text import cmudict, text_to_sequence 10 | from util.infolog import log 11 | 12 | 13 | _batches_per_group = 32 14 | _p_cmudict = 0.5 15 | _pad = 0 16 | _stop_token_pad = 1 17 | 18 | 19 | class DataFeeder(threading.Thread): 20 | '''Feeds batches of data into a queue on a background thread.''' 21 | 22 | def __init__(self, coordinator, metadata_filename, hparams): 23 | super(DataFeeder, self).__init__() 24 | self._coord = coordinator 25 | self._hparams = hparams 26 | self._cleaner_names = [x.strip() for x in hparams.cleaners.split(',')] 27 | self._offset = 0 28 | 29 | # Load metadata: 30 | self._datadir = os.path.dirname(metadata_filename) 31 | with open(metadata_filename, encoding='utf-8') as f: 32 | self._metadata = [line.strip().split('|') for line in f] 33 | hours = sum((int(x[2]) for x in self._metadata)) * hparams.frame_shift_ms / (3600 * 1000) 34 | log('Loaded metadata for %d examples (%.2f hours)' % (len(self._metadata), hours)) 35 | 36 | # Create placeholders for inputs and targets. Don't specify batch size because we want to 37 | # be able to feed different sized batches at eval time. 
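    # Note: the order of these placeholders must match the tuple returned by _prepare_batch
    # below, since the feed_dict is built by zipping the two together.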
38 | self._placeholders = [ 39 | tf.placeholder(tf.int32, [None, None], 'inputs'), 40 | tf.placeholder(tf.int32, [None], 'input_lengths'), 41 | tf.placeholder(tf.float32, [None, None, hparams.num_mels], 'mel_targets'), 42 | tf.placeholder(tf.float32, [None, None, hparams.num_freq], 'linear_targets'), 43 | tf.placeholder(tf.float32, [None, None], 'stop_token_targets') 44 | ] 45 | 46 | # Create queue for buffering data: 47 | queue = tf.FIFOQueue(8, [tf.int32, tf.int32, tf.float32, tf.float32, tf.float32], name='input_queue') 48 | self._enqueue_op = queue.enqueue(self._placeholders) 49 | self.inputs, self.input_lengths, self.mel_targets, self.linear_targets, self.stop_token_targets = queue.dequeue() 50 | self.inputs.set_shape(self._placeholders[0].shape) 51 | self.input_lengths.set_shape(self._placeholders[1].shape) 52 | self.mel_targets.set_shape(self._placeholders[2].shape) 53 | self.linear_targets.set_shape(self._placeholders[3].shape) 54 | self.stop_token_targets.set_shape(self._placeholders[4].shape) 55 | 56 | # Load CMUDict: If enabled, this will randomly substitute some words in the training data with 57 | # their ARPABet equivalents, which will allow you to also pass ARPABet to the model for 58 | # synthesis (useful for proper nouns, etc.) 59 | if hparams.use_cmudict: 60 | cmudict_path = os.path.join(self._datadir, 'cmudict-0.7b') 61 | if not os.path.isfile(cmudict_path): 62 | raise Exception('If use_cmudict=True, you must download ' + 63 | 'http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b to %s' % cmudict_path) 64 | self._cmudict = cmudict.CMUDict(cmudict_path, keep_ambiguous=False) 65 | log('Loaded CMUDict with %d unambiguous entries' % len(self._cmudict)) 66 | else: 67 | self._cmudict = None 68 | 69 | 70 | def start_in_session(self, session): 71 | self._session = session 72 | self.start() 73 | 74 | 75 | def run(self): 76 | try: 77 | while not self._coord.should_stop(): 78 | self._enqueue_next_group() 79 | except Exception as e: 80 | traceback.print_exc() 81 | self._coord.request_stop(e) 82 | 83 | 84 | def _enqueue_next_group(self): 85 | start = time.time() 86 | 87 | # Read a group of examples: 88 | n = self._hparams.batch_size 89 | r = self._hparams.outputs_per_step 90 | examples = [self._get_next_example() for i in range(n * _batches_per_group)] 91 | 92 | # Bucket examples based on similar output sequence length for efficiency: 93 | examples.sort(key=lambda x: x[-1]) 94 | batches = [examples[i:i+n] for i in range(0, len(examples), n)] 95 | random.shuffle(batches) 96 | 97 | log('Generated %d batches of size %d in %.03f sec' % (len(batches), n, time.time() - start)) 98 | for batch in batches: 99 | feed_dict = dict(zip(self._placeholders, _prepare_batch(batch, r))) 100 | self._session.run(self._enqueue_op, feed_dict=feed_dict) 101 | 102 | 103 | def _get_next_example(self): 104 | '''Loads a single example (input, mel_target, linear_target, stop_token_target) from disk''' 105 | if self._offset >= len(self._metadata): 106 | self._offset = 0 107 | random.shuffle(self._metadata) 108 | meta = self._metadata[self._offset] 109 | self._offset += 1 110 | 111 | text = meta[3] 112 | if self._cmudict and random.random() < _p_cmudict: 113 | text = ' '.join([self._maybe_get_arpabet(word) for word in text.split(' ')]) 114 | 115 | input_data = np.asarray(text_to_sequence(text, self._cleaner_names), dtype=np.int32) 116 | linear_target = np.load(os.path.join(self._datadir, meta[0])) 117 | mel_target = np.load(os.path.join(self._datadir, meta[1])) 118 | stop_token_target = 
np.asarray([0.] * len(mel_target)) 119 | return (input_data, mel_target, linear_target, stop_token_target, len(linear_target)) 120 | 121 | 122 | def _maybe_get_arpabet(self, word): 123 | arpabet = self._cmudict.lookup(word) 124 | return '{%s}' % arpabet[0] if arpabet is not None and random.random() < 0.5 else word 125 | 126 | 127 | def _prepare_batch(batch, outputs_per_step): 128 | random.shuffle(batch) 129 | inputs = _prepare_inputs([x[0] for x in batch]) 130 | input_lengths = np.asarray([len(x[0]) for x in batch], dtype=np.int32) 131 | mel_targets = _prepare_targets([x[1] for x in batch], outputs_per_step) 132 | linear_targets = _prepare_targets([x[2] for x in batch], outputs_per_step) 133 | stop_token_targets = _prepare_stop_token_targets([x[3] for x in batch], outputs_per_step) 134 | return (inputs, input_lengths, mel_targets, linear_targets, stop_token_targets) 135 | 136 | 137 | def _prepare_inputs(inputs): 138 | max_len = max((len(x) for x in inputs)) 139 | return np.stack([_pad_input(x, max_len) for x in inputs]) 140 | 141 | 142 | def _prepare_targets(targets, alignment): 143 | max_len = max((len(t) for t in targets)) + 1 144 | return np.stack([_pad_target(t, _round_up(max_len, alignment)) for t in targets]) 145 | 146 | 147 | def _prepare_stop_token_targets(targets, alignment): 148 | max_len = max((len(t) for t in targets)) + 1 149 | return np.stack([_pad_stop_token_target(t, _round_up(max_len, alignment)) for t in targets]) 150 | 151 | 152 | def _pad_input(x, length): 153 | return np.pad(x, (0, length - x.shape[0]), mode='constant', constant_values=_pad) 154 | 155 | 156 | def _pad_target(t, length): 157 | return np.pad(t, [(0, length - t.shape[0]), (0,0)], mode='constant', constant_values=_pad) 158 | 159 | 160 | def _pad_stop_token_target(t, length): 161 | return np.pad(t, (0, length - t.shape[0]), mode='constant', constant_values=_stop_token_pad) 162 | 163 | 164 | def _round_up(x, multiple): 165 | remainder = x % multiple 166 | return x if remainder == 0 else x + multiple - remainder 167 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from datetime import datetime 3 | import math 4 | import numpy as np 5 | import os 6 | import subprocess 7 | import time 8 | import tensorflow as tf 9 | import traceback 10 | 11 | from datasets.datafeeder import DataFeeder 12 | from hparams import hparams, hparams_debug_string 13 | from models import create_model 14 | from text import sequence_to_text 15 | from util import audio, infolog, plot, ValueWindow 16 | log = infolog.log 17 | 18 | 19 | def get_git_commit(): 20 | subprocess.check_output(['git', 'diff-index', '--quiet', 'HEAD']) # Verify client is clean 21 | commit = subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode().strip()[:10] 22 | log('Git commit: %s' % commit) 23 | return commit 24 | 25 | 26 | def add_stats(model): 27 | with tf.variable_scope('stats') as scope: 28 | tf.summary.histogram('linear_outputs', model.linear_outputs) 29 | tf.summary.histogram('linear_targets', model.linear_targets) 30 | tf.summary.histogram('mel_outputs', model.mel_outputs) 31 | tf.summary.histogram('mel_targets', model.mel_targets) 32 | tf.summary.scalar('loss_mel', model.mel_loss) 33 | tf.summary.scalar('loss_linear', model.linear_loss) 34 | tf.summary.scalar('regularization_loss', model.regularization_loss) 35 | tf.summary.scalar('stop_token_loss', model.stop_token_loss) 36 | 
tf.summary.scalar('learning_rate', model.learning_rate) 37 | tf.summary.scalar('loss', model.loss) 38 | gradient_norms = [tf.norm(grad) for grad in model.gradients] 39 | tf.summary.histogram('gradient_norm', gradient_norms) 40 | tf.summary.scalar('max_gradient_norm', tf.reduce_max(gradient_norms)) 41 | return tf.summary.merge_all() 42 | 43 | 44 | def time_string(): 45 | return datetime.now().strftime('%Y-%m-%d %H:%M') 46 | 47 | 48 | def train(log_dir, args): 49 | commit = get_git_commit() if args.git else 'None' 50 | checkpoint_path = os.path.join(log_dir, 'model.ckpt') 51 | input_path = os.path.join(args.base_dir, args.input) 52 | log('Checkpoint path: %s' % checkpoint_path) 53 | log('Loading training data from: %s' % input_path) 54 | log('Using model: %s' % args.model) 55 | log(hparams_debug_string()) 56 | 57 | # Set up DataFeeder: 58 | coord = tf.train.Coordinator() 59 | with tf.variable_scope('datafeeder') as scope: 60 | feeder = DataFeeder(coord, input_path, hparams) 61 | 62 | # Set up model: 63 | global_step = tf.Variable(0, name='global_step', trainable=False) 64 | with tf.variable_scope('model') as scope: 65 | model = create_model(args.model, hparams) 66 | model.initialize(feeder.inputs, feeder.input_lengths, feeder.mel_targets, feeder.linear_targets, feeder.stop_token_targets, global_step) 67 | model.add_loss() 68 | model.add_optimizer(global_step) 69 | stats = add_stats(model) 70 | 71 | # Bookkeeping: 72 | step = 0 73 | time_window = ValueWindow(100) 74 | loss_window = ValueWindow(100) 75 | saver = tf.train.Saver(max_to_keep=1) 76 | 77 | # Train! 78 | with tf.Session() as sess: 79 | try: 80 | summary_writer = tf.summary.FileWriter(log_dir, sess.graph) 81 | sess.run(tf.global_variables_initializer()) 82 | 83 | if args.restore_step: 84 | # Restore from a checkpoint if the user requested it. 85 | checkpoint_state = tf.train.get_checkpoint_state(log_dir) 86 | restore_path = '%s-%d' % (checkpoint_path, args.restore_step) 87 | if checkpoint_state is not None: 88 | saver.restore(sess, checkpoint_state.model_checkpoint_path) 89 | log('Resuming from checkpoint: %s at commit: %s' % (checkpoint_state.model_checkpoint_path, commit), slack=True) 90 | else: 91 | log('Starting new training run at commit: %s' % commit, slack=True) 92 | 93 | feeder.start_in_session(sess) 94 | 95 | while not coord.should_stop(): 96 | start_time = time.time() 97 | step, loss, opt = sess.run([global_step, model.loss, model.optimize]) 98 | time_window.append(time.time() - start_time) 99 | loss_window.append(loss) 100 | message = 'Step %-7d [%.03f sec/step, loss=%.05f, avg_loss=%.05f]' % ( 101 | step, time_window.average, loss, loss_window.average) 102 | log(message, slack=(step % args.checkpoint_interval == 0)) 103 | 104 | if loss > 100 or math.isnan(loss): 105 | log('Loss exploded to %.05f at step %d!' 
% (loss, step), slack=True) 106 | raise Exception('Loss Exploded') 107 | 108 | if step % args.summary_interval == 0: 109 | log('Writing summary at step: %d' % step) 110 | summary_writer.add_summary(sess.run(stats), step) 111 | 112 | if step % args.checkpoint_interval == 0: 113 | log('Saving checkpoint to: %s-%d' % (checkpoint_path, step)) 114 | saver.save(sess, checkpoint_path, global_step=step) 115 | log('Saving audio and alignment...') 116 | input_seq, spectrogram, alignment = sess.run([ 117 | model.inputs[0], model.linear_outputs[0], model.alignments[0]]) 118 | waveform = audio.inv_spectrogram(spectrogram.T) 119 | audio.save_wav(waveform, os.path.join(log_dir, 'step-%d-audio.wav' % step)) 120 | plot.plot_alignment(alignment, os.path.join(log_dir, 'step-%d-align.png' % step), 121 | info='%s, %s, %s, step=%d, loss=%.5f' % (args.model, commit, time_string(), step, loss)) 122 | log('Input: %s' % sequence_to_text(input_seq)) 123 | 124 | except Exception as e: 125 | log('Exiting due to exception: %s' % e, slack=True) 126 | traceback.print_exc() 127 | coord.request_stop(e) 128 | 129 | 130 | def main(): 131 | parser = argparse.ArgumentParser() 132 | parser.add_argument('--base_dir', default=os.path.expanduser('.')) 133 | parser.add_argument('--input', default='training/train.txt') 134 | parser.add_argument('--model', default='tacotron') 135 | parser.add_argument('--name', help='Name of the run. Used for logging. Defaults to model name.') 136 | parser.add_argument('--hparams', default='', 137 | help='Hyperparameter overrides as a comma-separated list of name=value pairs') 138 | parser.add_argument('--restore_step', type=bool, default=True, help='Global step to restore from checkpoint.') 139 | parser.add_argument('--summary_interval', type=int, default=100, 140 | help='Steps between running summary ops.') 141 | parser.add_argument('--checkpoint_interval', type=int, default=1000, 142 | help='Steps between writing checkpoints.') 143 | parser.add_argument('--slack_url', help='Slack webhook URL to get periodic reports.') 144 | parser.add_argument('--tf_log_level', type=int, default=1, help='Tensorflow C++ log level.') 145 | parser.add_argument('--git', action='store_true', help='If set, verify that the client is clean.') 146 | args = parser.parse_args() 147 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = str(args.tf_log_level) 148 | os.environ['CUDA_VISIBLE_DEVICES'] = '0' 149 | run_name = args.name or args.model 150 | log_dir = os.path.join(args.base_dir, 'logs-%s' % run_name) 151 | os.makedirs(log_dir, exist_ok=True) 152 | infolog.init(os.path.join(log_dir, 'train.log'), run_name, args.slack_url) 153 | hparams.parse(args.hparams) 154 | train(log_dir, args) 155 | 156 | 157 | if __name__ == '__main__': 158 | main() 159 | 160 | 161 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Tacotron 2 | 3 | An implementation of Tacotron speech synthesis in TensorFlow. 4 | 5 | 6 | ### Audio Samples 7 | 8 | * **[Audio Samples](https://keithito.github.io/audio-samples/)** from models trained using this repo. 9 | * The first set was trained for 877K steps on the [LJ Speech Dataset](https://keithito.com/LJ-Speech-Dataset/) 10 | * Speech started to become intelligble around 20K steps. 11 | * Although loss continued to decrease, there wasn't much noticable improvement after ~250K steps. 
12 | * The second set was trained by [@MXGray](https://github.com/MXGray) for 140K steps on the [Nancy Corpus](http://www.cstr.ed.ac.uk/projects/blizzard/2011/lessac_blizzard2011/). 13 | 14 | 15 | 16 | ## Background 17 | 18 | In April 2017, Google published a paper, [Tacotron: Towards End-to-End Speech Synthesis](https://arxiv.org/pdf/1703.10135.pdf), 19 | where they present a neural text-to-speech model that learns to synthesize speech directly from 20 | (text, audio) pairs. However, they didn't release their source code or training data. This is an 21 | independent attempt to provide an open-source implementation of the model described in their paper. 22 | 23 | The quality isn't as good as Google's demo yet, but hopefully it will get there someday :-). 24 | Pull requests are welcome! 25 | 26 | 27 | 28 | ## Quick Start 29 | 30 | ### Installing dependencies 31 | 32 | 1. Install Python 3. 33 | 34 | 2. Install the latest version of [TensorFlow](https://www.tensorflow.org/install/) for your platform. For better 35 | performance, install with GPU support if it's available. This code works with TensorFlow 1.3 and later. 36 | 37 | 3. Install requirements: 38 | ``` 39 | pip install -r requirements.txt 40 | ``` 41 | 42 | 43 | ### Using a pre-trained model 44 | 45 | 1. **Download and unpack a model**: 46 | ``` 47 | curl http://data.keithito.com/data/speech/tacotron-20170720.tar.bz2 | tar xjC /tmp 48 | ``` 49 | 50 | 2. **Run the demo server**: 51 | ``` 52 | python3 demo_server.py --checkpoint /tmp/tacotron-20170720/model.ckpt 53 | ``` 54 | 55 | 3. **Point your browser at localhost:9000** 56 | * Type what you want to synthesize 57 | 58 | 59 | 60 | ### Training 61 | 62 | *Note: you need at least 40GB of free disk space to train a model.* 63 | 64 | 1. **Download a speech dataset.** 65 | 66 | The following are supported out of the box: 67 | * [LJ Speech](https://keithito.com/LJ-Speech-Dataset/) (Public Domain) 68 | * [Blizzard 2012](http://www.cstr.ed.ac.uk/projects/blizzard/2012/phase_one) (Creative Commons Attribution Share-Alike) 69 | 70 | You can use other datasets if you convert them to the right format. See [TRAINING_DATA.md](TRAINING_DATA.md) for more info. 71 | 72 | 73 | 2. **Unpack the dataset into `~/tacotron`** 74 | 75 | After unpacking, your tree should look like this for LJ Speech: 76 | ``` 77 | tacotron 78 | |- LJSpeech-1.1 79 | |- metadata.csv 80 | |- wavs 81 | ``` 82 | 83 | or like this for Blizzard 2012: 84 | ``` 85 | tacotron 86 | |- Blizzard2012 87 | |- ATrampAbroad 88 | | |- sentence_index.txt 89 | | |- lab 90 | | |- wav 91 | |- TheManThatCorruptedHadleyburg 92 | |- sentence_index.txt 93 | |- lab 94 | |- wav 95 | ``` 96 | 97 | 3. **Preprocess the data** 98 | ``` 99 | python3 preprocess.py --dataset ljspeech 100 | ``` 101 | * Use `--dataset blizzard` for Blizzard data 102 | 103 | 4. **Train a model** 104 | ``` 105 | python3 train.py 106 | ``` 107 | 108 | Tunable hyperparameters are found in [hparams.py](hparams.py). You can adjust these at the command 109 | line using the `--hparams` flag, for example `--hparams="batch_size=16,outputs_per_step=2"`. 110 | Hyperparameters should generally be set to the same values at both training and eval time. 111 | The default hyperparameters are recommended for LJ Speech and other English-language data. 112 | See [TRAINING_DATA.md](TRAINING_DATA.md) for other languages. 113 | 114 | 115 | 5. 
**Monitor with Tensorboard** (optional) 116 | ``` 117 | tensorboard --logdir ~/tacotron/logs-tacotron 118 | ``` 119 | 120 | The trainer dumps audio and alignments every 1000 steps. You can find these in 121 | `~/tacotron/logs-tacotron`. 122 | 123 | 6. **Synthesize from a checkpoint** 124 | ``` 125 | python3 demo_server.py --checkpoint ~/tacotron/logs-tacotron/model.ckpt-185000 126 | ``` 127 | Replace "185000" with the checkpoint number that you want to use, then open a browser 128 | to `localhost:9000` and type what you want to speak. Alternately, you can 129 | run [eval.py](eval.py) at the command line: 130 | ``` 131 | python3 eval.py --checkpoint ~/tacotron/logs-tacotron/model.ckpt-185000 132 | ``` 133 | If you set the `--hparams` flag when training, set the same value here. 134 | 135 | 136 | ## Notes and Common Issues 137 | 138 | * [TCMalloc](http://goog-perftools.sourceforge.net/doc/tcmalloc.html) seems to improve 139 | training speed and avoids occasional slowdowns seen with the default allocator. You 140 | can enable it by installing it and setting `LD_PRELOAD=/usr/lib/libtcmalloc.so`. With TCMalloc, 141 | you can get around 1.1 sec/step on a GTX 1080Ti. 142 | 143 | * You can train with [CMUDict](http://www.speech.cs.cmu.edu/cgi-bin/cmudict) by downloading the 144 | dictionary to ~/tacotron/training and then passing the flag `--hparams="use_cmudict=True"` to 145 | train.py. This will allow you to pass ARPAbet phonemes enclosed in curly braces at eval 146 | time to force a particular pronunciation, e.g. `Turn left on {HH AW1 S S T AH0 N} Street.` 147 | 148 | * If you pass a Slack incoming webhook URL as the `--slack_url` flag to train.py, it will send 149 | you progress updates every 1000 steps. 150 | 151 | * Occasionally, you may see a spike in loss and the model will forget how to attend (the 152 | alignments will no longer make sense). Although it will recover eventually, it may 153 | save time to restart at a checkpoint prior to the spike by passing the 154 | `--restore_step=150000` flag to train.py (replacing 150000 with a step number prior to the 155 | spike). **Update**: a recent [fix](https://github.com/keithito/tacotron/pull/7) to gradient 156 | clipping by @candlewill may have fixed this. 157 | 158 | * During eval and training, audio length is limited to `max_iters * outputs_per_step * frame_shift_ms` 159 | milliseconds. With the defaults (max_iters=200, outputs_per_step=5, frame_shift_ms=12.5), this is 160 | 12.5 seconds. 161 | 162 | If your training examples are longer, you will see an error like this: 163 | `Incompatible shapes: [32,1340,80] vs. [32,1000,80]` 164 | 165 | To fix this, you can set a larger value of `max_iters` by passing `--hparams="max_iters=300"` to 166 | train.py (replace "300" with a value based on how long your audio is and the formula above). 167 | 168 | * Here is the expected loss curve when training on LJ Speech with the default hyperparameters: 169 | ![Loss curve](https://user-images.githubusercontent.com/1945356/36077599-c0513e4a-0f21-11e8-8525-07347847720c.png) 170 | 171 | 172 | ## Other Implementations 173 | * By Alex Barron: https://github.com/barronalex/Tacotron 174 | * By Kyubyong Park: https://github.com/Kyubyong/tacotron 175 | 176 | 177 | 178 |
179 | 180 |
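## Synthesizing over HTTP

The demo server's `/synthesize` endpoint (see [demo_server.py](demo_server.py)) accepts a `text` query parameter and returns WAV audio, so you can also synthesize without a browser. A minimal example, assuming the server is running with the default `--port 9000`; in this Mandarin fork the input text is converted to pinyin internally before synthesis:
```
curl -G http://localhost:9000/synthesize --data-urlencode "text=你好,世界" -o output.wav
```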
-------------------------------------------------------------------------------- /models/tacotron.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.contrib.rnn import GRUCell, MultiRNNCell, OutputProjectionWrapper, ResidualWrapper 3 | from tensorflow.contrib.seq2seq import BasicDecoder 4 | from text.symbols import symbols 5 | from util.infolog import log 6 | from .helpers import TacoTestHelper, TacoTrainingHelper 7 | from .modules import encoder_cbhg, post_cbhg, prenet 8 | from .rnn_wrappers import FrameProjection, StopProjection, TacotronDecoderWrapper 9 | from .attention import LocationSensitiveAttention 10 | from .custom_decoder import CustomDecoder 11 | 12 | 13 | class Tacotron(): 14 | def __init__(self, hparams): 15 | self._hparams = hparams 16 | 17 | 18 | def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None, stop_token_targets=None, global_step=None): 19 | '''Initializes the model for inference. 20 | 21 | Sets "mel_outputs", "linear_outputs", and "alignments" fields. 22 | 23 | Args: 24 | inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of 25 | steps in the input time series, and values are character IDs 26 | input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths 27 | of each sequence in inputs. 28 | mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number 29 | of steps in the output time series, M is num_mels, and values are entries in the mel 30 | spectrogram. Only needed for training. 31 | linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number 32 | of steps in the output time series, F is num_freq, and values are entries in the linear 33 | spectrogram. Only needed for training. 
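      stop_token_targets: float32 Tensor with shape [N, T_out] where N is batch size and T_out is
        number of steps in the output time series; entries are 0 for real frames and 1 for padding
        frames past the end of the utterance. Only needed for training.
      global_step: int32 scalar Tensor holding the current training step; used to schedule the
        teacher forcing ratio decay in the training helper. Only needed for training.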
34 | ''' 35 | with tf.variable_scope('inference') as scope: 36 | is_training = linear_targets is not None 37 | batch_size = tf.shape(inputs)[0] 38 | hp = self._hparams 39 | 40 | # Embeddings 41 | embedding_table = tf.get_variable( 42 | 'embedding', [len(symbols), hp.embed_depth], dtype=tf.float32, 43 | initializer=tf.truncated_normal_initializer(stddev=0.5)) 44 | embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs) # [N, T_in, embed_depth=256] 45 | 46 | # Encoder 47 | prenet_outputs = prenet(embedded_inputs, is_training, hp.prenet_depths) # [N, T_in, prenet_depths[-1]=128] 48 | encoder_outputs = encoder_cbhg(prenet_outputs, input_lengths, is_training, hp.encoder_depth) # [N, T_in, encoder_depth=256] 49 | 50 | # Location sensitive attention 51 | attention_mechanism = LocationSensitiveAttention(hp.attention_depth, encoder_outputs) # [N, T_in, attention_depth=256] 52 | 53 | # Decoder (layers specified bottom to top): 54 | multi_rnn_cell = MultiRNNCell([ 55 | ResidualWrapper(GRUCell(hp.decoder_depth)), 56 | ResidualWrapper(GRUCell(hp.decoder_depth)) 57 | ], state_is_tuple=True) # [N, T_in, decoder_depth=256] 58 | 59 | # Frames Projection layer 60 | frame_projection = FrameProjection(hp.num_mels * hp.outputs_per_step) # [N, T_out/r, M*r] 61 | 62 | # projection layer 63 | stop_projection = StopProjection(is_training, shape=hp.outputs_per_step) # [N, T_out/r, r] 64 | 65 | # Project onto r mel spectrograms (predict r outputs at each RNN step): 66 | decoder_cell = TacotronDecoderWrapper(is_training, attention_mechanism, multi_rnn_cell, 67 | frame_projection, stop_projection) 68 | 69 | if is_training: 70 | helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels, hp.outputs_per_step, global_step) 71 | else: 72 | helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step) 73 | 74 | decoder_init_state = decoder_cell.zero_state(batch_size=batch_size, dtype=tf.float32) 75 | 76 | (decoder_outputs, stop_token_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode( 77 | CustomDecoder(decoder_cell, helper, decoder_init_state), 78 | maximum_iterations=hp.max_iters) # [N, T_out/r, M*r] 79 | 80 | # Reshape outputs to be one output per entry 81 | mel_outputs = tf.reshape(decoder_outputs, [batch_size, -1, hp.num_mels]) # [N, T_out, M] 82 | stop_token_outputs = tf.reshape(stop_token_outputs, [batch_size, -1]) # [N, T_out, M] 83 | 84 | # Add post-processing CBHG: 85 | post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training, hp.postnet_depth) # [N, T_out, postnet_depth=256] 86 | linear_outputs = tf.layers.dense(post_outputs, hp.num_freq) # [N, T_out, F] 87 | 88 | # Grab alignments from the final decoder state: 89 | alignments = tf.transpose(final_decoder_state.alignment_history.stack(), [1, 2, 0]) 90 | 91 | self.inputs = inputs 92 | self.input_lengths = input_lengths 93 | self.mel_outputs = mel_outputs 94 | self.linear_outputs = linear_outputs 95 | self.stop_token_outputs = stop_token_outputs 96 | self.alignments = alignments 97 | self.mel_targets = mel_targets 98 | self.linear_targets = linear_targets 99 | self.stop_token_targets = stop_token_targets 100 | log('Initialized Tacotron model. 
Dimensions: ') 101 | log(' embedding: {}'.format(embedded_inputs.shape)) 102 | log(' prenet out: {}'.format(prenet_outputs.shape)) 103 | log(' encoder out: {}'.format(encoder_outputs.shape)) 104 | log(' decoder out (r frames): {}'.format(decoder_outputs.shape)) 105 | log(' decoder out (1 frame): {}'.format(mel_outputs.shape)) 106 | log(' postnet out: {}'.format(post_outputs.shape)) 107 | log(' linear out: {}'.format(linear_outputs.shape)) 108 | log(' stop token: {}'.format(stop_token_outputs.shape)) 109 | 110 | 111 | def add_loss(self): 112 | '''Adds loss to the model. Sets "loss" field. initialize must have been called.''' 113 | with tf.variable_scope('loss') as scope: 114 | hp = self._hparams 115 | self.mel_loss = tf.reduce_mean(tf.abs(self.mel_targets - self.mel_outputs)) 116 | self.linear_loss = tf.reduce_mean(tf.abs(self.linear_targets - self.linear_outputs)) 117 | self.stop_token_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits( 118 | labels=self.stop_token_targets, 119 | logits=self.stop_token_outputs)) 120 | 121 | # Compute the regularization weights 122 | reg_weight = 1e-6 123 | all_vars = tf.trainable_variables() 124 | self.regularization_loss = tf.add_n([tf.nn.l2_loss(v) for v in all_vars 125 | if not('bias' in v.name or 'Bias' in v.name)]) * reg_weight 126 | 127 | self.loss = self.mel_loss + self.linear_loss + self.stop_token_loss + self.regularization_loss 128 | 129 | 130 | def add_optimizer(self, global_step): 131 | '''Adds optimizer. Sets "gradients" and "optimize" fields. add_loss must have been called. 132 | 133 | Args: 134 | global_step: int32 scalar Tensor representing current global step in training 135 | ''' 136 | with tf.variable_scope('optimizer') as scope: 137 | hp = self._hparams 138 | if hp.decay_learning_rate: 139 | self.learning_rate = _learning_rate_decay(hp.initial_learning_rate, global_step) 140 | else: 141 | self.learning_rate = tf.convert_to_tensor(hp.initial_learning_rate) 142 | optimizer = tf.train.AdamOptimizer(self.learning_rate, hp.adam_beta1, hp.adam_beta2) 143 | gradients, variables = zip(*optimizer.compute_gradients(self.loss)) 144 | self.gradients = gradients 145 | clipped_gradients, _ = tf.clip_by_global_norm(gradients, 1.0) 146 | 147 | # Add dependency on UPDATE_OPS; otherwise batchnorm won't work correctly. 
See: 148 | # https://github.com/tensorflow/tensorflow/issues/1122 149 | with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)): 150 | self.optimize = optimizer.apply_gradients(zip(clipped_gradients, variables), 151 | global_step=global_step) 152 | 153 | 154 | def _learning_rate_decay(init_lr, global_step): 155 | # Noam scheme from tensor2tensor: 156 | warmup_steps = 4000.0 157 | step = tf.cast(global_step + 1, dtype=tf.float32) 158 | return init_lr * warmup_steps**0.5 * tf.minimum(step * warmup_steps**-1.5, step**-0.5) 159 | -------------------------------------------------------------------------------- /models/attention.py: -------------------------------------------------------------------------------- 1 | """Attention file for location based attention (compatible with tensorflow attention wrapper)""" 2 | 3 | import tensorflow as tf 4 | from tensorflow.contrib.seq2seq.python.ops.attention_wrapper import BahdanauAttention 5 | from tensorflow.python.layers import core as layers_core 6 | from tensorflow.python.ops import array_ops, math_ops, nn_ops, variable_scope 7 | 8 | 9 | #From https://github.com/tensorflow/tensorflow/blob/r1.7/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py 10 | def _compute_attention(attention_mechanism, cell_output, attention_state, attention_layer): 11 | """Computes the attention and alignments for a given attention_mechanism.""" 12 | alignments, next_attention_state = attention_mechanism( 13 | cell_output, state=attention_state) 14 | 15 | # Reshape from [batch_size, memory_time] to [batch_size, 1, memory_time] 16 | expanded_alignments = array_ops.expand_dims(alignments, 1) 17 | # Context is the inner product of alignments and values along the 18 | # memory time dimension. 19 | # alignments shape is 20 | # [batch_size, 1, memory_time] 21 | # attention_mechanism.values shape is 22 | # [batch_size, memory_time, memory_size] 23 | # the batched matmul is over memory_time, so the output shape is 24 | # [batch_size, 1, memory_size]. 25 | # we then squeeze out the singleton dim. 26 | context = math_ops.matmul(expanded_alignments, attention_mechanism.values) 27 | context = array_ops.squeeze(context, [1]) 28 | 29 | if attention_layer is not None: 30 | attention = attention_layer(array_ops.concat([cell_output, context], 1)) 31 | else: 32 | attention = context 33 | 34 | return attention, alignments, next_attention_state 35 | 36 | 37 | def _location_sensitive_score(W_query, W_fil, W_keys): 38 | """Impelements Bahdanau-style (cumulative) scoring function. 39 | This attention is described in: 40 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben- 41 | gio, “Attention-based models for speech recognition,” in Ad- 42 | vances in Neural Information Processing Systems, 2015, pp. 43 | 577–585. 44 | 45 | ############################################################################# 46 | hybrid attention (content-based + location-based) 47 | f = F * α_{i-1} 48 | energy = dot(v_a, tanh(W_keys(h_enc) + W_query(h_dec) + W_fil(f) + b_a)) 49 | ############################################################################# 50 | 51 | Args: 52 | W_query: Tensor, shape '[batch_size, 1, attention_dim]' to compare to location features. 53 | W_location: processed previous alignments into location features, shape '[batch_size, max_time, attention_dim]' 54 | W_keys: Tensor, shape '[batch_size, max_time, attention_dim]', typically the encoder outputs. 
55 | Returns: 56 | A '[batch_size, max_time]' attention score (energy) 57 | """ 58 | # Get the number of hidden units from the trailing dimension of keys 59 | dtype = W_query.dtype 60 | num_units = W_keys.shape[-1].value or array_ops.shape(W_keys)[-1] 61 | 62 | v_a = tf.get_variable( 63 | 'attention_variable', shape=[num_units], dtype=dtype, 64 | initializer=tf.contrib.layers.xavier_initializer()) 65 | b_a = tf.get_variable( 66 | 'attention_bias', shape=[num_units], dtype=dtype, 67 | initializer=tf.zeros_initializer()) 68 | 69 | return tf.reduce_sum(v_a * tf.tanh(W_keys + W_query + W_fil + b_a), [2]) 70 | 71 | def _smoothing_normalization(e): 72 | """Applies a smoothing normalization function instead of softmax 73 | Introduced in: 74 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben- 75 | gio, “Attention-based models for speech recognition,” in Ad- 76 | vances in Neural Information Processing Systems, 2015, pp. 77 | 577–585. 78 | 79 | ############################################################################ 80 | Smoothing normalization function 81 | a_{i, j} = sigmoid(e_{i, j}) / sum_j(sigmoid(e_{i, j})) 82 | ############################################################################ 83 | 84 | Args: 85 | e: matrix [batch_size, max_time(memory_time)]: expected to be energy (score) 86 | values of an attention mechanism 87 | Returns: 88 | matrix [batch_size, max_time]: [0, 1] normalized alignments with possible 89 | attendance to multiple memory time steps. 90 | """ 91 | return tf.nn.sigmoid(e) / tf.reduce_sum(tf.nn.sigmoid(e), axis=-1, keepdims=True) 92 | 93 | 94 | class LocationSensitiveAttention(BahdanauAttention): 95 | """Impelements Bahdanau-style (cumulative) scoring function. 96 | Usually referred to as "hybrid" attention (content-based + location-based) 97 | Extends the additive attention described in: 98 | "D. Bahdanau, K. Cho, and Y. Bengio, “Neural machine transla- 99 | tion by jointly learning to align and translate,” in Proceedings 100 | of ICLR, 2015." 101 | to use previous alignments as additional location features. 102 | 103 | This attention is described in: 104 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben- 105 | gio, “Attention-based models for speech recognition,” in Ad- 106 | vances in Neural Information Processing Systems, 2015, pp. 107 | 577–585. 108 | """ 109 | 110 | def __init__(self, 111 | num_units, 112 | memory, 113 | smoothing=False, 114 | cumulate_weights=True, 115 | name='LocationSensitiveAttention'): 116 | """Construct the Attention mechanism. 117 | Args: 118 | num_units: The depth of the query mechanism. 119 | memory: The memory to query; usually the output of an RNN encoder. This 120 | tensor should be shaped `[batch_size, max_time, ...]`. 121 | memory_sequence_length (optional): Sequence lengths for the batch entries 122 | in memory. If provided, the memory tensor rows are masked with zeros 123 | for values past the respective sequence lengths. Only relevant if mask_encoder = True. 124 | smoothing (optional): Boolean. Determines which normalization function to use. 125 | Default normalization function (probablity_fn) is softmax. If smoothing is 126 | enabled, we replace softmax with: 127 | a_{i, j} = sigmoid(e_{i, j}) / sum_j(sigmoid(e_{i, j})) 128 | Introduced in: 129 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben- 130 | gio, “Attention-based models for speech recognition,” in Ad- 131 | vances in Neural Information Processing Systems, 2015, pp. 132 | 577–585. 
133 | This is mainly used if the model wants to attend to multiple inputs parts 134 | at the same decoding step. We probably won't be using it since multiple sound 135 | frames may depend from the same character, probably not the way around. 136 | Note: 137 | We still keep it implemented in case we want to test it. They used it in the 138 | paper in the context of speech recognition, where one phoneme may depend on 139 | multiple subsequent sound frames. 140 | name: Name to use when creating ops. 141 | """ 142 | #Create normalization function 143 | #Setting it to None defaults in using softmax 144 | normalization_function = _smoothing_normalization if (smoothing == True) else None 145 | super(LocationSensitiveAttention, self).__init__( 146 | num_units=num_units, 147 | memory=memory, 148 | memory_sequence_length=None, 149 | probability_fn=normalization_function, 150 | name=name) 151 | 152 | self.location_convolution = tf.layers.Conv1D(filters=32, 153 | kernel_size=(31, ), padding='same', use_bias=True, 154 | bias_initializer=tf.zeros_initializer(), name='location_features_convolution') 155 | self.location_layer = tf.layers.Dense(units=num_units, use_bias=False, 156 | dtype=tf.float32, name='location_features_layer') 157 | self._cumulate = cumulate_weights 158 | 159 | def __call__(self, query, state): 160 | """Score the query based on the keys and values. 161 | Args: 162 | query: Tensor of dtype matching `self.values` and shape 163 | `[batch_size, query_depth]`. 164 | state (previous alignments): Tensor of dtype matching `self.values` and shape 165 | `[batch_size, alignments_size]` 166 | (`alignments_size` is memory's `max_time`). 167 | Returns: 168 | alignments: Tensor of dtype matching `self.values` and shape 169 | `[batch_size, alignments_size]` (`alignments_size` is memory's 170 | `max_time`). 
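      next_state: Tensor with the same shape as `alignments`, carrying the (optionally
        cumulated) alignments to use as `state` at the next decoding step; equals
        `alignments + state` when `cumulate_weights` is True, otherwise `alignments`.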
171 | """ 172 | previous_alignments = state 173 | with variable_scope.variable_scope(None, "Location_Sensitive_Attention", [query]): 174 | 175 | # processed_query shape [batch_size, query_depth] -> [batch_size, attention_dim] 176 | processed_query = self.query_layer(query) if self.query_layer else query 177 | # -> [batch_size, 1, attention_dim] 178 | processed_query = tf.expand_dims(processed_query, 1) 179 | 180 | # processed_location_features shape [batch_size, max_time, attention dimension] 181 | # [batch_size, max_time] -> [batch_size, max_time, 1] 182 | expanded_alignments = tf.expand_dims(previous_alignments, axis=2) 183 | # location features [batch_size, max_time, filters] 184 | f = self.location_convolution(expanded_alignments) 185 | # Projected location features [batch_size, max_time, attention_dim] 186 | processed_location_features = self.location_layer(f) 187 | 188 | # energy shape [batch_size, max_time] 189 | energy = _location_sensitive_score(processed_query, processed_location_features, self.keys) 190 | 191 | 192 | # alignments shape = energy shape = [batch_size, max_time] 193 | alignments = self._probability_fn(energy, previous_alignments) 194 | 195 | # Cumulate alignments 196 | if self._cumulate: 197 | next_state = alignments + previous_alignments 198 | else: 199 | next_state = alignments 200 | 201 | return alignments, next_state 202 | -------------------------------------------------------------------------------- /models/rnn_wrappers.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import numpy as np 3 | import tensorflow as tf 4 | from .modules import prenet 5 | from .attention import _compute_attention 6 | from tensorflow.contrib.rnn import RNNCell 7 | from tensorflow.python.framework import ops, tensor_shape 8 | from tensorflow.python.ops import array_ops, check_ops, rnn_cell_impl, tensor_array_ops 9 | from tensorflow.python.util import nest 10 | from hparams import hparams as hp 11 | 12 | 13 | class FrameProjection: 14 | """Projection layer to r * num_mels dimensions or num_mels dimensions 15 | """ 16 | def __init__(self, shape=hp.num_mels, activation=None, scope=None): 17 | """ 18 | Args: 19 | shape: integer, dimensionality of output space (r*n_mels for decoder or n_mels for postnet) 20 | activation: callable, activation function 21 | scope: FrameProjection scope. 22 | """ 23 | super(FrameProjection, self).__init__() 24 | 25 | self.shape = shape 26 | self.activation = activation 27 | self.scope = 'linear_projection' if scope is None else scope 28 | self.dense = tf.layers.Dense(units=shape, activation=activation, name='projection_{}'.format(self.scope)) 29 | 30 | def __call__(self, inputs): 31 | with tf.variable_scope(self.scope): 32 | # If activation==None, this returns a simple Linear projection 33 | # else the projection will be passed through an activation function 34 | # output = tf.layers.dense(inputs, units=self.shape, activation=self.activation, 35 | # name='projection_{}'.format(self.scope)) 36 | return self.dense(inputs) 37 | 38 | 39 | class StopProjection: 40 | """Projection to a scalar and through a sigmoid activation 41 | """ 42 | def __init__(self, is_training, shape=1, activation=tf.nn.sigmoid, scope=None): 43 | """ 44 | Args: 45 | is_training: Boolean, to control the use of sigmoid function as it is useless to use it 46 | during training since it is integrate inside the sigmoid_crossentropy loss 47 | shape: integer, dimensionality of output space. 
48 | activation: callable, activation function. Only used during inference.
49 | scope: StopProjection scope.
50 | """
51 | super(StopProjection, self).__init__()
52 | 
53 | self.is_training = is_training
54 | self.shape = shape
55 | self.activation = activation
56 | self.scope = 'stop_token_projection' if scope is None else scope
57 | 
58 | def __call__(self, inputs):
59 | with tf.variable_scope(self.scope):
60 | output = tf.layers.dense(inputs, units=self.shape, activation=None, name='projection_{}'.format(self.scope))
61 | #During training, don't apply the activation, as it is integrated inside the sigmoid_cross_entropy loss function
62 | return output if self.is_training else self.activation(output)
63 | 
64 | 
65 | class TacotronDecoderCellState(
66 | collections.namedtuple("TacotronDecoderCellState",
67 | ("cell_state", "attention", "time", "alignments",
68 | "alignment_history"))):
69 | """`namedtuple` storing the state of a `TacotronDecoderCell`.
70 | Contains:
71 | - `cell_state`: The state of the wrapped `RNNCell` at the previous time
72 | step.
73 | - `attention`: The attention emitted at the previous time step.
74 | - `time`: int32 scalar containing the current time step.
75 | - `alignments`: A single or tuple of `Tensor`(s) containing the alignments
76 | emitted at the previous time step for each attention mechanism.
77 | - `alignment_history`: a single or tuple of `TensorArray`(s)
78 | containing alignment matrices from all time steps for each attention
79 | mechanism. Call `stack()` on each to convert to a `Tensor`.
80 | """
81 | def replace(self, **kwargs):
82 | """Clones the current state while overwriting components provided by kwargs.
83 | """
84 | return super(TacotronDecoderCellState, self)._replace(**kwargs)
85 | 
86 | 
87 | class TacotronDecoderWrapper(RNNCell):
88 | """Tacotron 2 Decoder Cell
89 | Decodes encoder output and previous mel frames into the next r frames
90 | 
91 | Decoder Step i:
92 | 1) Prenet to compress last output information
93 | 2) Concat compressed inputs with previous context vector (input feeding) *
94 | 3) Decoder RNN (actual decoding) to predict current state s_{i} *
95 | 4) Compute new context vector c_{i} based on s_{i} and a cumulative sum of previous alignments *
96 | 5) Predict new output y_{i} using s_{i} and c_{i} (concatenated)
97 | 6) Predict stop token output ys_{i} using s_{i} and c_{i} (concatenated)
98 | 
99 | * : This typically means taking a vanilla LSTM, wrapping it with tensorflow's attention wrapper,
100 | and wrapping that with the prenet before doing input feeding, and with the prediction layer
101 | that uses the RNN states to project onto the output space. The steps marked with (*) could be replaced by
102 | tensorflow's attention wrapper call if it used cumulative alignments instead of only the previous alignments.
103 | """ 104 | 105 | def __init__(self, is_training, attention_mechanism, rnn_cell, frame_projection, stop_projection): 106 | """Initialize decoder parameters 107 | 108 | Args: 109 | prenet: A tensorflow fully connected layer acting as the decoder pre-net 110 | attention_mechanism: A _BaseAttentionMechanism instance, usefull to 111 | learn encoder-decoder alignments 112 | rnn_cell: Instance of RNNCell, main body of the decoder 113 | frame_projection: tensorflow fully connected layer with r * num_mels output units 114 | stop_projection: tensorflow fully connected layer, expected to project to a scalar 115 | and through a sigmoid activation 116 | mask_finished: Boolean, Whether to mask decoder frames after the 117 | """ 118 | super(TacotronDecoderWrapper, self).__init__() 119 | #Initialize decoder layers 120 | self._training = is_training 121 | self._attention_mechanism = attention_mechanism 122 | self._cell = rnn_cell 123 | self._frame_projection = frame_projection 124 | self._stop_projection = stop_projection 125 | self._attention_layer_size = self._attention_mechanism.values.get_shape()[-1].value 126 | 127 | def _batch_size_checks(self, batch_size, error_message): 128 | return [check_ops.assert_equal(batch_size, 129 | self._attention_mechanism.batch_size, 130 | message=error_message)] 131 | 132 | @property 133 | def output_size(self): 134 | return self._frame_projection.shape 135 | 136 | # @property 137 | def state_size(self): 138 | """The `state_size` property of `TacotronDecoderWrapper`. 139 | 140 | Returns: 141 | An `TacotronDecoderWrapper` tuple containing shapes used by this object. 142 | """ 143 | return TacotronDecoderCellState( 144 | cell_state=self._cell._cell.state_size, 145 | time=tensor_shape.TensorShape([]), 146 | attention=self._attention_layer_size, 147 | alignments=self._attention_mechanism.alignments_size, 148 | alignment_history=()) 149 | 150 | def zero_state(self, batch_size, dtype): 151 | """Return an initial (zero) state tuple for this `AttentionWrapper`. 152 | 153 | Args: 154 | batch_size: `0D` integer tensor: the batch size. 155 | dtype: The internal state data type. 156 | Returns: 157 | An `TacotronDecoderCellState` tuple containing zeroed out tensors and, 158 | possibly, empty `TensorArray` objects. 159 | Raises: 160 | ValueError: (or, possibly at runtime, InvalidArgument), if 161 | `batch_size` does not match the output size of the encoder passed 162 | to the wrapper object at initialization time. 
163 | """ 164 | with ops.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]): 165 | cell_state = self._cell.zero_state(batch_size, dtype) 166 | error_message = ( 167 | "When calling zero_state of TacotronDecoderCell %s: " % self._base_name + 168 | "Non-matching batch sizes between the memory " 169 | "(encoder output) and the requested batch size.") 170 | with ops.control_dependencies( 171 | self._batch_size_checks(batch_size, error_message)): 172 | cell_state = nest.map_structure( 173 | lambda s: array_ops.identity(s, name="checked_cell_state"), 174 | cell_state) 175 | return TacotronDecoderCellState( 176 | cell_state=cell_state, 177 | time=array_ops.zeros([], dtype=tf.int32), 178 | attention=rnn_cell_impl._zero_state_tensors(self._attention_layer_size, batch_size, dtype), 179 | alignments=self._attention_mechanism.initial_alignments(batch_size, dtype), 180 | alignment_history=tensor_array_ops.TensorArray(dtype=dtype, size=0, 181 | dynamic_size=True)) 182 | 183 | 184 | def __call__(self, inputs, state): 185 | #Information bottleneck (essential for learning attention) 186 | prenet_output = prenet(inputs, self._training, hp.prenet_depths, scope='decoder_prenet') 187 | 188 | #Concat context vector and prenet output to form RNN cells input (input feeding) 189 | rnn_input = tf.concat([prenet_output, state.attention], axis=-1) 190 | 191 | #Unidirectional RNN layers 192 | rnn_output, next_cell_state = self._cell(tf.layers.dense(rnn_input, hp.decoder_depth), state.cell_state) 193 | 194 | #Compute the attention (context) vector and alignments using 195 | #the new decoder cell hidden state as query vector 196 | #and cumulative alignments to extract location features 197 | #The choice of the new cell hidden state (s_{i}) of the last 198 | #decoder RNN Cell is based on Luong et Al. (2015): 199 | #https://arxiv.org/pdf/1508.04025.pdf 200 | previous_alignments = state.alignments 201 | previous_alignment_history = state.alignment_history 202 | context_vector, alignments, cumulated_alignments = _compute_attention(self._attention_mechanism, 203 | rnn_output, 204 | previous_alignments, 205 | attention_layer=None) 206 | 207 | #Concat RNN outputs and context vector to form projections inputs 208 | projections_input = tf.concat([rnn_output, context_vector], axis=-1) 209 | 210 | #Compute predicted frames and predicted 211 | cell_outputs = self._frame_projection(projections_input) 212 | stop_tokens = self._stop_projection(projections_input) 213 | 214 | #Save alignment history 215 | alignment_history = previous_alignment_history.write(state.time, alignments) 216 | 217 | #Prepare next decoder state 218 | next_state = TacotronDecoderCellState( 219 | time=state.time + 1, 220 | cell_state=next_cell_state, 221 | attention=context_vector, 222 | alignments=cumulated_alignments, 223 | alignment_history=alignment_history) 224 | 225 | return (cell_outputs, stop_tokens), next_state 226 | --------------------------------------------------------------------------------