├── tests ├── __init__.py ├── cmudict_test.py ├── text_test.py └── numbers_test.py ├── datasets ├── __init__.py ├── ljspeech.py ├── blizzard.py ├── bible.py └── datafeeder.py ├── .gitignore ├── models ├── __init__.py ├── rnn_wrappers.py ├── helpers.py ├── tacotron.py ├── tacotron2.py └── modules.py ├── requirements.txt ├── util ├── __init__.py ├── plot.py ├── infolog.py └── audio.py ├── LICENSE ├── synthesizer.py ├── hparams.py ├── text ├── cmudict.py ├── symbols.py ├── numbers.py ├── cleaners.py ├── __init__.py ├── kor_dic.py └── korean.py ├── eval.py ├── preprocess.py ├── TRAINING_DATA.md ├── demo_server.py ├── README.md ├── LJSpeech-1.1 └── README └── train.py /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /datasets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | __pycache__/ 3 | .cache/ 4 | *.pyc 5 | .DS_Store 6 | run*.sh 7 | *.wav 8 | *.npy 9 | *.json 10 | .ipynb_checkpoints/ 11 | training/ 12 | logs-*/ 13 | 14 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- 1 | from .tacotron2 import Tacotron2 2 | 3 | 4 | def create_model(name, hparams): 5 | if name == 'tacotron': 6 | return Tacotron2(hparams) 7 | else: 8 | raise Exception('Unknown model: ' + name) 9 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # Note: this doesn't include tensorflow or tensorflow-gpu because the package you need to install 2 | # depends on your platform. It is assumed you have already installed tensorflow. 
3 | falcon==1.2.0 4 | inflect==0.2.5 5 | librosa==0.5.1 6 | matplotlib==2.0.2 7 | numpy==1.14.3 8 | scipy==0.19.0 9 | tqdm==4.11.2 10 | Unidecode==0.4.20 11 | -------------------------------------------------------------------------------- /util/__init__.py: -------------------------------------------------------------------------------- 1 | class ValueWindow(): 2 | def __init__(self, window_size=100): 3 | self._window_size = window_size 4 | self._values = [] 5 | 6 | def append(self, x): 7 | self._values = self._values[-(self._window_size - 1):] + [x] 8 | 9 | @property 10 | def sum(self): 11 | return sum(self._values) 12 | 13 | @property 14 | def count(self): 15 | return len(self._values) 16 | 17 | @property 18 | def average(self): 19 | return self.sum / max(1, self.count) 20 | 21 | def reset(self): 22 | self._values = [] 23 | -------------------------------------------------------------------------------- /util/plot.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | matplotlib.use('Agg') 3 | import matplotlib.pyplot as plt 4 | 5 | 6 | def plot_alignment(alignment, path, info=None): 7 | fig, ax = plt.subplots() 8 | im = ax.imshow( 9 | alignment, 10 | aspect='auto', 11 | origin='lower', 12 | interpolation='none') 13 | fig.colorbar(im, ax=ax) 14 | xlabel = 'Decoder timestep' 15 | if info is not None: 16 | xlabel += '\n\n' + info 17 | plt.xlabel(xlabel) 18 | plt.ylabel('Encoder timestep') 19 | plt.tight_layout() 20 | plt.savefig(path, format='png') 21 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017 Keith Ito 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 
20 | -------------------------------------------------------------------------------- /tests/cmudict_test.py: -------------------------------------------------------------------------------- 1 | import io 2 | from text import cmudict 3 | 4 | 5 | test_data = ''' 6 | ;;; # CMUdict -- Major Version: 0.07 7 | )PAREN P ER EH N 8 | 'TIS T IH Z 9 | ADVERSE AE0 D V ER1 S 10 | ADVERSE(1) AE1 D V ER2 S 11 | ADVERSE(2) AE2 D V ER1 S 12 | ADVERSELY AE0 D V ER1 S L IY0 13 | ADVERSITY AE0 D V ER1 S IH0 T IY2 14 | BARBERSHOP B AA1 R B ER0 SH AA2 P 15 | YOU'LL Y UW1 L 16 | ''' 17 | 18 | 19 | def test_cmudict(): 20 | c = cmudict.CMUDict(io.StringIO(test_data)) 21 | assert len(c) == 6 22 | assert len(cmudict.valid_symbols) == 84 23 | assert c.lookup('ADVERSITY') == ['AE0 D V ER1 S IH0 T IY2'] 24 | assert c.lookup('BarberShop') == ['B AA1 R B ER0 SH AA2 P'] 25 | assert c.lookup("You'll") == ['Y UW1 L'] 26 | assert c.lookup("'tis") == ['T IH Z'] 27 | assert c.lookup('adverse') == [ 28 | 'AE0 D V ER1 S', 29 | 'AE1 D V ER2 S', 30 | 'AE2 D V ER1 S', 31 | ] 32 | assert c.lookup('') == None 33 | assert c.lookup('foo') == None 34 | assert c.lookup(')paren') == None 35 | 36 | 37 | def test_cmudict_no_keep_ambiguous(): 38 | c = cmudict.CMUDict(io.StringIO(test_data), keep_ambiguous=False) 39 | assert len(c) == 5 40 | assert c.lookup('adversity') == ['AE0 D V ER1 S IH0 T IY2'] 41 | assert c.lookup('adverse') == None 42 | -------------------------------------------------------------------------------- /util/infolog.py: -------------------------------------------------------------------------------- 1 | import atexit 2 | from datetime import datetime 3 | import json 4 | from threading import Thread 5 | from urllib.request import Request, urlopen 6 | 7 | 8 | _format = '%Y-%m-%d %H:%M:%S.%f' 9 | _file = None 10 | _run_name = None 11 | _slack_url = None 12 | 13 | 14 | def init(filename, run_name, slack_url=None): 15 | global _file, _run_name, _slack_url 16 | _close_logfile() 17 | _file = open(filename, 'a', encoding="utf-8") 18 | _file.write('\n-----------------------------------------------------------------\n') 19 | _file.write('Starting new training run\n') 20 | _file.write('-----------------------------------------------------------------\n') 21 | _run_name = run_name 22 | _slack_url = slack_url 23 | 24 | 25 | def log(msg, slack=False): 26 | print(msg) 27 | if _file is not None: 28 | _file.write('[%s] %s\n' % (datetime.now().strftime(_format)[:-3], msg)) 29 | if slack and _slack_url is not None: 30 | Thread(target=_send_slack, args=(msg,)).start() 31 | 32 | 33 | def _close_logfile(): 34 | global _file 35 | if _file is not None: 36 | _file.close() 37 | _file = None 38 | 39 | 40 | def _send_slack(msg): 41 | req = Request(_slack_url) 42 | req.add_header('Content-Type', 'application/json') 43 | urlopen(req, json.dumps({ 44 | 'username': 'tacotron', 45 | 'icon_emoji': ':taco:', 46 | 'text': '*%s*: %s' % (_run_name, msg) 47 | }).encode()) 48 | 49 | 50 | atexit.register(_close_logfile) 51 | -------------------------------------------------------------------------------- /synthesizer.py: -------------------------------------------------------------------------------- 1 | import io 2 | import numpy as np 3 | import tensorflow as tf 4 | from hparams import hparams 5 | from librosa import effects 6 | from models import create_model 7 | from text import text_to_sequence 8 | from util import audio 9 | # from g2pk import G2p 10 | 11 | 12 | class Synthesizer: 13 | def load(self, checkpoint_path, model_name='tacotron'): 14 | 
print('Constructing model: %s' % model_name) 15 | inputs = tf.placeholder(tf.int32, [1, None], 'inputs') 16 | input_lengths = tf.placeholder(tf.int32, [1], 'input_lengths') 17 | with tf.variable_scope('model') as scope: 18 | self.model = create_model(model_name, hparams) 19 | self.model.initialize(inputs, input_lengths) 20 | self.wav_output = audio.inv_spectrogram_tensorflow(self.model.linear_outputs[0]) 21 | 22 | print('Loading checkpoint: %s' % checkpoint_path) 23 | self.session = tf.Session() 24 | self.session.run(tf.global_variables_initializer()) 25 | saver = tf.train.Saver() 26 | saver.restore(self.session, checkpoint_path) 27 | 28 | def synthesize(self, text): 29 | cleaner_names = [x.strip() for x in hparams.cleaners.split(',')] 30 | # g2p = G2p() 31 | seq = text_to_sequence(text, cleaner_names) 32 | feed_dict = { 33 | self.model.inputs: [np.asarray(seq, dtype=np.int32)], 34 | self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32) 35 | } 36 | wav = self.session.run(self.wav_output, feed_dict=feed_dict) 37 | wav = audio.inv_preemphasis(wav) 38 | wav = wav[:audio.find_endpoint(wav)] 39 | out = io.BytesIO() 40 | audio.save_wav(wav, out) 41 | return out.getvalue() 42 | -------------------------------------------------------------------------------- /hparams.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | # Default hyperparameters: 4 | hparams = tf.contrib.training.HParams( 5 | # Comma-separated list of cleaners to run on text prior to training and eval. For non-English 6 | # text, you may want to use "basic_cleaners" or "transliteration_cleaners" See TRAINING_DATA.md. 7 | cleaners='korean_cleaners', 8 | # Audio: 9 | num_mels=80, 10 | num_freq=1025, 11 | sample_rate=21000, 12 | frame_length_ms=50, 13 | frame_shift_ms=12.5, 14 | preemphasis=0.97, 15 | min_level_db=-100, 16 | ref_level_db=20, 17 | 18 | # Encoder: 19 | embed_depth=512, 20 | encoder_conv_filter=512, 21 | encoder_conv_kernel=5, 22 | encoder_stack_size=3, 23 | encoder_lstm_hidden_dim=256, 24 | # Model: 25 | outputs_per_step=5, 26 | prenet_depths=[256, 256], 27 | encoder_depth=256, 28 | postnet_depth=256, 29 | attention_depth=256, 30 | attention_filters = 32, 31 | attention_kernel = (31, ), 32 | attention_dim = 128, 33 | decoder_depth=256, 34 | synthesis_constraint = False, 35 | synthesis_constraint_type = 'window', 36 | attention_win_size = 7, 37 | attention_type = 'loc_sen', 38 | cumulative_weights = True, 39 | reg_weight = 1e-6, 40 | 41 | # Training: 42 | batch_size=32, 43 | adam_beta1=0.9, 44 | adam_beta2=0.999, 45 | initial_learning_rate=0.002, 46 | decay_learning_rate=True, 47 | use_cmudict=False, # Use CMUDict during training to learn pronunciation of ARPAbet phonemes 48 | 49 | # Eval: 50 | max_iters=500, 51 | griffin_lim_iters=60, 52 | power=1.5, # Power to raise magnitudes to prior to Griffin-Lim 53 | ) 54 | 55 | 56 | def hparams_debug_string(): 57 | values = hparams.values() 58 | hp = [' %s: %s' % (name, values[name]) for name in sorted(values)] 59 | return 'Hyperparameters:\n' + '\n'.join(hp) 60 | -------------------------------------------------------------------------------- /models/rnn_wrappers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from tensorflow.contrib.rnn import RNNCell 4 | from .modules import prenet 5 | 6 | 7 | class DecoderPrenetWrapper(RNNCell): 8 | '''Runs RNN inputs through a prenet before sending them to the cell.''' 9 | 
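  # Illustrative usage sketch (not part of the original file; the cell size and the
  # is_training flag are assumptions based on hparams.py and the modules.prenet import above):
  #
  #   from tensorflow.contrib.rnn import GRUCell
  #   cell = DecoderPrenetWrapper(GRUCell(256), is_training, hparams.prenet_depths)
  #
  # Every input fed to the wrapped GRU cell then passes through the prenet
  # (fully-connected layers of sizes hparams.prenet_depths) before reaching the cell.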
10 | def __init__(self, cell, is_training, layer_sizes): 11 | super(DecoderPrenetWrapper, self).__init__() 12 | self._cell = cell 13 | self._is_training = is_training 14 | self._layer_sizes = layer_sizes 15 | 16 | @property 17 | def state_size(self): 18 | return self._cell.state_size 19 | 20 | @property 21 | def output_size(self): 22 | return self._cell.output_size 23 | 24 | def call(self, inputs, state): 25 | prenet_out = prenet(inputs, self._is_training, self._layer_sizes, scope='decoder_prenet') 26 | return self._cell(prenet_out, state) 27 | 28 | def zero_state(self, batch_size, dtype): 29 | return self._cell.zero_state(batch_size, dtype) 30 | 31 | 32 | class ConcatOutputAndAttentionWrapper(RNNCell): 33 | '''Concatenates RNN cell output with the attention context vector. 34 | This is expected to wrap a cell wrapped with an AttentionWrapper constructed with 35 | attention_layer_size=None and output_attention=False. Such a cell's state will include an 36 | "attention" field that is the context vector. 37 | ''' 38 | 39 | def __init__(self, cell): 40 | super(ConcatOutputAndAttentionWrapper, self).__init__() 41 | self._cell = cell 42 | 43 | @property 44 | def state_size(self): 45 | return self._cell.state_size 46 | 47 | @property 48 | def output_size(self): 49 | return self._cell.output_size + self._cell.state_size.attention 50 | 51 | def call(self, inputs, state): 52 | output, res_state = self._cell(inputs, state) 53 | return tf.concat([output, res_state.attention], axis=-1), res_state 54 | 55 | def zero_state(self, batch_size, dtype): 56 | return self._cell.zero_state(batch_size, dtype) -------------------------------------------------------------------------------- /text/cmudict.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | valid_symbols = [ 5 | 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2', 6 | 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2', 7 | 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY', 8 | 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1', 9 | 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 10 | 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 11 | 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH' 12 | ] 13 | 14 | _valid_symbol_set = set(valid_symbols) 15 | 16 | 17 | class CMUDict: 18 | '''Thin wrapper around CMUDict data. 
http://www.speech.cs.cmu.edu/cgi-bin/cmudict''' 19 | def __init__(self, file_or_path, keep_ambiguous=True): 20 | if isinstance(file_or_path, str): 21 | with open(file_or_path, encoding='latin-1') as f: 22 | entries = _parse_cmudict(f) 23 | else: 24 | entries = _parse_cmudict(file_or_path) 25 | if not keep_ambiguous: 26 | entries = {word: pron for word, pron in entries.items() if len(pron) == 1} 27 | self._entries = entries 28 | 29 | 30 | def __len__(self): 31 | return len(self._entries) 32 | 33 | 34 | def lookup(self, word): 35 | '''Returns list of ARPAbet pronunciations of the given word.''' 36 | return self._entries.get(word.upper()) 37 | 38 | 39 | 40 | _alt_re = re.compile(r'\([0-9]+\)') 41 | 42 | 43 | def _parse_cmudict(file): 44 | cmudict = {} 45 | for line in file: 46 | if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"): 47 | parts = line.split(' ') 48 | word = re.sub(_alt_re, '', parts[0]) 49 | pronunciation = _get_pronunciation(parts[1]) 50 | if pronunciation: 51 | if word in cmudict: 52 | cmudict[word].append(pronunciation) 53 | else: 54 | cmudict[word] = [pronunciation] 55 | return cmudict 56 | 57 | 58 | def _get_pronunciation(s): 59 | parts = s.strip().split(' ') 60 | for part in parts: 61 | if part not in _valid_symbol_set: 62 | return None 63 | return ' '.join(parts) 64 | -------------------------------------------------------------------------------- /text/symbols.py: -------------------------------------------------------------------------------- 1 | # ''' 2 | # Defines the set of symbols used in text input to the model. 3 | # 4 | # The default is a set of ASCII characters that works well for English or text that has been run 5 | # through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. 6 | # ''' 7 | # from text import cmudict 8 | # 9 | # _pad = '_' 10 | # _eos = '~' 11 | # _characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? ' 12 | # 13 | # # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): 14 | # _arpabet = ['@' + s for s in cmudict.valid_symbols] 15 | # 16 | # # Export all symbols: 17 | # symbols = [_pad, _eos] + list(_characters) + _arpabet 18 | 19 | # coding: utf-8 20 | ''' 21 | Defines the set of symbols used in text input to the model. 22 | 23 | The default is a set of ASCII characters that works well for English or text that has been run 24 | through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. 25 | ''' 26 | from jamo import h2j, j2h 27 | from jamo.jamo import _jamo_char_to_hcj 28 | 29 | from .korean import ALL_SYMBOLS, PAD, EOS 30 | 31 | # For english 32 | en_symbols = PAD + EOS + 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? ' # <-For deployment(Because korean ALL_SYMBOLS follow this convention) 33 | 34 | symbols = ALL_SYMBOLS # for korean 35 | 36 | """ 37 | 초성과 종성은 같아보이지만, 다른 character이다. 38 | '_~ᄀᄁᄂᄃᄄᄅᄆᄇᄈᄉᄊᄋᄌᄍᄎᄏᄐᄑ하ᅢᅣᅤᅥᅦᅧᅨᅩᅪᅫᅬᅭᅮᅯᅰᅱᅲᅳᅴᅵᆨᆩᆪᆫᆬᆭᆮᆯᆰᆱᆲᆳᆴᆵᆶᆷᆸᆹᆺᆻᆼᆽᆾᆿᇀᇁᇂ!'(),-.:;? 
' 39 | '_': 0, '~': 1, 'ᄀ': 2, 'ᄁ': 3, 'ᄂ': 4, 'ᄃ': 5, 'ᄄ': 6, 'ᄅ': 7, 'ᄆ': 8, 'ᄇ': 9, 'ᄈ': 10, 40 | 'ᄉ': 11, 'ᄊ': 12, 'ᄋ': 13, 'ᄌ': 14, 'ᄍ': 15, 'ᄎ': 16, 'ᄏ': 17, 'ᄐ': 18, 'ᄑ': 19, 'ᄒ': 20, 41 | 'ᅡ': 21, 'ᅢ': 22, 'ᅣ': 23, 'ᅤ': 24, 'ᅥ': 25, 'ᅦ': 26, 'ᅧ': 27, 'ᅨ': 28, 'ᅩ': 29, 'ᅪ': 30, 42 | 'ᅫ': 31, 'ᅬ': 32, 'ᅭ': 33, 'ᅮ': 34, 'ᅯ': 35, 'ᅰ': 36, 'ᅱ': 37, 'ᅲ': 38, 'ᅳ': 39, 'ᅴ': 40, 43 | 'ᅵ': 41, 'ᆨ': 42, 'ᆩ': 43, 'ᆪ': 44, 'ᆫ': 45, 'ᆬ': 46, 'ᆭ': 47, 'ᆮ': 48, 'ᆯ': 49, 'ᆰ': 50, 44 | 'ᆱ': 51, 'ᆲ': 52, 'ᆳ': 53, 'ᆴ': 54, 'ᆵ': 55, 'ᆶ': 56, 'ᆷ': 57, 'ᆸ': 58, 'ᆹ': 59, 'ᆺ': 60, 45 | 'ᆻ': 61, 'ᆼ': 62, 'ᆽ': 63, 'ᆾ': 64, 'ᆿ': 65, 'ᇀ': 66, 'ᇁ': 67, 'ᇂ': 68, '!': 69, "'": 70, 46 | '(': 71, ')': 72, ',': 73, '-': 74, '.': 75, ':': 76, ';': 77, '?': 78, ' ': 79 47 | """ -------------------------------------------------------------------------------- /eval.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import re 4 | from hparams import hparams, hparams_debug_string 5 | from synthesizer import Synthesizer 6 | 7 | 8 | # sentences = [ 9 | # # '완전히 쾅 닫힌 대화창 뿐이네', 10 | # # '정성스럽게 적었던 거야', 11 | # # '나는 큰 결심을 하고서 보낸 문잔데', 12 | # # '모든걸 마무리 해버렸어', 13 | # # '이모티콘 하나마저 조심스럽게 보냈어', 14 | # # '너가 잘해야지', 15 | # # '새해 복만으로는 안돼', 16 | # # 장기하와 얼굴들 ㅋ 가사: 17 | # '신진 샹숑가수의 신춘 샹숑쇼우', 18 | # '철수 책상 철 책상', 19 | # '창경원 창살은 쌍창살', 20 | # '스위스에서 온 스미스씨', 21 | # # 장기하와 얼굴들 새해복 가사: 22 | # '간장 공장 공장장', 23 | # '한양양장점 옆 한양양장점', 24 | # '후회한 시간을 후회할 거잖아', 25 | # ] 26 | 27 | 28 | def get_output_base_path(checkpoint_path): 29 | base_dir = os.path.dirname(checkpoint_path) 30 | m = re.compile(r'.*?\.ckpt\-([0-9]+)').match(checkpoint_path) 31 | name = 'eval-char-%d' % int(m.group(1)) if m else 'eval' 32 | return os.path.join(base_dir, name) 33 | 34 | 35 | def run_eval(args): 36 | print(hparams_debug_string()) 37 | synth = Synthesizer() 38 | synth.load(args.checkpoint) 39 | base_path = get_output_base_path(args.checkpoint) 40 | sentences=[] 41 | with open('./eval_char.txt', encoding='utf-8') as f: 42 | for line in f: 43 | try: 44 | parts = line.strip().replace('"', '').split('|') 45 | text = parts[3] 46 | sentences.append(text) 47 | except: 48 | pass 49 | for i, text in enumerate(sentences): 50 | path = '%s-%d.wav' % (base_path, i) 51 | print('Synthesizing: %s' % path) 52 | with open(path, 'wb') as f: 53 | f.write(synth.synthesize(text)) 54 | 55 | 56 | def main(): 57 | parser = argparse.ArgumentParser() 58 | parser.add_argument('--checkpoint', required=True, help='Path to model checkpoint') 59 | parser.add_argument('--hparams', default='', 60 | help='Hyperparameter overrides as a comma-separated list of name=value pairs') 61 | parser.add_argument('--gpu', default='1') 62 | args = parser.parse_args() 63 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 64 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 65 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu 66 | hparams.parse(args.hparams) 67 | run_eval(args) 68 | 69 | 70 | if __name__ == '__main__': 71 | main() 72 | -------------------------------------------------------------------------------- /tests/text_test.py: -------------------------------------------------------------------------------- 1 | from text import cleaners, symbols, text_to_sequence, sequence_to_text 2 | from unidecode import unidecode 3 | 4 | 5 | def test_symbols(): 6 | assert len(symbols) >= 3 7 | assert symbols[0] == '_' 8 | assert symbols[1] == '~' 9 | 10 | 11 | def test_text_to_sequence(): 12 | assert text_to_sequence('', []) == [1] 13 | assert 
text_to_sequence('Hi!', []) == [9, 36, 54, 1] 14 | assert text_to_sequence('"A"_B', []) == [2, 3, 1] 15 | assert text_to_sequence('A {AW1 S} B', []) == [2, 64, 83, 132, 64, 3, 1] 16 | assert text_to_sequence('Hi', ['lowercase']) == [35, 36, 1] 17 | assert text_to_sequence('A {AW1 S} B', ['english_cleaners']) == [28, 64, 83, 132, 64, 29, 1] 18 | 19 | 20 | def test_sequence_to_text(): 21 | assert sequence_to_text([]) == '' 22 | assert sequence_to_text([1]) == '~' 23 | assert sequence_to_text([9, 36, 54, 1]) == 'Hi!~' 24 | assert sequence_to_text([2, 64, 83, 132, 64, 3]) == 'A {AW1 S} B' 25 | 26 | 27 | def test_collapse_whitespace(): 28 | assert cleaners.collapse_whitespace('') == '' 29 | assert cleaners.collapse_whitespace(' ') == ' ' 30 | assert cleaners.collapse_whitespace('x') == 'x' 31 | assert cleaners.collapse_whitespace(' x. y, \tz') == ' x. y, z' 32 | 33 | 34 | def test_convert_to_ascii(): 35 | assert cleaners.convert_to_ascii("raison d'être") == "raison d'etre" 36 | assert cleaners.convert_to_ascii('grüß gott') == 'gruss gott' 37 | assert cleaners.convert_to_ascii('안녕') == 'annyeong' 38 | assert cleaners.convert_to_ascii('Здравствуйте') == 'Zdravstvuite' 39 | 40 | 41 | def test_lowercase(): 42 | assert cleaners.lowercase('Happy Birthday!') == 'happy birthday!' 43 | assert cleaners.lowercase('CAFÉ') == 'café' 44 | 45 | 46 | def test_expand_abbreviations(): 47 | assert cleaners.expand_abbreviations('mr. and mrs. smith') == 'mister and misess smith' 48 | 49 | 50 | def test_expand_numbers(): 51 | assert cleaners.expand_numbers('3 apples and 44 pears') == 'three apples and forty-four pears' 52 | assert cleaners.expand_numbers('$3.50 for gas.') == 'three dollars, fifty cents for gas.' 53 | 54 | 55 | def test_cleaner_pipelines(): 56 | text = 'Mr. Müller ate 2 Apples' 57 | assert cleaners.english_cleaners(text) == 'mister muller ate two apples' 58 | assert cleaners.transliteration_cleaners(text) == 'mr. muller ate 2 apples' 59 | assert cleaners.basic_cleaners(text) == 'mr. 
müller ate 2 apples' 60 | 61 | -------------------------------------------------------------------------------- /text/numbers.py: -------------------------------------------------------------------------------- 1 | import inflect 2 | import re 3 | 4 | 5 | _inflect = inflect.engine() 6 | _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') 7 | _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') 8 | _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') 9 | _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') 10 | _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') 11 | _number_re = re.compile(r'[0-9]+') 12 | 13 | 14 | def _remove_commas(m): 15 | return m.group(1).replace(',', '') 16 | 17 | 18 | def _expand_decimal_point(m): 19 | return m.group(1).replace('.', ' point ') 20 | 21 | 22 | def _expand_dollars(m): 23 | match = m.group(1) 24 | parts = match.split('.') 25 | if len(parts) > 2: 26 | return match + ' dollars' # Unexpected format 27 | dollars = int(parts[0]) if parts[0] else 0 28 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 29 | if dollars and cents: 30 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 31 | cent_unit = 'cent' if cents == 1 else 'cents' 32 | return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) 33 | elif dollars: 34 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 35 | return '%s %s' % (dollars, dollar_unit) 36 | elif cents: 37 | cent_unit = 'cent' if cents == 1 else 'cents' 38 | return '%s %s' % (cents, cent_unit) 39 | else: 40 | return 'zero dollars' 41 | 42 | 43 | def _expand_ordinal(m): 44 | return _inflect.number_to_words(m.group(0)) 45 | 46 | 47 | def _expand_number(m): 48 | num = int(m.group(0)) 49 | if num > 1000 and num < 3000: 50 | if num == 2000: 51 | return 'two thousand' 52 | elif num > 2000 and num < 2010: 53 | return 'two thousand ' + _inflect.number_to_words(num % 100) 54 | elif num % 100 == 0: 55 | return _inflect.number_to_words(num // 100) + ' hundred' 56 | else: 57 | return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') 58 | else: 59 | return _inflect.number_to_words(num, andword='') 60 | 61 | 62 | def normalize_numbers(text): 63 | text = re.sub(_comma_number_re, _remove_commas, text) 64 | text = re.sub(_pounds_re, r'\1 pounds', text) 65 | text = re.sub(_dollars_re, _expand_dollars, text) 66 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 67 | text = re.sub(_ordinal_re, _expand_ordinal, text) 68 | text = re.sub(_number_re, _expand_number, text) 69 | return text 70 | -------------------------------------------------------------------------------- /text/cleaners.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | # Code based on https://github.com/keithito/tacotron/blob/master/text/cleaners.py 4 | 5 | import re 6 | from .korean import tokenize as ko_tokenize 7 | 8 | # # Added to support LJ_speech 9 | # from unidecode import unidecode 10 | # from .en_numbers import normalize_numbers as en_normalize_numbers 11 | 12 | # Regular expression matching whitespace: 13 | _whitespace_re = re.compile(r'\s+') 14 | 15 | 16 | def korean_cleaners(text): 17 | '''Pipeline for Korean text, including number and abbreviation expansion.''' 18 | text = ko_tokenize(text) # '존경하는' --> ['ᄌ', 'ᅩ', 'ᆫ', 'ᄀ', 'ᅧ', 'ᆼ', 'ᄒ', 'ᅡ', 'ᄂ', 'ᅳ', 'ᆫ', '~'] 19 | return text 20 | 21 | 22 | # # List of (regular expression, replacement) pairs for abbreviations: 23 | # _abbreviations = [(re.compile('\\b%s\\.' 
% x[0], re.IGNORECASE), x[1]) for x in [ 24 | # ('mrs', 'misess'), 25 | # ('mr', 'mister'), 26 | # ('dr', 'doctor'), 27 | # ('st', 'saint'), 28 | # ('co', 'company'), 29 | # ('jr', 'junior'), 30 | # ('maj', 'major'), 31 | # ('gen', 'general'), 32 | # ('drs', 'doctors'), 33 | # ('rev', 'reverend'), 34 | # ('lt', 'lieutenant'), 35 | # ('hon', 'honorable'), 36 | # ('sgt', 'sergeant'), 37 | # ('capt', 'captain'), 38 | # ('esq', 'esquire'), 39 | # ('ltd', 'limited'), 40 | # ('col', 'colonel'), 41 | # ('ft', 'fort'), 42 | # ]] 43 | 44 | 45 | # def expand_abbreviations(text): 46 | # for regex, replacement in _abbreviations: 47 | # text = re.sub(regex, replacement, text) 48 | # return text 49 | # 50 | # 51 | # def expand_numbers(text): 52 | # return en_normalize_numbers(text) 53 | 54 | 55 | def lowercase(text): 56 | return text.lower() 57 | 58 | 59 | def collapse_whitespace(text): 60 | return re.sub(_whitespace_re, ' ', text) 61 | 62 | 63 | # def convert_to_ascii(text): 64 | # return unidecode(text) 65 | 66 | 67 | def basic_cleaners(text): 68 | text = lowercase(text) 69 | text = collapse_whitespace(text) 70 | return text 71 | 72 | 73 | # def transliteration_cleaners(text): 74 | # # text = convert_to_ascii(text) 75 | # text = lowercase(text) 76 | # text = collapse_whitespace(text) 77 | # return text 78 | # 79 | # 80 | # def english_cleaners(text): 81 | # text = convert_to_ascii(text) 82 | # text = lowercase(text) 83 | # text = expand_numbers(text) 84 | # text = expand_abbreviations(text) 85 | # text = collapse_whitespace(text) 86 | # return text 87 | -------------------------------------------------------------------------------- /tests/numbers_test.py: -------------------------------------------------------------------------------- 1 | from text.numbers import normalize_numbers 2 | 3 | 4 | def test_normalize_numbers(): 5 | assert normalize_numbers('1') == 'one' 6 | assert normalize_numbers('15') == 'fifteen' 7 | assert normalize_numbers('24') == 'twenty-four' 8 | assert normalize_numbers('100') == 'one hundred' 9 | assert normalize_numbers('101') == 'one hundred one' 10 | assert normalize_numbers('456') == 'four hundred fifty-six' 11 | assert normalize_numbers('1000') == 'one thousand' 12 | assert normalize_numbers('1800') == 'eighteen hundred' 13 | assert normalize_numbers('2,000') == 'two thousand' 14 | assert normalize_numbers('3000') == 'three thousand' 15 | assert normalize_numbers('18000') == 'eighteen thousand' 16 | assert normalize_numbers('24,000') == 'twenty-four thousand' 17 | assert normalize_numbers('124,001') == 'one hundred twenty-four thousand one' 18 | assert normalize_numbers('6.4 sec') == 'six point four sec' 19 | 20 | 21 | def test_normalize_ordinals(): 22 | assert normalize_numbers('1st') == 'first' 23 | assert normalize_numbers('2nd') == 'second' 24 | assert normalize_numbers('9th') == 'ninth' 25 | assert normalize_numbers('243rd place') == 'two hundred and forty-third place' 26 | 27 | 28 | def test_normalize_dates(): 29 | assert normalize_numbers('1400') == 'fourteen hundred' 30 | assert normalize_numbers('1901') == 'nineteen oh one' 31 | assert normalize_numbers('1999') == 'nineteen ninety-nine' 32 | assert normalize_numbers('2000') == 'two thousand' 33 | assert normalize_numbers('2004') == 'two thousand four' 34 | assert normalize_numbers('2010') == 'twenty ten' 35 | assert normalize_numbers('2012') == 'twenty twelve' 36 | assert normalize_numbers('2025') == 'twenty twenty-five' 37 | assert normalize_numbers('September 11, 2001') == 'September eleven, two thousand one' 38 
| assert normalize_numbers('July 26, 1984.') == 'July twenty-six, nineteen eighty-four.' 39 | 40 | 41 | def test_normalize_money(): 42 | assert normalize_numbers('$0.00') == 'zero dollars' 43 | assert normalize_numbers('$1') == 'one dollar' 44 | assert normalize_numbers('$10') == 'ten dollars' 45 | assert normalize_numbers('$.01') == 'one cent' 46 | assert normalize_numbers('$0.25') == 'twenty-five cents' 47 | assert normalize_numbers('$5.00') == 'five dollars' 48 | assert normalize_numbers('$5.01') == 'five dollars, one cent' 49 | assert normalize_numbers('$135.99.') == 'one hundred thirty-five dollars, ninety-nine cents.' 50 | assert normalize_numbers('$40,000') == 'forty thousand dollars' 51 | assert normalize_numbers('for £2500!') == 'for twenty-five hundred pounds!' 52 | -------------------------------------------------------------------------------- /preprocess.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from multiprocessing import cpu_count 4 | from tqdm import tqdm 5 | from datasets import blizzard, ljspeech, bible 6 | from hparams import hparams 7 | 8 | 9 | def preprocess_blizzard(args): 10 | in_dir = os.path.join(args.base_dir, 'Blizzard2012') 11 | out_dir = os.path.join(args.base_dir, args.output) 12 | os.makedirs(out_dir, exist_ok=True) 13 | metadata = blizzard.build_from_path(in_dir, out_dir, args.num_workers, tqdm=tqdm) 14 | write_metadata(metadata, out_dir) 15 | 16 | 17 | def preprocess_ljspeech(args): 18 | in_dir = os.path.join(args.base_dir, 'LJSpeech-1.1') 19 | out_dir = os.path.join(args.base_dir, args.output) 20 | os.makedirs(out_dir, exist_ok=True) 21 | metadata = ljspeech.build_from_path(in_dir, out_dir, args.num_workers, tqdm=tqdm) 22 | write_metadata(metadata, out_dir) 23 | 24 | 25 | def preprocess_bible(args): 26 | in_dir = os.path.join(args.base_dir, 'bible') 27 | out_dir = os.path.join(args.base_dir, args.output) 28 | os.makedirs(out_dir, exist_ok=True) 29 | metadata = bible.build_from_path(in_dir, out_dir, args.num_workers, tqdm=tqdm) 30 | write_metadata(metadata, out_dir) 31 | 32 | 33 | def preprocess_kss(args): 34 | in_dir = os.path.join(args.base_dir, 'kss') 35 | out_dir = os.path.join(args.base_dir, args.output) 36 | os.makedirs(out_dir, exist_ok=True) 37 | metadata = bible.build_from_path(in_dir, out_dir, args.num_workers, tqdm=tqdm) 38 | write_metadata(metadata, out_dir) 39 | 40 | 41 | def write_metadata(metadata, out_dir): 42 | with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f: 43 | for m in metadata: 44 | f.write('|'.join([str(x) for x in m]) + '\n') 45 | frames = sum([m[2] for m in metadata]) 46 | hours = frames * hparams.frame_shift_ms / (3600 * 1000) 47 | print('Wrote %d utterances, %d frames (%.2f hours)' % (len(metadata), frames, hours)) 48 | print('Max input length: %d' % max(len(m[3]) for m in metadata)) 49 | print('Max output length: %d' % max(m[2] for m in metadata)) 50 | 51 | 52 | def main(): 53 | parser = argparse.ArgumentParser() 54 | parser.add_argument('--base_dir', default=os.path.expanduser('~/tacotron/Tacotron2/')) 55 | parser.add_argument('--output', default='training') 56 | parser.add_argument('--dataset', required=True, choices=['blizzard', 'ljspeech', 'bible', 'kss']) 57 | parser.add_argument('--num_workers', type=int, default=cpu_count()) 58 | args = parser.parse_args() 59 | if args.dataset == 'blizzard': 60 | preprocess_blizzard(args) 61 | elif args.dataset == 'ljspeech': 62 | preprocess_ljspeech(args) 63 | elif args.dataset == 
'bible': 64 | preprocess_bible(args) 65 | elif args.dataset == 'kss': 66 | preprocess_kss(args) 67 | 68 | 69 | if __name__ == "__main__": 70 | main() 71 | -------------------------------------------------------------------------------- /TRAINING_DATA.md: -------------------------------------------------------------------------------- 1 | # Training Data 2 | 3 | 4 | This repo supports the following speech datasets: 5 | * [LJ Speech](https://keithito.com/LJ-Speech-Dataset/) (Public Domain) 6 | * [Blizzard 2012](http://www.cstr.ed.ac.uk/projects/blizzard/2012/phase_one) (Creative Commons Attribution Share-Alike) 7 | 8 | You can use any other dataset if you write a preprocessor for it. 9 | 10 | 11 | ### Writing a Preprocessor 12 | 13 | Each training example consists of: 14 | 1. The text that was spoken 15 | 2. A mel-scale spectrogram of the audio 16 | 3. A linear-scale spectrogram of the audio 17 | 18 | The preprocessor is responsible for generating these. See [ljspeech.py](datasets/ljspeech.py) for a 19 | commented example. 20 | 21 | For each training example, a preprocessor should: 22 | 23 | 1. Load the audio file: 24 | ```python 25 | wav = audio.load_wav(wav_path) 26 | ``` 27 | 28 | 2. Compute linear-scale and mel-scale spectrograms (float32 numpy arrays): 29 | ```python 30 | spectrogram = audio.spectrogram(wav).astype(np.float32) 31 | mel_spectrogram = audio.melspectrogram(wav).astype(np.float32) 32 | ``` 33 | 34 | 3. Save the spectrograms to disk: 35 | ```python 36 | np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False) 37 | np.save(os.path.join(out_dir, mel_spectrogram_filename), mel_spectrogram.T, allow_pickle=False) 38 | ``` 39 | Note that the transpose of the matrix returned by `audio.spectrogram` is saved so that it's 40 | in time-major format. 41 | 42 | 4. Generate a tuple `(spectrogram_filename, mel_spectrogram_filename, n_frames, text)` to 43 | write to train.txt. n_frames is just the length of the time axis of the spectrogram. 44 | 45 | 46 | After you've written your preprocessor, you can add it to [preprocess.py](preprocess.py) by 47 | following the example of the other preprocessors in that file. 48 | 49 | 50 | ### Non-English Data 51 | 52 | If your training data is in a language other than English, you will probably want to change the 53 | text cleaners by setting the `cleaners` hyperparameter. 54 | 55 | * If your text is in a Latin script or can be transliterated to ASCII using the 56 | [Unidecode](https://pypi.python.org/pypi/Unidecode) library, you can use the transliteration 57 | cleaners by setting the hyperparameter `cleaners=transliteration_cleaners`. 58 | 59 | * If you don't want to transliterate, you can define a custom character set. 60 | This allows you to train directly on the character set used in your data. 61 | 62 | To do so, edit [symbols.py](text/symbols.py) and change the `_characters` variable to be a 63 | string containing the UTF-8 characters in your data. Then set the hyperparameter `cleaners=basic_cleaners`. 
64 | 65 | * If you're not sure which option to use, you can evaluate the transliteration cleaners like this: 66 | 67 | ```python 68 | from text import cleaners 69 | cleaners.transliteration_cleaners('Здравствуйте') # Replace with the text you want to try 70 | ``` 71 | -------------------------------------------------------------------------------- /datasets/ljspeech.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ProcessPoolExecutor 2 | from functools import partial 3 | import numpy as np 4 | import os 5 | from util import audio 6 | 7 | 8 | def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x): 9 | '''Preprocesses the LJ Speech dataset from a given input path into a given output directory. 10 | 11 | Args: 12 | in_dir: The directory where you have downloaded the LJ Speech dataset 13 | out_dir: The directory to write the output into 14 | num_workers: Optional number of worker processes to parallelize across 15 | tqdm: You can optionally pass tqdm to get a nice progress bar 16 | 17 | Returns: 18 | A list of tuples describing the training examples. This should be written to train.txt 19 | ''' 20 | 21 | # We use ProcessPoolExecutor to parallelize across processes. This is just an optimization and you 22 | # can omit it and just call _process_utterance on each input if you want. 23 | executor = ProcessPoolExecutor(max_workers=num_workers) 24 | futures = [] 25 | index = 1 26 | with open(os.path.join(in_dir, 'metadata.csv'), encoding='utf-8') as f: 27 | for line in f: 28 | parts = line.strip().split('|') 29 | wav_path = os.path.join(in_dir, 'wavs', '%s.wav' % parts[0]) 30 | text = parts[2] 31 | futures.append(executor.submit(partial(_process_utterance, out_dir, index, wav_path, text))) 32 | index += 1 33 | return [future.result() for future in tqdm(futures)] 34 | 35 | 36 | def _process_utterance(out_dir, index, wav_path, text): 37 | '''Preprocesses a single utterance audio/text pair. 38 | 39 | This writes the mel and linear scale spectrograms to disk and returns a tuple to write 40 | to the train.txt file. 41 | 42 | Args: 43 | out_dir: The directory to write the spectrograms into 44 | index: The numeric index to use in the spectrogram filenames. 
45 | wav_path: Path to the audio file containing the speech input 46 | text: The text spoken in the input audio file 47 | 48 | Returns: 49 | A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt 50 | ''' 51 | 52 | # Load the audio to a numpy array: 53 | wav = audio.load_wav(wav_path) 54 | 55 | # Compute the linear-scale spectrogram from the wav: 56 | spectrogram = audio.spectrogram(wav).astype(np.float32) 57 | n_frames = spectrogram.shape[1] 58 | 59 | # Compute a mel-scale spectrogram from the wav: 60 | mel_spectrogram = audio.melspectrogram(wav).astype(np.float32) 61 | 62 | # Write the spectrograms to disk: 63 | spectrogram_filename = 'ljspeech-spec-%05d.npy' % index 64 | mel_filename = 'ljspeech-mel-%05d.npy' % index 65 | np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False) 66 | np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False) 67 | 68 | # Return a tuple describing this training example: 69 | return (spectrogram_filename, mel_filename, n_frames, text) 70 | -------------------------------------------------------------------------------- /datasets/blizzard.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ProcessPoolExecutor 2 | from functools import partial 3 | import numpy as np 4 | import os 5 | from hparams import hparams 6 | from util import audio 7 | 8 | _max_out_length = 700 9 | _end_buffer = 0.05 10 | _min_confidence = 90 11 | 12 | # Note: "A Tramp Abroad" & "The Man That Corrupted Hadleyburg" are higher quality than the others. 13 | books = [ 14 | 'ATrampAbroad', 15 | 'TheManThatCorruptedHadleyburg', 16 | # 'LifeOnTheMississippi', 17 | # 'TheAdventuresOfTomSawyer', 18 | ] 19 | 20 | 21 | def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x): 22 | executor = ProcessPoolExecutor(max_workers=num_workers) 23 | futures = [] 24 | index = 1 25 | for book in books: 26 | with open(os.path.join(in_dir, book, 'sentence_index.txt')) as f: 27 | for line in f: 28 | parts = line.strip().split('\t') 29 | if line[0] is not '#' and len(parts) == 8 and float(parts[3]) > _min_confidence: 30 | wav_path = os.path.join(in_dir, book, 'wav', '%s.wav' % parts[0]) 31 | labels_path = os.path.join(in_dir, book, 'lab', '%s.lab' % parts[0]) 32 | text = parts[5] 33 | task = partial(_process_utterance, out_dir, index, wav_path, labels_path, text) 34 | futures.append(executor.submit(task)) 35 | index += 1 36 | results = [future.result() for future in tqdm(futures)] 37 | return [r for r in results if r is not None] 38 | 39 | 40 | def _process_utterance(out_dir, index, wav_path, labels_path, text): 41 | # Load the wav file and trim silence from the ends: 42 | wav = audio.load_wav(wav_path) 43 | start_offset, end_offset = _parse_labels(labels_path) 44 | start = int(start_offset * hparams.sample_rate) 45 | end = int(end_offset * hparams.sample_rate) if end_offset is not None else -1 46 | wav = wav[start:end] 47 | max_samples = _max_out_length * hparams.frame_shift_ms / 1000 * hparams.sample_rate 48 | if len(wav) > max_samples: 49 | return None 50 | spectrogram = audio.spectrogram(wav).astype(np.float32) 51 | n_frames = spectrogram.shape[1] 52 | mel_spectrogram = audio.melspectrogram(wav).astype(np.float32) 53 | spectrogram_filename = 'blizzard-spec-%05d.npy' % index 54 | mel_filename = 'blizzard-mel-%05d.npy' % index 55 | np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False) 56 | 
np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False) 57 | return (spectrogram_filename, mel_filename, n_frames, text) 58 | 59 | 60 | def _parse_labels(path): 61 | labels = [] 62 | with open(os.path.join(path)) as f: 63 | for line in f: 64 | parts = line.strip().split(' ') 65 | if len(parts) >= 3: 66 | labels.append((float(parts[0]), ' '.join(parts[2:]))) 67 | start = 0 68 | end = None 69 | if labels[0][1] == 'sil': 70 | start = labels[0][0] 71 | if labels[-1][1] == 'sil': 72 | end = labels[-2][0] + _end_buffer 73 | return (start, end) 74 | -------------------------------------------------------------------------------- /datasets/bible.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ProcessPoolExecutor 2 | from functools import partial 3 | import numpy as np 4 | import os 5 | from util import audio 6 | 7 | 8 | def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x): 9 | '''Preprocesses the LJ Speech dataset from a given input path into a given output directory. 10 | 11 | Args: 12 | in_dir: The directory where you have downloaded the LJ Speech dataset 13 | out_dir: The directory to write the output into 14 | num_workers: Optional number of worker processes to parallelize across 15 | tqdm: You can optionally pass tqdm to get a nice progress bar 16 | 17 | Returns: 18 | A list of tuples describing the training examples. This should be written to train.txt 19 | ''' 20 | 21 | # We use ProcessPoolExecutor to parallelize across processes. This is just an optimization and you 22 | # can omit it and just call _process_utterance on each input if you want. 23 | executor = ProcessPoolExecutor(max_workers=num_workers) 24 | futures = [] 25 | index = 1 26 | with open(os.path.join(in_dir, 'metadata.csv'), encoding='utf-8') as f: 27 | for line in f: 28 | try: 29 | 30 | parts = line.strip().split('|') 31 | wav_path = os.path.join(in_dir, 'wavs', '%s' % parts[0]) 32 | text = parts[1] 33 | futures.append(executor.submit(partial(_process_utterance, out_dir, index, wav_path, text))) 34 | index += 1 35 | 36 | except: 37 | 38 | pass 39 | return [future.result() for future in tqdm(futures)] 40 | 41 | 42 | def _process_utterance(out_dir, index, wav_path, text): 43 | '''Preprocesses a single utterance audio/text pair. 44 | 45 | This writes the mel and linear scale spectrograms to disk and returns a tuple to write 46 | to the train.txt file. 47 | 48 | Args: 49 | out_dir: The directory to write the spectrograms into 50 | index: The numeric index to use in the spectrogram filenames. 
51 | wav_path: Path to the audio file containing the speech input 52 | text: The text spoken in the input audio file 53 | 54 | Returns: 55 | A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt 56 | ''' 57 | 58 | # Load the audio to a numpy array: 59 | wav = audio.load_wav(wav_path) 60 | 61 | # Compute the linear-scale spectrogram from the wav: 62 | spectrogram = audio.spectrogram(wav).astype(np.float32) 63 | n_frames = spectrogram.shape[1] 64 | 65 | # Compute a mel-scale spectrogram from the wav: 66 | mel_spectrogram = audio.melspectrogram(wav).astype(np.float32) 67 | 68 | # Write the spectrograms to disk: 69 | spectrogram_filename = 'bible-spec-%05d.npy' % index 70 | mel_filename = 'bible-mel-%05d.npy' % index 71 | np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False) 72 | np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False) 73 | 74 | # Return a tuple describing this training example: 75 | return (spectrogram_filename, mel_filename, n_frames, text) 76 | -------------------------------------------------------------------------------- /models/helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from tensorflow.contrib.seq2seq import Helper 4 | 5 | 6 | # Adapted from tf.contrib.seq2seq.GreedyEmbeddingHelper 7 | class TacoTestHelper(Helper): 8 | def __init__(self, batch_size, output_dim, r): 9 | with tf.name_scope('TacoTestHelper'): 10 | self._batch_size = batch_size 11 | self._output_dim = output_dim 12 | self._end_token = tf.tile([0.0], [output_dim * r]) 13 | 14 | @property 15 | def batch_size(self): 16 | return self._batch_size 17 | 18 | @property 19 | def sample_ids_shape(self): 20 | return tf.TensorShape([]) 21 | 22 | @property 23 | def sample_ids_dtype(self): 24 | return np.int32 25 | 26 | def initialize(self, name=None): 27 | return (tf.tile([False], [self._batch_size]), _go_frames(self._batch_size, self._output_dim)) 28 | 29 | def sample(self, time, outputs, state, name=None): 30 | return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them 31 | 32 | def next_inputs(self, time, outputs, state, sample_ids, name=None): 33 | '''Stop on EOS. Otherwise, pass the last output as the next input and pass through state.''' 34 | with tf.name_scope('TacoTestHelper'): 35 | finished = tf.reduce_all(tf.equal(outputs, self._end_token), axis=1) 36 | # Feed last output frame as next input. 
outputs is [N, output_dim * r] 37 | next_inputs = outputs[:, -self._output_dim:] 38 | return (finished, next_inputs, state) 39 | 40 | 41 | class TacoTrainingHelper(Helper): 42 | def __init__(self, inputs, targets, output_dim, r): 43 | # inputs is [N, T_in], targets is [N, T_out, D] 44 | with tf.name_scope('TacoTrainingHelper'): 45 | self._batch_size = tf.shape(inputs)[0] 46 | self._output_dim = output_dim 47 | 48 | # Feed every r-th target frame as input 49 | self._targets = targets[:, r - 1::r, :] 50 | 51 | # Use full length for every target because we don't want to mask the padding frames 52 | num_steps = tf.shape(self._targets)[1] 53 | self._lengths = tf.tile([num_steps], [self._batch_size]) 54 | 55 | @property 56 | def batch_size(self): 57 | return self._batch_size 58 | 59 | @property 60 | def sample_ids_shape(self): 61 | return tf.TensorShape([]) 62 | 63 | @property 64 | def sample_ids_dtype(self): 65 | return np.int32 66 | 67 | def initialize(self, name=None): 68 | return (tf.tile([False], [self._batch_size]), _go_frames(self._batch_size, self._output_dim)) 69 | 70 | def sample(self, time, outputs, state, name=None): 71 | return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them 72 | 73 | def next_inputs(self, time, outputs, state, sample_ids, name=None): 74 | with tf.name_scope(name or 'TacoTrainingHelper'): 75 | finished = (time + 1 >= self._lengths) 76 | next_inputs = self._targets[:, time, :] 77 | return (finished, next_inputs, state) 78 | 79 | 80 | def _go_frames(batch_size, output_dim): 81 | '''Returns all-zero frames for a given batch size and output dimension''' 82 | return tf.tile([[0.0]], [batch_size, output_dim]) 83 | -------------------------------------------------------------------------------- /demo_server.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import falcon 3 | from hparams import hparams, hparams_debug_string 4 | import os 5 | from synthesizer import Synthesizer 6 | 7 | html_body = '''Demo 8 | 19 | 20 |
25 | 26 | 56 | ''' 57 | 58 | 59 | class UIResource: 60 | def on_get(self, req, res): 61 | res.content_type = 'text/html' 62 | res.body = html_body 63 | 64 | 65 | class SynthesisResource: 66 | def on_get(self, req, res): 67 | if not req.params.get('text'): 68 | raise falcon.HTTPBadRequest() 69 | res.data = synthesizer.synthesize(req.params.get('text')) 70 | res.content_type = 'audio/wav' 71 | 72 | 73 | synthesizer = Synthesizer() 74 | api = falcon.API() 75 | api.add_route('/synthesize', SynthesisResource()) 76 | api.add_route('/', UIResource()) 77 | 78 | if __name__ == '__main__': 79 | from wsgiref import simple_server 80 | 81 | parser = argparse.ArgumentParser() 82 | parser.add_argument('--checkpoint', required=True, help='Full path to model checkpoint') 83 | parser.add_argument('--port', type=int, default=3000) 84 | parser.add_argument('--hparams', default='', 85 | help='Hyperparameter overrides as a comma-separated list of name=value pairs') 86 | parser.add_argument('--gpu', default='1') 87 | args = parser.parse_args() 88 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 89 | hparams.parse(args.hparams) 90 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 91 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu 92 | print(hparams_debug_string()) 93 | synthesizer.load(args.checkpoint) 94 | print('Serving on port %d' % args.port) 95 | simple_server.make_server('0.0.0.0', args.port, api).serve_forever() 96 | else: 97 | synthesizer.load(os.environ['CHECKPOINT']) 98 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Tacotron2 2 | 3 | Korean Speech Synthesis with Tacotron 4 | 5 | Note that this repo is based on https://github.com/hccho2/Tacotron2-Wavenet-Korean-TTS, https://github.com/keithito/tacotron 6 | 7 | 8 | ## Background 9 | 10 | In February 2018, Google published a paper, [NATURAL TTS SYNTHESIS BY CONDITIONINGWAVENET ON MEL SPECTROGRAM PREDICTIONS], 11 | where they present a neural text-to-speech model that learns to synthesize speech directly from 12 | (text, audio) pairs. However, they didn't release their source code or training data. This is an 13 | independent attempt to provide an open-source implementation of the model described in their paper. 14 | 15 | The quality isn't as good as Google's demo yet, but hopefully it will get there someday :-). 16 | Pull requests are welcome! 17 | 18 | 19 | 20 | ## Quick Start 21 | 22 | ### Installing dependencies 23 | 24 | 1. Install Python 3. 25 | 26 | 2. Install the latest version of [TensorFlow](https://www.tensorflow.org/install/) for your platform. For better 27 | performance, install with GPU support if it's available. This code works with TensorFlow 1.3 and later. 28 | 29 | 3. Install requirements: 30 | ``` 31 | pip install -r requirements.txt 32 | ``` 33 | 34 | 35 | ### Training 36 | 37 | *Note: you need at least 40GB of free disk space to train a model.* 38 | 39 | 1. **Download a speech dataset.** 40 | 41 | The following are supported out of the box: 42 | * [KSS Dataset](https://www.kaggle.com/bryanpark/korean-single-speaker-speech-dataset) (Public Domain) 43 | 44 | You can use other datasets if you convert them to the right format. See [TRAINING_DATA.md](TRAINING_DATA.md) for more info. 45 | 46 | 47 | 2. **Unpack the dataset into `~/tacotron`** 48 | 49 | After unpacking, your tree should look like this for LJ Speech: 50 | ``` 51 | tacotron 52 | |- kss 53 | |- metadata.csv 54 | |- wavs 55 | ``` 56 | 57 | 58 | 3. 
**Preprocess the data** 59 | ``` 60 | python3 preprocess.py --dataset kss 61 | ``` 62 | 63 | 4. **Train a model** 64 | ``` 65 | python3 train.py 66 | ``` 67 | 68 | Tunable hyperparameters are found in [hparams.py](hparams.py). You can adjust these at the command 69 | line using the `--hparams` flag, for example `--hparams="batch_size=16,outputs_per_step=2"`. 70 | Hyperparameters should generally be set to the same values at both training and eval time. 71 | The default hyperparameters are recommended for LJ Speech and other English-language data. 72 | See [TRAINING_DATA.md](TRAINING_DATA.md) for other languages. 73 | 74 | 75 | 5. **Monitor with Tensorboard** (optional) 76 | ``` 77 | tensorboard --logdir ~/tacotron/logs-tacotron 78 | ``` 79 | 80 | The trainer dumps audio and alignments every 1000 steps. You can find these in 81 | `~/tacotron/logs-tacotron`. 82 | 83 | 6. **Synthesize from a checkpoint** 84 | ``` 85 | python3 demo_server.py --checkpoint ~/tacotron/logs-tacotron/model.ckpt-185000 86 | ``` 87 | Replace "185000" with the checkpoint number that you want to use, then open a browser 88 | to `localhost:9000` and type what you want to speak. Alternately, you can 89 | run [eval.py](eval.py) at the command line: 90 | ``` 91 | python3 eval.py --checkpoint ~/tacotron/logs-tacotron/model.ckpt-185000 92 | ``` 93 | If you set the `--hparams` flag when training, set the same value here. 94 | 95 | 96 | ## Modifications 97 | 98 | * We add Stepwise Monotonic Attention, Monotonic Attention, GMM Attention, Loung Attention (20.01.20) 99 | 100 | -------------------------------------------------------------------------------- /text/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import re 3 | import string 4 | import numpy as np 5 | 6 | from text import cleaners 7 | from hparams import hparams 8 | from text.symbols import symbols, en_symbols, PAD, EOS 9 | from text.korean import jamo_to_korean 10 | 11 | # Mappings from symbol to numeric ID and vice versa: 12 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 13 | _id_to_symbol = {i: s for i, s in enumerate(symbols)} 14 | isEn = False 15 | 16 | # Regular expression matching text enclosed in curly braces: 17 | _curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)') 18 | 19 | puncuation_table = str.maketrans({key: None for key in string.punctuation}) 20 | 21 | 22 | def convert_to_en_symbols(): 23 | '''Converts built-in korean symbols to english, to be used for english training 24 | 25 | ''' 26 | global _symbol_to_id, _id_to_symbol, isEn 27 | if not isEn: 28 | print(" [!] Converting to english mode") 29 | _symbol_to_id = {s: i for i, s in enumerate(en_symbols)} 30 | _id_to_symbol = {i: s for i, s in enumerate(en_symbols)} 31 | isEn = True 32 | 33 | 34 | def remove_puncuations(text): 35 | return text.translate(puncuation_table) 36 | 37 | 38 | # def text_to_sequence(text, as_token=False): 39 | # cleaner_names = [x.strip() for x in hparams.cleaners.split(',')] 40 | # if ('english_cleaners' in cleaner_names) and isEn == False: 41 | # convert_to_en_symbols() 42 | # else: 43 | # 44 | # return _text_to_sequence(text, cleaner_names, as_token) 45 | 46 | 47 | def text_to_sequence(text, cleaner_names): 48 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 49 | The text can optionally have ARPAbet sequences enclosed in curly braces embedded 50 | in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street." 
51 | Args: 52 | text: string to convert to a sequence 53 | cleaner_names: names of the cleaner functions to run the text through 54 | Returns: 55 | List of integers corresponding to the symbols in the text 56 | ''' 57 | sequence = [] 58 | 59 | # Check for curly braces and treat their contents as ARPAbet: 60 | while len(text): 61 | m = _curly_re.match(text) 62 | if not m: 63 | sequence += _symbols_to_sequence(_clean_text(text, cleaner_names)) 64 | break 65 | sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names)) 66 | sequence += _arpabet_to_sequence(m.group(2)) 67 | text = m.group(3) 68 | 69 | # Append EOS token 70 | sequence.append(_symbol_to_id[EOS]) # [14, 29, 45, 2, 27, 62, 20, 21, 4, 39, 45, 1] 71 | 72 | # if as_token: 73 | # return sequence_to_text(sequence, combine_jamo=True) 74 | # else: 75 | return np.array(sequence, dtype=np.int32) 76 | 77 | 78 | def sequence_to_text(sequence, skip_eos_and_pad=False, combine_jamo=False): 79 | '''Converts a sequence of IDs back to a string''' 80 | cleaner_names = [x.strip() for x in hparams.cleaners.split(',')] 81 | if 'english_cleaners' in cleaner_names and isEn == False: 82 | convert_to_en_symbols() 83 | 84 | result = '' 85 | for symbol_id in sequence: 86 | if symbol_id in _id_to_symbol: 87 | s = _id_to_symbol[symbol_id] 88 | # Enclose ARPAbet back in curly braces: 89 | if len(s) > 1 and s[0] == '@': 90 | s = '{%s}' % s[1:] 91 | 92 | if not skip_eos_and_pad or s not in [EOS, PAD]: 93 | result += s 94 | 95 | result = result.replace('}{', ' ') 96 | 97 | if combine_jamo: 98 | return jamo_to_korean(result) 99 | else: 100 | return result 101 | 102 | 103 | def _clean_text(text, cleaner_names): 104 | for name in cleaner_names: 105 | cleaner = getattr(cleaners, name) 106 | if not cleaner: 107 | raise Exception('Unknown cleaner: %s' % name) 108 | text = cleaner(text) # '존경하는' --> ['ᄌ', 'ᅩ', 'ᆫ', 'ᄀ', 'ᅧ', 'ᆼ', 'ᄒ', 'ᅡ', 'ᄂ', 'ᅳ', 'ᆫ', '~'] 109 | return text 110 | 111 | 112 | def _symbols_to_sequence(symbols): 113 | return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)] 114 | 115 | 116 | def _arpabet_to_sequence(text): 117 | return _symbols_to_sequence(['@' + s for s in text.split()]) 118 | 119 | 120 | def _should_keep_symbol(s): 121 | return s in _symbol_to_id and s is not '_' and s is not '~' 122 | -------------------------------------------------------------------------------- /text/kor_dic.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | etc_dictionary = { 4 | '2 30대': '이삼십대', 5 | '20~30대': '이삼십대', 6 | '20, 30대': '이십대 삼십대', 7 | '1+1': '원플러스원', 8 | '3에서 6개월인': '3개월에서 육개월인', 9 | } 10 | 11 | english_dictionary = { 12 | 'Devsisters': '데브시스터즈', 13 | 'track': '트랙', 14 | 15 | # krbook 16 | 'LA': '엘에이', 17 | 'LG': '엘지', 18 | 'KOREA': '코리아', 19 | 'JSA': '제이에스에이', 20 | 'PGA': '피지에이', 21 | 'GA': '지에이', 22 | 'idol': '아이돌', 23 | 'KTX': '케이티엑스', 24 | 'AC': '에이씨', 25 | 'DVD': '디비디', 26 | 'US': '유에스', 27 | 'CNN': '씨엔엔', 28 | 'LPGA': '엘피지에이', 29 | 'P': '피', 30 | 'L': '엘', 31 | 'T': '티', 32 | 'B': '비', 33 | 'C': '씨', 34 | 'BIFF': '비아이에프에프', 35 | 'GV': '지비', 36 | 37 | # JTBC 38 | 'IT': '아이티', 39 | 'IQ': '아이큐', 40 | 'JTBC': '제이티비씨', 41 | 'trickle down effect': '트리클 다운 이펙트', 42 | 'trickle up effect': '트리클 업 이펙트', 43 | 'down': '다운', 44 | 'up': '업', 45 | 'FCK': '에프씨케이', 46 | 'AP': '에이피', 47 | 'WHERETHEWILDTHINGSARE': '', 48 | 'Rashomon Effect': '', 49 | 'O': '오', 50 | 'OO': '오오', 51 | 'B': '비', 52 | 'GDP': '지디피', 53 | 'CIPA': '씨아이피에이', 54 | 'YS': '와이에스', 55 | 'Y': '와이', 56 | 'S': 
'에스', 57 | 'JTBC': '제이티비씨', 58 | 'PC': '피씨', 59 | 'bill': '빌', 60 | 'Halmuny': '하모니', ##### 61 | 'X': '엑스', 62 | 'SNS': '에스엔에스', 63 | 'ability': '어빌리티', 64 | 'shy': '', 65 | 'CCTV': '씨씨티비', 66 | 'IT': '아이티', 67 | 'the tenth man': '더 텐쓰 맨', #### 68 | 'L': '엘', 69 | 'PC': '피씨', 70 | 'YSDJJPMB': '', ######## 71 | 'Content Attitude Timing': '컨텐트 애티튜드 타이밍', 72 | 'CAT': '캣', 73 | 'IS': '아이에스', 74 | 'SNS': '에스엔에스', 75 | 'K': '케이', 76 | 'Y': '와이', 77 | 'KDI': '케이디아이', 78 | 'DOC': '디오씨', 79 | 'CIA': '씨아이에이', 80 | 'PBS': '피비에스', 81 | 'D': '디', 82 | 'PPropertyPositionPowerPrisonP' 83 | 'S': '에스', 84 | 'francisco': '프란시스코', 85 | 'I': '아이', 86 | 'III': '아이아이', ###### 87 | 'No joke': '노 조크', 88 | 'BBK': '비비케이', 89 | 'LA': '엘에이', 90 | 'Don': '', 91 | 't worry be happy': ' 워리 비 해피', 92 | 'NO': '엔오', ##### 93 | 'it was our sky': '잇 워즈 아워 스카이', 94 | 'it is our sky': '잇 이즈 아워 스카이', #### 95 | 'NEIS': '엔이아이에스', ##### 96 | 'IMF': '아이엠에프', 97 | 'apology': '어폴로지', 98 | 'humble': '험블', 99 | 'M': '엠', 100 | 'Nowhere Man': '노웨어 맨', 101 | 'The Tenth Man': '더 텐쓰 맨', 102 | 'PBS': '피비에스', 103 | 'BBC': '비비씨', 104 | 'MRJ': '엠알제이', 105 | 'CCTV': '씨씨티비', 106 | 'Pick me up': '픽 미 업', 107 | 'DNA': '디엔에이', 108 | 'UN': '유엔', 109 | 'STOP': '스탑', ##### 110 | 'PRESS': '프레스', ##### 111 | 'not to be': '낫 투비', 112 | 'Denial': '디나이얼', 113 | 'G': '지', 114 | 'IMF': '아이엠에프', 115 | 'GDP': '지디피', 116 | 'JTBC': '제이티비씨', 117 | 'Time flies like an arrow': '타임 플라이즈 라이크 언 애로우', 118 | 'DDT': '디디티', 119 | 'AI': '에이아이', 120 | 'Z': '제트', 121 | 'OECD': '오이씨디', 122 | 'N': '앤', 123 | 'A': '에이', 124 | 'MB': '엠비', 125 | 'EH': '이에이치', 126 | 'IS': '아이에스', 127 | 'TV': '티비', 128 | 'MIT': '엠아이티', 129 | 'KBO': '케이비오', 130 | 'I love America': '아이 러브 아메리카', 131 | 'SF': '에스에프', 132 | 'Q': '큐', 133 | 'KFX': '케이에프엑스', 134 | 'PM': '피엠', 135 | 'Prime Minister': '프라임 미니스터', 136 | 'Swordline': '스워드라인', 137 | 'TBS': '티비에스', 138 | 'DDT': '디디티', 139 | 'CS': '씨에스', 140 | 'Reflecting Absence': '리플렉팅 앱센스', 141 | 'PBS': '피비에스', 142 | 'Drum being beaten by everyone': '드럼 빙 비튼 바이 에브리원', 143 | 'negative pressure': '네거티브 프레셔', 144 | 'F': '에프', 145 | 'KIA': '기아', 146 | 'FTA': '에프티에이', 147 | 'Que sais-je': '', 148 | 'UFC': '유에프씨', 149 | 'P': '피', 150 | 'DJ': '디제이', 151 | 'Chaebol': '채벌', 152 | 'BBC': '비비씨', 153 | 'OECD': '오이씨디', 154 | 'BC': '삐씨', 155 | 'C': '씨', 156 | 'B': '씨', 157 | 'KY': '케이와이', 158 | 'K': '케이', 159 | 'CEO': '씨이오', 160 | 'YH': '와이에치', 161 | 'IS': '아이에스', 162 | 'who are you': '후 얼 유', 163 | 'Y': '와이', 164 | 'The Devils Advocate': '더 데빌즈 어드보카트', 165 | 'YS': '와이에스', 166 | 'so sorry': '쏘 쏘리', 167 | 'Santa': '산타', 168 | 'Big Endian': '빅 엔디안', 169 | 'Small Endian': '스몰 엔디안', 170 | 'Oh Captain My Captain': '오 캡틴 마이 캡틴', 171 | 'AIB': '에이아이비', 172 | 'K': '케이', 173 | 'PBS': '피비에스', 174 | } -------------------------------------------------------------------------------- /LJSpeech-1.1/README: -------------------------------------------------------------------------------- 1 | ----------------------------------------------------------------------------- 2 | The LJ Speech Dataset 3 | 4 | Version 1.0 5 | July 5, 2017 6 | https://keithito.com/LJ-Speech-Dataset 7 | ----------------------------------------------------------------------------- 8 | 9 | 10 | OVERVIEW 11 | 12 | This is a public domain speech dataset consisting of 13,100 short audio clips 13 | of a single speaker reading passages from 7 non-fiction books. A transcription 14 | is provided for each clip. Clips vary in length from 1 to 10 seconds and have 15 | a total length of approximately 24 hours. 
16 | 17 | The texts were published between 1884 and 1964, and are in the public domain. 18 | The audio was recorded in 2016-17 by the LibriVox project and is also in the 19 | public domain. 20 | 21 | 22 | 23 | FILE FORMAT 24 | 25 | Metadata is provided in metadata.csv. This file consists of one record per 26 | line, delimited by the pipe character (0x7c). The fields are: 27 | 28 | 1. ID: this is the name of the corresponding .wav file 29 | 2. Transcription: words spoken by the reader (UTF-8) 30 | 3. Normalized Transcription: transcription with numbers, ordinals, and 31 | monetary units expanded into full words (UTF-8). 32 | 33 | Each audio file is a single-channel 16-bit PCM WAV with a sample rate of 34 | 22050 Hz. 35 | 36 | 37 | 38 | STATISTICS 39 | 40 | Total Clips 13,100 41 | Total Words 225,715 42 | Total Characters 1,308,674 43 | Total Duration 23:55:17 44 | Mean Clip Duration 6.57 sec 45 | Min Clip Duration 1.11 sec 46 | Max Clip Duration 10.10 sec 47 | Mean Words per Clip 17.23 48 | Distinct Words 13,821 49 | 50 | 51 | 52 | MISCELLANEOUS 53 | 54 | The audio clips range in length from approximately 1 second to 10 seconds. 55 | They were segmented automatically based on silences in the recording. Clip 56 | boundaries generally align with sentence or clause boundaries, but not always. 57 | 58 | The text was matched to the audio manually, and a QA pass was done to ensure 59 | that the text accurately matched the words spoken in the audio. 60 | 61 | The original LibriVox recordings were distributed as 128 kbps MP3 files. As a 62 | result, they may contain artifacts introduced by the MP3 encoding. 63 | 64 | The following abbreviations appear in the text. They may be expanded as 65 | follows: 66 | 67 | Abbreviation Expansion 68 | -------------------------- 69 | Mr. Mister 70 | Mrs. Misess (*) 71 | Dr. Doctor 72 | No. Number 73 | St. Saint 74 | Co. Company 75 | Jr. Junior 76 | Maj. Major 77 | Gen. General 78 | Drs. Doctors 79 | Rev. Reverend 80 | Lt. Lieutenant 81 | Hon. Honorable 82 | Sgt. Sergeant 83 | Capt. Captain 84 | Esq. Esquire 85 | Ltd. Limited 86 | Col. Colonel 87 | Ft. Fort 88 | 89 | * there's no standard expansion of "Mrs." 90 | 91 | 92 | 19 of the transcriptions contain non-ASCII characters (for example, LJ016-0257 93 | contains "raison d'être"). 94 | 95 | For more information or to report errors, please email kito@kito.us. 96 | 97 | 98 | 99 | LICENSE 100 | 101 | This dataset is in the public domain in the USA (and likely other countries as 102 | well). There are no restrictions on its use. For more information, please see: 103 | https://librivox.org/pages/public-domain. 104 | 105 | 106 | CHANGELOG 107 | 108 | * 1.0 (July 8, 2017): 109 | Initial release 110 | 111 | * 1.1 (Feb 19, 2018): 112 | Version 1.0 included 30 .wav files with no corresponding annotations in 113 | metadata.csv. These have been removed in version 1.1. Thanks to Rafael Valle 114 | for spotting this. 115 | 116 | 117 | CREDITS 118 | 119 | This dataset consists of excerpts from the following works: 120 | 121 | * Morris, William, et al. Arts and Crafts Essays. 1893. 122 | * Griffiths, Arthur. The Chronicles of Newgate, Vol. 2. 1884. 123 | * Roosevelt, Franklin D. The Fireside Chats of Franklin Delano Roosevelt. 124 | 1933-42. 125 | * Harland, Marion. Marion Harland's Cookery for Beginners. 1893. 126 | * Rolt-Wheeler, Francis. The Science - History of the Universe, Vol. 5: 127 | Biology. 1910. 128 | * Banks, Edgar J. The Seven Wonders of the Ancient World. 1916. 
129 | * President's Commission on the Assassination of President Kennedy. Report 130 | of the President's Commission on the Assassination of President Kennedy. 131 | 1964. 132 | 133 | Recordings by Linda Johnson. Alignment and annotation by Keith Ito. All text, 134 | audio, and annotations are in the public domain. 135 | 136 | There's no requirement to cite this work, but if you'd like to do so, you can 137 | link to: https://keithito.com/LJ-Speech-Dataset 138 | 139 | or use the following: 140 | @misc{ljspeech17, 141 | author = {Keith Ito}, 142 | title = {The LJ Speech Dataset}, 143 | howpublished = {\url{https://keithito.com/LJ-Speech-Dataset/}}, 144 | year = 2017 145 | } 146 | -------------------------------------------------------------------------------- /util/audio.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import librosa.filters 3 | import math 4 | import numpy as np 5 | import tensorflow as tf 6 | import scipy 7 | from hparams import hparams 8 | 9 | 10 | def load_wav(path): 11 | return librosa.core.load(path, sr=hparams.sample_rate)[0] 12 | 13 | 14 | def save_wav(wav, path): 15 | wav *= 32767 / max(0.01, np.max(np.abs(wav))) 16 | scipy.io.wavfile.write(path, hparams.sample_rate, wav.astype(np.int16)) 17 | 18 | 19 | def preemphasis(x): 20 | return scipy.signal.lfilter([1, -hparams.preemphasis], [1], x) 21 | 22 | 23 | def inv_preemphasis(x): 24 | return scipy.signal.lfilter([1], [1, -hparams.preemphasis], x) 25 | 26 | 27 | def spectrogram(y): 28 | D = _stft(preemphasis(y)) 29 | S = _amp_to_db(np.abs(D)) - hparams.ref_level_db 30 | return _normalize(S) 31 | 32 | 33 | def inv_spectrogram(spectrogram): 34 | '''Converts spectrogram to waveform using librosa''' 35 | S = _db_to_amp(_denormalize(spectrogram) + hparams.ref_level_db) # Convert back to linear 36 | return inv_preemphasis(_griffin_lim(S ** hparams.power)) # Reconstruct phase 37 | 38 | 39 | def inv_spectrogram_tensorflow(spectrogram): 40 | '''Builds computational graph to convert spectrogram to waveform using TensorFlow. 41 | 42 | Unlike inv_spectrogram, this does NOT invert the preemphasis. The caller should call 43 | inv_preemphasis on the output after running the graph. 
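    A usage sketch (illustrative only; `sess` and the linear-spectrogram tensor `spec` are
    assumptions, not names defined in this module):
        wav = inv_preemphasis(sess.run(inv_spectrogram_tensorflow(spec)))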
44 | ''' 45 | S = _db_to_amp_tensorflow(_denormalize_tensorflow(spectrogram) + hparams.ref_level_db) 46 | return _griffin_lim_tensorflow(tf.pow(S, hparams.power)) 47 | 48 | 49 | def melspectrogram(y): 50 | D = _stft(preemphasis(y)) 51 | S = _amp_to_db(_linear_to_mel(np.abs(D))) - hparams.ref_level_db 52 | return _normalize(S) 53 | 54 | 55 | def find_endpoint(wav, threshold_db=-40, min_silence_sec=0.8): 56 | window_length = int(hparams.sample_rate * min_silence_sec) 57 | hop_length = int(window_length / 4) 58 | threshold = _db_to_amp(threshold_db) 59 | for x in range(hop_length, len(wav) - window_length, hop_length): 60 | if np.max(wav[x:x+window_length]) < threshold: 61 | return x + hop_length 62 | return len(wav) 63 | 64 | 65 | def _griffin_lim(S): 66 | '''librosa implementation of Griffin-Lim 67 | Based on https://github.com/librosa/librosa/issues/434 68 | ''' 69 | angles = np.exp(2j * np.pi * np.random.rand(*S.shape)) 70 | S_complex = np.abs(S).astype(np.complex) 71 | y = _istft(S_complex * angles) 72 | for i in range(hparams.griffin_lim_iters): 73 | angles = np.exp(1j * np.angle(_stft(y))) 74 | y = _istft(S_complex * angles) 75 | return y 76 | 77 | 78 | def _griffin_lim_tensorflow(S): 79 | '''TensorFlow implementation of Griffin-Lim 80 | Based on https://github.com/Kyubyong/tensorflow-exercises/blob/master/Audio_Processing.ipynb 81 | ''' 82 | with tf.variable_scope('griffinlim'): 83 | # TensorFlow's stft and istft operate on a batch of spectrograms; create batch of size 1 84 | S = tf.expand_dims(S, 0) 85 | S_complex = tf.identity(tf.cast(S, dtype=tf.complex64)) 86 | y = _istft_tensorflow(S_complex) 87 | for i in range(hparams.griffin_lim_iters): 88 | est = _stft_tensorflow(y) 89 | angles = est / tf.cast(tf.maximum(1e-8, tf.abs(est)), tf.complex64) 90 | y = _istft_tensorflow(S_complex * angles) 91 | return tf.squeeze(y, 0) 92 | 93 | 94 | def _stft(y): 95 | n_fft, hop_length, win_length = _stft_parameters() 96 | return librosa.stft(y=y, n_fft=n_fft, hop_length=hop_length, win_length=win_length) 97 | 98 | 99 | def _istft(y): 100 | _, hop_length, win_length = _stft_parameters() 101 | return librosa.istft(y, hop_length=hop_length, win_length=win_length) 102 | 103 | 104 | def _stft_tensorflow(signals): 105 | n_fft, hop_length, win_length = _stft_parameters() 106 | return tf.contrib.signal.stft(signals, win_length, hop_length, n_fft, pad_end=False) 107 | 108 | 109 | def _istft_tensorflow(stfts): 110 | n_fft, hop_length, win_length = _stft_parameters() 111 | return tf.contrib.signal.inverse_stft(stfts, win_length, hop_length, n_fft) 112 | 113 | 114 | def _stft_parameters(): 115 | n_fft = (hparams.num_freq - 1) * 2 116 | hop_length = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate) 117 | win_length = int(hparams.frame_length_ms / 1000 * hparams.sample_rate) 118 | return n_fft, hop_length, win_length 119 | 120 | 121 | # Conversions: 122 | 123 | _mel_basis = None 124 | 125 | def _linear_to_mel(spectrogram): 126 | global _mel_basis 127 | if _mel_basis is None: 128 | _mel_basis = _build_mel_basis() 129 | return np.dot(_mel_basis, spectrogram) 130 | 131 | def _build_mel_basis(): 132 | n_fft = (hparams.num_freq - 1) * 2 133 | return librosa.filters.mel(hparams.sample_rate, n_fft, n_mels=hparams.num_mels) 134 | 135 | def _amp_to_db(x): 136 | return 20 * np.log10(np.maximum(1e-5, x)) 137 | 138 | def _db_to_amp(x): 139 | return np.power(10.0, x * 0.05) 140 | 141 | def _db_to_amp_tensorflow(x): 142 | return tf.pow(tf.ones(tf.shape(x)) * 10.0, x * 0.05) 143 | 144 | def _normalize(S): 145 | return 
np.clip((S - hparams.min_level_db) / -hparams.min_level_db, 0, 1) 146 | 147 | def _denormalize(S): 148 | return (np.clip(S, 0, 1) * -hparams.min_level_db) + hparams.min_level_db 149 | 150 | def _denormalize_tensorflow(S): 151 | return (tf.clip_by_value(S, 0, 1) * -hparams.min_level_db) + hparams.min_level_db 152 | -------------------------------------------------------------------------------- /models/tacotron.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.contrib.rnn import GRUCell, MultiRNNCell, OutputProjectionWrapper, ResidualWrapper, LSTMCell 3 | from tensorflow.contrib.seq2seq import BasicDecoder, BahdanauAttention, AttentionWrapper 4 | from text.symbols import symbols 5 | from util.infolog import log 6 | from .helpers import TacoTestHelper, TacoTrainingHelper 7 | from .modules import encoder_cbhg, post_cbhg, prenet 8 | from .rnn_wrappers import DecoderPrenetWrapper, ConcatOutputAndAttentionWrapper 9 | 10 | 11 | class Tacotron(): 12 | def __init__(self, hparams): 13 | self._hparams = hparams 14 | 15 | def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None): 16 | 17 | with tf.variable_scope('embedding') as scope: 18 | is_training = linear_targets is not None 19 | batch_size = tf.shape(inputs)[0] 20 | hp = self._hparams 21 | 22 | # Embeddings 23 | embedding_table = tf.get_variable( 24 | 'embedding', [len(symbols), hp.embed_depth], dtype=tf.float32, 25 | initializer=tf.truncated_normal_initializer(stddev=0.5)) 26 | embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs) # [N, T_in, embed_depth=512] 27 | 28 | with tf.variable_scope('encoder') as scope: 29 | x = embedded_inputs 30 | for i in range(hp.encoder_stack_size): 31 | x = tf.layers.conv1d(x, 32 | filters=hp.encoder_conv_filter, 33 | kernel_size=hp.encoder_conv_kernel, 34 | padding='same', 35 | activation=tf.nn.relu) 36 | x = tf.layers.batch_normalization(x, training=is_training) 37 | 38 | lstm_fw = LSTMCell(hp.encoder_lstm_hidden_dim) 39 | lstm_bw = LSTMCell(hp.encoder_lstm_hidden_dim) 40 | 41 | encoder_conv_output = x 42 | outputs, states = tf.nn.bidirectional_dynamic_rnn(lstm_fw, 43 | lstm_bw, 44 | encoder_conv_output, 45 | sequence_length=input_lengths, 46 | dtype=tf.float32) # [N, T_in, 512] 47 | encoder_output = tf.concat(outputs, axis=2) 48 | 49 | # with tf.variable_scope('decoder') as scope: 50 | 51 | 52 | self.inputs = inputs 53 | self.input_lengths = input_lengths 54 | # self.mel_outputs = mel_outputs 55 | # self.linear_outputs = linear_outputs 56 | # self.alignments = alignments 57 | self.mel_targets = mel_targets 58 | self.linear_targets = linear_targets 59 | log('Initialized Tacotron model. Dimensions: ') 60 | log(' embedding: %d' % embedded_inputs.shape[-1]) 61 | log(' encoder out: %d' % encoder_output.shape[-1]) 62 | # log(' attention out: %d' % attention_cell.output_size) 63 | # log(' concat attn & out: %d' % concat_cell.output_size) 64 | # log(' decoder cell out: %d' % decoder_cell.output_size) 65 | # log(' decoder out (%d frames): %d' % (hp.outputs_per_step, decoder_outputs.shape[-1])) 66 | # log(' decoder out (1 frame): %d' % mel_outputs.shape[-1]) 67 | # log(' postnet out: %d' % post_outputs.shape[-1]) 68 | # log(' linear out: %d' % linear_outputs.shape[-1]) 69 | 70 | # def add_loss(self): 71 | # '''Adds loss to the model. Sets "loss" field. 
initialize must have been called.''' 72 | # with tf.variable_scope('loss') as scope: 73 | # hp = self._hparams 74 | # self.mel_loss = tf.reduce_mean(tf.abs(self.mel_targets - self.mel_outputs)) 75 | # l1 = tf.abs(self.linear_targets - self.linear_outputs) 76 | # # Prioritize loss for frequencies under 3000 Hz. 77 | # n_priority_freq = int(3000 / (hp.sample_rate * 0.5) * hp.num_freq) 78 | # self.linear_loss = 0.5 * tf.reduce_mean(l1) + 0.5 * tf.reduce_mean(l1[:, :, 0:n_priority_freq]) 79 | # self.loss = self.mel_loss + self.linear_loss 80 | # 81 | # def add_optimizer(self, global_step): 82 | # '''Adds optimizer. Sets "gradients" and "optimize" fields. add_loss must have been called. 83 | # 84 | # Args: 85 | # global_step: int32 scalar Tensor representing current global step in training 86 | # ''' 87 | # with tf.variable_scope('optimizer') as scope: 88 | # hp = self._hparams 89 | # if hp.decay_learning_rate: 90 | # self.learning_rate = _learning_rate_decay(hp.initial_learning_rate, global_step) 91 | # else: 92 | # self.learning_rate = tf.convert_to_tensor(hp.initial_learning_rate) 93 | # optimizer = tf.train.AdamOptimizer(self.learning_rate, hp.adam_beta1, hp.adam_beta2) 94 | # gradients, variables = zip(*optimizer.compute_gradients(self.loss)) 95 | # self.gradients = gradients 96 | # clipped_gradients, _ = tf.clip_by_global_norm(gradients, 1.0) 97 | # 98 | # # Add dependency on UPDATE_OPS; otherwise batchnorm won't work correctly. See: 99 | # # https://github.com/tensorflow/tensorflow/issues/1122 100 | # with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)): 101 | # self.optimize = optimizer.apply_gradients(zip(clipped_gradients, variables), 102 | # global_step=global_step) 103 | 104 | 105 | def _learning_rate_decay(init_lr, global_step): 106 | # Noam scheme from tensor2tensor: 107 | warmup_steps = 4000.0 108 | step = tf.cast(global_step + 1, dtype=tf.float32) 109 | return init_lr * warmup_steps ** 0.5 * tf.minimum(step * warmup_steps ** -1.5, step ** -0.5) 110 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from datetime import datetime 3 | import math 4 | import os 5 | import subprocess 6 | import time 7 | import tensorflow as tf 8 | import traceback 9 | import sys 10 | from datasets.datafeeder import DataFeeder 11 | from hparams import hparams, hparams_debug_string 12 | from models import create_model 13 | from text import sequence_to_text 14 | from util import audio, infolog, plot, ValueWindow 15 | 16 | log = infolog.log 17 | 18 | 19 | def get_git_commit(): 20 | subprocess.check_output(['git', 'diff-index', '--quiet', 'HEAD']) # Verify client is clean 21 | commit = subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode().strip()[:10] 22 | log('Git commit: %s' % commit) 23 | return commit 24 | 25 | 26 | def add_stats(model): 27 | with tf.variable_scope('stats') as scope: 28 | tf.summary.histogram('linear_outputs', model.linear_outputs) 29 | tf.summary.histogram('linear_targets', model.linear_targets) 30 | tf.summary.histogram('mel_outputs', model.mel_outputs) 31 | tf.summary.histogram('mel_targets', model.mel_targets) 32 | tf.summary.scalar('loss_mel', model.mel_loss) 33 | tf.summary.scalar('loss_linear', model.linear_loss) 34 | tf.summary.scalar('learning_rate', model.learning_rate) 35 | tf.summary.scalar('loss', model.loss) 36 | gradient_norms = [tf.norm(grad) for grad in model.gradients] 37 | 
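# Note (added): add_stats assumes model.add_loss() and model.add_optimizer() have already
# populated the loss, learning_rate and gradients fields (train() below calls them first);
# the gradient-norm summaries written below make exploding gradients easy to spot in TensorBoard.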
tf.summary.histogram('gradient_norm', gradient_norms) 38 | tf.summary.scalar('max_gradient_norm', tf.reduce_max(gradient_norms)) 39 | return tf.summary.merge_all() 40 | 41 | 42 | def time_string(): 43 | return datetime.now().strftime('%Y-%m-%d %H:%M') 44 | 45 | 46 | def train(log_dir, args): 47 | commit = get_git_commit() if args.git else 'None' 48 | checkpoint_path = os.path.join(log_dir, 'model.ckpt') 49 | input_path = os.path.join(args.base_dir, args.input) 50 | log('Checkpoint path: %s' % checkpoint_path) 51 | log('Loading training data from: %s' % input_path) 52 | log('Using model: %s' % args.model) 53 | log(hparams_debug_string()) 54 | 55 | # Set up DataFeeder: 56 | coord = tf.train.Coordinator() 57 | with tf.variable_scope('datafeeder') as scope: 58 | feeder = DataFeeder(coord, input_path, hparams) 59 | 60 | # Set up model: 61 | global_step = tf.Variable(0, name='global_step', trainable=False) 62 | with tf.variable_scope('model') as scope: 63 | model = create_model(args.model, hparams) 64 | model.initialize(feeder.inputs, feeder.input_lengths, feeder.mel_targets, feeder.linear_targets, feeder.stop_token_targets) 65 | model.add_loss() 66 | model.add_optimizer(global_step) 67 | stats = add_stats(model) 68 | 69 | # Bookkeeping: 70 | step = 0 71 | time_window = ValueWindow(100) 72 | loss_window = ValueWindow(100) 73 | saver = tf.train.Saver(max_to_keep=5, keep_checkpoint_every_n_hours=2) 74 | 75 | # Train! 76 | with tf.Session() as sess: 77 | try: 78 | summary_writer = tf.summary.FileWriter(log_dir, sess.graph) 79 | sess.run(tf.global_variables_initializer()) 80 | 81 | if args.restore_step: 82 | # Restore from a checkpoint if the user requested it. 83 | restore_path = '%s-%d' % (checkpoint_path, args.restore_step) 84 | saver.restore(sess, restore_path) 85 | log('Resuming from checkpoint: %s at commit: %s' % (restore_path, commit), slack=True) 86 | else: 87 | log('Starting new training run at commit: %s' % commit, slack=True) 88 | 89 | feeder.start_in_session(sess) 90 | 91 | while not coord.should_stop(): 92 | start_time = time.time() 93 | step, loss, opt = sess.run([global_step, model.loss, model.optimize]) 94 | time_window.append(time.time() - start_time) 95 | loss_window.append(loss) 96 | message = 'Step %-7d [%.03f sec/step, loss=%.05f, avg_loss=%.05f]' % ( 97 | step, time_window.average, loss, loss_window.average) 98 | log(message, slack=(step % args.checkpoint_interval == 0)) 99 | 100 | if loss > 100 or math.isnan(loss): 101 | log('Loss exploded to %.05f at step %d!' 
% (loss, step), slack=True) 102 | raise Exception('Loss Exploded') 103 | 104 | if step % args.summary_interval == 0: 105 | log('Writing summary at step: %d' % step) 106 | summary_writer.add_summary(sess.run(stats), step) 107 | 108 | if step % args.checkpoint_interval == 0: 109 | log('Saving checkpoint to: %s-%d' % (checkpoint_path, step)) 110 | saver.save(sess, checkpoint_path, global_step=step) 111 | log('Saving audio and alignment...') 112 | input_seq, spectrogram, alignment = sess.run([ 113 | model.inputs[0], model.linear_outputs[0], model.alignments[0]]) 114 | waveform = audio.inv_spectrogram(spectrogram.T) 115 | audio.save_wav(waveform, os.path.join(log_dir, 'step-%d-audio.wav' % step)) 116 | plot.plot_alignment(alignment, os.path.join(log_dir, 'step-%d-align.png' % step), 117 | info='%s, %s, %s, step=%d, loss=%.5f' % ( 118 | args.model, commit, time_string(), step, loss)) 119 | log('Input: %s' % sequence_to_text(input_seq)) 120 | 121 | except Exception as e: 122 | log('Exiting due to exception: %s' % e, slack=True) 123 | traceback.print_exc() 124 | coord.request_stop(e) 125 | 126 | 127 | def main(): 128 | parser = argparse.ArgumentParser() 129 | parser.add_argument('--base_dir', default=os.path.expanduser('~/tacotron/Tacotron2/')) 130 | parser.add_argument('--input', default='training/train.txt') 131 | parser.add_argument('--model', default='tacotron') 132 | parser.add_argument('--name', help='Name of the run. Used for logging. Defaults to model name.') 133 | parser.add_argument('--hparams', default='', 134 | help='Hyperparameter overrides as a comma-separated list of name=value pairs') 135 | parser.add_argument('--restore_step', type=int, help='Global step to restore from checkpoint.') 136 | parser.add_argument('--summary_interval', type=int, default=100, 137 | help='Steps between running summary ops.') 138 | parser.add_argument('--checkpoint_interval', type=int, default=1000, 139 | help='Steps between writing checkpoints.') 140 | parser.add_argument('--slack_url', help='Slack webhook URL to get periodic reports.') 141 | parser.add_argument('--tf_log_level', type=int, default=1, help='Tensorflow C++ log level.') 142 | parser.add_argument('--git', action='store_true', help='If set, verify that the client is clean.') 143 | parser.add_argument('--gpu', default='1') 144 | args = parser.parse_args() 145 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = str(args.tf_log_level) 146 | 147 | 148 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 149 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu 150 | run_name = args.name or args.model 151 | hparams.parse(args.hparams) 152 | attention_name = hparams.attention_type 153 | print(attention_name) 154 | log_dir = os.path.join(args.base_dir, 'logs-%s-%s' % (run_name, attention_name)) 155 | os.makedirs(log_dir, exist_ok=True) 156 | infolog.init(os.path.join(log_dir, 'train.log'), run_name, args.slack_url) 157 | train(log_dir, args) 158 | 159 | 160 | if __name__ == '__main__': 161 | main() 162 | -------------------------------------------------------------------------------- /datasets/datafeeder.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import random 4 | import tensorflow as tf 5 | import threading 6 | import time 7 | import traceback 8 | from text import text_to_sequence 9 | from util.infolog import log 10 | 11 | _batches_per_group = 32 12 | # _p_cmudict = 0.5 13 | _pad = 0 14 | _stop_token_pad = 1 15 | 16 | 17 | class DataFeeder(threading.Thread): 18 | '''Feeds batches of data 
into a queue on a background thread.''' 19 | 20 | def __init__(self, coordinator, metadata_filename, hparams): 21 | super(DataFeeder, self).__init__() 22 | self._coord = coordinator 23 | self._hparams = hparams 24 | self._cleaner_names = [x.strip() for x in hparams.cleaners.split(',')] 25 | self._offset = 0 26 | 27 | # Load metadata: 28 | self._datadir = os.path.dirname(metadata_filename) 29 | with open(metadata_filename, encoding='utf-8') as f: 30 | self._metadata = [line.strip().split('|') for line in f] 31 | hours = sum((int(x[2]) for x in self._metadata)) * hparams.frame_shift_ms / (3600 * 1000) 32 | log('Loaded metadata for %d examples (%.2f hours)' % (len(self._metadata), hours)) 33 | 34 | # Create placeholders for inputs and targets. Don't specify batch size because we want to 35 | # be able to feed different sized batches at eval time. 36 | self._placeholders = [ 37 | tf.placeholder(tf.int32, [None, None], 'inputs'), 38 | tf.placeholder(tf.int32, [None], 'input_lengths'), 39 | tf.placeholder(tf.float32, [None, None, hparams.num_mels], 'mel_targets'), 40 | tf.placeholder(tf.float32, [None, None, hparams.num_freq], 'linear_targets'), 41 | tf.placeholder(tf.float32, [None, None], 'stop_token_targets') 42 | ] 43 | 44 | # Create queue for buffering data: 45 | queue = tf.FIFOQueue(8, [tf.int32, tf.int32, tf.float32, tf.float32, tf.float32], name='input_queue') 46 | self._enqueue_op = queue.enqueue(self._placeholders) 47 | self.inputs, self.input_lengths, self.mel_targets, self.linear_targets, self.stop_token_targets = queue.dequeue() 48 | self.inputs.set_shape(self._placeholders[0].shape) 49 | self.input_lengths.set_shape(self._placeholders[1].shape) 50 | self.mel_targets.set_shape(self._placeholders[2].shape) 51 | self.linear_targets.set_shape(self._placeholders[3].shape) 52 | self.stop_token_targets.set_shape(self._placeholders[4].shape) 53 | self._cmudict = None 54 | 55 | # # Load CMUDict: If enabled, this will randomly substitute some words in the training data with 56 | # # their ARPABet equivalents, which will allow you to also pass ARPABet to the model for 57 | # # synthesis (useful for proper nouns, etc.) 
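# (Added note: the CMUDict substitution below is disabled in this code; inputs are built
# solely via text_to_sequence with the cleaners listed in hparams.cleaners.)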
58 | # if hparams.use_cmudict: 59 | # cmudict_path = os.path.join(self._datadir, 'cmudict-0.7b') 60 | # if not os.path.isfile(cmudict_path): 61 | # raise Exception('If use_cmudict=True, you must download ' + 62 | # 'http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b to %s' % cmudict_path) 63 | # self._cmudict = cmudict.CMUDict(cmudict_path, keep_ambiguous=False) 64 | # log('Loaded CMUDict with %d unambiguous entries' % len(self._cmudict)) 65 | # else: 66 | # self._cmudict = None 67 | 68 | def start_in_session(self, session): 69 | self._session = session 70 | self.start() 71 | 72 | def run(self): 73 | try: 74 | while not self._coord.should_stop(): 75 | self._enqueue_next_group() 76 | except Exception as e: 77 | traceback.print_exc() 78 | self._coord.request_stop(e) 79 | 80 | def _enqueue_next_group(self): 81 | start = time.time() 82 | 83 | # Read a group of examples: 84 | n = self._hparams.batch_size 85 | r = self._hparams.outputs_per_step 86 | examples = [self._get_next_example() for i in range(n * _batches_per_group)] 87 | 88 | # Bucket examples based on similar output sequence length for efficiency: 89 | examples.sort(key=lambda x: x[-1]) 90 | batches = [examples[i:i + n] for i in range(0, len(examples), n)] 91 | random.shuffle(batches) 92 | 93 | log('Generated %d batches of size %d in %.03f sec' % (len(batches), n, time.time() - start)) 94 | for batch in batches: 95 | feed_dict = dict(zip(self._placeholders, _prepare_batch(batch, r))) 96 | self._session.run(self._enqueue_op, feed_dict=feed_dict) 97 | 98 | def _get_next_example(self): 99 | '''Loads a single example (input, mel_target, linear_target, cost) from disk''' 100 | if self._offset >= len(self._metadata): 101 | self._offset = 0 102 | random.shuffle(self._metadata) 103 | meta = self._metadata[self._offset] 104 | self._offset += 1 105 | 106 | text = meta[3] 107 | # if self._cmudict and random.random() < _p_cmudict: 108 | # text = ' '.join([self._maybe_get_arpabet(word) for word in text.split(' ')]) 109 | 110 | input_data = np.asarray(text_to_sequence(text, self._cleaner_names), dtype=np.int32) 111 | linear_target = np.load(os.path.join(self._datadir, meta[0])) 112 | mel_target = np.load(os.path.join(self._datadir, meta[1])) 113 | stop_token_target = np.asarray([0.] 
* len(mel_target)) 114 | return (input_data, mel_target, linear_target, stop_token_target, len(linear_target)) 115 | 116 | def _maybe_get_arpabet(self, word): 117 | arpabet = self._cmudict.lookup(word) 118 | return '{%s}' % arpabet[0] if arpabet is not None and random.random() < 0.5 else word 119 | 120 | 121 | def _prepare_batch(batch, outputs_per_step): 122 | random.shuffle(batch) 123 | inputs = _prepare_inputs([x[0] for x in batch]) 124 | input_lengths = np.asarray([len(x[0]) for x in batch], dtype=np.int32) 125 | mel_targets = _prepare_targets([x[1] for x in batch], outputs_per_step) 126 | linear_targets = _prepare_targets([x[2] for x in batch], outputs_per_step) 127 | stop_token_targets = _prepare_stop_token_targets([x[3] for x in batch], outputs_per_step) 128 | return (inputs, input_lengths, mel_targets, linear_targets, stop_token_targets) 129 | 130 | 131 | def _prepare_inputs(inputs): 132 | max_len = max((len(x) for x in inputs)) 133 | return np.stack([_pad_input(x, max_len) for x in inputs]) 134 | 135 | 136 | def _prepare_targets(targets, alignment): 137 | max_len = max((len(t) for t in targets)) + 1 138 | return np.stack([_pad_target(t, _round_up(max_len, alignment)) for t in targets]) 139 | 140 | def _prepare_stop_token_targets(targets, alignment): 141 | max_len = max((len(t) for t in targets)) + 1 142 | return np.stack([_pad_stop_token_target(t, _round_up(max_len, alignment)) for t in targets]) 143 | 144 | def _pad_input(x, length): 145 | return np.pad(x, (0, length - x.shape[0]), mode='constant', constant_values=_pad) 146 | 147 | 148 | def _pad_target(t, length): 149 | return np.pad(t, [(0, length - t.shape[0]), (0, 0)], mode='constant', constant_values=_pad) 150 | 151 | def _pad_stop_token_target(t, length): 152 | return np.pad(t, (0, length - t.shape[0]), mode='constant', constant_values=_stop_token_pad) 153 | 154 | def _round_up(x, multiple): 155 | remainder = x % multiple 156 | return x if remainder == 0 else x + multiple - remainder 157 | -------------------------------------------------------------------------------- /text/korean.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # Code based on carpedm20 3 | 4 | import re 5 | import os 6 | import ast 7 | import json 8 | from jamo import hangul_to_jamo, h2j, j2h 9 | 10 | from .kor_dic import english_dictionary, etc_dictionary 11 | 12 | PAD = '_' 13 | EOS = '~' 14 | PUNC = '!\'(),-.:;?' 
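# The symbol inventory below uses decomposed (conjoining) jamo: leads U+1100-U+1112,
# vowels U+1161-U+1175 and tails U+11A8-U+11C2, plus the punctuation above and a space.
# PAD and EOS are prepended to ALL_SYMBOLS, so char_to_id maps them to IDs 0 and 1.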
15 | SPACE = ' ' 16 | 17 | JAMO_LEADS = "".join([chr(_) for _ in range(0x1100, 0x1113)]) 18 | JAMO_VOWELS = "".join([chr(_) for _ in range(0x1161, 0x1176)]) 19 | JAMO_TAILS = "".join([chr(_) for _ in range(0x11A8, 0x11C3)]) 20 | 21 | VALID_CHARS = JAMO_LEADS + JAMO_VOWELS + JAMO_TAILS + PUNC + SPACE 22 | ALL_SYMBOLS = PAD + EOS + VALID_CHARS 23 | 24 | char_to_id = {c: i for i, c in enumerate(ALL_SYMBOLS)} 25 | id_to_char = {i: c for i, c in enumerate(ALL_SYMBOLS)} 26 | 27 | quote_checker = """([`"'"“‘])(.+?)([`"'"”’])""" 28 | 29 | 30 | def is_lead(char): 31 | return char in JAMO_LEADS 32 | 33 | 34 | def is_vowel(char): 35 | return char in JAMO_VOWELS 36 | 37 | 38 | def is_tail(char): 39 | return char in JAMO_TAILS 40 | 41 | 42 | def get_mode(char): 43 | if is_lead(char): 44 | return 0 45 | elif is_vowel(char): 46 | return 1 47 | elif is_tail(char): 48 | return 2 49 | else: 50 | return -1 51 | 52 | 53 | def _get_text_from_candidates(candidates): 54 | if len(candidates) == 0: 55 | return "" 56 | elif len(candidates) == 1: 57 | return _jamo_char_to_hcj(candidates[0]) 58 | else: 59 | return j2h(**dict(zip(["lead", "vowel", "tail"], candidates))) 60 | 61 | 62 | def jamo_to_korean(text): 63 | text = h2j(text) 64 | 65 | idx = 0 66 | new_text = "" 67 | candidates = [] 68 | 69 | while True: 70 | if idx >= len(text): 71 | new_text += _get_text_from_candidates(candidates) 72 | break 73 | 74 | char = text[idx] 75 | mode = get_mode(char) 76 | 77 | if mode == 0: 78 | new_text += _get_text_from_candidates(candidates) 79 | candidates = [char] 80 | elif mode == -1: 81 | new_text += _get_text_from_candidates(candidates) 82 | new_text += char 83 | candidates = [] 84 | else: 85 | candidates.append(char) 86 | 87 | idx += 1 88 | return new_text 89 | 90 | 91 | num_to_kor = { 92 | '0': '영', 93 | '1': '일', 94 | '2': '이', 95 | '3': '삼', 96 | '4': '사', 97 | '5': '오', 98 | '6': '육', 99 | '7': '칠', 100 | '8': '팔', 101 | '9': '구', 102 | } 103 | 104 | unit_to_kor1 = { 105 | '%': '퍼센트', 106 | 'cm': '센치미터', 107 | 'mm': '밀리미터', 108 | 'km': '킬로미터', 109 | 'kg': '킬로그람', 110 | } 111 | unit_to_kor2 = { 112 | 'm': '미터', 113 | } 114 | 115 | upper_to_kor = { 116 | 'A': '에이', 117 | 'B': '비', 118 | 'C': '씨', 119 | 'D': '디', 120 | 'E': '이', 121 | 'F': '에프', 122 | 'G': '지', 123 | 'H': '에이치', 124 | 'I': '아이', 125 | 'J': '제이', 126 | 'K': '케이', 127 | 'L': '엘', 128 | 'M': '엠', 129 | 'N': '엔', 130 | 'O': '오', 131 | 'P': '피', 132 | 'Q': '큐', 133 | 'R': '알', 134 | 'S': '에스', 135 | 'T': '티', 136 | 'U': '유', 137 | 'V': '브이', 138 | 'W': '더블유', 139 | 'X': '엑스', 140 | 'Y': '와이', 141 | 'Z': '지', 142 | } 143 | 144 | 145 | def compare_sentence_with_jamo(text1, text2): 146 | return h2j(text1) != h2j(text2) 147 | 148 | 149 | def tokenize(text, as_id=False): 150 | # jamo package에 있는 hangul_to_jamo를 이용하여 한글 string을 초성/중성/종성으로 나눈다. 
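# (In English: use hangul_to_jamo from the jamo package to decompose a Hangul string
# into lead/vowel/tail jamo.)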
151 | text = normalize(text) 152 | tokens = list(hangul_to_jamo(text)) # '존경하는' --> ['ᄌ', 'ᅩ', 'ᆫ', 'ᄀ', 'ᅧ', 'ᆼ', 'ᄒ', 'ᅡ', 'ᄂ', 'ᅳ', 'ᆫ', '~'] 153 | 154 | if as_id: 155 | return [char_to_id[token] for token in tokens] + [char_to_id[EOS]] 156 | else: 157 | return [token for token in tokens] + [EOS] 158 | 159 | 160 | def tokenizer_fn(iterator): 161 | return (token for x in iterator for token in tokenize(x, as_id=False)) 162 | 163 | 164 | def normalize(text): 165 | text = text.strip() 166 | 167 | text = re.sub('\(\d+일\)', '', text) 168 | text = re.sub('\([⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+\)', '', text) 169 | 170 | text = normalize_with_dictionary(text, etc_dictionary) 171 | text = normalize_english(text) 172 | text = re.sub('[a-zA-Z]+', normalize_upper, text) 173 | 174 | text = normalize_quote(text) 175 | text = normalize_number(text) 176 | 177 | return text 178 | 179 | 180 | def normalize_with_dictionary(text, dic): 181 | if any(key in text for key in dic.keys()): 182 | pattern = re.compile('|'.join(re.escape(key) for key in dic.keys())) 183 | return pattern.sub(lambda x: dic[x.group()], text) 184 | else: 185 | return text 186 | 187 | 188 | def normalize_english(text): 189 | def fn(m): 190 | word = m.group() 191 | if word in english_dictionary: 192 | return english_dictionary.get(word) 193 | else: 194 | return word 195 | 196 | text = re.sub("([A-Za-z]+)", fn, text) 197 | return text 198 | 199 | 200 | def normalize_upper(text): 201 | text = text.group(0) 202 | 203 | if all([char.isupper() for char in text]): 204 | return "".join(upper_to_kor[char] for char in text) 205 | else: 206 | return text 207 | 208 | 209 | def normalize_quote(text): 210 | def fn(found_text): 211 | from nltk import sent_tokenize # NLTK doesn't along with multiprocessing 212 | 213 | found_text = found_text.group() 214 | unquoted_text = found_text[1:-1] 215 | 216 | sentences = sent_tokenize(unquoted_text) 217 | return " ".join(["'{}'".format(sent) for sent in sentences]) 218 | 219 | return re.sub(quote_checker, fn, text) 220 | 221 | 222 | number_checker = "([+-]?\d[\d,]*)[\.]?\d*" 223 | count_checker = "(시|명|가지|살|마리|포기|송이|수|톨|통|점|개|벌|척|채|다발|그루|자루|줄|켤레|그릇|잔|마디|상자|사람|곡|병|판)" 224 | 225 | 226 | def normalize_number(text): 227 | text = normalize_with_dictionary(text, unit_to_kor1) 228 | text = normalize_with_dictionary(text, unit_to_kor2) 229 | text = re.sub(number_checker + count_checker, 230 | lambda x: number_to_korean(x, True), text) 231 | text = re.sub(number_checker, 232 | lambda x: number_to_korean(x, False), text) 233 | return text 234 | 235 | 236 | num_to_kor1 = [""] + list("일이삼사오육칠팔구") 237 | num_to_kor2 = [""] + list("만억조경해") 238 | num_to_kor3 = [""] + list("십백천") 239 | 240 | # count_to_kor1 = [""] + ["하나","둘","셋","넷","다섯","여섯","일곱","여덟","아홉"] 241 | count_to_kor1 = [""] + ["한", "두", "세", "네", "다섯", "여섯", "일곱", "여덟", "아홉"] 242 | 243 | count_tenth_dict = { 244 | "십": "열", 245 | "두십": "스물", 246 | "세십": "서른", 247 | "네십": "마흔", 248 | "다섯십": "쉰", 249 | "여섯십": "예순", 250 | "일곱십": "일흔", 251 | "여덟십": "여든", 252 | "아홉십": "아흔", 253 | } 254 | 255 | 256 | def number_to_korean(num_str, is_count=False): 257 | if is_count: 258 | num_str, unit_str = num_str.group(1), num_str.group(2) 259 | else: 260 | num_str, unit_str = num_str.group(), "" 261 | 262 | num_str = num_str.replace(',', '') 263 | num = ast.literal_eval(num_str) 264 | 265 | if num == 0: 266 | return "영" 267 | 268 | check_float = num_str.split('.') 269 | if len(check_float) == 2: 270 | digit_str, float_str = check_float 271 | elif len(check_float) >= 3: 272 | raise 
Exception(" [!] Wrong number format") 273 | else: 274 | digit_str, float_str = check_float[0], None 275 | 276 | if is_count and float_str is not None: 277 | raise Exception(" [!] `is_count` and float number does not fit each other") 278 | 279 | digit = int(digit_str) 280 | 281 | if digit_str.startswith("-"): 282 | digit, digit_str = abs(digit), str(abs(digit)) 283 | 284 | kor = "" 285 | size = len(str(digit)) 286 | tmp = [] 287 | 288 | for i, v in enumerate(digit_str, start=1): 289 | v = int(v) 290 | 291 | if v != 0: 292 | if is_count: 293 | tmp += count_to_kor1[v] 294 | else: 295 | tmp += num_to_kor1[v] 296 | 297 | tmp += num_to_kor3[(size - i) % 4] 298 | 299 | if (size - i) % 4 == 0 and len(tmp) != 0: 300 | kor += "".join(tmp) 301 | tmp = [] 302 | kor += num_to_kor2[int((size - i) / 4)] 303 | 304 | if is_count: 305 | if kor.startswith("한") and len(kor) > 1: 306 | kor = kor[1:] 307 | 308 | if any(word in kor for word in count_tenth_dict): 309 | kor = re.sub( 310 | '|'.join(count_tenth_dict.keys()), 311 | lambda x: count_tenth_dict[x.group()], kor) 312 | 313 | if not is_count and kor.startswith("일") and len(kor) > 1: 314 | kor = kor[1:] 315 | 316 | if float_str is not None: 317 | kor += "쩜 " 318 | kor += re.sub('\d', lambda x: num_to_kor[x.group()], float_str) 319 | 320 | if num_str.startswith("+"): 321 | kor = "플러스 " + kor 322 | elif num_str.startswith("-"): 323 | kor = "마이너스 " + kor 324 | 325 | return kor + unit_str 326 | 327 | 328 | if __name__ == "__main__": 329 | def test_normalize(text): 330 | print(text) 331 | print(normalize(text)) 332 | print("=" * 30) 333 | 334 | 335 | test_normalize("JTBC는 JTBCs를 DY는 A가 Absolute") 336 | test_normalize("오늘(13일) 3,600마리 강아지가") 337 | test_normalize("60.3%") 338 | test_normalize('"저돌"(猪突) 입니다.') 339 | test_normalize('비대위원장이 지난 1월 이런 말을 했습니다. “난 그냥 산돼지처럼 돌파하는 스타일이다”') 340 | test_normalize("지금은 -12.35%였고 종류는 5가지와 19가지, 그리고 55가지였다") 341 | test_normalize("JTBC는 TH와 K 양이 2017년 9월 12일 오후 12시에 24살이 된다") 342 | print(list(hangul_to_jamo(list(hangul_to_jamo('비대위원장이 지난 1월 이런 말을 했습니다? “난 그냥 산돼지처럼 돌파하는 스타일이다”'))))) -------------------------------------------------------------------------------- /models/tacotron2.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.contrib.rnn import GRUCell, MultiRNNCell, OutputProjectionWrapper, ResidualWrapper 3 | from tensorflow.contrib.seq2seq import BasicDecoder, BahdanauAttention, AttentionWrapper, BahdanauMonotonicAttention, LuongAttention 4 | from text.symbols import symbols 5 | from util.infolog import log 6 | from .helpers import TacoTestHelper, TacoTrainingHelper 7 | from .modules import encoder_cbhg, post_cbhg, prenet, LocationSensitiveAttention, ZoneoutLSTMCell, GmmAttention, BahdanauStepwiseMonotonicAttention 8 | from .rnn_wrappers import DecoderPrenetWrapper, ConcatOutputAndAttentionWrapper 9 | 10 | 11 | class Tacotron2(): 12 | def __init__(self, hparams): 13 | self._hparams = hparams 14 | 15 | def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None, stop_token_targets=None): 16 | '''Initializes the model for inference. 17 | 18 | Sets "mel_outputs", "linear_outputs", and "alignments" fields. 19 | 20 | Args: 21 | inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of 22 | steps in the input time series, and values are character IDs 23 | input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths 24 | of each sequence in inputs. 
25 | mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number 26 | of steps in the output time series, M is num_mels, and values are entries in the mel 27 | spectrogram. Only needed for training. 28 | linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number 29 | of steps in the output time series, F is num_freq, and values are entries in the linear 30 | spectrogram. Only needed for training. 31 | ''' 32 | with tf.variable_scope('inference') as scope: 33 | is_training = linear_targets is not None 34 | batch_size = tf.shape(inputs)[0] 35 | hp = self._hparams 36 | 37 | # Embeddings 38 | embedding_table = tf.get_variable( 39 | 'embedding', [len(symbols), hp.embed_depth], dtype=tf.float32, 40 | initializer=tf.truncated_normal_initializer(stddev=0.5)) 41 | 42 | embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs) # [N, T_in, embed_depth=256] 43 | 44 | with tf.variable_scope('Encoder') as scope: 45 | 46 | x = embedded_inputs 47 | 48 | #3 Conv Layers 49 | for i in range(3): 50 | x = tf.layers.conv1d(x,filters=512,kernel_size=5,padding='same',activation=tf.nn.relu,name='Encoder_{}'.format(i)) 51 | x = tf.layers.batch_normalization(x, training=is_training) 52 | x = tf.layers.dropout(x, rate=0.5, training=is_training, name='dropout_{}'.format(i)) 53 | encoder_conv_output = x 54 | 55 | #bi-directional LSTM 56 | cell_fw= ZoneoutLSTMCell(256, is_training, zoneout_factor_cell=0.1, zoneout_factor_output=0.1, name='encoder_fw_LSTM') 57 | cell_bw= ZoneoutLSTMCell(256, is_training, zoneout_factor_cell=0.1, zoneout_factor_output=0.1, name='encoder_bw_LSTM') 58 | 59 | outputs, states = tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, encoder_conv_output, sequence_length=input_lengths, dtype=tf.float32) 60 | 61 | # envoder_outpust = [N,T,2*encoder_lstm_units] = [N,T,512] 62 | encoder_outputs = tf.concat(outputs, axis=2) # Concat and return forward + backward outputs 63 | 64 | with tf.variable_scope('Decoder') as scope: 65 | 66 | if hp.attention_type == 'loc_sen': # Location Sensitivity Attention 67 | attention_mechanism = LocationSensitiveAttention(128, encoder_outputs,hparams=hp, is_training=is_training, 68 | mask_encoder=True, memory_sequence_length = input_lengths, smoothing=False, cumulate_weights=True) 69 | elif hp.attention_type == 'gmm': # GMM Attention 70 | attention_mechanism = GmmAttention(128, memory=encoder_outputs, memory_sequence_length = input_lengths) 71 | elif hp.attention_type == 'step_bah': 72 | attention_mechanism = BahdanauStepwiseMonotonicAttention(128, encoder_outputs, memory_sequence_length = input_lengths, mode="parallel") 73 | elif hp.attention_type == 'mon_bah': 74 | attention_mechanism = BahdanauMonotonicAttention(128, encoder_outputs, memory_sequence_length = input_lengths, normalize=True) 75 | elif hp.attention_type == 'loung': 76 | attention_mechanism = LuongAttention(128, encoder_outputs, memory_sequence_length = input_lengths) 77 | 78 | # attention_mechanism = LocationSensitiveAttention(128, encoder_outputs, hparams=hp, is_training=is_training, mask_encoder=True, memory_sequence_length = input_lengths, smoothing=False, cumulate_weights=True) 79 | #mask_encoder: whether to mask encoder padding while computing location sensitive attention. Set to True for better prosody but slower convergence. 
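# Note (added): hp.attention_type selects one of the five mechanisms above and can be
# overridden at launch time, e.g. --hparams='attention_type=gmm' (parsed in train.py);
# any other value leaves attention_mechanism unassigned and raises a NameError when the
# AttentionWrapper below is built.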
80 | #cumulate_weights: Whether to cumulate (sum) all previous attention weights or simply feed previous weights (Recommended: True) 81 | 82 | decoder_lstm = [ZoneoutLSTMCell(1024, is_training, zoneout_factor_cell=0.1, zoneout_factor_output=0.1, name='decoder_LSTM_{}'.format(i+1)) for i in range(2)] 83 | 84 | decoder_lstm = tf.contrib.rnn.MultiRNNCell(decoder_lstm, state_is_tuple=True) 85 | # decoder_init_state = decoder_lstm.zero_state(batch_size=batch_size, dtype=tf.float32) #tensorflow1에는 없음 86 | 87 | attention_cell = AttentionWrapper(decoder_lstm, attention_mechanism, alignment_history=True, output_attention=False) 88 | 89 | # attention_state_size = 256 90 | # Decoder input -> prenet -> decoder_lstm -> concat[output, attention] 91 | dec_outputs = DecoderPrenetWrapper(attention_cell, is_training, hp.prenet_depths) 92 | dec_outputs_cell = OutputProjectionWrapper(dec_outputs,(hp.num_mels) * hp.outputs_per_step) 93 | 94 | if is_training: 95 | helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels, hp.outputs_per_step) 96 | else: 97 | helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step) 98 | 99 | decoder_init_state = dec_outputs_cell.zero_state(batch_size=batch_size, dtype=tf.float32) 100 | (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode( 101 | BasicDecoder(dec_outputs_cell, helper, decoder_init_state), 102 | maximum_iterations=hp.max_iters) # [N, T_out/r, M*r] 103 | 104 | # Reshape outputs to be one output per entry 105 | decoder_mel_outputs = tf.reshape(decoder_outputs[:,:,:hp.num_mels * hp.outputs_per_step], [batch_size, -1, hp.num_mels]) # [N, T_out, M] 106 | #stop_token_outputs = tf.reshape(decoder_outputs[:,:,hp.num_mels * hp.outputs_per_step:], [batch_size, -1]) # [N,iters] 107 | 108 | # Postnet 109 | x = decoder_mel_outputs 110 | for i in range(5): 111 | activation = tf.nn.tanh if i != (4) else None 112 | x = tf.layers.conv1d(x,filters=512, kernel_size=5, padding='same', activation=activation, name='Postnet_{}'.format(i)) 113 | x = tf.layers.batch_normalization(x, training=is_training) 114 | x = tf.layers.dropout(x, rate=0.5, training=is_training, name='Postnet_dropout_{}'.format(i)) 115 | 116 | residual = tf.layers.dense(x, hp.num_mels, name='residual_projection') 117 | mel_outputs = decoder_mel_outputs + residual 118 | 119 | # Add post-processing CBHG: 120 | # mel_outputs: (N,T,num_mels) 121 | post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training, hp.postnet_depth) 122 | linear_outputs = tf.layers.dense(post_outputs, hp.num_freq) # [N, T_out, F(1025)] 123 | 124 | # Grab alignments from the final decoder state: 125 | alignments = tf.transpose(final_decoder_state.alignment_history.stack(), [1, 2, 0]) # batch_size, text length(encoder), target length(decoder) 126 | 127 | 128 | self.inputs = inputs 129 | self.input_lengths = input_lengths 130 | self.decoder_mel_outputs = decoder_mel_outputs 131 | self.mel_outputs = mel_outputs 132 | self.linear_outputs = linear_outputs 133 | self.alignments = alignments 134 | self.mel_targets = mel_targets 135 | self.linear_targets = linear_targets 136 | #self.stop_token_targets = stop_token_targets 137 | #self.stop_token_outputs = stop_token_outputs 138 | self.all_vars = tf.trainable_variables() 139 | log('Initialized Tacotron model. 
Dimensions: ') 140 | log(' embedding: %d' % embedded_inputs.shape[-1]) 141 | # log(' prenet out: %d' % prenet_outputs.shape[-1]) 142 | log(' encoder out: %d' % encoder_outputs.shape[-1]) 143 | log(' attention out: %d' % attention_cell.output_size) 144 | #log(' concat attn & out: %d' % concat_cell.output_size) 145 | log(' decoder cell out: %d' % dec_outputs_cell.output_size) 146 | log(' decoder out (%d frames): %d' % (hp.outputs_per_step, decoder_outputs.shape[-1])) 147 | log(' decoder out (1 frame): %d' % mel_outputs.shape[-1]) 148 | log(' postnet out: %d' % post_outputs.shape[-1]) 149 | log(' linear out: %d' % linear_outputs.shape[-1]) 150 | 151 | def add_loss(self): 152 | '''Adds loss to the model. Sets "loss" field. initialize must have been called.''' 153 | with tf.variable_scope('loss') as scope: 154 | hp = self._hparams 155 | before = tf.losses.mean_squared_error(self.mel_targets, self.decoder_mel_outputs) 156 | after = tf.losses.mean_squared_error(self.mel_targets, self.mel_outputs) 157 | 158 | self.mel_loss = before + after 159 | 160 | 161 | #self.stop_token_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=self.stop_token_targets, logits=self.stop_token_outputs)) 162 | 163 | l1 = tf.abs(self.linear_targets - self.linear_outputs) 164 | # Prioritize loss for frequencies under 3000 Hz. 165 | n_priority_freq = int(3000 / (hp.sample_rate * 0.5) * hp.num_freq) 166 | self.linear_loss = 0.5 * tf.reduce_mean(l1) + 0.5 * tf.reduce_mean(l1[:, :, 0:n_priority_freq]) 167 | 168 | self.regularization = tf.add_n([tf.nn.l2_loss(v) for v in self.all_vars 169 | if not('bias' in v.name or 'Bias' in v.name or '_projection' in v.name or 'inputs_embedding' in v.name 170 | or 'RNN' in v.name or 'LSTM' in v.name)]) * hp.reg_weight 171 | self.loss = self.mel_loss + self.linear_loss + self.regularization 172 | 173 | def add_optimizer(self, global_step): 174 | '''Adds optimizer. Sets "gradients" and "optimize" fields. add_loss must have been called. 175 | 176 | Args: 177 | global_step: int32 scalar Tensor representing current global step in training 178 | ''' 179 | with tf.variable_scope('optimizer') as scope: 180 | hp = self._hparams 181 | if hp.decay_learning_rate: 182 | self.learning_rate = _learning_rate_decay(hp.initial_learning_rate, global_step) 183 | else: 184 | self.learning_rate = tf.convert_to_tensor(hp.initial_learning_rate) 185 | optimizer = tf.train.AdamOptimizer(self.learning_rate, hp.adam_beta1, hp.adam_beta2) 186 | gradients, variables = zip(*optimizer.compute_gradients(self.loss)) 187 | self.gradients = gradients 188 | clipped_gradients, _ = tf.clip_by_global_norm(gradients, 1.0) 189 | 190 | # Add dependency on UPDATE_OPS; otherwise batchnorm won't work correctly. 
See: 191 | # https://github.com/tensorflow/tensorflow/issues/1122 192 | with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)): 193 | self.optimize = optimizer.apply_gradients(zip(clipped_gradients, variables), 194 | global_step=global_step) 195 | 196 | 197 | def _learning_rate_decay(init_lr, global_step): 198 | # Noam scheme from tensor2tensor: 199 | warmup_steps = 4000.0 200 | step = tf.cast(global_step + 1, dtype=tf.float32) 201 | return init_lr * warmup_steps ** 0.5 * tf.minimum(step * warmup_steps ** -1.5, step ** -0.5) 202 | -------------------------------------------------------------------------------- /models/modules.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from tensorflow.contrib.rnn import RNNCell, GRUCell 4 | from tensorflow.python.ops import rnn_cell_impl 5 | from tensorflow.contrib.framework import nest 6 | from tensorflow.contrib.seq2seq.python.ops.attention_wrapper import _bahdanau_score, _BaseAttentionMechanism, BahdanauAttention, BahdanauMonotonicAttention, AttentionWrapperState, AttentionMechanism, _BaseMonotonicAttentionMechanism, _maybe_mask_score,_prepare_memory, _monotonic_probability_fn 7 | from tensorflow.python.ops import array_ops, math_ops, nn_ops, variable_scope, random_ops 8 | from tensorflow.python.layers.core import Dense 9 | 10 | import functools 11 | _zero_state_tensors = rnn_cell_impl._zero_state_tensors 12 | 13 | ''' 14 | Adding zoneoutLSTMcell and LocationSensitiveAttention function to existing code for Tacotron2 15 | ''' 16 | 17 | def prenet(inputs, is_training, layer_sizes, scope=None): 18 | """ 19 | Args: 20 | inputs: input vector 21 | is_training: dropout option 22 | layer_sizes: iteration number 23 | 24 | Output: 25 | x: prenet 26 | """ 27 | x = inputs 28 | drop_rate = 0.5 if is_training else 0.0 # set dropout rate 0.5 (only training) 29 | with tf.variable_scope(scope or 'prenet'): 30 | for i, size in enumerate(layer_sizes): # iterate layer_sizes 31 | dense = tf.layers.dense(x, units=size, activation=tf.nn.relu, name='dense_%d' % (i + 1)) 32 | x = tf.layers.dropout(dense, rate=drop_rate, training=is_training, name='dropout_%d' % (i + 1)) 33 | return x 34 | 35 | 36 | def encoder_cbhg(inputs, input_lengths, is_training, depth): 37 | """ 38 | Args: 39 | inputs: input tensor 40 | input_lengths: length of input tensor 41 | is_training: Batch Normalization option in Conv1D 42 | depth: dimensionality option of Highway net and Bidirectical GRU's output 43 | 44 | Output: 45 | cbhg function 46 | """ 47 | input_channels = inputs.get_shape()[2] # 3rd element of inputs' shape 48 | return cbhg( 49 | inputs, 50 | input_lengths, 51 | is_training, 52 | scope='encoder_cbhg', 53 | K=16, 54 | projections=[128, input_channels], 55 | depth=depth) 56 | 57 | 58 | def post_cbhg(inputs, input_dim, is_training, depth): 59 | """ 60 | Args: 61 | inputs: input tensor 62 | input_dim: dimension of input tensor 63 | is_training: Batch Normalization option in Conv1D 64 | depth: dimensionality option of Highway net and Bidirectical GRU's output 65 | 66 | Output: 67 | cbhg function 68 | """ 69 | return cbhg( 70 | inputs, 71 | None, 72 | is_training, 73 | scope='post_cbhg', 74 | K=8, 75 | projections=[256, input_dim], 76 | depth=depth) 77 | 78 | 79 | def cbhg(inputs, input_lengths, is_training, scope, K, projections, depth): 80 | """ 81 | Args: 82 | inputs: input tensor 83 | input_lengths: length of input tensor 84 | is_training: Batch Normalization option in Conv1D 85 
| scope: network or model name 86 | K: kernel size range 87 | projections: projection layers option 88 | depth: dimensionality option of Highway net and Bidirectical GRU's output 89 | The layers in the code are staked in the order in which they came out. 90 | """ 91 | with tf.variable_scope(scope): 92 | with tf.variable_scope('conv_bank'): 93 | 94 | conv_outputs = tf.concat( 95 | [conv1d(inputs, k, 128, tf.nn.relu, is_training, 'conv1d_%d' % k) for k in range(1, K + 1)], #1D Convolution layers using multiple types of Convolution Kernel. 96 | axis=-1 #Iterate K with increasing filter size by 1. 97 | )# Convolution bank: concatenate on the last axis to stack channels from all convolutions 98 | 99 | # Maxpooling: 100 | maxpool_output = tf.layers.max_pooling1d( 101 | conv_outputs, 102 | pool_size=2, 103 | strides=1, 104 | padding='same') #1D Maxpooling layer(strides=1, width=2) 105 | 106 | # Two projection layers: 107 | proj1_output = conv1d(maxpool_output, 3, projections[0], tf.nn.relu, is_training, 'proj_1')#1st Conv1D projections 108 | proj2_output = conv1d(proj1_output, 3, projections[1], None, is_training, 'proj_2')#2nd Conv1D projections 109 | 110 | # Residual connection: 111 | highway_input = proj2_output + inputs #Highway net input with residual connection 112 | 113 | half_depth = depth // 2 114 | assert half_depth * 2 == depth, 'encoder and postnet depths must be even.' #assert depth to be even 115 | 116 | # Handle dimensionality mismatch: 117 | if highway_input.shape[2] != half_depth: #check input's dimensionality and output's dimensionality are the same 118 | highway_input = tf.layers.dense(highway_input, half_depth) #change input's channel size to Highway net output's size 119 | 120 | # 4-layer HighwayNet: 121 | for i in range(4): 122 | highway_input = highwaynet(highway_input, 'highway_%d' % (i + 1), half_depth) #make 4 Highway net layers 123 | rnn_input = highway_input 124 | 125 | # Bidirectional GRU 126 | outputs, states = tf.nn.bidirectional_dynamic_rnn( #make Bidirectional GRU 127 | GRUCell(half_depth), 128 | GRUCell(half_depth), 129 | rnn_input, 130 | sequence_length=input_lengths, 131 | dtype=tf.float32) 132 | return tf.concat(outputs, axis=2) # Concat forward sequence and backward sequence 133 | 134 | def highwaynet(inputs, scope, depth): 135 | with tf.variable_scope(scope): 136 | H = tf.layers.dense( 137 | inputs, 138 | units=depth, 139 | activation=tf.nn.relu, 140 | name='H') 141 | T = tf.layers.dense( 142 | inputs, 143 | units=depth, 144 | activation=tf.nn.sigmoid, 145 | name='T', 146 | bias_initializer=tf.constant_initializer(-1.0)) 147 | return H * T + inputs * (1.0 - T) 148 | 149 | 150 | def conv1d(inputs, kernel_size, channels, activation, is_training, scope): 151 | """ 152 | Args: 153 | inputs: input tensor 154 | kernel_size: length of the 1D convolution window 155 | channels: dimensionality of the output space 156 | activation: Activation function (None means linear activation) 157 | is_training: Batch Normalization option in Conv1D 158 | scope: namespace 159 | 160 | Output: 161 | output tensor 162 | """ 163 | with tf.variable_scope(scope): 164 | conv1d_output = tf.layers.conv1d( # creates a convolution kernel 165 | inputs, 166 | filters=channels, 167 | kernel_size=kernel_size, 168 | activation=activation, 169 | padding='same') # return output tensor 170 | return tf.layers.batch_normalization(conv1d_output, training=is_training) 171 | 172 | 173 | class ZoneoutLSTMCell(RNNCell): 174 | '''Wrapper for tf LSTM to create Zoneout LSTM Cell 175 | inspired by: 176 | 
https://github.com/teganmaharaj/zoneout/blob/master/zoneout_tensorflow.py
177 | Published by one of the authors of the zoneout paper (https://arxiv.org/pdf/1606.01305.pdf).
178 | '''
179 | def __init__(self, num_units, is_training, zoneout_factor_cell=0., zoneout_factor_output=0., state_is_tuple=True, name=None):
180 | '''Initializer with possibility to set different zoneout values for cell/hidden states.
181 | '''
182 | zm = min(zoneout_factor_output, zoneout_factor_cell)
183 | zs = max(zoneout_factor_output, zoneout_factor_cell)
184 |
185 | if zm < 0. or zs > 1.:
186 | raise ValueError('One/both provided Zoneout factors are not in [0, 1]')
187 |
188 | self._cell = tf.nn.rnn_cell.LSTMCell(num_units, state_is_tuple=state_is_tuple, name=name)
189 | self._zoneout_cell = zoneout_factor_cell
190 | self._zoneout_outputs = zoneout_factor_output
191 | self.is_training = is_training
192 | self.state_is_tuple = state_is_tuple
193 |
194 | @property
195 | def state_size(self):
196 | return self._cell.state_size
197 |
198 | @property
199 | def output_size(self):
200 | return self._cell.output_size
201 |
202 | def __call__(self, inputs, state, scope=None):
203 | '''Runs the vanilla LSTM cell and applies zoneout.
204 | '''
205 | #Apply vanilla LSTM
206 | output, new_state = self._cell(inputs, state, scope)
207 |
208 | if self.state_is_tuple:
209 | (prev_c, prev_h) = state
210 | (new_c, new_h) = new_state
211 | else:
212 | num_proj = self._cell._num_units if self._cell._num_proj is None else self._cell._num_proj
213 | prev_c = tf.slice(state, [0, 0], [-1, self._cell._num_units])
214 | prev_h = tf.slice(state, [0, self._cell._num_units], [-1, num_proj])
215 | new_c = tf.slice(new_state, [0, 0], [-1, self._cell._num_units])
216 | new_h = tf.slice(new_state, [0, self._cell._num_units], [-1, num_proj])
217 |
218 | #Apply zoneout
219 | if self.is_training:
220 | #nn.dropout takes keep_prob (probability to keep activations) not drop_prob (probability to mask activations)! Per unit, this keeps the previous state with probability z and takes the new state otherwise: dropout zeroes (new - prev) with probability z and rescales kept values by 1/(1 - z), which the leading (1 - z) factor cancels.
221 | c = (1 - self._zoneout_cell) * tf.nn.dropout(new_c - prev_c, (1 - self._zoneout_cell)) + prev_c # tf.nn.dropout outputs the input element scaled up by 1 / keep_prob
222 | h = (1 - self._zoneout_outputs) * tf.nn.dropout(new_h - prev_h, (1 - self._zoneout_outputs)) + prev_h
223 |
224 | else:
225 | c = (1 - self._zoneout_cell) * new_c + self._zoneout_cell * prev_c
226 | h = (1 - self._zoneout_outputs) * new_h + self._zoneout_outputs * prev_h
227 |
228 | new_state = tf.nn.rnn_cell.LSTMStateTuple(c, h) if self.state_is_tuple else tf.concat([c, h], axis=1) # tf.concat takes (values, axis) in TF >= 1.0
229 |
230 | return output, new_state
231 |
232 |
233 | class LocationSensitiveAttention(BahdanauAttention):
234 | """Implements Bahdanau-style (cumulative) scoring function.
235 | Usually referred to as "hybrid" attention (content-based + location-based)
236 | Extends the additive attention described in:
237 | "D. Bahdanau, K. Cho, and Y. Bengio, “Neural machine transla-
238 | tion by jointly learning to align and translate,” in Proceedings
239 | of ICLR, 2015."
240 | to use previous alignments as additional location features.
241 | This attention is described in:
242 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben-
243 | gio, “Attention-based models for speech recognition,” in Ad-
244 | vances in Neural Information Processing Systems, 2015, pp.
245 | 577–585.
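In symbols, matching _location_sensitive_score below: the location features are
f = F * α_{i-1}, a 1-D convolution over the previous (cumulated) alignments, and the
energy is energy = dot(v_a, tanh(W_keys(h_enc) + W_query(h_dec) + W_fil(f) + b_a)).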
246 | """ 247 | 248 | def __init__(self, 249 | num_units, 250 | memory, 251 | hparams, 252 | is_training, 253 | mask_encoder=True, 254 | memory_sequence_length=None, 255 | smoothing=False, 256 | cumulate_weights=True, 257 | name='LocationSensitiveAttention'): 258 | """Construct the Attention mechanism. 259 | Args: 260 | num_units: The depth of the query mechanism. 261 | memory: The memory to query; usually the output of an RNN encoder. This 262 | tensor should be shaped `[batch_size, max_time, ...]`. 263 | mask_encoder (optional): Boolean, whether to mask encoder paddings. 264 | memory_sequence_length (optional): Sequence lengths for the batch entries 265 | in memory. If provided, the memory tensor rows are masked with zeros 266 | for values past the respective sequence lengths. Only relevant if mask_encoder = True. 267 | smoothing (optional): Boolean. Determines which normalization function to use. 268 | Default normalization function (probablity_fn) is softmax. If smoothing is 269 | enabled, we replace softmax with: 270 | a_{i, j} = sigmoid(e_{i, j}) / sum_j(sigmoid(e_{i, j})) 271 | Introduced in: 272 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben- 273 | gio, “Attention-based models for speech recognition,” in Ad- 274 | vances in Neural Information Processing Systems, 2015, pp. 275 | 577–585. 276 | This is mainly used if the model wants to attend to multiple input parts 277 | at the same decoding step. We probably won't be using it since multiple sound 278 | frames may depend on the same character/phone, probably not the way around. 279 | Note: 280 | We still keep it implemented in case we want to test it. They used it in the 281 | paper in the context of speech recognition, where one phoneme may depend on 282 | multiple subsequent sound frames. 283 | name: Name to use when creating ops. 284 | """ 285 | #Create normalization function 286 | #Setting it to None defaults in using softmax 287 | normalization_function = _smoothing_normalization if (smoothing == True) else None 288 | memory_length = memory_sequence_length if (mask_encoder==True) else None 289 | super(LocationSensitiveAttention, self).__init__( 290 | num_units=num_units, 291 | memory=memory, 292 | memory_sequence_length=memory_length, 293 | probability_fn=normalization_function, 294 | name=name) 295 | 296 | self.location_convolution = tf.layers.Conv1D(filters=hparams.attention_filters, 297 | kernel_size=hparams.attention_kernel, padding='same', use_bias=True, 298 | bias_initializer=tf.zeros_initializer(), name='location_features_convolution') 299 | self.location_layer = tf.layers.Dense(units=num_units, use_bias=False,dtype=tf.float32, name='location_features_projection') 300 | self._cumulate = cumulate_weights 301 | self.synthesis_constraint = hparams.synthesis_constraint and not is_training 302 | self.attention_win_size = tf.convert_to_tensor(hparams.attention_win_size, dtype=tf.int32) 303 | self.constraint_type = hparams.synthesis_constraint_type 304 | 305 | def __call__(self, query, state): 306 | """Score the query based on the keys and values. 307 | Args: 308 | query: Tensor of dtype matching `self.values` and shape 309 | `[batch_size, query_depth]`. 310 | state (previous alignments): Tensor of dtype matching `self.values` and shape 311 | `[batch_size, alignments_size]` 312 | (`alignments_size` is memory's `max_time`). 313 | Returns: 314 | alignments: Tensor of dtype matching `self.values` and shape 315 | `[batch_size, alignments_size]` (`alignments_size` is memory's 316 | `max_time`). 
317 | """
318 | previous_alignments = state
319 | with variable_scope.variable_scope(None, "Location_Sensitive_Attention", [query]):
320 |
321 | # processed_query shape [batch_size, query_depth] -> [batch_size, attention_dim]
322 | processed_query = self.query_layer(query) if self.query_layer else query
323 | # -> [batch_size, 1, attention_dim]
324 | processed_query = tf.expand_dims(processed_query, 1)
325 |
326 | # processed_location_features shape [batch_size, max_time, attention_dim]
327 | # [batch_size, max_time] -> [batch_size, max_time, 1]
328 | expanded_alignments = tf.expand_dims(previous_alignments, axis=2)
329 | # location features [batch_size, max_time, filters]
330 | f = self.location_convolution(expanded_alignments)
331 | # Projected location features [batch_size, max_time, attention_dim]
332 | processed_location_features = self.location_layer(f)
333 |
334 | # energy shape [batch_size, max_time]
335 | energy = _location_sensitive_score(processed_query, processed_location_features, self.keys)
336 | # At synthesis time, optionally restrict attention to a window around the previously attended encoder position.
337 | if self.synthesis_constraint:
338 | prev_max_attentions = tf.argmax(previous_alignments, -1, output_type=tf.int32)
339 | Tx = tf.shape(energy)[-1]
340 | # prev_max_attentions = tf.squeeze(prev_max_attentions, [-1])
341 | if self.constraint_type == 'monotonic':
342 | key_masks = tf.sequence_mask(prev_max_attentions, Tx)
343 | reverse_masks = tf.sequence_mask(Tx - self.attention_win_size - prev_max_attentions, Tx)[:, ::-1]
344 | else:
345 | assert self.constraint_type == 'window'
346 | key_masks = tf.sequence_mask(prev_max_attentions - (self.attention_win_size // 2 + (self.attention_win_size % 2 != 0)), Tx)
347 | reverse_masks = tf.sequence_mask(Tx - (self.attention_win_size // 2) - prev_max_attentions, Tx)[:, ::-1]
348 |
349 | masks = tf.logical_or(key_masks, reverse_masks)
350 | paddings = tf.ones_like(energy) * (-2 ** 32 + 1) # very large negative energy outside the allowed window
351 | energy = tf.where(tf.equal(masks, False), energy, paddings)
352 |
353 | # alignments shape = energy shape = [batch_size, max_time]
354 | alignments = self._probability_fn(energy, previous_alignments)
355 |
356 | # Cumulate alignments
357 | if self._cumulate:
358 | next_state = alignments + previous_alignments
359 | else:
360 | next_state = alignments
361 |
362 | return alignments, next_state
363 |
364 |
365 | def _location_sensitive_score(W_query, W_fil, W_keys):
366 | """Implements Bahdanau-style (cumulative) scoring function.
367 | This attention is described in:
368 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben-
369 | gio, “Attention-based models for speech recognition,” in Ad-
370 | vances in Neural Information Processing Systems, 2015, pp.
371 | 577–585.
372 | #############################################################################
373 | hybrid attention (content-based + location-based)
374 | f = F * α_{i-1}
375 | energy = dot(v_a, tanh(W_keys(h_enc) + W_query(h_dec) + W_fil(f) + b_a))
376 | #############################################################################
377 | Args:
378 | W_query: Tensor, shape '[batch_size, 1, attention_dim]' to compare to location features.
379 | W_fil: previous alignments processed into location features, shape '[batch_size, max_time, attention_dim]'
380 | W_keys: Tensor, shape '[batch_size, max_time, attention_dim]', typically the encoder outputs.
381 | Returns: 382 | A '[batch_size, max_time]' attention score (energy) 383 | """ 384 | # Get the number of hidden units from the trailing dimension of keys 385 | dtype = W_query.dtype 386 | num_units = W_keys.shape[-1].value or array_ops.shape(W_keys)[-1] 387 | 388 | v_a = tf.get_variable( 389 | 'attention_variable_projection', shape=[num_units], dtype=dtype, 390 | initializer=tf.contrib.layers.xavier_initializer()) 391 | b_a = tf.get_variable( 392 | 'attention_bias', shape=[num_units], dtype=dtype, 393 | initializer=tf.zeros_initializer()) 394 | 395 | return tf.reduce_sum(v_a * tf.tanh(W_keys + W_query + W_fil + b_a), [2]) 396 | 397 | 398 | def _smoothing_normalization(e): 399 | """Applies a smoothing normalization function instead of softmax 400 | Introduced in: 401 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben- 402 | gio, “Attention-based models for speech recognition,” in Ad- 403 | vances in Neural Information Processing Systems, 2015, pp. 404 | 577–585. 405 | ############################################################################ 406 | Smoothing normalization function 407 | a_{i, j} = sigmoid(e_{i, j}) / sum_j(sigmoid(e_{i, j})) 408 | ############################################################################ 409 | Args: 410 | e: matrix [batch_size, max_time(memory_time)]: expected to be energy (score) 411 | values of an attention mechanism 412 | Returns: 413 | matrix [batch_size, max_time]: [0, 1] normalized alignments with possible 414 | attendance to multiple memory time steps. 415 | """ 416 | return tf.nn.sigmoid(e) / tf.reduce_sum(tf.nn.sigmoid(e), axis=-1, keepdims=True) 417 | 418 | class GmmAttention(AttentionMechanism): 419 | def __init__(self, 420 | num_mixtures, 421 | memory, 422 | memory_sequence_length=None, 423 | check_inner_dims_defined=True, 424 | score_mask_value=None, 425 | name='GmmAttention'): 426 | 427 | self.dtype = memory.dtype 428 | self.num_mixtures = num_mixtures 429 | self.query_layer = tf.layers.Dense(3 * num_mixtures, name='gmm_query_projection', use_bias=True, dtype=self.dtype) 430 | 431 | with tf.name_scope(name, 'GmmAttentionMechanismInit'): 432 | if score_mask_value is None: 433 | score_mask_value = 0. 
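# A mask value of 0 (rather than a large negative number) is appropriate here:
# _maybe_mask_score is applied to the mixture weights phi in __call__, which are
# already non-negative and are used directly as the alignments (no softmax
# follows), so padded memory positions simply receive zero attention weight.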
434 | self._maybe_mask_score = functools.partial( 435 | _maybe_mask_score, 436 | memory_sequence_length=memory_sequence_length, 437 | score_mask_value=score_mask_value) 438 | self._value = _prepare_memory( 439 | memory, memory_sequence_length, check_inner_dims_defined) 440 | self._batch_size = ( 441 | self._value.shape[0].value or tf.shape(self._value)[0]) 442 | self._alignments_size = ( 443 | self._value.shape[1].value or tf.shape(self._value)[1]) 444 | 445 | @property 446 | def values(self): 447 | return self._value 448 | 449 | @property 450 | def batch_size(self): 451 | return self._batch_size 452 | 453 | @property 454 | def alignments_size(self): 455 | return self._alignments_size 456 | 457 | @property 458 | def state_size(self): 459 | return self.num_mixtures 460 | 461 | def initial_alignments(self, batch_size, dtype): 462 | max_time = self._alignments_size 463 | return _zero_state_tensors(max_time, batch_size, dtype) 464 | 465 | def initial_state(self, batch_size, dtype): 466 | state_size_ = self.state_size 467 | return _zero_state_tensors(state_size_, batch_size, dtype) 468 | 469 | def __call__(self, query, state): 470 | with tf.variable_scope("GmmAttention"): 471 | previous_kappa = state 472 | 473 | params = self.query_layer(query) # query(dec_rnn_size=256) , params(num_mixtures(256)*3) 474 | alpha_hat, beta_hat, kappa_hat = tf.split(params, num_or_size_splits=3, axis=1) 475 | 476 | # [batch_size, num_mixtures, 1] 477 | alpha = tf.expand_dims(tf.exp(alpha_hat), axis=2) 478 | # softmax makes the alpha value more stable. 479 | # alpha = tf.expand_dims(tf.nn.softmax(alpha_hat, axis=1), axis=2) 480 | beta = tf.expand_dims(tf.exp(beta_hat), axis=2) 481 | kappa = tf.expand_dims(previous_kappa + tf.exp(kappa_hat), axis=2) 482 | 483 | # [1, 1, max_input_steps] 484 | mu = tf.reshape(tf.cast(tf.range(self.alignments_size), dtype=tf.float32), shape=[1, 1, self.alignments_size]) # [[[0,1,2,...]]] 485 | 486 | # [batch_size, max_input_steps] 487 | phi = tf.reduce_sum(alpha * tf.exp(-beta * (kappa - mu) ** 2.), axis=1) 488 | 489 | alignments = self._maybe_mask_score(phi) 490 | state = tf.squeeze(kappa, axis=2) 491 | 492 | return alignments, state 493 | 494 | def monotonic_stepwise_attention(p_choose_i, previous_attention, mode): 495 | # p_choose_i, previous_alignments, previous_score: [batch_size, memory_size] 496 | # p_choose_i: probability to keep attended to the last attended entry i 497 | if mode == "parallel": 498 | pad = tf.zeros([tf.shape(p_choose_i)[0], 1], dtype=p_choose_i.dtype) 499 | attention = previous_attention * p_choose_i + tf.concat( 500 | [pad, previous_attention[:, :-1] * (1.0 - p_choose_i[:, :-1])], axis=1) 501 | elif mode == "hard": 502 | # Given that previous_alignments is one_hot 503 | move_next_mask = tf.concat([tf.zeros_like(previous_attention[:, :1]), previous_attention[:, :-1]], axis=1) 504 | stay_prob = tf.reduce_sum(p_choose_i * previous_attention, axis=1) # [B] 505 | attention = tf.where(stay_prob > 0.5, previous_attention, move_next_mask) 506 | else: 507 | raise ValueError("mode must be 'parallel', or 'hard'.") 508 | return attention 509 | 510 | 511 | def _stepwise_monotonic_probability_fn(score, previous_alignments, sigmoid_noise, mode, seed=None): 512 | if sigmoid_noise > 0: 513 | noise = random_ops.random_normal(array_ops.shape(score), dtype=score.dtype, 514 | seed=seed) 515 | score += sigmoid_noise * noise 516 | if mode == "hard": 517 | # When mode is hard, use a hard sigmoid 518 | p_choose_i = math_ops.cast(score > 0, score.dtype) 519 | else: 520 | p_choose_i = 
math_ops.sigmoid(score)
521 | alignments = monotonic_stepwise_attention(p_choose_i, previous_alignments, mode)
522 | return alignments
523 |
524 |
525 | class BahdanauStepwiseMonotonicAttention(BahdanauMonotonicAttention):
526 | def __init__(self,
527 | num_units,
528 | memory,
529 | memory_sequence_length=None,
530 | normalize=True,
531 | score_mask_value=None,
532 | sigmoid_noise=2.0,
533 | sigmoid_noise_seed=None,
534 | score_bias_init=3.5,
535 | mode="parallel",
536 | dtype=None,
537 | name="BahdanauStepwiseMonotonicAttention"):
538 | if dtype is None:
539 | dtype = tf.float32
540 | wrapped_probability_fn = functools.partial(
541 | _stepwise_monotonic_probability_fn, sigmoid_noise=sigmoid_noise, mode=mode,
542 | seed=sigmoid_noise_seed)
543 | super(BahdanauMonotonicAttention, self).__init__(
544 | query_layer=tf.layers.Dense(
545 | num_units, name="query_layer", use_bias=False, dtype=dtype),
546 | memory_layer=tf.layers.Dense(
547 | num_units, name="memory_layer", use_bias=False, dtype=dtype),
548 | memory=memory,
549 | probability_fn=wrapped_probability_fn,
550 | memory_sequence_length=memory_sequence_length,
551 | score_mask_value=score_mask_value,
552 | name=name)
553 | self._num_units = num_units
554 | self._normalize = normalize
555 | self._name = name
556 | self._score_bias_init = score_bias_init
557 |
558 | # def __call__(self, query, state):
559 | # """Score the query based on the keys and values.
560 | # Args:
561 | # query: Tensor of dtype matching `self.values` and shape
562 | # `[batch_size, query_depth]`.
563 | # state: Tensor of dtype matching `self.values` and shape
564 | # `[batch_size, alignments_size]`
565 | # (`alignments_size` is memory's `max_time`).
566 | # Returns:
567 | # alignments: Tensor of dtype matching `self.values` and shape
568 | # `[batch_size, alignments_size]` (`alignments_size` is memory's
569 | # `max_time`).
570 | # """
571 | # with tf.variable_scope(None, "bahdanau_stepwise_monotonic_attention", [query]):
572 | # processed_query = self.query_layer(query) if self.query_layer else query
573 | # score = _bahdanau_score(processed_query, self._keys, self._normalize) # keys are the memory
574 | # score_bias = tf.get_variable("attention_score_bias", dtype=processed_query.dtype, initializer=self._score_bias_init)
575 |
576 | # #alignments_bias = tf.get_variable("alignments_bias", shape = state.get_shape()[-1],dtype=processed_query.dtype, initializer=tf.zeros_initializer()) # hccho
577 | # alignments_bias = tf.get_variable("alignments_bias", shape = (1),dtype=processed_query.dtype, initializer=tf.zeros_initializer()) # hccho
578 |
579 | # score += score_bias
580 | # alignments = self._probability_fn(score, state) # in BahdanauAttention, _probability_fn is softmax
581 |
582 | # next_state = alignments # state value used for the next alignment computation = AttentionWrapperState.attention_state
583 | # # hccho: the alignments are used directly in the attention computation.
584 | # alignments = tf.nn.relu(alignments+alignments_bias)
585 | # alignments = alignments/(tf.reduce_sum(alignments,axis=-1,keepdims=True) + 1.0e-12 ) # modified by hccho
586 |
587 |
588 | # return alignments, next_state
589 |
--------------------------------------------------------------------------------
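The classes above are building blocks rather than a complete model; the graphs that consume them are defined in models/tacotron.py and models/tacotron2.py. As a rough, minimal sketch only (the layer sizes, zoneout factors, and the assumption that hparams carries the attention_* and synthesis_constraint* fields read by LocationSensitiveAttention are illustrative, not values taken from this repository), the pieces can be wired together in a TF 1.x graph like this:

import tensorflow as tf
from models.modules import prenet, encoder_cbhg, ZoneoutLSTMCell, LocationSensitiveAttention


def build_attention_decoder_cell(char_embeddings, input_lengths, hparams, is_training):
    # Encoder: prenet followed by the CBHG stack, as in Tacotron.
    prenet_out = prenet(char_embeddings, is_training, layer_sizes=[256, 128])
    encoder_outputs = encoder_cbhg(prenet_out, input_lengths, is_training, depth=256)

    # Hybrid (content-based + location-based) attention over the encoder outputs.
    attention_mechanism = LocationSensitiveAttention(
        num_units=128,
        memory=encoder_outputs,
        hparams=hparams,
        is_training=is_training,
        memory_sequence_length=input_lengths)

    # Decoder RNN: a zoneout-regularized LSTM wrapped with the attention mechanism.
    # AttentionWrapper calls LocationSensitiveAttention.__call__ once per decoder
    # step and feeds the returned next_state (the cumulated alignments) back in as
    # `state` on the following step.
    decoder_cell = tf.contrib.seq2seq.AttentionWrapper(
        ZoneoutLSTMCell(256, is_training, zoneout_factor_cell=0.1, zoneout_factor_output=0.1),
        attention_mechanism,
        alignment_history=True)
    return decoder_cell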