├── tests ├── __init__.py ├── cmudict_test.py ├── text_test.py └── numbers_test.py ├── datasets ├── __init__.py ├── ljspeech.py ├── blizzard.py ├── bible.py └── datafeeder.py ├── .gitignore ├── models ├── __init__.py ├── rnn_wrappers.py ├── helpers.py ├── tacotron.py ├── tacotron2.py └── modules.py ├── requirements.txt ├── util ├── __init__.py ├── plot.py ├── infolog.py └── audio.py ├── LICENSE ├── synthesizer.py ├── hparams.py ├── text ├── cmudict.py ├── symbols.py ├── numbers.py ├── cleaners.py ├── __init__.py ├── kor_dic.py └── korean.py ├── eval.py ├── preprocess.py ├── TRAINING_DATA.md ├── demo_server.py ├── README.md ├── LJSpeech-1.1 └── README └── train.py /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /datasets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | __pycache__/ 3 | .cache/ 4 | *.pyc 5 | .DS_Store 6 | run*.sh 7 | *.wav 8 | *.npy 9 | *.json 10 | .ipynb_checkpoints/ 11 | training/ 12 | logs-*/ 13 | 14 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- 1 | from .tacotron2 import Tacotron2 2 | 3 | 4 | def create_model(name, hparams): 5 | if name == 'tacotron': 6 | return Tacotron2(hparams) 7 | else: 8 | raise Exception('Unknown model: ' + name) 9 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # Note: this doesn't include tensorflow or tensorflow-gpu because the package you need to install 2 | # depends on your platform. It is assumed you have already installed tensorflow. 
3 | falcon==1.2.0 4 | inflect==0.2.5 5 | librosa==0.5.1 6 | matplotlib==2.0.2 7 | numpy==1.14.3 8 | scipy==0.19.0 9 | tqdm==4.11.2 10 | Unidecode==0.4.20 11 | -------------------------------------------------------------------------------- /util/__init__.py: -------------------------------------------------------------------------------- 1 | class ValueWindow(): 2 | def __init__(self, window_size=100): 3 | self._window_size = window_size 4 | self._values = [] 5 | 6 | def append(self, x): 7 | self._values = self._values[-(self._window_size - 1):] + [x] 8 | 9 | @property 10 | def sum(self): 11 | return sum(self._values) 12 | 13 | @property 14 | def count(self): 15 | return len(self._values) 16 | 17 | @property 18 | def average(self): 19 | return self.sum / max(1, self.count) 20 | 21 | def reset(self): 22 | self._values = [] 23 | -------------------------------------------------------------------------------- /util/plot.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | matplotlib.use('Agg') 3 | import matplotlib.pyplot as plt 4 | 5 | 6 | def plot_alignment(alignment, path, info=None): 7 | fig, ax = plt.subplots() 8 | im = ax.imshow( 9 | alignment, 10 | aspect='auto', 11 | origin='lower', 12 | interpolation='none') 13 | fig.colorbar(im, ax=ax) 14 | xlabel = 'Decoder timestep' 15 | if info is not None: 16 | xlabel += '\n\n' + info 17 | plt.xlabel(xlabel) 18 | plt.ylabel('Encoder timestep') 19 | plt.tight_layout() 20 | plt.savefig(path, format='png') 21 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017 Keith Ito 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 
20 | -------------------------------------------------------------------------------- /tests/cmudict_test.py: -------------------------------------------------------------------------------- 1 | import io 2 | from text import cmudict 3 | 4 | 5 | test_data = ''' 6 | ;;; # CMUdict -- Major Version: 0.07 7 | )PAREN P ER EH N 8 | 'TIS T IH Z 9 | ADVERSE AE0 D V ER1 S 10 | ADVERSE(1) AE1 D V ER2 S 11 | ADVERSE(2) AE2 D V ER1 S 12 | ADVERSELY AE0 D V ER1 S L IY0 13 | ADVERSITY AE0 D V ER1 S IH0 T IY2 14 | BARBERSHOP B AA1 R B ER0 SH AA2 P 15 | YOU'LL Y UW1 L 16 | ''' 17 | 18 | 19 | def test_cmudict(): 20 | c = cmudict.CMUDict(io.StringIO(test_data)) 21 | assert len(c) == 6 22 | assert len(cmudict.valid_symbols) == 84 23 | assert c.lookup('ADVERSITY') == ['AE0 D V ER1 S IH0 T IY2'] 24 | assert c.lookup('BarberShop') == ['B AA1 R B ER0 SH AA2 P'] 25 | assert c.lookup("You'll") == ['Y UW1 L'] 26 | assert c.lookup("'tis") == ['T IH Z'] 27 | assert c.lookup('adverse') == [ 28 | 'AE0 D V ER1 S', 29 | 'AE1 D V ER2 S', 30 | 'AE2 D V ER1 S', 31 | ] 32 | assert c.lookup('') == None 33 | assert c.lookup('foo') == None 34 | assert c.lookup(')paren') == None 35 | 36 | 37 | def test_cmudict_no_keep_ambiguous(): 38 | c = cmudict.CMUDict(io.StringIO(test_data), keep_ambiguous=False) 39 | assert len(c) == 5 40 | assert c.lookup('adversity') == ['AE0 D V ER1 S IH0 T IY2'] 41 | assert c.lookup('adverse') == None 42 | -------------------------------------------------------------------------------- /util/infolog.py: -------------------------------------------------------------------------------- 1 | import atexit 2 | from datetime import datetime 3 | import json 4 | from threading import Thread 5 | from urllib.request import Request, urlopen 6 | 7 | 8 | _format = '%Y-%m-%d %H:%M:%S.%f' 9 | _file = None 10 | _run_name = None 11 | _slack_url = None 12 | 13 | 14 | def init(filename, run_name, slack_url=None): 15 | global _file, _run_name, _slack_url 16 | _close_logfile() 17 | _file = open(filename, 'a', encoding="utf-8") 18 | _file.write('\n-----------------------------------------------------------------\n') 19 | _file.write('Starting new training run\n') 20 | _file.write('-----------------------------------------------------------------\n') 21 | _run_name = run_name 22 | _slack_url = slack_url 23 | 24 | 25 | def log(msg, slack=False): 26 | print(msg) 27 | if _file is not None: 28 | _file.write('[%s] %s\n' % (datetime.now().strftime(_format)[:-3], msg)) 29 | if slack and _slack_url is not None: 30 | Thread(target=_send_slack, args=(msg,)).start() 31 | 32 | 33 | def _close_logfile(): 34 | global _file 35 | if _file is not None: 36 | _file.close() 37 | _file = None 38 | 39 | 40 | def _send_slack(msg): 41 | req = Request(_slack_url) 42 | req.add_header('Content-Type', 'application/json') 43 | urlopen(req, json.dumps({ 44 | 'username': 'tacotron', 45 | 'icon_emoji': ':taco:', 46 | 'text': '*%s*: %s' % (_run_name, msg) 47 | }).encode()) 48 | 49 | 50 | atexit.register(_close_logfile) 51 | -------------------------------------------------------------------------------- /synthesizer.py: -------------------------------------------------------------------------------- 1 | import io 2 | import numpy as np 3 | import tensorflow as tf 4 | from hparams import hparams 5 | from librosa import effects 6 | from models import create_model 7 | from text import text_to_sequence 8 | from util import audio 9 | # from g2pk import G2p 10 | 11 | 12 | class Synthesizer: 13 | def load(self, checkpoint_path, model_name='tacotron'): 14 | 
print('Constructing model: %s' % model_name) 15 | inputs = tf.placeholder(tf.int32, [1, None], 'inputs') 16 | input_lengths = tf.placeholder(tf.int32, [1], 'input_lengths') 17 | with tf.variable_scope('model') as scope: 18 | self.model = create_model(model_name, hparams) 19 | self.model.initialize(inputs, input_lengths) 20 | self.wav_output = audio.inv_spectrogram_tensorflow(self.model.linear_outputs[0]) 21 | 22 | print('Loading checkpoint: %s' % checkpoint_path) 23 | self.session = tf.Session() 24 | self.session.run(tf.global_variables_initializer()) 25 | saver = tf.train.Saver() 26 | saver.restore(self.session, checkpoint_path) 27 | 28 | def synthesize(self, text): 29 | cleaner_names = [x.strip() for x in hparams.cleaners.split(',')] 30 | # g2p = G2p() 31 | seq = text_to_sequence(text, cleaner_names) 32 | feed_dict = { 33 | self.model.inputs: [np.asarray(seq, dtype=np.int32)], 34 | self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32) 35 | } 36 | wav = self.session.run(self.wav_output, feed_dict=feed_dict) 37 | wav = audio.inv_preemphasis(wav) 38 | wav = wav[:audio.find_endpoint(wav)] 39 | out = io.BytesIO() 40 | audio.save_wav(wav, out) 41 | return out.getvalue() 42 | -------------------------------------------------------------------------------- /hparams.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | # Default hyperparameters: 4 | hparams = tf.contrib.training.HParams( 5 | # Comma-separated list of cleaners to run on text prior to training and eval. For non-English 6 | # text, you may want to use "basic_cleaners" or "transliteration_cleaners" See TRAINING_DATA.md. 7 | cleaners='korean_cleaners', 8 | # Audio: 9 | num_mels=80, 10 | num_freq=1025, 11 | sample_rate=21000, 12 | frame_length_ms=50, 13 | frame_shift_ms=12.5, 14 | preemphasis=0.97, 15 | min_level_db=-100, 16 | ref_level_db=20, 17 | 18 | # Encoder: 19 | embed_depth=512, 20 | encoder_conv_filter=512, 21 | encoder_conv_kernel=5, 22 | encoder_stack_size=3, 23 | encoder_lstm_hidden_dim=256, 24 | # Model: 25 | outputs_per_step=5, 26 | prenet_depths=[256, 256], 27 | encoder_depth=256, 28 | postnet_depth=256, 29 | attention_depth=256, 30 | attention_filters = 32, 31 | attention_kernel = (31, ), 32 | attention_dim = 128, 33 | decoder_depth=256, 34 | synthesis_constraint = False, 35 | synthesis_constraint_type = 'window', 36 | attention_win_size = 7, 37 | attention_type = 'loc_sen', 38 | cumulative_weights = True, 39 | reg_weight = 1e-6, 40 | 41 | # Training: 42 | batch_size=32, 43 | adam_beta1=0.9, 44 | adam_beta2=0.999, 45 | initial_learning_rate=0.002, 46 | decay_learning_rate=True, 47 | use_cmudict=False, # Use CMUDict during training to learn pronunciation of ARPAbet phonemes 48 | 49 | # Eval: 50 | max_iters=500, 51 | griffin_lim_iters=60, 52 | power=1.5, # Power to raise magnitudes to prior to Griffin-Lim 53 | ) 54 | 55 | 56 | def hparams_debug_string(): 57 | values = hparams.values() 58 | hp = [' %s: %s' % (name, values[name]) for name in sorted(values)] 59 | return 'Hyperparameters:\n' + '\n'.join(hp) 60 | -------------------------------------------------------------------------------- /models/rnn_wrappers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from tensorflow.contrib.rnn import RNNCell 4 | from .modules import prenet 5 | 6 | 7 | class DecoderPrenetWrapper(RNNCell): 8 | '''Runs RNN inputs through a prenet before sending them to the cell.''' 9 | 
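  # Illustrative usage sketch (not part of the original file; the cell size and the
  # is_training flag are assumptions based on hparams.py and the modules.prenet import above):
  #
  #   from tensorflow.contrib.rnn import GRUCell
  #   cell = DecoderPrenetWrapper(GRUCell(256), is_training, hparams.prenet_depths)
  #
  # Every input fed to the wrapped GRU cell then passes through the prenet
  # (fully-connected layers of sizes hparams.prenet_depths) before reaching the cell.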
10 | def __init__(self, cell, is_training, layer_sizes): 11 | super(DecoderPrenetWrapper, self).__init__() 12 | self._cell = cell 13 | self._is_training = is_training 14 | self._layer_sizes = layer_sizes 15 | 16 | @property 17 | def state_size(self): 18 | return self._cell.state_size 19 | 20 | @property 21 | def output_size(self): 22 | return self._cell.output_size 23 | 24 | def call(self, inputs, state): 25 | prenet_out = prenet(inputs, self._is_training, self._layer_sizes, scope='decoder_prenet') 26 | return self._cell(prenet_out, state) 27 | 28 | def zero_state(self, batch_size, dtype): 29 | return self._cell.zero_state(batch_size, dtype) 30 | 31 | 32 | class ConcatOutputAndAttentionWrapper(RNNCell): 33 | '''Concatenates RNN cell output with the attention context vector. 34 | This is expected to wrap a cell wrapped with an AttentionWrapper constructed with 35 | attention_layer_size=None and output_attention=False. Such a cell's state will include an 36 | "attention" field that is the context vector. 37 | ''' 38 | 39 | def __init__(self, cell): 40 | super(ConcatOutputAndAttentionWrapper, self).__init__() 41 | self._cell = cell 42 | 43 | @property 44 | def state_size(self): 45 | return self._cell.state_size 46 | 47 | @property 48 | def output_size(self): 49 | return self._cell.output_size + self._cell.state_size.attention 50 | 51 | def call(self, inputs, state): 52 | output, res_state = self._cell(inputs, state) 53 | return tf.concat([output, res_state.attention], axis=-1), res_state 54 | 55 | def zero_state(self, batch_size, dtype): 56 | return self._cell.zero_state(batch_size, dtype) -------------------------------------------------------------------------------- /text/cmudict.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | valid_symbols = [ 5 | 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2', 6 | 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2', 7 | 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY', 8 | 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1', 9 | 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 10 | 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 11 | 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH' 12 | ] 13 | 14 | _valid_symbol_set = set(valid_symbols) 15 | 16 | 17 | class CMUDict: 18 | '''Thin wrapper around CMUDict data. 
http://www.speech.cs.cmu.edu/cgi-bin/cmudict''' 19 | def __init__(self, file_or_path, keep_ambiguous=True): 20 | if isinstance(file_or_path, str): 21 | with open(file_or_path, encoding='latin-1') as f: 22 | entries = _parse_cmudict(f) 23 | else: 24 | entries = _parse_cmudict(file_or_path) 25 | if not keep_ambiguous: 26 | entries = {word: pron for word, pron in entries.items() if len(pron) == 1} 27 | self._entries = entries 28 | 29 | 30 | def __len__(self): 31 | return len(self._entries) 32 | 33 | 34 | def lookup(self, word): 35 | '''Returns list of ARPAbet pronunciations of the given word.''' 36 | return self._entries.get(word.upper()) 37 | 38 | 39 | 40 | _alt_re = re.compile(r'\([0-9]+\)') 41 | 42 | 43 | def _parse_cmudict(file): 44 | cmudict = {} 45 | for line in file: 46 | if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"): 47 | parts = line.split(' ') 48 | word = re.sub(_alt_re, '', parts[0]) 49 | pronunciation = _get_pronunciation(parts[1]) 50 | if pronunciation: 51 | if word in cmudict: 52 | cmudict[word].append(pronunciation) 53 | else: 54 | cmudict[word] = [pronunciation] 55 | return cmudict 56 | 57 | 58 | def _get_pronunciation(s): 59 | parts = s.strip().split(' ') 60 | for part in parts: 61 | if part not in _valid_symbol_set: 62 | return None 63 | return ' '.join(parts) 64 | -------------------------------------------------------------------------------- /text/symbols.py: -------------------------------------------------------------------------------- 1 | # ''' 2 | # Defines the set of symbols used in text input to the model. 3 | # 4 | # The default is a set of ASCII characters that works well for English or text that has been run 5 | # through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. 6 | # ''' 7 | # from text import cmudict 8 | # 9 | # _pad = '_' 10 | # _eos = '~' 11 | # _characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? ' 12 | # 13 | # # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): 14 | # _arpabet = ['@' + s for s in cmudict.valid_symbols] 15 | # 16 | # # Export all symbols: 17 | # symbols = [_pad, _eos] + list(_characters) + _arpabet 18 | 19 | # coding: utf-8 20 | ''' 21 | Defines the set of symbols used in text input to the model. 22 | 23 | The default is a set of ASCII characters that works well for English or text that has been run 24 | through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. 25 | ''' 26 | from jamo import h2j, j2h 27 | from jamo.jamo import _jamo_char_to_hcj 28 | 29 | from .korean import ALL_SYMBOLS, PAD, EOS 30 | 31 | # For english 32 | en_symbols = PAD + EOS + 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? ' # <-For deployment(Because korean ALL_SYMBOLS follow this convention) 33 | 34 | symbols = ALL_SYMBOLS # for korean 35 | 36 | """ 37 | 초성과 종성은 같아보이지만, 다른 character이다. 38 | '_~ᄀᄁᄂᄃᄄᄅᄆᄇᄈᄉᄊᄋᄌᄍᄎᄏᄐᄑ하ᅢᅣᅤᅥᅦᅧᅨᅩᅪᅫᅬᅭᅮᅯᅰᅱᅲᅳᅴᅵᆨᆩᆪᆫᆬᆭᆮᆯᆰᆱᆲᆳᆴᆵᆶᆷᆸᆹᆺᆻᆼᆽᆾᆿᇀᇁᇂ!'(),-.:;? 
' 39 | '_': 0, '~': 1, 'ᄀ': 2, 'ᄁ': 3, 'ᄂ': 4, 'ᄃ': 5, 'ᄄ': 6, 'ᄅ': 7, 'ᄆ': 8, 'ᄇ': 9, 'ᄈ': 10, 40 | 'ᄉ': 11, 'ᄊ': 12, 'ᄋ': 13, 'ᄌ': 14, 'ᄍ': 15, 'ᄎ': 16, 'ᄏ': 17, 'ᄐ': 18, 'ᄑ': 19, 'ᄒ': 20, 41 | 'ᅡ': 21, 'ᅢ': 22, 'ᅣ': 23, 'ᅤ': 24, 'ᅥ': 25, 'ᅦ': 26, 'ᅧ': 27, 'ᅨ': 28, 'ᅩ': 29, 'ᅪ': 30, 42 | 'ᅫ': 31, 'ᅬ': 32, 'ᅭ': 33, 'ᅮ': 34, 'ᅯ': 35, 'ᅰ': 36, 'ᅱ': 37, 'ᅲ': 38, 'ᅳ': 39, 'ᅴ': 40, 43 | 'ᅵ': 41, 'ᆨ': 42, 'ᆩ': 43, 'ᆪ': 44, 'ᆫ': 45, 'ᆬ': 46, 'ᆭ': 47, 'ᆮ': 48, 'ᆯ': 49, 'ᆰ': 50, 44 | 'ᆱ': 51, 'ᆲ': 52, 'ᆳ': 53, 'ᆴ': 54, 'ᆵ': 55, 'ᆶ': 56, 'ᆷ': 57, 'ᆸ': 58, 'ᆹ': 59, 'ᆺ': 60, 45 | 'ᆻ': 61, 'ᆼ': 62, 'ᆽ': 63, 'ᆾ': 64, 'ᆿ': 65, 'ᇀ': 66, 'ᇁ': 67, 'ᇂ': 68, '!': 69, "'": 70, 46 | '(': 71, ')': 72, ',': 73, '-': 74, '.': 75, ':': 76, ';': 77, '?': 78, ' ': 79 47 | """ -------------------------------------------------------------------------------- /eval.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import re 4 | from hparams import hparams, hparams_debug_string 5 | from synthesizer import Synthesizer 6 | 7 | 8 | # sentences = [ 9 | # # '완전히 쾅 닫힌 대화창 뿐이네', 10 | # # '정성스럽게 적었던 거야', 11 | # # '나는 큰 결심을 하고서 보낸 문잔데', 12 | # # '모든걸 마무리 해버렸어', 13 | # # '이모티콘 하나마저 조심스럽게 보냈어', 14 | # # '너가 잘해야지', 15 | # # '새해 복만으로는 안돼', 16 | # # 장기하와 얼굴들 ㅋ 가사: 17 | # '신진 샹숑가수의 신춘 샹숑쇼우', 18 | # '철수 책상 철 책상', 19 | # '창경원 창살은 쌍창살', 20 | # '스위스에서 온 스미스씨', 21 | # # 장기하와 얼굴들 새해복 가사: 22 | # '간장 공장 공장장', 23 | # '한양양장점 옆 한양양장점', 24 | # '후회한 시간을 후회할 거잖아', 25 | # ] 26 | 27 | 28 | def get_output_base_path(checkpoint_path): 29 | base_dir = os.path.dirname(checkpoint_path) 30 | m = re.compile(r'.*?\.ckpt\-([0-9]+)').match(checkpoint_path) 31 | name = 'eval-char-%d' % int(m.group(1)) if m else 'eval' 32 | return os.path.join(base_dir, name) 33 | 34 | 35 | def run_eval(args): 36 | print(hparams_debug_string()) 37 | synth = Synthesizer() 38 | synth.load(args.checkpoint) 39 | base_path = get_output_base_path(args.checkpoint) 40 | sentences=[] 41 | with open('./eval_char.txt', encoding='utf-8') as f: 42 | for line in f: 43 | try: 44 | parts = line.strip().replace('"', '').split('|') 45 | text = parts[3] 46 | sentences.append(text) 47 | except: 48 | pass 49 | for i, text in enumerate(sentences): 50 | path = '%s-%d.wav' % (base_path, i) 51 | print('Synthesizing: %s' % path) 52 | with open(path, 'wb') as f: 53 | f.write(synth.synthesize(text)) 54 | 55 | 56 | def main(): 57 | parser = argparse.ArgumentParser() 58 | parser.add_argument('--checkpoint', required=True, help='Path to model checkpoint') 59 | parser.add_argument('--hparams', default='', 60 | help='Hyperparameter overrides as a comma-separated list of name=value pairs') 61 | parser.add_argument('--gpu', default='1') 62 | args = parser.parse_args() 63 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 64 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 65 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu 66 | hparams.parse(args.hparams) 67 | run_eval(args) 68 | 69 | 70 | if __name__ == '__main__': 71 | main() 72 | -------------------------------------------------------------------------------- /tests/text_test.py: -------------------------------------------------------------------------------- 1 | from text import cleaners, symbols, text_to_sequence, sequence_to_text 2 | from unidecode import unidecode 3 | 4 | 5 | def test_symbols(): 6 | assert len(symbols) >= 3 7 | assert symbols[0] == '_' 8 | assert symbols[1] == '~' 9 | 10 | 11 | def test_text_to_sequence(): 12 | assert text_to_sequence('', []) == [1] 13 | assert 
text_to_sequence('Hi!', []) == [9, 36, 54, 1] 14 | assert text_to_sequence('"A"_B', []) == [2, 3, 1] 15 | assert text_to_sequence('A {AW1 S} B', []) == [2, 64, 83, 132, 64, 3, 1] 16 | assert text_to_sequence('Hi', ['lowercase']) == [35, 36, 1] 17 | assert text_to_sequence('A {AW1 S} B', ['english_cleaners']) == [28, 64, 83, 132, 64, 29, 1] 18 | 19 | 20 | def test_sequence_to_text(): 21 | assert sequence_to_text([]) == '' 22 | assert sequence_to_text([1]) == '~' 23 | assert sequence_to_text([9, 36, 54, 1]) == 'Hi!~' 24 | assert sequence_to_text([2, 64, 83, 132, 64, 3]) == 'A {AW1 S} B' 25 | 26 | 27 | def test_collapse_whitespace(): 28 | assert cleaners.collapse_whitespace('') == '' 29 | assert cleaners.collapse_whitespace(' ') == ' ' 30 | assert cleaners.collapse_whitespace('x') == 'x' 31 | assert cleaners.collapse_whitespace(' x. y, \tz') == ' x. y, z' 32 | 33 | 34 | def test_convert_to_ascii(): 35 | assert cleaners.convert_to_ascii("raison d'être") == "raison d'etre" 36 | assert cleaners.convert_to_ascii('grüß gott') == 'gruss gott' 37 | assert cleaners.convert_to_ascii('안녕') == 'annyeong' 38 | assert cleaners.convert_to_ascii('Здравствуйте') == 'Zdravstvuite' 39 | 40 | 41 | def test_lowercase(): 42 | assert cleaners.lowercase('Happy Birthday!') == 'happy birthday!' 43 | assert cleaners.lowercase('CAFÉ') == 'café' 44 | 45 | 46 | def test_expand_abbreviations(): 47 | assert cleaners.expand_abbreviations('mr. and mrs. smith') == 'mister and misess smith' 48 | 49 | 50 | def test_expand_numbers(): 51 | assert cleaners.expand_numbers('3 apples and 44 pears') == 'three apples and forty-four pears' 52 | assert cleaners.expand_numbers('$3.50 for gas.') == 'three dollars, fifty cents for gas.' 53 | 54 | 55 | def test_cleaner_pipelines(): 56 | text = 'Mr. Müller ate 2 Apples' 57 | assert cleaners.english_cleaners(text) == 'mister muller ate two apples' 58 | assert cleaners.transliteration_cleaners(text) == 'mr. muller ate 2 apples' 59 | assert cleaners.basic_cleaners(text) == 'mr. 
müller ate 2 apples' 60 | 61 | -------------------------------------------------------------------------------- /text/numbers.py: -------------------------------------------------------------------------------- 1 | import inflect 2 | import re 3 | 4 | 5 | _inflect = inflect.engine() 6 | _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') 7 | _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') 8 | _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') 9 | _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') 10 | _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') 11 | _number_re = re.compile(r'[0-9]+') 12 | 13 | 14 | def _remove_commas(m): 15 | return m.group(1).replace(',', '') 16 | 17 | 18 | def _expand_decimal_point(m): 19 | return m.group(1).replace('.', ' point ') 20 | 21 | 22 | def _expand_dollars(m): 23 | match = m.group(1) 24 | parts = match.split('.') 25 | if len(parts) > 2: 26 | return match + ' dollars' # Unexpected format 27 | dollars = int(parts[0]) if parts[0] else 0 28 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 29 | if dollars and cents: 30 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 31 | cent_unit = 'cent' if cents == 1 else 'cents' 32 | return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) 33 | elif dollars: 34 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 35 | return '%s %s' % (dollars, dollar_unit) 36 | elif cents: 37 | cent_unit = 'cent' if cents == 1 else 'cents' 38 | return '%s %s' % (cents, cent_unit) 39 | else: 40 | return 'zero dollars' 41 | 42 | 43 | def _expand_ordinal(m): 44 | return _inflect.number_to_words(m.group(0)) 45 | 46 | 47 | def _expand_number(m): 48 | num = int(m.group(0)) 49 | if num > 1000 and num < 3000: 50 | if num == 2000: 51 | return 'two thousand' 52 | elif num > 2000 and num < 2010: 53 | return 'two thousand ' + _inflect.number_to_words(num % 100) 54 | elif num % 100 == 0: 55 | return _inflect.number_to_words(num // 100) + ' hundred' 56 | else: 57 | return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') 58 | else: 59 | return _inflect.number_to_words(num, andword='') 60 | 61 | 62 | def normalize_numbers(text): 63 | text = re.sub(_comma_number_re, _remove_commas, text) 64 | text = re.sub(_pounds_re, r'\1 pounds', text) 65 | text = re.sub(_dollars_re, _expand_dollars, text) 66 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 67 | text = re.sub(_ordinal_re, _expand_ordinal, text) 68 | text = re.sub(_number_re, _expand_number, text) 69 | return text 70 | -------------------------------------------------------------------------------- /text/cleaners.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | # Code based on https://github.com/keithito/tacotron/blob/master/text/cleaners.py 4 | 5 | import re 6 | from .korean import tokenize as ko_tokenize 7 | 8 | # # Added to support LJ_speech 9 | # from unidecode import unidecode 10 | # from .en_numbers import normalize_numbers as en_normalize_numbers 11 | 12 | # Regular expression matching whitespace: 13 | _whitespace_re = re.compile(r'\s+') 14 | 15 | 16 | def korean_cleaners(text): 17 | '''Pipeline for Korean text, including number and abbreviation expansion.''' 18 | text = ko_tokenize(text) # '존경하는' --> ['ᄌ', 'ᅩ', 'ᆫ', 'ᄀ', 'ᅧ', 'ᆼ', 'ᄒ', 'ᅡ', 'ᄂ', 'ᅳ', 'ᆫ', '~'] 19 | return text 20 | 21 | 22 | # # List of (regular expression, replacement) pairs for abbreviations: 23 | # _abbreviations = [(re.compile('\\b%s\\.' 
% x[0], re.IGNORECASE), x[1]) for x in [ 24 | # ('mrs', 'misess'), 25 | # ('mr', 'mister'), 26 | # ('dr', 'doctor'), 27 | # ('st', 'saint'), 28 | # ('co', 'company'), 29 | # ('jr', 'junior'), 30 | # ('maj', 'major'), 31 | # ('gen', 'general'), 32 | # ('drs', 'doctors'), 33 | # ('rev', 'reverend'), 34 | # ('lt', 'lieutenant'), 35 | # ('hon', 'honorable'), 36 | # ('sgt', 'sergeant'), 37 | # ('capt', 'captain'), 38 | # ('esq', 'esquire'), 39 | # ('ltd', 'limited'), 40 | # ('col', 'colonel'), 41 | # ('ft', 'fort'), 42 | # ]] 43 | 44 | 45 | # def expand_abbreviations(text): 46 | # for regex, replacement in _abbreviations: 47 | # text = re.sub(regex, replacement, text) 48 | # return text 49 | # 50 | # 51 | # def expand_numbers(text): 52 | # return en_normalize_numbers(text) 53 | 54 | 55 | def lowercase(text): 56 | return text.lower() 57 | 58 | 59 | def collapse_whitespace(text): 60 | return re.sub(_whitespace_re, ' ', text) 61 | 62 | 63 | # def convert_to_ascii(text): 64 | # return unidecode(text) 65 | 66 | 67 | def basic_cleaners(text): 68 | text = lowercase(text) 69 | text = collapse_whitespace(text) 70 | return text 71 | 72 | 73 | # def transliteration_cleaners(text): 74 | # # text = convert_to_ascii(text) 75 | # text = lowercase(text) 76 | # text = collapse_whitespace(text) 77 | # return text 78 | # 79 | # 80 | # def english_cleaners(text): 81 | # text = convert_to_ascii(text) 82 | # text = lowercase(text) 83 | # text = expand_numbers(text) 84 | # text = expand_abbreviations(text) 85 | # text = collapse_whitespace(text) 86 | # return text 87 | -------------------------------------------------------------------------------- /tests/numbers_test.py: -------------------------------------------------------------------------------- 1 | from text.numbers import normalize_numbers 2 | 3 | 4 | def test_normalize_numbers(): 5 | assert normalize_numbers('1') == 'one' 6 | assert normalize_numbers('15') == 'fifteen' 7 | assert normalize_numbers('24') == 'twenty-four' 8 | assert normalize_numbers('100') == 'one hundred' 9 | assert normalize_numbers('101') == 'one hundred one' 10 | assert normalize_numbers('456') == 'four hundred fifty-six' 11 | assert normalize_numbers('1000') == 'one thousand' 12 | assert normalize_numbers('1800') == 'eighteen hundred' 13 | assert normalize_numbers('2,000') == 'two thousand' 14 | assert normalize_numbers('3000') == 'three thousand' 15 | assert normalize_numbers('18000') == 'eighteen thousand' 16 | assert normalize_numbers('24,000') == 'twenty-four thousand' 17 | assert normalize_numbers('124,001') == 'one hundred twenty-four thousand one' 18 | assert normalize_numbers('6.4 sec') == 'six point four sec' 19 | 20 | 21 | def test_normalize_ordinals(): 22 | assert normalize_numbers('1st') == 'first' 23 | assert normalize_numbers('2nd') == 'second' 24 | assert normalize_numbers('9th') == 'ninth' 25 | assert normalize_numbers('243rd place') == 'two hundred and forty-third place' 26 | 27 | 28 | def test_normalize_dates(): 29 | assert normalize_numbers('1400') == 'fourteen hundred' 30 | assert normalize_numbers('1901') == 'nineteen oh one' 31 | assert normalize_numbers('1999') == 'nineteen ninety-nine' 32 | assert normalize_numbers('2000') == 'two thousand' 33 | assert normalize_numbers('2004') == 'two thousand four' 34 | assert normalize_numbers('2010') == 'twenty ten' 35 | assert normalize_numbers('2012') == 'twenty twelve' 36 | assert normalize_numbers('2025') == 'twenty twenty-five' 37 | assert normalize_numbers('September 11, 2001') == 'September eleven, two thousand one' 38 
| assert normalize_numbers('July 26, 1984.') == 'July twenty-six, nineteen eighty-four.' 39 | 40 | 41 | def test_normalize_money(): 42 | assert normalize_numbers('$0.00') == 'zero dollars' 43 | assert normalize_numbers('$1') == 'one dollar' 44 | assert normalize_numbers('$10') == 'ten dollars' 45 | assert normalize_numbers('$.01') == 'one cent' 46 | assert normalize_numbers('$0.25') == 'twenty-five cents' 47 | assert normalize_numbers('$5.00') == 'five dollars' 48 | assert normalize_numbers('$5.01') == 'five dollars, one cent' 49 | assert normalize_numbers('$135.99.') == 'one hundred thirty-five dollars, ninety-nine cents.' 50 | assert normalize_numbers('$40,000') == 'forty thousand dollars' 51 | assert normalize_numbers('for £2500!') == 'for twenty-five hundred pounds!' 52 | -------------------------------------------------------------------------------- /preprocess.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from multiprocessing import cpu_count 4 | from tqdm import tqdm 5 | from datasets import blizzard, ljspeech, bible 6 | from hparams import hparams 7 | 8 | 9 | def preprocess_blizzard(args): 10 | in_dir = os.path.join(args.base_dir, 'Blizzard2012') 11 | out_dir = os.path.join(args.base_dir, args.output) 12 | os.makedirs(out_dir, exist_ok=True) 13 | metadata = blizzard.build_from_path(in_dir, out_dir, args.num_workers, tqdm=tqdm) 14 | write_metadata(metadata, out_dir) 15 | 16 | 17 | def preprocess_ljspeech(args): 18 | in_dir = os.path.join(args.base_dir, 'LJSpeech-1.1') 19 | out_dir = os.path.join(args.base_dir, args.output) 20 | os.makedirs(out_dir, exist_ok=True) 21 | metadata = ljspeech.build_from_path(in_dir, out_dir, args.num_workers, tqdm=tqdm) 22 | write_metadata(metadata, out_dir) 23 | 24 | 25 | def preprocess_bible(args): 26 | in_dir = os.path.join(args.base_dir, 'bible') 27 | out_dir = os.path.join(args.base_dir, args.output) 28 | os.makedirs(out_dir, exist_ok=True) 29 | metadata = bible.build_from_path(in_dir, out_dir, args.num_workers, tqdm=tqdm) 30 | write_metadata(metadata, out_dir) 31 | 32 | 33 | def preprocess_kss(args): 34 | in_dir = os.path.join(args.base_dir, 'kss') 35 | out_dir = os.path.join(args.base_dir, args.output) 36 | os.makedirs(out_dir, exist_ok=True) 37 | metadata = bible.build_from_path(in_dir, out_dir, args.num_workers, tqdm=tqdm) 38 | write_metadata(metadata, out_dir) 39 | 40 | 41 | def write_metadata(metadata, out_dir): 42 | with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f: 43 | for m in metadata: 44 | f.write('|'.join([str(x) for x in m]) + '\n') 45 | frames = sum([m[2] for m in metadata]) 46 | hours = frames * hparams.frame_shift_ms / (3600 * 1000) 47 | print('Wrote %d utterances, %d frames (%.2f hours)' % (len(metadata), frames, hours)) 48 | print('Max input length: %d' % max(len(m[3]) for m in metadata)) 49 | print('Max output length: %d' % max(m[2] for m in metadata)) 50 | 51 | 52 | def main(): 53 | parser = argparse.ArgumentParser() 54 | parser.add_argument('--base_dir', default=os.path.expanduser('~/tacotron/Tacotron2/')) 55 | parser.add_argument('--output', default='training') 56 | parser.add_argument('--dataset', required=True, choices=['blizzard', 'ljspeech', 'bible', 'kss']) 57 | parser.add_argument('--num_workers', type=int, default=cpu_count()) 58 | args = parser.parse_args() 59 | if args.dataset == 'blizzard': 60 | preprocess_blizzard(args) 61 | elif args.dataset == 'ljspeech': 62 | preprocess_ljspeech(args) 63 | elif args.dataset == 
'bible': 64 | preprocess_bible(args) 65 | elif args.dataset == 'kss': 66 | preprocess_kss(args) 67 | 68 | 69 | if __name__ == "__main__": 70 | main() 71 | -------------------------------------------------------------------------------- /TRAINING_DATA.md: -------------------------------------------------------------------------------- 1 | # Training Data 2 | 3 | 4 | This repo supports the following speech datasets: 5 | * [LJ Speech](https://keithito.com/LJ-Speech-Dataset/) (Public Domain) 6 | * [Blizzard 2012](http://www.cstr.ed.ac.uk/projects/blizzard/2012/phase_one) (Creative Commons Attribution Share-Alike) 7 | 8 | You can use any other dataset if you write a preprocessor for it. 9 | 10 | 11 | ### Writing a Preprocessor 12 | 13 | Each training example consists of: 14 | 1. The text that was spoken 15 | 2. A mel-scale spectrogram of the audio 16 | 3. A linear-scale spectrogram of the audio 17 | 18 | The preprocessor is responsible for generating these. See [ljspeech.py](datasets/ljspeech.py) for a 19 | commented example. 20 | 21 | For each training example, a preprocessor should: 22 | 23 | 1. Load the audio file: 24 | ```python 25 | wav = audio.load_wav(wav_path) 26 | ``` 27 | 28 | 2. Compute linear-scale and mel-scale spectrograms (float32 numpy arrays): 29 | ```python 30 | spectrogram = audio.spectrogram(wav).astype(np.float32) 31 | mel_spectrogram = audio.melspectrogram(wav).astype(np.float32) 32 | ``` 33 | 34 | 3. Save the spectrograms to disk: 35 | ```python 36 | np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False) 37 | np.save(os.path.join(out_dir, mel_spectrogram_filename), mel_spectrogram.T, allow_pickle=False) 38 | ``` 39 | Note that the transpose of the matrix returned by `audio.spectrogram` is saved so that it's 40 | in time-major format. 41 | 42 | 4. Generate a tuple `(spectrogram_filename, mel_spectrogram_filename, n_frames, text)` to 43 | write to train.txt. n_frames is just the length of the time axis of the spectrogram. 44 | 45 | 46 | After you've written your preprocessor, you can add it to [preprocess.py](preprocess.py) by 47 | following the example of the other preprocessors in that file. 48 | 49 | 50 | ### Non-English Data 51 | 52 | If your training data is in a language other than English, you will probably want to change the 53 | text cleaners by setting the `cleaners` hyperparameter. 54 | 55 | * If your text is in a Latin script or can be transliterated to ASCII using the 56 | [Unidecode](https://pypi.python.org/pypi/Unidecode) library, you can use the transliteration 57 | cleaners by setting the hyperparameter `cleaners=transliteration_cleaners`. 58 | 59 | * If you don't want to transliterate, you can define a custom character set. 60 | This allows you to train directly on the character set used in your data. 61 | 62 | To do so, edit [symbols.py](text/symbols.py) and change the `_characters` variable to be a 63 | string containing the UTF-8 characters in your data. Then set the hyperparameter `cleaners=basic_cleaners`. 
64 | 65 | * If you're not sure which option to use, you can evaluate the transliteration cleaners like this: 66 | 67 | ```python 68 | from text import cleaners 69 | cleaners.transliteration_cleaners('Здравствуйте') # Replace with the text you want to try 70 | ``` 71 | -------------------------------------------------------------------------------- /datasets/ljspeech.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ProcessPoolExecutor 2 | from functools import partial 3 | import numpy as np 4 | import os 5 | from util import audio 6 | 7 | 8 | def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x): 9 | '''Preprocesses the LJ Speech dataset from a given input path into a given output directory. 10 | 11 | Args: 12 | in_dir: The directory where you have downloaded the LJ Speech dataset 13 | out_dir: The directory to write the output into 14 | num_workers: Optional number of worker processes to parallelize across 15 | tqdm: You can optionally pass tqdm to get a nice progress bar 16 | 17 | Returns: 18 | A list of tuples describing the training examples. This should be written to train.txt 19 | ''' 20 | 21 | # We use ProcessPoolExecutor to parallelize across processes. This is just an optimization and you 22 | # can omit it and just call _process_utterance on each input if you want. 23 | executor = ProcessPoolExecutor(max_workers=num_workers) 24 | futures = [] 25 | index = 1 26 | with open(os.path.join(in_dir, 'metadata.csv'), encoding='utf-8') as f: 27 | for line in f: 28 | parts = line.strip().split('|') 29 | wav_path = os.path.join(in_dir, 'wavs', '%s.wav' % parts[0]) 30 | text = parts[2] 31 | futures.append(executor.submit(partial(_process_utterance, out_dir, index, wav_path, text))) 32 | index += 1 33 | return [future.result() for future in tqdm(futures)] 34 | 35 | 36 | def _process_utterance(out_dir, index, wav_path, text): 37 | '''Preprocesses a single utterance audio/text pair. 38 | 39 | This writes the mel and linear scale spectrograms to disk and returns a tuple to write 40 | to the train.txt file. 41 | 42 | Args: 43 | out_dir: The directory to write the spectrograms into 44 | index: The numeric index to use in the spectrogram filenames. 
45 | wav_path: Path to the audio file containing the speech input 46 | text: The text spoken in the input audio file 47 | 48 | Returns: 49 | A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt 50 | ''' 51 | 52 | # Load the audio to a numpy array: 53 | wav = audio.load_wav(wav_path) 54 | 55 | # Compute the linear-scale spectrogram from the wav: 56 | spectrogram = audio.spectrogram(wav).astype(np.float32) 57 | n_frames = spectrogram.shape[1] 58 | 59 | # Compute a mel-scale spectrogram from the wav: 60 | mel_spectrogram = audio.melspectrogram(wav).astype(np.float32) 61 | 62 | # Write the spectrograms to disk: 63 | spectrogram_filename = 'ljspeech-spec-%05d.npy' % index 64 | mel_filename = 'ljspeech-mel-%05d.npy' % index 65 | np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False) 66 | np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False) 67 | 68 | # Return a tuple describing this training example: 69 | return (spectrogram_filename, mel_filename, n_frames, text) 70 | -------------------------------------------------------------------------------- /datasets/blizzard.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ProcessPoolExecutor 2 | from functools import partial 3 | import numpy as np 4 | import os 5 | from hparams import hparams 6 | from util import audio 7 | 8 | _max_out_length = 700 9 | _end_buffer = 0.05 10 | _min_confidence = 90 11 | 12 | # Note: "A Tramp Abroad" & "The Man That Corrupted Hadleyburg" are higher quality than the others. 13 | books = [ 14 | 'ATrampAbroad', 15 | 'TheManThatCorruptedHadleyburg', 16 | # 'LifeOnTheMississippi', 17 | # 'TheAdventuresOfTomSawyer', 18 | ] 19 | 20 | 21 | def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x): 22 | executor = ProcessPoolExecutor(max_workers=num_workers) 23 | futures = [] 24 | index = 1 25 | for book in books: 26 | with open(os.path.join(in_dir, book, 'sentence_index.txt')) as f: 27 | for line in f: 28 | parts = line.strip().split('\t') 29 | if line[0] is not '#' and len(parts) == 8 and float(parts[3]) > _min_confidence: 30 | wav_path = os.path.join(in_dir, book, 'wav', '%s.wav' % parts[0]) 31 | labels_path = os.path.join(in_dir, book, 'lab', '%s.lab' % parts[0]) 32 | text = parts[5] 33 | task = partial(_process_utterance, out_dir, index, wav_path, labels_path, text) 34 | futures.append(executor.submit(task)) 35 | index += 1 36 | results = [future.result() for future in tqdm(futures)] 37 | return [r for r in results if r is not None] 38 | 39 | 40 | def _process_utterance(out_dir, index, wav_path, labels_path, text): 41 | # Load the wav file and trim silence from the ends: 42 | wav = audio.load_wav(wav_path) 43 | start_offset, end_offset = _parse_labels(labels_path) 44 | start = int(start_offset * hparams.sample_rate) 45 | end = int(end_offset * hparams.sample_rate) if end_offset is not None else -1 46 | wav = wav[start:end] 47 | max_samples = _max_out_length * hparams.frame_shift_ms / 1000 * hparams.sample_rate 48 | if len(wav) > max_samples: 49 | return None 50 | spectrogram = audio.spectrogram(wav).astype(np.float32) 51 | n_frames = spectrogram.shape[1] 52 | mel_spectrogram = audio.melspectrogram(wav).astype(np.float32) 53 | spectrogram_filename = 'blizzard-spec-%05d.npy' % index 54 | mel_filename = 'blizzard-mel-%05d.npy' % index 55 | np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False) 56 | 
np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False) 57 | return (spectrogram_filename, mel_filename, n_frames, text) 58 | 59 | 60 | def _parse_labels(path): 61 | labels = [] 62 | with open(os.path.join(path)) as f: 63 | for line in f: 64 | parts = line.strip().split(' ') 65 | if len(parts) >= 3: 66 | labels.append((float(parts[0]), ' '.join(parts[2:]))) 67 | start = 0 68 | end = None 69 | if labels[0][1] == 'sil': 70 | start = labels[0][0] 71 | if labels[-1][1] == 'sil': 72 | end = labels[-2][0] + _end_buffer 73 | return (start, end) 74 | -------------------------------------------------------------------------------- /datasets/bible.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ProcessPoolExecutor 2 | from functools import partial 3 | import numpy as np 4 | import os 5 | from util import audio 6 | 7 | 8 | def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x): 9 | '''Preprocesses the LJ Speech dataset from a given input path into a given output directory. 10 | 11 | Args: 12 | in_dir: The directory where you have downloaded the LJ Speech dataset 13 | out_dir: The directory to write the output into 14 | num_workers: Optional number of worker processes to parallelize across 15 | tqdm: You can optionally pass tqdm to get a nice progress bar 16 | 17 | Returns: 18 | A list of tuples describing the training examples. This should be written to train.txt 19 | ''' 20 | 21 | # We use ProcessPoolExecutor to parallelize across processes. This is just an optimization and you 22 | # can omit it and just call _process_utterance on each input if you want. 23 | executor = ProcessPoolExecutor(max_workers=num_workers) 24 | futures = [] 25 | index = 1 26 | with open(os.path.join(in_dir, 'metadata.csv'), encoding='utf-8') as f: 27 | for line in f: 28 | try: 29 | 30 | parts = line.strip().split('|') 31 | wav_path = os.path.join(in_dir, 'wavs', '%s' % parts[0]) 32 | text = parts[1] 33 | futures.append(executor.submit(partial(_process_utterance, out_dir, index, wav_path, text))) 34 | index += 1 35 | 36 | except: 37 | 38 | pass 39 | return [future.result() for future in tqdm(futures)] 40 | 41 | 42 | def _process_utterance(out_dir, index, wav_path, text): 43 | '''Preprocesses a single utterance audio/text pair. 44 | 45 | This writes the mel and linear scale spectrograms to disk and returns a tuple to write 46 | to the train.txt file. 47 | 48 | Args: 49 | out_dir: The directory to write the spectrograms into 50 | index: The numeric index to use in the spectrogram filenames. 
51 | wav_path: Path to the audio file containing the speech input 52 | text: The text spoken in the input audio file 53 | 54 | Returns: 55 | A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt 56 | ''' 57 | 58 | # Load the audio to a numpy array: 59 | wav = audio.load_wav(wav_path) 60 | 61 | # Compute the linear-scale spectrogram from the wav: 62 | spectrogram = audio.spectrogram(wav).astype(np.float32) 63 | n_frames = spectrogram.shape[1] 64 | 65 | # Compute a mel-scale spectrogram from the wav: 66 | mel_spectrogram = audio.melspectrogram(wav).astype(np.float32) 67 | 68 | # Write the spectrograms to disk: 69 | spectrogram_filename = 'bible-spec-%05d.npy' % index 70 | mel_filename = 'bible-mel-%05d.npy' % index 71 | np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False) 72 | np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False) 73 | 74 | # Return a tuple describing this training example: 75 | return (spectrogram_filename, mel_filename, n_frames, text) 76 | -------------------------------------------------------------------------------- /models/helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from tensorflow.contrib.seq2seq import Helper 4 | 5 | 6 | # Adapted from tf.contrib.seq2seq.GreedyEmbeddingHelper 7 | class TacoTestHelper(Helper): 8 | def __init__(self, batch_size, output_dim, r): 9 | with tf.name_scope('TacoTestHelper'): 10 | self._batch_size = batch_size 11 | self._output_dim = output_dim 12 | self._end_token = tf.tile([0.0], [output_dim * r]) 13 | 14 | @property 15 | def batch_size(self): 16 | return self._batch_size 17 | 18 | @property 19 | def sample_ids_shape(self): 20 | return tf.TensorShape([]) 21 | 22 | @property 23 | def sample_ids_dtype(self): 24 | return np.int32 25 | 26 | def initialize(self, name=None): 27 | return (tf.tile([False], [self._batch_size]), _go_frames(self._batch_size, self._output_dim)) 28 | 29 | def sample(self, time, outputs, state, name=None): 30 | return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them 31 | 32 | def next_inputs(self, time, outputs, state, sample_ids, name=None): 33 | '''Stop on EOS. Otherwise, pass the last output as the next input and pass through state.''' 34 | with tf.name_scope('TacoTestHelper'): 35 | finished = tf.reduce_all(tf.equal(outputs, self._end_token), axis=1) 36 | # Feed last output frame as next input. 
outputs is [N, output_dim * r] 37 | next_inputs = outputs[:, -self._output_dim:] 38 | return (finished, next_inputs, state) 39 | 40 | 41 | class TacoTrainingHelper(Helper): 42 | def __init__(self, inputs, targets, output_dim, r): 43 | # inputs is [N, T_in], targets is [N, T_out, D] 44 | with tf.name_scope('TacoTrainingHelper'): 45 | self._batch_size = tf.shape(inputs)[0] 46 | self._output_dim = output_dim 47 | 48 | # Feed every r-th target frame as input 49 | self._targets = targets[:, r - 1::r, :] 50 | 51 | # Use full length for every target because we don't want to mask the padding frames 52 | num_steps = tf.shape(self._targets)[1] 53 | self._lengths = tf.tile([num_steps], [self._batch_size]) 54 | 55 | @property 56 | def batch_size(self): 57 | return self._batch_size 58 | 59 | @property 60 | def sample_ids_shape(self): 61 | return tf.TensorShape([]) 62 | 63 | @property 64 | def sample_ids_dtype(self): 65 | return np.int32 66 | 67 | def initialize(self, name=None): 68 | return (tf.tile([False], [self._batch_size]), _go_frames(self._batch_size, self._output_dim)) 69 | 70 | def sample(self, time, outputs, state, name=None): 71 | return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them 72 | 73 | def next_inputs(self, time, outputs, state, sample_ids, name=None): 74 | with tf.name_scope(name or 'TacoTrainingHelper'): 75 | finished = (time + 1 >= self._lengths) 76 | next_inputs = self._targets[:, time, :] 77 | return (finished, next_inputs, state) 78 | 79 | 80 | def _go_frames(batch_size, output_dim): 81 | '''Returns all-zero frames for a given batch size and output dimension''' 82 | return tf.tile([[0.0]], [batch_size, output_dim]) 83 | -------------------------------------------------------------------------------- /demo_server.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import falcon 3 | from hparams import hparams, hparams_debug_string 4 | import os 5 | from synthesizer import Synthesizer 6 | 7 | html_body = '''Demo 8 | 19 | 20 |
25 | 26 | 56 | ''' 57 | 58 | 59 | class UIResource: 60 | def on_get(self, req, res): 61 | res.content_type = 'text/html' 62 | res.body = html_body 63 | 64 | 65 | class SynthesisResource: 66 | def on_get(self, req, res): 67 | if not req.params.get('text'): 68 | raise falcon.HTTPBadRequest() 69 | res.data = synthesizer.synthesize(req.params.get('text')) 70 | res.content_type = 'audio/wav' 71 | 72 | 73 | synthesizer = Synthesizer() 74 | api = falcon.API() 75 | api.add_route('/synthesize', SynthesisResource()) 76 | api.add_route('/', UIResource()) 77 | 78 | if __name__ == '__main__': 79 | from wsgiref import simple_server 80 | 81 | parser = argparse.ArgumentParser() 82 | parser.add_argument('--checkpoint', required=True, help='Full path to model checkpoint') 83 | parser.add_argument('--port', type=int, default=3000) 84 | parser.add_argument('--hparams', default='', 85 | help='Hyperparameter overrides as a comma-separated list of name=value pairs') 86 | parser.add_argument('--gpu', default='1') 87 | args = parser.parse_args() 88 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 89 | hparams.parse(args.hparams) 90 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 91 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu 92 | print(hparams_debug_string()) 93 | synthesizer.load(args.checkpoint) 94 | print('Serving on port %d' % args.port) 95 | simple_server.make_server('0.0.0.0', args.port, api).serve_forever() 96 | else: 97 | synthesizer.load(os.environ['CHECKPOINT']) 98 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Tacotron2 2 | 3 | Korean Speech Synthesis with Tacotron 4 | 5 | Note that this repo is based on https://github.com/hccho2/Tacotron2-Wavenet-Korean-TTS, https://github.com/keithito/tacotron 6 | 7 | 8 | ## Background 9 | 10 | In February 2018, Google published a paper, [NATURAL TTS SYNTHESIS BY CONDITIONINGWAVENET ON MEL SPECTROGRAM PREDICTIONS], 11 | where they present a neural text-to-speech model that learns to synthesize speech directly from 12 | (text, audio) pairs. However, they didn't release their source code or training data. This is an 13 | independent attempt to provide an open-source implementation of the model described in their paper. 14 | 15 | The quality isn't as good as Google's demo yet, but hopefully it will get there someday :-). 16 | Pull requests are welcome! 17 | 18 | 19 | 20 | ## Quick Start 21 | 22 | ### Installing dependencies 23 | 24 | 1. Install Python 3. 25 | 26 | 2. Install the latest version of [TensorFlow](https://www.tensorflow.org/install/) for your platform. For better 27 | performance, install with GPU support if it's available. This code works with TensorFlow 1.3 and later. 28 | 29 | 3. Install requirements: 30 | ``` 31 | pip install -r requirements.txt 32 | ``` 33 | 34 | 35 | ### Training 36 | 37 | *Note: you need at least 40GB of free disk space to train a model.* 38 | 39 | 1. **Download a speech dataset.** 40 | 41 | The following are supported out of the box: 42 | * [KSS Dataset](https://www.kaggle.com/bryanpark/korean-single-speaker-speech-dataset) (Public Domain) 43 | 44 | You can use other datasets if you convert them to the right format. See [TRAINING_DATA.md](TRAINING_DATA.md) for more info. 45 | 46 | 47 | 2. **Unpack the dataset into `~/tacotron`** 48 | 49 | After unpacking, your tree should look like this for LJ Speech: 50 | ``` 51 | tacotron 52 | |- kss 53 | |- metadata.csv 54 | |- wavs 55 | ``` 56 | 57 | 58 | 3. 
**Preprocess the data** 59 | ``` 60 | python3 preprocess.py --dataset kss 61 | ``` 62 | 63 | 4. **Train a model** 64 | ``` 65 | python3 train.py 66 | ``` 67 | 68 | Tunable hyperparameters are found in [hparams.py](hparams.py). You can adjust these at the command 69 | line using the `--hparams` flag, for example `--hparams="batch_size=16,outputs_per_step=2"`. 70 | Hyperparameters should generally be set to the same values at both training and eval time. 71 | The default hyperparameters are recommended for LJ Speech and other English-language data. 72 | See [TRAINING_DATA.md](TRAINING_DATA.md) for other languages. 73 | 74 | 75 | 5. **Monitor with Tensorboard** (optional) 76 | ``` 77 | tensorboard --logdir ~/tacotron/logs-tacotron 78 | ``` 79 | 80 | The trainer dumps audio and alignments every 1000 steps. You can find these in 81 | `~/tacotron/logs-tacotron`. 82 | 83 | 6. **Synthesize from a checkpoint** 84 | ``` 85 | python3 demo_server.py --checkpoint ~/tacotron/logs-tacotron/model.ckpt-185000 86 | ``` 87 | Replace "185000" with the checkpoint number that you want to use, then open a browser 88 | to `localhost:9000` and type what you want to speak. Alternately, you can 89 | run [eval.py](eval.py) at the command line: 90 | ``` 91 | python3 eval.py --checkpoint ~/tacotron/logs-tacotron/model.ckpt-185000 92 | ``` 93 | If you set the `--hparams` flag when training, set the same value here. 94 | 95 | 96 | ## Modifications 97 | 98 | * We add Stepwise Monotonic Attention, Monotonic Attention, GMM Attention, Loung Attention (20.01.20) 99 | 100 | -------------------------------------------------------------------------------- /text/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import re 3 | import string 4 | import numpy as np 5 | 6 | from text import cleaners 7 | from hparams import hparams 8 | from text.symbols import symbols, en_symbols, PAD, EOS 9 | from text.korean import jamo_to_korean 10 | 11 | # Mappings from symbol to numeric ID and vice versa: 12 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 13 | _id_to_symbol = {i: s for i, s in enumerate(symbols)} 14 | isEn = False 15 | 16 | # Regular expression matching text enclosed in curly braces: 17 | _curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)') 18 | 19 | puncuation_table = str.maketrans({key: None for key in string.punctuation}) 20 | 21 | 22 | def convert_to_en_symbols(): 23 | '''Converts built-in korean symbols to english, to be used for english training 24 | 25 | ''' 26 | global _symbol_to_id, _id_to_symbol, isEn 27 | if not isEn: 28 | print(" [!] Converting to english mode") 29 | _symbol_to_id = {s: i for i, s in enumerate(en_symbols)} 30 | _id_to_symbol = {i: s for i, s in enumerate(en_symbols)} 31 | isEn = True 32 | 33 | 34 | def remove_puncuations(text): 35 | return text.translate(puncuation_table) 36 | 37 | 38 | # def text_to_sequence(text, as_token=False): 39 | # cleaner_names = [x.strip() for x in hparams.cleaners.split(',')] 40 | # if ('english_cleaners' in cleaner_names) and isEn == False: 41 | # convert_to_en_symbols() 42 | # else: 43 | # 44 | # return _text_to_sequence(text, cleaner_names, as_token) 45 | 46 | 47 | def text_to_sequence(text, cleaner_names): 48 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 49 | The text can optionally have ARPAbet sequences enclosed in curly braces embedded 50 | in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street." 
51 | Args: 52 | text: string to convert to a sequence 53 | cleaner_names: names of the cleaner functions to run the text through 54 | Returns: 55 | List of integers corresponding to the symbols in the text 56 | ''' 57 | sequence = [] 58 | 59 | # Check for curly braces and treat their contents as ARPAbet: 60 | while len(text): 61 | m = _curly_re.match(text) 62 | if not m: 63 | sequence += _symbols_to_sequence(_clean_text(text, cleaner_names)) 64 | break 65 | sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names)) 66 | sequence += _arpabet_to_sequence(m.group(2)) 67 | text = m.group(3) 68 | 69 | # Append EOS token 70 | sequence.append(_symbol_to_id[EOS]) # [14, 29, 45, 2, 27, 62, 20, 21, 4, 39, 45, 1] 71 | 72 | # if as_token: 73 | # return sequence_to_text(sequence, combine_jamo=True) 74 | # else: 75 | return np.array(sequence, dtype=np.int32) 76 | 77 | 78 | def sequence_to_text(sequence, skip_eos_and_pad=False, combine_jamo=False): 79 | '''Converts a sequence of IDs back to a string''' 80 | cleaner_names = [x.strip() for x in hparams.cleaners.split(',')] 81 | if 'english_cleaners' in cleaner_names and isEn == False: 82 | convert_to_en_symbols() 83 | 84 | result = '' 85 | for symbol_id in sequence: 86 | if symbol_id in _id_to_symbol: 87 | s = _id_to_symbol[symbol_id] 88 | # Enclose ARPAbet back in curly braces: 89 | if len(s) > 1 and s[0] == '@': 90 | s = '{%s}' % s[1:] 91 | 92 | if not skip_eos_and_pad or s not in [EOS, PAD]: 93 | result += s 94 | 95 | result = result.replace('}{', ' ') 96 | 97 | if combine_jamo: 98 | return jamo_to_korean(result) 99 | else: 100 | return result 101 | 102 | 103 | def _clean_text(text, cleaner_names): 104 | for name in cleaner_names: 105 | cleaner = getattr(cleaners, name) 106 | if not cleaner: 107 | raise Exception('Unknown cleaner: %s' % name) 108 | text = cleaner(text) # '존경하는' --> ['ᄌ', 'ᅩ', 'ᆫ', 'ᄀ', 'ᅧ', 'ᆼ', 'ᄒ', 'ᅡ', 'ᄂ', 'ᅳ', 'ᆫ', '~'] 109 | return text 110 | 111 | 112 | def _symbols_to_sequence(symbols): 113 | return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)] 114 | 115 | 116 | def _arpabet_to_sequence(text): 117 | return _symbols_to_sequence(['@' + s for s in text.split()]) 118 | 119 | 120 | def _should_keep_symbol(s): 121 | return s in _symbol_to_id and s is not '_' and s is not '~' 122 | -------------------------------------------------------------------------------- /text/kor_dic.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | etc_dictionary = { 4 | '2 30대': '이삼십대', 5 | '20~30대': '이삼십대', 6 | '20, 30대': '이십대 삼십대', 7 | '1+1': '원플러스원', 8 | '3에서 6개월인': '3개월에서 육개월인', 9 | } 10 | 11 | english_dictionary = { 12 | 'Devsisters': '데브시스터즈', 13 | 'track': '트랙', 14 | 15 | # krbook 16 | 'LA': '엘에이', 17 | 'LG': '엘지', 18 | 'KOREA': '코리아', 19 | 'JSA': '제이에스에이', 20 | 'PGA': '피지에이', 21 | 'GA': '지에이', 22 | 'idol': '아이돌', 23 | 'KTX': '케이티엑스', 24 | 'AC': '에이씨', 25 | 'DVD': '디비디', 26 | 'US': '유에스', 27 | 'CNN': '씨엔엔', 28 | 'LPGA': '엘피지에이', 29 | 'P': '피', 30 | 'L': '엘', 31 | 'T': '티', 32 | 'B': '비', 33 | 'C': '씨', 34 | 'BIFF': '비아이에프에프', 35 | 'GV': '지비', 36 | 37 | # JTBC 38 | 'IT': '아이티', 39 | 'IQ': '아이큐', 40 | 'JTBC': '제이티비씨', 41 | 'trickle down effect': '트리클 다운 이펙트', 42 | 'trickle up effect': '트리클 업 이펙트', 43 | 'down': '다운', 44 | 'up': '업', 45 | 'FCK': '에프씨케이', 46 | 'AP': '에이피', 47 | 'WHERETHEWILDTHINGSARE': '', 48 | 'Rashomon Effect': '', 49 | 'O': '오', 50 | 'OO': '오오', 51 | 'B': '비', 52 | 'GDP': '지디피', 53 | 'CIPA': '씨아이피에이', 54 | 'YS': '와이에스', 55 | 'Y': '와이', 56 | 'S': 
'에스', 57 | 'JTBC': '제이티비씨', 58 | 'PC': '피씨', 59 | 'bill': '빌', 60 | 'Halmuny': '하모니', ##### 61 | 'X': '엑스', 62 | 'SNS': '에스엔에스', 63 | 'ability': '어빌리티', 64 | 'shy': '', 65 | 'CCTV': '씨씨티비', 66 | 'IT': '아이티', 67 | 'the tenth man': '더 텐쓰 맨', #### 68 | 'L': '엘', 69 | 'PC': '피씨', 70 | 'YSDJJPMB': '', ######## 71 | 'Content Attitude Timing': '컨텐트 애티튜드 타이밍', 72 | 'CAT': '캣', 73 | 'IS': '아이에스', 74 | 'SNS': '에스엔에스', 75 | 'K': '케이', 76 | 'Y': '와이', 77 | 'KDI': '케이디아이', 78 | 'DOC': '디오씨', 79 | 'CIA': '씨아이에이', 80 | 'PBS': '피비에스', 81 | 'D': '디', 82 | 'PPropertyPositionPowerPrisonP' 83 | 'S': '에스', 84 | 'francisco': '프란시스코', 85 | 'I': '아이', 86 | 'III': '아이아이', ###### 87 | 'No joke': '노 조크', 88 | 'BBK': '비비케이', 89 | 'LA': '엘에이', 90 | 'Don': '', 91 | 't worry be happy': ' 워리 비 해피', 92 | 'NO': '엔오', ##### 93 | 'it was our sky': '잇 워즈 아워 스카이', 94 | 'it is our sky': '잇 이즈 아워 스카이', #### 95 | 'NEIS': '엔이아이에스', ##### 96 | 'IMF': '아이엠에프', 97 | 'apology': '어폴로지', 98 | 'humble': '험블', 99 | 'M': '엠', 100 | 'Nowhere Man': '노웨어 맨', 101 | 'The Tenth Man': '더 텐쓰 맨', 102 | 'PBS': '피비에스', 103 | 'BBC': '비비씨', 104 | 'MRJ': '엠알제이', 105 | 'CCTV': '씨씨티비', 106 | 'Pick me up': '픽 미 업', 107 | 'DNA': '디엔에이', 108 | 'UN': '유엔', 109 | 'STOP': '스탑', ##### 110 | 'PRESS': '프레스', ##### 111 | 'not to be': '낫 투비', 112 | 'Denial': '디나이얼', 113 | 'G': '지', 114 | 'IMF': '아이엠에프', 115 | 'GDP': '지디피', 116 | 'JTBC': '제이티비씨', 117 | 'Time flies like an arrow': '타임 플라이즈 라이크 언 애로우', 118 | 'DDT': '디디티', 119 | 'AI': '에이아이', 120 | 'Z': '제트', 121 | 'OECD': '오이씨디', 122 | 'N': '앤', 123 | 'A': '에이', 124 | 'MB': '엠비', 125 | 'EH': '이에이치', 126 | 'IS': '아이에스', 127 | 'TV': '티비', 128 | 'MIT': '엠아이티', 129 | 'KBO': '케이비오', 130 | 'I love America': '아이 러브 아메리카', 131 | 'SF': '에스에프', 132 | 'Q': '큐', 133 | 'KFX': '케이에프엑스', 134 | 'PM': '피엠', 135 | 'Prime Minister': '프라임 미니스터', 136 | 'Swordline': '스워드라인', 137 | 'TBS': '티비에스', 138 | 'DDT': '디디티', 139 | 'CS': '씨에스', 140 | 'Reflecting Absence': '리플렉팅 앱센스', 141 | 'PBS': '피비에스', 142 | 'Drum being beaten by everyone': '드럼 빙 비튼 바이 에브리원', 143 | 'negative pressure': '네거티브 프레셔', 144 | 'F': '에프', 145 | 'KIA': '기아', 146 | 'FTA': '에프티에이', 147 | 'Que sais-je': '', 148 | 'UFC': '유에프씨', 149 | 'P': '피', 150 | 'DJ': '디제이', 151 | 'Chaebol': '채벌', 152 | 'BBC': '비비씨', 153 | 'OECD': '오이씨디', 154 | 'BC': '삐씨', 155 | 'C': '씨', 156 | 'B': '씨', 157 | 'KY': '케이와이', 158 | 'K': '케이', 159 | 'CEO': '씨이오', 160 | 'YH': '와이에치', 161 | 'IS': '아이에스', 162 | 'who are you': '후 얼 유', 163 | 'Y': '와이', 164 | 'The Devils Advocate': '더 데빌즈 어드보카트', 165 | 'YS': '와이에스', 166 | 'so sorry': '쏘 쏘리', 167 | 'Santa': '산타', 168 | 'Big Endian': '빅 엔디안', 169 | 'Small Endian': '스몰 엔디안', 170 | 'Oh Captain My Captain': '오 캡틴 마이 캡틴', 171 | 'AIB': '에이아이비', 172 | 'K': '케이', 173 | 'PBS': '피비에스', 174 | } -------------------------------------------------------------------------------- /LJSpeech-1.1/README: -------------------------------------------------------------------------------- 1 | ----------------------------------------------------------------------------- 2 | The LJ Speech Dataset 3 | 4 | Version 1.0 5 | July 5, 2017 6 | https://keithito.com/LJ-Speech-Dataset 7 | ----------------------------------------------------------------------------- 8 | 9 | 10 | OVERVIEW 11 | 12 | This is a public domain speech dataset consisting of 13,100 short audio clips 13 | of a single speaker reading passages from 7 non-fiction books. A transcription 14 | is provided for each clip. Clips vary in length from 1 to 10 seconds and have 15 | a total length of approximately 24 hours. 
16 | 17 | The texts were published between 1884 and 1964, and are in the public domain. 18 | The audio was recorded in 2016-17 by the LibriVox project and is also in the 19 | public domain. 20 | 21 | 22 | 23 | FILE FORMAT 24 | 25 | Metadata is provided in metadata.csv. This file consists of one record per 26 | line, delimited by the pipe character (0x7c). The fields are: 27 | 28 | 1. ID: this is the name of the corresponding .wav file 29 | 2. Transcription: words spoken by the reader (UTF-8) 30 | 3. Normalized Transcription: transcription with numbers, ordinals, and 31 | monetary units expanded into full words (UTF-8). 32 | 33 | Each audio file is a single-channel 16-bit PCM WAV with a sample rate of 34 | 22050 Hz. 35 | 36 | 37 | 38 | STATISTICS 39 | 40 | Total Clips 13,100 41 | Total Words 225,715 42 | Total Characters 1,308,674 43 | Total Duration 23:55:17 44 | Mean Clip Duration 6.57 sec 45 | Min Clip Duration 1.11 sec 46 | Max Clip Duration 10.10 sec 47 | Mean Words per Clip 17.23 48 | Distinct Words 13,821 49 | 50 | 51 | 52 | MISCELLANEOUS 53 | 54 | The audio clips range in length from approximately 1 second to 10 seconds. 55 | They were segmented automatically based on silences in the recording. Clip 56 | boundaries generally align with sentence or clause boundaries, but not always. 57 | 58 | The text was matched to the audio manually, and a QA pass was done to ensure 59 | that the text accurately matched the words spoken in the audio. 60 | 61 | The original LibriVox recordings were distributed as 128 kbps MP3 files. As a 62 | result, they may contain artifacts introduced by the MP3 encoding. 63 | 64 | The following abbreviations appear in the text. They may be expanded as 65 | follows: 66 | 67 | Abbreviation Expansion 68 | -------------------------- 69 | Mr. Mister 70 | Mrs. Misess (*) 71 | Dr. Doctor 72 | No. Number 73 | St. Saint 74 | Co. Company 75 | Jr. Junior 76 | Maj. Major 77 | Gen. General 78 | Drs. Doctors 79 | Rev. Reverend 80 | Lt. Lieutenant 81 | Hon. Honorable 82 | Sgt. Sergeant 83 | Capt. Captain 84 | Esq. Esquire 85 | Ltd. Limited 86 | Col. Colonel 87 | Ft. Fort 88 | 89 | * there's no standard expansion of "Mrs." 90 | 91 | 92 | 19 of the transcriptions contain non-ASCII characters (for example, LJ016-0257 93 | contains "raison d'être"). 94 | 95 | For more information or to report errors, please email kito@kito.us. 96 | 97 | 98 | 99 | LICENSE 100 | 101 | This dataset is in the public domain in the USA (and likely other countries as 102 | well). There are no restrictions on its use. For more information, please see: 103 | https://librivox.org/pages/public-domain. 104 | 105 | 106 | CHANGELOG 107 | 108 | * 1.0 (July 8, 2017): 109 | Initial release 110 | 111 | * 1.1 (Feb 19, 2018): 112 | Version 1.0 included 30 .wav files with no corresponding annotations in 113 | metadata.csv. These have been removed in version 1.1. Thanks to Rafael Valle 114 | for spotting this. 115 | 116 | 117 | CREDITS 118 | 119 | This dataset consists of excerpts from the following works: 120 | 121 | * Morris, William, et al. Arts and Crafts Essays. 1893. 122 | * Griffiths, Arthur. The Chronicles of Newgate, Vol. 2. 1884. 123 | * Roosevelt, Franklin D. The Fireside Chats of Franklin Delano Roosevelt. 124 | 1933-42. 125 | * Harland, Marion. Marion Harland's Cookery for Beginners. 1893. 126 | * Rolt-Wheeler, Francis. The Science - History of the Universe, Vol. 5: 127 | Biology. 1910. 128 | * Banks, Edgar J. The Seven Wonders of the Ancient World. 1916. 
129 | * President's Commission on the Assassination of President Kennedy. Report 130 | of the President's Commission on the Assassination of President Kennedy. 131 | 1964. 132 | 133 | Recordings by Linda Johnson. Alignment and annotation by Keith Ito. All text, 134 | audio, and annotations are in the public domain. 135 | 136 | There's no requirement to cite this work, but if you'd like to do so, you can 137 | link to: https://keithito.com/LJ-Speech-Dataset 138 | 139 | or use the following: 140 | @misc{ljspeech17, 141 | author = {Keith Ito}, 142 | title = {The LJ Speech Dataset}, 143 | howpublished = {\url{https://keithito.com/LJ-Speech-Dataset/}}, 144 | year = 2017 145 | } 146 | -------------------------------------------------------------------------------- /util/audio.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import librosa.filters 3 | import math 4 | import numpy as np 5 | import tensorflow as tf 6 | import scipy 7 | from hparams import hparams 8 | 9 | 10 | def load_wav(path): 11 | return librosa.core.load(path, sr=hparams.sample_rate)[0] 12 | 13 | 14 | def save_wav(wav, path): 15 | wav *= 32767 / max(0.01, np.max(np.abs(wav))) 16 | scipy.io.wavfile.write(path, hparams.sample_rate, wav.astype(np.int16)) 17 | 18 | 19 | def preemphasis(x): 20 | return scipy.signal.lfilter([1, -hparams.preemphasis], [1], x) 21 | 22 | 23 | def inv_preemphasis(x): 24 | return scipy.signal.lfilter([1], [1, -hparams.preemphasis], x) 25 | 26 | 27 | def spectrogram(y): 28 | D = _stft(preemphasis(y)) 29 | S = _amp_to_db(np.abs(D)) - hparams.ref_level_db 30 | return _normalize(S) 31 | 32 | 33 | def inv_spectrogram(spectrogram): 34 | '''Converts spectrogram to waveform using librosa''' 35 | S = _db_to_amp(_denormalize(spectrogram) + hparams.ref_level_db) # Convert back to linear 36 | return inv_preemphasis(_griffin_lim(S ** hparams.power)) # Reconstruct phase 37 | 38 | 39 | def inv_spectrogram_tensorflow(spectrogram): 40 | '''Builds computational graph to convert spectrogram to waveform using TensorFlow. 41 | 42 | Unlike inv_spectrogram, this does NOT invert the preemphasis. The caller should call 43 | inv_preemphasis on the output after running the graph. 
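    A usage sketch (illustrative only; `sess` and the linear-spectrogram tensor `spec` are
    assumptions, not names defined in this module):
        wav = inv_preemphasis(sess.run(inv_spectrogram_tensorflow(spec)))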
44 | ''' 45 | S = _db_to_amp_tensorflow(_denormalize_tensorflow(spectrogram) + hparams.ref_level_db) 46 | return _griffin_lim_tensorflow(tf.pow(S, hparams.power)) 47 | 48 | 49 | def melspectrogram(y): 50 | D = _stft(preemphasis(y)) 51 | S = _amp_to_db(_linear_to_mel(np.abs(D))) - hparams.ref_level_db 52 | return _normalize(S) 53 | 54 | 55 | def find_endpoint(wav, threshold_db=-40, min_silence_sec=0.8): 56 | window_length = int(hparams.sample_rate * min_silence_sec) 57 | hop_length = int(window_length / 4) 58 | threshold = _db_to_amp(threshold_db) 59 | for x in range(hop_length, len(wav) - window_length, hop_length): 60 | if np.max(wav[x:x+window_length]) < threshold: 61 | return x + hop_length 62 | return len(wav) 63 | 64 | 65 | def _griffin_lim(S): 66 | '''librosa implementation of Griffin-Lim 67 | Based on https://github.com/librosa/librosa/issues/434 68 | ''' 69 | angles = np.exp(2j * np.pi * np.random.rand(*S.shape)) 70 | S_complex = np.abs(S).astype(np.complex) 71 | y = _istft(S_complex * angles) 72 | for i in range(hparams.griffin_lim_iters): 73 | angles = np.exp(1j * np.angle(_stft(y))) 74 | y = _istft(S_complex * angles) 75 | return y 76 | 77 | 78 | def _griffin_lim_tensorflow(S): 79 | '''TensorFlow implementation of Griffin-Lim 80 | Based on https://github.com/Kyubyong/tensorflow-exercises/blob/master/Audio_Processing.ipynb 81 | ''' 82 | with tf.variable_scope('griffinlim'): 83 | # TensorFlow's stft and istft operate on a batch of spectrograms; create batch of size 1 84 | S = tf.expand_dims(S, 0) 85 | S_complex = tf.identity(tf.cast(S, dtype=tf.complex64)) 86 | y = _istft_tensorflow(S_complex) 87 | for i in range(hparams.griffin_lim_iters): 88 | est = _stft_tensorflow(y) 89 | angles = est / tf.cast(tf.maximum(1e-8, tf.abs(est)), tf.complex64) 90 | y = _istft_tensorflow(S_complex * angles) 91 | return tf.squeeze(y, 0) 92 | 93 | 94 | def _stft(y): 95 | n_fft, hop_length, win_length = _stft_parameters() 96 | return librosa.stft(y=y, n_fft=n_fft, hop_length=hop_length, win_length=win_length) 97 | 98 | 99 | def _istft(y): 100 | _, hop_length, win_length = _stft_parameters() 101 | return librosa.istft(y, hop_length=hop_length, win_length=win_length) 102 | 103 | 104 | def _stft_tensorflow(signals): 105 | n_fft, hop_length, win_length = _stft_parameters() 106 | return tf.contrib.signal.stft(signals, win_length, hop_length, n_fft, pad_end=False) 107 | 108 | 109 | def _istft_tensorflow(stfts): 110 | n_fft, hop_length, win_length = _stft_parameters() 111 | return tf.contrib.signal.inverse_stft(stfts, win_length, hop_length, n_fft) 112 | 113 | 114 | def _stft_parameters(): 115 | n_fft = (hparams.num_freq - 1) * 2 116 | hop_length = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate) 117 | win_length = int(hparams.frame_length_ms / 1000 * hparams.sample_rate) 118 | return n_fft, hop_length, win_length 119 | 120 | 121 | # Conversions: 122 | 123 | _mel_basis = None 124 | 125 | def _linear_to_mel(spectrogram): 126 | global _mel_basis 127 | if _mel_basis is None: 128 | _mel_basis = _build_mel_basis() 129 | return np.dot(_mel_basis, spectrogram) 130 | 131 | def _build_mel_basis(): 132 | n_fft = (hparams.num_freq - 1) * 2 133 | return librosa.filters.mel(hparams.sample_rate, n_fft, n_mels=hparams.num_mels) 134 | 135 | def _amp_to_db(x): 136 | return 20 * np.log10(np.maximum(1e-5, x)) 137 | 138 | def _db_to_amp(x): 139 | return np.power(10.0, x * 0.05) 140 | 141 | def _db_to_amp_tensorflow(x): 142 | return tf.pow(tf.ones(tf.shape(x)) * 10.0, x * 0.05) 143 | 144 | def _normalize(S): 145 | return 
np.clip((S - hparams.min_level_db) / -hparams.min_level_db, 0, 1) 146 | 147 | def _denormalize(S): 148 | return (np.clip(S, 0, 1) * -hparams.min_level_db) + hparams.min_level_db 149 | 150 | def _denormalize_tensorflow(S): 151 | return (tf.clip_by_value(S, 0, 1) * -hparams.min_level_db) + hparams.min_level_db 152 | -------------------------------------------------------------------------------- /models/tacotron.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.contrib.rnn import GRUCell, MultiRNNCell, OutputProjectionWrapper, ResidualWrapper, LSTMCell 3 | from tensorflow.contrib.seq2seq import BasicDecoder, BahdanauAttention, AttentionWrapper 4 | from text.symbols import symbols 5 | from util.infolog import log 6 | from .helpers import TacoTestHelper, TacoTrainingHelper 7 | from .modules import encoder_cbhg, post_cbhg, prenet 8 | from .rnn_wrappers import DecoderPrenetWrapper, ConcatOutputAndAttentionWrapper 9 | 10 | 11 | class Tacotron(): 12 | def __init__(self, hparams): 13 | self._hparams = hparams 14 | 15 | def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None): 16 | 17 | with tf.variable_scope('embedding') as scope: 18 | is_training = linear_targets is not None 19 | batch_size = tf.shape(inputs)[0] 20 | hp = self._hparams 21 | 22 | # Embeddings 23 | embedding_table = tf.get_variable( 24 | 'embedding', [len(symbols), hp.embed_depth], dtype=tf.float32, 25 | initializer=tf.truncated_normal_initializer(stddev=0.5)) 26 | embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs) # [N, T_in, embed_depth=512] 27 | 28 | with tf.variable_scope('encoder') as scope: 29 | x = embedded_inputs 30 | for i in range(hp.encoder_stack_size): 31 | x = tf.layers.conv1d(x, 32 | filters=hp.encoder_conv_filter, 33 | kernel_size=hp.encoder_conv_kernel, 34 | padding='same', 35 | activation=tf.nn.relu) 36 | x = tf.layers.batch_normalization(x, training=is_training) 37 | 38 | lstm_fw = LSTMCell(hp.encoder_lstm_hidden_dim) 39 | lstm_bw = LSTMCell(hp.encoder_lstm_hidden_dim) 40 | 41 | encoder_conv_output = x 42 | outputs, states = tf.nn.bidirectional_dynamic_rnn(lstm_fw, 43 | lstm_bw, 44 | encoder_conv_output, 45 | sequence_length=input_lengths, 46 | dtype=tf.float32) # [N, T_in, 512] 47 | encoder_output = tf.concat(outputs, axis=2) 48 | 49 | # with tf.variable_scope('decoder') as scope: 50 | 51 | 52 | self.inputs = inputs 53 | self.input_lengths = input_lengths 54 | # self.mel_outputs = mel_outputs 55 | # self.linear_outputs = linear_outputs 56 | # self.alignments = alignments 57 | self.mel_targets = mel_targets 58 | self.linear_targets = linear_targets 59 | log('Initialized Tacotron model. Dimensions: ') 60 | log(' embedding: %d' % embedded_inputs.shape[-1]) 61 | log(' encoder out: %d' % encoder_output.shape[-1]) 62 | # log(' attention out: %d' % attention_cell.output_size) 63 | # log(' concat attn & out: %d' % concat_cell.output_size) 64 | # log(' decoder cell out: %d' % decoder_cell.output_size) 65 | # log(' decoder out (%d frames): %d' % (hp.outputs_per_step, decoder_outputs.shape[-1])) 66 | # log(' decoder out (1 frame): %d' % mel_outputs.shape[-1]) 67 | # log(' postnet out: %d' % post_outputs.shape[-1]) 68 | # log(' linear out: %d' % linear_outputs.shape[-1]) 69 | 70 | # def add_loss(self): 71 | # '''Adds loss to the model. Sets "loss" field. 
initialize must have been called.''' 72 | # with tf.variable_scope('loss') as scope: 73 | # hp = self._hparams 74 | # self.mel_loss = tf.reduce_mean(tf.abs(self.mel_targets - self.mel_outputs)) 75 | # l1 = tf.abs(self.linear_targets - self.linear_outputs) 76 | # # Prioritize loss for frequencies under 3000 Hz. 77 | # n_priority_freq = int(3000 / (hp.sample_rate * 0.5) * hp.num_freq) 78 | # self.linear_loss = 0.5 * tf.reduce_mean(l1) + 0.5 * tf.reduce_mean(l1[:, :, 0:n_priority_freq]) 79 | # self.loss = self.mel_loss + self.linear_loss 80 | # 81 | # def add_optimizer(self, global_step): 82 | # '''Adds optimizer. Sets "gradients" and "optimize" fields. add_loss must have been called. 83 | # 84 | # Args: 85 | # global_step: int32 scalar Tensor representing current global step in training 86 | # ''' 87 | # with tf.variable_scope('optimizer') as scope: 88 | # hp = self._hparams 89 | # if hp.decay_learning_rate: 90 | # self.learning_rate = _learning_rate_decay(hp.initial_learning_rate, global_step) 91 | # else: 92 | # self.learning_rate = tf.convert_to_tensor(hp.initial_learning_rate) 93 | # optimizer = tf.train.AdamOptimizer(self.learning_rate, hp.adam_beta1, hp.adam_beta2) 94 | # gradients, variables = zip(*optimizer.compute_gradients(self.loss)) 95 | # self.gradients = gradients 96 | # clipped_gradients, _ = tf.clip_by_global_norm(gradients, 1.0) 97 | # 98 | # # Add dependency on UPDATE_OPS; otherwise batchnorm won't work correctly. See: 99 | # # https://github.com/tensorflow/tensorflow/issues/1122 100 | # with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)): 101 | # self.optimize = optimizer.apply_gradients(zip(clipped_gradients, variables), 102 | # global_step=global_step) 103 | 104 | 105 | def _learning_rate_decay(init_lr, global_step): 106 | # Noam scheme from tensor2tensor: 107 | warmup_steps = 4000.0 108 | step = tf.cast(global_step + 1, dtype=tf.float32) 109 | return init_lr * warmup_steps ** 0.5 * tf.minimum(step * warmup_steps ** -1.5, step ** -0.5) 110 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from datetime import datetime 3 | import math 4 | import os 5 | import subprocess 6 | import time 7 | import tensorflow as tf 8 | import traceback 9 | import sys 10 | from datasets.datafeeder import DataFeeder 11 | from hparams import hparams, hparams_debug_string 12 | from models import create_model 13 | from text import sequence_to_text 14 | from util import audio, infolog, plot, ValueWindow 15 | 16 | log = infolog.log 17 | 18 | 19 | def get_git_commit(): 20 | subprocess.check_output(['git', 'diff-index', '--quiet', 'HEAD']) # Verify client is clean 21 | commit = subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode().strip()[:10] 22 | log('Git commit: %s' % commit) 23 | return commit 24 | 25 | 26 | def add_stats(model): 27 | with tf.variable_scope('stats') as scope: 28 | tf.summary.histogram('linear_outputs', model.linear_outputs) 29 | tf.summary.histogram('linear_targets', model.linear_targets) 30 | tf.summary.histogram('mel_outputs', model.mel_outputs) 31 | tf.summary.histogram('mel_targets', model.mel_targets) 32 | tf.summary.scalar('loss_mel', model.mel_loss) 33 | tf.summary.scalar('loss_linear', model.linear_loss) 34 | tf.summary.scalar('learning_rate', model.learning_rate) 35 | tf.summary.scalar('loss', model.loss) 36 | gradient_norms = [tf.norm(grad) for grad in model.gradients] 37 | 
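# Note (added): add_stats assumes model.add_loss() and model.add_optimizer() have already
# populated the loss, learning_rate and gradients fields (train() below calls them first);
# the gradient-norm summaries written below make exploding gradients easy to spot in TensorBoard.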
tf.summary.histogram('gradient_norm', gradient_norms) 38 | tf.summary.scalar('max_gradient_norm', tf.reduce_max(gradient_norms)) 39 | return tf.summary.merge_all() 40 | 41 | 42 | def time_string(): 43 | return datetime.now().strftime('%Y-%m-%d %H:%M') 44 | 45 | 46 | def train(log_dir, args): 47 | commit = get_git_commit() if args.git else 'None' 48 | checkpoint_path = os.path.join(log_dir, 'model.ckpt') 49 | input_path = os.path.join(args.base_dir, args.input) 50 | log('Checkpoint path: %s' % checkpoint_path) 51 | log('Loading training data from: %s' % input_path) 52 | log('Using model: %s' % args.model) 53 | log(hparams_debug_string()) 54 | 55 | # Set up DataFeeder: 56 | coord = tf.train.Coordinator() 57 | with tf.variable_scope('datafeeder') as scope: 58 | feeder = DataFeeder(coord, input_path, hparams) 59 | 60 | # Set up model: 61 | global_step = tf.Variable(0, name='global_step', trainable=False) 62 | with tf.variable_scope('model') as scope: 63 | model = create_model(args.model, hparams) 64 | model.initialize(feeder.inputs, feeder.input_lengths, feeder.mel_targets, feeder.linear_targets, feeder.stop_token_targets) 65 | model.add_loss() 66 | model.add_optimizer(global_step) 67 | stats = add_stats(model) 68 | 69 | # Bookkeeping: 70 | step = 0 71 | time_window = ValueWindow(100) 72 | loss_window = ValueWindow(100) 73 | saver = tf.train.Saver(max_to_keep=5, keep_checkpoint_every_n_hours=2) 74 | 75 | # Train! 76 | with tf.Session() as sess: 77 | try: 78 | summary_writer = tf.summary.FileWriter(log_dir, sess.graph) 79 | sess.run(tf.global_variables_initializer()) 80 | 81 | if args.restore_step: 82 | # Restore from a checkpoint if the user requested it. 83 | restore_path = '%s-%d' % (checkpoint_path, args.restore_step) 84 | saver.restore(sess, restore_path) 85 | log('Resuming from checkpoint: %s at commit: %s' % (restore_path, commit), slack=True) 86 | else: 87 | log('Starting new training run at commit: %s' % commit, slack=True) 88 | 89 | feeder.start_in_session(sess) 90 | 91 | while not coord.should_stop(): 92 | start_time = time.time() 93 | step, loss, opt = sess.run([global_step, model.loss, model.optimize]) 94 | time_window.append(time.time() - start_time) 95 | loss_window.append(loss) 96 | message = 'Step %-7d [%.03f sec/step, loss=%.05f, avg_loss=%.05f]' % ( 97 | step, time_window.average, loss, loss_window.average) 98 | log(message, slack=(step % args.checkpoint_interval == 0)) 99 | 100 | if loss > 100 or math.isnan(loss): 101 | log('Loss exploded to %.05f at step %d!' 
% (loss, step), slack=True) 102 | raise Exception('Loss Exploded') 103 | 104 | if step % args.summary_interval == 0: 105 | log('Writing summary at step: %d' % step) 106 | summary_writer.add_summary(sess.run(stats), step) 107 | 108 | if step % args.checkpoint_interval == 0: 109 | log('Saving checkpoint to: %s-%d' % (checkpoint_path, step)) 110 | saver.save(sess, checkpoint_path, global_step=step) 111 | log('Saving audio and alignment...') 112 | input_seq, spectrogram, alignment = sess.run([ 113 | model.inputs[0], model.linear_outputs[0], model.alignments[0]]) 114 | waveform = audio.inv_spectrogram(spectrogram.T) 115 | audio.save_wav(waveform, os.path.join(log_dir, 'step-%d-audio.wav' % step)) 116 | plot.plot_alignment(alignment, os.path.join(log_dir, 'step-%d-align.png' % step), 117 | info='%s, %s, %s, step=%d, loss=%.5f' % ( 118 | args.model, commit, time_string(), step, loss)) 119 | log('Input: %s' % sequence_to_text(input_seq)) 120 | 121 | except Exception as e: 122 | log('Exiting due to exception: %s' % e, slack=True) 123 | traceback.print_exc() 124 | coord.request_stop(e) 125 | 126 | 127 | def main(): 128 | parser = argparse.ArgumentParser() 129 | parser.add_argument('--base_dir', default=os.path.expanduser('~/tacotron/Tacotron2/')) 130 | parser.add_argument('--input', default='training/train.txt') 131 | parser.add_argument('--model', default='tacotron') 132 | parser.add_argument('--name', help='Name of the run. Used for logging. Defaults to model name.') 133 | parser.add_argument('--hparams', default='', 134 | help='Hyperparameter overrides as a comma-separated list of name=value pairs') 135 | parser.add_argument('--restore_step', type=int, help='Global step to restore from checkpoint.') 136 | parser.add_argument('--summary_interval', type=int, default=100, 137 | help='Steps between running summary ops.') 138 | parser.add_argument('--checkpoint_interval', type=int, default=1000, 139 | help='Steps between writing checkpoints.') 140 | parser.add_argument('--slack_url', help='Slack webhook URL to get periodic reports.') 141 | parser.add_argument('--tf_log_level', type=int, default=1, help='Tensorflow C++ log level.') 142 | parser.add_argument('--git', action='store_true', help='If set, verify that the client is clean.') 143 | parser.add_argument('--gpu', default='1') 144 | args = parser.parse_args() 145 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = str(args.tf_log_level) 146 | 147 | 148 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 149 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu 150 | run_name = args.name or args.model 151 | hparams.parse(args.hparams) 152 | attention_name = hparams.attention_type 153 | print(attention_name) 154 | log_dir = os.path.join(args.base_dir, 'logs-%s-%s' % (run_name, attention_name)) 155 | os.makedirs(log_dir, exist_ok=True) 156 | infolog.init(os.path.join(log_dir, 'train.log'), run_name, args.slack_url) 157 | train(log_dir, args) 158 | 159 | 160 | if __name__ == '__main__': 161 | main() 162 | -------------------------------------------------------------------------------- /datasets/datafeeder.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import random 4 | import tensorflow as tf 5 | import threading 6 | import time 7 | import traceback 8 | from text import text_to_sequence 9 | from util.infolog import log 10 | 11 | _batches_per_group = 32 12 | # _p_cmudict = 0.5 13 | _pad = 0 14 | _stop_token_pad = 1 15 | 16 | 17 | class DataFeeder(threading.Thread): 18 | '''Feeds batches of data 
into a queue on a background thread.''' 19 | 20 | def __init__(self, coordinator, metadata_filename, hparams): 21 | super(DataFeeder, self).__init__() 22 | self._coord = coordinator 23 | self._hparams = hparams 24 | self._cleaner_names = [x.strip() for x in hparams.cleaners.split(',')] 25 | self._offset = 0 26 | 27 | # Load metadata: 28 | self._datadir = os.path.dirname(metadata_filename) 29 | with open(metadata_filename, encoding='utf-8') as f: 30 | self._metadata = [line.strip().split('|') for line in f] 31 | hours = sum((int(x[2]) for x in self._metadata)) * hparams.frame_shift_ms / (3600 * 1000) 32 | log('Loaded metadata for %d examples (%.2f hours)' % (len(self._metadata), hours)) 33 | 34 | # Create placeholders for inputs and targets. Don't specify batch size because we want to 35 | # be able to feed different sized batches at eval time. 36 | self._placeholders = [ 37 | tf.placeholder(tf.int32, [None, None], 'inputs'), 38 | tf.placeholder(tf.int32, [None], 'input_lengths'), 39 | tf.placeholder(tf.float32, [None, None, hparams.num_mels], 'mel_targets'), 40 | tf.placeholder(tf.float32, [None, None, hparams.num_freq], 'linear_targets'), 41 | tf.placeholder(tf.float32, [None, None], 'stop_token_targets') 42 | ] 43 | 44 | # Create queue for buffering data: 45 | queue = tf.FIFOQueue(8, [tf.int32, tf.int32, tf.float32, tf.float32, tf.float32], name='input_queue') 46 | self._enqueue_op = queue.enqueue(self._placeholders) 47 | self.inputs, self.input_lengths, self.mel_targets, self.linear_targets, self.stop_token_targets = queue.dequeue() 48 | self.inputs.set_shape(self._placeholders[0].shape) 49 | self.input_lengths.set_shape(self._placeholders[1].shape) 50 | self.mel_targets.set_shape(self._placeholders[2].shape) 51 | self.linear_targets.set_shape(self._placeholders[3].shape) 52 | self.stop_token_targets.set_shape(self._placeholders[4].shape) 53 | self._cmudict = None 54 | 55 | # # Load CMUDict: If enabled, this will randomly substitute some words in the training data with 56 | # # their ARPABet equivalents, which will allow you to also pass ARPABet to the model for 57 | # # synthesis (useful for proper nouns, etc.) 
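# (Added note: the CMUDict substitution below is disabled in this code; inputs are built
# solely via text_to_sequence with the cleaners listed in hparams.cleaners.)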
58 | # if hparams.use_cmudict: 59 | # cmudict_path = os.path.join(self._datadir, 'cmudict-0.7b') 60 | # if not os.path.isfile(cmudict_path): 61 | # raise Exception('If use_cmudict=True, you must download ' + 62 | # 'http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b to %s' % cmudict_path) 63 | # self._cmudict = cmudict.CMUDict(cmudict_path, keep_ambiguous=False) 64 | # log('Loaded CMUDict with %d unambiguous entries' % len(self._cmudict)) 65 | # else: 66 | # self._cmudict = None 67 | 68 | def start_in_session(self, session): 69 | self._session = session 70 | self.start() 71 | 72 | def run(self): 73 | try: 74 | while not self._coord.should_stop(): 75 | self._enqueue_next_group() 76 | except Exception as e: 77 | traceback.print_exc() 78 | self._coord.request_stop(e) 79 | 80 | def _enqueue_next_group(self): 81 | start = time.time() 82 | 83 | # Read a group of examples: 84 | n = self._hparams.batch_size 85 | r = self._hparams.outputs_per_step 86 | examples = [self._get_next_example() for i in range(n * _batches_per_group)] 87 | 88 | # Bucket examples based on similar output sequence length for efficiency: 89 | examples.sort(key=lambda x: x[-1]) 90 | batches = [examples[i:i + n] for i in range(0, len(examples), n)] 91 | random.shuffle(batches) 92 | 93 | log('Generated %d batches of size %d in %.03f sec' % (len(batches), n, time.time() - start)) 94 | for batch in batches: 95 | feed_dict = dict(zip(self._placeholders, _prepare_batch(batch, r))) 96 | self._session.run(self._enqueue_op, feed_dict=feed_dict) 97 | 98 | def _get_next_example(self): 99 | '''Loads a single example (input, mel_target, linear_target, cost) from disk''' 100 | if self._offset >= len(self._metadata): 101 | self._offset = 0 102 | random.shuffle(self._metadata) 103 | meta = self._metadata[self._offset] 104 | self._offset += 1 105 | 106 | text = meta[3] 107 | # if self._cmudict and random.random() < _p_cmudict: 108 | # text = ' '.join([self._maybe_get_arpabet(word) for word in text.split(' ')]) 109 | 110 | input_data = np.asarray(text_to_sequence(text, self._cleaner_names), dtype=np.int32) 111 | linear_target = np.load(os.path.join(self._datadir, meta[0])) 112 | mel_target = np.load(os.path.join(self._datadir, meta[1])) 113 | stop_token_target = np.asarray([0.] 
* len(mel_target)) 114 | return (input_data, mel_target, linear_target, stop_token_target, len(linear_target)) 115 | 116 | def _maybe_get_arpabet(self, word): 117 | arpabet = self._cmudict.lookup(word) 118 | return '{%s}' % arpabet[0] if arpabet is not None and random.random() < 0.5 else word 119 | 120 | 121 | def _prepare_batch(batch, outputs_per_step): 122 | random.shuffle(batch) 123 | inputs = _prepare_inputs([x[0] for x in batch]) 124 | input_lengths = np.asarray([len(x[0]) for x in batch], dtype=np.int32) 125 | mel_targets = _prepare_targets([x[1] for x in batch], outputs_per_step) 126 | linear_targets = _prepare_targets([x[2] for x in batch], outputs_per_step) 127 | stop_token_targets = _prepare_stop_token_targets([x[3] for x in batch], outputs_per_step) 128 | return (inputs, input_lengths, mel_targets, linear_targets, stop_token_targets) 129 | 130 | 131 | def _prepare_inputs(inputs): 132 | max_len = max((len(x) for x in inputs)) 133 | return np.stack([_pad_input(x, max_len) for x in inputs]) 134 | 135 | 136 | def _prepare_targets(targets, alignment): 137 | max_len = max((len(t) for t in targets)) + 1 138 | return np.stack([_pad_target(t, _round_up(max_len, alignment)) for t in targets]) 139 | 140 | def _prepare_stop_token_targets(targets, alignment): 141 | max_len = max((len(t) for t in targets)) + 1 142 | return np.stack([_pad_stop_token_target(t, _round_up(max_len, alignment)) for t in targets]) 143 | 144 | def _pad_input(x, length): 145 | return np.pad(x, (0, length - x.shape[0]), mode='constant', constant_values=_pad) 146 | 147 | 148 | def _pad_target(t, length): 149 | return np.pad(t, [(0, length - t.shape[0]), (0, 0)], mode='constant', constant_values=_pad) 150 | 151 | def _pad_stop_token_target(t, length): 152 | return np.pad(t, (0, length - t.shape[0]), mode='constant', constant_values=_stop_token_pad) 153 | 154 | def _round_up(x, multiple): 155 | remainder = x % multiple 156 | return x if remainder == 0 else x + multiple - remainder 157 | -------------------------------------------------------------------------------- /text/korean.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # Code based on carpedm20 3 | 4 | import re 5 | import os 6 | import ast 7 | import json 8 | from jamo import hangul_to_jamo, h2j, j2h 9 | 10 | from .kor_dic import english_dictionary, etc_dictionary 11 | 12 | PAD = '_' 13 | EOS = '~' 14 | PUNC = '!\'(),-.:;?' 
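# The symbol inventory below uses decomposed (conjoining) jamo: leads U+1100-U+1112,
# vowels U+1161-U+1175 and tails U+11A8-U+11C2, plus the punctuation above and a space.
# PAD and EOS are prepended to ALL_SYMBOLS, so char_to_id maps them to IDs 0 and 1.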
15 | SPACE = ' ' 16 | 17 | JAMO_LEADS = "".join([chr(_) for _ in range(0x1100, 0x1113)]) 18 | JAMO_VOWELS = "".join([chr(_) for _ in range(0x1161, 0x1176)]) 19 | JAMO_TAILS = "".join([chr(_) for _ in range(0x11A8, 0x11C3)]) 20 | 21 | VALID_CHARS = JAMO_LEADS + JAMO_VOWELS + JAMO_TAILS + PUNC + SPACE 22 | ALL_SYMBOLS = PAD + EOS + VALID_CHARS 23 | 24 | char_to_id = {c: i for i, c in enumerate(ALL_SYMBOLS)} 25 | id_to_char = {i: c for i, c in enumerate(ALL_SYMBOLS)} 26 | 27 | quote_checker = """([`"'"“‘])(.+?)([`"'"”’])""" 28 | 29 | 30 | def is_lead(char): 31 | return char in JAMO_LEADS 32 | 33 | 34 | def is_vowel(char): 35 | return char in JAMO_VOWELS 36 | 37 | 38 | def is_tail(char): 39 | return char in JAMO_TAILS 40 | 41 | 42 | def get_mode(char): 43 | if is_lead(char): 44 | return 0 45 | elif is_vowel(char): 46 | return 1 47 | elif is_tail(char): 48 | return 2 49 | else: 50 | return -1 51 | 52 | 53 | def _get_text_from_candidates(candidates): 54 | if len(candidates) == 0: 55 | return "" 56 | elif len(candidates) == 1: 57 | return _jamo_char_to_hcj(candidates[0]) 58 | else: 59 | return j2h(**dict(zip(["lead", "vowel", "tail"], candidates))) 60 | 61 | 62 | def jamo_to_korean(text): 63 | text = h2j(text) 64 | 65 | idx = 0 66 | new_text = "" 67 | candidates = [] 68 | 69 | while True: 70 | if idx >= len(text): 71 | new_text += _get_text_from_candidates(candidates) 72 | break 73 | 74 | char = text[idx] 75 | mode = get_mode(char) 76 | 77 | if mode == 0: 78 | new_text += _get_text_from_candidates(candidates) 79 | candidates = [char] 80 | elif mode == -1: 81 | new_text += _get_text_from_candidates(candidates) 82 | new_text += char 83 | candidates = [] 84 | else: 85 | candidates.append(char) 86 | 87 | idx += 1 88 | return new_text 89 | 90 | 91 | num_to_kor = { 92 | '0': '영', 93 | '1': '일', 94 | '2': '이', 95 | '3': '삼', 96 | '4': '사', 97 | '5': '오', 98 | '6': '육', 99 | '7': '칠', 100 | '8': '팔', 101 | '9': '구', 102 | } 103 | 104 | unit_to_kor1 = { 105 | '%': '퍼센트', 106 | 'cm': '센치미터', 107 | 'mm': '밀리미터', 108 | 'km': '킬로미터', 109 | 'kg': '킬로그람', 110 | } 111 | unit_to_kor2 = { 112 | 'm': '미터', 113 | } 114 | 115 | upper_to_kor = { 116 | 'A': '에이', 117 | 'B': '비', 118 | 'C': '씨', 119 | 'D': '디', 120 | 'E': '이', 121 | 'F': '에프', 122 | 'G': '지', 123 | 'H': '에이치', 124 | 'I': '아이', 125 | 'J': '제이', 126 | 'K': '케이', 127 | 'L': '엘', 128 | 'M': '엠', 129 | 'N': '엔', 130 | 'O': '오', 131 | 'P': '피', 132 | 'Q': '큐', 133 | 'R': '알', 134 | 'S': '에스', 135 | 'T': '티', 136 | 'U': '유', 137 | 'V': '브이', 138 | 'W': '더블유', 139 | 'X': '엑스', 140 | 'Y': '와이', 141 | 'Z': '지', 142 | } 143 | 144 | 145 | def compare_sentence_with_jamo(text1, text2): 146 | return h2j(text1) != h2j(text2) 147 | 148 | 149 | def tokenize(text, as_id=False): 150 | # jamo package에 있는 hangul_to_jamo를 이용하여 한글 string을 초성/중성/종성으로 나눈다. 
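# (In English: use hangul_to_jamo from the jamo package to decompose a Hangul string
# into lead/vowel/tail jamo.)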
151 | text = normalize(text) 152 | tokens = list(hangul_to_jamo(text)) # '존경하는' --> ['ᄌ', 'ᅩ', 'ᆫ', 'ᄀ', 'ᅧ', 'ᆼ', 'ᄒ', 'ᅡ', 'ᄂ', 'ᅳ', 'ᆫ', '~'] 153 | 154 | if as_id: 155 | return [char_to_id[token] for token in tokens] + [char_to_id[EOS]] 156 | else: 157 | return [token for token in tokens] + [EOS] 158 | 159 | 160 | def tokenizer_fn(iterator): 161 | return (token for x in iterator for token in tokenize(x, as_id=False)) 162 | 163 | 164 | def normalize(text): 165 | text = text.strip() 166 | 167 | text = re.sub('\(\d+일\)', '', text) 168 | text = re.sub('\([⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+\)', '', text) 169 | 170 | text = normalize_with_dictionary(text, etc_dictionary) 171 | text = normalize_english(text) 172 | text = re.sub('[a-zA-Z]+', normalize_upper, text) 173 | 174 | text = normalize_quote(text) 175 | text = normalize_number(text) 176 | 177 | return text 178 | 179 | 180 | def normalize_with_dictionary(text, dic): 181 | if any(key in text for key in dic.keys()): 182 | pattern = re.compile('|'.join(re.escape(key) for key in dic.keys())) 183 | return pattern.sub(lambda x: dic[x.group()], text) 184 | else: 185 | return text 186 | 187 | 188 | def normalize_english(text): 189 | def fn(m): 190 | word = m.group() 191 | if word in english_dictionary: 192 | return english_dictionary.get(word) 193 | else: 194 | return word 195 | 196 | text = re.sub("([A-Za-z]+)", fn, text) 197 | return text 198 | 199 | 200 | def normalize_upper(text): 201 | text = text.group(0) 202 | 203 | if all([char.isupper() for char in text]): 204 | return "".join(upper_to_kor[char] for char in text) 205 | else: 206 | return text 207 | 208 | 209 | def normalize_quote(text): 210 | def fn(found_text): 211 | from nltk import sent_tokenize # NLTK doesn't along with multiprocessing 212 | 213 | found_text = found_text.group() 214 | unquoted_text = found_text[1:-1] 215 | 216 | sentences = sent_tokenize(unquoted_text) 217 | return " ".join(["'{}'".format(sent) for sent in sentences]) 218 | 219 | return re.sub(quote_checker, fn, text) 220 | 221 | 222 | number_checker = "([+-]?\d[\d,]*)[\.]?\d*" 223 | count_checker = "(시|명|가지|살|마리|포기|송이|수|톨|통|점|개|벌|척|채|다발|그루|자루|줄|켤레|그릇|잔|마디|상자|사람|곡|병|판)" 224 | 225 | 226 | def normalize_number(text): 227 | text = normalize_with_dictionary(text, unit_to_kor1) 228 | text = normalize_with_dictionary(text, unit_to_kor2) 229 | text = re.sub(number_checker + count_checker, 230 | lambda x: number_to_korean(x, True), text) 231 | text = re.sub(number_checker, 232 | lambda x: number_to_korean(x, False), text) 233 | return text 234 | 235 | 236 | num_to_kor1 = [""] + list("일이삼사오육칠팔구") 237 | num_to_kor2 = [""] + list("만억조경해") 238 | num_to_kor3 = [""] + list("십백천") 239 | 240 | # count_to_kor1 = [""] + ["하나","둘","셋","넷","다섯","여섯","일곱","여덟","아홉"] 241 | count_to_kor1 = [""] + ["한", "두", "세", "네", "다섯", "여섯", "일곱", "여덟", "아홉"] 242 | 243 | count_tenth_dict = { 244 | "십": "열", 245 | "두십": "스물", 246 | "세십": "서른", 247 | "네십": "마흔", 248 | "다섯십": "쉰", 249 | "여섯십": "예순", 250 | "일곱십": "일흔", 251 | "여덟십": "여든", 252 | "아홉십": "아흔", 253 | } 254 | 255 | 256 | def number_to_korean(num_str, is_count=False): 257 | if is_count: 258 | num_str, unit_str = num_str.group(1), num_str.group(2) 259 | else: 260 | num_str, unit_str = num_str.group(), "" 261 | 262 | num_str = num_str.replace(',', '') 263 | num = ast.literal_eval(num_str) 264 | 265 | if num == 0: 266 | return "영" 267 | 268 | check_float = num_str.split('.') 269 | if len(check_float) == 2: 270 | digit_str, float_str = check_float 271 | elif len(check_float) >= 3: 272 | raise 
Exception(" [!] Wrong number format") 273 | else: 274 | digit_str, float_str = check_float[0], None 275 | 276 | if is_count and float_str is not None: 277 | raise Exception(" [!] `is_count` and float number does not fit each other") 278 | 279 | digit = int(digit_str) 280 | 281 | if digit_str.startswith("-"): 282 | digit, digit_str = abs(digit), str(abs(digit)) 283 | 284 | kor = "" 285 | size = len(str(digit)) 286 | tmp = [] 287 | 288 | for i, v in enumerate(digit_str, start=1): 289 | v = int(v) 290 | 291 | if v != 0: 292 | if is_count: 293 | tmp += count_to_kor1[v] 294 | else: 295 | tmp += num_to_kor1[v] 296 | 297 | tmp += num_to_kor3[(size - i) % 4] 298 | 299 | if (size - i) % 4 == 0 and len(tmp) != 0: 300 | kor += "".join(tmp) 301 | tmp = [] 302 | kor += num_to_kor2[int((size - i) / 4)] 303 | 304 | if is_count: 305 | if kor.startswith("한") and len(kor) > 1: 306 | kor = kor[1:] 307 | 308 | if any(word in kor for word in count_tenth_dict): 309 | kor = re.sub( 310 | '|'.join(count_tenth_dict.keys()), 311 | lambda x: count_tenth_dict[x.group()], kor) 312 | 313 | if not is_count and kor.startswith("일") and len(kor) > 1: 314 | kor = kor[1:] 315 | 316 | if float_str is not None: 317 | kor += "쩜 " 318 | kor += re.sub('\d', lambda x: num_to_kor[x.group()], float_str) 319 | 320 | if num_str.startswith("+"): 321 | kor = "플러스 " + kor 322 | elif num_str.startswith("-"): 323 | kor = "마이너스 " + kor 324 | 325 | return kor + unit_str 326 | 327 | 328 | if __name__ == "__main__": 329 | def test_normalize(text): 330 | print(text) 331 | print(normalize(text)) 332 | print("=" * 30) 333 | 334 | 335 | test_normalize("JTBC는 JTBCs를 DY는 A가 Absolute") 336 | test_normalize("오늘(13일) 3,600마리 강아지가") 337 | test_normalize("60.3%") 338 | test_normalize('"저돌"(猪突) 입니다.') 339 | test_normalize('비대위원장이 지난 1월 이런 말을 했습니다. “난 그냥 산돼지처럼 돌파하는 스타일이다”') 340 | test_normalize("지금은 -12.35%였고 종류는 5가지와 19가지, 그리고 55가지였다") 341 | test_normalize("JTBC는 TH와 K 양이 2017년 9월 12일 오후 12시에 24살이 된다") 342 | print(list(hangul_to_jamo(list(hangul_to_jamo('비대위원장이 지난 1월 이런 말을 했습니다? “난 그냥 산돼지처럼 돌파하는 스타일이다”'))))) -------------------------------------------------------------------------------- /models/tacotron2.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.contrib.rnn import GRUCell, MultiRNNCell, OutputProjectionWrapper, ResidualWrapper 3 | from tensorflow.contrib.seq2seq import BasicDecoder, BahdanauAttention, AttentionWrapper, BahdanauMonotonicAttention, LuongAttention 4 | from text.symbols import symbols 5 | from util.infolog import log 6 | from .helpers import TacoTestHelper, TacoTrainingHelper 7 | from .modules import encoder_cbhg, post_cbhg, prenet, LocationSensitiveAttention, ZoneoutLSTMCell, GmmAttention, BahdanauStepwiseMonotonicAttention 8 | from .rnn_wrappers import DecoderPrenetWrapper, ConcatOutputAndAttentionWrapper 9 | 10 | 11 | class Tacotron2(): 12 | def __init__(self, hparams): 13 | self._hparams = hparams 14 | 15 | def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None, stop_token_targets=None): 16 | '''Initializes the model for inference. 17 | 18 | Sets "mel_outputs", "linear_outputs", and "alignments" fields. 19 | 20 | Args: 21 | inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of 22 | steps in the input time series, and values are character IDs 23 | input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths 24 | of each sequence in inputs. 
25 | mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number 26 | of steps in the output time series, M is num_mels, and values are entries in the mel 27 | spectrogram. Only needed for training. 28 | linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number 29 | of steps in the output time series, F is num_freq, and values are entries in the linear 30 | spectrogram. Only needed for training. 31 | ''' 32 | with tf.variable_scope('inference') as scope: 33 | is_training = linear_targets is not None 34 | batch_size = tf.shape(inputs)[0] 35 | hp = self._hparams 36 | 37 | # Embeddings 38 | embedding_table = tf.get_variable( 39 | 'embedding', [len(symbols), hp.embed_depth], dtype=tf.float32, 40 | initializer=tf.truncated_normal_initializer(stddev=0.5)) 41 | 42 | embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs) # [N, T_in, embed_depth=256] 43 | 44 | with tf.variable_scope('Encoder') as scope: 45 | 46 | x = embedded_inputs 47 | 48 | #3 Conv Layers 49 | for i in range(3): 50 | x = tf.layers.conv1d(x,filters=512,kernel_size=5,padding='same',activation=tf.nn.relu,name='Encoder_{}'.format(i)) 51 | x = tf.layers.batch_normalization(x, training=is_training) 52 | x = tf.layers.dropout(x, rate=0.5, training=is_training, name='dropout_{}'.format(i)) 53 | encoder_conv_output = x 54 | 55 | #bi-directional LSTM 56 | cell_fw= ZoneoutLSTMCell(256, is_training, zoneout_factor_cell=0.1, zoneout_factor_output=0.1, name='encoder_fw_LSTM') 57 | cell_bw= ZoneoutLSTMCell(256, is_training, zoneout_factor_cell=0.1, zoneout_factor_output=0.1, name='encoder_bw_LSTM') 58 | 59 | outputs, states = tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, encoder_conv_output, sequence_length=input_lengths, dtype=tf.float32) 60 | 61 | # envoder_outpust = [N,T,2*encoder_lstm_units] = [N,T,512] 62 | encoder_outputs = tf.concat(outputs, axis=2) # Concat and return forward + backward outputs 63 | 64 | with tf.variable_scope('Decoder') as scope: 65 | 66 | if hp.attention_type == 'loc_sen': # Location Sensitivity Attention 67 | attention_mechanism = LocationSensitiveAttention(128, encoder_outputs,hparams=hp, is_training=is_training, 68 | mask_encoder=True, memory_sequence_length = input_lengths, smoothing=False, cumulate_weights=True) 69 | elif hp.attention_type == 'gmm': # GMM Attention 70 | attention_mechanism = GmmAttention(128, memory=encoder_outputs, memory_sequence_length = input_lengths) 71 | elif hp.attention_type == 'step_bah': 72 | attention_mechanism = BahdanauStepwiseMonotonicAttention(128, encoder_outputs, memory_sequence_length = input_lengths, mode="parallel") 73 | elif hp.attention_type == 'mon_bah': 74 | attention_mechanism = BahdanauMonotonicAttention(128, encoder_outputs, memory_sequence_length = input_lengths, normalize=True) 75 | elif hp.attention_type == 'loung': 76 | attention_mechanism = LuongAttention(128, encoder_outputs, memory_sequence_length = input_lengths) 77 | 78 | # attention_mechanism = LocationSensitiveAttention(128, encoder_outputs, hparams=hp, is_training=is_training, mask_encoder=True, memory_sequence_length = input_lengths, smoothing=False, cumulate_weights=True) 79 | #mask_encoder: whether to mask encoder padding while computing location sensitive attention. Set to True for better prosody but slower convergence. 
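# Note (added): hp.attention_type selects one of the five mechanisms above and can be
# overridden at launch time, e.g. --hparams='attention_type=gmm' (parsed in train.py);
# any other value leaves attention_mechanism unassigned and raises a NameError when the
# AttentionWrapper below is built.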
80 | #cumulate_weights: Whether to cumulate (sum) all previous attention weights or simply feed previous weights (Recommended: True) 81 | 82 | decoder_lstm = [ZoneoutLSTMCell(1024, is_training, zoneout_factor_cell=0.1, zoneout_factor_output=0.1, name='decoder_LSTM_{}'.format(i+1)) for i in range(2)] 83 | 84 | decoder_lstm = tf.contrib.rnn.MultiRNNCell(decoder_lstm, state_is_tuple=True) 85 | # decoder_init_state = decoder_lstm.zero_state(batch_size=batch_size, dtype=tf.float32) #tensorflow1에는 없음 86 | 87 | attention_cell = AttentionWrapper(decoder_lstm, attention_mechanism, alignment_history=True, output_attention=False) 88 | 89 | # attention_state_size = 256 90 | # Decoder input -> prenet -> decoder_lstm -> concat[output, attention] 91 | dec_outputs = DecoderPrenetWrapper(attention_cell, is_training, hp.prenet_depths) 92 | dec_outputs_cell = OutputProjectionWrapper(dec_outputs,(hp.num_mels) * hp.outputs_per_step) 93 | 94 | if is_training: 95 | helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels, hp.outputs_per_step) 96 | else: 97 | helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step) 98 | 99 | decoder_init_state = dec_outputs_cell.zero_state(batch_size=batch_size, dtype=tf.float32) 100 | (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode( 101 | BasicDecoder(dec_outputs_cell, helper, decoder_init_state), 102 | maximum_iterations=hp.max_iters) # [N, T_out/r, M*r] 103 | 104 | # Reshape outputs to be one output per entry 105 | decoder_mel_outputs = tf.reshape(decoder_outputs[:,:,:hp.num_mels * hp.outputs_per_step], [batch_size, -1, hp.num_mels]) # [N, T_out, M] 106 | #stop_token_outputs = tf.reshape(decoder_outputs[:,:,hp.num_mels * hp.outputs_per_step:], [batch_size, -1]) # [N,iters] 107 | 108 | # Postnet 109 | x = decoder_mel_outputs 110 | for i in range(5): 111 | activation = tf.nn.tanh if i != (4) else None 112 | x = tf.layers.conv1d(x,filters=512, kernel_size=5, padding='same', activation=activation, name='Postnet_{}'.format(i)) 113 | x = tf.layers.batch_normalization(x, training=is_training) 114 | x = tf.layers.dropout(x, rate=0.5, training=is_training, name='Postnet_dropout_{}'.format(i)) 115 | 116 | residual = tf.layers.dense(x, hp.num_mels, name='residual_projection') 117 | mel_outputs = decoder_mel_outputs + residual 118 | 119 | # Add post-processing CBHG: 120 | # mel_outputs: (N,T,num_mels) 121 | post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training, hp.postnet_depth) 122 | linear_outputs = tf.layers.dense(post_outputs, hp.num_freq) # [N, T_out, F(1025)] 123 | 124 | # Grab alignments from the final decoder state: 125 | alignments = tf.transpose(final_decoder_state.alignment_history.stack(), [1, 2, 0]) # batch_size, text length(encoder), target length(decoder) 126 | 127 | 128 | self.inputs = inputs 129 | self.input_lengths = input_lengths 130 | self.decoder_mel_outputs = decoder_mel_outputs 131 | self.mel_outputs = mel_outputs 132 | self.linear_outputs = linear_outputs 133 | self.alignments = alignments 134 | self.mel_targets = mel_targets 135 | self.linear_targets = linear_targets 136 | #self.stop_token_targets = stop_token_targets 137 | #self.stop_token_outputs = stop_token_outputs 138 | self.all_vars = tf.trainable_variables() 139 | log('Initialized Tacotron model. 
Dimensions: ') 140 | log(' embedding: %d' % embedded_inputs.shape[-1]) 141 | # log(' prenet out: %d' % prenet_outputs.shape[-1]) 142 | log(' encoder out: %d' % encoder_outputs.shape[-1]) 143 | log(' attention out: %d' % attention_cell.output_size) 144 | #log(' concat attn & out: %d' % concat_cell.output_size) 145 | log(' decoder cell out: %d' % dec_outputs_cell.output_size) 146 | log(' decoder out (%d frames): %d' % (hp.outputs_per_step, decoder_outputs.shape[-1])) 147 | log(' decoder out (1 frame): %d' % mel_outputs.shape[-1]) 148 | log(' postnet out: %d' % post_outputs.shape[-1]) 149 | log(' linear out: %d' % linear_outputs.shape[-1]) 150 | 151 | def add_loss(self): 152 | '''Adds loss to the model. Sets "loss" field. initialize must have been called.''' 153 | with tf.variable_scope('loss') as scope: 154 | hp = self._hparams 155 | before = tf.losses.mean_squared_error(self.mel_targets, self.decoder_mel_outputs) 156 | after = tf.losses.mean_squared_error(self.mel_targets, self.mel_outputs) 157 | 158 | self.mel_loss = before + after 159 | 160 | 161 | #self.stop_token_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=self.stop_token_targets, logits=self.stop_token_outputs)) 162 | 163 | l1 = tf.abs(self.linear_targets - self.linear_outputs) 164 | # Prioritize loss for frequencies under 3000 Hz. 165 | n_priority_freq = int(3000 / (hp.sample_rate * 0.5) * hp.num_freq) 166 | self.linear_loss = 0.5 * tf.reduce_mean(l1) + 0.5 * tf.reduce_mean(l1[:, :, 0:n_priority_freq]) 167 | 168 | self.regularization = tf.add_n([tf.nn.l2_loss(v) for v in self.all_vars 169 | if not('bias' in v.name or 'Bias' in v.name or '_projection' in v.name or 'inputs_embedding' in v.name 170 | or 'RNN' in v.name or 'LSTM' in v.name)]) * hp.reg_weight 171 | self.loss = self.mel_loss + self.linear_loss + self.regularization 172 | 173 | def add_optimizer(self, global_step): 174 | '''Adds optimizer. Sets "gradients" and "optimize" fields. add_loss must have been called. 175 | 176 | Args: 177 | global_step: int32 scalar Tensor representing current global step in training 178 | ''' 179 | with tf.variable_scope('optimizer') as scope: 180 | hp = self._hparams 181 | if hp.decay_learning_rate: 182 | self.learning_rate = _learning_rate_decay(hp.initial_learning_rate, global_step) 183 | else: 184 | self.learning_rate = tf.convert_to_tensor(hp.initial_learning_rate) 185 | optimizer = tf.train.AdamOptimizer(self.learning_rate, hp.adam_beta1, hp.adam_beta2) 186 | gradients, variables = zip(*optimizer.compute_gradients(self.loss)) 187 | self.gradients = gradients 188 | clipped_gradients, _ = tf.clip_by_global_norm(gradients, 1.0) 189 | 190 | # Add dependency on UPDATE_OPS; otherwise batchnorm won't work correctly. 
See: 191 | # https://github.com/tensorflow/tensorflow/issues/1122 192 | with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)): 193 | self.optimize = optimizer.apply_gradients(zip(clipped_gradients, variables), 194 | global_step=global_step) 195 | 196 | 197 | def _learning_rate_decay(init_lr, global_step): 198 | # Noam scheme from tensor2tensor: 199 | warmup_steps = 4000.0 200 | step = tf.cast(global_step + 1, dtype=tf.float32) 201 | return init_lr * warmup_steps ** 0.5 * tf.minimum(step * warmup_steps ** -1.5, step ** -0.5) 202 | -------------------------------------------------------------------------------- /models/modules.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from tensorflow.contrib.rnn import RNNCell, GRUCell 4 | from tensorflow.python.ops import rnn_cell_impl 5 | from tensorflow.contrib.framework import nest 6 | from tensorflow.contrib.seq2seq.python.ops.attention_wrapper import _bahdanau_score, _BaseAttentionMechanism, BahdanauAttention, BahdanauMonotonicAttention, AttentionWrapperState, AttentionMechanism, _BaseMonotonicAttentionMechanism, _maybe_mask_score,_prepare_memory, _monotonic_probability_fn 7 | from tensorflow.python.ops import array_ops, math_ops, nn_ops, variable_scope, random_ops 8 | from tensorflow.python.layers.core import Dense 9 | 10 | import functools 11 | _zero_state_tensors = rnn_cell_impl._zero_state_tensors 12 | 13 | ''' 14 | Adding zoneoutLSTMcell and LocationSensitiveAttention function to existing code for Tacotron2 15 | ''' 16 | 17 | def prenet(inputs, is_training, layer_sizes, scope=None): 18 | """ 19 | Args: 20 | inputs: input vector 21 | is_training: dropout option 22 | layer_sizes: iteration number 23 | 24 | Output: 25 | x: prenet 26 | """ 27 | x = inputs 28 | drop_rate = 0.5 if is_training else 0.0 # set dropout rate 0.5 (only training) 29 | with tf.variable_scope(scope or 'prenet'): 30 | for i, size in enumerate(layer_sizes): # iterate layer_sizes 31 | dense = tf.layers.dense(x, units=size, activation=tf.nn.relu, name='dense_%d' % (i + 1)) 32 | x = tf.layers.dropout(dense, rate=drop_rate, training=is_training, name='dropout_%d' % (i + 1)) 33 | return x 34 | 35 | 36 | def encoder_cbhg(inputs, input_lengths, is_training, depth): 37 | """ 38 | Args: 39 | inputs: input tensor 40 | input_lengths: length of input tensor 41 | is_training: Batch Normalization option in Conv1D 42 | depth: dimensionality option of Highway net and Bidirectical GRU's output 43 | 44 | Output: 45 | cbhg function 46 | """ 47 | input_channels = inputs.get_shape()[2] # 3rd element of inputs' shape 48 | return cbhg( 49 | inputs, 50 | input_lengths, 51 | is_training, 52 | scope='encoder_cbhg', 53 | K=16, 54 | projections=[128, input_channels], 55 | depth=depth) 56 | 57 | 58 | def post_cbhg(inputs, input_dim, is_training, depth): 59 | """ 60 | Args: 61 | inputs: input tensor 62 | input_dim: dimension of input tensor 63 | is_training: Batch Normalization option in Conv1D 64 | depth: dimensionality option of Highway net and Bidirectical GRU's output 65 | 66 | Output: 67 | cbhg function 68 | """ 69 | return cbhg( 70 | inputs, 71 | None, 72 | is_training, 73 | scope='post_cbhg', 74 | K=8, 75 | projections=[256, input_dim], 76 | depth=depth) 77 | 78 | 79 | def cbhg(inputs, input_lengths, is_training, scope, K, projections, depth): 80 | """ 81 | Args: 82 | inputs: input tensor 83 | input_lengths: length of input tensor 84 | is_training: Batch Normalization option in Conv1D 85 
| scope: network or model name 86 | K: kernel size range 87 | projections: projection layers option 88 | depth: dimensionality option of Highway net and Bidirectical GRU's output 89 | The layers in the code are staked in the order in which they came out. 90 | """ 91 | with tf.variable_scope(scope): 92 | with tf.variable_scope('conv_bank'): 93 | 94 | conv_outputs = tf.concat( 95 | [conv1d(inputs, k, 128, tf.nn.relu, is_training, 'conv1d_%d' % k) for k in range(1, K + 1)], #1D Convolution layers using multiple types of Convolution Kernel. 96 | axis=-1 #Iterate K with increasing filter size by 1. 97 | )# Convolution bank: concatenate on the last axis to stack channels from all convolutions 98 | 99 | # Maxpooling: 100 | maxpool_output = tf.layers.max_pooling1d( 101 | conv_outputs, 102 | pool_size=2, 103 | strides=1, 104 | padding='same') #1D Maxpooling layer(strides=1, width=2) 105 | 106 | # Two projection layers: 107 | proj1_output = conv1d(maxpool_output, 3, projections[0], tf.nn.relu, is_training, 'proj_1')#1st Conv1D projections 108 | proj2_output = conv1d(proj1_output, 3, projections[1], None, is_training, 'proj_2')#2nd Conv1D projections 109 | 110 | # Residual connection: 111 | highway_input = proj2_output + inputs #Highway net input with residual connection 112 | 113 | half_depth = depth // 2 114 | assert half_depth * 2 == depth, 'encoder and postnet depths must be even.' #assert depth to be even 115 | 116 | # Handle dimensionality mismatch: 117 | if highway_input.shape[2] != half_depth: #check input's dimensionality and output's dimensionality are the same 118 | highway_input = tf.layers.dense(highway_input, half_depth) #change input's channel size to Highway net output's size 119 | 120 | # 4-layer HighwayNet: 121 | for i in range(4): 122 | highway_input = highwaynet(highway_input, 'highway_%d' % (i + 1), half_depth) #make 4 Highway net layers 123 | rnn_input = highway_input 124 | 125 | # Bidirectional GRU 126 | outputs, states = tf.nn.bidirectional_dynamic_rnn( #make Bidirectional GRU 127 | GRUCell(half_depth), 128 | GRUCell(half_depth), 129 | rnn_input, 130 | sequence_length=input_lengths, 131 | dtype=tf.float32) 132 | return tf.concat(outputs, axis=2) # Concat forward sequence and backward sequence 133 | 134 | def highwaynet(inputs, scope, depth): 135 | with tf.variable_scope(scope): 136 | H = tf.layers.dense( 137 | inputs, 138 | units=depth, 139 | activation=tf.nn.relu, 140 | name='H') 141 | T = tf.layers.dense( 142 | inputs, 143 | units=depth, 144 | activation=tf.nn.sigmoid, 145 | name='T', 146 | bias_initializer=tf.constant_initializer(-1.0)) 147 | return H * T + inputs * (1.0 - T) 148 | 149 | 150 | def conv1d(inputs, kernel_size, channels, activation, is_training, scope): 151 | """ 152 | Args: 153 | inputs: input tensor 154 | kernel_size: length of the 1D convolution window 155 | channels: dimensionality of the output space 156 | activation: Activation function (None means linear activation) 157 | is_training: Batch Normalization option in Conv1D 158 | scope: namespace 159 | 160 | Output: 161 | output tensor 162 | """ 163 | with tf.variable_scope(scope): 164 | conv1d_output = tf.layers.conv1d( # creates a convolution kernel 165 | inputs, 166 | filters=channels, 167 | kernel_size=kernel_size, 168 | activation=activation, 169 | padding='same') # return output tensor 170 | return tf.layers.batch_normalization(conv1d_output, training=is_training) 171 | 172 | 173 | class ZoneoutLSTMCell(RNNCell): 174 | '''Wrapper for tf LSTM to create Zoneout LSTM Cell 175 | inspired by: 176 | 
https://github.com/teganmaharaj/zoneout/blob/master/zoneout_tensorflow.py
177 | Published by one of the authors of the zoneout paper (https://arxiv.org/pdf/1606.01305.pdf).
178 | '''
179 | def __init__(self, num_units, is_training, zoneout_factor_cell=0., zoneout_factor_output=0., state_is_tuple=True, name=None):
180 | '''Initializer with possibility to set different zoneout values for cell/hidden states.
181 | '''
182 | zm = min(zoneout_factor_output, zoneout_factor_cell)
183 | zs = max(zoneout_factor_output, zoneout_factor_cell)
184 |
185 | if zm < 0. or zs > 1.:
186 | raise ValueError('One/both provided Zoneout factors are not in [0, 1]')
187 |
188 | self._cell = tf.nn.rnn_cell.LSTMCell(num_units, state_is_tuple=state_is_tuple, name=name)
189 | self._zoneout_cell = zoneout_factor_cell
190 | self._zoneout_outputs = zoneout_factor_output
191 | self.is_training = is_training
192 | self.state_is_tuple = state_is_tuple
193 |
194 | @property
195 | def state_size(self):
196 | return self._cell.state_size
197 |
198 | @property
199 | def output_size(self):
200 | return self._cell.output_size
201 |
202 | def __call__(self, inputs, state, scope=None):
203 | '''Runs the vanilla LSTM cell and applies zoneout.
204 | '''
205 | #Apply vanilla LSTM
206 | output, new_state = self._cell(inputs, state, scope)
207 |
208 | if self.state_is_tuple:
209 | (prev_c, prev_h) = state
210 | (new_c, new_h) = new_state
211 | else:
212 | num_proj = self._cell._num_units if self._cell._num_proj is None else self._cell._num_proj
213 | prev_c = tf.slice(state, [0, 0], [-1, self._cell._num_units])
214 | prev_h = tf.slice(state, [0, self._cell._num_units], [-1, num_proj])
215 | new_c = tf.slice(new_state, [0, 0], [-1, self._cell._num_units])
216 | new_h = tf.slice(new_state, [0, self._cell._num_units], [-1, num_proj])
217 |
218 | #Apply zoneout
219 | if self.is_training:
220 | #nn.dropout takes keep_prob (probability to keep activations) not drop_prob (probability to mask activations)! Per unit, this keeps the previous state with probability z and takes the new state otherwise: dropout zeroes (new - prev) with probability z and rescales kept values by 1/(1 - z), which the leading (1 - z) factor cancels.
221 | c = (1 - self._zoneout_cell) * tf.nn.dropout(new_c - prev_c, (1 - self._zoneout_cell)) + prev_c # tf.nn.dropout outputs the input element scaled up by 1 / keep_prob
222 | h = (1 - self._zoneout_outputs) * tf.nn.dropout(new_h - prev_h, (1 - self._zoneout_outputs)) + prev_h
223 |
224 | else:
225 | c = (1 - self._zoneout_cell) * new_c + self._zoneout_cell * prev_c
226 | h = (1 - self._zoneout_outputs) * new_h + self._zoneout_outputs * prev_h
227 |
228 | new_state = tf.nn.rnn_cell.LSTMStateTuple(c, h) if self.state_is_tuple else tf.concat([c, h], axis=1) # tf.concat takes (values, axis) in TF >= 1.0
229 |
230 | return output, new_state
231 |
232 |
233 | class LocationSensitiveAttention(BahdanauAttention):
234 | """Implements Bahdanau-style (cumulative) scoring function.
235 | Usually referred to as "hybrid" attention (content-based + location-based)
236 | Extends the additive attention described in:
237 | "D. Bahdanau, K. Cho, and Y. Bengio, “Neural machine transla-
238 | tion by jointly learning to align and translate,” in Proceedings
239 | of ICLR, 2015."
240 | to use previous alignments as additional location features.
241 | This attention is described in:
242 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben-
243 | gio, “Attention-based models for speech recognition,” in Ad-
244 | vances in Neural Information Processing Systems, 2015, pp.
245 | 577–585.
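In symbols, matching _location_sensitive_score below: the location features are
f = F * α_{i-1}, a 1-D convolution over the previous (cumulated) alignments, and the
energy is energy = dot(v_a, tanh(W_keys(h_enc) + W_query(h_dec) + W_fil(f) + b_a)).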
246 | """ 247 | 248 | def __init__(self, 249 | num_units, 250 | memory, 251 | hparams, 252 | is_training, 253 | mask_encoder=True, 254 | memory_sequence_length=None, 255 | smoothing=False, 256 | cumulate_weights=True, 257 | name='LocationSensitiveAttention'): 258 | """Construct the Attention mechanism. 259 | Args: 260 | num_units: The depth of the query mechanism. 261 | memory: The memory to query; usually the output of an RNN encoder. This 262 | tensor should be shaped `[batch_size, max_time, ...]`. 263 | mask_encoder (optional): Boolean, whether to mask encoder paddings. 264 | memory_sequence_length (optional): Sequence lengths for the batch entries 265 | in memory. If provided, the memory tensor rows are masked with zeros 266 | for values past the respective sequence lengths. Only relevant if mask_encoder = True. 267 | smoothing (optional): Boolean. Determines which normalization function to use. 268 | Default normalization function (probablity_fn) is softmax. If smoothing is 269 | enabled, we replace softmax with: 270 | a_{i, j} = sigmoid(e_{i, j}) / sum_j(sigmoid(e_{i, j})) 271 | Introduced in: 272 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben- 273 | gio, “Attention-based models for speech recognition,” in Ad- 274 | vances in Neural Information Processing Systems, 2015, pp. 275 | 577–585. 276 | This is mainly used if the model wants to attend to multiple input parts 277 | at the same decoding step. We probably won't be using it since multiple sound 278 | frames may depend on the same character/phone, probably not the way around. 279 | Note: 280 | We still keep it implemented in case we want to test it. They used it in the 281 | paper in the context of speech recognition, where one phoneme may depend on 282 | multiple subsequent sound frames. 283 | name: Name to use when creating ops. 284 | """ 285 | #Create normalization function 286 | #Setting it to None defaults in using softmax 287 | normalization_function = _smoothing_normalization if (smoothing == True) else None 288 | memory_length = memory_sequence_length if (mask_encoder==True) else None 289 | super(LocationSensitiveAttention, self).__init__( 290 | num_units=num_units, 291 | memory=memory, 292 | memory_sequence_length=memory_length, 293 | probability_fn=normalization_function, 294 | name=name) 295 | 296 | self.location_convolution = tf.layers.Conv1D(filters=hparams.attention_filters, 297 | kernel_size=hparams.attention_kernel, padding='same', use_bias=True, 298 | bias_initializer=tf.zeros_initializer(), name='location_features_convolution') 299 | self.location_layer = tf.layers.Dense(units=num_units, use_bias=False,dtype=tf.float32, name='location_features_projection') 300 | self._cumulate = cumulate_weights 301 | self.synthesis_constraint = hparams.synthesis_constraint and not is_training 302 | self.attention_win_size = tf.convert_to_tensor(hparams.attention_win_size, dtype=tf.int32) 303 | self.constraint_type = hparams.synthesis_constraint_type 304 | 305 | def __call__(self, query, state): 306 | """Score the query based on the keys and values. 307 | Args: 308 | query: Tensor of dtype matching `self.values` and shape 309 | `[batch_size, query_depth]`. 310 | state (previous alignments): Tensor of dtype matching `self.values` and shape 311 | `[batch_size, alignments_size]` 312 | (`alignments_size` is memory's `max_time`). 313 | Returns: 314 | alignments: Tensor of dtype matching `self.values` and shape 315 | `[batch_size, alignments_size]` (`alignments_size` is memory's 316 | `max_time`). 
317 | """
318 | previous_alignments = state
319 | with variable_scope.variable_scope(None, "Location_Sensitive_Attention", [query]):
320 |
321 | # processed_query shape [batch_size, query_depth] -> [batch_size, attention_dim]
322 | processed_query = self.query_layer(query) if self.query_layer else query
323 | # -> [batch_size, 1, attention_dim]
324 | processed_query = tf.expand_dims(processed_query, 1)
325 |
326 | # processed_location_features shape [batch_size, max_time, attention_dim]
327 | # [batch_size, max_time] -> [batch_size, max_time, 1]
328 | expanded_alignments = tf.expand_dims(previous_alignments, axis=2)
329 | # location features [batch_size, max_time, filters]
330 | f = self.location_convolution(expanded_alignments)
331 | # Projected location features [batch_size, max_time, attention_dim]
332 | processed_location_features = self.location_layer(f)
333 |
334 | # energy shape [batch_size, max_time]
335 | energy = _location_sensitive_score(processed_query, processed_location_features, self.keys)
336 | # At synthesis time, optionally restrict attention to a window around the previously attended encoder position.
337 | if self.synthesis_constraint:
338 | prev_max_attentions = tf.argmax(previous_alignments, -1, output_type=tf.int32)
339 | Tx = tf.shape(energy)[-1]
340 | # prev_max_attentions = tf.squeeze(prev_max_attentions, [-1])
341 | if self.constraint_type == 'monotonic':
342 | key_masks = tf.sequence_mask(prev_max_attentions, Tx)
343 | reverse_masks = tf.sequence_mask(Tx - self.attention_win_size - prev_max_attentions, Tx)[:, ::-1]
344 | else:
345 | assert self.constraint_type == 'window'
346 | key_masks = tf.sequence_mask(prev_max_attentions - (self.attention_win_size // 2 + (self.attention_win_size % 2 != 0)), Tx)
347 | reverse_masks = tf.sequence_mask(Tx - (self.attention_win_size // 2) - prev_max_attentions, Tx)[:, ::-1]
348 |
349 | masks = tf.logical_or(key_masks, reverse_masks)
350 | paddings = tf.ones_like(energy) * (-2 ** 32 + 1) # very large negative energy outside the allowed window
351 | energy = tf.where(tf.equal(masks, False), energy, paddings)
352 |
353 | # alignments shape = energy shape = [batch_size, max_time]
354 | alignments = self._probability_fn(energy, previous_alignments)
355 |
356 | # Cumulate alignments
357 | if self._cumulate:
358 | next_state = alignments + previous_alignments
359 | else:
360 | next_state = alignments
361 |
362 | return alignments, next_state
363 |
364 |
365 | def _location_sensitive_score(W_query, W_fil, W_keys):
366 | """Implements Bahdanau-style (cumulative) scoring function.
367 | This attention is described in:
368 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben-
369 | gio, “Attention-based models for speech recognition,” in Ad-
370 | vances in Neural Information Processing Systems, 2015, pp.
371 | 577–585.
372 | #############################################################################
373 | hybrid attention (content-based + location-based)
374 | f = F * α_{i-1}
375 | energy = dot(v_a, tanh(W_keys(h_enc) + W_query(h_dec) + W_fil(f) + b_a))
376 | #############################################################################
377 | Args:
378 | W_query: Tensor, shape '[batch_size, 1, attention_dim]' to compare to location features.
379 | W_fil: previous alignments processed into location features, shape '[batch_size, max_time, attention_dim]'
380 | W_keys: Tensor, shape '[batch_size, max_time, attention_dim]', typically the encoder outputs.
381 | Returns: 382 | A '[batch_size, max_time]' attention score (energy) 383 | """ 384 | # Get the number of hidden units from the trailing dimension of keys 385 | dtype = W_query.dtype 386 | num_units = W_keys.shape[-1].value or array_ops.shape(W_keys)[-1] 387 | 388 | v_a = tf.get_variable( 389 | 'attention_variable_projection', shape=[num_units], dtype=dtype, 390 | initializer=tf.contrib.layers.xavier_initializer()) 391 | b_a = tf.get_variable( 392 | 'attention_bias', shape=[num_units], dtype=dtype, 393 | initializer=tf.zeros_initializer()) 394 | 395 | return tf.reduce_sum(v_a * tf.tanh(W_keys + W_query + W_fil + b_a), [2]) 396 | 397 | 398 | def _smoothing_normalization(e): 399 | """Applies a smoothing normalization function instead of softmax 400 | Introduced in: 401 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben- 402 | gio, “Attention-based models for speech recognition,” in Ad- 403 | vances in Neural Information Processing Systems, 2015, pp. 404 | 577–585. 405 | ############################################################################ 406 | Smoothing normalization function 407 | a_{i, j} = sigmoid(e_{i, j}) / sum_j(sigmoid(e_{i, j})) 408 | ############################################################################ 409 | Args: 410 | e: matrix [batch_size, max_time(memory_time)]: expected to be energy (score) 411 | values of an attention mechanism 412 | Returns: 413 | matrix [batch_size, max_time]: [0, 1] normalized alignments with possible 414 | attendance to multiple memory time steps. 415 | """ 416 | return tf.nn.sigmoid(e) / tf.reduce_sum(tf.nn.sigmoid(e), axis=-1, keepdims=True) 417 | 418 | class GmmAttention(AttentionMechanism): 419 | def __init__(self, 420 | num_mixtures, 421 | memory, 422 | memory_sequence_length=None, 423 | check_inner_dims_defined=True, 424 | score_mask_value=None, 425 | name='GmmAttention'): 426 | 427 | self.dtype = memory.dtype 428 | self.num_mixtures = num_mixtures 429 | self.query_layer = tf.layers.Dense(3 * num_mixtures, name='gmm_query_projection', use_bias=True, dtype=self.dtype) 430 | 431 | with tf.name_scope(name, 'GmmAttentionMechanismInit'): 432 | if score_mask_value is None: 433 | score_mask_value = 0. 
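# A mask value of 0 (rather than a large negative number) is appropriate here:
# _maybe_mask_score is applied to the mixture weights phi in __call__, which are
# already non-negative and are used directly as the alignments (no softmax
# follows), so padded memory positions simply receive zero attention weight.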
434 | self._maybe_mask_score = functools.partial( 435 | _maybe_mask_score, 436 | memory_sequence_length=memory_sequence_length, 437 | score_mask_value=score_mask_value) 438 | self._value = _prepare_memory( 439 | memory, memory_sequence_length, check_inner_dims_defined) 440 | self._batch_size = ( 441 | self._value.shape[0].value or tf.shape(self._value)[0]) 442 | self._alignments_size = ( 443 | self._value.shape[1].value or tf.shape(self._value)[1]) 444 | 445 | @property 446 | def values(self): 447 | return self._value 448 | 449 | @property 450 | def batch_size(self): 451 | return self._batch_size 452 | 453 | @property 454 | def alignments_size(self): 455 | return self._alignments_size 456 | 457 | @property 458 | def state_size(self): 459 | return self.num_mixtures 460 | 461 | def initial_alignments(self, batch_size, dtype): 462 | max_time = self._alignments_size 463 | return _zero_state_tensors(max_time, batch_size, dtype) 464 | 465 | def initial_state(self, batch_size, dtype): 466 | state_size_ = self.state_size 467 | return _zero_state_tensors(state_size_, batch_size, dtype) 468 | 469 | def __call__(self, query, state): 470 | with tf.variable_scope("GmmAttention"): 471 | previous_kappa = state 472 | 473 | params = self.query_layer(query) # query(dec_rnn_size=256) , params(num_mixtures(256)*3) 474 | alpha_hat, beta_hat, kappa_hat = tf.split(params, num_or_size_splits=3, axis=1) 475 | 476 | # [batch_size, num_mixtures, 1] 477 | alpha = tf.expand_dims(tf.exp(alpha_hat), axis=2) 478 | # softmax makes the alpha value more stable. 479 | # alpha = tf.expand_dims(tf.nn.softmax(alpha_hat, axis=1), axis=2) 480 | beta = tf.expand_dims(tf.exp(beta_hat), axis=2) 481 | kappa = tf.expand_dims(previous_kappa + tf.exp(kappa_hat), axis=2) 482 | 483 | # [1, 1, max_input_steps] 484 | mu = tf.reshape(tf.cast(tf.range(self.alignments_size), dtype=tf.float32), shape=[1, 1, self.alignments_size]) # [[[0,1,2,...]]] 485 | 486 | # [batch_size, max_input_steps] 487 | phi = tf.reduce_sum(alpha * tf.exp(-beta * (kappa - mu) ** 2.), axis=1) 488 | 489 | alignments = self._maybe_mask_score(phi) 490 | state = tf.squeeze(kappa, axis=2) 491 | 492 | return alignments, state 493 | 494 | def monotonic_stepwise_attention(p_choose_i, previous_attention, mode): 495 | # p_choose_i, previous_alignments, previous_score: [batch_size, memory_size] 496 | # p_choose_i: probability to keep attended to the last attended entry i 497 | if mode == "parallel": 498 | pad = tf.zeros([tf.shape(p_choose_i)[0], 1], dtype=p_choose_i.dtype) 499 | attention = previous_attention * p_choose_i + tf.concat( 500 | [pad, previous_attention[:, :-1] * (1.0 - p_choose_i[:, :-1])], axis=1) 501 | elif mode == "hard": 502 | # Given that previous_alignments is one_hot 503 | move_next_mask = tf.concat([tf.zeros_like(previous_attention[:, :1]), previous_attention[:, :-1]], axis=1) 504 | stay_prob = tf.reduce_sum(p_choose_i * previous_attention, axis=1) # [B] 505 | attention = tf.where(stay_prob > 0.5, previous_attention, move_next_mask) 506 | else: 507 | raise ValueError("mode must be 'parallel', or 'hard'.") 508 | return attention 509 | 510 | 511 | def _stepwise_monotonic_probability_fn(score, previous_alignments, sigmoid_noise, mode, seed=None): 512 | if sigmoid_noise > 0: 513 | noise = random_ops.random_normal(array_ops.shape(score), dtype=score.dtype, 514 | seed=seed) 515 | score += sigmoid_noise * noise 516 | if mode == "hard": 517 | # When mode is hard, use a hard sigmoid 518 | p_choose_i = math_ops.cast(score > 0, score.dtype) 519 | else: 520 | p_choose_i = 
math_ops.sigmoid(score)
521 | alignments = monotonic_stepwise_attention(p_choose_i, previous_alignments, mode)
522 | return alignments
523 |
524 |
525 | class BahdanauStepwiseMonotonicAttention(BahdanauMonotonicAttention):
526 | def __init__(self,
527 | num_units,
528 | memory,
529 | memory_sequence_length=None,
530 | normalize=True,
531 | score_mask_value=None,
532 | sigmoid_noise=2.0,
533 | sigmoid_noise_seed=None,
534 | score_bias_init=3.5,
535 | mode="parallel",
536 | dtype=None,
537 | name="BahdanauStepwiseMonotonicAttention"):
538 | if dtype is None:
539 | dtype = tf.float32
540 | wrapped_probability_fn = functools.partial(
541 | _stepwise_monotonic_probability_fn, sigmoid_noise=sigmoid_noise, mode=mode,
542 | seed=sigmoid_noise_seed)
543 | super(BahdanauMonotonicAttention, self).__init__(
544 | query_layer=tf.layers.Dense(
545 | num_units, name="query_layer", use_bias=False, dtype=dtype),
546 | memory_layer=tf.layers.Dense(
547 | num_units, name="memory_layer", use_bias=False, dtype=dtype),
548 | memory=memory,
549 | probability_fn=wrapped_probability_fn,
550 | memory_sequence_length=memory_sequence_length,
551 | score_mask_value=score_mask_value,
552 | name=name)
553 | self._num_units = num_units
554 | self._normalize = normalize
555 | self._name = name
556 | self._score_bias_init = score_bias_init
557 |
558 | # def __call__(self, query, state):
559 | # """Score the query based on the keys and values.
560 | # Args:
561 | # query: Tensor of dtype matching `self.values` and shape
562 | # `[batch_size, query_depth]`.
563 | # state: Tensor of dtype matching `self.values` and shape
564 | # `[batch_size, alignments_size]`
565 | # (`alignments_size` is memory's `max_time`).
566 | # Returns:
567 | # alignments: Tensor of dtype matching `self.values` and shape
568 | # `[batch_size, alignments_size]` (`alignments_size` is memory's
569 | # `max_time`).
570 | # """
571 | # with tf.variable_scope(None, "bahdanau_stepwise_monotonic_attention", [query]):
572 | # processed_query = self.query_layer(query) if self.query_layer else query
573 | # score = _bahdanau_score(processed_query, self._keys, self._normalize) # keys are the memory
574 | # score_bias = tf.get_variable("attention_score_bias", dtype=processed_query.dtype, initializer=self._score_bias_init)
575 |
576 | # #alignments_bias = tf.get_variable("alignments_bias", shape = state.get_shape()[-1],dtype=processed_query.dtype, initializer=tf.zeros_initializer()) # hccho
577 | # alignments_bias = tf.get_variable("alignments_bias", shape = (1),dtype=processed_query.dtype, initializer=tf.zeros_initializer()) # hccho
578 |
579 | # score += score_bias
580 | # alignments = self._probability_fn(score, state) # in BahdanauAttention, _probability_fn is softmax
581 |
582 | # next_state = alignments # state value used for the next alignment computation = AttentionWrapperState.attention_state
583 | # # hccho: the alignments are used directly in the attention computation.
584 | # alignments = tf.nn.relu(alignments+alignments_bias)
585 | # alignments = alignments/(tf.reduce_sum(alignments,axis=-1,keepdims=True) + 1.0e-12 ) # modified by hccho
586 |
587 |
588 | # return alignments, next_state
589 |
--------------------------------------------------------------------------------
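The classes above are building blocks rather than a complete model; the graphs that consume them are defined in models/tacotron.py and models/tacotron2.py. As a rough, minimal sketch only (the layer sizes, zoneout factors, and the assumption that hparams carries the attention_* and synthesis_constraint* fields read by LocationSensitiveAttention are illustrative, not values taken from this repository), the pieces can be wired together in a TF 1.x graph like this:

import tensorflow as tf
from models.modules import prenet, encoder_cbhg, ZoneoutLSTMCell, LocationSensitiveAttention


def build_attention_decoder_cell(char_embeddings, input_lengths, hparams, is_training):
    # Encoder: prenet followed by the CBHG stack, as in Tacotron.
    prenet_out = prenet(char_embeddings, is_training, layer_sizes=[256, 128])
    encoder_outputs = encoder_cbhg(prenet_out, input_lengths, is_training, depth=256)

    # Hybrid (content-based + location-based) attention over the encoder outputs.
    attention_mechanism = LocationSensitiveAttention(
        num_units=128,
        memory=encoder_outputs,
        hparams=hparams,
        is_training=is_training,
        memory_sequence_length=input_lengths)

    # Decoder RNN: a zoneout-regularized LSTM wrapped with the attention mechanism.
    # AttentionWrapper calls LocationSensitiveAttention.__call__ once per decoder
    # step and feeds the returned next_state (the cumulated alignments) back in as
    # `state` on the following step.
    decoder_cell = tf.contrib.seq2seq.AttentionWrapper(
        ZoneoutLSTMCell(256, is_training, zoneout_factor_cell=0.1, zoneout_factor_output=0.1),
        attention_mechanism,
        alignment_history=True)
    return decoder_cell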