├── .gitignore ├── LICENSE ├── README.md ├── data_load.py ├── eval.py ├── fig ├── attention.gif ├── mean_loss.png └── training_curve.png ├── harvard_sentences.txt ├── history.md ├── hyperparams.py ├── modules.py ├── networks.py ├── prepro.py ├── synthesize.py ├── train.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | etc/ 2 | asset/ 3 | data/ 4 | corpora/ 5 | logdir/ 6 | samples/ 7 | *.pyc 8 | _* 9 | *~ 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # A (Heavily Documented) TensorFlow Implementation of Tacotron: A Fully End-to-End Text-To-Speech Synthesis Model 2 | 3 | ## Requirements 4 | 5 | * NumPy >= 1.11.1 6 | * TensorFlow >= 1.3 7 | * librosa 8 | * tqdm 9 | * matplotlib 10 | * scipy 11 | 12 | ## Data 13 | 14 | 15 | 16 | 17 | 18 | We train the model on three different speech datasets. 19 | 1. 
[LJ Speech Dataset](https://keithito.com/LJ-Speech-Dataset/) 20 | 2. [Nick Offerman's Audiobooks](https://www.audible.com.au/search?searchNarrator=Nick+Offerman) 21 | 3. [The World English Bible](https://www.kaggle.com/bryanpark/the-world-english-bible-speech-dataset) 22 | 23 | The LJ Speech Dataset has recently become a widely used benchmark for TTS because it is publicly available. It has 24 hours of reasonable-quality samples. 24 | Nick's audiobooks are additionally used to see whether the model can learn even from smaller amounts of more variable speech. They total 18 hours. 25 | [The World English Bible](https://en.wikipedia.org/wiki/World_English_Bible) is a public domain update of the American Standard Version of 1901 into modern English. Its original audio recordings are freely available [here](http://www.audiotreasure.com/webindex.htm). Kyubyong split each chapter by verse manually and aligned the segmented audio clips to the text. They are 72 hours in total. You can download them at [Kaggle Datasets](https://www.kaggle.com/bryanpark/the-world-english-bible-speech-dataset). 26 | 27 | ## Training 28 | * STEP 0. Download [LJ Speech Dataset](https://keithito.com/LJ-Speech-Dataset/) or prepare your own data. 29 | * STEP 1. Adjust hyperparameters in `hyperparams.py`. (If you want to do preprocessing, set `prepro` to True.) 30 | * STEP 2. Run `python train.py`. (If you set `prepro` to True, run `python prepro.py` first.) 31 | * STEP 3. Run `python eval.py` regularly during training. 32 | 33 | ## Sample Synthesis 34 | 35 | We generate speech samples based on the [Harvard Sentences](http://www.cs.columbia.edu/~hgs/audio/harvard.html), as the original paper does. The sentence list is already included in the repo as `harvard_sentences.txt`. 36 | 37 | * Run `python synthesize.py` and check the files in `samples`. 38 | 39 | ## Training Curve 40 | 41 | 42 | 43 | 44 | ## Attention Plot 45 | 46 | 47 | 48 | ## Generated Samples 49 | 50 | * [LJ at 200k steps](https://soundcloud.com/kyubyong-park/sets/tacotron_lj_200k) 51 | * [Nick at 215k steps](https://soundcloud.com/kyubyong-park/sets/tacotron_nick_215k) 52 | * [WEB at 183k steps](https://soundcloud.com/kyubyong-park/sets/tacotron_web_183k) 53 | 54 | ## Pretrained Files 55 | * Keep in mind that 200k steps may not be enough for the best performance. 56 | * [LJ 200k](https://www.dropbox.com/s/8kxa3xh2vfna3s9/LJ_logdir.zip?dl=0) 57 | * [WEB 200k](https://www.dropbox.com/s/g7m6xhd350ozkz7/WEB_logdir.zip?dl=0) 58 | 59 | ## Notes 60 | 61 | * It's important to monitor the attention plots during training. If the attention plots first look good (the alignment looks linear) and then turn bad (they start to resemble what they looked like at the beginning of training), training has gone awry and will most likely need to be restarted from a checkpoint where the attention still looked good; in our experience the loss rarely recovers once this happens. This deterioration of attention corresponds with a spike in the loss. 62 | 63 | * In the original paper, the authors said, "An important trick we discovered was predicting multiple, non-overlapping output frames at each decoder step", where the number of frames predicted per step is the reduction factor, `r`. We originally interpreted this as predicting non-sequential frames during each decoding step `t`, and were thus using the following scheme (with `r=5`) during decoding. 64 | 65 | 66 | t frame numbers 67 | ----------------------- 68 | 0 [ 0 1 2 3 4] 69 | 1 [ 5 6 7 8 9] 70 | 2 [10 11 12 13 14] 71 | ...
72 | 73 | After much experimentation, we were unable to get our model to learn anything useful. We then switched to predicting `r` sequential frames during each decoding step. 74 | 75 | 76 | t frame numbers 77 | ----------------------- 78 | 0 [ 0 1 2 3 4] 79 | 1 [ 5 6 7 8 9] 80 | 2 [10 11 12 13 14] 81 | ... 82 | 83 | With this setup we noticed improvements in the attention and have since kept it. 84 | 85 | * **Perhaps the most important hyperparameter is the learning rate.** With an initial learning rate of 0.002 we were never able to learn a clean attention; the loss would frequently explode. With an initial learning rate of 0.001 we were able to learn a clean attention and train for much longer, producing discernible words during synthesis. 86 | * Check other TTS models such as [DCTTS](https://github.com/kyubyong/dc_tts) or [Deep Voice 3](https://github.com/kyubyong/deepvoice3). 87 | 88 | ### Differences from the original paper 89 | 90 | * We use Noam-style warmup and decay. 91 | * We implement gradient clipping. 92 | * Our training batches are bucketed. 93 | * After the last convolutional layer of the post-processing net, we apply an affine transformation to bring the dimensionality up to 128 from 80, because the required input dimensionality of the highway net is 128. In the original highway networks paper, the authors mention that the dimensionality of the input can also be increased with zero-padding, but they used the affine transformation in all their experiments. We do not know what the Tacotron authors chose. 94 | 95 | 96 | ## Papers that referenced this repo 97 | 98 | * [Efficiently Trainable Text-to-Speech System Based on Deep Convolutional Networks with Guided Attention](https://arxiv.org/abs/1710.08969) 99 | * [Storytime - End to end neural networks for audiobooks](http://web.stanford.edu/class/cs224s/reports/Pierce_Freeman.pdf) 100 | 101 | Jan. 2018, 102 | Kyubyong Park & [Tommy Mulc](mailto:tmulc18@gmail.com) 103 | -------------------------------------------------------------------------------- /data_load.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python2 3 | ''' 4 | By kyubyong park. kbpark.linguist@gmail.com. 5 | https://www.github.com/kyubyong/tacotron 6 | ''' 7 | 8 | from __future__ import print_function 9 | 10 | from hyperparams import Hyperparams as hp 11 | import numpy as np 12 | import tensorflow as tf 13 | from utils import * 14 | import codecs 15 | import re 16 | import os 17 | import unicodedata 18 | 19 | def load_vocab(): 20 | char2idx = {char: idx for idx, char in enumerate(hp.vocab)} 21 | idx2char = {idx: char for idx, char in enumerate(hp.vocab)} 22 | return char2idx, idx2char 23 | 24 | def text_normalize(text): 25 | text = ''.join(char for char in unicodedata.normalize('NFD', text) 26 | if unicodedata.category(char) != 'Mn') # Strip accents 27 | 28 | text = text.lower() 29 | text = re.sub("[^{}]".format(hp.vocab), " ", text) 30 | text = re.sub("[ ]+", " ", text) 31 | return text 32 | 33 | def load_data(mode="train"): 34 | # Load vocabulary 35 | char2idx, idx2char = load_vocab() 36 | 37 | if mode in ("train", "eval"): 38 | # Parse 39 | fpaths, text_lengths, texts = [], [], [] 40 | transcript = os.path.join(hp.data, 'transcript.csv') 41 | lines = codecs.open(transcript, 'r', 'utf-8').readlines() 42 | total_hours = 0 43 | if mode=="train": 44 | lines = lines[1:] 45 | else: # We use only one sample for evaluation!
46 | lines = lines[:1] 47 | 48 | for line in lines: 49 | fname, _, text = line.strip().split("|") 50 | 51 | fpath = os.path.join(hp.data, "wavs", fname + ".wav") 52 | fpaths.append(fpath) 53 | 54 | text = text_normalize(text) + "E" # E: EOS 55 | text = [char2idx[char] for char in text] 56 | text_lengths.append(len(text)) 57 | texts.append(np.array(text, np.int32).tostring()) 58 | 59 | return fpaths, text_lengths, texts 60 | else: 61 | # Parse 62 | lines = codecs.open(hp.test_data, 'r', 'utf-8').readlines()[1:] 63 | sents = [text_normalize(line.split(" ", 1)[-1]).strip() + "E" for line in lines] # text normalization, E: EOS 64 | lengths = [len(sent) for sent in sents] 65 | maxlen = sorted(lengths, reverse=True)[0] 66 | texts = np.zeros((len(sents), maxlen), np.int32) 67 | for i, sent in enumerate(sents): 68 | texts[i, :len(sent)] = [char2idx[char] for char in sent] 69 | return texts 70 | 71 | def get_batch(): 72 | """Loads training data and put them in queues""" 73 | with tf.device('/cpu:0'): 74 | # Load data 75 | fpaths, text_lengths, texts = load_data() # list 76 | maxlen, minlen = max(text_lengths), min(text_lengths) 77 | 78 | # Calc total batch count 79 | num_batch = len(fpaths) // hp.batch_size 80 | 81 | fpaths = tf.convert_to_tensor(fpaths) 82 | text_lengths = tf.convert_to_tensor(text_lengths) 83 | texts = tf.convert_to_tensor(texts) 84 | 85 | # Create Queues 86 | fpath, text_length, text = tf.train.slice_input_producer([fpaths, text_lengths, texts], shuffle=True) 87 | 88 | # Parse 89 | text = tf.decode_raw(text, tf.int32) # (None,) 90 | 91 | if hp.prepro: 92 | def _load_spectrograms(fpath): 93 | fname = os.path.basename(fpath) 94 | mel = "mels/{}".format(fname.replace("wav", "npy")) 95 | mag = "mags/{}".format(fname.replace("wav", "npy")) 96 | return fname, np.load(mel), np.load(mag) 97 | 98 | fname, mel, mag = tf.py_func(_load_spectrograms, [fpath], [tf.string, tf.float32, tf.float32]) 99 | else: 100 | fname, mel, mag = tf.py_func(load_spectrograms, [fpath], [tf.string, tf.float32, tf.float32]) # (None, n_mels) 101 | 102 | # Add shape information 103 | fname.set_shape(()) 104 | text.set_shape((None,)) 105 | mel.set_shape((None, hp.n_mels*hp.r)) 106 | mag.set_shape((None, hp.n_fft//2+1)) 107 | 108 | # Batching 109 | _, (texts, mels, mags, fnames) = tf.contrib.training.bucket_by_sequence_length( 110 | input_length=text_length, 111 | tensors=[text, mel, mag, fname], 112 | batch_size=hp.batch_size, 113 | bucket_boundaries=[i for i in range(minlen + 1, maxlen - 1, 20)], 114 | num_threads=16, 115 | capacity=hp.batch_size * 4, 116 | dynamic_pad=True) 117 | 118 | return texts, mels, mags, fnames, num_batch 119 | 120 | -------------------------------------------------------------------------------- /eval.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python2 3 | ''' 4 | By kyubyong park. kbpark.linguist@gmail.com. 
5 | https://www.github.com/kyubyong/tacotron 6 | ''' 7 | 8 | from __future__ import print_function 9 | 10 | from hyperparams import Hyperparams as hp 11 | import numpy as np 12 | from data_load import load_data 13 | import tensorflow as tf 14 | from train import Graph 15 | from utils import load_spectrograms 16 | 17 | 18 | def eval(): 19 | # Load graph 20 | g = Graph(mode="eval"); print("Evaluation Graph loaded") 21 | 22 | # Load data 23 | fpaths, text_lengths, texts = load_data(mode="eval") 24 | 25 | # Parse 26 | text = np.fromstring(texts[0], np.int32) # (None,) 27 | fname, mel, mag = load_spectrograms(fpaths[0]) 28 | 29 | x = np.expand_dims(text, 0) # (1, None) 30 | y = np.expand_dims(mel, 0) # (1, None, n_mels*r) 31 | z = np.expand_dims(mag, 0) # (1, None, n_mfccs) 32 | 33 | saver = tf.train.Saver() 34 | with tf.Session() as sess: 35 | saver.restore(sess, tf.train.latest_checkpoint(hp.logdir)); print("Restored!") 36 | 37 | writer = tf.summary.FileWriter(hp.logdir, sess.graph) 38 | 39 | # Feed Forward 40 | ## mel 41 | y_hat = np.zeros((1, y.shape[1], y.shape[2]), np.float32) # hp.n_mels*hp.r 42 | for j in range(y.shape[1]): 43 | _y_hat = sess.run(g.y_hat, {g.x: x, g.y: y_hat}) 44 | y_hat[:, j, :] = _y_hat[:, j, :] 45 | 46 | ## mag 47 | merged, gs = sess.run([g.merged, g.global_step], {g.x:x, g.y:y, g.y_hat: y_hat, g.z: z}) 48 | writer.add_summary(merged, global_step=gs) 49 | writer.close() 50 | 51 | if __name__ == '__main__': 52 | eval() 53 | print("Done") 54 | 55 | 56 | -------------------------------------------------------------------------------- /fig/attention.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kyubyong/tacotron/379bb7f54c3359ffe97d1f09a773bc6da49eba6f/fig/attention.gif -------------------------------------------------------------------------------- /fig/mean_loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kyubyong/tacotron/379bb7f54c3359ffe97d1f09a773bc6da49eba6f/fig/mean_loss.png -------------------------------------------------------------------------------- /fig/training_curve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kyubyong/tacotron/379bb7f54c3359ffe97d1f09a773bc6da49eba6f/fig/training_curve.png -------------------------------------------------------------------------------- /harvard_sentences.txt: -------------------------------------------------------------------------------- 1 | http://www.cs.columbia.edu/~hgs/audio/harvard.html 2 | 1. The birch canoe slid on the smooth planks. 3 | 2. Glue the sheet to the dark blue background. 4 | 3. It's easy to tell the depth of a well. 5 | 4. These days a chicken leg is a rare dish. 6 | 5. Rice is often served in round bowls. 7 | 6. The juice of lemons makes fine punch. 8 | 7. The box was thrown beside the parked truck. 9 | 8. The hogs were fed chopped corn and garbage. 10 | 9. Four hours of steady work faced us. 11 | 10. Large size in stockings is hard to sell. 12 | 11. The boy was there when the sun rose. 13 | 12. A rod is used to catch pink salmon. 14 | 13. The source of the huge river is the clear spring. 15 | 14. Kick the ball straight and follow through. 16 | 15. Help the woman get back to her feet. 17 | 16. A pot of tea helps to pass the evening. 18 | 17. Smoky fires lack flame and heat. 19 | 18. The soft cushion broke the man's fall. 20 | 19. The salt breeze came across from the sea. 
21 | 20. The girl at the booth sold fifty bonds. 22 | -------------------------------------------------------------------------------- /history.md: -------------------------------------------------------------------------------- 1 | # A (Heavily Documented) TensorFlow Implementation of Tacotron: A Fully End-to-End Text-To-Speech Synthesis Model 2 | 3 | ## **Major History** 4 | * June 21, 2017. Fourth draft. 5 | * **I've updated the code from TF 1.1 to TF 1.2.** It turns out that TF 1.2 has a new API for the attention wrapper with more detailed options. 6 | * I've added a sanity-check option to the `hyperparams.py` file. Basically, its purpose is to find out whether our model is able to learn a handful of training samples without caring about generalization. For that, the training was done on a single mini-batch (32 samples) over and over again, and sample generation was based on the same text. I observed a quite smooth training curve, as shown below, and after around 18K global steps the model started to generate recognizable sounds. The sample results after 36K steps are available in the `logdir_s` folder. It took around seven hours on a single GTX 1080. The pretrained files can be downloaded from [here](https://u42868014.dl.dropboxusercontent.com/u/42868014/tacotron/logdir_s.zip). The training curve looks like this. 7 | 8 | 9 | 10 | * June 4, 2017. Third draft. 11 | * Some people have reported promising results based on my code. Among them are [@ggsonic](https://www.github.com/ggsonic) and [@chief7](https://www.github.com/chief7). For relevant discussions, see this [discussion](https://www.github.com/Kyubyong/tacotron/issues/30) or their repos. 12 | * According to @ggsonic, instance normalization worked better than batch normalization. 13 | * @chief7 trained on pavoque data, a German corpus spoken by a single male actor. He said that instance normalization and zero-masking are good choices. 14 | * Yuxuan, the first author of the paper, advised me to do a sanity check first with small data, and to adjust hyperparameters since our dataset is different from his. I really appreciate his tips, and I hope they help you as well. 15 | * [Alex's repo](https://github.com/barronalex/Tacotron), which is another implementation of Tacotron, seems to be getting promising results with a small dataset. He's working on a big one. 16 | * June 2, 2017. 17 | * Added `train_multiple_gpus.py` for multiple GPUs. 18 | * June 1, 2017. Second draft. 19 | * I corrected some mistakes with the help of several contributors (THANKS!), and refactored the source code so that it is more readable and modular. So far, I haven't gotten any promising results. 20 | * May 17, 2017. First draft. 21 | * You can run it following the steps below, but good results are not guaranteed. I'll be working on debugging this weekend. (**Code reviews and/or contributions are more than welcome!**) 22 | 23 | ## Requirements 24 | * NumPy >= 1.11.1 25 | * TensorFlow == 1.2 26 | * librosa 27 | * tqdm 28 | 29 | ## Data 30 | Since the [original paper](https://arxiv.org/abs/1703.10135) was based on their internal data, I use a freely available dataset instead. 31 | 32 | [The World English Bible](https://en.wikipedia.org/wiki/World_English_Bible) is a public domain update of the American Standard Version of 1901 into modern English. Its text and audio recordings are freely available [here](http://www.audiotreasure.com/webindex.htm). Unfortunately, however, each of the audio files corresponds to a chapter, not a verse, so it is too long for many machine learning tasks.
I had someone slice them by verse manually. You can download [the audio data](https://dl.dropboxusercontent.com/u/42868014/WEB.zip) and its [text](https://dl.dropboxusercontent.com/u/42868014/text.csv) from my dropbox. 33 | 34 | 35 | 36 | ## File description 37 | * `hyperparams.py` includes all hyper parameters that are needed. 38 | * `prepare_pavoque.py` creates sliced sound files from raw sound data, and constructs necessary information. 39 | * `prepro.py` loads vocabulary, training/evaluation data. 40 | * `data_load.py` loads data and put them in queues so multiple mini-bach data are generated in parallel. 41 | * `utils.py` has several custom operational functions. 42 | * `modules.py` contains building blocks for encoding/decoding networks. 43 | * `networks.py` has three core networks, that is, encoding, decoding, and postprocessing network. 44 | * `train.py` is for training. 45 | * `eval.py` is for sample synthesis. 46 | 47 | 48 | ## Training 49 | * STEP 1. Adjust hyper parameters in `hyperparams.py` if necessary. 50 | * STEP 2. Download and extract [the audio data](https://dl.dropboxusercontent.com/u/42868014/WEB.zip) and its [text](https://dl.dropboxusercontent.com/u/42868014/text.csv). 51 | * STEP 3. Run `train.py`. or `train_multi_gpus.py` if you have more than one gpu. 52 | 53 | ## Sample Synthesis 54 | * Run `eval.py` to get samples. 55 | 56 | ### Acknowledgements 57 | I would like to show my respect to Dave, the host of www.audiotreasure.com and the reader of the audio files. 58 | -------------------------------------------------------------------------------- /hyperparams.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python2 3 | ''' 4 | By kyubyong park. kbpark.linguist@gmail.com. 5 | https://www.github.com/kyubyong/tacotron 6 | ''' 7 | class Hyperparams: 8 | '''Hyper parameters''' 9 | 10 | # pipeline 11 | prepro = False # if True, run `python prepro.py` first before running `python train.py`. 12 | 13 | vocab = "PE abcdefghijklmnopqrstuvwxyz'.?" # P: Padding E: End of Sentence 14 | 15 | # data 16 | data = "/data/private/voice/LJSpeech-1.0" 17 | # data = "/data/private/voice/nick" 18 | test_data = 'harvard_sentences.txt' 19 | max_duration = 10.0 20 | 21 | # signal processing 22 | sr = 22050 # Sample rate. 23 | n_fft = 2048 # fft points (samples) 24 | frame_shift = 0.0125 # seconds 25 | frame_length = 0.05 # seconds 26 | hop_length = int(sr*frame_shift) # samples. 27 | win_length = int(sr*frame_length) # samples. 28 | n_mels = 80 # Number of Mel banks to generate 29 | power = 1.2 # Exponent for amplifying the predicted magnitude 30 | n_iter = 50 # Number of inversion iterations 31 | preemphasis = .97 # or None 32 | max_db = 100 33 | ref_db = 20 34 | 35 | # model 36 | embed_size = 256 # alias = E 37 | encoder_num_banks = 16 38 | decoder_num_banks = 8 39 | num_highwaynet_blocks = 4 40 | r = 5 # Reduction factor. Paper => 2, 3, 5 41 | dropout_rate = .5 42 | 43 | # training scheme 44 | lr = 0.001 # Initial learning rate. 45 | logdir = "logdir/01" 46 | sampledir = 'samples' 47 | batch_size = 32 48 | 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /modules.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python2 3 | ''' 4 | By kyubyong park. kbpark.linguist@gmail.com. 
5 | https://www.github.com/kyubyong/tacotron 6 | ''' 7 | 8 | from __future__ import print_function 9 | 10 | from hyperparams import Hyperparams as hp 11 | import tensorflow as tf 12 | 13 | 14 | def embed(inputs, vocab_size, num_units, zero_pad=True, scope="embedding", reuse=None): 15 | '''Embeds a given tensor. 16 | 17 | Args: 18 | inputs: A `Tensor` with type `int32` or `int64` containing the ids 19 | to be looked up in `lookup table`. 20 | vocab_size: An int. Vocabulary size. 21 | num_units: An int. Number of embedding hidden units. 22 | zero_pad: A boolean. If True, all the values of the fist row (id 0) 23 | should be constant zeros. 24 | scope: Optional scope for `variable_scope`. 25 | reuse: Boolean, whether to reuse the weights of a previous layer 26 | by the same name. 27 | 28 | Returns: 29 | A `Tensor` with one more rank than inputs's. The last dimesionality 30 | should be `num_units`. 31 | ''' 32 | with tf.variable_scope(scope, reuse=reuse): 33 | lookup_table = tf.get_variable('lookup_table', 34 | dtype=tf.float32, 35 | shape=[vocab_size, num_units], 36 | initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.01)) 37 | if zero_pad: 38 | lookup_table = tf.concat((tf.zeros(shape=[1, num_units]), 39 | lookup_table[1:, :]), 0) 40 | return tf.nn.embedding_lookup(lookup_table, inputs) 41 | 42 | 43 | def bn(inputs, 44 | is_training=True, 45 | activation_fn=None, 46 | scope="bn", 47 | reuse=None): 48 | '''Applies batch normalization. 49 | 50 | Args: 51 | inputs: A tensor with 2 or more dimensions, where the first dimension has 52 | `batch_size`. If type is `bn`, the normalization is over all but 53 | the last dimension. Or if type is `ln`, the normalization is over 54 | the last dimension. Note that this is different from the native 55 | `tf.contrib.layers.batch_norm`. For this I recommend you change 56 | a line in ``tensorflow/contrib/layers/python/layers/layer.py` 57 | as follows. 58 | Before: mean, variance = nn.moments(inputs, axis, keep_dims=True) 59 | After: mean, variance = nn.moments(inputs, [-1], keep_dims=True) 60 | is_training: Whether or not the layer is in training mode. 61 | activation_fn: Activation function. 62 | scope: Optional scope for `variable_scope`. 63 | reuse: Boolean, whether to reuse the weights of a previous layer 64 | by the same name. 65 | 66 | Returns: 67 | A tensor with the same shape and data dtype as `inputs`. 68 | ''' 69 | inputs_shape = inputs.get_shape() 70 | inputs_rank = inputs_shape.ndims 71 | 72 | # use fused batch norm if inputs_rank in [2, 3, 4] as it is much faster. 73 | # pay attention to the fact that fused_batch_norm requires shape to be rank 4 of NHWC. 
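# Rank-2 inputs (N, C) are therefore expanded to (N, 1, 1, C) and rank-3 inputs (N, T, C) to (N, 1, T, C) below, then squeezed back to their original rank after normalization.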
74 | if inputs_rank in [2, 3, 4]: 75 | if inputs_rank == 2: 76 | inputs = tf.expand_dims(inputs, axis=1) 77 | inputs = tf.expand_dims(inputs, axis=2) 78 | elif inputs_rank == 3: 79 | inputs = tf.expand_dims(inputs, axis=1) 80 | 81 | outputs = tf.contrib.layers.batch_norm(inputs=inputs, 82 | center=True, 83 | scale=True, 84 | updates_collections=None, 85 | is_training=is_training, 86 | scope=scope, 87 | fused=True, 88 | reuse=reuse) 89 | # restore original shape 90 | if inputs_rank == 2: 91 | outputs = tf.squeeze(outputs, axis=[1, 2]) 92 | elif inputs_rank == 3: 93 | outputs = tf.squeeze(outputs, axis=1) 94 | else: # fallback to naive batch norm 95 | outputs = tf.contrib.layers.batch_norm(inputs=inputs, 96 | center=True, 97 | scale=True, 98 | updates_collections=None, 99 | is_training=is_training, 100 | scope=scope, 101 | reuse=reuse, 102 | fused=False) 103 | if activation_fn is not None: 104 | outputs = activation_fn(outputs) 105 | 106 | return outputs 107 | 108 | def conv1d(inputs, 109 | filters=None, 110 | size=1, 111 | rate=1, 112 | padding="SAME", 113 | use_bias=False, 114 | activation_fn=None, 115 | scope="conv1d", 116 | reuse=None): 117 | ''' 118 | Args: 119 | inputs: A 3-D tensor with shape of [batch, time, depth]. 120 | filters: An int. Number of outputs (=activation maps) 121 | size: An int. Filter size. 122 | rate: An int. Dilation rate. 123 | padding: Either `same` or `valid` or `causal` (case-insensitive). 124 | use_bias: A boolean. 125 | scope: Optional scope for `variable_scope`. 126 | reuse: Boolean, whether to reuse the weights of a previous layer 127 | by the same name. 128 | ''' 129 | with tf.variable_scope(scope): 130 | if padding.lower()=="causal": 131 | # pre-padding for causality 132 | pad_len = (size - 1) * rate # padding size 133 | inputs = tf.pad(inputs, [[0, 0], [pad_len, 0], [0, 0]]) 134 | padding = "valid" 135 | 136 | if filters is None: 137 | filters = inputs.get_shape().as_list[-1] 138 | 139 | params = {"inputs":inputs, "filters":filters, "kernel_size":size, 140 | "dilation_rate":rate, "padding":padding, "activation":activation_fn, 141 | "use_bias":use_bias, "reuse":reuse} 142 | 143 | outputs = tf.layers.conv1d(**params) 144 | return outputs 145 | 146 | def conv1d_banks(inputs, K=16, is_training=True, scope="conv1d_banks", reuse=None): 147 | '''Applies a series of conv1d separately. 148 | 149 | Args: 150 | inputs: A 3d tensor with shape of [N, T, C] 151 | K: An int. The size of conv1d banks. That is, 152 | The `inputs` are convolved with K filters: 1, 2, ..., K. 153 | is_training: A boolean. This is passed to an argument of `bn`. 154 | scope: Optional scope for `variable_scope`. 155 | reuse: Boolean, whether to reuse the weights of a previous layer 156 | by the same name. 157 | 158 | Returns: 159 | A 3d tensor with shape of [N, T, K*Hp.embed_size//2]. 160 | ''' 161 | with tf.variable_scope(scope, reuse=reuse): 162 | outputs = conv1d(inputs, hp.embed_size//2, 1) # k=1 163 | for k in range(2, K+1): # k = 2...K 164 | with tf.variable_scope("num_{}".format(k)): 165 | output = conv1d(inputs, hp.embed_size // 2, k) 166 | outputs = tf.concat((outputs, output), -1) 167 | outputs = bn(outputs, is_training=is_training, activation_fn=tf.nn.relu) 168 | return outputs # (N, T, Hp.embed_size//2*K) 169 | 170 | def gru(inputs, num_units=None, bidirection=False, scope="gru", reuse=None): 171 | '''Applies a GRU. 172 | 173 | Args: 174 | inputs: A 3d tensor with shape of [N, T, C]. 175 | num_units: An int. The number of hidden units. 176 | bidirection: A boolean. 
If True, bidirectional results 177 | are concatenated. 178 | scope: Optional scope for `variable_scope`. 179 | reuse: Boolean, whether to reuse the weights of a previous layer 180 | by the same name. 181 | 182 | Returns: 183 | If bidirection is True, a 3d tensor with shape of [N, T, 2*num_units], 184 | otherwise [N, T, num_units]. 185 | ''' 186 | with tf.variable_scope(scope, reuse=reuse): 187 | if num_units is None: 188 | num_units = inputs.get_shape().as_list[-1] 189 | 190 | cell = tf.contrib.rnn.GRUCell(num_units) 191 | if bidirection: 192 | cell_bw = tf.contrib.rnn.GRUCell(num_units) 193 | outputs, _ = tf.nn.bidirectional_dynamic_rnn(cell, cell_bw, inputs, dtype=tf.float32) 194 | return tf.concat(outputs, 2) 195 | else: 196 | outputs, _ = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32) 197 | return outputs 198 | 199 | def attention_decoder(inputs, memory, num_units=None, scope="attention_decoder", reuse=None): 200 | '''Applies a GRU to `inputs`, while attending `memory`. 201 | Args: 202 | inputs: A 3d tensor with shape of [N, T', C']. Decoder inputs. 203 | memory: A 3d tensor with shape of [N, T, C]. Outputs of encoder network. 204 | num_units: An int. Attention size. 205 | scope: Optional scope for `variable_scope`. 206 | reuse: Boolean, whether to reuse the weights of a previous layer 207 | by the same name. 208 | 209 | Returns: 210 | A 3d tensor with shape of [N, T, num_units]. 211 | ''' 212 | with tf.variable_scope(scope, reuse=reuse): 213 | if num_units is None: 214 | num_units = inputs.get_shape().as_list[-1] 215 | 216 | attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(num_units, 217 | memory) 218 | decoder_cell = tf.contrib.rnn.GRUCell(num_units) 219 | cell_with_attention = tf.contrib.seq2seq.AttentionWrapper(decoder_cell, 220 | attention_mechanism, 221 | num_units, 222 | alignment_history=True) 223 | outputs, state = tf.nn.dynamic_rnn(cell_with_attention, inputs, dtype=tf.float32) #( N, T', 16) 224 | 225 | return outputs, state 226 | 227 | def prenet(inputs, num_units=None, is_training=True, scope="prenet", reuse=None): 228 | '''Prenet for Encoder and Decoder1. 229 | Args: 230 | inputs: A 2D or 3D tensor. 231 | num_units: A list of two integers. or None. 232 | is_training: A python boolean. 233 | scope: Optional scope for `variable_scope`. 234 | reuse: Boolean, whether to reuse the weights of a previous layer 235 | by the same name. 236 | 237 | Returns: 238 | A 3D tensor of shape [N, T, num_units/2]. 239 | ''' 240 | if num_units is None: 241 | num_units = [hp.embed_size, hp.embed_size//2] 242 | 243 | with tf.variable_scope(scope, reuse=reuse): 244 | outputs = tf.layers.dense(inputs, units=num_units[0], activation=tf.nn.relu, name="dense1") 245 | outputs = tf.layers.dropout(outputs, rate=hp.dropout_rate, training=is_training, name="dropout1") 246 | outputs = tf.layers.dense(outputs, units=num_units[1], activation=tf.nn.relu, name="dense2") 247 | outputs = tf.layers.dropout(outputs, rate=hp.dropout_rate, training=is_training, name="dropout2") 248 | return outputs # (N, ..., num_units[1]) 249 | 250 | def highwaynet(inputs, num_units=None, scope="highwaynet", reuse=None): 251 | '''Highway networks, see https://arxiv.org/abs/1505.00387 252 | 253 | Args: 254 | inputs: A 3D tensor of shape [N, T, W]. 255 | num_units: An int or `None`. Specifies the number of units in the highway layer 256 | or uses the input size if `None`. 257 | scope: Optional scope for `variable_scope`. 258 | reuse: Boolean, whether to reuse the weights of a previous layer 259 | by the same name. 
260 | 261 | Returns: 262 | A 3D tensor of shape [N, T, W]. 263 | ''' 264 | if not num_units: 265 | num_units = inputs.get_shape()[-1] 266 | 267 | with tf.variable_scope(scope, reuse=reuse): 268 | H = tf.layers.dense(inputs, units=num_units, activation=tf.nn.relu, name="dense1") 269 | T = tf.layers.dense(inputs, units=num_units, activation=tf.nn.sigmoid, 270 | bias_initializer=tf.constant_initializer(-1.0), name="dense2") 271 | outputs = H*T + inputs*(1.-T) 272 | return outputs 273 | -------------------------------------------------------------------------------- /networks.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python2 3 | ''' 4 | By kyubyong park. kbpark.linguist@gmail.com. 5 | https://www.github.com/kyubyong/tacotron 6 | ''' 7 | 8 | from __future__ import print_function 9 | 10 | from hyperparams import Hyperparams as hp 11 | from modules import * 12 | import tensorflow as tf 13 | 14 | 15 | def encoder(inputs, is_training=True, scope="encoder", reuse=None): 16 | ''' 17 | Args: 18 | inputs: A 2d tensor with shape of [N, T_x, E], with dtype of int32. Encoder inputs. 19 | is_training: Whether or not the layer is in training mode. 20 | scope: Optional scope for `variable_scope` 21 | reuse: Boolean, whether to reuse the weights of a previous layer 22 | by the same name. 23 | 24 | Returns: 25 | A collection of Hidden vectors. So-called memory. Has the shape of (N, T_x, E). 26 | ''' 27 | with tf.variable_scope(scope, reuse=reuse): 28 | # Encoder pre-net 29 | prenet_out = prenet(inputs, is_training=is_training) # (N, T_x, E/2) 30 | 31 | # Encoder CBHG 32 | ## Conv1D banks 33 | enc = conv1d_banks(prenet_out, K=hp.encoder_num_banks, is_training=is_training) # (N, T_x, K*E/2) 34 | 35 | ## Max pooling 36 | enc = tf.layers.max_pooling1d(enc, pool_size=2, strides=1, padding="same") # (N, T_x, K*E/2) 37 | 38 | ## Conv1D projections 39 | enc = conv1d(enc, filters=hp.embed_size//2, size=3, scope="conv1d_1") # (N, T_x, E/2) 40 | enc = bn(enc, is_training=is_training, activation_fn=tf.nn.relu, scope="conv1d_1") 41 | 42 | enc = conv1d(enc, filters=hp.embed_size // 2, size=3, scope="conv1d_2") # (N, T_x, E/2) 43 | enc = bn(enc, is_training=is_training, scope="conv1d_2") 44 | 45 | enc += prenet_out # (N, T_x, E/2) # residual connections 46 | 47 | ## Highway Nets 48 | for i in range(hp.num_highwaynet_blocks): 49 | enc = highwaynet(enc, num_units=hp.embed_size//2, 50 | scope='highwaynet_{}'.format(i)) # (N, T_x, E/2) 51 | 52 | ## Bidirectional GRU 53 | memory = gru(enc, num_units=hp.embed_size//2, bidirection=True) # (N, T_x, E) 54 | 55 | return memory 56 | 57 | def decoder1(inputs, memory, is_training=True, scope="decoder1", reuse=None): 58 | ''' 59 | Args: 60 | inputs: A 3d tensor with shape of [N, T_y/r, n_mels(*r)]. Shifted log melspectrogram of sound files. 61 | memory: A 3d tensor with shape of [N, T_x, E]. 62 | is_training: Whether or not the layer is in training mode. 63 | scope: Optional scope for `variable_scope` 64 | reuse: Boolean, whether to reuse the weights of a previous layer 65 | by the same name. 66 | 67 | Returns 68 | Predicted log melspectrogram tensor with shape of [N, T_y/r, n_mels*r]. 
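Also returns the attention alignment tensor with shape of [N, T_x, T_y/r], used for monitoring.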
69 | ''' 70 | with tf.variable_scope(scope, reuse=reuse): 71 | # Decoder pre-net 72 | inputs = prenet(inputs, is_training=is_training) # (N, T_y/r, E/2) 73 | 74 | # Attention RNN 75 | dec, state = attention_decoder(inputs, memory, num_units=hp.embed_size) # (N, T_y/r, E) 76 | 77 | ## for attention monitoring 78 | alignments = tf.transpose(state.alignment_history.stack(),[1,2,0]) 79 | 80 | # Decoder RNNs 81 | dec += gru(dec, hp.embed_size, bidirection=False, scope="decoder_gru1") # (N, T_y/r, E) 82 | dec += gru(dec, hp.embed_size, bidirection=False, scope="decoder_gru2") # (N, T_y/r, E) 83 | 84 | # Outputs => (N, T_y/r, n_mels*r) 85 | mel_hats = tf.layers.dense(dec, hp.n_mels*hp.r) 86 | 87 | return mel_hats, alignments 88 | 89 | def decoder2(inputs, is_training=True, scope="decoder2", reuse=None): 90 | '''Decoder Post-processing net = CBHG 91 | Args: 92 | inputs: A 3d tensor with shape of [N, T_y/r, n_mels*r]. Log magnitude spectrogram of sound files. 93 | It is recovered to its original shape. 94 | is_training: Whether or not the layer is in training mode. 95 | scope: Optional scope for `variable_scope` 96 | reuse: Boolean, whether to reuse the weights of a previous layer 97 | by the same name. 98 | 99 | Returns 100 | Predicted linear spectrogram tensor with shape of [N, T_y, 1+n_fft//2]. 101 | ''' 102 | with tf.variable_scope(scope, reuse=reuse): 103 | # Restore shape -> (N, Ty, n_mels) 104 | inputs = tf.reshape(inputs, [tf.shape(inputs)[0], -1, hp.n_mels]) 105 | 106 | # Conv1D bank 107 | dec = conv1d_banks(inputs, K=hp.decoder_num_banks, is_training=is_training) # (N, T_y, E*K/2) 108 | 109 | # Max pooling 110 | dec = tf.layers.max_pooling1d(dec, pool_size=2, strides=1, padding="same") # (N, T_y, E*K/2) 111 | 112 | ## Conv1D projections 113 | dec = conv1d(dec, filters=hp.embed_size // 2, size=3, scope="conv1d_1") # (N, T_x, E/2) 114 | dec = bn(dec, is_training=is_training, activation_fn=tf.nn.relu, scope="conv1d_1") 115 | 116 | dec = conv1d(dec, filters=hp.n_mels, size=3, scope="conv1d_2") # (N, T_x, E/2) 117 | dec = bn(dec, is_training=is_training, scope="conv1d_2") 118 | 119 | # Extra affine transformation for dimensionality sync 120 | dec = tf.layers.dense(dec, hp.embed_size//2) # (N, T_y, E/2) 121 | 122 | # Highway Nets 123 | for i in range(4): 124 | dec = highwaynet(dec, num_units=hp.embed_size//2, 125 | scope='highwaynet_{}'.format(i)) # (N, T_y, E/2) 126 | 127 | # Bidirectional GRU 128 | dec = gru(dec, hp.embed_size//2, bidirection=True) # (N, T_y, E) 129 | 130 | # Outputs => (N, T_y, 1+n_fft//2) 131 | outputs = tf.layers.dense(dec, 1+hp.n_fft//2) 132 | 133 | return outputs 134 | -------------------------------------------------------------------------------- /prepro.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python2 3 | ''' 4 | By kyubyong park. kbpark.linguist@gmail.com. 
5 | https://www.github.com/kyubyong/tacotron 6 | ''' 7 | 8 | from __future__ import print_function 9 | 10 | from utils import load_spectrograms 11 | import os 12 | from data_load import load_data 13 | import numpy as np 14 | import tqdm 15 | 16 | # Load data 17 | fpaths, _, _ = load_data() # list 18 | 19 | for fpath in tqdm.tqdm(fpaths): 20 | fname, mel, mag = load_spectrograms(fpath) 21 | if not os.path.exists("mels"): os.mkdir("mels") 22 | if not os.path.exists("mags"): os.mkdir("mags") 23 | 24 | np.save("mels/{}".format(fname.replace("wav", "npy")), mel) 25 | np.save("mags/{}".format(fname.replace("wav", "npy")), mag) -------------------------------------------------------------------------------- /synthesize.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # /usr/bin/python2 3 | ''' 4 | By kyubyong park. kbpark.linguist@gmail.com. 5 | https://www.github.com/kyubyong/tacotron 6 | ''' 7 | 8 | from __future__ import print_function 9 | 10 | from hyperparams import Hyperparams as hp 11 | import tqdm 12 | from data_load import load_data 13 | import tensorflow as tf 14 | from train import Graph 15 | from utils import spectrogram2wav 16 | from scipy.io.wavfile import write 17 | import os 18 | import numpy as np 19 | 20 | 21 | def synthesize(): 22 | if not os.path.exists(hp.sampledir): os.mkdir(hp.sampledir) 23 | 24 | # Load graph 25 | g = Graph(mode="synthesize"); print("Graph loaded") 26 | 27 | # Load data 28 | texts = load_data(mode="synthesize") 29 | 30 | saver = tf.train.Saver() 31 | with tf.Session() as sess: 32 | saver.restore(sess, tf.train.latest_checkpoint(hp.logdir)); print("Restored!") 33 | 34 | # Feed Forward 35 | ## mel 36 | y_hat = np.zeros((texts.shape[0], 200, hp.n_mels*hp.r), np.float32) # hp.n_mels*hp.r 37 | for j in tqdm.tqdm(range(200)): 38 | _y_hat = sess.run(g.y_hat, {g.x: texts, g.y: y_hat}) 39 | y_hat[:, j, :] = _y_hat[:, j, :] 40 | ## mag 41 | mags = sess.run(g.z_hat, {g.y_hat: y_hat}) 42 | for i, mag in enumerate(mags): 43 | print("File {}.wav is being generated ...".format(i+1)) 44 | audio = spectrogram2wav(mag) 45 | write(os.path.join(hp.sampledir, '{}.wav'.format(i+1)), hp.sr, audio) 46 | 47 | if __name__ == '__main__': 48 | synthesize() 49 | print("Done") 50 | 51 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python2 3 | ''' 4 | By kyubyong park. kbpark.linguist@gmail.com. 5 | https://www.github.com/kyubyong/tacotron 6 | ''' 7 | 8 | from __future__ import print_function 9 | 10 | import os 11 | from hyperparams import Hyperparams as hp 12 | import tensorflow as tf 13 | from tqdm import tqdm 14 | from data_load import get_batch, load_vocab 15 | from modules import * 16 | from networks import encoder, decoder1, decoder2 17 | from utils import * 18 | 19 | class Graph: 20 | def __init__(self, mode="train"): 21 | # Load vocabulary 22 | self.char2idx, self.idx2char = load_vocab() 23 | 24 | # Set phase 25 | is_training=True if mode=="train" else False 26 | 27 | # Graph 28 | # Data Feeding 29 | # x: Text. (N, Tx) 30 | # y: Reduced melspectrogram. (N, Ty//r, n_mels*r) 31 | # z: Magnitude. 
(N, Ty, n_fft//2+1) 32 | if mode=="train": 33 | self.x, self.y, self.z, self.fnames, self.num_batch = get_batch() 34 | elif mode=="eval": 35 | self.x = tf.placeholder(tf.int32, shape=(None, None)) 36 | self.y = tf.placeholder(tf.float32, shape=(None, None, hp.n_mels*hp.r)) 37 | self.z = tf.placeholder(tf.float32, shape=(None, None, 1+hp.n_fft//2)) 38 | self.fnames = tf.placeholder(tf.string, shape=(None,)) 39 | else: # Synthesize 40 | self.x = tf.placeholder(tf.int32, shape=(None, None)) 41 | self.y = tf.placeholder(tf.float32, shape=(None, None, hp.n_mels * hp.r)) 42 | 43 | # Get encoder/decoder inputs 44 | self.encoder_inputs = embed(self.x, len(hp.vocab), hp.embed_size) # (N, T_x, E) 45 | self.decoder_inputs = tf.concat((tf.zeros_like(self.y[:, :1, :]), self.y[:, :-1, :]), 1) # (N, Ty/r, n_mels*r) 46 | self.decoder_inputs = self.decoder_inputs[:, :, -hp.n_mels:] # feed last frames only (N, Ty/r, n_mels) 47 | 48 | # Networks 49 | with tf.variable_scope("net"): 50 | # Encoder 51 | self.memory = encoder(self.encoder_inputs, is_training=is_training) # (N, T_x, E) 52 | 53 | # Decoder1 54 | self.y_hat, self.alignments = decoder1(self.decoder_inputs, 55 | self.memory, 56 | is_training=is_training) # (N, T_y//r, n_mels*r) 57 | # Decoder2 or postprocessing 58 | self.z_hat = decoder2(self.y_hat, is_training=is_training) # (N, T_y//r, (1+n_fft//2)*r) 59 | 60 | # monitor 61 | self.audio = tf.py_func(spectrogram2wav, [self.z_hat[0]], tf.float32) 62 | 63 | if mode in ("train", "eval"): 64 | # Loss 65 | self.loss1 = tf.reduce_mean(tf.abs(self.y_hat - self.y)) 66 | self.loss2 = tf.reduce_mean(tf.abs(self.z_hat - self.z)) 67 | self.loss = self.loss1 + self.loss2 68 | 69 | # Training Scheme 70 | self.global_step = tf.Variable(0, name='global_step', trainable=False) 71 | self.lr = learning_rate_decay(hp.lr, global_step=self.global_step) 72 | self.optimizer = tf.train.AdamOptimizer(learning_rate=self.lr) 73 | 74 | ## gradient clipping 75 | self.gvs = self.optimizer.compute_gradients(self.loss) 76 | self.clipped = [] 77 | for grad, var in self.gvs: 78 | grad = tf.clip_by_norm(grad, 5.) 
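# each gradient tensor is clipped to a maximum L2 norm of 5 before the update is applied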
79 | self.clipped.append((grad, var)) 80 | self.train_op = self.optimizer.apply_gradients(self.clipped, global_step=self.global_step) 81 | 82 | # Summary 83 | tf.summary.scalar('{}/loss1'.format(mode), self.loss1) 84 | tf.summary.scalar('{}/loss'.format(mode), self.loss) 85 | tf.summary.scalar('{}/lr'.format(mode), self.lr) 86 | 87 | tf.summary.image("{}/mel_gt".format(mode), tf.expand_dims(self.y, -1), max_outputs=1) 88 | tf.summary.image("{}/mel_hat".format(mode), tf.expand_dims(self.y_hat, -1), max_outputs=1) 89 | tf.summary.image("{}/mag_gt".format(mode), tf.expand_dims(self.z, -1), max_outputs=1) 90 | tf.summary.image("{}/mag_hat".format(mode), tf.expand_dims(self.z_hat, -1), max_outputs=1) 91 | 92 | tf.summary.audio("{}/sample".format(mode), tf.expand_dims(self.audio, 0), hp.sr) 93 | self.merged = tf.summary.merge_all() 94 | 95 | if __name__ == '__main__': 96 | g = Graph(); print("Training Graph loaded") 97 | 98 | # with g.graph.as_default(): 99 | sv = tf.train.Supervisor(logdir=hp.logdir, save_summaries_secs=60, save_model_secs=0) 100 | with sv.managed_session() as sess: 101 | while 1: 102 | for _ in tqdm(range(g.num_batch), total=g.num_batch, ncols=70, leave=False, unit='b'): 103 | _, gs = sess.run([g.train_op, g.global_step]) 104 | 105 | # Write checkpoint files 106 | if gs % 1000 == 0: 107 | sv.saver.save(sess, hp.logdir + '/model_gs_{}k'.format(gs//1000)) 108 | 109 | # plot the first alignment for logging 110 | al = sess.run(g.alignments) 111 | plot_alignment(al[0], gs) 112 | 113 | print("Done") 114 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # /usr/bin/python2 3 | ''' 4 | By kyubyong park. kbpark.linguist@gmail.com. 5 | https://www.github.com/kyubyong/dc_tts 6 | ''' 7 | from __future__ import print_function, division 8 | 9 | from hyperparams import Hyperparams as hp 10 | import numpy as np 11 | import tensorflow as tf 12 | import librosa 13 | import copy 14 | import matplotlib 15 | matplotlib.use('pdf') 16 | import matplotlib.pyplot as plt 17 | from scipy import signal 18 | import os 19 | 20 | 21 | def get_spectrograms(fpath): 22 | '''Returns normalized log(melspectrogram) and log(magnitude) from `sound_file`. 23 | Args: 24 | sound_file: A string. The full path of a sound file. 
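(The audio is trimmed, pre-emphasized, converted to a decibel scale, and normalized to [0, 1] before the spectrograms are returned.)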
25 | 26 | Returns: 27 | mel: A 2d array of shape (T, n_mels) <- Transposed 28 | mag: A 2d array of shape (T, 1+n_fft/2) <- Transposed 29 | ''' 30 | # num = np.random.randn() 31 | # if num < .2: 32 | # y, sr = librosa.load(fpath, sr=hp.sr) 33 | # else: 34 | # if num < .4: 35 | # tempo = 1.1 36 | # elif num < .6: 37 | # tempo = 1.2 38 | # elif num < .8: 39 | # tempo = 0.9 40 | # else: 41 | # tempo = 0.8 42 | # cmd = "ffmpeg -i {} -y ar {} -hide_banner -loglevel panic -ac 1 -filter:a atempo={} -vn temp.wav".format(fpath, hp.sr, tempo) 43 | # os.system(cmd) 44 | # y, sr = librosa.load('temp.wav', sr=hp.sr) 45 | 46 | # Loading sound file 47 | y, sr = librosa.load(fpath, sr=hp.sr) 48 | 49 | 50 | # Trimming 51 | y, _ = librosa.effects.trim(y) 52 | 53 | # Preemphasis 54 | y = np.append(y[0], y[1:] - hp.preemphasis * y[:-1]) 55 | 56 | # stft 57 | linear = librosa.stft(y=y, 58 | n_fft=hp.n_fft, 59 | hop_length=hp.hop_length, 60 | win_length=hp.win_length) 61 | 62 | # magnitude spectrogram 63 | mag = np.abs(linear) # (1+n_fft//2, T) 64 | 65 | # mel spectrogram 66 | mel_basis = librosa.filters.mel(hp.sr, hp.n_fft, hp.n_mels) # (n_mels, 1+n_fft//2) 67 | mel = np.dot(mel_basis, mag) # (n_mels, t) 68 | 69 | # to decibel 70 | mel = 20 * np.log10(np.maximum(1e-5, mel)) 71 | mag = 20 * np.log10(np.maximum(1e-5, mag)) 72 | 73 | # normalize 74 | mel = np.clip((mel - hp.ref_db + hp.max_db) / hp.max_db, 1e-8, 1) 75 | mag = np.clip((mag - hp.ref_db + hp.max_db) / hp.max_db, 1e-8, 1) 76 | 77 | # Transpose 78 | mel = mel.T.astype(np.float32) # (T, n_mels) 79 | mag = mag.T.astype(np.float32) # (T, 1+n_fft//2) 80 | 81 | return mel, mag 82 | 83 | 84 | def spectrogram2wav(mag): 85 | '''# Generate wave file from spectrogram''' 86 | # transpose 87 | mag = mag.T 88 | 89 | # de-noramlize 90 | mag = (np.clip(mag, 0, 1) * hp.max_db) - hp.max_db + hp.ref_db 91 | 92 | # to amplitude 93 | mag = np.power(10.0, mag * 0.05) 94 | 95 | # wav reconstruction 96 | wav = griffin_lim(mag) 97 | 98 | # de-preemphasis 99 | wav = signal.lfilter([1], [1, -hp.preemphasis], wav) 100 | 101 | # trim 102 | wav, _ = librosa.effects.trim(wav) 103 | 104 | return wav.astype(np.float32) 105 | 106 | 107 | def griffin_lim(spectrogram): 108 | '''Applies Griffin-Lim's raw. 
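Iteratively estimates the phase of the magnitude spectrogram by alternating inverse STFT and STFT for hp.n_iter iterations, keeping the given magnitudes, and returns the real-valued waveform.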
109 | ''' 110 | X_best = copy.deepcopy(spectrogram) 111 | for i in range(hp.n_iter): 112 | X_t = invert_spectrogram(X_best) 113 | est = librosa.stft(X_t, hp.n_fft, hp.hop_length, win_length=hp.win_length) 114 | phase = est / np.maximum(1e-8, np.abs(est)) 115 | X_best = spectrogram * phase 116 | X_t = invert_spectrogram(X_best) 117 | y = np.real(X_t) 118 | 119 | return y 120 | 121 | 122 | def invert_spectrogram(spectrogram): 123 | ''' 124 | spectrogram: [f, t] 125 | ''' 126 | return librosa.istft(spectrogram, hp.hop_length, win_length=hp.win_length, window="hann") 127 | 128 | 129 | def plot_alignment(alignment, gs): 130 | """Plots the alignment 131 | alignments: A list of (numpy) matrix of shape (encoder_steps, decoder_steps) 132 | gs : (int) global step 133 | """ 134 | fig, ax = plt.subplots() 135 | im = ax.imshow(alignment) 136 | 137 | # cbar_ax = fig.add_axes([0.85, 0.15, 0.05, 0.7]) 138 | fig.colorbar(im) 139 | plt.title('{} Steps'.format(gs)) 140 | plt.savefig('{}/alignment_{}k.png'.format(hp.logdir, gs//1000), format='png') 141 | 142 | def learning_rate_decay(init_lr, global_step, warmup_steps=4000.): 143 | '''Noam scheme from tensor2tensor''' 144 | step = tf.cast(global_step + 1, dtype=tf.float32) 145 | return init_lr * warmup_steps ** 0.5 * tf.minimum(step * warmup_steps ** -1.5, step ** -0.5) 146 | 147 | def load_spectrograms(fpath): 148 | fname = os.path.basename(fpath) 149 | mel, mag = get_spectrograms(fpath) 150 | t = mel.shape[0] 151 | num_paddings = hp.r - (t % hp.r) if t % hp.r != 0 else 0 # for reduction 152 | mel = np.pad(mel, [[0, num_paddings], [0, 0]], mode="constant") 153 | mag = np.pad(mag, [[0, num_paddings], [0, 0]], mode="constant") 154 | return fname, mel.reshape((-1, hp.n_mels*hp.r)), mag 155 | --------------------------------------------------------------------------------