├── LICENSE ├── README.md ├── data_load.py ├── fig ├── aaa ├── attention.gif └── training_curves.png ├── harvard_sentences.txt ├── hyperparams.py ├── modules.py ├── networks.py ├── prepo.py ├── synthesize.py ├── train.py └── utils.py /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # A TensorFlow Implementation of DC-TTS: yet another text-to-speech model 2 | 3 | I implement yet another text-to-speech model, dc-tts, introduced in [Efficiently Trainable Text-to-Speech System Based on Deep Convolutional Networks with Guided Attention](https://arxiv.org/abs/1710.08969). My goal, however, is not just replicating the paper. Rather, I'd like to gain insights into various sound projects. 4 | 5 | ## Requirements 6 | * NumPy >= 1.11.1 7 | * TensorFlow >= 1.3 (Note that the API of `tf.contrib.layers.layer_norm` has changed since 1.3) 8 | * librosa 9 | * tqdm 10 | * matplotlib 11 | * scipy 12 | 13 | ## Data 14 | 15 | 16 | 17 | 18 | 19 | 20 | I train English models and a Korean model on four different speech datasets.

1. [LJ Speech Dataset](https://keithito.com/LJ-Speech-Dataset/)
2. [Nick Offerman's Audiobooks](https://www.audible.com.au/search?searchNarrator=Nick+Offerman)
3. [Kate Winslet's Audiobook](https://www.audible.com.au/pd/Classics/Therese-Raquin-Audiobook/B00FF0SLW4/ref=a_search_c4_1_3_srTtl?qid=1516854754&sr=1-3)
4. [KSS Dataset](https://kaggle.com/bryanpark/korean-single-speaker-speech-dataset) 21 | 22 | The LJ Speech Dataset has recently become a widely used benchmark for TTS because it is publicly available and contains 24 hours of reasonable-quality samples. 23 | Nick's and Kate's audiobooks are additionally used to see whether the model can learn even from smaller amounts of more variable speech. They are 18 hours and 5 hours long, respectively. Finally, the KSS Dataset is a Korean single-speaker speech dataset of more than 12 hours. 24 | 25 | 26 | ## Training 27 | * STEP 0. Download [LJ Speech Dataset](https://keithito.com/LJ-Speech-Dataset/) or prepare your own data. 28 | * STEP 1. Adjust hyperparameters in `hyperparams.py`. (If you want to do preprocessing, set `prepro` to True.) 29 | * STEP 2. Run `python train.py 1` to train Text2Mel. (If you set `prepro` to True, run `python prepro.py` first.) 30 | * STEP 3. Run `python train.py 2` to train SSRN. 31 | 32 | You can do STEPs 2 and 3 at the same time if you have more than one GPU card. 33 | 34 | ## Training Curves 35 | 36 | 37 | 38 | ## Attention Plot 39 | 40 | 41 | ## Sample Synthesis 42 | I generate speech samples based on the [Harvard Sentences](http://www.cs.columbia.edu/~hgs/audio/harvard.html), as the original paper does. The sentence list is already included in the repo as `harvard_sentences.txt`. 43 | 44 | * Run `python synthesize.py` and check the generated files in `samples`. 45 | 46 | ## Generated Samples 47 | 48 | | Dataset | Samples | 49 | | :----- |:-------------| 50 | | LJ | [50k](https://soundcloud.com/kyubyong-park/sets/dc_tts) [200k](https://soundcloud.com/kyubyong-park/sets/dc_tts_lj_200k) [310k](https://soundcloud.com/kyubyong-park/sets/dc_tts_lj_310k) [800k](https://soundcloud.com/kyubyong-park/sets/dc_tts_lj_800k)| 51 | | Nick | [40k](https://soundcloud.com/kyubyong-park/sets/dc_tts_nick_40k) [170k](https://soundcloud.com/kyubyong-park/sets/dc_tts_nick_170k) [300k](https://soundcloud.com/kyubyong-park/sets/dc_tts_nick_300k) [800k](https://soundcloud.com/kyubyong-park/sets/dc_tts_nick_800k)| 52 | | Kate | [40k](https://soundcloud.com/kyubyong-park/sets/dc_tts_kate_40k) [160k](https://soundcloud.com/kyubyong-park/sets/dc_tts_kate_160k) [300k](https://soundcloud.com/kyubyong-park/sets/dc_tts_kate_300k) [800k](https://soundcloud.com/kyubyong-park/sets/dc_tts_kate_800k) | 53 | | KSS | [400k](https://soundcloud.com/kyubyong-park/sets/dc_tts_ko_400k) | 54 | 55 | ## Pretrained Model for LJ 56 | 57 | Download [this](https://www.dropbox.com/s/1oyipstjxh2n5wo/LJ_logdir.tar?dl=0). 58 | 59 | ## Notes 60 | 61 | * The paper didn't mention normalization, but without normalization I couldn't get it to work. So I added layer normalization. 62 | * The paper fixed the learning rate to 0.001, but that didn't work for me. So I decayed it (see the sketch after these notes). 63 | * I tried to train Text2Mel and SSRN simultaneously, but it didn't work. I guess separating the two networks mitigates the burden of training. 64 | * The authors claimed that the model can be trained within a day, but unfortunately I was not so lucky. Still, it is obviously much faster than Tacotron, as it uses only convolution layers. 65 | * Thanks to the guided attention, the attention plot looks monotonic almost from the beginning. I guess it holds the alignment tight so it won't lose track. 66 | * The paper didn't mention dropout. I applied it, as I believe it helps with regularization. 67 | * Also check other TTS models such as [Tacotron](https://github.com/kyubyong/tacotron) and [Deep Voice 3](https://github.com/kyubyong/deepvoice3).
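For reference, the two tweaks above (the learning-rate decay and the guided attention) are implemented in `utils.py` as `learning_rate_decay` and `guided_attention`. The snippet below is only a minimal NumPy sketch that mirrors those formulas outside of TensorFlow, assuming the default hyperparameters (`warmup_steps=4000`, `max_N=180`, `max_T=210`, `g=0.2`):

```python
import numpy as np

# Noam-style decay used instead of the paper's fixed 0.001
# (mirrors learning_rate_decay in utils.py).
def noam_lr(step, init_lr=0.001, warmup_steps=4000.0):
    step = float(step + 1)
    return init_lr * warmup_steps ** 0.5 * min(step * warmup_steps ** -1.5, step ** -0.5)

# Guided-attention penalty matrix W (mirrors guided_attention in utils.py):
# the penalty grows as attention drifts away from the diagonal, which keeps
# the alignment roughly monotonic from early in training.
def guided_attention_weights(max_N=180, max_T=210, g=0.2):
    n = np.arange(max_N)[:, None] / float(max_N)  # normalized text positions
    t = np.arange(max_T)[None, :] / float(max_T)  # normalized mel-frame positions
    return 1.0 - np.exp(-(t - n) ** 2 / (2 * g * g))
```

In `train.py`, this W is multiplied element-wise with the attention matrix and averaged over the unpadded positions to form `loss_att`, which is added to the mel L1 and binary-divergence losses.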
68 | 69 | -------------------------------------------------------------------------------- /data_load.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python2 3 | ''' 4 | By kyubyong park. kbpark.linguist@gmail.com. 5 | https://www.github.com/kyubyong/dc_tts 6 | ''' 7 | 8 | from __future__ import print_function 9 | 10 | from hyperparams import Hyperparams as hp 11 | import numpy as np 12 | import tensorflow as tf 13 | from utils import * 14 | import codecs 15 | import re 16 | import os 17 | import unicodedata 18 | 19 | def load_vocab(): 20 | char2idx = {char: idx for idx, char in enumerate(hp.vocab)} 21 | idx2char = {idx: char for idx, char in enumerate(hp.vocab)} 22 | return char2idx, idx2char 23 | 24 | def text_normalize(text): 25 | text = ''.join(char for char in unicodedata.normalize('NFD', text) 26 | if unicodedata.category(char) != 'Mn') # Strip accents 27 | 28 | text = text.lower() 29 | text = re.sub("[^{}]".format(hp.vocab), " ", text) 30 | text = re.sub("[ ]+", " ", text) 31 | return text 32 | 33 | def load_data(mode="train"): 34 | '''Loads data 35 | Args: 36 | mode: "train" or "synthesize". 37 | ''' 38 | # Load vocabulary 39 | char2idx, idx2char = load_vocab() 40 | 41 | if mode=="train": 42 | if "LJ" in hp.data: 43 | # Parse 44 | fpaths, text_lengths, texts = [], [], [] 45 | transcript = os.path.join(hp.data, 'transcript.csv') 46 | lines = codecs.open(transcript, 'r', 'utf-8').readlines() 47 | for line in lines: 48 | fname, _, text = line.strip().split("|") 49 | 50 | fpath = os.path.join(hp.data, "wavs", fname + ".wav") 51 | fpaths.append(fpath) 52 | 53 | text = text_normalize(text) + "E" # E: EOS 54 | text = [char2idx[char] for char in text] 55 | text_lengths.append(len(text)) 56 | texts.append(np.array(text, np.int32).tostring()) 57 | 58 | return fpaths, text_lengths, texts 59 | else: # nick or kate 60 | # Parse 61 | fpaths, text_lengths, texts = [], [], [] 62 | transcript = os.path.join(hp.data, 'transcript.csv') 63 | lines = codecs.open(transcript, 'r', 'utf-8').readlines() 64 | for line in lines: 65 | fname, _, text, is_inside_quotes, duration = line.strip().split("|") 66 | duration = float(duration) 67 | if duration > 10. : continue 68 | 69 | fpath = os.path.join(hp.data, fname) 70 | fpaths.append(fpath) 71 | 72 | text += "E" # E: EOS 73 | text = [char2idx[char] for char in text] 74 | text_lengths.append(len(text)) 75 | texts.append(np.array(text, np.int32).tostring()) 76 | 77 | return fpaths, text_lengths, texts 78 | 79 | else: # synthesize on unseen test text. 
80 | # Parse 81 | lines = codecs.open(hp.test_data, 'r', 'utf-8').readlines()[1:] 82 | sents = [text_normalize(line.split(" ", 1)[-1]).strip() + "E" for line in lines] # text normalization, E: EOS 83 | texts = np.zeros((len(sents), hp.max_N), np.int32) 84 | for i, sent in enumerate(sents): 85 | texts[i, :len(sent)] = [char2idx[char] for char in sent] 86 | return texts 87 | 88 | def get_batch(): 89 | """Loads training data and put them in queues""" 90 | with tf.device('/cpu:0'): 91 | # Load data 92 | fpaths, text_lengths, texts = load_data() # list 93 | maxlen, minlen = max(text_lengths), min(text_lengths) 94 | 95 | # Calc total batch count 96 | num_batch = len(fpaths) // hp.B 97 | 98 | # Create Queues 99 | fpath, text_length, text = tf.train.slice_input_producer([fpaths, text_lengths, texts], shuffle=True) 100 | 101 | # Parse 102 | text = tf.decode_raw(text, tf.int32) # (None,) 103 | 104 | if hp.prepro: 105 | def _load_spectrograms(fpath): 106 | fname = os.path.basename(fpath) 107 | mel = "mels/{}".format(fname.replace("wav", "npy")) 108 | mag = "mags/{}".format(fname.replace("wav", "npy")) 109 | return fname, np.load(mel), np.load(mag) 110 | 111 | fname, mel, mag = tf.py_func(_load_spectrograms, [fpath], [tf.string, tf.float32, tf.float32]) 112 | else: 113 | fname, mel, mag = tf.py_func(load_spectrograms, [fpath], [tf.string, tf.float32, tf.float32]) # (None, n_mels) 114 | 115 | # Add shape information 116 | fname.set_shape(()) 117 | text.set_shape((None,)) 118 | mel.set_shape((None, hp.n_mels)) 119 | mag.set_shape((None, hp.n_fft//2+1)) 120 | 121 | # Batching 122 | _, (texts, mels, mags, fnames) = tf.contrib.training.bucket_by_sequence_length( 123 | input_length=text_length, 124 | tensors=[text, mel, mag, fname], 125 | batch_size=hp.B, 126 | bucket_boundaries=[i for i in range(minlen + 1, maxlen - 1, 20)], 127 | num_threads=8, 128 | capacity=hp.B*4, 129 | dynamic_pad=True) 130 | 131 | return texts, mels, mags, fnames, num_batch 132 | 133 | -------------------------------------------------------------------------------- /fig/aaa: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /fig/attention.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kyubyong/dc_tts/8b38110875920923343778ff959d01501323765e/fig/attention.gif -------------------------------------------------------------------------------- /fig/training_curves.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kyubyong/dc_tts/8b38110875920923343778ff959d01501323765e/fig/training_curves.png -------------------------------------------------------------------------------- /harvard_sentences.txt: -------------------------------------------------------------------------------- 1 | http://www.cs.columbia.edu/~hgs/audio/harvard.html 2 | 1. The birch canoe slid on the smooth planks. 3 | 2. Glue the sheet to the dark blue background. 4 | 3. It's easy to tell the depth of a well. 5 | 4. These days a chicken leg is a rare dish. 6 | 5. Rice is often served in round bowls. 7 | 6. The juice of lemons makes fine punch. 8 | 7. The box was thrown beside the parked truck. 9 | 8. The hogs were fed chopped corn and garbage. 10 | 9. Four hours of steady work faced us. 11 | 10. Large size in stockings is hard to sell. 12 | 11. The boy was there when the sun rose. 13 | 12. 
A rod is used to catch pink salmon. 14 | 13. The source of the huge river is the clear spring. 15 | 14. Kick the ball straight and follow through. 16 | 15. Help the woman get back to her feet. 17 | 16. A pot of tea helps to pass the evening. 18 | 17. Smoky fires lack flame and heat. 19 | 18. The soft cushion broke the man's fall. 20 | 19. The salt breeze came across from the sea. 21 | 20. The girl at the booth sold fifty bonds. 22 | -------------------------------------------------------------------------------- /hyperparams.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python2 3 | ''' 4 | By kyubyong park. kbpark.linguist@gmail.com. 5 | https://www.github.com/kyubyong/dc_tts 6 | ''' 7 | class Hyperparams: 8 | '''Hyper parameters''' 9 | # pipeline 10 | prepro = True # if True, run `python prepro.py` first before running `python train.py`. 11 | 12 | # signal processing 13 | sr = 22050 # Sampling rate. 14 | n_fft = 2048 # fft points (samples) 15 | frame_shift = 0.0125 # seconds 16 | frame_length = 0.05 # seconds 17 | hop_length = int(sr * frame_shift) # samples. =275. 18 | win_length = int(sr * frame_length) # samples. =1102. 19 | n_mels = 80 # Number of Mel banks to generate 20 | power = 1.5 # Exponent for amplifying the predicted magnitude 21 | n_iter = 50 # Number of inversion iterations 22 | preemphasis = .97 23 | max_db = 100 24 | ref_db = 20 25 | 26 | # Model 27 | r = 4 # Reduction factor. Do not change this. 28 | dropout_rate = 0.05 29 | e = 128 # == embedding 30 | d = 256 # == hidden units of Text2Mel 31 | c = 512 # == hidden units of SSRN 32 | attention_win_size = 3 33 | 34 | # data 35 | data = "/data/private/voice/LJSpeech-1.0" 36 | # data = "/data/private/voice/kate" 37 | test_data = 'harvard_sentences.txt' 38 | vocab = "PE abcdefghijklmnopqrstuvwxyz'.?" # P: Padding, E: EOS. 39 | max_N = 180 # Maximum number of characters. 40 | max_T = 210 # Maximum number of mel frames. 41 | 42 | # training scheme 43 | lr = 0.001 # Initial learning rate. 44 | logdir = "logdir/LJ01" 45 | sampledir = 'samples' 46 | B = 32 # batch size 47 | num_iterations = 2000000 48 | -------------------------------------------------------------------------------- /modules.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python2 3 | ''' 4 | By kyubyong park. kbpark.linguist@gmail.com. 5 | https://www.github.com/kyubyong/dc_tts 6 | ''' 7 | 8 | from __future__ import print_function, division 9 | 10 | import tensorflow as tf 11 | 12 | 13 | def embed(inputs, vocab_size, num_units, zero_pad=True, scope="embedding", reuse=None): 14 | '''Embeds a given tensor. 15 | 16 | Args: 17 | inputs: A `Tensor` with type `int32` or `int64` containing the ids 18 | to be looked up in `lookup table`. 19 | vocab_size: An int. Vocabulary size. 20 | num_units: An int. Number of embedding hidden units. 21 | zero_pad: A boolean. If True, all the values of the first row (id 0) 22 | should be constant zeros. 23 | scope: Optional scope for `variable_scope`. 24 | reuse: Boolean, whether to reuse the weights of a previous layer 25 | by the same name. 26 | 27 | Returns: 28 | A `Tensor` with one more rank than that of `inputs`. The last dimensionality 29 | should be `num_units`.
30 | ''' 31 | with tf.variable_scope(scope, reuse=reuse): 32 | lookup_table = tf.get_variable('lookup_table', 33 | dtype=tf.float32, 34 | shape=[vocab_size, num_units], 35 | initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.1)) 36 | if zero_pad: 37 | lookup_table = tf.concat((tf.zeros(shape=[1, num_units]), 38 | lookup_table[1:, :]), 0) 39 | 40 | outputs = tf.nn.embedding_lookup(lookup_table, inputs) 41 | 42 | return outputs 43 | 44 | 45 | def normalize(inputs, 46 | scope="normalize", 47 | reuse=None): 48 | '''Applies layer normalization that normalizes along the last axis. 49 | 50 | Args: 51 | inputs: A tensor with 2 or more dimensions, where the first dimension has 52 | `batch_size`. The normalization is over the last dimension. 53 | scope: Optional scope for `variable_scope`. 54 | reuse: Boolean, whether to reuse the weights of a previous layer 55 | by the same name. 56 | 57 | Returns: 58 | A tensor with the same shape and data dtype as `inputs`. 59 | ''' 60 | outputs = tf.contrib.layers.layer_norm(inputs, 61 | begin_norm_axis=-1, 62 | scope=scope, 63 | reuse=reuse) 64 | return outputs 65 | 66 | 67 | def highwaynet(inputs, num_units=None, scope="highwaynet", reuse=None): 68 | '''Highway networks, see https://arxiv.org/abs/1505.00387 69 | 70 | Args: 71 | inputs: A 3D tensor of shape [N, T, W]. 72 | num_units: An int or `None`. Specifies the number of units in the highway layer 73 | or uses the input size if `None`. 74 | scope: Optional scope for `variable_scope`. 75 | reuse: Boolean, whether to reuse the weights of a previous layer 76 | by the same name. 77 | 78 | Returns: 79 | A 3D tensor of shape [N, T, W]. 80 | ''' 81 | if not num_units: 82 | num_units = inputs.get_shape()[-1] 83 | 84 | with tf.variable_scope(scope, reuse=reuse): 85 | H = tf.layers.dense(inputs, units=num_units, activation=tf.nn.relu, name="dense1") 86 | T = tf.layers.dense(inputs, units=num_units, activation=tf.nn.sigmoid, 87 | bias_initializer=tf.constant_initializer(-1.0), name="dense2") 88 | outputs = H * T + inputs * (1. - T) 89 | return outputs 90 | 91 | def conv1d(inputs, 92 | filters=None, 93 | size=1, 94 | rate=1, 95 | padding="SAME", 96 | dropout_rate=0, 97 | use_bias=True, 98 | activation_fn=None, 99 | training=True, 100 | scope="conv1d", 101 | reuse=None): 102 | ''' 103 | Args: 104 | inputs: A 3-D tensor with shape of [batch, time, depth]. 105 | filters: An int. Number of outputs (=activation maps) 106 | size: An int. Filter size. 107 | rate: An int. Dilation rate. 108 | padding: Either `same` or `valid` or `causal` (case-insensitive). 109 | dropout_rate: A float in [0, 1]. 110 | use_bias: A boolean. 111 | activation_fn: A function applied to the output, or None. 112 | training: A boolean. If True, dropout is applied. 113 | scope: Optional scope for `variable_scope`. 114 | reuse: Boolean, whether to reuse the weights of a previous layer 115 | by the same name. 116 | 117 | Returns: 118 | A 3-D tensor of shape [batch, time, filters].
119 | ''' 120 | with tf.variable_scope(scope): 121 | if padding.lower() == "causal": 122 | # pre-padding for causality 123 | pad_len = (size - 1) * rate # padding size 124 | inputs = tf.pad(inputs, [[0, 0], [pad_len, 0], [0, 0]]) 125 | padding = "valid" 126 | 127 | if filters is None: 128 | filters = inputs.get_shape().as_list()[-1] 129 | 130 | params = {"inputs": inputs, "filters": filters, "kernel_size": size, 131 | "dilation_rate": rate, "padding": padding, "use_bias": use_bias, 132 | "kernel_initializer": tf.contrib.layers.variance_scaling_initializer(), "reuse": reuse} 133 | 134 | tensor = tf.layers.conv1d(**params) 135 | tensor = normalize(tensor) 136 | if activation_fn is not None: 137 | tensor = activation_fn(tensor) 138 | 139 | tensor = tf.layers.dropout(tensor, rate=dropout_rate, training=training) 140 | 141 | return tensor 142 | 143 | def hc(inputs, 144 | filters=None, 145 | size=1, 146 | rate=1, 147 | padding="SAME", 148 | dropout_rate=0, 149 | use_bias=True, 150 | activation_fn=None, 151 | training=True, 152 | scope="hc", 153 | reuse=None): 154 | ''' 155 | Args: 156 | inputs: A 3-D tensor with shape of [batch, time, depth]. 157 | filters: An int. Number of outputs (=activation maps) 158 | size: An int. Filter size. 159 | rate: An int. Dilation rate. 160 | padding: Either `same` or `valid` or `causal` (case-insensitive). 161 | use_bias: A boolean. 162 | activation_fn: A function applied to the information path, or None. 163 | training: A boolean. If True, dropout is applied. 164 | scope: Optional scope for `variable_scope`. 165 | reuse: Boolean, whether to reuse the weights of a previous layer 166 | by the same name. 167 | 168 | Returns: 169 | A 3-D tensor of the same shape and dtype as `inputs`. 170 | ''' 171 | _inputs = inputs 172 | with tf.variable_scope(scope): 173 | if padding.lower() == "causal": 174 | # pre-padding for causality 175 | pad_len = (size - 1) * rate # padding size 176 | inputs = tf.pad(inputs, [[0, 0], [pad_len, 0], [0, 0]]) 177 | padding = "valid" 178 | 179 | if filters is None: 180 | filters = inputs.get_shape().as_list()[-1] 181 | 182 | 183 | params = {"inputs": inputs, "filters": 2*filters, "kernel_size": size, 184 | "dilation_rate": rate, "padding": padding, "use_bias": use_bias, 185 | "kernel_initializer": tf.contrib.layers.variance_scaling_initializer(), "reuse": reuse} 186 | 187 | tensor = tf.layers.conv1d(**params) 188 | H1, H2 = tf.split(tensor, 2, axis=-1) 189 | H1 = normalize(H1, scope="H1") 190 | H2 = normalize(H2, scope="H2") 191 | H1 = tf.nn.sigmoid(H1, "gate") 192 | H2 = activation_fn(H2, "info") if activation_fn is not None else H2 193 | tensor = H1*H2 + (1.-H1)*_inputs 194 | 195 | tensor = tf.layers.dropout(tensor, rate=dropout_rate, training=training) 196 | 197 | return tensor 198 | 199 | def conv1d_transpose(inputs, 200 | filters=None, 201 | size=3, 202 | stride=2, 203 | padding='same', 204 | dropout_rate=0, 205 | use_bias=True, 206 | activation=None, 207 | training=True, 208 | scope="conv1d_transpose", 209 | reuse=None): 210 | ''' 211 | Args: 212 | inputs: A 3-D tensor with shape of [batch, time, depth]. 213 | filters: An int. Number of outputs (=activation maps) 214 | size: An int. Filter size. 215 | stride: An int. Upsampling stride along the time axis. 216 | padding: Either `same` or `valid` (case-insensitive). 217 | dropout_rate: A float in [0, 1]. 218 | use_bias: A boolean. 219 | activation: A function applied to the output, or None. 220 | training: A boolean. If True, dropout is applied. 221 | scope: Optional scope for `variable_scope`.
222 | reuse: Boolean, whether to reuse the weights of a previous layer 223 | by the same name. 224 | 225 | Returns: 226 | A tensor of the shape with [batch, time*2, depth]. 227 | ''' 228 | with tf.variable_scope(scope, reuse=reuse): 229 | if filters is None: 230 | filters = inputs.get_shape().as_list()[-1] 231 | inputs = tf.expand_dims(inputs, 1) 232 | tensor = tf.layers.conv2d_transpose(inputs, 233 | filters=filters, 234 | kernel_size=(1, size), 235 | strides=(1, stride), 236 | padding=padding, 237 | activation=None, 238 | kernel_initializer=tf.contrib.layers.variance_scaling_initializer(), 239 | use_bias=use_bias) 240 | tensor = tf.squeeze(tensor, 1) 241 | tensor = normalize(tensor) 242 | if activation is not None: 243 | tensor = activation(tensor) 244 | 245 | tensor = tf.layers.dropout(tensor, rate=dropout_rate, training=training) 246 | 247 | return tensor 248 | 249 | 250 | 251 | 252 | 253 | -------------------------------------------------------------------------------- /networks.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python2 3 | ''' 4 | By kyubyong park. kbpark.linguist@gmail.com. 5 | https://www.github.com/kyubyong/dc_tts 6 | ''' 7 | 8 | from __future__ import print_function 9 | 10 | from hyperparams import Hyperparams as hp 11 | from modules import * 12 | import tensorflow as tf 13 | 14 | def TextEnc(L, training=True): 15 | ''' 16 | Args: 17 | L: Text inputs. (B, N) 18 | 19 | Return: 20 | K: Keys. (B, N, d) 21 | V: Values. (B, N, d) 22 | ''' 23 | i = 1 24 | tensor = embed(L, 25 | vocab_size=len(hp.vocab), 26 | num_units=hp.e, 27 | scope="embed_{}".format(i)); i += 1 28 | tensor = conv1d(tensor, 29 | filters=2*hp.d, 30 | size=1, 31 | rate=1, 32 | dropout_rate=hp.dropout_rate, 33 | activation_fn=tf.nn.relu, 34 | training=training, 35 | scope="C_{}".format(i)); i += 1 36 | tensor = conv1d(tensor, 37 | size=1, 38 | rate=1, 39 | dropout_rate=hp.dropout_rate, 40 | training=training, 41 | scope="C_{}".format(i)); i += 1 42 | 43 | for _ in range(2): 44 | for j in range(4): 45 | tensor = hc(tensor, 46 | size=3, 47 | rate=3**j, 48 | dropout_rate=hp.dropout_rate, 49 | activation_fn=None, 50 | training=training, 51 | scope="HC_{}".format(i)); i += 1 52 | for _ in range(2): 53 | tensor = hc(tensor, 54 | size=3, 55 | rate=1, 56 | dropout_rate=hp.dropout_rate, 57 | activation_fn=None, 58 | training=training, 59 | scope="HC_{}".format(i)); i += 1 60 | 61 | for _ in range(2): 62 | tensor = hc(tensor, 63 | size=1, 64 | rate=1, 65 | dropout_rate=hp.dropout_rate, 66 | activation_fn=None, 67 | training=training, 68 | scope="HC_{}".format(i)); i += 1 69 | 70 | K, V = tf.split(tensor, 2, -1) 71 | return K, V 72 | 73 | def AudioEnc(S, training=True): 74 | ''' 75 | Args: 76 | S: melspectrogram. (B, T/r, n_mels) 77 | 78 | Returns 79 | Q: Queries. 
(B, T/r, d) 80 | ''' 81 | i = 1 82 | tensor = conv1d(S, 83 | filters=hp.d, 84 | size=1, 85 | rate=1, 86 | padding="CAUSAL", 87 | dropout_rate=hp.dropout_rate, 88 | activation_fn=tf.nn.relu, 89 | training=training, 90 | scope="C_{}".format(i)); i += 1 91 | tensor = conv1d(tensor, 92 | size=1, 93 | rate=1, 94 | padding="CAUSAL", 95 | dropout_rate=hp.dropout_rate, 96 | activation_fn=tf.nn.relu, 97 | training=training, 98 | scope="C_{}".format(i)); i += 1 99 | tensor = conv1d(tensor, 100 | size=1, 101 | rate=1, 102 | padding="CAUSAL", 103 | dropout_rate=hp.dropout_rate, 104 | training=training, 105 | scope="C_{}".format(i)); i += 1 106 | for _ in range(2): 107 | for j in range(4): 108 | tensor = hc(tensor, 109 | size=3, 110 | rate=3**j, 111 | padding="CAUSAL", 112 | dropout_rate=hp.dropout_rate, 113 | training=training, 114 | scope="HC_{}".format(i)); i += 1 115 | for _ in range(2): 116 | tensor = hc(tensor, 117 | size=3, 118 | rate=3, 119 | padding="CAUSAL", 120 | dropout_rate=hp.dropout_rate, 121 | training=training, 122 | scope="HC_{}".format(i)); i += 1 123 | 124 | return tensor 125 | 126 | def Attention(Q, K, V, mononotic_attention=False, prev_max_attentions=None): 127 | ''' 128 | Args: 129 | Q: Queries. (B, T/r, d) 130 | K: Keys. (B, N, d) 131 | V: Values. (B, N, d) 132 | mononotic_attention: A boolean. At training, it is False. 133 | prev_max_attentions: (B,). At training, it is set to None. 134 | 135 | Returns: 136 | R: [Context Vectors; Q]. (B, T/r, 2d) 137 | alignments: (B, N, T/r) 138 | max_attentions: (B, T/r) 139 | ''' 140 | A = tf.matmul(Q, K, transpose_b=True) * tf.rsqrt(tf.to_float(hp.d)) 141 | if mononotic_attention: # for inference 142 | key_masks = tf.sequence_mask(prev_max_attentions, hp.max_N) 143 | reverse_masks = tf.sequence_mask(hp.max_N - hp.attention_win_size - prev_max_attentions, hp.max_N)[:, ::-1] 144 | masks = tf.logical_or(key_masks, reverse_masks) 145 | masks = tf.tile(tf.expand_dims(masks, 1), [1, hp.max_T, 1]) 146 | paddings = tf.ones_like(A) * (-2 ** 32 + 1) # (B, T/r, N) 147 | A = tf.where(tf.equal(masks, False), A, paddings) 148 | A = tf.nn.softmax(A) # (B, T/r, N) 149 | max_attentions = tf.argmax(A, -1) # (B, T/r) 150 | R = tf.matmul(A, V) 151 | R = tf.concat((R, Q), -1) 152 | 153 | alignments = tf.transpose(A, [0, 2, 1]) # (B, N, T/r) 154 | 155 | return R, alignments, max_attentions 156 | 157 | def AudioDec(R, training=True): 158 | ''' 159 | Args: 160 | R: [Context Vectors; Q]. (B, T/r, 2d) 161 | 162 | Returns: 163 | Y: Melspectrogram predictions. 
(B, T/r, n_mels) 164 | ''' 165 | 166 | i = 1 167 | tensor = conv1d(R, 168 | filters=hp.d, 169 | size=1, 170 | rate=1, 171 | padding="CAUSAL", 172 | dropout_rate=hp.dropout_rate, 173 | training=training, 174 | scope="C_{}".format(i)); i += 1 175 | for j in range(4): 176 | tensor = hc(tensor, 177 | size=3, 178 | rate=3**j, 179 | padding="CAUSAL", 180 | dropout_rate=hp.dropout_rate, 181 | training=training, 182 | scope="HC_{}".format(i)); i += 1 183 | 184 | for _ in range(2): 185 | tensor = hc(tensor, 186 | size=3, 187 | rate=1, 188 | padding="CAUSAL", 189 | dropout_rate=hp.dropout_rate, 190 | training=training, 191 | scope="HC_{}".format(i)); i += 1 192 | for _ in range(3): 193 | tensor = conv1d(tensor, 194 | size=1, 195 | rate=1, 196 | padding="CAUSAL", 197 | dropout_rate=hp.dropout_rate, 198 | activation_fn=tf.nn.relu, 199 | training=training, 200 | scope="C_{}".format(i)); i += 1 201 | # mel_hats 202 | logits = conv1d(tensor, 203 | filters=hp.n_mels, 204 | size=1, 205 | rate=1, 206 | padding="CAUSAL", 207 | dropout_rate=hp.dropout_rate, 208 | training=training, 209 | scope="C_{}".format(i)); i += 1 210 | Y = tf.nn.sigmoid(logits) # mel_hats 211 | 212 | return logits, Y 213 | 214 | def SSRN(Y, training=True): 215 | ''' 216 | Args: 217 | Y: Melspectrogram Predictions. (B, T/r, n_mels) 218 | 219 | Returns: 220 | Z: Spectrogram Predictions. (B, T, 1+n_fft/2) 221 | ''' 222 | 223 | i = 1 # number of layers 224 | 225 | # -> (B, T/r, c) 226 | tensor = conv1d(Y, 227 | filters=hp.c, 228 | size=1, 229 | rate=1, 230 | dropout_rate=hp.dropout_rate, 231 | training=training, 232 | scope="C_{}".format(i)); i += 1 233 | for j in range(2): 234 | tensor = hc(tensor, 235 | size=3, 236 | rate=3**j, 237 | dropout_rate=hp.dropout_rate, 238 | training=training, 239 | scope="HC_{}".format(i)); i += 1 240 | for _ in range(2): 241 | # -> (B, T/2, c) -> (B, T, c) 242 | tensor = conv1d_transpose(tensor, 243 | scope="D_{}".format(i), 244 | dropout_rate=hp.dropout_rate, 245 | training=training,); i += 1 246 | for j in range(2): 247 | tensor = hc(tensor, 248 | size=3, 249 | rate=3**j, 250 | dropout_rate=hp.dropout_rate, 251 | training=training, 252 | scope="HC_{}".format(i)); i += 1 253 | # -> (B, T, 2*c) 254 | tensor = conv1d(tensor, 255 | filters=2*hp.c, 256 | size=1, 257 | rate=1, 258 | dropout_rate=hp.dropout_rate, 259 | training=training, 260 | scope="C_{}".format(i)); i += 1 261 | for _ in range(2): 262 | tensor = hc(tensor, 263 | size=3, 264 | rate=1, 265 | dropout_rate=hp.dropout_rate, 266 | training=training, 267 | scope="HC_{}".format(i)); i += 1 268 | # -> (B, T, 1+n_fft/2) 269 | tensor = conv1d(tensor, 270 | filters=1+hp.n_fft//2, 271 | size=1, 272 | rate=1, 273 | dropout_rate=hp.dropout_rate, 274 | training=training, 275 | scope="C_{}".format(i)); i += 1 276 | 277 | for _ in range(2): 278 | tensor = conv1d(tensor, 279 | size=1, 280 | rate=1, 281 | dropout_rate=hp.dropout_rate, 282 | activation_fn=tf.nn.relu, 283 | training=training, 284 | scope="C_{}".format(i)); i += 1 285 | logits = conv1d(tensor, 286 | size=1, 287 | rate=1, 288 | dropout_rate=hp.dropout_rate, 289 | training=training, 290 | scope="C_{}".format(i)) 291 | Z = tf.nn.sigmoid(logits) 292 | return logits, Z 293 | -------------------------------------------------------------------------------- /prepo.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python2 3 | ''' 4 | By kyubyong park. kbpark.linguist@gmail.com. 
5 | https://www.github.com/kyubyong/dc_tts 6 | ''' 7 | 8 | from __future__ import print_function 9 | 10 | from utils import load_spectrograms 11 | import os 12 | from data_load import load_data 13 | import numpy as np 14 | import tqdm 15 | 16 | # Load data 17 | fpaths, _, _ = load_data() # list 18 | 19 | for fpath in tqdm.tqdm(fpaths): 20 | fname, mel, mag = load_spectrograms(fpath) 21 | if not os.path.exists("mels"): os.mkdir("mels") 22 | if not os.path.exists("mags"): os.mkdir("mags") 23 | 24 | np.save("mels/{}".format(fname.replace("wav", "npy")), mel) 25 | np.save("mags/{}".format(fname.replace("wav", "npy")), mag) -------------------------------------------------------------------------------- /synthesize.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # /usr/bin/python2 3 | ''' 4 | By kyubyong park. kbpark.linguist@gmail.com. 5 | https://www.github.com/kyubyong/dc_tts 6 | ''' 7 | 8 | from __future__ import print_function 9 | 10 | import os 11 | 12 | from hyperparams import Hyperparams as hp 13 | import numpy as np 14 | import tensorflow as tf 15 | from train import Graph 16 | from utils import * 17 | from data_load import load_data 18 | from scipy.io.wavfile import write 19 | from tqdm import tqdm 20 | 21 | def synthesize(): 22 | # Load data 23 | L = load_data("synthesize") 24 | 25 | # Load graph 26 | g = Graph(mode="synthesize"); print("Graph loaded") 27 | 28 | with tf.Session() as sess: 29 | sess.run(tf.global_variables_initializer()) 30 | 31 | # Restore parameters 32 | var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'Text2Mel') 33 | saver1 = tf.train.Saver(var_list=var_list) 34 | saver1.restore(sess, tf.train.latest_checkpoint(hp.logdir + "-1")) 35 | print("Text2Mel Restored!") 36 | 37 | var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'SSRN') + \ 38 | tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 'gs') 39 | saver2 = tf.train.Saver(var_list=var_list) 40 | saver2.restore(sess, tf.train.latest_checkpoint(hp.logdir + "-2")) 41 | print("SSRN Restored!") 42 | 43 | # Feed Forward 44 | ## mel 45 | Y = np.zeros((len(L), hp.max_T, hp.n_mels), np.float32) 46 | prev_max_attentions = np.zeros((len(L),), np.int32) 47 | for j in tqdm(range(hp.max_T)): 48 | _gs, _Y, _max_attentions, _alignments = \ 49 | sess.run([g.global_step, g.Y, g.max_attentions, g.alignments], 50 | {g.L: L, 51 | g.mels: Y, 52 | g.prev_max_attentions: prev_max_attentions}) 53 | Y[:, j, :] = _Y[:, j, :] 54 | prev_max_attentions = _max_attentions[:, j] 55 | 56 | # Get magnitude 57 | Z = sess.run(g.Z, {g.Y: Y}) 58 | 59 | # Generate wav files 60 | if not os.path.exists(hp.sampledir): os.makedirs(hp.sampledir) 61 | for i, mag in enumerate(Z): 62 | print("Working on file", i+1) 63 | wav = spectrogram2wav(mag) 64 | write(hp.sampledir + "/{}.wav".format(i+1), hp.sr, wav) 65 | 66 | if __name__ == '__main__': 67 | synthesize() 68 | print("Done") 69 | 70 | 71 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # /usr/bin/python2 3 | ''' 4 | By kyubyong park. kbpark.linguist@gmail.com. 
5 | https://www.github.com/kyubyong/dc_tts 6 | ''' 7 | 8 | from __future__ import print_function 9 | 10 | from tqdm import tqdm 11 | 12 | from data_load import get_batch, load_vocab 13 | from hyperparams import Hyperparams as hp 14 | from modules import * 15 | from networks import TextEnc, AudioEnc, AudioDec, Attention, SSRN 16 | import tensorflow as tf 17 | from utils import * 18 | import sys 19 | 20 | 21 | class Graph: 22 | def __init__(self, num=1, mode="train"): 23 | ''' 24 | Args: 25 | num: Either 1 or 2. 1 for Text2Mel 2 for SSRN. 26 | mode: Either "train" or "synthesize". 27 | ''' 28 | # Load vocabulary 29 | self.char2idx, self.idx2char = load_vocab() 30 | 31 | # Set flag 32 | training = True if mode=="train" else False 33 | 34 | # Graph 35 | # Data Feeding 36 | ## L: Text. (B, N), int32 37 | ## mels: Reduced melspectrogram. (B, T/r, n_mels) float32 38 | ## mags: Magnitude. (B, T, n_fft//2+1) float32 39 | if mode=="train": 40 | self.L, self.mels, self.mags, self.fnames, self.num_batch = get_batch() 41 | self.prev_max_attentions = tf.ones(shape=(hp.B,), dtype=tf.int32) 42 | self.gts = tf.convert_to_tensor(guided_attention()) 43 | else: # Synthesize 44 | self.L = tf.placeholder(tf.int32, shape=(None, None)) 45 | self.mels = tf.placeholder(tf.float32, shape=(None, None, hp.n_mels)) 46 | self.prev_max_attentions = tf.placeholder(tf.int32, shape=(None,)) 47 | 48 | if num==1 or (not training): 49 | with tf.variable_scope("Text2Mel"): 50 | # Get S or decoder inputs. (B, T//r, n_mels) 51 | self.S = tf.concat((tf.zeros_like(self.mels[:, :1, :]), self.mels[:, :-1, :]), 1) 52 | 53 | # Networks 54 | with tf.variable_scope("TextEnc"): 55 | self.K, self.V = TextEnc(self.L, training=training) # (N, Tx, e) 56 | 57 | with tf.variable_scope("AudioEnc"): 58 | self.Q = AudioEnc(self.S, training=training) 59 | 60 | with tf.variable_scope("Attention"): 61 | # R: (B, T/r, 2d) 62 | # alignments: (B, N, T/r) 63 | # max_attentions: (B,) 64 | self.R, self.alignments, self.max_attentions = Attention(self.Q, self.K, self.V, 65 | mononotic_attention=(not training), 66 | prev_max_attentions=self.prev_max_attentions) 67 | with tf.variable_scope("AudioDec"): 68 | self.Y_logits, self.Y = AudioDec(self.R, training=training) # (B, T/r, n_mels) 69 | else: # num==2 & training. Note that during training, 70 | # the ground truth melspectrogram values are fed. 71 | with tf.variable_scope("SSRN"): 72 | self.Z_logits, self.Z = SSRN(self.mels, training=training) 73 | 74 | if not training: 75 | # During inference, the predicted melspectrogram values are fed. 
76 | with tf.variable_scope("SSRN"): 77 | self.Z_logits, self.Z = SSRN(self.Y, training=training) 78 | 79 | with tf.variable_scope("gs"): 80 | self.global_step = tf.Variable(0, name='global_step', trainable=False) 81 | 82 | if training: 83 | if num==1: # Text2Mel 84 | # mel L1 loss 85 | self.loss_mels = tf.reduce_mean(tf.abs(self.Y - self.mels)) 86 | 87 | # mel binary divergence loss 88 | self.loss_bd1 = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.Y_logits, labels=self.mels)) 89 | 90 | # guided_attention loss 91 | self.A = tf.pad(self.alignments, [(0, 0), (0, hp.max_N), (0, hp.max_T)], mode="CONSTANT", constant_values=-1.)[:, :hp.max_N, :hp.max_T] 92 | self.attention_masks = tf.to_float(tf.not_equal(self.A, -1)) 93 | self.loss_att = tf.reduce_sum(tf.abs(self.A * self.gts) * self.attention_masks) 94 | self.mask_sum = tf.reduce_sum(self.attention_masks) 95 | self.loss_att /= self.mask_sum 96 | 97 | # total loss 98 | self.loss = self.loss_mels + self.loss_bd1 + self.loss_att 99 | 100 | tf.summary.scalar('train/loss_mels', self.loss_mels) 101 | tf.summary.scalar('train/loss_bd1', self.loss_bd1) 102 | tf.summary.scalar('train/loss_att', self.loss_att) 103 | tf.summary.image('train/mel_gt', tf.expand_dims(tf.transpose(self.mels[:1], [0, 2, 1]), -1)) 104 | tf.summary.image('train/mel_hat', tf.expand_dims(tf.transpose(self.Y[:1], [0, 2, 1]), -1)) 105 | else: # SSRN 106 | # mag L1 loss 107 | self.loss_mags = tf.reduce_mean(tf.abs(self.Z - self.mags)) 108 | 109 | # mag binary divergence loss 110 | self.loss_bd2 = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.Z_logits, labels=self.mags)) 111 | 112 | # total loss 113 | self.loss = self.loss_mags + self.loss_bd2 114 | 115 | tf.summary.scalar('train/loss_mags', self.loss_mags) 116 | tf.summary.scalar('train/loss_bd2', self.loss_bd2) 117 | tf.summary.image('train/mag_gt', tf.expand_dims(tf.transpose(self.mags[:1], [0, 2, 1]), -1)) 118 | tf.summary.image('train/mag_hat', tf.expand_dims(tf.transpose(self.Z[:1], [0, 2, 1]), -1)) 119 | 120 | # Training Scheme 121 | self.lr = learning_rate_decay(hp.lr, self.global_step) 122 | self.optimizer = tf.train.AdamOptimizer(learning_rate=self.lr) 123 | tf.summary.scalar("lr", self.lr) 124 | 125 | ## gradient clipping 126 | self.gvs = self.optimizer.compute_gradients(self.loss) 127 | self.clipped = [] 128 | for grad, var in self.gvs: 129 | grad = tf.clip_by_value(grad, -1., 1.) 130 | self.clipped.append((grad, var)) 131 | self.train_op = self.optimizer.apply_gradients(self.clipped, global_step=self.global_step) 132 | 133 | # Summary 134 | self.merged = tf.summary.merge_all() 135 | 136 | 137 | if __name__ == '__main__': 138 | # argument: 1 or 2. 1 for Text2mel, 2 for SSRN. 
139 | num = int(sys.argv[1]) 140 | 141 | g = Graph(num=num); print("Training Graph loaded") 142 | 143 | logdir = hp.logdir + "-" + str(num) 144 | sv = tf.train.Supervisor(logdir=logdir, save_model_secs=0, global_step=g.global_step) 145 | with sv.managed_session() as sess: 146 | while 1: 147 | for _ in tqdm(range(g.num_batch), total=g.num_batch, ncols=70, leave=False, unit='b'): 148 | gs, _ = sess.run([g.global_step, g.train_op]) 149 | 150 | # Write checkpoint files at every 1k steps 151 | if gs % 1000 == 0: 152 | sv.saver.save(sess, logdir + '/model_gs_{}'.format(str(gs // 1000).zfill(3) + "k")) 153 | 154 | if num==1: 155 | # plot alignment 156 | alignments = sess.run(g.alignments) 157 | plot_alignment(alignments[0], str(gs // 1000).zfill(3) + "k", logdir) 158 | 159 | # break 160 | if gs > hp.num_iterations: break 161 | 162 | print("Done") 163 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python2 3 | ''' 4 | By kyubyong park. kbpark.linguist@gmail.com. 5 | https://www.github.com/kyubyong/dc_tts 6 | ''' 7 | from __future__ import print_function, division 8 | 9 | import numpy as np 10 | import librosa 11 | import os, copy 12 | import matplotlib 13 | matplotlib.use('pdf') 14 | import matplotlib.pyplot as plt 15 | from scipy import signal 16 | 17 | from hyperparams import Hyperparams as hp 18 | import tensorflow as tf 19 | 20 | def get_spectrograms(fpath): 21 | '''Parse the wave file in `fpath` and 22 | Returns normalized melspectrogram and linear spectrogram. 23 | 24 | Args: 25 | fpath: A string. The full path of a sound file. 26 | 27 | Returns: 28 | mel: A 2d array of shape (T, n_mels) and dtype of float32. 29 | mag: A 2d array of shape (T, 1+n_fft/2) and dtype of float32. 30 | ''' 31 | # Loading sound file 32 | y, sr = librosa.load(fpath, sr=hp.sr) 33 | 34 | # Trimming 35 | y, _ = librosa.effects.trim(y) 36 | 37 | # Preemphasis 38 | y = np.append(y[0], y[1:] - hp.preemphasis * y[:-1]) 39 | 40 | # stft 41 | linear = librosa.stft(y=y, 42 | n_fft=hp.n_fft, 43 | hop_length=hp.hop_length, 44 | win_length=hp.win_length) 45 | 46 | # magnitude spectrogram 47 | mag = np.abs(linear) # (1+n_fft//2, T) 48 | 49 | # mel spectrogram 50 | mel_basis = librosa.filters.mel(hp.sr, hp.n_fft, hp.n_mels) # (n_mels, 1+n_fft//2) 51 | mel = np.dot(mel_basis, mag) # (n_mels, t) 52 | 53 | # to decibel 54 | mel = 20 * np.log10(np.maximum(1e-5, mel)) 55 | mag = 20 * np.log10(np.maximum(1e-5, mag)) 56 | 57 | # normalize 58 | mel = np.clip((mel - hp.ref_db + hp.max_db) / hp.max_db, 1e-8, 1) 59 | mag = np.clip((mag - hp.ref_db + hp.max_db) / hp.max_db, 1e-8, 1) 60 | 61 | # Transpose 62 | mel = mel.T.astype(np.float32) # (T, n_mels) 63 | mag = mag.T.astype(np.float32) # (T, 1+n_fft//2) 64 | 65 | return mel, mag 66 | 67 | def spectrogram2wav(mag): 68 | '''# Generate wave file from linear magnitude spectrogram 69 | 70 | Args: 71 | mag: A numpy array of (T, 1+n_fft//2) 72 | 73 | Returns: 74 | wav: A 1-D numpy array. 
75 | ''' 76 | # transpose 77 | mag = mag.T 78 | 79 | # de-normalize 80 | mag = (np.clip(mag, 0, 1) * hp.max_db) - hp.max_db + hp.ref_db 81 | 82 | # to amplitude 83 | mag = np.power(10.0, mag * 0.05) 84 | 85 | # wav reconstruction 86 | wav = griffin_lim(mag**hp.power) 87 | 88 | # de-preemphasis 89 | wav = signal.lfilter([1], [1, -hp.preemphasis], wav) 90 | 91 | # trim 92 | wav, _ = librosa.effects.trim(wav) 93 | 94 | return wav.astype(np.float32) 95 | 96 | def griffin_lim(spectrogram): 97 | '''Applies the Griffin-Lim algorithm to recover phase from a magnitude spectrogram.''' 98 | X_best = copy.deepcopy(spectrogram) 99 | for i in range(hp.n_iter): 100 | X_t = invert_spectrogram(X_best) 101 | est = librosa.stft(X_t, hp.n_fft, hp.hop_length, win_length=hp.win_length) 102 | phase = est / np.maximum(1e-8, np.abs(est)) 103 | X_best = spectrogram * phase 104 | X_t = invert_spectrogram(X_best) 105 | y = np.real(X_t) 106 | 107 | return y 108 | 109 | def invert_spectrogram(spectrogram): 110 | '''Applies the inverse STFT. 111 | Args: 112 | spectrogram: [1+n_fft//2, t] 113 | ''' 114 | return librosa.istft(spectrogram, hp.hop_length, win_length=hp.win_length, window="hann") 115 | 116 | def plot_alignment(alignment, gs, dir=hp.logdir): 117 | """Plots the alignment. 118 | 119 | Args: 120 | alignment: A numpy array with shape of (encoder_steps, decoder_steps) 121 | gs: (int) global step. 122 | dir: Output path. 123 | """ 124 | if not os.path.exists(dir): os.mkdir(dir) 125 | 126 | fig, ax = plt.subplots() 127 | im = ax.imshow(alignment) 128 | 129 | fig.colorbar(im) 130 | plt.title('{} Steps'.format(gs)) 131 | plt.savefig('{}/alignment_{}.png'.format(dir, gs), format='png') 132 | plt.close(fig) 133 | 134 | def guided_attention(g=0.2): 135 | '''Guided attention. Refer to page 3 of the paper.''' 136 | W = np.zeros((hp.max_N, hp.max_T), dtype=np.float32) 137 | for n_pos in range(W.shape[0]): 138 | for t_pos in range(W.shape[1]): 139 | W[n_pos, t_pos] = 1 - np.exp(-(t_pos / float(hp.max_T) - n_pos / float(hp.max_N)) ** 2 / (2 * g * g)) 140 | return W 141 | 142 | def learning_rate_decay(init_lr, global_step, warmup_steps = 4000.0): 143 | '''Noam scheme from tensor2tensor''' 144 | step = tf.to_float(global_step + 1) 145 | return init_lr * warmup_steps**0.5 * tf.minimum(step * warmup_steps**-1.5, step**-0.5) 146 | 147 | def load_spectrograms(fpath): 148 | '''Reads the wave file in `fpath` 149 | and extracts spectrograms''' 150 | 151 | fname = os.path.basename(fpath) 152 | mel, mag = get_spectrograms(fpath) 153 | t = mel.shape[0] 154 | 155 | # Marginal padding for reduction shape sync. 156 | num_paddings = hp.r - (t % hp.r) if t % hp.r != 0 else 0 157 | mel = np.pad(mel, [[0, num_paddings], [0, 0]], mode="constant") 158 | mag = np.pad(mag, [[0, num_paddings], [0, 0]], mode="constant") 159 | 160 | # Reduction 161 | mel = mel[::hp.r, :] 162 | return fname, mel, mag 163 | 164 | --------------------------------------------------------------------------------
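As a quick sanity check of the signal-processing utilities in `utils.py`, the following minimal sketch round-trips a single recording through `get_spectrograms` and `spectrogram2wav`; the paths `sample.wav` and `reconstructed.wav` are only illustrative placeholders and are not part of the repo:

```python
# Hypothetical usage sketch for utils.py (not part of the original repo).
# "sample.wav" and "reconstructed.wav" are illustrative placeholder paths.
from scipy.io.wavfile import write

from hyperparams import Hyperparams as hp
from utils import get_spectrograms, spectrogram2wav

mel, mag = get_spectrograms("sample.wav")  # mel: (T, n_mels), mag: (T, 1+n_fft//2), both scaled to [0, 1]
wav = spectrogram2wav(mag)                 # Griffin-Lim reconstruction from the magnitude spectrogram
write("reconstructed.wav", hp.sr, wav)     # same write pattern synthesize.py uses for its outputs
```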