├── LICENSE ├── README.md ├── data_load.py ├── fig ├── aaa ├── attention.gif └── training_curves.png ├── harvard_sentences.txt ├── hyperparams.py ├── modules.py ├── networks.py ├── prepo.py ├── synthesize.py ├── train.py └── utils.py /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # A TensorFlow Implementation of DC-TTS: yet another text-to-speech model 2 | 3 | I implement yet another text-to-speech model, dc-tts, introduced in [Efficiently Trainable Text-to-Speech System Based on Deep Convolutional Networks with Guided Attention](https://arxiv.org/abs/1710.08969). My goal, however, is not just replicating the paper. Rather, I'd like to gain insights into various sound projects. 4 | 5 | ## Requirements 6 | * NumPy >= 1.11.1 7 | * TensorFlow >= 1.3 (Note that the API of `tf.contrib.layers.layer_norm` has changed since 1.3) 8 | * librosa 9 | * tqdm 10 | * matplotlib 11 | * scipy 12 | 13 | ## Data 14 | 15 | 16 | 17 | 18 | 19 | 20 | I train English models and a Korean model on four different speech datasets.

1. [LJ Speech Dataset](https://keithito.com/LJ-Speech-Dataset/)
2. [Nick Offerman's Audiobooks](https://www.audible.com.au/search?searchNarrator=Nick+Offerman)
3. [Kate Winslet's Audiobook](https://www.audible.com.au/pd/Classics/Therese-Raquin-Audiobook/B00FF0SLW4/ref=a_search_c4_1_3_srTtl?qid=1516854754&sr=1-3)
4. [KSS Dataset](https://kaggle.com/bryanpark/korean-single-speaker-speech-dataset) 21 | 22 | The LJ Speech Dataset has recently become a widely used benchmark for TTS because it is publicly available and contains 24 hours of reasonable-quality samples. 23 | Nick's and Kate's audiobooks are additionally used to see whether the model can learn even from smaller amounts of more variable speech. They are 18 hours and 5 hours long, respectively. Finally, the KSS Dataset is a Korean single-speaker speech dataset of more than 12 hours. 24 | 25 | 26 | ## Training 27 | * STEP 0. Download [LJ Speech Dataset](https://keithito.com/LJ-Speech-Dataset/) or prepare your own data. 28 | * STEP 1. Adjust hyperparameters in `hyperparams.py`. (If you want to do preprocessing, set `prepro` to True.) 29 | * STEP 2. Run `python train.py 1` to train Text2Mel. (If you set `prepro` to True, run `python prepro.py` first.) 30 | * STEP 3. Run `python train.py 2` to train SSRN. 31 | 32 | You can do STEPs 2 and 3 at the same time if you have more than one GPU card. 33 | 34 | ## Training Curves 35 | 36 | 37 | 38 | ## Attention Plot 39 | 40 | 41 | ## Sample Synthesis 42 | I generate speech samples based on the [Harvard Sentences](http://www.cs.columbia.edu/~hgs/audio/harvard.html), as the original paper does. The sentence list is already included in the repo as `harvard_sentences.txt`. 43 | 44 | * Run `python synthesize.py` and check the generated files in `samples`. 45 | 46 | ## Generated Samples 47 | 48 | | Dataset | Samples | 49 | | :----- |:-------------| 50 | | LJ | [50k](https://soundcloud.com/kyubyong-park/sets/dc_tts) [200k](https://soundcloud.com/kyubyong-park/sets/dc_tts_lj_200k) [310k](https://soundcloud.com/kyubyong-park/sets/dc_tts_lj_310k) [800k](https://soundcloud.com/kyubyong-park/sets/dc_tts_lj_800k)| 51 | | Nick | [40k](https://soundcloud.com/kyubyong-park/sets/dc_tts_nick_40k) [170k](https://soundcloud.com/kyubyong-park/sets/dc_tts_nick_170k) [300k](https://soundcloud.com/kyubyong-park/sets/dc_tts_nick_300k) [800k](https://soundcloud.com/kyubyong-park/sets/dc_tts_nick_800k)| 52 | | Kate | [40k](https://soundcloud.com/kyubyong-park/sets/dc_tts_kate_40k) [160k](https://soundcloud.com/kyubyong-park/sets/dc_tts_kate_160k) [300k](https://soundcloud.com/kyubyong-park/sets/dc_tts_kate_300k) [800k](https://soundcloud.com/kyubyong-park/sets/dc_tts_kate_800k) | 53 | | KSS | [400k](https://soundcloud.com/kyubyong-park/sets/dc_tts_ko_400k) | 54 | 55 | ## Pretrained Model for LJ 56 | 57 | Download [this](https://www.dropbox.com/s/1oyipstjxh2n5wo/LJ_logdir.tar?dl=0). 58 | 59 | ## Notes 60 | 61 | * The paper didn't mention normalization, but without normalization I couldn't get it to work. So I added layer normalization. 62 | * The paper fixed the learning rate to 0.001, but that didn't work for me. So I decayed it (see the sketch after these notes). 63 | * I tried to train Text2Mel and SSRN simultaneously, but it didn't work. I guess separating the two networks mitigates the burden of training. 64 | * The authors claimed that the model can be trained within a day, but unfortunately I was not so lucky. Still, it is obviously much faster than Tacotron, as it uses only convolution layers. 65 | * Thanks to the guided attention, the attention plot looks monotonic almost from the beginning. I guess it holds the alignment tight so it won't lose track. 66 | * The paper didn't mention dropout. I applied it, as I believe it helps with regularization. 67 | * Also check other TTS models such as [Tacotron](https://github.com/kyubyong/tacotron) and [Deep Voice 3](https://github.com/kyubyong/deepvoice3).
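For reference, the two tweaks above (the learning-rate decay and the guided attention) are implemented in `utils.py` as `learning_rate_decay` and `guided_attention`. The snippet below is only a minimal NumPy sketch that mirrors those formulas outside of TensorFlow, assuming the default hyperparameters (`warmup_steps=4000`, `max_N=180`, `max_T=210`, `g=0.2`):

```python
import numpy as np

# Noam-style decay used instead of the paper's fixed 0.001
# (mirrors learning_rate_decay in utils.py).
def noam_lr(step, init_lr=0.001, warmup_steps=4000.0):
    step = float(step + 1)
    return init_lr * warmup_steps ** 0.5 * min(step * warmup_steps ** -1.5, step ** -0.5)

# Guided-attention penalty matrix W (mirrors guided_attention in utils.py):
# the penalty grows as attention drifts away from the diagonal, which keeps
# the alignment roughly monotonic from early in training.
def guided_attention_weights(max_N=180, max_T=210, g=0.2):
    n = np.arange(max_N)[:, None] / float(max_N)  # normalized text positions
    t = np.arange(max_T)[None, :] / float(max_T)  # normalized mel-frame positions
    return 1.0 - np.exp(-(t - n) ** 2 / (2 * g * g))
```

In `train.py`, this W is multiplied element-wise with the attention matrix and averaged over the unpadded positions to form `loss_att`, which is added to the mel L1 and binary-divergence losses.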
68 | 69 | -------------------------------------------------------------------------------- /data_load.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python2 3 | ''' 4 | By kyubyong park. kbpark.linguist@gmail.com. 5 | https://www.github.com/kyubyong/dc_tts 6 | ''' 7 | 8 | from __future__ import print_function 9 | 10 | from hyperparams import Hyperparams as hp 11 | import numpy as np 12 | import tensorflow as tf 13 | from utils import * 14 | import codecs 15 | import re 16 | import os 17 | import unicodedata 18 | 19 | def load_vocab(): 20 | char2idx = {char: idx for idx, char in enumerate(hp.vocab)} 21 | idx2char = {idx: char for idx, char in enumerate(hp.vocab)} 22 | return char2idx, idx2char 23 | 24 | def text_normalize(text): 25 | text = ''.join(char for char in unicodedata.normalize('NFD', text) 26 | if unicodedata.category(char) != 'Mn') # Strip accents 27 | 28 | text = text.lower() 29 | text = re.sub("[^{}]".format(hp.vocab), " ", text) 30 | text = re.sub("[ ]+", " ", text) 31 | return text 32 | 33 | def load_data(mode="train"): 34 | '''Loads data 35 | Args: 36 | mode: "train" or "synthesize". 37 | ''' 38 | # Load vocabulary 39 | char2idx, idx2char = load_vocab() 40 | 41 | if mode=="train": 42 | if "LJ" in hp.data: 43 | # Parse 44 | fpaths, text_lengths, texts = [], [], [] 45 | transcript = os.path.join(hp.data, 'transcript.csv') 46 | lines = codecs.open(transcript, 'r', 'utf-8').readlines() 47 | for line in lines: 48 | fname, _, text = line.strip().split("|") 49 | 50 | fpath = os.path.join(hp.data, "wavs", fname + ".wav") 51 | fpaths.append(fpath) 52 | 53 | text = text_normalize(text) + "E" # E: EOS 54 | text = [char2idx[char] for char in text] 55 | text_lengths.append(len(text)) 56 | texts.append(np.array(text, np.int32).tostring()) 57 | 58 | return fpaths, text_lengths, texts 59 | else: # nick or kate 60 | # Parse 61 | fpaths, text_lengths, texts = [], [], [] 62 | transcript = os.path.join(hp.data, 'transcript.csv') 63 | lines = codecs.open(transcript, 'r', 'utf-8').readlines() 64 | for line in lines: 65 | fname, _, text, is_inside_quotes, duration = line.strip().split("|") 66 | duration = float(duration) 67 | if duration > 10. : continue 68 | 69 | fpath = os.path.join(hp.data, fname) 70 | fpaths.append(fpath) 71 | 72 | text += "E" # E: EOS 73 | text = [char2idx[char] for char in text] 74 | text_lengths.append(len(text)) 75 | texts.append(np.array(text, np.int32).tostring()) 76 | 77 | return fpaths, text_lengths, texts 78 | 79 | else: # synthesize on unseen test text. 
80 | # Parse 81 | lines = codecs.open(hp.test_data, 'r', 'utf-8').readlines()[1:] 82 | sents = [text_normalize(line.split(" ", 1)[-1]).strip() + "E" for line in lines] # text normalization, E: EOS 83 | texts = np.zeros((len(sents), hp.max_N), np.int32) 84 | for i, sent in enumerate(sents): 85 | texts[i, :len(sent)] = [char2idx[char] for char in sent] 86 | return texts 87 | 88 | def get_batch(): 89 | """Loads training data and put them in queues""" 90 | with tf.device('/cpu:0'): 91 | # Load data 92 | fpaths, text_lengths, texts = load_data() # list 93 | maxlen, minlen = max(text_lengths), min(text_lengths) 94 | 95 | # Calc total batch count 96 | num_batch = len(fpaths) // hp.B 97 | 98 | # Create Queues 99 | fpath, text_length, text = tf.train.slice_input_producer([fpaths, text_lengths, texts], shuffle=True) 100 | 101 | # Parse 102 | text = tf.decode_raw(text, tf.int32) # (None,) 103 | 104 | if hp.prepro: 105 | def _load_spectrograms(fpath): 106 | fname = os.path.basename(fpath) 107 | mel = "mels/{}".format(fname.replace("wav", "npy")) 108 | mag = "mags/{}".format(fname.replace("wav", "npy")) 109 | return fname, np.load(mel), np.load(mag) 110 | 111 | fname, mel, mag = tf.py_func(_load_spectrograms, [fpath], [tf.string, tf.float32, tf.float32]) 112 | else: 113 | fname, mel, mag = tf.py_func(load_spectrograms, [fpath], [tf.string, tf.float32, tf.float32]) # (None, n_mels) 114 | 115 | # Add shape information 116 | fname.set_shape(()) 117 | text.set_shape((None,)) 118 | mel.set_shape((None, hp.n_mels)) 119 | mag.set_shape((None, hp.n_fft//2+1)) 120 | 121 | # Batching 122 | _, (texts, mels, mags, fnames) = tf.contrib.training.bucket_by_sequence_length( 123 | input_length=text_length, 124 | tensors=[text, mel, mag, fname], 125 | batch_size=hp.B, 126 | bucket_boundaries=[i for i in range(minlen + 1, maxlen - 1, 20)], 127 | num_threads=8, 128 | capacity=hp.B*4, 129 | dynamic_pad=True) 130 | 131 | return texts, mels, mags, fnames, num_batch 132 | 133 | -------------------------------------------------------------------------------- /fig/aaa: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /fig/attention.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kyubyong/dc_tts/8b38110875920923343778ff959d01501323765e/fig/attention.gif -------------------------------------------------------------------------------- /fig/training_curves.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kyubyong/dc_tts/8b38110875920923343778ff959d01501323765e/fig/training_curves.png -------------------------------------------------------------------------------- /harvard_sentences.txt: -------------------------------------------------------------------------------- 1 | http://www.cs.columbia.edu/~hgs/audio/harvard.html 2 | 1. The birch canoe slid on the smooth planks. 3 | 2. Glue the sheet to the dark blue background. 4 | 3. It's easy to tell the depth of a well. 5 | 4. These days a chicken leg is a rare dish. 6 | 5. Rice is often served in round bowls. 7 | 6. The juice of lemons makes fine punch. 8 | 7. The box was thrown beside the parked truck. 9 | 8. The hogs were fed chopped corn and garbage. 10 | 9. Four hours of steady work faced us. 11 | 10. Large size in stockings is hard to sell. 12 | 11. The boy was there when the sun rose. 13 | 12. 
A rod is used to catch pink salmon. 14 | 13. The source of the huge river is the clear spring. 15 | 14. Kick the ball straight and follow through. 16 | 15. Help the woman get back to her feet. 17 | 16. A pot of tea helps to pass the evening. 18 | 17. Smoky fires lack flame and heat. 19 | 18. The soft cushion broke the man's fall. 20 | 19. The salt breeze came across from the sea. 21 | 20. The girl at the booth sold fifty bonds. 22 | -------------------------------------------------------------------------------- /hyperparams.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python2 3 | ''' 4 | By kyubyong park. kbpark.linguist@gmail.com. 5 | https://www.github.com/kyubyong/dc_tts 6 | ''' 7 | class Hyperparams: 8 | '''Hyper parameters''' 9 | # pipeline 10 | prepro = True # if True, run `python prepro.py` first before running `python train.py`. 11 | 12 | # signal processing 13 | sr = 22050 # Sampling rate. 14 | n_fft = 2048 # fft points (samples) 15 | frame_shift = 0.0125 # seconds 16 | frame_length = 0.05 # seconds 17 | hop_length = int(sr * frame_shift) # samples. =275. 18 | win_length = int(sr * frame_length) # samples. =1102. 19 | n_mels = 80 # Number of Mel banks to generate 20 | power = 1.5 # Exponent for amplifying the predicted magnitude 21 | n_iter = 50 # Number of inversion iterations 22 | preemphasis = .97 23 | max_db = 100 24 | ref_db = 20 25 | 26 | # Model 27 | r = 4 # Reduction factor. Do not change this. 28 | dropout_rate = 0.05 29 | e = 128 # == embedding 30 | d = 256 # == hidden units of Text2Mel 31 | c = 512 # == hidden units of SSRN 32 | attention_win_size = 3 33 | 34 | # data 35 | data = "/data/private/voice/LJSpeech-1.0" 36 | # data = "/data/private/voice/kate" 37 | test_data = 'harvard_sentences.txt' 38 | vocab = "PE abcdefghijklmnopqrstuvwxyz'.?" # P: Padding, E: EOS. 39 | max_N = 180 # Maximum number of characters. 40 | max_T = 210 # Maximum number of mel frames. 41 | 42 | # training scheme 43 | lr = 0.001 # Initial learning rate. 44 | logdir = "logdir/LJ01" 45 | sampledir = 'samples' 46 | B = 32 # batch size 47 | num_iterations = 2000000 48 | -------------------------------------------------------------------------------- /modules.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python2 3 | ''' 4 | By kyubyong park. kbpark.linguist@gmail.com. 5 | https://www.github.com/kyubyong/dc_tts 6 | ''' 7 | 8 | from __future__ import print_function, division 9 | 10 | import tensorflow as tf 11 | 12 | 13 | def embed(inputs, vocab_size, num_units, zero_pad=True, scope="embedding", reuse=None): 14 | '''Embeds a given tensor. 15 | 16 | Args: 17 | inputs: A `Tensor` with type `int32` or `int64` containing the ids 18 | to be looked up in `lookup table`. 19 | vocab_size: An int. Vocabulary size. 20 | num_units: An int. Number of embedding hidden units. 21 | zero_pad: A boolean. If True, all the values of the first row (id 0) 22 | should be constant zeros. 23 | scope: Optional scope for `variable_scope`. 24 | reuse: Boolean, whether to reuse the weights of a previous layer 25 | by the same name. 26 | 27 | Returns: 28 | A `Tensor` with one more rank than that of `inputs`. The last dimensionality 29 | should be `num_units`.
30 | ''' 31 | with tf.variable_scope(scope, reuse=reuse): 32 | lookup_table = tf.get_variable('lookup_table', 33 | dtype=tf.float32, 34 | shape=[vocab_size, num_units], 35 | initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.1)) 36 | if zero_pad: 37 | lookup_table = tf.concat((tf.zeros(shape=[1, num_units]), 38 | lookup_table[1:, :]), 0) 39 | 40 | outputs = tf.nn.embedding_lookup(lookup_table, inputs) 41 | 42 | return outputs 43 | 44 | 45 | def normalize(inputs, 46 | scope="normalize", 47 | reuse=None): 48 | '''Applies layer normalization that normalizes along the last axis. 49 | 50 | Args: 51 | inputs: A tensor with 2 or more dimensions, where the first dimension has 52 | `batch_size`. The normalization is over the last dimension. 53 | scope: Optional scope for `variable_scope`. 54 | reuse: Boolean, whether to reuse the weights of a previous layer 55 | by the same name. 56 | 57 | Returns: 58 | A tensor with the same shape and data dtype as `inputs`. 59 | ''' 60 | outputs = tf.contrib.layers.layer_norm(inputs, 61 | begin_norm_axis=-1, 62 | scope=scope, 63 | reuse=reuse) 64 | return outputs 65 | 66 | 67 | def highwaynet(inputs, num_units=None, scope="highwaynet", reuse=None): 68 | '''Highway networks, see https://arxiv.org/abs/1505.00387 69 | 70 | Args: 71 | inputs: A 3D tensor of shape [N, T, W]. 72 | num_units: An int or `None`. Specifies the number of units in the highway layer 73 | or uses the input size if `None`. 74 | scope: Optional scope for `variable_scope`. 75 | reuse: Boolean, whether to reuse the weights of a previous layer 76 | by the same name. 77 | 78 | Returns: 79 | A 3D tensor of shape [N, T, W]. 80 | ''' 81 | if not num_units: 82 | num_units = inputs.get_shape()[-1] 83 | 84 | with tf.variable_scope(scope, reuse=reuse): 85 | H = tf.layers.dense(inputs, units=num_units, activation=tf.nn.relu, name="dense1") 86 | T = tf.layers.dense(inputs, units=num_units, activation=tf.nn.sigmoid, 87 | bias_initializer=tf.constant_initializer(-1.0), name="dense2") 88 | outputs = H * T + inputs * (1. - T) 89 | return outputs 90 | 91 | def conv1d(inputs, 92 | filters=None, 93 | size=1, 94 | rate=1, 95 | padding="SAME", 96 | dropout_rate=0, 97 | use_bias=True, 98 | activation_fn=None, 99 | training=True, 100 | scope="conv1d", 101 | reuse=None): 102 | ''' 103 | Args: 104 | inputs: A 3-D tensor with shape of [batch, time, depth]. 105 | filters: An int. Number of outputs (=activation maps) 106 | size: An int. Filter size. 107 | rate: An int. Dilation rate. 108 | padding: Either `same` or `valid` or `causal` (case-insensitive). 109 | dropout_rate: A float in [0, 1]. 110 | use_bias: A boolean. 111 | activation_fn: A function applied to the output, or None. 112 | training: A boolean. If True, dropout is applied. 113 | scope: Optional scope for `variable_scope`. 114 | reuse: Boolean, whether to reuse the weights of a previous layer 115 | by the same name. 116 | 117 | Returns: 118 | A 3-D tensor of shape [batch, time, filters].
119 | ''' 120 | with tf.variable_scope(scope): 121 | if padding.lower() == "causal": 122 | # pre-padding for causality 123 | pad_len = (size - 1) * rate # padding size 124 | inputs = tf.pad(inputs, [[0, 0], [pad_len, 0], [0, 0]]) 125 | padding = "valid" 126 | 127 | if filters is None: 128 | filters = inputs.get_shape().as_list()[-1] 129 | 130 | params = {"inputs": inputs, "filters": filters, "kernel_size": size, 131 | "dilation_rate": rate, "padding": padding, "use_bias": use_bias, 132 | "kernel_initializer": tf.contrib.layers.variance_scaling_initializer(), "reuse": reuse} 133 | 134 | tensor = tf.layers.conv1d(**params) 135 | tensor = normalize(tensor) 136 | if activation_fn is not None: 137 | tensor = activation_fn(tensor) 138 | 139 | tensor = tf.layers.dropout(tensor, rate=dropout_rate, training=training) 140 | 141 | return tensor 142 | 143 | def hc(inputs, 144 | filters=None, 145 | size=1, 146 | rate=1, 147 | padding="SAME", 148 | dropout_rate=0, 149 | use_bias=True, 150 | activation_fn=None, 151 | training=True, 152 | scope="hc", 153 | reuse=None): 154 | ''' 155 | Args: 156 | inputs: A 3-D tensor with shape of [batch, time, depth]. 157 | filters: An int. Number of outputs (=activation maps) 158 | size: An int. Filter size. 159 | rate: An int. Dilation rate. 160 | padding: Either `same` or `valid` or `causal` (case-insensitive). 161 | use_bias: A boolean. 162 | activation_fn: A function applied to the information path, or None. 163 | training: A boolean. If True, dropout is applied. 164 | scope: Optional scope for `variable_scope`. 165 | reuse: Boolean, whether to reuse the weights of a previous layer 166 | by the same name. 167 | 168 | Returns: 169 | A 3-D tensor of the same shape and dtype as `inputs`. 170 | ''' 171 | _inputs = inputs 172 | with tf.variable_scope(scope): 173 | if padding.lower() == "causal": 174 | # pre-padding for causality 175 | pad_len = (size - 1) * rate # padding size 176 | inputs = tf.pad(inputs, [[0, 0], [pad_len, 0], [0, 0]]) 177 | padding = "valid" 178 | 179 | if filters is None: 180 | filters = inputs.get_shape().as_list()[-1] 181 | 182 | 183 | params = {"inputs": inputs, "filters": 2*filters, "kernel_size": size, 184 | "dilation_rate": rate, "padding": padding, "use_bias": use_bias, 185 | "kernel_initializer": tf.contrib.layers.variance_scaling_initializer(), "reuse": reuse} 186 | 187 | tensor = tf.layers.conv1d(**params) 188 | H1, H2 = tf.split(tensor, 2, axis=-1) 189 | H1 = normalize(H1, scope="H1") 190 | H2 = normalize(H2, scope="H2") 191 | H1 = tf.nn.sigmoid(H1, "gate") 192 | H2 = activation_fn(H2, "info") if activation_fn is not None else H2 193 | tensor = H1*H2 + (1.-H1)*_inputs 194 | 195 | tensor = tf.layers.dropout(tensor, rate=dropout_rate, training=training) 196 | 197 | return tensor 198 | 199 | def conv1d_transpose(inputs, 200 | filters=None, 201 | size=3, 202 | stride=2, 203 | padding='same', 204 | dropout_rate=0, 205 | use_bias=True, 206 | activation=None, 207 | training=True, 208 | scope="conv1d_transpose", 209 | reuse=None): 210 | ''' 211 | Args: 212 | inputs: A 3-D tensor with shape of [batch, time, depth]. 213 | filters: An int. Number of outputs (=activation maps) 214 | size: An int. Filter size. 215 | stride: An int. Upsampling stride along the time axis. 216 | padding: Either `same` or `valid` (case-insensitive). 217 | dropout_rate: A float in [0, 1]. 218 | use_bias: A boolean. 219 | activation: A function applied to the output, or None. 220 | training: A boolean. If True, dropout is applied. 221 | scope: Optional scope for `variable_scope`.
222 | reuse: Boolean, whether to reuse the weights of a previous layer 223 | by the same name. 224 | 225 | Returns: 226 | A tensor of the shape with [batch, time*2, depth]. 227 | ''' 228 | with tf.variable_scope(scope, reuse=reuse): 229 | if filters is None: 230 | filters = inputs.get_shape().as_list()[-1] 231 | inputs = tf.expand_dims(inputs, 1) 232 | tensor = tf.layers.conv2d_transpose(inputs, 233 | filters=filters, 234 | kernel_size=(1, size), 235 | strides=(1, stride), 236 | padding=padding, 237 | activation=None, 238 | kernel_initializer=tf.contrib.layers.variance_scaling_initializer(), 239 | use_bias=use_bias) 240 | tensor = tf.squeeze(tensor, 1) 241 | tensor = normalize(tensor) 242 | if activation is not None: 243 | tensor = activation(tensor) 244 | 245 | tensor = tf.layers.dropout(tensor, rate=dropout_rate, training=training) 246 | 247 | return tensor 248 | 249 | 250 | 251 | 252 | 253 | -------------------------------------------------------------------------------- /networks.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python2 3 | ''' 4 | By kyubyong park. kbpark.linguist@gmail.com. 5 | https://www.github.com/kyubyong/dc_tts 6 | ''' 7 | 8 | from __future__ import print_function 9 | 10 | from hyperparams import Hyperparams as hp 11 | from modules import * 12 | import tensorflow as tf 13 | 14 | def TextEnc(L, training=True): 15 | ''' 16 | Args: 17 | L: Text inputs. (B, N) 18 | 19 | Return: 20 | K: Keys. (B, N, d) 21 | V: Values. (B, N, d) 22 | ''' 23 | i = 1 24 | tensor = embed(L, 25 | vocab_size=len(hp.vocab), 26 | num_units=hp.e, 27 | scope="embed_{}".format(i)); i += 1 28 | tensor = conv1d(tensor, 29 | filters=2*hp.d, 30 | size=1, 31 | rate=1, 32 | dropout_rate=hp.dropout_rate, 33 | activation_fn=tf.nn.relu, 34 | training=training, 35 | scope="C_{}".format(i)); i += 1 36 | tensor = conv1d(tensor, 37 | size=1, 38 | rate=1, 39 | dropout_rate=hp.dropout_rate, 40 | training=training, 41 | scope="C_{}".format(i)); i += 1 42 | 43 | for _ in range(2): 44 | for j in range(4): 45 | tensor = hc(tensor, 46 | size=3, 47 | rate=3**j, 48 | dropout_rate=hp.dropout_rate, 49 | activation_fn=None, 50 | training=training, 51 | scope="HC_{}".format(i)); i += 1 52 | for _ in range(2): 53 | tensor = hc(tensor, 54 | size=3, 55 | rate=1, 56 | dropout_rate=hp.dropout_rate, 57 | activation_fn=None, 58 | training=training, 59 | scope="HC_{}".format(i)); i += 1 60 | 61 | for _ in range(2): 62 | tensor = hc(tensor, 63 | size=1, 64 | rate=1, 65 | dropout_rate=hp.dropout_rate, 66 | activation_fn=None, 67 | training=training, 68 | scope="HC_{}".format(i)); i += 1 69 | 70 | K, V = tf.split(tensor, 2, -1) 71 | return K, V 72 | 73 | def AudioEnc(S, training=True): 74 | ''' 75 | Args: 76 | S: melspectrogram. (B, T/r, n_mels) 77 | 78 | Returns 79 | Q: Queries. 
(B, T/r, d) 80 | ''' 81 | i = 1 82 | tensor = conv1d(S, 83 | filters=hp.d, 84 | size=1, 85 | rate=1, 86 | padding="CAUSAL", 87 | dropout_rate=hp.dropout_rate, 88 | activation_fn=tf.nn.relu, 89 | training=training, 90 | scope="C_{}".format(i)); i += 1 91 | tensor = conv1d(tensor, 92 | size=1, 93 | rate=1, 94 | padding="CAUSAL", 95 | dropout_rate=hp.dropout_rate, 96 | activation_fn=tf.nn.relu, 97 | training=training, 98 | scope="C_{}".format(i)); i += 1 99 | tensor = conv1d(tensor, 100 | size=1, 101 | rate=1, 102 | padding="CAUSAL", 103 | dropout_rate=hp.dropout_rate, 104 | training=training, 105 | scope="C_{}".format(i)); i += 1 106 | for _ in range(2): 107 | for j in range(4): 108 | tensor = hc(tensor, 109 | size=3, 110 | rate=3**j, 111 | padding="CAUSAL", 112 | dropout_rate=hp.dropout_rate, 113 | training=training, 114 | scope="HC_{}".format(i)); i += 1 115 | for _ in range(2): 116 | tensor = hc(tensor, 117 | size=3, 118 | rate=3, 119 | padding="CAUSAL", 120 | dropout_rate=hp.dropout_rate, 121 | training=training, 122 | scope="HC_{}".format(i)); i += 1 123 | 124 | return tensor 125 | 126 | def Attention(Q, K, V, mononotic_attention=False, prev_max_attentions=None): 127 | ''' 128 | Args: 129 | Q: Queries. (B, T/r, d) 130 | K: Keys. (B, N, d) 131 | V: Values. (B, N, d) 132 | mononotic_attention: A boolean. At training, it is False. 133 | prev_max_attentions: (B,). At training, it is set to None. 134 | 135 | Returns: 136 | R: [Context Vectors; Q]. (B, T/r, 2d) 137 | alignments: (B, N, T/r) 138 | max_attentions: (B, T/r) 139 | ''' 140 | A = tf.matmul(Q, K, transpose_b=True) * tf.rsqrt(tf.to_float(hp.d)) 141 | if mononotic_attention: # for inference 142 | key_masks = tf.sequence_mask(prev_max_attentions, hp.max_N) 143 | reverse_masks = tf.sequence_mask(hp.max_N - hp.attention_win_size - prev_max_attentions, hp.max_N)[:, ::-1] 144 | masks = tf.logical_or(key_masks, reverse_masks) 145 | masks = tf.tile(tf.expand_dims(masks, 1), [1, hp.max_T, 1]) 146 | paddings = tf.ones_like(A) * (-2 ** 32 + 1) # (B, T/r, N) 147 | A = tf.where(tf.equal(masks, False), A, paddings) 148 | A = tf.nn.softmax(A) # (B, T/r, N) 149 | max_attentions = tf.argmax(A, -1) # (B, T/r) 150 | R = tf.matmul(A, V) 151 | R = tf.concat((R, Q), -1) 152 | 153 | alignments = tf.transpose(A, [0, 2, 1]) # (B, N, T/r) 154 | 155 | return R, alignments, max_attentions 156 | 157 | def AudioDec(R, training=True): 158 | ''' 159 | Args: 160 | R: [Context Vectors; Q]. (B, T/r, 2d) 161 | 162 | Returns: 163 | Y: Melspectrogram predictions. 
(B, T/r, n_mels) 164 | ''' 165 | 166 | i = 1 167 | tensor = conv1d(R, 168 | filters=hp.d, 169 | size=1, 170 | rate=1, 171 | padding="CAUSAL", 172 | dropout_rate=hp.dropout_rate, 173 | training=training, 174 | scope="C_{}".format(i)); i += 1 175 | for j in range(4): 176 | tensor = hc(tensor, 177 | size=3, 178 | rate=3**j, 179 | padding="CAUSAL", 180 | dropout_rate=hp.dropout_rate, 181 | training=training, 182 | scope="HC_{}".format(i)); i += 1 183 | 184 | for _ in range(2): 185 | tensor = hc(tensor, 186 | size=3, 187 | rate=1, 188 | padding="CAUSAL", 189 | dropout_rate=hp.dropout_rate, 190 | training=training, 191 | scope="HC_{}".format(i)); i += 1 192 | for _ in range(3): 193 | tensor = conv1d(tensor, 194 | size=1, 195 | rate=1, 196 | padding="CAUSAL", 197 | dropout_rate=hp.dropout_rate, 198 | activation_fn=tf.nn.relu, 199 | training=training, 200 | scope="C_{}".format(i)); i += 1 201 | # mel_hats 202 | logits = conv1d(tensor, 203 | filters=hp.n_mels, 204 | size=1, 205 | rate=1, 206 | padding="CAUSAL", 207 | dropout_rate=hp.dropout_rate, 208 | training=training, 209 | scope="C_{}".format(i)); i += 1 210 | Y = tf.nn.sigmoid(logits) # mel_hats 211 | 212 | return logits, Y 213 | 214 | def SSRN(Y, training=True): 215 | ''' 216 | Args: 217 | Y: Melspectrogram Predictions. (B, T/r, n_mels) 218 | 219 | Returns: 220 | Z: Spectrogram Predictions. (B, T, 1+n_fft/2) 221 | ''' 222 | 223 | i = 1 # number of layers 224 | 225 | # -> (B, T/r, c) 226 | tensor = conv1d(Y, 227 | filters=hp.c, 228 | size=1, 229 | rate=1, 230 | dropout_rate=hp.dropout_rate, 231 | training=training, 232 | scope="C_{}".format(i)); i += 1 233 | for j in range(2): 234 | tensor = hc(tensor, 235 | size=3, 236 | rate=3**j, 237 | dropout_rate=hp.dropout_rate, 238 | training=training, 239 | scope="HC_{}".format(i)); i += 1 240 | for _ in range(2): 241 | # -> (B, T/2, c) -> (B, T, c) 242 | tensor = conv1d_transpose(tensor, 243 | scope="D_{}".format(i), 244 | dropout_rate=hp.dropout_rate, 245 | training=training,); i += 1 246 | for j in range(2): 247 | tensor = hc(tensor, 248 | size=3, 249 | rate=3**j, 250 | dropout_rate=hp.dropout_rate, 251 | training=training, 252 | scope="HC_{}".format(i)); i += 1 253 | # -> (B, T, 2*c) 254 | tensor = conv1d(tensor, 255 | filters=2*hp.c, 256 | size=1, 257 | rate=1, 258 | dropout_rate=hp.dropout_rate, 259 | training=training, 260 | scope="C_{}".format(i)); i += 1 261 | for _ in range(2): 262 | tensor = hc(tensor, 263 | size=3, 264 | rate=1, 265 | dropout_rate=hp.dropout_rate, 266 | training=training, 267 | scope="HC_{}".format(i)); i += 1 268 | # -> (B, T, 1+n_fft/2) 269 | tensor = conv1d(tensor, 270 | filters=1+hp.n_fft//2, 271 | size=1, 272 | rate=1, 273 | dropout_rate=hp.dropout_rate, 274 | training=training, 275 | scope="C_{}".format(i)); i += 1 276 | 277 | for _ in range(2): 278 | tensor = conv1d(tensor, 279 | size=1, 280 | rate=1, 281 | dropout_rate=hp.dropout_rate, 282 | activation_fn=tf.nn.relu, 283 | training=training, 284 | scope="C_{}".format(i)); i += 1 285 | logits = conv1d(tensor, 286 | size=1, 287 | rate=1, 288 | dropout_rate=hp.dropout_rate, 289 | training=training, 290 | scope="C_{}".format(i)) 291 | Z = tf.nn.sigmoid(logits) 292 | return logits, Z 293 | -------------------------------------------------------------------------------- /prepo.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python2 3 | ''' 4 | By kyubyong park. kbpark.linguist@gmail.com. 
5 | https://www.github.com/kyubyong/dc_tts 6 | ''' 7 | 8 | from __future__ import print_function 9 | 10 | from utils import load_spectrograms 11 | import os 12 | from data_load import load_data 13 | import numpy as np 14 | import tqdm 15 | 16 | # Load data 17 | fpaths, _, _ = load_data() # list 18 | 19 | for fpath in tqdm.tqdm(fpaths): 20 | fname, mel, mag = load_spectrograms(fpath) 21 | if not os.path.exists("mels"): os.mkdir("mels") 22 | if not os.path.exists("mags"): os.mkdir("mags") 23 | 24 | np.save("mels/{}".format(fname.replace("wav", "npy")), mel) 25 | np.save("mags/{}".format(fname.replace("wav", "npy")), mag) -------------------------------------------------------------------------------- /synthesize.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # /usr/bin/python2 3 | ''' 4 | By kyubyong park. kbpark.linguist@gmail.com. 5 | https://www.github.com/kyubyong/dc_tts 6 | ''' 7 | 8 | from __future__ import print_function 9 | 10 | import os 11 | 12 | from hyperparams import Hyperparams as hp 13 | import numpy as np 14 | import tensorflow as tf 15 | from train import Graph 16 | from utils import * 17 | from data_load import load_data 18 | from scipy.io.wavfile import write 19 | from tqdm import tqdm 20 | 21 | def synthesize(): 22 | # Load data 23 | L = load_data("synthesize") 24 | 25 | # Load graph 26 | g = Graph(mode="synthesize"); print("Graph loaded") 27 | 28 | with tf.Session() as sess: 29 | sess.run(tf.global_variables_initializer()) 30 | 31 | # Restore parameters 32 | var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'Text2Mel') 33 | saver1 = tf.train.Saver(var_list=var_list) 34 | saver1.restore(sess, tf.train.latest_checkpoint(hp.logdir + "-1")) 35 | print("Text2Mel Restored!") 36 | 37 | var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'SSRN') + \ 38 | tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 'gs') 39 | saver2 = tf.train.Saver(var_list=var_list) 40 | saver2.restore(sess, tf.train.latest_checkpoint(hp.logdir + "-2")) 41 | print("SSRN Restored!") 42 | 43 | # Feed Forward 44 | ## mel 45 | Y = np.zeros((len(L), hp.max_T, hp.n_mels), np.float32) 46 | prev_max_attentions = np.zeros((len(L),), np.int32) 47 | for j in tqdm(range(hp.max_T)): 48 | _gs, _Y, _max_attentions, _alignments = \ 49 | sess.run([g.global_step, g.Y, g.max_attentions, g.alignments], 50 | {g.L: L, 51 | g.mels: Y, 52 | g.prev_max_attentions: prev_max_attentions}) 53 | Y[:, j, :] = _Y[:, j, :] 54 | prev_max_attentions = _max_attentions[:, j] 55 | 56 | # Get magnitude 57 | Z = sess.run(g.Z, {g.Y: Y}) 58 | 59 | # Generate wav files 60 | if not os.path.exists(hp.sampledir): os.makedirs(hp.sampledir) 61 | for i, mag in enumerate(Z): 62 | print("Working on file", i+1) 63 | wav = spectrogram2wav(mag) 64 | write(hp.sampledir + "/{}.wav".format(i+1), hp.sr, wav) 65 | 66 | if __name__ == '__main__': 67 | synthesize() 68 | print("Done") 69 | 70 | 71 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # /usr/bin/python2 3 | ''' 4 | By kyubyong park. kbpark.linguist@gmail.com. 
5 | https://www.github.com/kyubyong/dc_tts 6 | ''' 7 | 8 | from __future__ import print_function 9 | 10 | from tqdm import tqdm 11 | 12 | from data_load import get_batch, load_vocab 13 | from hyperparams import Hyperparams as hp 14 | from modules import * 15 | from networks import TextEnc, AudioEnc, AudioDec, Attention, SSRN 16 | import tensorflow as tf 17 | from utils import * 18 | import sys 19 | 20 | 21 | class Graph: 22 | def __init__(self, num=1, mode="train"): 23 | ''' 24 | Args: 25 | num: Either 1 or 2. 1 for Text2Mel 2 for SSRN. 26 | mode: Either "train" or "synthesize". 27 | ''' 28 | # Load vocabulary 29 | self.char2idx, self.idx2char = load_vocab() 30 | 31 | # Set flag 32 | training = True if mode=="train" else False 33 | 34 | # Graph 35 | # Data Feeding 36 | ## L: Text. (B, N), int32 37 | ## mels: Reduced melspectrogram. (B, T/r, n_mels) float32 38 | ## mags: Magnitude. (B, T, n_fft//2+1) float32 39 | if mode=="train": 40 | self.L, self.mels, self.mags, self.fnames, self.num_batch = get_batch() 41 | self.prev_max_attentions = tf.ones(shape=(hp.B,), dtype=tf.int32) 42 | self.gts = tf.convert_to_tensor(guided_attention()) 43 | else: # Synthesize 44 | self.L = tf.placeholder(tf.int32, shape=(None, None)) 45 | self.mels = tf.placeholder(tf.float32, shape=(None, None, hp.n_mels)) 46 | self.prev_max_attentions = tf.placeholder(tf.int32, shape=(None,)) 47 | 48 | if num==1 or (not training): 49 | with tf.variable_scope("Text2Mel"): 50 | # Get S or decoder inputs. (B, T//r, n_mels) 51 | self.S = tf.concat((tf.zeros_like(self.mels[:, :1, :]), self.mels[:, :-1, :]), 1) 52 | 53 | # Networks 54 | with tf.variable_scope("TextEnc"): 55 | self.K, self.V = TextEnc(self.L, training=training) # (N, Tx, e) 56 | 57 | with tf.variable_scope("AudioEnc"): 58 | self.Q = AudioEnc(self.S, training=training) 59 | 60 | with tf.variable_scope("Attention"): 61 | # R: (B, T/r, 2d) 62 | # alignments: (B, N, T/r) 63 | # max_attentions: (B,) 64 | self.R, self.alignments, self.max_attentions = Attention(self.Q, self.K, self.V, 65 | mononotic_attention=(not training), 66 | prev_max_attentions=self.prev_max_attentions) 67 | with tf.variable_scope("AudioDec"): 68 | self.Y_logits, self.Y = AudioDec(self.R, training=training) # (B, T/r, n_mels) 69 | else: # num==2 & training. Note that during training, 70 | # the ground truth melspectrogram values are fed. 71 | with tf.variable_scope("SSRN"): 72 | self.Z_logits, self.Z = SSRN(self.mels, training=training) 73 | 74 | if not training: 75 | # During inference, the predicted melspectrogram values are fed. 
76 | with tf.variable_scope("SSRN"): 77 | self.Z_logits, self.Z = SSRN(self.Y, training=training) 78 | 79 | with tf.variable_scope("gs"): 80 | self.global_step = tf.Variable(0, name='global_step', trainable=False) 81 | 82 | if training: 83 | if num==1: # Text2Mel 84 | # mel L1 loss 85 | self.loss_mels = tf.reduce_mean(tf.abs(self.Y - self.mels)) 86 | 87 | # mel binary divergence loss 88 | self.loss_bd1 = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.Y_logits, labels=self.mels)) 89 | 90 | # guided_attention loss 91 | self.A = tf.pad(self.alignments, [(0, 0), (0, hp.max_N), (0, hp.max_T)], mode="CONSTANT", constant_values=-1.)[:, :hp.max_N, :hp.max_T] 92 | self.attention_masks = tf.to_float(tf.not_equal(self.A, -1)) 93 | self.loss_att = tf.reduce_sum(tf.abs(self.A * self.gts) * self.attention_masks) 94 | self.mask_sum = tf.reduce_sum(self.attention_masks) 95 | self.loss_att /= self.mask_sum 96 | 97 | # total loss 98 | self.loss = self.loss_mels + self.loss_bd1 + self.loss_att 99 | 100 | tf.summary.scalar('train/loss_mels', self.loss_mels) 101 | tf.summary.scalar('train/loss_bd1', self.loss_bd1) 102 | tf.summary.scalar('train/loss_att', self.loss_att) 103 | tf.summary.image('train/mel_gt', tf.expand_dims(tf.transpose(self.mels[:1], [0, 2, 1]), -1)) 104 | tf.summary.image('train/mel_hat', tf.expand_dims(tf.transpose(self.Y[:1], [0, 2, 1]), -1)) 105 | else: # SSRN 106 | # mag L1 loss 107 | self.loss_mags = tf.reduce_mean(tf.abs(self.Z - self.mags)) 108 | 109 | # mag binary divergence loss 110 | self.loss_bd2 = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.Z_logits, labels=self.mags)) 111 | 112 | # total loss 113 | self.loss = self.loss_mags + self.loss_bd2 114 | 115 | tf.summary.scalar('train/loss_mags', self.loss_mags) 116 | tf.summary.scalar('train/loss_bd2', self.loss_bd2) 117 | tf.summary.image('train/mag_gt', tf.expand_dims(tf.transpose(self.mags[:1], [0, 2, 1]), -1)) 118 | tf.summary.image('train/mag_hat', tf.expand_dims(tf.transpose(self.Z[:1], [0, 2, 1]), -1)) 119 | 120 | # Training Scheme 121 | self.lr = learning_rate_decay(hp.lr, self.global_step) 122 | self.optimizer = tf.train.AdamOptimizer(learning_rate=self.lr) 123 | tf.summary.scalar("lr", self.lr) 124 | 125 | ## gradient clipping 126 | self.gvs = self.optimizer.compute_gradients(self.loss) 127 | self.clipped = [] 128 | for grad, var in self.gvs: 129 | grad = tf.clip_by_value(grad, -1., 1.) 130 | self.clipped.append((grad, var)) 131 | self.train_op = self.optimizer.apply_gradients(self.clipped, global_step=self.global_step) 132 | 133 | # Summary 134 | self.merged = tf.summary.merge_all() 135 | 136 | 137 | if __name__ == '__main__': 138 | # argument: 1 or 2. 1 for Text2mel, 2 for SSRN. 
139 | num = int(sys.argv[1]) 140 | 141 | g = Graph(num=num); print("Training Graph loaded") 142 | 143 | logdir = hp.logdir + "-" + str(num) 144 | sv = tf.train.Supervisor(logdir=logdir, save_model_secs=0, global_step=g.global_step) 145 | with sv.managed_session() as sess: 146 | while 1: 147 | for _ in tqdm(range(g.num_batch), total=g.num_batch, ncols=70, leave=False, unit='b'): 148 | gs, _ = sess.run([g.global_step, g.train_op]) 149 | 150 | # Write checkpoint files at every 1k steps 151 | if gs % 1000 == 0: 152 | sv.saver.save(sess, logdir + '/model_gs_{}'.format(str(gs // 1000).zfill(3) + "k")) 153 | 154 | if num==1: 155 | # plot alignment 156 | alignments = sess.run(g.alignments) 157 | plot_alignment(alignments[0], str(gs // 1000).zfill(3) + "k", logdir) 158 | 159 | # break 160 | if gs > hp.num_iterations: break 161 | 162 | print("Done") 163 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python2 3 | ''' 4 | By kyubyong park. kbpark.linguist@gmail.com. 5 | https://www.github.com/kyubyong/dc_tts 6 | ''' 7 | from __future__ import print_function, division 8 | 9 | import numpy as np 10 | import librosa 11 | import os, copy 12 | import matplotlib 13 | matplotlib.use('pdf') 14 | import matplotlib.pyplot as plt 15 | from scipy import signal 16 | 17 | from hyperparams import Hyperparams as hp 18 | import tensorflow as tf 19 | 20 | def get_spectrograms(fpath): 21 | '''Parse the wave file in `fpath` and 22 | Returns normalized melspectrogram and linear spectrogram. 23 | 24 | Args: 25 | fpath: A string. The full path of a sound file. 26 | 27 | Returns: 28 | mel: A 2d array of shape (T, n_mels) and dtype of float32. 29 | mag: A 2d array of shape (T, 1+n_fft/2) and dtype of float32. 30 | ''' 31 | # Loading sound file 32 | y, sr = librosa.load(fpath, sr=hp.sr) 33 | 34 | # Trimming 35 | y, _ = librosa.effects.trim(y) 36 | 37 | # Preemphasis 38 | y = np.append(y[0], y[1:] - hp.preemphasis * y[:-1]) 39 | 40 | # stft 41 | linear = librosa.stft(y=y, 42 | n_fft=hp.n_fft, 43 | hop_length=hp.hop_length, 44 | win_length=hp.win_length) 45 | 46 | # magnitude spectrogram 47 | mag = np.abs(linear) # (1+n_fft//2, T) 48 | 49 | # mel spectrogram 50 | mel_basis = librosa.filters.mel(hp.sr, hp.n_fft, hp.n_mels) # (n_mels, 1+n_fft//2) 51 | mel = np.dot(mel_basis, mag) # (n_mels, t) 52 | 53 | # to decibel 54 | mel = 20 * np.log10(np.maximum(1e-5, mel)) 55 | mag = 20 * np.log10(np.maximum(1e-5, mag)) 56 | 57 | # normalize 58 | mel = np.clip((mel - hp.ref_db + hp.max_db) / hp.max_db, 1e-8, 1) 59 | mag = np.clip((mag - hp.ref_db + hp.max_db) / hp.max_db, 1e-8, 1) 60 | 61 | # Transpose 62 | mel = mel.T.astype(np.float32) # (T, n_mels) 63 | mag = mag.T.astype(np.float32) # (T, 1+n_fft//2) 64 | 65 | return mel, mag 66 | 67 | def spectrogram2wav(mag): 68 | '''# Generate wave file from linear magnitude spectrogram 69 | 70 | Args: 71 | mag: A numpy array of (T, 1+n_fft//2) 72 | 73 | Returns: 74 | wav: A 1-D numpy array. 
75 | ''' 76 | # transpose 77 | mag = mag.T 78 | 79 | # de-normalize 80 | mag = (np.clip(mag, 0, 1) * hp.max_db) - hp.max_db + hp.ref_db 81 | 82 | # to amplitude 83 | mag = np.power(10.0, mag * 0.05) 84 | 85 | # wav reconstruction 86 | wav = griffin_lim(mag**hp.power) 87 | 88 | # de-preemphasis 89 | wav = signal.lfilter([1], [1, -hp.preemphasis], wav) 90 | 91 | # trim 92 | wav, _ = librosa.effects.trim(wav) 93 | 94 | return wav.astype(np.float32) 95 | 96 | def griffin_lim(spectrogram): 97 | '''Applies the Griffin-Lim algorithm to recover phase from a magnitude spectrogram.''' 98 | X_best = copy.deepcopy(spectrogram) 99 | for i in range(hp.n_iter): 100 | X_t = invert_spectrogram(X_best) 101 | est = librosa.stft(X_t, hp.n_fft, hp.hop_length, win_length=hp.win_length) 102 | phase = est / np.maximum(1e-8, np.abs(est)) 103 | X_best = spectrogram * phase 104 | X_t = invert_spectrogram(X_best) 105 | y = np.real(X_t) 106 | 107 | return y 108 | 109 | def invert_spectrogram(spectrogram): 110 | '''Applies the inverse STFT. 111 | Args: 112 | spectrogram: [1+n_fft//2, t] 113 | ''' 114 | return librosa.istft(spectrogram, hp.hop_length, win_length=hp.win_length, window="hann") 115 | 116 | def plot_alignment(alignment, gs, dir=hp.logdir): 117 | """Plots the alignment. 118 | 119 | Args: 120 | alignment: A numpy array with shape of (encoder_steps, decoder_steps) 121 | gs: (int) global step. 122 | dir: Output path. 123 | """ 124 | if not os.path.exists(dir): os.mkdir(dir) 125 | 126 | fig, ax = plt.subplots() 127 | im = ax.imshow(alignment) 128 | 129 | fig.colorbar(im) 130 | plt.title('{} Steps'.format(gs)) 131 | plt.savefig('{}/alignment_{}.png'.format(dir, gs), format='png') 132 | plt.close(fig) 133 | 134 | def guided_attention(g=0.2): 135 | '''Guided attention. Refer to page 3 of the paper.''' 136 | W = np.zeros((hp.max_N, hp.max_T), dtype=np.float32) 137 | for n_pos in range(W.shape[0]): 138 | for t_pos in range(W.shape[1]): 139 | W[n_pos, t_pos] = 1 - np.exp(-(t_pos / float(hp.max_T) - n_pos / float(hp.max_N)) ** 2 / (2 * g * g)) 140 | return W 141 | 142 | def learning_rate_decay(init_lr, global_step, warmup_steps = 4000.0): 143 | '''Noam scheme from tensor2tensor''' 144 | step = tf.to_float(global_step + 1) 145 | return init_lr * warmup_steps**0.5 * tf.minimum(step * warmup_steps**-1.5, step**-0.5) 146 | 147 | def load_spectrograms(fpath): 148 | '''Reads the wave file in `fpath` 149 | and extracts spectrograms''' 150 | 151 | fname = os.path.basename(fpath) 152 | mel, mag = get_spectrograms(fpath) 153 | t = mel.shape[0] 154 | 155 | # Marginal padding for reduction shape sync. 156 | num_paddings = hp.r - (t % hp.r) if t % hp.r != 0 else 0 157 | mel = np.pad(mel, [[0, num_paddings], [0, 0]], mode="constant") 158 | mag = np.pad(mag, [[0, num_paddings], [0, 0]], mode="constant") 159 | 160 | # Reduction 161 | mel = mel[::hp.r, :] 162 | return fname, mel, mag 163 | 164 | --------------------------------------------------------------------------------
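As a quick sanity check of the signal-processing utilities in `utils.py`, the following minimal sketch round-trips a single recording through `get_spectrograms` and `spectrogram2wav`; the paths `sample.wav` and `reconstructed.wav` are only illustrative placeholders and are not part of the repo:

```python
# Hypothetical usage sketch for utils.py (not part of the original repo).
# "sample.wav" and "reconstructed.wav" are illustrative placeholder paths.
from scipy.io.wavfile import write

from hyperparams import Hyperparams as hp
from utils import get_spectrograms, spectrogram2wav

mel, mag = get_spectrograms("sample.wav")  # mel: (T, n_mels), mag: (T, 1+n_fft//2), both scaled to [0, 1]
wav = spectrogram2wav(mag)                 # Griffin-Lim reconstruction from the magnitude spectrogram
write("reconstructed.wav", hp.sr, wav)     # same write pattern synthesize.py uses for its outputs
```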