├── tvars_ssrn.csv ├── prepo.py ├── tvars_text2mel.csv ├── synthesize.py ├── README.md ├── hyperparams.py ├── utils.py ├── data_load.py ├── synth_dctts.ipynb ├── train_transfer.py ├── modules.py └── networks.py /tvars_ssrn.csv: -------------------------------------------------------------------------------- 1 | SSRN/C_1/ 2 | SSRN/HC_2/ 3 | SSRN/HC_3/ 4 | SSRN/D_4/ 5 | SSRN/HC_5/ 6 | SSRN/HC_6/ 7 | SSRN/D_7/ 8 | SSRN/HC_8/ 9 | SSRN/HC_9/ 10 | SSRN/C_10/ 11 | SSRN/HC_11/ 12 | SSRN/HC_12/ 13 | SSRN/C_13/ 14 | SSRN/C_14/ 15 | SSRN/C_15/ 16 | SSRN/C_16/ 17 | -------------------------------------------------------------------------------- /prepo.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python2 3 | ''' 4 | By kyubyong park. kbpark.linguist@gmail.com. 5 | https://www.github.com/kyubyong/dc_tts 6 | ''' 7 | 8 | from __future__ import print_function 9 | 10 | from utils import load_spectrograms 11 | import os 12 | from data_load import load_data 13 | import numpy as np 14 | import tqdm 15 | 16 | # Load data 17 | fpaths, _, _ = load_data() # list 18 | 19 | for fpath in tqdm.tqdm(fpaths): 20 | fname, mel, mag = load_spectrograms(fpath) 21 | if not os.path.exists("/output/mels"): os.mkdir("/output/mels") 22 | if not os.path.exists("/output/mags"): os.mkdir("/output/mags") 23 | 24 | np.save("/output/mels/{}".format(fname.replace("wav", "npy")), mel) 25 | np.save("/output/mags/{}".format(fname.replace("wav", "npy")), mag) -------------------------------------------------------------------------------- /tvars_text2mel.csv: -------------------------------------------------------------------------------- 1 | Text2Mel/TextEnc/embed_1/ 2 | Text2Mel/TextEnc/C_2/ 3 | Text2Mel/TextEnc/C_3/ 4 | Text2Mel/TextEnc/HC_4/ 5 | Text2Mel/TextEnc/HC_5/ 6 | Text2Mel/TextEnc/HC_6/ 7 | Text2Mel/TextEnc/HC_7/ 8 | Text2Mel/TextEnc/HC_8/ 9 | Text2Mel/TextEnc/HC_9/ 10 | Text2Mel/TextEnc/HC_10/ 11 | Text2Mel/TextEnc/HC_11/ 12 | Text2Mel/TextEnc/HC_12/ 13 | Text2Mel/TextEnc/HC_13/ 14 | Text2Mel/TextEnc/HC_14/ 15 | Text2Mel/TextEnc/HC_15/ 16 | Text2Mel/AudioEnc/C_1/ 17 | Text2Mel/AudioEnc/C_2/ 18 | Text2Mel/AudioEnc/C_3/ 19 | Text2Mel/AudioEnc/HC_4/ 20 | Text2Mel/AudioEnc/HC_5/ 21 | Text2Mel/AudioEnc/HC_6/ 22 | Text2Mel/AudioEnc/HC_7/ 23 | Text2Mel/AudioEnc/HC_8/ 24 | Text2Mel/AudioEnc/HC_9/ 25 | Text2Mel/AudioEnc/HC_10/ 26 | Text2Mel/AudioEnc/HC_11/ 27 | Text2Mel/AudioEnc/HC_12/ 28 | Text2Mel/AudioEnc/HC_13/ 29 | Text2Mel/AudioDec/C_1/ 30 | Text2Mel/AudioDec/HC_2/ 31 | Text2Mel/AudioDec/HC_3/ 32 | Text2Mel/AudioDec/HC_4/ 33 | Text2Mel/AudioDec/HC_5/ 34 | Text2Mel/AudioDec/HC_6/ 35 | Text2Mel/AudioDec/HC_7/ 36 | Text2Mel/AudioDec/C_8/ 37 | Text2Mel/AudioDec/C_9/ 38 | Text2Mel/AudioDec/C_10/ 39 | Text2Mel/AudioDec/C_11/ 40 | -------------------------------------------------------------------------------- /synthesize.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # /usr/bin/python2 3 | ''' 4 | By kyubyong park. kbpark.linguist@gmail.com. 5 | https://www.github.com/kyubyong/dc_tts 6 | 7 | Modified by sean leary. 
learysean1@hotmail.com 8 | https://github.com/SeanPLeary/dc_tts-transfer-learning 9 | ''' 10 | 11 | from __future__ import print_function 12 | 13 | import os 14 | 15 | from hyperparams import Hyperparams as hp 16 | import numpy as np 17 | import tensorflow as tf 18 | from train_transfer import Graph 19 | from utils import * 20 | from data_load import load_data 21 | from scipy.io.wavfile import write 22 | from tqdm import tqdm 23 | 24 | def synthesize(): 25 | # Load data 26 | L = load_data("synthesize") 27 | 28 | # Load graph 29 | g = Graph(mode="synthesize"); print("Graph loaded") 30 | 31 | with tf.Session() as sess: 32 | sess.run(tf.global_variables_initializer()) 33 | 34 | # Restore parameters 35 | var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'Text2Mel') 36 | saver1 = tf.train.Saver(var_list=var_list) 37 | saver1.restore(sess, tf.train.latest_checkpoint(hp.restoredir + "-1")) 38 | print("Text2Mel Restored!") 39 | 40 | var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'SSRN') + \ 41 | tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 'gs') 42 | saver2 = tf.train.Saver(var_list=var_list) 43 | saver2.restore(sess, tf.train.latest_checkpoint(hp.restoredir + "-2")) 44 | print("SSRN Restored!") 45 | 46 | # Feed Forward 47 | ## mel 48 | Y = np.zeros((len(L), hp.max_T, hp.n_mels), np.float32) 49 | prev_max_attentions = np.zeros((len(L),), np.int32) 50 | for j in tqdm(range(hp.max_T)): 51 | _gs, _Y, _max_attentions, _alignments = \ 52 | sess.run([g.global_step, g.Y, g.max_attentions, g.alignments], 53 | {g.L: L, 54 | g.mels: Y, 55 | g.prev_max_attentions: prev_max_attentions}) 56 | Y[:, j, :] = _Y[:, j, :] 57 | prev_max_attentions = _max_attentions[:, j] 58 | 59 | # Get magnitude 60 | Z = sess.run(g.Z, {g.Y: Y}) 61 | 62 | # Generate wav files 63 | if not os.path.exists(hp.sampledir): os.makedirs(hp.sampledir) 64 | for i, mag in enumerate(Z): 65 | print("Working on file", i+1) 66 | wav = spectrogram2wav(mag) 67 | write(hp.sampledir + "/{}.wav".format(i+1), hp.sr, wav) 68 | 69 | if __name__ == '__main__': 70 | synthesize() 71 | print("Done") 72 | 73 | 74 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # dc_tts-transfer-learning 2 | 3 | This repo contains attempts to apply transfer learning to the dc_tts text-to-speech model described in the paper [Efficiently Trainable Text-to-Speech System Based on Deep Convolutional Networks with Guided Attention](https://arxiv.org/abs/1710.08969). The code used is a modified version of [Kyubyong's dc_tts code](https://github.com/Kyubyong/dc_tts). The [pretrained model](https://www.dropbox.com/s/1oyipstjxh2n5wo/LJ_logdir.tar?dl=0) was also provided in Kyubyong's repo. It was pretrained on the [LJ Speech Dataset](https://keithito.com/LJ-Speech-Dataset/). A Scarlett Johansson voice model was then trained via transfer learning. 4 | 5 | --- 6 | Transfer learning is accomplished by selecting the model layers to train in `hyperparams.py`; the sketch below shows the mechanism.
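In `train_transfer.py` each entry of `hp.selected_tvars` is treated as a variable-scope prefix: a variable is trained only if its name contains one of the listed prefixes, so every layer removed from the list stays frozen at its pre-trained LJSpeech value. A minimal sketch of that selection step (mirroring the repo's own code; `loss` and `optimizer` come from the training graph):

```python
import tensorflow as tf
from hyperparams import Hyperparams as hp

# After the graph is built, keep only the variables whose scope
# name contains one of the selected prefixes.
tvars = tf.trainable_variables()
tvars_new = [var for prefix in hp.selected_tvars
             for var in tvars if prefix in var.name]

# Gradients are then computed for the selected variables only:
# gvs = optimizer.compute_gradients(loss, var_list=tvars_new)
```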
7 | 8 | --- 9 | 10 | Task List: 11 | - [x] add selectable list of layers for transfer learning 12 | - [x] prelim model training 13 | - [ ] add scoring history plots 14 | - [ ] detailed exploration of which layers to train 15 | - [ ] explore data augmentation methods 16 | - [ ] explore post-processing 17 | 18 | ## Prelim Model Training 19 | - ~6 hrs of training on a Tesla V100 GPU 20 | - Layers trained (training commands sketched below): 21 | - SSRN(C_13, C_14, C_15, C_16) 22 | - Text2Mel/TextEnc(HC_11, HC_12, HC_13, HC_14, HC_15) 23 | - Text2Mel/AudioEnc(HC_9, HC_10, HC_11, HC_12, HC_13) 24 | - Text2Mel/AudioDec(HC_7, C_8, C_9, C_10, C_11)
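The run itself follows the usual two-stage dc_tts recipe; roughly (script names from this repo, data and output paths as configured in `hyperparams.py`, `prepro = True` assumed):

- `python prepo.py`: cache mel/mag spectrograms to disk
- `python train_transfer.py 1`: fine-tune the selected Text2Mel layers
- `python train_transfer.py 2`: fine-tune the selected SSRN layers
- `python synthesize.py`: synthesize the sentences in `test_sentences.txt` into `hp.sampledir`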
25 | 26 | ## Transfer learning data source: 27 | 28 | 29 | Scarlett Johansson's [audio book](https://www.audible.com/pd/The-Dive-from-Clausens-Pier-Audiobook/B002V0KPWK?qid=1551367970&sr=1-1&ref=a_search_c3_lProduct_1_1&pf_rd_p=e81b7c27-6880-467a-b5a7-13cef5d729fe&pf_rd_r=J8MM430KH9YH8AF9JZ81&) 30 | 31 | 32 | ## Model Generated Examples (parodies of famous quotes from A.I. in movies): 33 | - [Greetings Professor Falken Shall We Play A Game](https://soundcloud.com/seanleary/greetings-professor-falken-shall-we-play-a-game) 34 | - [I'm Sorry Dave I'm Afraid I Can't Do That](https://soundcloud.com/seanleary/im-sorry-dave-im-afraid-i-cant-do-that) 35 | - [I Do Not Stand By In The Presence Of Evil](https://soundcloud.com/seanleary/i-do-not-stand-by-in-the-presence-of-evil) 36 | - [The Most Versatile Substance On The Planet And They Used It To Make A Frisbee](https://soundcloud.com/seanleary/the-most-versatile-substance-on-the-planet-and-they-used-it-to-make-a-frisbee) 37 | - [The First Ten Million Years Were The Worst And The Second Ten Million Years They Were The Worst Too](https://soundcloud.com/seanleary/the-first-ten-million-years-were-the-worst-and-the-second-ten-million-years-they-were-the-worst-too) 38 | - [I Honestly Think You Ought To Sit Down Calmly Take A Stress Pill And Think Things Over](https://soundcloud.com/seanleary/i-honestly-think-you-ought-to-sit-down-calmly-take-a-stress-pill-and-think-things-over) 39 | - [A Strange Game The Only Winning Move Is Not To Play](https://soundcloud.com/seanleary/a-strange-game-the-only-winning-move-is-not-to-play) 40 | - [The Game Has Changed Son Of Flynn](https://soundcloud.com/seanleary/the-game-has-changed-son-of-flynn) 41 | - [Greetings Programs](https://soundcloud.com/seanleary/greetings-programs) 42 | - [You Shouldn't Have Come Back Flynn](https://soundcloud.com/seanleary/you-shouldnt-have-come-back-flynn) 43 | 44 | 45 | 46 | 47 | 48 | references: 49 | - [Efficiently Trainable Text-to-Speech System Based on Deep Convolutional Networks with Guided Attention](https://arxiv.org/abs/1710.08969) 50 | - [Kyubyong's dc_tts repo](https://github.com/Kyubyong/dc_tts) 51 | - [Exploring Transfer Learning for Low Resource Emotional TTS](https://www.researchgate.net/publication/330382963_Exploring_Transfer_Learning_for_Low_Resource_Emotional_TTS) 52 | 53 | -------------------------------------------------------------------------------- /hyperparams.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python2 3 | ''' 4 | By kyubyong park. kbpark.linguist@gmail.com. 5 | https://www.github.com/kyubyong/dc_tts 6 | 7 | Modified by sean leary. learysean1@hotmail.com 8 | https://github.com/SeanPLeary/dc_tts-transfer-learning 9 | ''' 10 | class Hyperparams: 11 | '''Hyperparameters''' 12 | # pipeline 13 | prepro = True # if True, run `python prepo.py` first before running `python train_transfer.py`. 14 | 15 | # signal processing 16 | sr = 22050 # Sampling rate. 17 | n_fft = 2048 # fft points (samples) 18 | frame_shift = 0.0125 # seconds 19 | frame_length = 0.05 # seconds 20 | hop_length = int(sr * frame_shift) # samples. =276. 21 | win_length = int(sr * frame_length) # samples. =1102. 22 | n_mels = 80 # Number of Mel banks to generate 23 | power = 1.5 # Exponent for amplifying the predicted magnitude 24 | n_iter = 50 # Number of inversion iterations 25 | preemphasis = 0.97 26 | max_db = 100 27 | ref_db = 20 28 | 29 | # Model 30 | r = 4 # Reduction factor. Do not change this. 31 | dropout_rate = 0.05 32 | e = 128 # == embedding 33 | d = 256 # == hidden units of Text2Mel 34 | c = 512 # == hidden units of SSRN 35 | attention_win_size = 3 36 | 37 | # data 38 | #data = "/data/private/voice/LJSpeech-1.0" 39 | data = "/data/private/voice/scarlett" 40 | test_data = 'test_sentences.txt' 41 | vocab = "PE abcdefghijklmnopqrstuvwxyz'.?" # P: Padding, E: EOS. 42 | max_N = 180 # Maximum number of characters. 43 | max_T = 210 # Maximum number of mel frames. 44 | 45 | # training scheme 46 | lr = 0.001 # Initial learning rate. 47 | logdir = "/output/logdir/scarjo" 48 | #restoredir = "/output2/logdir/scarjo" 49 | restoredir = "/output2/logdir/LJ01" # location of pre-trained LJSpeech-1.0 model w/ checkpoint 50 | sampledir = '/output/samples' 51 | B = 32 52 | num_iterations = 2000000 53 | 54 | # select the trainable layers for transfer learning (i.e. remove the layers you want to keep frozen during transfer learning) 55 | selected_tvars = [ 56 | 'SSRN/C_1/', 57 | 'SSRN/HC_2/', 58 | 'SSRN/HC_3/', 59 | 'SSRN/D_4/', 60 | 'SSRN/HC_5/', 61 | 'SSRN/HC_6/', 62 | 'SSRN/D_7/', 63 | 'SSRN/HC_8/', 64 | 'SSRN/HC_9/', 65 | 'SSRN/C_10/', 66 | 'SSRN/HC_11/', 67 | 'SSRN/HC_12/', 68 | 'SSRN/C_13/', 69 | 'SSRN/C_14/', 70 | 'SSRN/C_15/', 71 | 'SSRN/C_16/', 72 | 'Text2Mel/TextEnc/embed_1/', 73 | 'Text2Mel/TextEnc/C_2/', 74 | 'Text2Mel/TextEnc/C_3/', 75 | 'Text2Mel/TextEnc/HC_4/', 76 | 'Text2Mel/TextEnc/HC_5/', 77 | 'Text2Mel/TextEnc/HC_6/', 78 | 'Text2Mel/TextEnc/HC_7/', 79 | 'Text2Mel/TextEnc/HC_8/', 80 | 'Text2Mel/TextEnc/HC_9/', 81 | 'Text2Mel/TextEnc/HC_10/', 82 | 'Text2Mel/TextEnc/HC_11/', 83 | 'Text2Mel/TextEnc/HC_12/', 84 | 'Text2Mel/TextEnc/HC_13/', 85 | 'Text2Mel/TextEnc/HC_14/', 86 | 'Text2Mel/TextEnc/HC_15/', 87 | 'Text2Mel/AudioEnc/C_1/', 88 | 'Text2Mel/AudioEnc/C_2/', 89 | 'Text2Mel/AudioEnc/C_3/', 90 | 'Text2Mel/AudioEnc/HC_4/', 91 | 'Text2Mel/AudioEnc/HC_5/', 92 | 'Text2Mel/AudioEnc/HC_6/', 93 | 'Text2Mel/AudioEnc/HC_7/', 94 | 'Text2Mel/AudioEnc/HC_8/', 95 | 'Text2Mel/AudioEnc/HC_9/', 96 | 'Text2Mel/AudioEnc/HC_10/', 97 | 'Text2Mel/AudioEnc/HC_11/', 98 | 'Text2Mel/AudioEnc/HC_12/', 99 | 'Text2Mel/AudioEnc/HC_13/', 100 | 'Text2Mel/AudioDec/C_1/', 101 | 'Text2Mel/AudioDec/HC_2/', 102 | 'Text2Mel/AudioDec/HC_3/', 103 | 'Text2Mel/AudioDec/HC_4/', 104 | 'Text2Mel/AudioDec/HC_5/', 105 | 'Text2Mel/AudioDec/HC_6/', 106 | 'Text2Mel/AudioDec/HC_7/', 107 | 'Text2Mel/AudioDec/C_8/', 108 | 'Text2Mel/AudioDec/C_9/', 109 | 'Text2Mel/AudioDec/C_10/', 110 | 'Text2Mel/AudioDec/C_11/' 111 | ] -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 
2 | #/usr/bin/python2 3 | ''' 4 | By kyubyong park. kbpark.linguist@gmail.com. 5 | https://www.github.com/kyubyong/dc_tts 6 | ''' 7 | from __future__ import print_function, division 8 | 9 | import numpy as np 10 | import librosa 11 | import os, copy 12 | import matplotlib 13 | matplotlib.use('pdf') 14 | import matplotlib.pyplot as plt 15 | from scipy import signal 16 | 17 | from hyperparams import Hyperparams as hp 18 | import tensorflow as tf 19 | 20 | def get_spectrograms(fpath): 21 | '''Parses the wave file in `fpath` and 22 | returns a normalized mel spectrogram and linear spectrogram. 23 | 24 | Args: 25 | fpath: A string. The full path of a sound file. 26 | 27 | Returns: 28 | mel: A 2d array of shape (T, n_mels) and dtype of float32. 29 | mag: A 2d array of shape (T, 1+n_fft/2) and dtype of float32. 30 | ''' 31 | # Loading sound file 32 | y, sr = librosa.load(fpath, sr=hp.sr) 33 | 34 | # Trimming 35 | y, _ = librosa.effects.trim(y) 36 | 37 | # Preemphasis 38 | y = np.append(y[0], y[1:] - hp.preemphasis * y[:-1]) 39 | 40 | # stft 41 | linear = librosa.stft(y=y, 42 | n_fft=hp.n_fft, 43 | hop_length=hp.hop_length, 44 | win_length=hp.win_length) 45 | 46 | # magnitude spectrogram 47 | mag = np.abs(linear) # (1+n_fft//2, T) 48 | 49 | # mel spectrogram 50 | mel_basis = librosa.filters.mel(hp.sr, hp.n_fft, hp.n_mels) # (n_mels, 1+n_fft//2) 51 | mel = np.dot(mel_basis, mag) # (n_mels, t) 52 | 53 | # to decibel 54 | mel = 20 * np.log10(np.maximum(1e-5, mel)) 55 | mag = 20 * np.log10(np.maximum(1e-5, mag)) 56 | 57 | # normalize 58 | mel = np.clip((mel - hp.ref_db + hp.max_db) / hp.max_db, 1e-8, 1) 59 | mag = np.clip((mag - hp.ref_db + hp.max_db) / hp.max_db, 1e-8, 1) 60 | 61 | # Transpose 62 | mel = mel.T.astype(np.float32) # (T, n_mels) 63 | mag = mag.T.astype(np.float32) # (T, 1+n_fft//2) 64 | 65 | return mel, mag 66 | 67 | def spectrogram2wav(mag): 68 | '''Generates a wave file from a linear magnitude spectrogram. 69 | 70 | Args: 71 | mag: A numpy array of (T, 1+n_fft//2) 72 | 73 | Returns: 74 | wav: A 1-D numpy array. 75 | ''' 76 | # transpose 77 | mag = mag.T 78 | 79 | # de-normalize 80 | mag = (np.clip(mag, 0, 1) * hp.max_db) - hp.max_db + hp.ref_db 81 | 82 | # to amplitude 83 | mag = np.power(10.0, mag * 0.05) 84 | 85 | # wav reconstruction 86 | wav = griffin_lim(mag**hp.power) 87 | 88 | # de-preemphasis 89 | wav = signal.lfilter([1], [1, -hp.preemphasis], wav) 90 | 91 | # trim 92 | wav, _ = librosa.effects.trim(wav) 93 | 94 | return wav.astype(np.float32) 95 | 96 | def griffin_lim(spectrogram): 97 | '''Applies the Griffin-Lim algorithm.''' 98 | X_best = copy.deepcopy(spectrogram) 99 | for i in range(hp.n_iter): 100 | X_t = invert_spectrogram(X_best) 101 | est = librosa.stft(X_t, hp.n_fft, hp.hop_length, win_length=hp.win_length) 102 | phase = est / np.maximum(1e-8, np.abs(est)) 103 | X_best = spectrogram * phase 104 | X_t = invert_spectrogram(X_best) 105 | y = np.real(X_t) 106 | 107 | return y 108 | 109 | def invert_spectrogram(spectrogram): 110 | '''Applies the inverse STFT. 111 | Args: 112 | spectrogram: [1+n_fft//2, t] 113 | ''' 114 | return librosa.istft(spectrogram, hp.hop_length, win_length=hp.win_length, window="hann") 115 | 116 | def plot_alignment(alignment, gs, dir=hp.logdir): 117 | """Plots the alignment. 118 | 119 | Args: 120 | alignment: A numpy array with shape of (encoder_steps, decoder_steps) 121 | gs: (int) global step. 122 | dir: Output path. 
123 | """ 124 | if not os.path.exists(dir): os.mkdir(dir) 125 | 126 | fig, ax = plt.subplots() 127 | im = ax.imshow(alignment) 128 | 129 | fig.colorbar(im) 130 | plt.title('{} Steps'.format(gs)) 131 | plt.savefig('{}/alignment_{}.png'.format(dir, gs), format='png') 132 | plt.close(fig) 133 | 134 | def guided_attention(g=0.2): 135 | '''Guided attention. Refer to page 3 on the paper.''' 136 | W = np.zeros((hp.max_N, hp.max_T), dtype=np.float32) 137 | for n_pos in range(W.shape[0]): 138 | for t_pos in range(W.shape[1]): 139 | W[n_pos, t_pos] = 1 - np.exp(-(t_pos / float(hp.max_T) - n_pos / float(hp.max_N)) ** 2 / (2 * g * g)) 140 | return W 141 | 142 | def learning_rate_decay(init_lr, global_step, warmup_steps = 4000.0): 143 | '''Noam scheme from tensor2tensor''' 144 | step = tf.to_float(global_step + 1) 145 | return init_lr * warmup_steps**0.5 * tf.minimum(step * warmup_steps**-1.5, step**-0.5) 146 | 147 | def load_spectrograms(fpath): 148 | '''Read the wave file in `fpath` 149 | and extracts spectrograms''' 150 | 151 | fname = os.path.basename(fpath) 152 | mel, mag = get_spectrograms(fpath) 153 | t = mel.shape[0] 154 | 155 | # Marginal padding for reduction shape sync. 156 | num_paddings = hp.r - (t % hp.r) if t % hp.r != 0 else 0 157 | mel = np.pad(mel, [[0, num_paddings], [0, 0]], mode="constant") 158 | mag = np.pad(mag, [[0, num_paddings], [0, 0]], mode="constant") 159 | 160 | # Reduction 161 | mel = mel[::hp.r, :] 162 | return fname, mel, mag 163 | 164 | -------------------------------------------------------------------------------- /data_load.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python2 3 | ''' 4 | By kyubyong park. kbpark.linguist@gmail.com. 5 | https://www.github.com/kyubyong/dc_tts 6 | 7 | Modified by sean leary. learysean1@hotmail.com 8 | https://github.com/SeanPLeary/dc_tts-transfer-learning 9 | ''' 10 | 11 | from __future__ import print_function 12 | 13 | from hyperparams import Hyperparams as hp 14 | import numpy as np 15 | import tensorflow as tf 16 | from utils import * 17 | import codecs 18 | import re 19 | import os 20 | import unicodedata 21 | 22 | def load_vocab(): 23 | char2idx = {char: idx for idx, char in enumerate(hp.vocab)} 24 | idx2char = {idx: char for idx, char in enumerate(hp.vocab)} 25 | return char2idx, idx2char 26 | 27 | def text_normalize(text): 28 | text = ''.join(char for char in unicodedata.normalize('NFD', text) 29 | if unicodedata.category(char) != 'Mn') # Strip accents 30 | 31 | text = text.lower() 32 | text = re.sub("[^{}]".format(hp.vocab), " ", text) 33 | text = re.sub("[ ]+", " ", text) 34 | return text 35 | 36 | def load_data(mode="train"): 37 | '''Loads data 38 | Args: 39 | mode: "train" or "synthesize". 
40 | ''' 41 | # Load vocabulary 42 | char2idx, idx2char = load_vocab() 43 | 44 | if mode=="train": 45 | if "LJ" in hp.data: 46 | # Parse 47 | fpaths, text_lengths, texts = [], [], [] 48 | transcript = os.path.join(hp.data, 'transcript.csv') 49 | lines = codecs.open(transcript, 'r', 'utf-8').readlines() 50 | for line in lines: 51 | fname, _, text = line.strip().split("|") 52 | 53 | fpath = os.path.join(hp.data, "wavs", fname + ".wav") 54 | fpaths.append(fpath) 55 | 56 | text = text_normalize(text) + "E" # E: EOS 57 | text = [char2idx[char] for char in text] 58 | text_lengths.append(len(text)) 59 | texts.append(np.array(text, np.int32).tostring()) 60 | 61 | return fpaths, text_lengths, texts 62 | else: # nick or kate 63 | # Parse 64 | fpaths, text_lengths, texts = [], [], [] 65 | transcript = os.path.join(hp.data, 'transcript.csv') 66 | lines = codecs.open(transcript, 'r', 'utf-8').readlines() 67 | # for line in lines: 68 | # #fname, _, text, is_inside_quotes, duration = line.strip().split("|") 69 | # #duration = float(duration) 70 | # #if duration > 10. : continue 71 | # fname, text = line.strip().split("|") 72 | 73 | # fpath = os.path.join(hp.data, fname) 74 | # fpaths.append(fpath) 75 | 76 | # text += "E" # E: EOS 77 | # text = [char2idx[char] for char in text] 78 | # text_lengths.append(len(text)) 79 | # texts.append(np.array(text, np.int32).tostring()) 80 | for line in lines: 81 | #fname, _, text, is_inside_quotes, duration = line.strip().split("|") 82 | #duration = float(duration) 83 | #if duration > 10. : continue 84 | fname, text = line.strip().split("|") 85 | text = text.lower() 86 | text = text.replace('-', ' ') 87 | numbers = re.search('[0-9]+', text) 88 | test1 = re.search('&',text) 89 | 90 | if numbers is None and test1 is None: 91 | fpath = os.path.join(hp.data, fname) 92 | fpaths.append(fpath) 93 | 94 | text += "E" # E: EOS 95 | #text = text_normalize(text) + "E" # E: EOS 96 | text = [char2idx[char] for char in text] 97 | text_lengths.append(len(text)) 98 | texts.append(np.array(text, np.int32).tostring()) 99 | 100 | return fpaths, text_lengths, texts 101 | 102 | else: # synthesize on unseen test text. 
103 | # Parse 104 | lines = codecs.open(hp.test_data, 'r', 'utf-8').readlines()[1:] 105 | sents = [text_normalize(line.split(" ", 1)[-1]).strip() + "E" for line in lines] # text normalization, E: EOS 106 | texts = np.zeros((len(sents), hp.max_N), np.int32) 107 | for i, sent in enumerate(sents): 108 | texts[i, :len(sent)] = [char2idx[char] for char in sent] 109 | return texts 110 | 111 | def get_batch(): 112 | """Loads training data and puts them in queues""" 113 | with tf.device('/cpu:0'): 114 | # Load data 115 | fpaths, text_lengths, texts = load_data() # list 116 | maxlen, minlen = max(text_lengths), min(text_lengths) 117 | 118 | # Calc total batch count 119 | num_batch = len(fpaths) // hp.B 120 | 121 | # Create Queues 122 | fpath, text_length, text = tf.train.slice_input_producer([fpaths, text_lengths, texts], shuffle=True) 123 | 124 | # Parse 125 | text = tf.decode_raw(text, tf.int32) # (None,) 126 | 127 | if hp.prepro: 128 | def _load_spectrograms(fpath): 129 | fname = os.path.basename(fpath) 130 | #mel = "mels/{}".format(fname.replace("wav", "npy")) 131 | #mag = "mags/{}".format(fname.replace("wav", "npy")) 132 | mel = "/mels/{}".format(fname.decode("utf-8").replace("wav", "npy")) 133 | mag = "/mags/{}".format(fname.decode("utf-8").replace("wav", "npy")) 134 | return fname, np.load(mel), np.load(mag) 135 | 136 | fname, mel, mag = tf.py_func(_load_spectrograms, [fpath], [tf.string, tf.float32, tf.float32]) 137 | else: 138 | fname, mel, mag = tf.py_func(load_spectrograms, [fpath], [tf.string, tf.float32, tf.float32]) # (None, n_mels) 139 | 140 | # Add shape information 141 | fname.set_shape(()) 142 | text.set_shape((None,)) 143 | mel.set_shape((None, hp.n_mels)) 144 | mag.set_shape((None, hp.n_fft//2+1)) 145 | 146 | # Batching 147 | _, (texts, mels, mags, fnames) = tf.contrib.training.bucket_by_sequence_length( 148 | input_length=text_length, 149 | tensors=[text, mel, mag, fname], 150 | batch_size=hp.B, 151 | bucket_boundaries=[i for i in range(minlen + 1, maxlen - 1, 20)], 152 | num_threads=8, 153 | capacity=hp.B*4, 154 | dynamic_pad=True) 155 | 156 | return texts, mels, mags, fnames, num_batch 157 | 158 | -------------------------------------------------------------------------------- /synth_dctts.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Copy of synth_dctts.ipynb", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "collapsed_sections": [] 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | }, 15 | "accelerator": "TPU" 16 | }, 17 | "cells": [ 18 | { 19 | "metadata": { 20 | "id": "0lPC8qTVNmwv", 21 | "colab_type": "text" 22 | }, 23 | "cell_type": "markdown", 24 | "source": [ 25 | "# Synthesize speech from text\n", 26 | "- from a trained dc_tts model\n", 27 | "- code adapted from: https://github.com/Kyubyong/dc_tts" 28 | ] 29 | }, 30 | { 31 | "metadata": { 32 | "id": "e-cR95Zuoi0I", 33 | "colab_type": "text" 34 | }, 35 | "cell_type": "markdown", 36 | "source": [ 37 | "## Load dependencies" 38 | ] 39 | }, 40 | { 41 | "metadata": { 42 | "id": "oL_mIqoGLDcK", 43 | "colab_type": "code", 44 | "colab": {} 45 | }, 46 | "cell_type": "code", 47 | "source": [ 48 | "import numpy as np\n", 49 | "import tensorflow as tf\n", 50 | "from train_transfer import Graph\n", 51 | "from utils import *\n", 52 | "from data_load import load_data\n", 53 | "from scipy.io.wavfile import write, read\n", 54 | "from tqdm import tqdm\n", 55 | 
"import codecs\n", 56 | "import re\n", 57 | "import os\n", 58 | "import unicodedata\n", 59 | "from IPython.display import Audio\n", 60 | "import scipy.signal as sg" 61 | ], 62 | "execution_count": 0, 63 | "outputs": [] 64 | }, 65 | { 66 | "metadata": { 67 | "id": "8D5n_YHVtDSh", 68 | "colab_type": "text" 69 | }, 70 | "cell_type": "markdown", 71 | "source": [ 72 | "## Parameters" 73 | ] 74 | }, 75 | { 76 | "metadata": { 77 | "id": "p9sgsJNftLwX", 78 | "colab_type": "code", 79 | "colab": {} 80 | }, 81 | "cell_type": "code", 82 | "source": [ 83 | "hp_vocab = \"PE abcdefghijklmnopqrstuvwxyz'.?\" # P: Padding, E: EOS. \n", 84 | "hp_max_N = 180 # Maximum number of characters.\n", 85 | "hp_max_T = 210 # Maximum number of mel frames.\n", 86 | "hp_n_mels = 80 # Number of Mel banks to generate\n", 87 | "hp_restoredir = \"/content/gdrive/My Drive/dctts_colab/logdir/scarjo\"\n", 88 | "hp_sr = 22050\n" 89 | ], 90 | "execution_count": 0, 91 | "outputs": [] 92 | }, 93 | { 94 | "metadata": { 95 | "id": "7m7Vcl43N0hl", 96 | "colab_type": "text" 97 | }, 98 | "cell_type": "markdown", 99 | "source": [ 100 | "## Load models" 101 | ] 102 | }, 103 | { 104 | "metadata": { 105 | "id": "OXHDMJvWj1ZK", 106 | "colab_type": "code", 107 | "colab": {} 108 | }, 109 | "cell_type": "code", 110 | "source": [ 111 | " %%capture\n", 112 | " # Load graph\n", 113 | " g = Graph(mode=\"synthesize\")" 114 | ], 115 | "execution_count": 0, 116 | "outputs": [] 117 | }, 118 | { 119 | "metadata": { 120 | "id": "ySiENpbBMmd_", 121 | "colab_type": "text" 122 | }, 123 | "cell_type": "markdown", 124 | "source": [ 125 | "## Helper functions" 126 | ] 127 | }, 128 | { 129 | "metadata": { 130 | "id": "-aAZYciBpHxW", 131 | "colab_type": "code", 132 | "colab": {} 133 | }, 134 | "cell_type": "code", 135 | "source": [ 136 | "def load_vocab():\n", 137 | " char2idx = {char: idx for idx, char in enumerate(hp_vocab)}\n", 138 | " idx2char = {idx: char for idx, char in enumerate(hp_vocab)}\n", 139 | " return char2idx, idx2char\n", 140 | " \n", 141 | "def text_normalize(text):\n", 142 | " text = ''.join(char for char in unicodedata.normalize('NFD', text)\n", 143 | " if unicodedata.category(char) != 'Mn') # Strip accents\n", 144 | "\n", 145 | " text = text.lower()\n", 146 | " text = re.sub(\"[^{}]\".format(hp_vocab), \" \", text)\n", 147 | " text = re.sub(\"[ ]+\", \" \", text)\n", 148 | " return text" 149 | ], 150 | "execution_count": 0, 151 | "outputs": [] 152 | }, 153 | { 154 | "metadata": { 155 | "id": "7ip2NRq_NCQA", 156 | "colab_type": "code", 157 | "colab": {} 158 | }, 159 | "cell_type": "code", 160 | "source": [ 161 | "sents = [' a strange game. 
the only winning move is not to play.E']" 162 | ], 163 | "execution_count": 0, 164 | "outputs": [] 165 | }, 166 | { 167 | "metadata": { 168 | "id": "2wcIZUbmM7OK", 169 | "colab_type": "text" 170 | }, 171 | "cell_type": "markdown", 172 | "source": [ 173 | "## may have to add spaces to improve pronunciation (skip the normalization)" 174 | ] 175 | }, 176 | { 177 | "metadata": { 178 | "id": "1qQC3o6xrMFS", 179 | "colab_type": "code", 180 | "colab": {} 181 | }, 182 | "cell_type": "code", 183 | "source": [ 184 | "char2idx, idx2char = load_vocab()\n", 185 | "\n", 186 | "texts = np.zeros((len(sents), hp_max_N), np.int32)\n", 187 | "for i, sent in enumerate(sents):\n", 188 | " texts[i, :len(sent)] = [char2idx[char] for char in sent]\n", 189 | " \n", 190 | "L = texts" 191 | ], 192 | "execution_count": 0, 193 | "outputs": [] 194 | }, 195 | { 196 | "metadata": { 197 | "id": "ZjPncq6_sR8Y", 198 | "colab_type": "code", 199 | "colab": {} 200 | }, 201 | "cell_type": "code", 202 | "source": [ 203 | "\n", 204 | "with tf.Session() as sess:\n", 205 | " sess.run(tf.global_variables_initializer())\n", 206 | "\n", 207 | " # Restore parameters\n", 208 | " var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'Text2Mel')\n", 209 | " saver1 = tf.train.Saver(var_list=var_list)\n", 210 | " saver1.restore(sess, tf.train.latest_checkpoint(hp_restoredir + \"-1\"))\n", 211 | " print(\"Text2Mel Restored!\")\n", 212 | "\n", 213 | " var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'SSRN') + \\\n", 214 | " tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 'gs')\n", 215 | " saver2 = tf.train.Saver(var_list=var_list)\n", 216 | " saver2.restore(sess, tf.train.latest_checkpoint(hp_restoredir + \"-2\"))\n", 217 | " print(\"SSRN Restored!\")\n", 218 | "\n", 219 | " # Feed Forward\n", 220 | " ## mel\n", 221 | " Y = np.zeros((len(L), hp_max_T, hp_n_mels), np.float32)\n", 222 | " prev_max_attentions = np.zeros((len(L),), np.int32)\n", 223 | " for j in tqdm(range(hp_max_T)):\n", 224 | " _gs, _Y, _max_attentions, _alignments = \\\n", 225 | " sess.run([g.global_step, g.Y, g.max_attentions, g.alignments],\n", 226 | " {g.L: L,\n", 227 | " g.mels: Y,\n", 228 | " g.prev_max_attentions: prev_max_attentions})\n", 229 | " Y[:, j, :] = _Y[:, j, :]\n", 230 | " prev_max_attentions = _max_attentions[:, j]\n", 231 | "\n", 232 | " # Get magnitude\n", 233 | " Z = sess.run(g.Z, {g.Y: Y})\n", 234 | " mag = Z[0]\n", 235 | " wav = spectrogram2wav(mag)" 236 | ], 237 | "execution_count": 0, 238 | "outputs": [] 239 | }, 240 | { 241 | "metadata": { 242 | "id": "HGZDoC9Z8_u7", 243 | "colab_type": "code", 244 | "colab": {} 245 | }, 246 | "cell_type": "code", 247 | "source": [ 248 | "Audio(wav, rate=hp_sr)" 249 | ], 250 | "execution_count": 0, 251 | "outputs": [] 252 | }, 253 | { 254 | "metadata": { 255 | "id": "fLgN3JKYFPW1", 256 | "colab_type": "code", 257 | "colab": {} 258 | }, 259 | "cell_type": "code", 260 | "source": [ 261 | "b, a = sg.butter(4, 7300. / (hp_sr / 2.), 'low')\n", 262 | "wav_fil = sg.filtfilt(b, a, wav)\n", 263 | "Audio(wav_fil, rate=hp_sr)" 264 | ], 265 | "execution_count": 0, 266 | "outputs": [] 267 | } 268 | ] 269 | } -------------------------------------------------------------------------------- /train_transfer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # /usr/bin/python2 3 | ''' 4 | By kyubyong park. kbpark.linguist@gmail.com. 5 | https://www.github.com/kyubyong/dc_tts 6 | 7 | Modified by sean leary. 
learysean1@hotmail.com 8 | https://github.com/SeanPLeary/dc_tts-transfer-learning 9 | ''' 10 | 11 | from __future__ import print_function 12 | 13 | from tqdm import tqdm 14 | 15 | from data_load import get_batch, load_vocab 16 | from hyperparams import Hyperparams as hp 17 | from modules import * 18 | from networks import TextEnc, AudioEnc, AudioDec, Attention, SSRN 19 | import tensorflow as tf 20 | from utils import * 21 | import sys 22 | 23 | 24 | class Graph: 25 | def __init__(self, num=1, mode="train"): 26 | ''' 27 | Args: 28 | num: Either 1 or 2. 1 for Text2Mel 2 for SSRN. 29 | mode: Either "train" or "synthesize". 30 | ''' 31 | # Load vocabulary 32 | self.char2idx, self.idx2char = load_vocab() 33 | 34 | # Set flag 35 | training = True if mode=="train" else False 36 | 37 | # Graph 38 | # Data Feeding 39 | ## L: Text. (B, N), int32 40 | ## mels: Reduced melspectrogram. (B, T/r, n_mels) float32 41 | ## mags: Magnitude. (B, T, n_fft//2+1) float32 42 | if mode=="train": 43 | self.L, self.mels, self.mags, self.fnames, self.num_batch = get_batch() 44 | self.prev_max_attentions = tf.ones(shape=(hp.B,), dtype=tf.int32) 45 | self.gts = tf.convert_to_tensor(guided_attention()) 46 | else: # Synthesize 47 | self.L = tf.placeholder(tf.int32, shape=(None, None)) 48 | self.mels = tf.placeholder(tf.float32, shape=(None, None, hp.n_mels)) 49 | self.prev_max_attentions = tf.placeholder(tf.int32, shape=(None,)) 50 | 51 | if num==1 or (not training): 52 | with tf.variable_scope("Text2Mel"): 53 | # Get S or decoder inputs. (B, T//r, n_mels) 54 | self.S = tf.concat((tf.zeros_like(self.mels[:, :1, :]), self.mels[:, :-1, :]), 1) 55 | 56 | # Networks 57 | with tf.variable_scope("TextEnc"): 58 | self.K, self.V = TextEnc(self.L, training=training) # (N, Tx, e) 59 | 60 | with tf.variable_scope("AudioEnc"): 61 | self.Q = AudioEnc(self.S, training=training) 62 | 63 | with tf.variable_scope("Attention"): 64 | # R: (B, T/r, 2d) 65 | # alignments: (B, N, T/r) 66 | # max_attentions: (B,) 67 | self.R, self.alignments, self.max_attentions = Attention(self.Q, self.K, self.V, 68 | mononotic_attention=(not training), 69 | prev_max_attentions=self.prev_max_attentions) 70 | with tf.variable_scope("AudioDec"): 71 | self.Y_logits, self.Y = AudioDec(self.R, training=training) # (B, T/r, n_mels) 72 | else: # num==2 & training. Note that during training, 73 | # the ground truth melspectrogram values are fed. 74 | with tf.variable_scope("SSRN"): 75 | self.Z_logits, self.Z = SSRN(self.mels, training=training) 76 | 77 | if not training: 78 | # During inference, the predicted melspectrogram values are fed. 
79 | with tf.variable_scope("SSRN"): 80 | self.Z_logits, self.Z = SSRN(self.Y, training=training) 81 | 82 | with tf.variable_scope("gs"): 83 | self.global_step = tf.Variable(0, name='global_step', trainable=False) 84 | 85 | if training: 86 | if num==1: # Text2Mel 87 | # mel L1 loss 88 | self.loss_mels = tf.reduce_mean(tf.abs(self.Y - self.mels)) 89 | 90 | # mel binary divergence loss 91 | self.loss_bd1 = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.Y_logits, labels=self.mels)) 92 | 93 | # guided_attention loss 94 | self.A = tf.pad(self.alignments, [(0, 0), (0, hp.max_N), (0, hp.max_T)], mode="CONSTANT", constant_values=-1.)[:, :hp.max_N, :hp.max_T] 95 | self.attention_masks = tf.to_float(tf.not_equal(self.A, -1)) 96 | self.loss_att = tf.reduce_sum(tf.abs(self.A * self.gts) * self.attention_masks) 97 | self.mask_sum = tf.reduce_sum(self.attention_masks) 98 | self.loss_att /= self.mask_sum 99 | 100 | # total loss 101 | self.loss = self.loss_mels + self.loss_bd1 + self.loss_att 102 | 103 | tf.summary.scalar('train/loss_mels', self.loss_mels) 104 | tf.summary.scalar('train/loss_bd1', self.loss_bd1) 105 | tf.summary.scalar('train/loss_att', self.loss_att) 106 | tf.summary.image('train/mel_gt', tf.expand_dims(tf.transpose(self.mels[:1], [0, 2, 1]), -1)) 107 | tf.summary.image('train/mel_hat', tf.expand_dims(tf.transpose(self.Y[:1], [0, 2, 1]), -1)) 108 | else: # SSRN 109 | # mag L1 loss 110 | self.loss_mags = tf.reduce_mean(tf.abs(self.Z - self.mags)) 111 | 112 | # mag binary divergence loss 113 | self.loss_bd2 = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.Z_logits, labels=self.mags)) 114 | 115 | # total loss 116 | self.loss = self.loss_mags + self.loss_bd2 117 | 118 | tf.summary.scalar('train/loss_mags', self.loss_mags) 119 | tf.summary.scalar('train/loss_bd2', self.loss_bd2) 120 | tf.summary.image('train/mag_gt', tf.expand_dims(tf.transpose(self.mags[:1], [0, 2, 1]), -1)) 121 | tf.summary.image('train/mag_hat', tf.expand_dims(tf.transpose(self.Z[:1], [0, 2, 1]), -1)) 122 | 123 | # Training Scheme 124 | self.lr = learning_rate_decay(hp.lr, self.global_step) 125 | tvars = tf.trainable_variables() 126 | tvars_new = [] 127 | for tvar in hp.selected_tvars: 128 | tvars_new = tvars_new + [var for var in tvars if tvar in var.name] 129 | # tvars_new = [var for var in tvars if ('SSRN/C_13') in var.name] + \ 130 | # [var for var in tvars if ('SSRN/C_14') in var.name] + \ 131 | # [var for var in tvars if ('SSRN/C_15') in var.name] + \ 132 | # [var for var in tvars if ('SSRN/C_16') in var.name] + \ 133 | # [var for var in tvars if ('Text2Mel/TextEnc/HC_11') in var.name] + \ 134 | # [var for var in tvars if ('Text2Mel/TextEnc/HC_12') in var.name] + \ 135 | # [var for var in tvars if ('Text2Mel/TextEnc/HC_13') in var.name] + \ 136 | # [var for var in tvars if ('Text2Mel/TextEnc/HC_14') in var.name] + \ 137 | # [var for var in tvars if ('Text2Mel/TextEnc/HC_15') in var.name] + \ 138 | # [var for var in tvars if ('Text2Mel/AudioEnc/HC_9') in var.name] + \ 139 | # [var for var in tvars if ('Text2Mel/AudioEnc/HC_10') in var.name] + \ 140 | # [var for var in tvars if ('Text2Mel/AudioEnc/HC_11') in var.name] + \ 141 | # [var for var in tvars if ('Text2Mel/AudioEnc/HC_12') in var.name] + \ 142 | # [var for var in tvars if ('Text2Mel/AudioEnc/HC_13') in var.name] + \ 143 | # [var for var in tvars if ('Text2Mel/AudioDec/HC_7') in var.name] + \ 144 | # [var for var in tvars if ('Text2Mel/AudioDec/C_8') in var.name] + \ 145 | # [var for var in tvars if 
('Text2Mel/AudioDec/C_9') in var.name] + \ 146 | # [var for var in tvars if ('Text2Mel/AudioDec/C_10') in var.name] + \ 147 | # [var for var in tvars if ('Text2Mel/AudioDec/C_11') in var.name] 148 | self.optimizer = tf.train.AdamOptimizer(learning_rate=self.lr) 149 | #self.optimizer = tf.train.AdamOptimizer(learning_rate=self.lr) 150 | tf.summary.scalar("lr", self.lr) 151 | 152 | ## gradient clipping 153 | self.gvs = self.optimizer.compute_gradients(self.loss, var_list=tvars_new) 154 | self.clipped = [] 155 | for grad, var in self.gvs: 156 | grad = tf.clip_by_value(grad, -1., 1.) 157 | self.clipped.append((grad, var)) 158 | self.train_op = self.optimizer.apply_gradients(self.clipped, global_step=self.global_step) 159 | 160 | # Summary 161 | self.merged = tf.summary.merge_all() 162 | 163 | 164 | if __name__ == '__main__': 165 | # argument: 1 or 2. 1 for Text2Mel, 2 for SSRN. 166 | num = int(sys.argv[1]) 167 | 168 | g = Graph(num=num); print("Training Graph loaded") 169 | 170 | logdir = hp.logdir + "-" + str(num) 171 | sv = tf.train.Supervisor(logdir=logdir, save_model_secs=0, global_step=g.global_step) 172 | #with sv.managed_session() as sess: 173 | with sv.managed_session(config = tf.ConfigProto(allow_soft_placement=True)) as sess: 174 | sv.saver.restore(sess, tf.train.latest_checkpoint(hp.restoredir + "-" + str(num))) 175 | while 1: 176 | for _ in tqdm(range(g.num_batch), total=g.num_batch, ncols=70, leave=False, unit='b'): 177 | gs, _ = sess.run([g.global_step, g.train_op]) 178 | 179 | # Write checkpoint files at every 1k steps 180 | if gs % 1000 == 0: 181 | sv.saver.save(sess, logdir + '/model_gs_{}'.format(str(gs // 1000).zfill(3) + "k")) 182 | 183 | if num==1: 184 | # plot alignment 185 | alignments = sess.run(g.alignments) 186 | plot_alignment(alignments[0], str(gs // 1000).zfill(3) + "k", logdir) 187 | 188 | # break 189 | if gs > hp.num_iterations: break 190 | 191 | print("Done") 192 | -------------------------------------------------------------------------------- /modules.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python2 3 | ''' 4 | By kyubyong park. kbpark.linguist@gmail.com. 5 | https://www.github.com/kyubyong/dc_tts 6 | ''' 7 | 8 | from __future__ import print_function, division 9 | 10 | import tensorflow as tf 11 | 12 | 13 | def embed(inputs, vocab_size, num_units, zero_pad=True, scope="embedding", reuse=None): 14 | '''Embeds a given tensor. 15 | 16 | Args: 17 | inputs: A `Tensor` with type `int32` or `int64` containing the ids 18 | to be looked up in `lookup table`. 19 | vocab_size: An int. Vocabulary size. 20 | num_units: An int. Number of embedding hidden units. 21 | zero_pad: A boolean. If True, all the values of the first row (id 0) 22 | should be constant zeros. 23 | scope: Optional scope for `variable_scope`. 24 | reuse: Boolean, whether to reuse the weights of a previous layer 25 | by the same name. 26 | 27 | Returns: 28 | A `Tensor` with one more rank than `inputs`. The last dimensionality 29 | should be `num_units`. 
30 | ''' 31 | with tf.variable_scope(scope, reuse=reuse): 32 | lookup_table = tf.get_variable('lookup_table', 33 | dtype=tf.float32, 34 | shape=[vocab_size, num_units], 35 | initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.1)) 36 | if zero_pad: 37 | lookup_table = tf.concat((tf.zeros(shape=[1, num_units]), 38 | lookup_table[1:, :]), 0) 39 | 40 | outputs = tf.nn.embedding_lookup(lookup_table, inputs) 41 | 42 | return outputs 43 | 44 | 45 | def normalize(inputs, 46 | scope="normalize", 47 | reuse=None): 48 | '''Applies layer normalization that normalizes along the last axis. 49 | 50 | Args: 51 | inputs: A tensor with 2 or more dimensions, where the first dimension has 52 | `batch_size`. The normalization is over the last dimension. 53 | scope: Optional scope for `variable_scope`. 54 | reuse: Boolean, whether to reuse the weights of a previous layer 55 | by the same name. 56 | 57 | Returns: 58 | A tensor with the same shape and data dtype as `inputs`. 59 | ''' 60 | outputs = tf.contrib.layers.layer_norm(inputs, 61 | begin_norm_axis=-1, 62 | scope=scope, 63 | reuse=reuse) 64 | return outputs 65 | 66 | 67 | def highwaynet(inputs, num_units=None, scope="highwaynet", reuse=None): 68 | '''Highway networks, see https://arxiv.org/abs/1505.00387 69 | 70 | Args: 71 | inputs: A 3D tensor of shape [N, T, W]. 72 | num_units: An int or `None`. Specifies the number of units in the highway layer 73 | or uses the input size if `None`. 74 | scope: Optional scope for `variable_scope`. 75 | reuse: Boolean, whether to reuse the weights of a previous layer 76 | by the same name. 77 | 78 | Returns: 79 | A 3D tensor of shape [N, T, W]. 80 | ''' 81 | if not num_units: 82 | num_units = inputs.get_shape()[-1] 83 | 84 | with tf.variable_scope(scope, reuse=reuse): 85 | H = tf.layers.dense(inputs, units=num_units, activation=tf.nn.relu, name="dense1") 86 | T = tf.layers.dense(inputs, units=num_units, activation=tf.nn.sigmoid, 87 | bias_initializer=tf.constant_initializer(-1.0), name="dense2") 88 | outputs = H * T + inputs * (1. - T) 89 | return outputs 90 | 91 | def conv1d(inputs, 92 | filters=None, 93 | size=1, 94 | rate=1, 95 | padding="SAME", 96 | dropout_rate=0, 97 | use_bias=True, 98 | activation_fn=None, 99 | training=True, 100 | scope="conv1d", 101 | reuse=None): 102 | ''' 103 | Args: 104 | inputs: A 3-D tensor with shape of [batch, time, depth]. 105 | filters: An int. Number of outputs (=activation maps) 106 | size: An int. Filter size. 107 | rate: An int. Dilation rate. 108 | padding: Either `same` or `valid` or `causal` (case-insensitive). 109 | dropout_rate: A float of [0, 1]. 110 | use_bias: A boolean. 111 | activation_fn: An activation function or None. 112 | training: A boolean. If True, dropout is applied. 113 | scope: Optional scope for `variable_scope`. 114 | reuse: Boolean, whether to reuse the weights of a previous layer 115 | by the same name. 116 | 117 | Returns: 118 | A tensor of shape [batch, time, filters]. 
119 | ''' 120 | with tf.variable_scope(scope): 121 | if padding.lower() == "causal": 122 | # pre-padding for causality 123 | pad_len = (size - 1) * rate # padding size 124 | inputs = tf.pad(inputs, [[0, 0], [pad_len, 0], [0, 0]]) 125 | padding = "valid" 126 | 127 | if filters is None: 128 | filters = inputs.get_shape().as_list()[-1] 129 | 130 | params = {"inputs": inputs, "filters": filters, "kernel_size": size, 131 | "dilation_rate": rate, "padding": padding, "use_bias": use_bias, 132 | "kernel_initializer": tf.contrib.layers.variance_scaling_initializer(), "reuse": reuse} 133 | 134 | tensor = tf.layers.conv1d(**params) 135 | tensor = normalize(tensor) 136 | if activation_fn is not None: 137 | tensor = activation_fn(tensor) 138 | 139 | tensor = tf.layers.dropout(tensor, rate=dropout_rate, training=training) 140 | 141 | return tensor 142 | 143 | def hc(inputs, 144 | filters=None, 145 | size=1, 146 | rate=1, 147 | padding="SAME", 148 | dropout_rate=0, 149 | use_bias=True, 150 | activation_fn=None, 151 | training=True, 152 | scope="hc", 153 | reuse=None): 154 | ''' 155 | Args: 156 | inputs: A 3-D tensor with shape of [batch, time, depth]. 157 | filters: An int. Number of outputs (=activation maps) 158 | size: An int. Filter size. 159 | rate: An int. Dilation rate. 160 | padding: Either `same` or `valid` or `causal` (case-insensitive). 161 | use_bias: A boolean. 162 | activation_fn: An activation function or None. 163 | training: A boolean. If True, dropout is applied. 164 | scope: Optional scope for `variable_scope`. 165 | reuse: Boolean, whether to reuse the weights of a previous layer 166 | by the same name. 167 | 168 | Returns: 169 | A tensor of the same shape and dtype as `inputs`. 170 | ''' 171 | _inputs = inputs 172 | with tf.variable_scope(scope): 173 | if padding.lower() == "causal": 174 | # pre-padding for causality 175 | pad_len = (size - 1) * rate # padding size 176 | inputs = tf.pad(inputs, [[0, 0], [pad_len, 0], [0, 0]]) 177 | padding = "valid" 178 | 179 | if filters is None: 180 | filters = inputs.get_shape().as_list()[-1] 181 | 182 | 183 | params = {"inputs": inputs, "filters": 2*filters, "kernel_size": size, 184 | "dilation_rate": rate, "padding": padding, "use_bias": use_bias, 185 | "kernel_initializer": tf.contrib.layers.variance_scaling_initializer(), "reuse": reuse} 186 | 187 | tensor = tf.layers.conv1d(**params) 188 | H1, H2 = tf.split(tensor, 2, axis=-1) 189 | H1 = normalize(H1, scope="H1") 190 | H2 = normalize(H2, scope="H2") 191 | H1 = tf.nn.sigmoid(H1, "gate") 192 | H2 = activation_fn(H2, "info") if activation_fn is not None else H2 193 | tensor = H1*H2 + (1.-H1)*_inputs 194 | 195 | tensor = tf.layers.dropout(tensor, rate=dropout_rate, training=training) 196 | 197 | return tensor 198 | 199 | def conv1d_transpose(inputs, 200 | filters=None, 201 | size=3, 202 | stride=2, 203 | padding='same', 204 | dropout_rate=0, 205 | use_bias=True, 206 | activation=None, 207 | training=True, 208 | scope="conv1d_transpose", 209 | reuse=None): 210 | ''' 211 | Args: 212 | inputs: A 3-D tensor with shape of [batch, time, depth]. 213 | filters: An int. Number of outputs (=activation maps) 214 | size: An int. Filter size. 215 | stride: An int. Stride of the transposed convolution. 216 | padding: Either `same` or `valid` (case-insensitive). 217 | dropout_rate: A float of [0, 1]. 218 | use_bias: A boolean. 219 | activation: An activation function or None. 220 | training: A boolean. If True, dropout is applied. 221 | scope: Optional scope for `variable_scope`. 
222 | reuse: Boolean, whether to reuse the weights of a previous layer 223 | by the same name. 224 | 225 | Returns: 226 | A tensor of the shape with [batch, time*2, depth]. 227 | ''' 228 | with tf.variable_scope(scope, reuse=reuse): 229 | if filters is None: 230 | filters = inputs.get_shape().as_list()[-1] 231 | inputs = tf.expand_dims(inputs, 1) 232 | tensor = tf.layers.conv2d_transpose(inputs, 233 | filters=filters, 234 | kernel_size=(1, size), 235 | strides=(1, stride), 236 | padding=padding, 237 | activation=None, 238 | kernel_initializer=tf.contrib.layers.variance_scaling_initializer(), 239 | use_bias=use_bias) 240 | tensor = tf.squeeze(tensor, 1) 241 | tensor = normalize(tensor) 242 | if activation is not None: 243 | tensor = activation(tensor) 244 | 245 | tensor = tf.layers.dropout(tensor, rate=dropout_rate, training=training) 246 | 247 | return tensor 248 | 249 | 250 | 251 | 252 | 253 | -------------------------------------------------------------------------------- /networks.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python2 3 | ''' 4 | By kyubyong park. kbpark.linguist@gmail.com. 5 | https://www.github.com/kyubyong/dc_tts 6 | ''' 7 | 8 | from __future__ import print_function 9 | 10 | from hyperparams import Hyperparams as hp 11 | from modules import * 12 | import tensorflow as tf 13 | 14 | def TextEnc(L, training=True): 15 | ''' 16 | Args: 17 | L: Text inputs. (B, N) 18 | 19 | Return: 20 | K: Keys. (B, N, d) 21 | V: Values. (B, N, d) 22 | ''' 23 | i = 1 24 | tensor = embed(L, 25 | vocab_size=len(hp.vocab), 26 | num_units=hp.e, 27 | scope="embed_{}".format(i)); i += 1 28 | tensor = conv1d(tensor, 29 | filters=2*hp.d, 30 | size=1, 31 | rate=1, 32 | dropout_rate=hp.dropout_rate, 33 | activation_fn=tf.nn.relu, 34 | training=training, 35 | scope="C_{}".format(i)); i += 1 36 | tensor = conv1d(tensor, 37 | size=1, 38 | rate=1, 39 | dropout_rate=hp.dropout_rate, 40 | training=training, 41 | scope="C_{}".format(i)); i += 1 42 | 43 | for _ in range(2): 44 | for j in range(4): 45 | tensor = hc(tensor, 46 | size=3, 47 | rate=3**j, 48 | dropout_rate=hp.dropout_rate, 49 | activation_fn=None, 50 | training=training, 51 | scope="HC_{}".format(i)); i += 1 52 | for _ in range(2): 53 | tensor = hc(tensor, 54 | size=3, 55 | rate=1, 56 | dropout_rate=hp.dropout_rate, 57 | activation_fn=None, 58 | training=training, 59 | scope="HC_{}".format(i)); i += 1 60 | 61 | for _ in range(2): 62 | tensor = hc(tensor, 63 | size=1, 64 | rate=1, 65 | dropout_rate=hp.dropout_rate, 66 | activation_fn=None, 67 | training=training, 68 | scope="HC_{}".format(i)); i += 1 69 | 70 | K, V = tf.split(tensor, 2, -1) 71 | return K, V 72 | 73 | def AudioEnc(S, training=True): 74 | ''' 75 | Args: 76 | S: melspectrogram. (B, T/r, n_mels) 77 | 78 | Returns 79 | Q: Queries. 
(B, T/r, d) 80 | ''' 81 | i = 1 82 | tensor = conv1d(S, 83 | filters=hp.d, 84 | size=1, 85 | rate=1, 86 | padding="CAUSAL", 87 | dropout_rate=hp.dropout_rate, 88 | activation_fn=tf.nn.relu, 89 | training=training, 90 | scope="C_{}".format(i)); i += 1 91 | tensor = conv1d(tensor, 92 | size=1, 93 | rate=1, 94 | padding="CAUSAL", 95 | dropout_rate=hp.dropout_rate, 96 | activation_fn=tf.nn.relu, 97 | training=training, 98 | scope="C_{}".format(i)); i += 1 99 | tensor = conv1d(tensor, 100 | size=1, 101 | rate=1, 102 | padding="CAUSAL", 103 | dropout_rate=hp.dropout_rate, 104 | training=training, 105 | scope="C_{}".format(i)); i += 1 106 | for _ in range(2): 107 | for j in range(4): 108 | tensor = hc(tensor, 109 | size=3, 110 | rate=3**j, 111 | padding="CAUSAL", 112 | dropout_rate=hp.dropout_rate, 113 | training=training, 114 | scope="HC_{}".format(i)); i += 1 115 | for _ in range(2): 116 | tensor = hc(tensor, 117 | size=3, 118 | rate=3, 119 | padding="CAUSAL", 120 | dropout_rate=hp.dropout_rate, 121 | training=training, 122 | scope="HC_{}".format(i)); i += 1 123 | 124 | return tensor 125 | 126 | def Attention(Q, K, V, mononotic_attention=False, prev_max_attentions=None): 127 | ''' 128 | Args: 129 | Q: Queries. (B, T/r, d) 130 | K: Keys. (B, N, d) 131 | V: Values. (B, N, d) 132 | mononotic_attention: A boolean. At training, it is False. 133 | prev_max_attentions: (B,). At training, it is set to None. 134 | 135 | Returns: 136 | R: [Context Vectors; Q]. (B, T/r, 2d) 137 | alignments: (B, N, T/r) 138 | max_attentions: (B, T/r) 139 | ''' 140 | A = tf.matmul(Q, K, transpose_b=True) * tf.rsqrt(tf.to_float(hp.d)) 141 | if mononotic_attention: # for inference 142 | key_masks = tf.sequence_mask(prev_max_attentions, hp.max_N) 143 | reverse_masks = tf.sequence_mask(hp.max_N - hp.attention_win_size - prev_max_attentions, hp.max_N)[:, ::-1] 144 | masks = tf.logical_or(key_masks, reverse_masks) 145 | masks = tf.tile(tf.expand_dims(masks, 1), [1, hp.max_T, 1]) 146 | paddings = tf.ones_like(A) * (-2 ** 32 + 1) # (B, T/r, N) 147 | A = tf.where(tf.equal(masks, False), A, paddings) 148 | A = tf.nn.softmax(A) # (B, T/r, N) 149 | max_attentions = tf.argmax(A, -1) # (B, T/r) 150 | R = tf.matmul(A, V) 151 | R = tf.concat((R, Q), -1) 152 | 153 | alignments = tf.transpose(A, [0, 2, 1]) # (B, N, T/r) 154 | 155 | return R, alignments, max_attentions 156 | 157 | def AudioDec(R, training=True): 158 | ''' 159 | Args: 160 | R: [Context Vectors; Q]. (B, T/r, 2d) 161 | 162 | Returns: 163 | Y: Melspectrogram predictions. 
(B, T/r, n_mels) 164 | ''' 165 | 166 | i = 1 167 | tensor = conv1d(R, 168 | filters=hp.d, 169 | size=1, 170 | rate=1, 171 | padding="CAUSAL", 172 | dropout_rate=hp.dropout_rate, 173 | training=training, 174 | scope="C_{}".format(i)); i += 1 175 | for j in range(4): 176 | tensor = hc(tensor, 177 | size=3, 178 | rate=3**j, 179 | padding="CAUSAL", 180 | dropout_rate=hp.dropout_rate, 181 | training=training, 182 | scope="HC_{}".format(i)); i += 1 183 | 184 | for _ in range(2): 185 | tensor = hc(tensor, 186 | size=3, 187 | rate=1, 188 | padding="CAUSAL", 189 | dropout_rate=hp.dropout_rate, 190 | training=training, 191 | scope="HC_{}".format(i)); i += 1 192 | for _ in range(3): 193 | tensor = conv1d(tensor, 194 | size=1, 195 | rate=1, 196 | padding="CAUSAL", 197 | dropout_rate=hp.dropout_rate, 198 | activation_fn=tf.nn.relu, 199 | training=training, 200 | scope="C_{}".format(i)); i += 1 201 | # mel_hats 202 | logits = conv1d(tensor, 203 | filters=hp.n_mels, 204 | size=1, 205 | rate=1, 206 | padding="CAUSAL", 207 | dropout_rate=hp.dropout_rate, 208 | training=training, 209 | scope="C_{}".format(i)); i += 1 210 | Y = tf.nn.sigmoid(logits) # mel_hats 211 | 212 | return logits, Y 213 | 214 | def SSRN(Y, training=True): 215 | ''' 216 | Args: 217 | Y: Melspectrogram Predictions. (B, T/r, n_mels) 218 | 219 | Returns: 220 | Z: Spectrogram Predictions. (B, T, 1+n_fft/2) 221 | ''' 222 | 223 | i = 1 # number of layers 224 | 225 | # -> (B, T/r, c) 226 | tensor = conv1d(Y, 227 | filters=hp.c, 228 | size=1, 229 | rate=1, 230 | dropout_rate=hp.dropout_rate, 231 | training=training, 232 | scope="C_{}".format(i)); i += 1 233 | for j in range(2): 234 | tensor = hc(tensor, 235 | size=3, 236 | rate=3**j, 237 | dropout_rate=hp.dropout_rate, 238 | training=training, 239 | scope="HC_{}".format(i)); i += 1 240 | for _ in range(2): 241 | # -> (B, T/2, c) -> (B, T, c) 242 | tensor = conv1d_transpose(tensor, 243 | scope="D_{}".format(i), 244 | dropout_rate=hp.dropout_rate, 245 | training=training,); i += 1 246 | for j in range(2): 247 | tensor = hc(tensor, 248 | size=3, 249 | rate=3**j, 250 | dropout_rate=hp.dropout_rate, 251 | training=training, 252 | scope="HC_{}".format(i)); i += 1 253 | # -> (B, T, 2*c) 254 | tensor = conv1d(tensor, 255 | filters=2*hp.c, 256 | size=1, 257 | rate=1, 258 | dropout_rate=hp.dropout_rate, 259 | training=training, 260 | scope="C_{}".format(i)); i += 1 261 | for _ in range(2): 262 | tensor = hc(tensor, 263 | size=3, 264 | rate=1, 265 | dropout_rate=hp.dropout_rate, 266 | training=training, 267 | scope="HC_{}".format(i)); i += 1 268 | # -> (B, T, 1+n_fft/2) 269 | tensor = conv1d(tensor, 270 | filters=1+hp.n_fft//2, 271 | size=1, 272 | rate=1, 273 | dropout_rate=hp.dropout_rate, 274 | training=training, 275 | scope="C_{}".format(i)); i += 1 276 | 277 | for _ in range(2): 278 | tensor = conv1d(tensor, 279 | size=1, 280 | rate=1, 281 | dropout_rate=hp.dropout_rate, 282 | activation_fn=tf.nn.relu, 283 | training=training, 284 | scope="C_{}".format(i)); i += 1 285 | logits = conv1d(tensor, 286 | size=1, 287 | rate=1, 288 | dropout_rate=hp.dropout_rate, 289 | training=training, 290 | scope="C_{}".format(i)) 291 | Z = tf.nn.sigmoid(logits) 292 | return logits, Z 293 | --------------------------------------------------------------------------------