├── tvars_ssrn.csv ├── prepo.py ├── tvars_text2mel.csv ├── synthesize.py ├── README.md ├── hyperparams.py ├── utils.py ├── data_load.py ├── synth_dctts.ipynb ├── train_transfer.py ├── modules.py └── networks.py /tvars_ssrn.csv: -------------------------------------------------------------------------------- 1 | SSRN/C_1/ 2 | SSRN/HC_2/ 3 | SSRN/HC_3/ 4 | SSRN/D_4/ 5 | SSRN/HC_5/ 6 | SSRN/HC_6/ 7 | SSRN/D_7/ 8 | SSRN/HC_8/ 9 | SSRN/HC_9/ 10 | SSRN/C_10/ 11 | SSRN/HC_11/ 12 | SSRN/HC_12/ 13 | SSRN/C_13/ 14 | SSRN/C_14/ 15 | SSRN/C_15/ 16 | SSRN/C_16/ 17 | -------------------------------------------------------------------------------- /prepo.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python2 3 | ''' 4 | By kyubyong park. kbpark.linguist@gmail.com. 5 | https://www.github.com/kyubyong/dc_tts 6 | ''' 7 | 8 | from __future__ import print_function 9 | 10 | from utils import load_spectrograms 11 | import os 12 | from data_load import load_data 13 | import numpy as np 14 | import tqdm 15 | 16 | # Load data 17 | fpaths, _, _ = load_data() # list 18 | 19 | for fpath in tqdm.tqdm(fpaths): 20 | fname, mel, mag = load_spectrograms(fpath) 21 | if not os.path.exists("/output/mels"): os.mkdir("/output/mels") 22 | if not os.path.exists("/output/mags"): os.mkdir("/output/mags") 23 | 24 | np.save("/output/mels/{}".format(fname.replace("wav", "npy")), mel) 25 | np.save("/output/mags/{}".format(fname.replace("wav", "npy")), mag) -------------------------------------------------------------------------------- /tvars_text2mel.csv: -------------------------------------------------------------------------------- 1 | Text2Mel/TextEnc/embed_1/ 2 | Text2Mel/TextEnc/C_2/ 3 | Text2Mel/TextEnc/C_3/ 4 | Text2Mel/TextEnc/HC_4/ 5 | Text2Mel/TextEnc/HC_5/ 6 | Text2Mel/TextEnc/HC_6/ 7 | Text2Mel/TextEnc/HC_7/ 8 | Text2Mel/TextEnc/HC_8/ 9 | Text2Mel/TextEnc/HC_9/ 10 | Text2Mel/TextEnc/HC_10/ 11 | Text2Mel/TextEnc/HC_11/ 12 | Text2Mel/TextEnc/HC_12/ 13 | Text2Mel/TextEnc/HC_13/ 14 | Text2Mel/TextEnc/HC_14/ 15 | Text2Mel/TextEnc/HC_15/ 16 | Text2Mel/AudioEnc/C_1/ 17 | Text2Mel/AudioEnc/C_2/ 18 | Text2Mel/AudioEnc/C_3/ 19 | Text2Mel/AudioEnc/HC_4/ 20 | Text2Mel/AudioEnc/HC_5/ 21 | Text2Mel/AudioEnc/HC_6/ 22 | Text2Mel/AudioEnc/HC_7/ 23 | Text2Mel/AudioEnc/HC_8/ 24 | Text2Mel/AudioEnc/HC_9/ 25 | Text2Mel/AudioEnc/HC_10/ 26 | Text2Mel/AudioEnc/HC_11/ 27 | Text2Mel/AudioEnc/HC_12/ 28 | Text2Mel/AudioEnc/HC_13/ 29 | Text2Mel/AudioDec/C_1/ 30 | Text2Mel/AudioDec/HC_2/ 31 | Text2Mel/AudioDec/HC_3/ 32 | Text2Mel/AudioDec/HC_4/ 33 | Text2Mel/AudioDec/HC_5/ 34 | Text2Mel/AudioDec/HC_6/ 35 | Text2Mel/AudioDec/HC_7/ 36 | Text2Mel/AudioDec/C_8/ 37 | Text2Mel/AudioDec/C_9/ 38 | Text2Mel/AudioDec/C_10/ 39 | Text2Mel/AudioDec/C_11/ 40 | -------------------------------------------------------------------------------- /synthesize.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # /usr/bin/python2 3 | ''' 4 | By kyubyong park. kbpark.linguist@gmail.com. 5 | https://www.github.com/kyubyong/dc_tts 6 | 7 | Modified by sean leary. 
learysean1@hotmail.com 8 | https://github.com/SeanPLeary/dc_tts-transfer-learning 9 | ''' 10 | 11 | from __future__ import print_function 12 | 13 | import os 14 | 15 | from hyperparams import Hyperparams as hp 16 | import numpy as np 17 | import tensorflow as tf 18 | from train_transfer import Graph 19 | from utils import * 20 | from data_load import load_data 21 | from scipy.io.wavfile import write 22 | from tqdm import tqdm 23 | 24 | def synthesize(): 25 | # Load data 26 | L = load_data("synthesize") 27 | 28 | # Load graph 29 | g = Graph(mode="synthesize"); print("Graph loaded") 30 | 31 | with tf.Session() as sess: 32 | sess.run(tf.global_variables_initializer()) 33 | 34 | # Restore parameters 35 | var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'Text2Mel') 36 | saver1 = tf.train.Saver(var_list=var_list) 37 | saver1.restore(sess, tf.train.latest_checkpoint(hp.restoredir + "-1")) 38 | print("Text2Mel Restored!") 39 | 40 | var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'SSRN') + \ 41 | tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 'gs') 42 | saver2 = tf.train.Saver(var_list=var_list) 43 | saver2.restore(sess, tf.train.latest_checkpoint(hp.restoredir + "-2")) 44 | print("SSRN Restored!") 45 | 46 | # Feed Forward 47 | ## mel 48 | Y = np.zeros((len(L), hp.max_T, hp.n_mels), np.float32) 49 | prev_max_attentions = np.zeros((len(L),), np.int32) 50 | for j in tqdm(range(hp.max_T)): 51 | _gs, _Y, _max_attentions, _alignments = \ 52 | sess.run([g.global_step, g.Y, g.max_attentions, g.alignments], 53 | {g.L: L, 54 | g.mels: Y, 55 | g.prev_max_attentions: prev_max_attentions}) 56 | Y[:, j, :] = _Y[:, j, :] 57 | prev_max_attentions = _max_attentions[:, j] 58 | 59 | # Get magnitude 60 | Z = sess.run(g.Z, {g.Y: Y}) 61 | 62 | # Generate wav files 63 | if not os.path.exists(hp.sampledir): os.makedirs(hp.sampledir) 64 | for i, mag in enumerate(Z): 65 | print("Working on file", i+1) 66 | wav = spectrogram2wav(mag) 67 | write(hp.sampledir + "/{}.wav".format(i+1), hp.sr, wav) 68 | 69 | if __name__ == '__main__': 70 | synthesize() 71 | print("Done") 72 | 73 | 74 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # dc_tts-transfer-learning 2 | 3 | This repo contains attempts to apply transfer learning to the dc_tts text-to-speech model described in the paper [Efficiently Trainable Text-to-Speech System Based on Deep Convolutional Networks with Guided Attention](https://arxiv.org/abs/1710.08969). The code used is a modified version of [Kyubyong's dc_tts code](https://github.com/Kyubyong/dc_tts). The [pretrained model](https://www.dropbox.com/s/1oyipstjxh2n5wo/LJ_logdir.tar?dl=0) was also provided in Kyubyong's repo. It was pretrained on the [LJ Speech Dataset](https://keithito.com/LJ-Speech-Dataset/). A Scarlett Johansson voice model was then trained via transfer learning. 4 | 5 | --- 6 | Transfer learning is accomplished by selecting the model layers to train in `hyperparams.py`; the sketch below shows the mechanism.
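In `train_transfer.py` each entry of `hp.selected_tvars` is treated as a variable-scope prefix: a variable is trained only if its name contains one of the listed prefixes, so every layer removed from the list stays frozen at its pre-trained LJSpeech value. A minimal sketch of that selection step (mirroring the repo's own code; `loss` and `optimizer` come from the training graph):

```python
import tensorflow as tf
from hyperparams import Hyperparams as hp

# After the graph is built, keep only the variables whose scope
# name contains one of the selected prefixes.
tvars = tf.trainable_variables()
tvars_new = [var for prefix in hp.selected_tvars
             for var in tvars if prefix in var.name]

# Gradients are then computed for the selected variables only:
# gvs = optimizer.compute_gradients(loss, var_list=tvars_new)
```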
7 | 8 | --- 9 | 10 | Task List: 11 | - [x] add selectable list of layers for transfer learning 12 | - [x] prelim model training 13 | - [ ] add scoring history plots 14 | - [ ] detailed exploration of which layers to train 15 | - [ ] explore data augmentation methods 16 | - [ ] explore post-processing 17 | 18 | ## Prelim Model Training 19 | - ~6 hrs of training on a Tesla V100 GPU 20 | - Layers trained (training commands sketched below): 21 | - SSRN(C_13, C_14, C_15, C_16) 22 | - Text2Mel/TextEnc(HC_11, HC_12, HC_13, HC_14, HC_15) 23 | - Text2Mel/AudioEnc(HC_9, HC_10, HC_11, HC_12, HC_13) 24 | - Text2Mel/AudioDec(HC_7, C_8, C_9, C_10, C_11)
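The run itself follows the usual two-stage dc_tts recipe; roughly (script names from this repo, data and output paths as configured in `hyperparams.py`, `prepro = True` assumed):

- `python prepo.py`: cache mel/mag spectrograms to disk
- `python train_transfer.py 1`: fine-tune the selected Text2Mel layers
- `python train_transfer.py 2`: fine-tune the selected SSRN layers
- `python synthesize.py`: synthesize the sentences in `test_sentences.txt` into `hp.sampledir`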
25 | 26 | ## Transfer learning data source: 27 | 28 | 29 | Scarlett Johansson's [audio book](https://www.audible.com/pd/The-Dive-from-Clausens-Pier-Audiobook/B002V0KPWK?qid=1551367970&sr=1-1&ref=a_search_c3_lProduct_1_1&pf_rd_p=e81b7c27-6880-467a-b5a7-13cef5d729fe&pf_rd_r=J8MM430KH9YH8AF9JZ81&) 30 | 31 | 32 | ## Model Generated Examples (parodies of famous quotes from A.I. in movies): 33 | - [Greetings Professor Falken Shall We Play A Game](https://soundcloud.com/seanleary/greetings-professor-falken-shall-we-play-a-game) 34 | - [I'm Sorry Dave I'm Afraid I Can't Do That](https://soundcloud.com/seanleary/im-sorry-dave-im-afraid-i-cant-do-that) 35 | - [I Do Not Stand By In The Presence Of Evil](https://soundcloud.com/seanleary/i-do-not-stand-by-in-the-presence-of-evil) 36 | - [The Most Versatile Substance On The Planet And They Used It To Make A Frisbee](https://soundcloud.com/seanleary/the-most-versatile-substance-on-the-planet-and-they-used-it-to-make-a-frisbee) 37 | - [The First Ten Million Years Were The Worst And The Second Ten Million Years They Were The Worst Too](https://soundcloud.com/seanleary/the-first-ten-million-years-were-the-worst-and-the-second-ten-million-years-they-were-the-worst-too) 38 | - [I Honestly Think You Ought To Sit Down Calmly Take A Stress Pill And Think Things Over](https://soundcloud.com/seanleary/i-honestly-think-you-ought-to-sit-down-calmly-take-a-stress-pill-and-think-things-over) 39 | - [A Strange Game The Only Winning Move Is Not To Play](https://soundcloud.com/seanleary/a-strange-game-the-only-winning-move-is-not-to-play) 40 | - [The Game Has Changed Son Of Flynn](https://soundcloud.com/seanleary/the-game-has-changed-son-of-flynn) 41 | - [Greetings Programs](https://soundcloud.com/seanleary/greetings-programs) 42 | - [You Shouldn't Have Come Back Flynn](https://soundcloud.com/seanleary/you-shouldnt-have-come-back-flynn) 43 | 44 | 45 | 46 | 47 | 48 | references: 49 | - [Efficiently Trainable Text-to-Speech System Based on Deep Convolutional Networks with Guided Attention](https://arxiv.org/abs/1710.08969) 50 | - [Kyubyong's dc_tts repo](https://github.com/Kyubyong/dc_tts) 51 | - [Exploring Transfer Learning for Low Resource Emotional TTS](https://www.researchgate.net/publication/330382963_Exploring_Transfer_Learning_for_Low_Resource_Emotional_TTS) 52 | 53 | -------------------------------------------------------------------------------- /hyperparams.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python2 3 | ''' 4 | By kyubyong park. kbpark.linguist@gmail.com. 5 | https://www.github.com/kyubyong/dc_tts 6 | 7 | Modified by sean leary. learysean1@hotmail.com 8 | https://github.com/SeanPLeary/dc_tts-transfer-learning 9 | ''' 10 | class Hyperparams: 11 | '''Hyperparameters''' 12 | # pipeline 13 | prepro = True # if True, run `python prepo.py` first before running `python train_transfer.py`. 14 | 15 | # signal processing 16 | sr = 22050 # Sampling rate. 17 | n_fft = 2048 # fft points (samples) 18 | frame_shift = 0.0125 # seconds 19 | frame_length = 0.05 # seconds 20 | hop_length = int(sr * frame_shift) # samples. =276. 21 | win_length = int(sr * frame_length) # samples. =1102. 22 | n_mels = 80 # Number of Mel banks to generate 23 | power = 1.5 # Exponent for amplifying the predicted magnitude 24 | n_iter = 50 # Number of inversion iterations 25 | preemphasis = 0.97 26 | max_db = 100 27 | ref_db = 20 28 | 29 | # Model 30 | r = 4 # Reduction factor. Do not change this. 31 | dropout_rate = 0.05 32 | e = 128 # == embedding 33 | d = 256 # == hidden units of Text2Mel 34 | c = 512 # == hidden units of SSRN 35 | attention_win_size = 3 36 | 37 | # data 38 | #data = "/data/private/voice/LJSpeech-1.0" 39 | data = "/data/private/voice/scarlett" 40 | test_data = 'test_sentences.txt' 41 | vocab = "PE abcdefghijklmnopqrstuvwxyz'.?" # P: Padding, E: EOS. 42 | max_N = 180 # Maximum number of characters. 43 | max_T = 210 # Maximum number of mel frames. 44 | 45 | # training scheme 46 | lr = 0.001 # Initial learning rate. 47 | logdir = "/output/logdir/scarjo" 48 | #restoredir = "/output2/logdir/scarjo" 49 | restoredir = "/output2/logdir/LJ01" # location of pre-trained LJSpeech-1.0 model w/ checkpoint 50 | sampledir = '/output/samples' 51 | B = 32 52 | num_iterations = 2000000 53 | 54 | # select the trainable layers for transfer learning (i.e. remove the layers you want to keep frozen during transfer learning) 55 | selected_tvars = [ 56 | 'SSRN/C_1/', 57 | 'SSRN/HC_2/', 58 | 'SSRN/HC_3/', 59 | 'SSRN/D_4/', 60 | 'SSRN/HC_5/', 61 | 'SSRN/HC_6/', 62 | 'SSRN/D_7/', 63 | 'SSRN/HC_8/', 64 | 'SSRN/HC_9/', 65 | 'SSRN/C_10/', 66 | 'SSRN/HC_11/', 67 | 'SSRN/HC_12/', 68 | 'SSRN/C_13/', 69 | 'SSRN/C_14/', 70 | 'SSRN/C_15/', 71 | 'SSRN/C_16/', 72 | 'Text2Mel/TextEnc/embed_1/', 73 | 'Text2Mel/TextEnc/C_2/', 74 | 'Text2Mel/TextEnc/C_3/', 75 | 'Text2Mel/TextEnc/HC_4/', 76 | 'Text2Mel/TextEnc/HC_5/', 77 | 'Text2Mel/TextEnc/HC_6/', 78 | 'Text2Mel/TextEnc/HC_7/', 79 | 'Text2Mel/TextEnc/HC_8/', 80 | 'Text2Mel/TextEnc/HC_9/', 81 | 'Text2Mel/TextEnc/HC_10/', 82 | 'Text2Mel/TextEnc/HC_11/', 83 | 'Text2Mel/TextEnc/HC_12/', 84 | 'Text2Mel/TextEnc/HC_13/', 85 | 'Text2Mel/TextEnc/HC_14/', 86 | 'Text2Mel/TextEnc/HC_15/', 87 | 'Text2Mel/AudioEnc/C_1/', 88 | 'Text2Mel/AudioEnc/C_2/', 89 | 'Text2Mel/AudioEnc/C_3/', 90 | 'Text2Mel/AudioEnc/HC_4/', 91 | 'Text2Mel/AudioEnc/HC_5/', 92 | 'Text2Mel/AudioEnc/HC_6/', 93 | 'Text2Mel/AudioEnc/HC_7/', 94 | 'Text2Mel/AudioEnc/HC_8/', 95 | 'Text2Mel/AudioEnc/HC_9/', 96 | 'Text2Mel/AudioEnc/HC_10/', 97 | 'Text2Mel/AudioEnc/HC_11/', 98 | 'Text2Mel/AudioEnc/HC_12/', 99 | 'Text2Mel/AudioEnc/HC_13/', 100 | 'Text2Mel/AudioDec/C_1/', 101 | 'Text2Mel/AudioDec/HC_2/', 102 | 'Text2Mel/AudioDec/HC_3/', 103 | 'Text2Mel/AudioDec/HC_4/', 104 | 'Text2Mel/AudioDec/HC_5/', 105 | 'Text2Mel/AudioDec/HC_6/', 106 | 'Text2Mel/AudioDec/HC_7/', 107 | 'Text2Mel/AudioDec/C_8/', 108 | 'Text2Mel/AudioDec/C_9/', 109 | 'Text2Mel/AudioDec/C_10/', 110 | 'Text2Mel/AudioDec/C_11/' 111 | ] -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 
2 | #/usr/bin/python2 3 | ''' 4 | By kyubyong park. kbpark.linguist@gmail.com. 5 | https://www.github.com/kyubyong/dc_tts 6 | ''' 7 | from __future__ import print_function, division 8 | 9 | import numpy as np 10 | import librosa 11 | import os, copy 12 | import matplotlib 13 | matplotlib.use('pdf') 14 | import matplotlib.pyplot as plt 15 | from scipy import signal 16 | 17 | from hyperparams import Hyperparams as hp 18 | import tensorflow as tf 19 | 20 | def get_spectrograms(fpath): 21 | '''Parses the wave file in `fpath` and 22 | returns a normalized mel spectrogram and linear spectrogram. 23 | 24 | Args: 25 | fpath: A string. The full path of a sound file. 26 | 27 | Returns: 28 | mel: A 2d array of shape (T, n_mels) and dtype of float32. 29 | mag: A 2d array of shape (T, 1+n_fft/2) and dtype of float32. 30 | ''' 31 | # Loading sound file 32 | y, sr = librosa.load(fpath, sr=hp.sr) 33 | 34 | # Trimming 35 | y, _ = librosa.effects.trim(y) 36 | 37 | # Preemphasis 38 | y = np.append(y[0], y[1:] - hp.preemphasis * y[:-1]) 39 | 40 | # stft 41 | linear = librosa.stft(y=y, 42 | n_fft=hp.n_fft, 43 | hop_length=hp.hop_length, 44 | win_length=hp.win_length) 45 | 46 | # magnitude spectrogram 47 | mag = np.abs(linear) # (1+n_fft//2, T) 48 | 49 | # mel spectrogram 50 | mel_basis = librosa.filters.mel(hp.sr, hp.n_fft, hp.n_mels) # (n_mels, 1+n_fft//2) 51 | mel = np.dot(mel_basis, mag) # (n_mels, t) 52 | 53 | # to decibel 54 | mel = 20 * np.log10(np.maximum(1e-5, mel)) 55 | mag = 20 * np.log10(np.maximum(1e-5, mag)) 56 | 57 | # normalize 58 | mel = np.clip((mel - hp.ref_db + hp.max_db) / hp.max_db, 1e-8, 1) 59 | mag = np.clip((mag - hp.ref_db + hp.max_db) / hp.max_db, 1e-8, 1) 60 | 61 | # Transpose 62 | mel = mel.T.astype(np.float32) # (T, n_mels) 63 | mag = mag.T.astype(np.float32) # (T, 1+n_fft//2) 64 | 65 | return mel, mag 66 | 67 | def spectrogram2wav(mag): 68 | '''Generates a wave file from a linear magnitude spectrogram. 69 | 70 | Args: 71 | mag: A numpy array of (T, 1+n_fft//2) 72 | 73 | Returns: 74 | wav: A 1-D numpy array. 75 | ''' 76 | # transpose 77 | mag = mag.T 78 | 79 | # de-normalize 80 | mag = (np.clip(mag, 0, 1) * hp.max_db) - hp.max_db + hp.ref_db 81 | 82 | # to amplitude 83 | mag = np.power(10.0, mag * 0.05) 84 | 85 | # wav reconstruction 86 | wav = griffin_lim(mag**hp.power) 87 | 88 | # de-preemphasis 89 | wav = signal.lfilter([1], [1, -hp.preemphasis], wav) 90 | 91 | # trim 92 | wav, _ = librosa.effects.trim(wav) 93 | 94 | return wav.astype(np.float32) 95 | 96 | def griffin_lim(spectrogram): 97 | '''Applies the Griffin-Lim algorithm.''' 98 | X_best = copy.deepcopy(spectrogram) 99 | for i in range(hp.n_iter): 100 | X_t = invert_spectrogram(X_best) 101 | est = librosa.stft(X_t, hp.n_fft, hp.hop_length, win_length=hp.win_length) 102 | phase = est / np.maximum(1e-8, np.abs(est)) 103 | X_best = spectrogram * phase 104 | X_t = invert_spectrogram(X_best) 105 | y = np.real(X_t) 106 | 107 | return y 108 | 109 | def invert_spectrogram(spectrogram): 110 | '''Applies the inverse STFT. 111 | Args: 112 | spectrogram: [1+n_fft//2, t] 113 | ''' 114 | return librosa.istft(spectrogram, hp.hop_length, win_length=hp.win_length, window="hann") 115 | 116 | def plot_alignment(alignment, gs, dir=hp.logdir): 117 | """Plots the alignment. 118 | 119 | Args: 120 | alignment: A numpy array with shape of (encoder_steps, decoder_steps) 121 | gs: (int) global step. 122 | dir: Output path. 
123 | """ 124 | if not os.path.exists(dir): os.mkdir(dir) 125 | 126 | fig, ax = plt.subplots() 127 | im = ax.imshow(alignment) 128 | 129 | fig.colorbar(im) 130 | plt.title('{} Steps'.format(gs)) 131 | plt.savefig('{}/alignment_{}.png'.format(dir, gs), format='png') 132 | plt.close(fig) 133 | 134 | def guided_attention(g=0.2): 135 | '''Guided attention. Refer to page 3 on the paper.''' 136 | W = np.zeros((hp.max_N, hp.max_T), dtype=np.float32) 137 | for n_pos in range(W.shape[0]): 138 | for t_pos in range(W.shape[1]): 139 | W[n_pos, t_pos] = 1 - np.exp(-(t_pos / float(hp.max_T) - n_pos / float(hp.max_N)) ** 2 / (2 * g * g)) 140 | return W 141 | 142 | def learning_rate_decay(init_lr, global_step, warmup_steps = 4000.0): 143 | '''Noam scheme from tensor2tensor''' 144 | step = tf.to_float(global_step + 1) 145 | return init_lr * warmup_steps**0.5 * tf.minimum(step * warmup_steps**-1.5, step**-0.5) 146 | 147 | def load_spectrograms(fpath): 148 | '''Read the wave file in `fpath` 149 | and extracts spectrograms''' 150 | 151 | fname = os.path.basename(fpath) 152 | mel, mag = get_spectrograms(fpath) 153 | t = mel.shape[0] 154 | 155 | # Marginal padding for reduction shape sync. 156 | num_paddings = hp.r - (t % hp.r) if t % hp.r != 0 else 0 157 | mel = np.pad(mel, [[0, num_paddings], [0, 0]], mode="constant") 158 | mag = np.pad(mag, [[0, num_paddings], [0, 0]], mode="constant") 159 | 160 | # Reduction 161 | mel = mel[::hp.r, :] 162 | return fname, mel, mag 163 | 164 | -------------------------------------------------------------------------------- /data_load.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python2 3 | ''' 4 | By kyubyong park. kbpark.linguist@gmail.com. 5 | https://www.github.com/kyubyong/dc_tts 6 | 7 | Modified by sean leary. learysean1@hotmail.com 8 | https://github.com/SeanPLeary/dc_tts-transfer-learning 9 | ''' 10 | 11 | from __future__ import print_function 12 | 13 | from hyperparams import Hyperparams as hp 14 | import numpy as np 15 | import tensorflow as tf 16 | from utils import * 17 | import codecs 18 | import re 19 | import os 20 | import unicodedata 21 | 22 | def load_vocab(): 23 | char2idx = {char: idx for idx, char in enumerate(hp.vocab)} 24 | idx2char = {idx: char for idx, char in enumerate(hp.vocab)} 25 | return char2idx, idx2char 26 | 27 | def text_normalize(text): 28 | text = ''.join(char for char in unicodedata.normalize('NFD', text) 29 | if unicodedata.category(char) != 'Mn') # Strip accents 30 | 31 | text = text.lower() 32 | text = re.sub("[^{}]".format(hp.vocab), " ", text) 33 | text = re.sub("[ ]+", " ", text) 34 | return text 35 | 36 | def load_data(mode="train"): 37 | '''Loads data 38 | Args: 39 | mode: "train" or "synthesize". 
40 | ''' 41 | # Load vocabulary 42 | char2idx, idx2char = load_vocab() 43 | 44 | if mode=="train": 45 | if "LJ" in hp.data: 46 | # Parse 47 | fpaths, text_lengths, texts = [], [], [] 48 | transcript = os.path.join(hp.data, 'transcript.csv') 49 | lines = codecs.open(transcript, 'r', 'utf-8').readlines() 50 | for line in lines: 51 | fname, _, text = line.strip().split("|") 52 | 53 | fpath = os.path.join(hp.data, "wavs", fname + ".wav") 54 | fpaths.append(fpath) 55 | 56 | text = text_normalize(text) + "E" # E: EOS 57 | text = [char2idx[char] for char in text] 58 | text_lengths.append(len(text)) 59 | texts.append(np.array(text, np.int32).tostring()) 60 | 61 | return fpaths, text_lengths, texts 62 | else: # nick or kate 63 | # Parse 64 | fpaths, text_lengths, texts = [], [], [] 65 | transcript = os.path.join(hp.data, 'transcript.csv') 66 | lines = codecs.open(transcript, 'r', 'utf-8').readlines() 67 | # for line in lines: 68 | # #fname, _, text, is_inside_quotes, duration = line.strip().split("|") 69 | # #duration = float(duration) 70 | # #if duration > 10. : continue 71 | # fname, text = line.strip().split("|") 72 | 73 | # fpath = os.path.join(hp.data, fname) 74 | # fpaths.append(fpath) 75 | 76 | # text += "E" # E: EOS 77 | # text = [char2idx[char] for char in text] 78 | # text_lengths.append(len(text)) 79 | # texts.append(np.array(text, np.int32).tostring()) 80 | for line in lines: 81 | #fname, _, text, is_inside_quotes, duration = line.strip().split("|") 82 | #duration = float(duration) 83 | #if duration > 10. : continue 84 | fname, text = line.strip().split("|") 85 | text = text.lower() 86 | text = text.replace('-', ' ') 87 | numbers = re.search('[0-9]+', text) 88 | test1 = re.search('&',text) 89 | 90 | if numbers is None and test1 is None: 91 | fpath = os.path.join(hp.data, fname) 92 | fpaths.append(fpath) 93 | 94 | text += "E" # E: EOS 95 | #text = text_normalize(text) + "E" # E: EOS 96 | text = [char2idx[char] for char in text] 97 | text_lengths.append(len(text)) 98 | texts.append(np.array(text, np.int32).tostring()) 99 | 100 | return fpaths, text_lengths, texts 101 | 102 | else: # synthesize on unseen test text. 
103 | # Parse 104 | lines = codecs.open(hp.test_data, 'r', 'utf-8').readlines()[1:] 105 | sents = [text_normalize(line.split(" ", 1)[-1]).strip() + "E" for line in lines] # text normalization, E: EOS 106 | texts = np.zeros((len(sents), hp.max_N), np.int32) 107 | for i, sent in enumerate(sents): 108 | texts[i, :len(sent)] = [char2idx[char] for char in sent] 109 | return texts 110 | 111 | def get_batch(): 112 | """Loads training data and puts them in queues""" 113 | with tf.device('/cpu:0'): 114 | # Load data 115 | fpaths, text_lengths, texts = load_data() # list 116 | maxlen, minlen = max(text_lengths), min(text_lengths) 117 | 118 | # Calc total batch count 119 | num_batch = len(fpaths) // hp.B 120 | 121 | # Create Queues 122 | fpath, text_length, text = tf.train.slice_input_producer([fpaths, text_lengths, texts], shuffle=True) 123 | 124 | # Parse 125 | text = tf.decode_raw(text, tf.int32) # (None,) 126 | 127 | if hp.prepro: 128 | def _load_spectrograms(fpath): 129 | fname = os.path.basename(fpath) 130 | #mel = "mels/{}".format(fname.replace("wav", "npy")) 131 | #mag = "mags/{}".format(fname.replace("wav", "npy")) 132 | mel = "/mels/{}".format(fname.decode("utf-8").replace("wav", "npy")) 133 | mag = "/mags/{}".format(fname.decode("utf-8").replace("wav", "npy")) 134 | return fname, np.load(mel), np.load(mag) 135 | 136 | fname, mel, mag = tf.py_func(_load_spectrograms, [fpath], [tf.string, tf.float32, tf.float32]) 137 | else: 138 | fname, mel, mag = tf.py_func(load_spectrograms, [fpath], [tf.string, tf.float32, tf.float32]) # (None, n_mels) 139 | 140 | # Add shape information 141 | fname.set_shape(()) 142 | text.set_shape((None,)) 143 | mel.set_shape((None, hp.n_mels)) 144 | mag.set_shape((None, hp.n_fft//2+1)) 145 | 146 | # Batching 147 | _, (texts, mels, mags, fnames) = tf.contrib.training.bucket_by_sequence_length( 148 | input_length=text_length, 149 | tensors=[text, mel, mag, fname], 150 | batch_size=hp.B, 151 | bucket_boundaries=[i for i in range(minlen + 1, maxlen - 1, 20)], 152 | num_threads=8, 153 | capacity=hp.B*4, 154 | dynamic_pad=True) 155 | 156 | return texts, mels, mags, fnames, num_batch 157 | 158 | -------------------------------------------------------------------------------- /synth_dctts.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Copy of synth_dctts.ipynb", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "collapsed_sections": [] 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | }, 15 | "accelerator": "TPU" 16 | }, 17 | "cells": [ 18 | { 19 | "metadata": { 20 | "id": "0lPC8qTVNmwv", 21 | "colab_type": "text" 22 | }, 23 | "cell_type": "markdown", 24 | "source": [ 25 | "# Synthesize speech from text\n", 26 | "- from a trained dc_tts model\n", 27 | "- code adapted from: https://github.com/Kyubyong/dc_tts" 28 | ] 29 | }, 30 | { 31 | "metadata": { 32 | "id": "e-cR95Zuoi0I", 33 | "colab_type": "text" 34 | }, 35 | "cell_type": "markdown", 36 | "source": [ 37 | "## Load dependencies" 38 | ] 39 | }, 40 | { 41 | "metadata": { 42 | "id": "oL_mIqoGLDcK", 43 | "colab_type": "code", 44 | "colab": {} 45 | }, 46 | "cell_type": "code", 47 | "source": [ 48 | "import numpy as np\n", 49 | "import tensorflow as tf\n", 50 | "from train_transfer import Graph\n", 51 | "from utils import *\n", 52 | "from data_load import load_data\n", 53 | "from scipy.io.wavfile import write, read\n", 54 | "from tqdm import tqdm\n", 55 | 
"import codecs\n", 56 | "import re\n", 57 | "import os\n", 58 | "import unicodedata\n", 59 | "from IPython.display import Audio\n", 60 | "import scipy.signal as sg" 61 | ], 62 | "execution_count": 0, 63 | "outputs": [] 64 | }, 65 | { 66 | "metadata": { 67 | "id": "8D5n_YHVtDSh", 68 | "colab_type": "text" 69 | }, 70 | "cell_type": "markdown", 71 | "source": [ 72 | "## Parameters" 73 | ] 74 | }, 75 | { 76 | "metadata": { 77 | "id": "p9sgsJNftLwX", 78 | "colab_type": "code", 79 | "colab": {} 80 | }, 81 | "cell_type": "code", 82 | "source": [ 83 | "hp_vocab = \"PE abcdefghijklmnopqrstuvwxyz'.?\" # P: Padding, E: EOS. \n", 84 | "hp_max_N = 180 # Maximum number of characters.\n", 85 | "hp_max_T = 210 # Maximum number of mel frames.\n", 86 | "hp_n_mels = 80 # Number of Mel banks to generate\n", 87 | "hp_restoredir = \"/content/gdrive/My Drive/dctts_colab/logdir/scarjo\"\n", 88 | "hp_sr = 22050\n" 89 | ], 90 | "execution_count": 0, 91 | "outputs": [] 92 | }, 93 | { 94 | "metadata": { 95 | "id": "7m7Vcl43N0hl", 96 | "colab_type": "text" 97 | }, 98 | "cell_type": "markdown", 99 | "source": [ 100 | "## Load models" 101 | ] 102 | }, 103 | { 104 | "metadata": { 105 | "id": "OXHDMJvWj1ZK", 106 | "colab_type": "code", 107 | "colab": {} 108 | }, 109 | "cell_type": "code", 110 | "source": [ 111 | " %%capture\n", 112 | " # Load graph\n", 113 | " g = Graph(mode=\"synthesize\")" 114 | ], 115 | "execution_count": 0, 116 | "outputs": [] 117 | }, 118 | { 119 | "metadata": { 120 | "id": "ySiENpbBMmd_", 121 | "colab_type": "text" 122 | }, 123 | "cell_type": "markdown", 124 | "source": [ 125 | "## Helper functions" 126 | ] 127 | }, 128 | { 129 | "metadata": { 130 | "id": "-aAZYciBpHxW", 131 | "colab_type": "code", 132 | "colab": {} 133 | }, 134 | "cell_type": "code", 135 | "source": [ 136 | "def load_vocab():\n", 137 | " char2idx = {char: idx for idx, char in enumerate(hp_vocab)}\n", 138 | " idx2char = {idx: char for idx, char in enumerate(hp_vocab)}\n", 139 | " return char2idx, idx2char\n", 140 | " \n", 141 | "def text_normalize(text):\n", 142 | " text = ''.join(char for char in unicodedata.normalize('NFD', text)\n", 143 | " if unicodedata.category(char) != 'Mn') # Strip accents\n", 144 | "\n", 145 | " text = text.lower()\n", 146 | " text = re.sub(\"[^{}]\".format(hp_vocab), \" \", text)\n", 147 | " text = re.sub(\"[ ]+\", \" \", text)\n", 148 | " return text" 149 | ], 150 | "execution_count": 0, 151 | "outputs": [] 152 | }, 153 | { 154 | "metadata": { 155 | "id": "7ip2NRq_NCQA", 156 | "colab_type": "code", 157 | "colab": {} 158 | }, 159 | "cell_type": "code", 160 | "source": [ 161 | "sents = [' a strange game. 
the only winning move is not to play.E']" 162 | ], 163 | "execution_count": 0, 164 | "outputs": [] 165 | }, 166 | { 167 | "metadata": { 168 | "id": "2wcIZUbmM7OK", 169 | "colab_type": "text" 170 | }, 171 | "cell_type": "markdown", 172 | "source": [ 173 | "## may have to add spaces to improve pronunciation (skip the normalization)" 174 | ] 175 | }, 176 | { 177 | "metadata": { 178 | "id": "1qQC3o6xrMFS", 179 | "colab_type": "code", 180 | "colab": {} 181 | }, 182 | "cell_type": "code", 183 | "source": [ 184 | "char2idx, idx2char = load_vocab()\n", 185 | "\n", 186 | "texts = np.zeros((len(sents), hp_max_N), np.int32)\n", 187 | "for i, sent in enumerate(sents):\n", 188 | " texts[i, :len(sent)] = [char2idx[char] for char in sent]\n", 189 | " \n", 190 | "L = texts" 191 | ], 192 | "execution_count": 0, 193 | "outputs": [] 194 | }, 195 | { 196 | "metadata": { 197 | "id": "ZjPncq6_sR8Y", 198 | "colab_type": "code", 199 | "colab": {} 200 | }, 201 | "cell_type": "code", 202 | "source": [ 203 | "\n", 204 | "with tf.Session() as sess:\n", 205 | " sess.run(tf.global_variables_initializer())\n", 206 | "\n", 207 | " # Restore parameters\n", 208 | " var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'Text2Mel')\n", 209 | " saver1 = tf.train.Saver(var_list=var_list)\n", 210 | " saver1.restore(sess, tf.train.latest_checkpoint(hp_restoredir + \"-1\"))\n", 211 | " print(\"Text2Mel Restored!\")\n", 212 | "\n", 213 | " var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'SSRN') + \\\n", 214 | " tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 'gs')\n", 215 | " saver2 = tf.train.Saver(var_list=var_list)\n", 216 | " saver2.restore(sess, tf.train.latest_checkpoint(hp_restoredir + \"-2\"))\n", 217 | " print(\"SSRN Restored!\")\n", 218 | "\n", 219 | " # Feed Forward\n", 220 | " ## mel\n", 221 | " Y = np.zeros((len(L), hp_max_T, hp_n_mels), np.float32)\n", 222 | " prev_max_attentions = np.zeros((len(L),), np.int32)\n", 223 | " for j in tqdm(range(hp_max_T)):\n", 224 | " _gs, _Y, _max_attentions, _alignments = \\\n", 225 | " sess.run([g.global_step, g.Y, g.max_attentions, g.alignments],\n", 226 | " {g.L: L,\n", 227 | " g.mels: Y,\n", 228 | " g.prev_max_attentions: prev_max_attentions})\n", 229 | " Y[:, j, :] = _Y[:, j, :]\n", 230 | " prev_max_attentions = _max_attentions[:, j]\n", 231 | "\n", 232 | " # Get magnitude\n", 233 | " Z = sess.run(g.Z, {g.Y: Y})\n", 234 | " mag = Z[0]\n", 235 | " wav = spectrogram2wav(mag)" 236 | ], 237 | "execution_count": 0, 238 | "outputs": [] 239 | }, 240 | { 241 | "metadata": { 242 | "id": "HGZDoC9Z8_u7", 243 | "colab_type": "code", 244 | "colab": {} 245 | }, 246 | "cell_type": "code", 247 | "source": [ 248 | "Audio(wav, rate=hp_sr)" 249 | ], 250 | "execution_count": 0, 251 | "outputs": [] 252 | }, 253 | { 254 | "metadata": { 255 | "id": "fLgN3JKYFPW1", 256 | "colab_type": "code", 257 | "colab": {} 258 | }, 259 | "cell_type": "code", 260 | "source": [ 261 | "b, a = sg.butter(4, 7300. / (hp_sr / 2.), 'low')\n", 262 | "wav_fil = sg.filtfilt(b, a, wav)\n", 263 | "Audio(wav_fil, rate=hp_sr)" 264 | ], 265 | "execution_count": 0, 266 | "outputs": [] 267 | } 268 | ] 269 | } -------------------------------------------------------------------------------- /train_transfer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # /usr/bin/python2 3 | ''' 4 | By kyubyong park. kbpark.linguist@gmail.com. 5 | https://www.github.com/kyubyong/dc_tts 6 | 7 | Modified by sean leary. 
learysean1@hotmail.com 8 | https://github.com/SeanPLeary/dc_tts-transfer-learning 9 | ''' 10 | 11 | from __future__ import print_function 12 | 13 | from tqdm import tqdm 14 | 15 | from data_load import get_batch, load_vocab 16 | from hyperparams import Hyperparams as hp 17 | from modules import * 18 | from networks import TextEnc, AudioEnc, AudioDec, Attention, SSRN 19 | import tensorflow as tf 20 | from utils import * 21 | import sys 22 | 23 | 24 | class Graph: 25 | def __init__(self, num=1, mode="train"): 26 | ''' 27 | Args: 28 | num: Either 1 or 2. 1 for Text2Mel 2 for SSRN. 29 | mode: Either "train" or "synthesize". 30 | ''' 31 | # Load vocabulary 32 | self.char2idx, self.idx2char = load_vocab() 33 | 34 | # Set flag 35 | training = True if mode=="train" else False 36 | 37 | # Graph 38 | # Data Feeding 39 | ## L: Text. (B, N), int32 40 | ## mels: Reduced melspectrogram. (B, T/r, n_mels) float32 41 | ## mags: Magnitude. (B, T, n_fft//2+1) float32 42 | if mode=="train": 43 | self.L, self.mels, self.mags, self.fnames, self.num_batch = get_batch() 44 | self.prev_max_attentions = tf.ones(shape=(hp.B,), dtype=tf.int32) 45 | self.gts = tf.convert_to_tensor(guided_attention()) 46 | else: # Synthesize 47 | self.L = tf.placeholder(tf.int32, shape=(None, None)) 48 | self.mels = tf.placeholder(tf.float32, shape=(None, None, hp.n_mels)) 49 | self.prev_max_attentions = tf.placeholder(tf.int32, shape=(None,)) 50 | 51 | if num==1 or (not training): 52 | with tf.variable_scope("Text2Mel"): 53 | # Get S or decoder inputs. (B, T//r, n_mels) 54 | self.S = tf.concat((tf.zeros_like(self.mels[:, :1, :]), self.mels[:, :-1, :]), 1) 55 | 56 | # Networks 57 | with tf.variable_scope("TextEnc"): 58 | self.K, self.V = TextEnc(self.L, training=training) # (N, Tx, e) 59 | 60 | with tf.variable_scope("AudioEnc"): 61 | self.Q = AudioEnc(self.S, training=training) 62 | 63 | with tf.variable_scope("Attention"): 64 | # R: (B, T/r, 2d) 65 | # alignments: (B, N, T/r) 66 | # max_attentions: (B,) 67 | self.R, self.alignments, self.max_attentions = Attention(self.Q, self.K, self.V, 68 | mononotic_attention=(not training), 69 | prev_max_attentions=self.prev_max_attentions) 70 | with tf.variable_scope("AudioDec"): 71 | self.Y_logits, self.Y = AudioDec(self.R, training=training) # (B, T/r, n_mels) 72 | else: # num==2 & training. Note that during training, 73 | # the ground truth melspectrogram values are fed. 74 | with tf.variable_scope("SSRN"): 75 | self.Z_logits, self.Z = SSRN(self.mels, training=training) 76 | 77 | if not training: 78 | # During inference, the predicted melspectrogram values are fed. 
79 | with tf.variable_scope("SSRN"): 80 | self.Z_logits, self.Z = SSRN(self.Y, training=training) 81 | 82 | with tf.variable_scope("gs"): 83 | self.global_step = tf.Variable(0, name='global_step', trainable=False) 84 | 85 | if training: 86 | if num==1: # Text2Mel 87 | # mel L1 loss 88 | self.loss_mels = tf.reduce_mean(tf.abs(self.Y - self.mels)) 89 | 90 | # mel binary divergence loss 91 | self.loss_bd1 = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.Y_logits, labels=self.mels)) 92 | 93 | # guided_attention loss 94 | self.A = tf.pad(self.alignments, [(0, 0), (0, hp.max_N), (0, hp.max_T)], mode="CONSTANT", constant_values=-1.)[:, :hp.max_N, :hp.max_T] 95 | self.attention_masks = tf.to_float(tf.not_equal(self.A, -1)) 96 | self.loss_att = tf.reduce_sum(tf.abs(self.A * self.gts) * self.attention_masks) 97 | self.mask_sum = tf.reduce_sum(self.attention_masks) 98 | self.loss_att /= self.mask_sum 99 | 100 | # total loss 101 | self.loss = self.loss_mels + self.loss_bd1 + self.loss_att 102 | 103 | tf.summary.scalar('train/loss_mels', self.loss_mels) 104 | tf.summary.scalar('train/loss_bd1', self.loss_bd1) 105 | tf.summary.scalar('train/loss_att', self.loss_att) 106 | tf.summary.image('train/mel_gt', tf.expand_dims(tf.transpose(self.mels[:1], [0, 2, 1]), -1)) 107 | tf.summary.image('train/mel_hat', tf.expand_dims(tf.transpose(self.Y[:1], [0, 2, 1]), -1)) 108 | else: # SSRN 109 | # mag L1 loss 110 | self.loss_mags = tf.reduce_mean(tf.abs(self.Z - self.mags)) 111 | 112 | # mag binary divergence loss 113 | self.loss_bd2 = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.Z_logits, labels=self.mags)) 114 | 115 | # total loss 116 | self.loss = self.loss_mags + self.loss_bd2 117 | 118 | tf.summary.scalar('train/loss_mags', self.loss_mags) 119 | tf.summary.scalar('train/loss_bd2', self.loss_bd2) 120 | tf.summary.image('train/mag_gt', tf.expand_dims(tf.transpose(self.mags[:1], [0, 2, 1]), -1)) 121 | tf.summary.image('train/mag_hat', tf.expand_dims(tf.transpose(self.Z[:1], [0, 2, 1]), -1)) 122 | 123 | # Training Scheme 124 | self.lr = learning_rate_decay(hp.lr, self.global_step) 125 | tvars = tf.trainable_variables() 126 | tvars_new = [] 127 | for tvar in hp.selected_tvars: 128 | tvars_new = tvars_new + [var for var in tvars if tvar in var.name] 129 | # tvars_new = [var for var in tvars if ('SSRN/C_13') in var.name] + \ 130 | # [var for var in tvars if ('SSRN/C_14') in var.name] + \ 131 | # [var for var in tvars if ('SSRN/C_15') in var.name] + \ 132 | # [var for var in tvars if ('SSRN/C_16') in var.name] + \ 133 | # [var for var in tvars if ('Text2Mel/TextEnc/HC_11') in var.name] + \ 134 | # [var for var in tvars if ('Text2Mel/TextEnc/HC_12') in var.name] + \ 135 | # [var for var in tvars if ('Text2Mel/TextEnc/HC_13') in var.name] + \ 136 | # [var for var in tvars if ('Text2Mel/TextEnc/HC_14') in var.name] + \ 137 | # [var for var in tvars if ('Text2Mel/TextEnc/HC_15') in var.name] + \ 138 | # [var for var in tvars if ('Text2Mel/AudioEnc/HC_9') in var.name] + \ 139 | # [var for var in tvars if ('Text2Mel/AudioEnc/HC_10') in var.name] + \ 140 | # [var for var in tvars if ('Text2Mel/AudioEnc/HC_11') in var.name] + \ 141 | # [var for var in tvars if ('Text2Mel/AudioEnc/HC_12') in var.name] + \ 142 | # [var for var in tvars if ('Text2Mel/AudioEnc/HC_13') in var.name] + \ 143 | # [var for var in tvars if ('Text2Mel/AudioDec/HC_7') in var.name] + \ 144 | # [var for var in tvars if ('Text2Mel/AudioDec/C_8') in var.name] + \ 145 | # [var for var in tvars if 
('Text2Mel/AudioDec/C_9') in var.name] + \ 146 | # [var for var in tvars if ('Text2Mel/AudioDec/C_10') in var.name] + \ 147 | # [var for var in tvars if ('Text2Mel/AudioDec/C_11') in var.name] 148 | self.optimizer = tf.train.AdamOptimizer(learning_rate=self.lr) 149 | #self.optimizer = tf.train.AdamOptimizer(learning_rate=self.lr) 150 | tf.summary.scalar("lr", self.lr) 151 | 152 | ## gradient clipping 153 | self.gvs = self.optimizer.compute_gradients(self.loss, var_list=tvars_new) 154 | self.clipped = [] 155 | for grad, var in self.gvs: 156 | grad = tf.clip_by_value(grad, -1., 1.) 157 | self.clipped.append((grad, var)) 158 | self.train_op = self.optimizer.apply_gradients(self.clipped, global_step=self.global_step) 159 | 160 | # Summary 161 | self.merged = tf.summary.merge_all() 162 | 163 | 164 | if __name__ == '__main__': 165 | # argument: 1 or 2. 1 for Text2Mel, 2 for SSRN. 166 | num = int(sys.argv[1]) 167 | 168 | g = Graph(num=num); print("Training Graph loaded") 169 | 170 | logdir = hp.logdir + "-" + str(num) 171 | sv = tf.train.Supervisor(logdir=logdir, save_model_secs=0, global_step=g.global_step) 172 | #with sv.managed_session() as sess: 173 | with sv.managed_session(config = tf.ConfigProto(allow_soft_placement=True)) as sess: 174 | sv.saver.restore(sess, tf.train.latest_checkpoint(hp.restoredir + "-" + str(num))) 175 | while 1: 176 | for _ in tqdm(range(g.num_batch), total=g.num_batch, ncols=70, leave=False, unit='b'): 177 | gs, _ = sess.run([g.global_step, g.train_op]) 178 | 179 | # Write checkpoint files at every 1k steps 180 | if gs % 1000 == 0: 181 | sv.saver.save(sess, logdir + '/model_gs_{}'.format(str(gs // 1000).zfill(3) + "k")) 182 | 183 | if num==1: 184 | # plot alignment 185 | alignments = sess.run(g.alignments) 186 | plot_alignment(alignments[0], str(gs // 1000).zfill(3) + "k", logdir) 187 | 188 | # break 189 | if gs > hp.num_iterations: break 190 | 191 | print("Done") 192 | -------------------------------------------------------------------------------- /modules.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python2 3 | ''' 4 | By kyubyong park. kbpark.linguist@gmail.com. 5 | https://www.github.com/kyubyong/dc_tts 6 | ''' 7 | 8 | from __future__ import print_function, division 9 | 10 | import tensorflow as tf 11 | 12 | 13 | def embed(inputs, vocab_size, num_units, zero_pad=True, scope="embedding", reuse=None): 14 | '''Embeds a given tensor. 15 | 16 | Args: 17 | inputs: A `Tensor` with type `int32` or `int64` containing the ids 18 | to be looked up in `lookup table`. 19 | vocab_size: An int. Vocabulary size. 20 | num_units: An int. Number of embedding hidden units. 21 | zero_pad: A boolean. If True, all the values of the first row (id 0) 22 | should be constant zeros. 23 | scope: Optional scope for `variable_scope`. 24 | reuse: Boolean, whether to reuse the weights of a previous layer 25 | by the same name. 26 | 27 | Returns: 28 | A `Tensor` with one more rank than `inputs`. The last dimensionality 29 | should be `num_units`. 
30 | ''' 31 | with tf.variable_scope(scope, reuse=reuse): 32 | lookup_table = tf.get_variable('lookup_table', 33 | dtype=tf.float32, 34 | shape=[vocab_size, num_units], 35 | initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.1)) 36 | if zero_pad: 37 | lookup_table = tf.concat((tf.zeros(shape=[1, num_units]), 38 | lookup_table[1:, :]), 0) 39 | 40 | outputs = tf.nn.embedding_lookup(lookup_table, inputs) 41 | 42 | return outputs 43 | 44 | 45 | def normalize(inputs, 46 | scope="normalize", 47 | reuse=None): 48 | '''Applies layer normalization that normalizes along the last axis. 49 | 50 | Args: 51 | inputs: A tensor with 2 or more dimensions, where the first dimension has 52 | `batch_size`. The normalization is over the last dimension. 53 | scope: Optional scope for `variable_scope`. 54 | reuse: Boolean, whether to reuse the weights of a previous layer 55 | by the same name. 56 | 57 | Returns: 58 | A tensor with the same shape and data dtype as `inputs`. 59 | ''' 60 | outputs = tf.contrib.layers.layer_norm(inputs, 61 | begin_norm_axis=-1, 62 | scope=scope, 63 | reuse=reuse) 64 | return outputs 65 | 66 | 67 | def highwaynet(inputs, num_units=None, scope="highwaynet", reuse=None): 68 | '''Highway networks, see https://arxiv.org/abs/1505.00387 69 | 70 | Args: 71 | inputs: A 3D tensor of shape [N, T, W]. 72 | num_units: An int or `None`. Specifies the number of units in the highway layer 73 | or uses the input size if `None`. 74 | scope: Optional scope for `variable_scope`. 75 | reuse: Boolean, whether to reuse the weights of a previous layer 76 | by the same name. 77 | 78 | Returns: 79 | A 3D tensor of shape [N, T, W]. 80 | ''' 81 | if not num_units: 82 | num_units = inputs.get_shape()[-1] 83 | 84 | with tf.variable_scope(scope, reuse=reuse): 85 | H = tf.layers.dense(inputs, units=num_units, activation=tf.nn.relu, name="dense1") 86 | T = tf.layers.dense(inputs, units=num_units, activation=tf.nn.sigmoid, 87 | bias_initializer=tf.constant_initializer(-1.0), name="dense2") 88 | outputs = H * T + inputs * (1. - T) 89 | return outputs 90 | 91 | def conv1d(inputs, 92 | filters=None, 93 | size=1, 94 | rate=1, 95 | padding="SAME", 96 | dropout_rate=0, 97 | use_bias=True, 98 | activation_fn=None, 99 | training=True, 100 | scope="conv1d", 101 | reuse=None): 102 | ''' 103 | Args: 104 | inputs: A 3-D tensor with shape of [batch, time, depth]. 105 | filters: An int. Number of outputs (=activation maps) 106 | size: An int. Filter size. 107 | rate: An int. Dilation rate. 108 | padding: Either `same` or `valid` or `causal` (case-insensitive). 109 | dropout_rate: A float of [0, 1]. 110 | use_bias: A boolean. 111 | activation_fn: An activation function or None. 112 | training: A boolean. If True, dropout is applied. 113 | scope: Optional scope for `variable_scope`. 114 | reuse: Boolean, whether to reuse the weights of a previous layer 115 | by the same name. 116 | 117 | Returns: 118 | A tensor of shape [batch, time, filters]. 
119 | ''' 120 | with tf.variable_scope(scope): 121 | if padding.lower() == "causal": 122 | # pre-padding for causality 123 | pad_len = (size - 1) * rate # padding size 124 | inputs = tf.pad(inputs, [[0, 0], [pad_len, 0], [0, 0]]) 125 | padding = "valid" 126 | 127 | if filters is None: 128 | filters = inputs.get_shape().as_list()[-1] 129 | 130 | params = {"inputs": inputs, "filters": filters, "kernel_size": size, 131 | "dilation_rate": rate, "padding": padding, "use_bias": use_bias, 132 | "kernel_initializer": tf.contrib.layers.variance_scaling_initializer(), "reuse": reuse} 133 | 134 | tensor = tf.layers.conv1d(**params) 135 | tensor = normalize(tensor) 136 | if activation_fn is not None: 137 | tensor = activation_fn(tensor) 138 | 139 | tensor = tf.layers.dropout(tensor, rate=dropout_rate, training=training) 140 | 141 | return tensor 142 | 143 | def hc(inputs, 144 | filters=None, 145 | size=1, 146 | rate=1, 147 | padding="SAME", 148 | dropout_rate=0, 149 | use_bias=True, 150 | activation_fn=None, 151 | training=True, 152 | scope="hc", 153 | reuse=None): 154 | ''' 155 | Args: 156 | inputs: A 3-D tensor with shape of [batch, time, depth]. 157 | filters: An int. Number of outputs (=activation maps) 158 | size: An int. Filter size. 159 | rate: An int. Dilation rate. 160 | padding: Either `same` or `valid` or `causal` (case-insensitive). 161 | use_bias: A boolean. 162 | activation_fn: An activation function or None. 163 | training: A boolean. If True, dropout is applied. 164 | scope: Optional scope for `variable_scope`. 165 | reuse: Boolean, whether to reuse the weights of a previous layer 166 | by the same name. 167 | 168 | Returns: 169 | A tensor of the same shape and dtype as `inputs`. 170 | ''' 171 | _inputs = inputs 172 | with tf.variable_scope(scope): 173 | if padding.lower() == "causal": 174 | # pre-padding for causality 175 | pad_len = (size - 1) * rate # padding size 176 | inputs = tf.pad(inputs, [[0, 0], [pad_len, 0], [0, 0]]) 177 | padding = "valid" 178 | 179 | if filters is None: 180 | filters = inputs.get_shape().as_list()[-1] 181 | 182 | 183 | params = {"inputs": inputs, "filters": 2*filters, "kernel_size": size, 184 | "dilation_rate": rate, "padding": padding, "use_bias": use_bias, 185 | "kernel_initializer": tf.contrib.layers.variance_scaling_initializer(), "reuse": reuse} 186 | 187 | tensor = tf.layers.conv1d(**params) 188 | H1, H2 = tf.split(tensor, 2, axis=-1) 189 | H1 = normalize(H1, scope="H1") 190 | H2 = normalize(H2, scope="H2") 191 | H1 = tf.nn.sigmoid(H1, "gate") 192 | H2 = activation_fn(H2, "info") if activation_fn is not None else H2 193 | tensor = H1*H2 + (1.-H1)*_inputs 194 | 195 | tensor = tf.layers.dropout(tensor, rate=dropout_rate, training=training) 196 | 197 | return tensor 198 | 199 | def conv1d_transpose(inputs, 200 | filters=None, 201 | size=3, 202 | stride=2, 203 | padding='same', 204 | dropout_rate=0, 205 | use_bias=True, 206 | activation=None, 207 | training=True, 208 | scope="conv1d_transpose", 209 | reuse=None): 210 | ''' 211 | Args: 212 | inputs: A 3-D tensor with shape of [batch, time, depth]. 213 | filters: An int. Number of outputs (=activation maps) 214 | size: An int. Filter size. 215 | stride: An int. Stride of the transposed convolution. 216 | padding: Either `same` or `valid` (case-insensitive). 217 | dropout_rate: A float of [0, 1]. 218 | use_bias: A boolean. 219 | activation: An activation function or None. 220 | training: A boolean. If True, dropout is applied. 221 | scope: Optional scope for `variable_scope`. 
222 | reuse: Boolean, whether to reuse the weights of a previous layer 223 | by the same name. 224 | 225 | Returns: 226 | A tensor of the shape with [batch, time*2, depth]. 227 | ''' 228 | with tf.variable_scope(scope, reuse=reuse): 229 | if filters is None: 230 | filters = inputs.get_shape().as_list()[-1] 231 | inputs = tf.expand_dims(inputs, 1) 232 | tensor = tf.layers.conv2d_transpose(inputs, 233 | filters=filters, 234 | kernel_size=(1, size), 235 | strides=(1, stride), 236 | padding=padding, 237 | activation=None, 238 | kernel_initializer=tf.contrib.layers.variance_scaling_initializer(), 239 | use_bias=use_bias) 240 | tensor = tf.squeeze(tensor, 1) 241 | tensor = normalize(tensor) 242 | if activation is not None: 243 | tensor = activation(tensor) 244 | 245 | tensor = tf.layers.dropout(tensor, rate=dropout_rate, training=training) 246 | 247 | return tensor 248 | 249 | 250 | 251 | 252 | 253 | -------------------------------------------------------------------------------- /networks.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python2 3 | ''' 4 | By kyubyong park. kbpark.linguist@gmail.com. 5 | https://www.github.com/kyubyong/dc_tts 6 | ''' 7 | 8 | from __future__ import print_function 9 | 10 | from hyperparams import Hyperparams as hp 11 | from modules import * 12 | import tensorflow as tf 13 | 14 | def TextEnc(L, training=True): 15 | ''' 16 | Args: 17 | L: Text inputs. (B, N) 18 | 19 | Return: 20 | K: Keys. (B, N, d) 21 | V: Values. (B, N, d) 22 | ''' 23 | i = 1 24 | tensor = embed(L, 25 | vocab_size=len(hp.vocab), 26 | num_units=hp.e, 27 | scope="embed_{}".format(i)); i += 1 28 | tensor = conv1d(tensor, 29 | filters=2*hp.d, 30 | size=1, 31 | rate=1, 32 | dropout_rate=hp.dropout_rate, 33 | activation_fn=tf.nn.relu, 34 | training=training, 35 | scope="C_{}".format(i)); i += 1 36 | tensor = conv1d(tensor, 37 | size=1, 38 | rate=1, 39 | dropout_rate=hp.dropout_rate, 40 | training=training, 41 | scope="C_{}".format(i)); i += 1 42 | 43 | for _ in range(2): 44 | for j in range(4): 45 | tensor = hc(tensor, 46 | size=3, 47 | rate=3**j, 48 | dropout_rate=hp.dropout_rate, 49 | activation_fn=None, 50 | training=training, 51 | scope="HC_{}".format(i)); i += 1 52 | for _ in range(2): 53 | tensor = hc(tensor, 54 | size=3, 55 | rate=1, 56 | dropout_rate=hp.dropout_rate, 57 | activation_fn=None, 58 | training=training, 59 | scope="HC_{}".format(i)); i += 1 60 | 61 | for _ in range(2): 62 | tensor = hc(tensor, 63 | size=1, 64 | rate=1, 65 | dropout_rate=hp.dropout_rate, 66 | activation_fn=None, 67 | training=training, 68 | scope="HC_{}".format(i)); i += 1 69 | 70 | K, V = tf.split(tensor, 2, -1) 71 | return K, V 72 | 73 | def AudioEnc(S, training=True): 74 | ''' 75 | Args: 76 | S: melspectrogram. (B, T/r, n_mels) 77 | 78 | Returns 79 | Q: Queries. 
(B, T/r, d) 80 | ''' 81 | i = 1 82 | tensor = conv1d(S, 83 | filters=hp.d, 84 | size=1, 85 | rate=1, 86 | padding="CAUSAL", 87 | dropout_rate=hp.dropout_rate, 88 | activation_fn=tf.nn.relu, 89 | training=training, 90 | scope="C_{}".format(i)); i += 1 91 | tensor = conv1d(tensor, 92 | size=1, 93 | rate=1, 94 | padding="CAUSAL", 95 | dropout_rate=hp.dropout_rate, 96 | activation_fn=tf.nn.relu, 97 | training=training, 98 | scope="C_{}".format(i)); i += 1 99 | tensor = conv1d(tensor, 100 | size=1, 101 | rate=1, 102 | padding="CAUSAL", 103 | dropout_rate=hp.dropout_rate, 104 | training=training, 105 | scope="C_{}".format(i)); i += 1 106 | for _ in range(2): 107 | for j in range(4): 108 | tensor = hc(tensor, 109 | size=3, 110 | rate=3**j, 111 | padding="CAUSAL", 112 | dropout_rate=hp.dropout_rate, 113 | training=training, 114 | scope="HC_{}".format(i)); i += 1 115 | for _ in range(2): 116 | tensor = hc(tensor, 117 | size=3, 118 | rate=3, 119 | padding="CAUSAL", 120 | dropout_rate=hp.dropout_rate, 121 | training=training, 122 | scope="HC_{}".format(i)); i += 1 123 | 124 | return tensor 125 | 126 | def Attention(Q, K, V, mononotic_attention=False, prev_max_attentions=None): 127 | ''' 128 | Args: 129 | Q: Queries. (B, T/r, d) 130 | K: Keys. (B, N, d) 131 | V: Values. (B, N, d) 132 | mononotic_attention: A boolean. At training, it is False. 133 | prev_max_attentions: (B,). At training, it is set to None. 134 | 135 | Returns: 136 | R: [Context Vectors; Q]. (B, T/r, 2d) 137 | alignments: (B, N, T/r) 138 | max_attentions: (B, T/r) 139 | ''' 140 | A = tf.matmul(Q, K, transpose_b=True) * tf.rsqrt(tf.to_float(hp.d)) 141 | if mononotic_attention: # for inference 142 | key_masks = tf.sequence_mask(prev_max_attentions, hp.max_N) 143 | reverse_masks = tf.sequence_mask(hp.max_N - hp.attention_win_size - prev_max_attentions, hp.max_N)[:, ::-1] 144 | masks = tf.logical_or(key_masks, reverse_masks) 145 | masks = tf.tile(tf.expand_dims(masks, 1), [1, hp.max_T, 1]) 146 | paddings = tf.ones_like(A) * (-2 ** 32 + 1) # (B, T/r, N) 147 | A = tf.where(tf.equal(masks, False), A, paddings) 148 | A = tf.nn.softmax(A) # (B, T/r, N) 149 | max_attentions = tf.argmax(A, -1) # (B, T/r) 150 | R = tf.matmul(A, V) 151 | R = tf.concat((R, Q), -1) 152 | 153 | alignments = tf.transpose(A, [0, 2, 1]) # (B, N, T/r) 154 | 155 | return R, alignments, max_attentions 156 | 157 | def AudioDec(R, training=True): 158 | ''' 159 | Args: 160 | R: [Context Vectors; Q]. (B, T/r, 2d) 161 | 162 | Returns: 163 | Y: Melspectrogram predictions. 
(B, T/r, n_mels) 164 | ''' 165 | 166 | i = 1 167 | tensor = conv1d(R, 168 | filters=hp.d, 169 | size=1, 170 | rate=1, 171 | padding="CAUSAL", 172 | dropout_rate=hp.dropout_rate, 173 | training=training, 174 | scope="C_{}".format(i)); i += 1 175 | for j in range(4): 176 | tensor = hc(tensor, 177 | size=3, 178 | rate=3**j, 179 | padding="CAUSAL", 180 | dropout_rate=hp.dropout_rate, 181 | training=training, 182 | scope="HC_{}".format(i)); i += 1 183 | 184 | for _ in range(2): 185 | tensor = hc(tensor, 186 | size=3, 187 | rate=1, 188 | padding="CAUSAL", 189 | dropout_rate=hp.dropout_rate, 190 | training=training, 191 | scope="HC_{}".format(i)); i += 1 192 | for _ in range(3): 193 | tensor = conv1d(tensor, 194 | size=1, 195 | rate=1, 196 | padding="CAUSAL", 197 | dropout_rate=hp.dropout_rate, 198 | activation_fn=tf.nn.relu, 199 | training=training, 200 | scope="C_{}".format(i)); i += 1 201 | # mel_hats 202 | logits = conv1d(tensor, 203 | filters=hp.n_mels, 204 | size=1, 205 | rate=1, 206 | padding="CAUSAL", 207 | dropout_rate=hp.dropout_rate, 208 | training=training, 209 | scope="C_{}".format(i)); i += 1 210 | Y = tf.nn.sigmoid(logits) # mel_hats 211 | 212 | return logits, Y 213 | 214 | def SSRN(Y, training=True): 215 | ''' 216 | Args: 217 | Y: Melspectrogram Predictions. (B, T/r, n_mels) 218 | 219 | Returns: 220 | Z: Spectrogram Predictions. (B, T, 1+n_fft/2) 221 | ''' 222 | 223 | i = 1 # number of layers 224 | 225 | # -> (B, T/r, c) 226 | tensor = conv1d(Y, 227 | filters=hp.c, 228 | size=1, 229 | rate=1, 230 | dropout_rate=hp.dropout_rate, 231 | training=training, 232 | scope="C_{}".format(i)); i += 1 233 | for j in range(2): 234 | tensor = hc(tensor, 235 | size=3, 236 | rate=3**j, 237 | dropout_rate=hp.dropout_rate, 238 | training=training, 239 | scope="HC_{}".format(i)); i += 1 240 | for _ in range(2): 241 | # -> (B, T/2, c) -> (B, T, c) 242 | tensor = conv1d_transpose(tensor, 243 | scope="D_{}".format(i), 244 | dropout_rate=hp.dropout_rate, 245 | training=training,); i += 1 246 | for j in range(2): 247 | tensor = hc(tensor, 248 | size=3, 249 | rate=3**j, 250 | dropout_rate=hp.dropout_rate, 251 | training=training, 252 | scope="HC_{}".format(i)); i += 1 253 | # -> (B, T, 2*c) 254 | tensor = conv1d(tensor, 255 | filters=2*hp.c, 256 | size=1, 257 | rate=1, 258 | dropout_rate=hp.dropout_rate, 259 | training=training, 260 | scope="C_{}".format(i)); i += 1 261 | for _ in range(2): 262 | tensor = hc(tensor, 263 | size=3, 264 | rate=1, 265 | dropout_rate=hp.dropout_rate, 266 | training=training, 267 | scope="HC_{}".format(i)); i += 1 268 | # -> (B, T, 1+n_fft/2) 269 | tensor = conv1d(tensor, 270 | filters=1+hp.n_fft//2, 271 | size=1, 272 | rate=1, 273 | dropout_rate=hp.dropout_rate, 274 | training=training, 275 | scope="C_{}".format(i)); i += 1 276 | 277 | for _ in range(2): 278 | tensor = conv1d(tensor, 279 | size=1, 280 | rate=1, 281 | dropout_rate=hp.dropout_rate, 282 | activation_fn=tf.nn.relu, 283 | training=training, 284 | scope="C_{}".format(i)); i += 1 285 | logits = conv1d(tensor, 286 | size=1, 287 | rate=1, 288 | dropout_rate=hp.dropout_rate, 289 | training=training, 290 | scope="C_{}".format(i)) 291 | Z = tf.nn.sigmoid(logits) 292 | return logits, Z 293 | --------------------------------------------------------------------------------