├── .gitignore
├── LICENSE
├── README.md
├── datasets
│   ├── __init__.py
│   ├── audio.py
│   ├── preprocessor.py
│   └── wavenet_preprocessor.py
├── docker
│   └── Dockerfile
├── griffin_lim_synthesis_tool.ipynb
├── hparams.py
├── infolog.py
├── paper_hparams.py
├── papers
│   ├── (content+location) attention.pdf
│   ├── ClariNet.pdf
│   ├── Tacotron 2 revised.pdf
│   ├── bahdanau (content) attention.pdf
│   ├── deepvoice 3.pdf
│   ├── effective approaches attention.pdf
│   ├── fast_wavenet.pdf
│   ├── tacotron.pdf
│   ├── tacotron2.pdf
│   └── wavenet.pdf
├── preprocess.py
├── requirements.txt
├── sentences.txt
├── synthesize.py
├── tacotron
│   ├── __init__.py
│   ├── feeder.py
│   ├── models
│   │   ├── Architecture_wrappers.py
│   │   ├── __init__.py
│   │   ├── attention.py
│   │   ├── custom_decoder.py
│   │   ├── helpers.py
│   │   ├── modules.py
│   │   └── tacotron.py
│   ├── synthesize.py
│   ├── synthesizer.py
│   ├── train.py
│   └── utils
│       ├── __init__.py
│       ├── cleaners.py
│       ├── cmudict.py
│       ├── numbers.py
│       ├── plot.py
│       ├── symbols.py
│       └── text.py
├── test_wavenet_feeder.py
├── train.py
├── wavenet_preprocess.py
└── wavenet_vocoder
    ├── __init__.py
    ├── feeder.py
    ├── models
    │   ├── __init__.py
    │   ├── gaussian.py
    │   ├── mixture.py
    │   ├── modules.py
    │   └── wavenet.py
    ├── synthesize.py
    ├── synthesizer.py
    ├── train.py
    └── util.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | # celery beat schedule file
79 | celerybeat-schedule
80 |
81 | # SageMath parsed files
82 | *.sage.py
83 |
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # Rope project settings
98 | .ropeproject
99 |
100 | # mkdocs documentation
101 | /site
102 |
103 | # mypy
104 | .mypy_cache/
105 |
106 | # Tacotron 2 oddities
107 | logs-*/
108 | training_data/
109 |
110 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 Rayhane Mama
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Tacotron-2:
2 | Tensorflow implementation of Google's Tacotron-2, a deep neural network architecture described in this paper: [Natural TTS synthesis by conditioning Wavenet on Mel spectrogram predictions](https://arxiv.org/pdf/1712.05884.pdf)
3 |
4 | This repository contains additional improvements and experiments on top of the paper; we therefore provide a **paper_hparams.py** file which holds the exact hyperparameters needed to reproduce the paper's results without any additional extras.
5 |
6 | The suggested **hparams.py** file, used by default, contains hyperparameters with extras that have proved to give better results in most cases. Feel free to toy with the parameters as needed.
7 |
8 | DIFFERENCES WILL BE HIGHLIGHTED IN DOCUMENTATION SHORTLY.
9 |
10 |
11 | # Repository Structure:
12 | Tacotron-2
13 | ├── datasets
14 | ├── en_UK (0)
15 | │ └── by_book
16 | │ └── female
17 | ├── en_US (0)
18 | │ └── by_book
19 | │ ├── female
20 | │ └── male
21 | ├── LJSpeech-1.1 (0)
22 | │ └── wavs
23 | ├── logs-Tacotron (2)
24 | │ ├── eval-dir
25 | │ │ ├── plots
26 | │ │ └── wavs
27 | │ ├── mel-spectrograms
28 | │ ├── plots
29 | │ ├── taco_pretrained
30 | │ ├── metas
31 | │ └── wavs
32 | ├── logs-Wavenet (4)
33 | │ ├── eval-dir
34 | │ │ ├── plots
35 | │ │ └── wavs
36 | │ ├── plots
37 | │ ├── wave_pretrained
38 | │ ├── metas
39 | │ └── wavs
40 | ├── logs-Tacotron-2 ( * )
41 | │ ├── eval-dir
42 | │ │ ├── plots
43 | │ │ └── wavs
44 | │ ├── plots
45 | │ ├── taco_pretrained
46 | │ ├── wave_pretrained
47 | │ ├── metas
48 | │ └── wavs
49 | ├── papers
50 | ├── tacotron
51 | │ ├── models
52 | │ └── utils
53 | ├── tacotron_output (3)
54 | │ ├── eval
55 | │ ├── gta
56 | │ ├── logs-eval
57 | │ │ ├── plots
58 | │ │ └── wavs
59 | │ └── natural
60 | ├── wavenet_output (5)
61 | │ ├── plots
62 | │ └── wavs
63 | ├── training_data (1)
64 | │ ├── audio
65 | │ ├── linear
66 | │ └── mels
67 | └── wavenet_vocoder
68 | └── models
69 |
70 |
71 | The previous tree shows the current state of the repository (separate training, one step at a time).
72 |
73 | - Step **(0)**: Get your dataset, here I have set the examples of **Ljspeech**, **en_US** and **en_UK** (from **M-AILABS**).
74 | - Step **(1)**: Preprocess your data. This will give you the **training_data** folder.
75 | - Step **(2)**: Train your Tacotron model. Yields the **logs-Tacotron** folder.
76 | - Step **(3)**: Synthesize/Evaluate the Tacotron model. Gives the **tacotron_output** folder.
77 | - Step **(4)**: Train your Wavenet model. Yields the **logs-Wavenet** folder.
78 | - Step **(5)**: Synthesize audio using the Wavenet model. Gives the **wavenet_output** folder.
79 |
80 | - Note: Steps 2, 3, and 4 can be done in a single run for both Tacotron and WaveNet (Tacotron-2, step ( * )).
81 |
82 |
83 | Note:
84 | - **Our preprocessing only supports Ljspeech and Ljspeech-like datasets (M-AILABS speech data)!** If running on datasets stored differently, you will probably need to make your own preprocessing script.
85 | - In the previous tree, files **were not represented** and **max depth was set to 3** for simplicity.
86 | - If you run training of both **models at the same time**, the repository structure will be different.
87 |
88 | # Pretrained model and Samples:
89 | Pre-trained models and audio samples will be added at a later date. You can however check some preliminary insights into the model's performance (at early stages of training) [here](https://github.com/Rayhane-mamah/Tacotron-2/issues/4#issuecomment-378741465). THIS IS VERY OUTDATED, I WILL UPDATE THIS SOON
90 |
91 | # Model Architecture:
92 |
93 |
94 |
95 |
96 | The model described by the authors can be divided into two parts:
97 | - Spectrogram prediction network
98 | - Wavenet vocoder
99 |
100 | To have an in-depth exploration of the model architecture, training procedure and preprocessing logic, refer to [our wiki](https://github.com/Rayhane-mamah/Tacotron-2/wiki)
101 |
102 | # Current state:
103 |
104 | To have an overview of our progress on this project, please refer to [this discussion](https://github.com/Rayhane-mamah/Tacotron-2/issues/4)
105 |
106 | Since the two parts of the global model are trained separately, we can start by training the feature prediction model and use its predictions later during the wavenet training.
107 |
108 | # How to start
109 | - **Machine Setup:**
110 |
111 | First, you need to have python 3 installed along with [Tensorflow](https://www.tensorflow.org/install/).
112 |
113 | Next, you need to install some Linux dependencies to ensure audio libraries work properly:
114 |
115 | > apt-get install -y libasound-dev portaudio19-dev libportaudio2 libportaudiocpp0 ffmpeg libav-tools
116 |
117 | Finally, you can install the requirements. If you are an Anaconda user: (else replace **pip** with **pip3** and **python** with **python3**)
118 |
119 | > pip install -r requirements.txt
120 |
121 | - **Docker:**
122 |
123 | Alternatively, one can build the **docker image** to ensure everything is set up automatically and use the project inside docker containers.
124 | **The Dockerfile is inside the "docker" folder.**
125 |
126 | The docker image can be built with:
127 |
128 | > docker build -t tacotron-2_image docker/
129 |
130 | Then containers are runnable with:
131 |
132 | > docker run -i --name new_container tacotron-2_image
133 |
134 | Please report any issues with the Docker usage with our models, I'll get to it. Thanks!
135 |
136 | # Dataset:
137 | We tested the code above on the [ljspeech dataset](https://keithito.com/LJ-Speech-Dataset/), which has almost 24 hours of labeled recordings from a single female speaker. (Further information on the dataset is available in the README file included with the download.)
138 |
139 | We are also running current tests on the [new M-AILABS speech dataset](http://www.m-ailabs.bayern/en/the-mailabs-speech-dataset/) which contains more than 700h of speech (more than 80 Gb of data) for more than 10 languages.
140 |
141 | After **downloading** the dataset, **extract** the compressed file, and **place the folder inside the cloned repository.**
142 |
143 | # Hparams setting:
144 | Before proceeding, you must pick the hyperparameters that best suit your needs. While it is possible to change the hyperparameters from the command line during preprocessing/training, I still recommend making the changes once and for all directly in the **hparams.py** file.
145 |
146 | To pick optimal FFT parameters, I have made a **griffin_lim_synthesis_tool** notebook that you can use to invert real extracted mel/linear spectrograms and check how good your preprocessing is. All other options are well explained in **hparams.py** and have meaningful names so that you can try multiple things with them.
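
Hyperparameter overrides passed via **--hparams** are simply forwarded to `hparams.parse` (see preprocess.py / synthesize.py). A minimal sketch of how an override string is applied; `sample_rate` and `griffin_lim_iters` appear in the audio code, but check hparams.py for the full list of names:

```python
# Equivalent to passing --hparams='sample_rate=22050,griffin_lim_iters=100' on the command line
from hparams import hparams

# parse() applies the comma-separated name=value overrides and returns the HParams object
modified_hp = hparams.parse('sample_rate=22050,griffin_lim_iters=100')
print(modified_hp.sample_rate)  # parameters not listed in the override string keep their hparams.py defaults
```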
147 |
148 | AWAIT DOCUMENTATION ON HPARAMS SHORTLY!!
149 |
150 | # Preprocessing
151 | Before running the following steps, please make sure you are inside **Tacotron-2 folder**
152 |
153 | > cd Tacotron-2
154 |
155 | Preprocessing can then be started using:
156 |
157 | > python preprocess.py
158 |
159 | The dataset can be chosen using the **--dataset** argument. If using the M-AILABS dataset, you need to provide the **language, voice, reader, merge_books and book arguments** to match your needs. The default is **Ljspeech**.
160 |
161 | Example M-AILABS:
162 |
163 | > python preprocess.py --dataset='M-AILABS' --language='en_US' --voice='female' --reader='mary_ann' --merge_books=False --book='northandsouth'
164 |
165 | or if you want to use all books for a single speaker:
166 |
167 | > python preprocess.py --dataset='M-AILABS' --language='en_US' --voice='female' --reader='mary_ann' --merge_books=True
168 |
169 | This should take no longer than a **few minutes.**
170 |
171 | # Training:
172 | To **train both models** sequentially (one after the other):
173 |
174 | > python train.py --model='Tacotron-2'
175 |
176 |
177 | The feature prediction model can be **trained separately** using:
178 |
179 | > python train.py --model='Tacotron'
180 |
181 | Checkpoints will be made every **5000 steps** and stored under the **logs-Tacotron folder.**
182 |
183 | Naturally, **training the wavenet separately** is done by:
184 |
185 | > python train.py --model='WaveNet'
186 |
187 | logs will be stored inside **logs-Wavenet**.
188 |
189 | **Note:**
190 | - If model argument is not provided, training will default to Tacotron-2 model training. (both models)
191 | - Please refer to train arguments under [train.py](https://github.com/Rayhane-mamah/Tacotron-2/blob/master/train.py) for a set of options you can use.
192 | - It is now possible to run wavenet preprocessing alone using **wavenet_preprocess.py**.
193 |
194 | # Synthesis
195 | To **synthesize audio** in an **End-to-End** (text to audio) manner (both models at work):
196 |
197 | > python synthesize.py --model='Tacotron-2'
198 |
199 | For the spectrogram prediction network (separately), there are **three types** of mel spectrograms synthesis:
200 |
201 | - **Evaluation** (synthesis on custom sentences). This is what we'll usually use after having a full end-to-end model.
202 |
203 | > python synthesize.py --model='Tacotron'
204 |
205 | - **Natural synthesis** (let the model make predictions alone by feeding the last decoder output to the next time step).
206 |
207 | > python synthesize.py --model='Tacotron' --mode='synthesis' --GTA=False
208 |
209 |
210 | - **Ground Truth Aligned synthesis** (DEFAULT: the model is assisted by true labels in a teacher forcing manner). This synthesis method is used when predicting mel spectrograms used to train the wavenet vocoder. (yields better results as stated in the paper)
211 |
212 | > python synthesize.py --model='Tacotron' --mode='synthesis' --GTA=True
213 |
214 | Synthesizing the **waveforms** conditioned on previously synthesized Mel-spectrograms (separately) can be done with:
215 |
216 | > python synthesize.py --model='WaveNet'
217 |
218 | **Note:**
219 | - If model argument is not provided, synthesis will default to Tacotron-2 model synthesis. (End-to-End TTS)
220 | - Please refer to synthesis arguments under [synthesize.py](https://github.com/Rayhane-mamah/Tacotron-2/blob/master/synthesize.py) for a set of options you can use.
221 |
222 |
223 | # References and Resources:
224 | - [Natural TTS synthesis by conditioning Wavenet on Mel spectrogram predictions](https://arxiv.org/pdf/1712.05884.pdf)
225 | - [Original tacotron paper](https://arxiv.org/pdf/1703.10135.pdf)
226 | - [Attention-Based Models for Speech Recognition](https://arxiv.org/pdf/1506.07503.pdf)
227 | - [Wavenet: A generative model for raw audio](https://arxiv.org/pdf/1609.03499.pdf)
228 | - [Fast Wavenet](https://arxiv.org/pdf/1611.09482.pdf)
229 | - [r9y9/wavenet_vocoder](https://github.com/r9y9/wavenet_vocoder)
230 | - [keithito/tacotron](https://github.com/keithito/tacotron)
231 |
232 |
--------------------------------------------------------------------------------
/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | #
--------------------------------------------------------------------------------
/datasets/audio.py:
--------------------------------------------------------------------------------
1 | import librosa
2 | import librosa.filters
3 | import numpy as np
4 | import tensorflow as tf
5 | from scipy import signal
6 | from scipy.io import wavfile
7 |
8 |
9 | def load_wav(path, sr):
10 | return librosa.core.load(path, sr=sr)[0]
11 |
12 | def save_wav(wav, path, sr):
13 | wav *= 32767 / max(0.01, np.max(np.abs(wav)))
14 | #proposed by @dsmiller
15 | wavfile.write(path, sr, wav.astype(np.int16))
16 |
17 | def save_wavenet_wav(wav, path, sr, inv_preemphasize, k):
18 | # wav = inv_preemphasis(wav, k, inv_preemphasize)
19 | wav *= 32767 / max(0.01, np.max(np.abs(wav)))
20 | wavfile.write(path, sr, wav.astype(np.int16))
21 |
22 | def preemphasis(wav, k, preemphasize=True):
23 | if preemphasize:
24 | return signal.lfilter([1, -k], [1], wav)
25 | return wav
26 |
27 | def inv_preemphasis(wav, k, inv_preemphasize=True):
28 | if inv_preemphasize:
29 | return signal.lfilter([1], [1, -k], wav)
30 | return wav
31 |
32 | #From https://github.com/r9y9/wavenet_vocoder/blob/master/audio.py
33 | def start_and_end_indices(quantized, silence_threshold=2):
34 | for start in range(quantized.size):
35 | if abs(quantized[start] - 127) > silence_threshold:
36 | break
37 | for end in range(quantized.size - 1, 1, -1):
38 | if abs(quantized[end] - 127) > silence_threshold:
39 | break
40 |
41 | assert abs(quantized[start] - 127) > silence_threshold
42 | assert abs(quantized[end] - 127) > silence_threshold
43 |
44 | return start, end
45 |
46 | def trim_silence(wav, hparams):
47 | '''Trim leading and trailing silence
48 |
49 | Useful for M-AILABS dataset if we choose to trim the extra 0.5 silence at beginning and end.
50 | '''
51 | #Thanks @begeekmyfriend and @lautjy for pointing out the params contradiction. These params are separate and tunable per dataset.
52 | return librosa.effects.trim(wav, top_db= hparams.trim_top_db, frame_length=hparams.trim_fft_size, hop_length=hparams.trim_hop_size)[0]
53 |
54 | def get_hop_size(hparams):
55 | hop_size = hparams.hop_size
56 | if hop_size is None:
57 | assert hparams.frame_shift_ms is not None
58 | hop_size = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
59 | return hop_size
60 |
61 | def linearspectrogram(wav, hparams):
62 | # D = _stft(preemphasis(wav, hparams.preemphasis, hparams.preemphasize), hparams)
63 | D = _stft(wav, hparams)
64 | S = _amp_to_db(np.abs(D)**hparams.magnitude_power, hparams) - hparams.ref_level_db
65 |
66 | if hparams.signal_normalization:
67 | return _normalize(S, hparams)
68 | return S
69 |
70 | def melspectrogram(wav, hparams):
71 | # D = _stft(preemphasis(wav, hparams.preemphasis, hparams.preemphasize), hparams)
72 | D = _stft(wav, hparams)
73 | S = _amp_to_db(_linear_to_mel(np.abs(D)**hparams.magnitude_power, hparams), hparams) - hparams.ref_level_db
74 |
75 | if hparams.signal_normalization:
76 | return _normalize(S, hparams)
77 | return S
78 |
79 | def inv_linear_spectrogram(linear_spectrogram, hparams):
80 | '''Converts linear spectrogram to waveform using librosa'''
81 | if hparams.signal_normalization:
82 | D = _denormalize(linear_spectrogram, hparams)
83 | else:
84 | D = linear_spectrogram
85 |
86 | S = _db_to_amp(D + hparams.ref_level_db)**(1/hparams.magnitude_power) #Convert back to linear
87 |
88 | if hparams.use_lws:
89 | processor = _lws_processor(hparams)
90 | D = processor.run_lws(S.astype(np.float64).T ** hparams.power)
91 | y = processor.istft(D).astype(np.float32)
92 | return inv_preemphasis(y, hparams.preemphasis, hparams.preemphasize)
93 | else:
94 | return inv_preemphasis(_griffin_lim(S ** hparams.power, hparams), hparams.preemphasis, hparams.preemphasize)
95 |
96 |
97 | def inv_mel_spectrogram(mel_spectrogram, hparams):
98 | '''Converts mel spectrogram to waveform using librosa'''
99 | if hparams.signal_normalization:
100 | D = _denormalize(mel_spectrogram, hparams)
101 | else:
102 | D = mel_spectrogram
103 |
104 | S = _mel_to_linear(_db_to_amp(D + hparams.ref_level_db)**(1/hparams.magnitude_power), hparams) # Convert back to linear
105 |
106 | if hparams.use_lws:
107 | processor = _lws_processor(hparams)
108 | D = processor.run_lws(S.astype(np.float64).T ** hparams.power)
109 | y = processor.istft(D).astype(np.float32)
110 | return inv_preemphasis(y, hparams.preemphasis, hparams.preemphasize)
111 | else:
112 | return inv_preemphasis(_griffin_lim(S ** hparams.power, hparams), hparams.preemphasis, hparams.preemphasize)
113 |
114 | ###########################################################################################
115 | # tensorflow Griffin-Lim
116 | # Thanks to @begeekmyfriend: https://github.com/begeekmyfriend/Tacotron-2/blob/mandarin-new/datasets/audio.py
117 |
118 | def inv_linear_spectrogram_tensorflow(spectrogram, hparams):
119 | '''Builds computational graph to convert spectrogram to waveform using TensorFlow.
120 | Unlike inv_spectrogram, this does NOT invert the preemphasis. The caller should call
121 | inv_preemphasis on the output after running the graph.
122 | '''
123 | if hparams.signal_normalization:
124 | D = _denormalize_tensorflow(spectrogram, hparams)
125 | else:
126 | D = linear_spectrogram
127 |
128 | S = tf.pow(_db_to_amp_tensorflow(D + hparams.ref_level_db), (1/hparams.magnitude_power))
129 | return _griffin_lim_tensorflow(tf.pow(S, hparams.power), hparams)
130 |
131 | def inv_mel_spectrogram_tensorflow(mel_spectrogram, hparams):
132 | '''Builds computational graph to convert mel spectrogram to waveform using TensorFlow.
133 | Unlike inv_mel_spectrogram, this does NOT invert the preemphasis. The caller should call
134 | inv_preemphasis on the output after running the graph.
135 | '''
136 | if hparams.signal_normalization:
137 | D = _denormalize_tensorflow(mel_spectrogram, hparams)
138 | else:
139 | D = mel_spectrogram
140 |
141 | S = tf.pow(_db_to_amp_tensorflow(D + hparams.ref_level_db), (1/hparams.magnitude_power))
142 | S = _mel_to_linear_tensorflow(S, hparams) # Convert back to linear
143 | return _griffin_lim_tensorflow(tf.pow(S, hparams.power), hparams)
144 |
145 | ###########################################################################################
146 |
147 | def _lws_processor(hparams):
148 | import lws
149 | return lws.lws(hparams.n_fft, get_hop_size(hparams), fftsize=hparams.win_size, mode="speech")
150 |
151 | def _griffin_lim(S, hparams):
152 | '''librosa implementation of Griffin-Lim
153 | Based on https://github.com/librosa/librosa/issues/434
154 | '''
155 | angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
156 | S_complex = np.abs(S).astype(np.complex)
157 | y = _istft(S_complex * angles, hparams)
158 | for i in range(hparams.griffin_lim_iters):
159 | angles = np.exp(1j * np.angle(_stft(y, hparams)))
160 | y = _istft(S_complex * angles, hparams)
161 | return y
162 |
163 | def _griffin_lim_tensorflow(S, hparams):
164 | '''TensorFlow implementation of Griffin-Lim
165 | Based on https://github.com/Kyubyong/tensorflow-exercises/blob/master/Audio_Processing.ipynb
166 | '''
167 | with tf.variable_scope('griffinlim'):
168 | # TensorFlow's stft and istft operate on a batch of spectrograms; create batch of size 1
169 | S = tf.expand_dims(S, 0)
170 | S_complex = tf.identity(tf.cast(S, dtype=tf.complex64))
171 | y = tf.contrib.signal.inverse_stft(S_complex, hparams.win_size, get_hop_size(hparams), hparams.n_fft)
172 | for i in range(hparams.griffin_lim_iters):
173 | est = tf.contrib.signal.stft(y, hparams.win_size, get_hop_size(hparams), hparams.n_fft)
174 | angles = est / tf.cast(tf.maximum(1e-8, tf.abs(est)), tf.complex64)
175 | y = tf.contrib.signal.inverse_stft(S_complex * angles, hparams.win_size, get_hop_size(hparams), hparams.n_fft)
176 | return tf.squeeze(y, 0)
177 |
178 | def _stft(y, hparams):
179 | if hparams.use_lws:
180 | return _lws_processor(hparams).stft(y).T
181 | else:
182 | return librosa.stft(y=y, n_fft=hparams.n_fft, hop_length=get_hop_size(hparams), win_length=hparams.win_size, pad_mode='constant')
183 |
184 | def _istft(y, hparams):
185 | return librosa.istft(y, hop_length=get_hop_size(hparams), win_length=hparams.win_size)
186 |
187 | ##########################################################
188 | #Those are only correct when using lws!!! (This was messing with Wavenet quality for a long time!)
189 | def num_frames(length, fsize, fshift):
190 | """Compute number of time frames of spectrogram
191 | """
192 | pad = (fsize - fshift)
193 | if length % fshift == 0:
194 | M = (length + pad * 2 - fsize) // fshift + 1
195 | else:
196 | M = (length + pad * 2 - fsize) // fshift + 2
197 | return M
198 |
199 |
200 | def pad_lr(x, fsize, fshift):
201 | """Compute left and right padding
202 | """
203 | M = num_frames(len(x), fsize, fshift)
204 | pad = (fsize - fshift)
205 | T = len(x) + 2 * pad
206 | r = (M - 1) * fshift + fsize - T
207 | return pad, pad + r
208 | ##########################################################
209 | #Librosa correct padding
210 | def librosa_pad_lr(x, fsize, fshift, pad_sides=1):
211 | '''compute right padding (final frame) or both sides padding (first and final frames)
212 | '''
213 | assert pad_sides in (1, 2)
214 | # return int(fsize // 2)
215 | pad = (x.shape[0] // fshift + 1) * fshift - x.shape[0]
216 | if pad_sides == 1:
217 | return 0, pad
218 | else:
219 | return pad // 2, pad // 2 + pad % 2
220 |
221 | # Conversions
222 | _mel_basis = None
223 | _inv_mel_basis = None
224 |
225 | def _linear_to_mel(spectogram, hparams):
226 | global _mel_basis
227 | if _mel_basis is None:
228 | _mel_basis = _build_mel_basis(hparams)
229 | return np.dot(_mel_basis, spectogram)
230 |
231 | def _mel_to_linear(mel_spectrogram, hparams):
232 | global _inv_mel_basis
233 | if _inv_mel_basis is None:
234 | _inv_mel_basis = np.linalg.pinv(_build_mel_basis(hparams))
235 | return np.maximum(1e-10, np.dot(_inv_mel_basis, mel_spectrogram))
236 |
237 | def _mel_to_linear_tensorflow(mel_spectrogram, hparams):
238 | global _inv_mel_basis
239 | if _inv_mel_basis is None:
240 | _inv_mel_basis = np.linalg.pinv(_build_mel_basis(hparams))
241 | return tf.transpose(tf.maximum(1e-10, tf.matmul(tf.cast(_inv_mel_basis, tf.float32), tf.transpose(mel_spectrogram, [1, 0]))), [1, 0])
242 |
243 | def _build_mel_basis(hparams):
244 | assert hparams.fmax <= hparams.sample_rate // 2
245 | return librosa.filters.mel(hparams.sample_rate, hparams.n_fft, n_mels=hparams.num_mels,
246 | fmin=hparams.fmin, fmax=hparams.fmax)
247 |
248 | def _amp_to_db(x, hparams):
249 | min_level = np.exp(hparams.min_level_db / 20 * np.log(10))
250 | return 20 * np.log10(np.maximum(min_level, x))
251 |
252 | def _db_to_amp(x):
253 | return np.power(10.0, (x) * 0.05)
254 |
255 | def _db_to_amp_tensorflow(x):
256 | return tf.pow(tf.ones(tf.shape(x)) * 10.0, x * 0.05)
257 |
258 | def _normalize(S, hparams):
259 | if hparams.allow_clipping_in_normalization:
260 | if hparams.symmetric_mels:
261 | return np.clip((2 * hparams.max_abs_value) * ((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value,
262 | -hparams.max_abs_value, hparams.max_abs_value)
263 | else:
264 | return np.clip(hparams.max_abs_value * ((S - hparams.min_level_db) / (-hparams.min_level_db)), 0, hparams.max_abs_value)
265 |
266 | assert S.max() <= 0 and S.min() - hparams.min_level_db >= 0
267 | if hparams.symmetric_mels:
268 | return (2 * hparams.max_abs_value) * ((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value
269 | else:
270 | return hparams.max_abs_value * ((S - hparams.min_level_db) / (-hparams.min_level_db))
271 |
272 | def _denormalize(D, hparams):
273 | if hparams.allow_clipping_in_normalization:
274 | if hparams.symmetric_mels:
275 | return (((np.clip(D, -hparams.max_abs_value,
276 | hparams.max_abs_value) + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value))
277 | + hparams.min_level_db)
278 | else:
279 | return ((np.clip(D, 0, hparams.max_abs_value) * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db)
280 |
281 | if hparams.symmetric_mels:
282 | return (((D + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value)) + hparams.min_level_db)
283 | else:
284 | return ((D * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db)
285 |
286 | def _denormalize_tensorflow(D, hparams):
287 | if hparams.allow_clipping_in_normalization:
288 | if hparams.symmetric_mels:
289 | return (((tf.clip_by_value(D, -hparams.max_abs_value,
290 | hparams.max_abs_value) + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value))
291 | + hparams.min_level_db)
292 | else:
293 | return ((tf.clip_by_value(D, 0, hparams.max_abs_value) * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db)
294 |
295 | if hparams.symmetric_mels:
296 | return (((D + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value)) + hparams.min_level_db)
297 | else:
298 | return ((D * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db)
299 |
--------------------------------------------------------------------------------
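
A minimal usage sketch of the audio helpers above (the same round trip the griffin_lim_synthesis_tool notebook performs); `some_utterance.wav` is a placeholder path and `hparams` comes from the repository's hparams.py:

```python
from hparams import hparams
from datasets import audio

# Load and analyze a wav at the configured sample rate
wav = audio.load_wav('some_utterance.wav', sr=hparams.sample_rate)
mel = audio.melspectrogram(wav, hparams)  # shape: (num_mels, mel_frames), normalized dB scale

# Invert the mel spectrogram back to a waveform with Griffin-Lim and save it
reconstructed = audio.inv_mel_spectrogram(mel, hparams)
audio.save_wav(reconstructed, 'reconstructed.wav', sr=hparams.sample_rate)
```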
/datasets/preprocessor.py:
--------------------------------------------------------------------------------
1 | import os
2 | from concurrent.futures import ProcessPoolExecutor
3 | from functools import partial
4 |
5 | import numpy as np
6 | from datasets import audio
7 | from wavenet_vocoder.util import is_mulaw, is_mulaw_quantize, mulaw, mulaw_quantize
8 |
9 |
10 | def build_from_path(hparams, input_dirs, mel_dir, linear_dir, wav_dir, n_jobs=12, tqdm=lambda x: x):
11 | """
12 | Preprocesses the speech dataset from a given input path to given output directories
13 |
14 | Args:
15 | - hparams: hyper parameters
16 | - input_dirs: input directories that contain the files to preprocess
17 | - mel_dir: output directory of the preprocessed speech mel-spectrogram dataset
18 | - linear_dir: output directory of the preprocessed speech linear-spectrogram dataset
19 | - wav_dir: output directory of the preprocessed speech audio dataset
20 | - n_jobs: Optional, number of worker process to parallelize across
21 | - tqdm: Optional, provides a nice progress bar
22 |
23 | Returns:
24 | - A list of tuples describing the training examples. This should be written to train.txt
25 | """
26 |
27 | # We use ProcessPoolExecutor to parallelize across processes, this is just for
28 | # optimization purposes and it can be omitted
29 | executor = ProcessPoolExecutor(max_workers=n_jobs)
30 | futures = []
31 | index = 1
32 | for input_dir in input_dirs:
33 | with open(os.path.join(input_dir, 'metadata.csv'), encoding='utf-8') as f:
34 | for line in f:
35 | parts = line.strip().split('|')
36 | basename = parts[0]
37 | wav_path = os.path.join(input_dir, 'wavs', '{}.wav'.format(basename))
38 | text = parts[2]
39 | futures.append(executor.submit(partial(_process_utterance, mel_dir, linear_dir, wav_dir, basename, wav_path, text, hparams)))
40 | index += 1
41 |
42 | return [future.result() for future in tqdm(futures) if future.result() is not None]
43 |
44 |
45 | def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text, hparams):
46 | """
47 | Preprocesses a single utterance wav/text pair
48 |
49 | This writes the mel scale spectrogram to disk and returns a tuple to write
50 | to the train.txt file
51 |
52 | Args:
53 | - mel_dir: the directory to write the mel spectrograms into
54 | - linear_dir: the directory to write the linear spectrograms into
55 | - wav_dir: the directory to write the preprocessed wav into
56 | - index: the numeric index to use in the spectrogram filename
57 | - wav_path: path to the audio file containing the speech input
58 | - text: text spoken in the input audio file
59 | - hparams: hyper parameters
60 |
61 | Returns:
62 | - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text)
63 | """
64 | try:
65 | # Load the audio as numpy array
66 | wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
67 | except FileNotFoundError: #catch missing wav exception
68 | print('file {} present in csv metadata is not present in wav folder. skipping!'.format(
69 | wav_path))
70 | return None
71 |
72 | #Trim lead/trail silences
73 | if hparams.trim_silence:
74 | wav = audio.trim_silence(wav, hparams)
75 |
76 | #Pre-emphasize
77 | preem_wav = audio.preemphasis(wav, hparams.preemphasis, hparams.preemphasize)
78 |
79 | #rescale wav
80 | if hparams.rescale:
81 | wav = wav / np.abs(wav).max() * hparams.rescaling_max
82 | preem_wav = preem_wav / np.abs(preem_wav).max() * hparams.rescaling_max
83 |
84 | #Assert all audio is in [-1, 1]
85 | if (wav > 1.).any() or (wav < -1.).any():
86 | raise RuntimeError('wav has invalid value: {}'.format(wav_path))
87 | if (preem_wav > 1.).any() or (preem_wav < -1.).any():
88 | raise RuntimeError('wav has invalid value: {}'.format(wav_path))
89 |
90 | #Mu-law quantize
91 | if is_mulaw_quantize(hparams.input_type):
92 | #[0, quantize_channels)
93 | out = mulaw_quantize(wav, hparams.quantize_channels)
94 |
95 | #Trim silences
96 | start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
97 | wav = wav[start: end]
98 | preem_wav = preem_wav[start: end]
99 | out = out[start: end]
100 |
101 | constant_values = mulaw_quantize(0, hparams.quantize_channels)
102 | out_dtype = np.int16
103 |
104 | elif is_mulaw(hparams.input_type):
105 | #[-1, 1]
106 | out = mulaw(wav, hparams.quantize_channels)
107 | constant_values = mulaw(0., hparams.quantize_channels)
108 | out_dtype = np.float32
109 |
110 | else:
111 | #[-1, 1]
112 | out = wav
113 | constant_values = 0.
114 | out_dtype = np.float32
115 |
116 | # Compute the mel scale spectrogram from the wav
117 | mel_spectrogram = audio.melspectrogram(preem_wav, hparams).astype(np.float32)
118 | mel_frames = mel_spectrogram.shape[1]
119 |
120 | if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
121 | return None
122 |
123 | #Compute the linear scale spectrogram from the wav
124 | linear_spectrogram = audio.linearspectrogram(preem_wav, hparams).astype(np.float32)
125 | linear_frames = linear_spectrogram.shape[1]
126 |
127 | #sanity check
128 | assert linear_frames == mel_frames
129 |
130 | if hparams.use_lws:
131 | #Ensure time resolution adjustment between audio and mel-spectrogram
132 | fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
133 | l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))
134 |
135 | #Zero pad audio signal
136 | out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
137 | else:
138 | #Ensure time resolution adjustment between audio and mel-spectrogram
139 | l_pad, r_pad = audio.librosa_pad_lr(wav, hparams.n_fft, audio.get_hop_size(hparams), hparams.wavenet_pad_sides)
140 |
141 | #Pad audio signal on the right (to match librosa's framing and avoid frame inconsistency)
142 | out = np.pad(out, (l_pad, r_pad), mode='constant', constant_values=constant_values)
143 |
144 | assert len(out) >= mel_frames * audio.get_hop_size(hparams)
145 |
146 | #time resolution adjustment
147 | #ensure length of raw audio is multiple of hop size so that we can use
148 | #transposed convolution to upsample
149 | out = out[:mel_frames * audio.get_hop_size(hparams)]
150 | assert len(out) % audio.get_hop_size(hparams) == 0
151 | time_steps = len(out)
152 |
153 | # Write the spectrogram and audio to disk
154 | audio_filename = 'audio-{}.npy'.format(index)
155 | mel_filename = 'mel-{}.npy'.format(index)
156 | linear_filename = 'linear-{}.npy'.format(index)
157 | np.save(os.path.join(wav_dir, audio_filename), out.astype(out_dtype), allow_pickle=False)
158 | np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
159 | np.save(os.path.join(linear_dir, linear_filename), linear_spectrogram.T, allow_pickle=False)
160 |
161 | # Return a tuple describing this training example
162 | return (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text)
163 |
--------------------------------------------------------------------------------
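
For reference, each tuple returned by `_process_utterance` above becomes one pipe-separated line of train.txt (see `write_metadata` in preprocess.py). A small sketch with made-up values:

```python
# One training example as returned by _process_utterance:
# (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text)
example = ('audio-LJ001-0001.npy', 'mel-LJ001-0001.npy', 'linear-LJ001-0001.npy',
           112640, 440, 'An example transcript.')

# write_metadata joins the fields with '|' and writes one line per utterance
line = '|'.join(str(x) for x in example)
# -> audio-LJ001-0001.npy|mel-LJ001-0001.npy|linear-LJ001-0001.npy|112640|440|An example transcript.

# Reading it back: time_steps (index 3) and mel_frames (index 4) are what preprocess.py sums to report dataset size
fields = line.split('|')
time_steps, mel_frames, text = int(fields[3]), int(fields[4]), fields[5]
```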
/datasets/wavenet_preprocessor.py:
--------------------------------------------------------------------------------
1 | import os
2 | from concurrent.futures import ProcessPoolExecutor
3 | from functools import partial
4 |
5 | import numpy as np
6 | from datasets import audio
7 | from wavenet_vocoder.util import is_mulaw, is_mulaw_quantize, mulaw, mulaw_quantize
8 |
9 |
10 | def build_from_path(hparams, input_dir, mel_dir, wav_dir, n_jobs=12, tqdm=lambda x: x):
11 | """
12 | Preprocesses the speech dataset from a given input path to given output directories
13 |
14 | Args:
15 | - hparams: hyper parameters
16 | - input_dir: input directory that contains the files to preprocess
17 | - mel_dir: output directory of the preprocessed speech mel-spectrogram dataset
18 | - linear_dir: output directory of the preprocessed speech linear-spectrogram dataset
19 | - wav_dir: output directory of the preprocessed speech audio dataset
20 | - n_jobs: Optional, number of worker process to parallelize across
21 | - tqdm: Optional, provides a nice progress bar
22 |
23 | Returns:
24 | - A list of tuple describing the train examples. this should be written to train.txt
25 | """
26 |
27 | # We use ProcessPoolExecutor to parallelize across processes, this is just for
28 | # optimization purposes and it can be omitted
29 | executor = ProcessPoolExecutor(max_workers=n_jobs)
30 | futures = []
31 | for file in os.listdir(input_dir):
32 | wav_path = os.path.join(input_dir, file)
33 | basename = os.path.basename(wav_path).replace('.wav', '')
34 | futures.append(executor.submit(partial(_process_utterance, mel_dir, wav_dir, basename, wav_path, hparams)))
35 |
36 | return [future.result() for future in tqdm(futures) if future.result() is not None]
37 |
38 |
39 | def _process_utterance(mel_dir, wav_dir, index, wav_path, hparams):
40 | """
41 | Preprocesses a single utterance wav file
42 |
43 | This writes the mel scale spectrogram to disk and returns a tuple to write
44 | to the train.txt file
45 |
46 | Args:
47 | - mel_dir: the directory to write the mel spectrograms into
48 | - linear_dir: the directory to write the linear spectrograms into
49 | - wav_dir: the directory to write the preprocessed wav into
50 | - index: the numeric index to use in the spectrogram filename
51 | - wav_path: path to the audio file containing the speech input
52 | - text: text spoken in the input audio file
53 | - hparams: hyper parameters
54 |
55 | Returns:
56 | - A tuple: (audio_filename, mel_filename, mel_filename, speaker_id, time_steps, mel_frames)
57 | """
58 | try:
59 | # Load the audio as numpy array
60 | wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
61 | except FileNotFoundError: #catch missing wav exception
62 | print('file {} present in csv metadata is not present in wav folder. skipping!'.format(
63 | wav_path))
64 | return None
65 |
66 | #M-AILABS extra silence specific
67 | if hparams.trim_silence:
68 | wav = audio.trim_silence(wav, hparams)
69 |
70 | #Pre-emphasize
71 | preem_wav = audio.preemphasis(wav, hparams.preemphasis, hparams.preemphasize)
72 |
73 | #rescale wav
74 | if hparams.rescale:
75 | wav = wav / np.abs(wav).max() * hparams.rescaling_max
76 | preem_wav = preem_wav / np.abs(preem_wav).max() * hparams.rescaling_max
77 |
78 | #Assert all audio is in [-1, 1]
79 | if (wav > 1.).any() or (wav < -1.).any():
80 | raise RuntimeError('wav has invalid value: {}'.format(wav_path))
81 | if (preem_wav > 1.).any() or (preem_wav < -1.).any():
82 | raise RuntimeError('wav has invalid value: {}'.format(wav_path))
83 |
84 | #Mu-law quantize
85 | if is_mulaw_quantize(hparams.input_type):
86 | #[0, quantize_channels)
87 | out = mulaw_quantize(wav, hparams.quantize_channels)
88 |
89 | #Trim silences
90 | start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
91 | wav = wav[start: end]
92 | preem_wav = preem_wav[start: end]
93 | out = out[start: end]
94 |
95 | constant_values = mulaw_quantize(0, hparams.quantize_channels)
96 | out_dtype = np.int16
97 |
98 | elif is_mulaw(hparams.input_type):
99 | #[-1, 1]
100 | out = mulaw(wav, hparams.quantize_channels)
101 | constant_values = mulaw(0., hparams.quantize_channels)
102 | out_dtype = np.float32
103 |
104 | else:
105 | #[-1, 1]
106 | out = wav
107 | constant_values = 0.
108 | out_dtype = np.float32
109 |
110 | # Compute the mel scale spectrogram from the wav
111 | mel_spectrogram = audio.melspectrogram(preem_wav, hparams).astype(np.float32)
112 | mel_frames = mel_spectrogram.shape[1]
113 |
114 | if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
115 | return None
116 |
117 | if hparams.use_lws:
118 | #Ensure time resolution adjustment between audio and mel-spectrogram
119 | fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
120 | l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))
121 |
122 | #Zero pad audio signal
123 | out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
124 | else:
125 | #Ensure time resolution adjustment between audio and mel-spectrogram
126 | l_pad, r_pad = audio.librosa_pad_lr(wav, hparams.n_fft, audio.get_hop_size(hparams))
127 |
128 | #Pad audio signal (to match librosa's framing and avoid frame inconsistency)
129 | out = np.pad(out, (l_pad, r_pad), mode='constant', constant_values=constant_values)
130 |
131 | assert len(out) >= mel_frames * audio.get_hop_size(hparams)
132 |
133 | #time resolution adjustment
134 | #ensure length of raw audio is multiple of hop size so that we can use
135 | #transposed convolution to upsample
136 | out = out[:mel_frames * audio.get_hop_size(hparams)]
137 | assert len(out) % audio.get_hop_size(hparams) == 0
138 | time_steps = len(out)
139 |
140 | # Write the spectrogram and audio to disk
141 | audio_filename = os.path.join(wav_dir, 'audio-{}.npy'.format(index))
142 | mel_filename = os.path.join(mel_dir, 'mel-{}.npy'.format(index))
143 | np.save(audio_filename, out.astype(out_dtype), allow_pickle=False)
144 | np.save(mel_filename, mel_spectrogram.T, allow_pickle=False)
145 |
146 | #global condition features
147 | if hparams.gin_channels > 0:
148 | raise RuntimeError('When activating global conditions, please set your speaker_id rules in line 129 of datasets/wavenet_preprocessor.py to use them during training')
149 | speaker_id = '' #put the rule to determine how to assign speaker ids (using file names maybe? file basenames are available in "index" variable)
150 | else:
151 | speaker_id = ''
152 |
153 | # Return a tuple describing this training example
154 | return (audio_filename, mel_filename, mel_filename, speaker_id, time_steps, mel_frames)
155 |
--------------------------------------------------------------------------------
/docker/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM continuumio/anaconda3:latest
2 | FROM tensorflow/tensorflow:latest-gpu-py3
3 |
4 | RUN apt-get update
5 | RUN apt-get install -y libasound-dev portaudio19-dev libportaudio2 libportaudiocpp0 ffmpeg libav-tools wget git vim
6 |
7 | RUN wget http://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
8 | RUN tar -jxvf LJSpeech-1.1.tar.bz2
9 |
10 | RUN git clone https://github.com/Rayhane-mamah/Tacotron-2.git
11 |
12 | WORKDIR Tacotron-2
13 | RUN ln -s ../LJSpeech-1.1 .
14 | RUN pip install -r requirements.txt
--------------------------------------------------------------------------------
/griffin_lim_synthesis_tool.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "scrolled": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import numpy as np\n",
12 | "from datasets.audio import *\n",
13 | "import os\n",
14 | "from hparams import hparams\n",
15 | "\n",
16 | "n_sample = 0 #Change n_steps here\n",
17 | "mel_folder = 'logs-Tacotron/mel-spectrograms' #Or change file path\n",
18 | "mel_file = 'mel-prediction-step-{}.npy'.format(n_sample) #Or file name (for other generated mels)\n",
19 | "out_dir = 'wav_out'\n",
20 | "\n",
21 | "os.makedirs(out_dir, exist_ok=True)\n",
22 | "\n",
23 | "#mel_file = os.path.join(mel_folder, mel_file)\n",
24 | "mel_file = 'training_data/mels/mel-LJ001-0001.npy'\n",
25 | "mel_spectro = np.load(mel_file)\n",
26 | "mel_spectro.shape"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": null,
32 | "metadata": {},
33 | "outputs": [],
34 | "source": [
35 | "wav = inv_mel_spectrogram(mel_spectro.T, hparams) \n",
36 | "#save the wav under test__\n",
37 | "save_wav(wav, os.path.join(out_dir, 'test_mel_{}.wav'.format(mel_file.replace('/', '_').replace('\\\\', '_').replace('.npy', ''))),\n",
38 | " sr=hparams.sample_rate)"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": null,
44 | "metadata": {},
45 | "outputs": [],
46 | "source": [
47 | "from tacotron.utils.plot import *\n",
48 | "\n",
49 | "plot_spectrogram(mel_spectro, path=os.path.join(out_dir, 'test_mel_{}.png'.format(mel_file.replace('/', '_').replace('\\\\', '_').replace('.npy', ''))))"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": null,
55 | "metadata": {},
56 | "outputs": [],
57 | "source": [
58 | "lin_file = 'training_data/linear/linear-LJ001-0001.npy'\n",
59 | "lin_spectro = np.load(lin_file)\n",
60 | "lin_spectro.shape"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": null,
66 | "metadata": {},
67 | "outputs": [],
68 | "source": [
69 | "wav = inv_linear_spectrogram(lin_spectro.T, hparams)\n",
70 | "save_wav(wav, os.path.join(out_dir, 'test_linear_{}.wav'.format(mel_file.replace('/', '_').replace('\\\\', '_').replace('.npy', ''))),\n",
71 | " sr=hparams.sample_rate)"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": null,
77 | "metadata": {},
78 | "outputs": [],
79 | "source": [
80 | "plot_spectrogram(lin_spectro, path=os.path.join(out_dir, 'test_linear_{}.png'.format(mel_file.replace('/', '_').replace('\\\\', '_').replace('.npy', ''))),\n",
81 | " auto_aspect=True)"
82 | ]
83 | }
84 | ],
85 | "metadata": {
86 | "kernelspec": {
87 | "display_name": "Python 3",
88 | "language": "python",
89 | "name": "python3"
90 | },
91 | "language_info": {
92 | "codemirror_mode": {
93 | "name": "ipython",
94 | "version": 3
95 | },
96 | "file_extension": ".py",
97 | "mimetype": "text/x-python",
98 | "name": "python",
99 | "nbconvert_exporter": "python",
100 | "pygments_lexer": "ipython3",
101 | "version": "3.6.4"
102 | }
103 | },
104 | "nbformat": 4,
105 | "nbformat_minor": 2
106 | }
107 |
--------------------------------------------------------------------------------
/infolog.py:
--------------------------------------------------------------------------------
1 | import atexit
2 | import json
3 | from datetime import datetime
4 | from threading import Thread
5 | from urllib.request import Request, urlopen
6 |
7 | _format = '%Y-%m-%d %H:%M:%S.%f'
8 | _file = None
9 | _run_name = None
10 | _slack_url = None
11 |
12 |
13 | def init(filename, run_name, slack_url=None):
14 | global _file, _run_name, _slack_url
15 | _close_logfile()
16 | _file = open(filename, 'a')
18 | _file.write('\n-----------------------------------------------------------------\n')
19 | _file.write('Starting new {} training run\n'.format(run_name))
20 | _file.write('-----------------------------------------------------------------\n')
21 | _run_name = run_name
22 | _slack_url = slack_url
23 |
24 |
25 | def log(msg, end='\n', slack=False):
26 | print(msg, end=end)
27 | if _file is not None:
28 | _file.write('[%s] %s\n' % (datetime.now().strftime(_format)[:-3], msg))
29 | if slack and _slack_url is not None:
30 | Thread(target=_send_slack, args=(msg,)).start()
31 |
32 |
33 | def _close_logfile():
34 | global _file
35 | if _file is not None:
36 | _file.close()
37 | _file = None
38 |
39 |
40 | def _send_slack(msg):
41 | req = Request(_slack_url)
42 | req.add_header('Content-Type', 'application/json')
43 | urlopen(req, json.dumps({
44 | 'username': 'tacotron',
45 | 'icon_emoji': ':taco:',
46 | 'text': '*%s*: %s' % (_run_name, msg)
47 | }).encode())
48 |
49 |
50 | atexit.register(_close_logfile)
51 |
--------------------------------------------------------------------------------
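
A minimal usage sketch of the logger above (the log file path and run name are illustrative):

```python
import infolog

infolog.init('logs-Tacotron/Terminal_train_log', run_name='Tacotron', slack_url=None)
infolog.log('Starting training run')             # printed to stdout and appended to the log file
infolog.log('Step 100: loss=1.234', slack=True)  # also posted to Slack when a webhook URL was provided
```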
/papers/(content+location) attention.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Rayhane-mamah/Tacotron-2/ab5cb08a931fc842d3892ebeb27c8b8734ddd4b8/papers/(content+location) attention.pdf
--------------------------------------------------------------------------------
/papers/ClariNet.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Rayhane-mamah/Tacotron-2/ab5cb08a931fc842d3892ebeb27c8b8734ddd4b8/papers/ClariNet.pdf
--------------------------------------------------------------------------------
/papers/Tacotron 2 revised.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Rayhane-mamah/Tacotron-2/ab5cb08a931fc842d3892ebeb27c8b8734ddd4b8/papers/Tacotron 2 revised.pdf
--------------------------------------------------------------------------------
/papers/bahdanau (content) attention.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Rayhane-mamah/Tacotron-2/ab5cb08a931fc842d3892ebeb27c8b8734ddd4b8/papers/bahdanau (content) attention.pdf
--------------------------------------------------------------------------------
/papers/deepvoice 3.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Rayhane-mamah/Tacotron-2/ab5cb08a931fc842d3892ebeb27c8b8734ddd4b8/papers/deepvoice 3.pdf
--------------------------------------------------------------------------------
/papers/effective approaches attention.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Rayhane-mamah/Tacotron-2/ab5cb08a931fc842d3892ebeb27c8b8734ddd4b8/papers/effective approaches attention.pdf
--------------------------------------------------------------------------------
/papers/fast_wavenet.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Rayhane-mamah/Tacotron-2/ab5cb08a931fc842d3892ebeb27c8b8734ddd4b8/papers/fast_wavenet.pdf
--------------------------------------------------------------------------------
/papers/tacotron.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Rayhane-mamah/Tacotron-2/ab5cb08a931fc842d3892ebeb27c8b8734ddd4b8/papers/tacotron.pdf
--------------------------------------------------------------------------------
/papers/tacotron2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Rayhane-mamah/Tacotron-2/ab5cb08a931fc842d3892ebeb27c8b8734ddd4b8/papers/tacotron2.pdf
--------------------------------------------------------------------------------
/papers/wavenet.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Rayhane-mamah/Tacotron-2/ab5cb08a931fc842d3892ebeb27c8b8734ddd4b8/papers/wavenet.pdf
--------------------------------------------------------------------------------
/preprocess.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | from multiprocessing import cpu_count
4 |
5 | from datasets import preprocessor
6 | from hparams import hparams
7 | from tqdm import tqdm
8 |
9 |
10 | def preprocess(args, input_folders, out_dir, hparams):
11 | mel_dir = os.path.join(out_dir, 'mels')
12 | wav_dir = os.path.join(out_dir, 'audio')
13 | linear_dir = os.path.join(out_dir, 'linear')
14 | os.makedirs(mel_dir, exist_ok=True)
15 | os.makedirs(wav_dir, exist_ok=True)
16 | os.makedirs(linear_dir, exist_ok=True)
17 | metadata = preprocessor.build_from_path(hparams, input_folders, mel_dir, linear_dir, wav_dir, args.n_jobs, tqdm=tqdm)
18 | write_metadata(metadata, out_dir)
19 |
20 | def write_metadata(metadata, out_dir):
21 | with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f:
22 | for m in metadata:
23 | f.write('|'.join([str(x) for x in m]) + '\n')
24 | mel_frames = sum([int(m[4]) for m in metadata])
25 | timesteps = sum([int(m[3]) for m in metadata])
26 | sr = hparams.sample_rate
27 | hours = timesteps / sr / 3600
28 | print('Write {} utterances, {} mel frames, {} audio timesteps, ({:.2f} hours)'.format(
29 | len(metadata), mel_frames, timesteps, hours))
30 | print('Max input length (text chars): {}'.format(max(len(m[5]) for m in metadata)))
31 | print('Max mel frames length: {}'.format(max(int(m[4]) for m in metadata)))
32 | print('Max audio timesteps length: {}'.format(max(m[3] for m in metadata)))
33 |
34 | def norm_data(args):
35 |
36 | merge_books = (args.merge_books=='True')
37 |
38 | print('Selecting data folders..')
39 | supported_datasets = ['LJSpeech-1.0', 'LJSpeech-1.1', 'M-AILABS']
40 | if args.dataset not in supported_datasets:
41 | raise ValueError('dataset value entered {} does not belong to supported datasets: {}'.format(
42 | args.dataset, supported_datasets))
43 |
44 | if args.dataset.startswith('LJSpeech'):
45 | return [os.path.join(args.base_dir, args.dataset)]
46 |
47 |
48 | if args.dataset == 'M-AILABS':
49 | supported_languages = ['en_US', 'en_UK', 'fr_FR', 'it_IT', 'de_DE', 'es_ES', 'ru_RU',
50 | 'uk_UK', 'pl_PL', 'nl_NL', 'pt_PT', 'fi_FI', 'se_SE', 'tr_TR', 'ar_SA']
51 | if args.language not in supported_languages:
52 | raise ValueError('Please enter a supported language to use from M-AILABS dataset! \n{}'.format(
53 | supported_languages))
54 |
55 | supported_voices = ['female', 'male', 'mix']
56 | if args.voice not in supported_voices:
57 | raise ValueError('Please enter a supported voice option to use from M-AILABS dataset! \n{}'.format(
58 | supported_voices))
59 |
60 | path = os.path.join(args.base_dir, args.language, 'by_book', args.voice)
61 | supported_readers = [e for e in os.listdir(path) if os.path.isdir(os.path.join(path,e))]
62 | if args.reader not in supported_readers:
63 | raise ValueError('Please enter a valid reader for your language and voice settings! \n{}'.format(
64 | supported_readers))
65 |
66 | path = os.path.join(path, args.reader)
67 | supported_books = [e for e in os.listdir(path) if os.path.isdir(os.path.join(path,e))]
68 | if merge_books:
69 | return [os.path.join(path, book) for book in supported_books]
70 |
71 | else:
72 | if args.book not in supported_books:
73 | raise ValueError('Please enter a valid book for your reader settings! \n{}'.format(
74 | supported_books))
75 |
76 | return [os.path.join(path, args.book)]
77 |
78 |
79 | def run_preprocess(args, hparams):
80 | input_folders = norm_data(args)
81 | output_folder = os.path.join(args.base_dir, args.output)
82 |
83 | preprocess(args, input_folders, output_folder, hparams)
84 |
85 |
86 | def main():
87 | print('initializing preprocessing..')
88 | parser = argparse.ArgumentParser()
89 | parser.add_argument('--base_dir', default='')
90 | parser.add_argument('--hparams', default='',
91 | help='Hyperparameter overrides as a comma-separated list of name=value pairs')
92 | parser.add_argument('--dataset', default='LJSpeech-1.1')
93 | parser.add_argument('--language', default='en_US')
94 | parser.add_argument('--voice', default='female')
95 | parser.add_argument('--reader', default='mary_ann')
96 | parser.add_argument('--merge_books', default='False')
97 | parser.add_argument('--book', default='northandsouth')
98 | parser.add_argument('--output', default='training_data')
99 | parser.add_argument('--n_jobs', type=int, default=cpu_count())
100 | args = parser.parse_args()
101 |
102 | modified_hp = hparams.parse(args.hparams)
103 |
104 | assert args.merge_books in ('False', 'True')
105 |
106 | run_preprocess(args, modified_hp)
107 |
108 |
109 | if __name__ == '__main__':
110 | main()
111 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | falcon==1.2.0
2 | inflect==0.2.5
3 | audioread==2.1.5
4 | librosa==0.5.1
5 | matplotlib==2.0.2
6 | numpy==1.14.0
7 | scipy==1.0.0
8 | tqdm==4.11.2
9 | Unidecode==0.4.20
10 | pyaudio==0.2.11
11 | sounddevice==0.3.10
12 | lws
13 | keras
--------------------------------------------------------------------------------
/sentences.txt:
--------------------------------------------------------------------------------
1 | Scientists at the CERN laboratory say they have discovered a new particle.
2 | There's a way to measure the acute emotional intelligence that has never gone out of style.
3 | President Trump met with other leaders at the Group of 20 conference.
4 | The Senate's bill to repeal and replace the Affordable Care Act is now imperiled.
5 | Generative adversarial network or variational auto-encoder.
6 | Basilar membrane and otolaryngology are not auto-correlations.
7 | He has read the whole thing.
8 | He reads books.
9 | He thought it was time to present the present.
10 | Thisss isrealy awhsome.
11 | Punctuation sensitivity, is working.
12 | Punctuation sensitivity is working.
13 | Peter Piper picked a peck of pickled peppers. How many pickled peppers did Peter Piper pick?
14 | She sells sea-shells on the sea-shore. The shells she sells are sea-shells I'm sure.
15 | Tajima Airport serves Toyooka.
16 | On offering to help the blind man, the man who then stole his car, had not, at that precise moment, had any evil intention, quite the contrary, what he did was nothing more than obey those feelings of generosity and altruism which, as everyone knows, are the two best traits of human nature and to be found in much more hardened criminals than this one, a simple car-thief without any hope of advancing in his profession, exploited by the real owners this enterprise, for it is they who take advantage of the needs of the poor.
17 | Thank you so much for your support!
--------------------------------------------------------------------------------
/synthesize.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | from warnings import warn
4 | from time import sleep
5 |
6 | import tensorflow as tf
7 |
8 | from hparams import hparams
9 | from infolog import log
10 | from tacotron.synthesize import tacotron_synthesize
11 | from wavenet_vocoder.synthesize import wavenet_synthesize
12 |
13 |
14 | def prepare_run(args):
15 | modified_hp = hparams.parse(args.hparams)
16 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
17 |
18 | run_name = args.name or args.tacotron_name or args.model
19 | taco_checkpoint = os.path.join('logs-' + run_name, 'taco_' + args.checkpoint)
20 |
21 | run_name = args.name or args.wavenet_name or args.model
22 | wave_checkpoint = os.path.join('logs-' + run_name, 'wave_' + args.checkpoint)
23 | return taco_checkpoint, wave_checkpoint, modified_hp
24 |
25 | def get_sentences(args):
26 | if args.text_list != '':
27 | with open(args.text_list, 'rb') as f:
28 | sentences = list(map(lambda l: l.decode("utf-8")[:-1], f.readlines()))
29 | else:
30 | sentences = hparams.sentences
31 | return sentences
32 |
33 | def synthesize(args, hparams, taco_checkpoint, wave_checkpoint, sentences):
34 | log('Running End-to-End TTS Evaluation. Model: {}'.format(args.name or args.model))
35 | log('Synthesizing mel-spectrograms from text..')
36 | wavenet_in_dir = tacotron_synthesize(args, hparams, taco_checkpoint, sentences)
37 | #Delete Tacotron model from graph
38 | tf.reset_default_graph()
39 | #Sleep 1/2 second to let previous graph close and avoid error messages while Wavenet is synthesizing
40 | sleep(0.5)
41 | log('Synthesizing audio from mel-spectrograms.. (This may take a while)')
42 | wavenet_synthesize(args, hparams, wave_checkpoint)
43 | log('Tacotron-2 TTS synthesis complete!')
44 |
45 |
46 |
47 | def main():
48 | accepted_modes = ['eval', 'synthesis', 'live']
49 | parser = argparse.ArgumentParser()
50 | parser.add_argument('--checkpoint', default='pretrained/', help='Path to model checkpoint')
51 | parser.add_argument('--hparams', default='',
52 | help='Hyperparameter overrides as a comma-separated list of name=value pairs')
53 | parser.add_argument('--name', help='Name of logging directory if the two models were trained together.')
54 | parser.add_argument('--tacotron_name', help='Name of logging directory of Tacotron. If trained separately')
55 | parser.add_argument('--wavenet_name', help='Name of logging directory of WaveNet. If trained separately')
56 | parser.add_argument('--model', default='Tacotron-2')
57 | parser.add_argument('--input_dir', default='training_data/', help='folder containing input sentences/targets')
58 | parser.add_argument('--mels_dir', default='tacotron_output/eval/', help='folder containing mels from which to synthesize audio with the WaveNet')
59 | parser.add_argument('--output_dir', default='output/', help='folder to contain synthesized mel spectrograms')
60 | parser.add_argument('--mode', default='eval', help='mode of run: can be one of {}'.format(accepted_modes))
61 | parser.add_argument('--GTA', default='True', help='Ground truth aligned synthesis, defaults to True, only considered in synthesis mode')
62 | parser.add_argument('--text_list', default='', help='Text file containing the list of texts to synthesize. Only used if mode=eval')
63 | parser.add_argument('--speaker_id', default=None, help='Defines the speaker ids to use when running standalone WaveNet on a folder of mels. Must be a comma-separated list of ids')
64 | args = parser.parse_args()
65 |
66 | accepted_models = ['Tacotron', 'WaveNet', 'Tacotron-2']
67 |
68 | if args.model not in accepted_models:
69 | raise ValueError('Please enter a valid model to synthesize with: {}'.format(accepted_models))
70 |
71 | if args.mode not in accepted_modes:
72 | raise ValueError('accepted modes are: {}, found {}'.format(accepted_modes, args.mode))
73 |
74 | if args.mode == 'live' and args.model == 'Wavenet':
75 | raise RuntimeError('Wavenet vocoder cannot be tested live due to its slow generation. Live only works with Tacotron!')
76 |
77 | if args.GTA not in ('True', 'False'):
78 | raise ValueError('GTA option must be either True or False')
79 |
80 | if args.model == 'Tacotron-2':
81 | if args.mode == 'live':
82 | warn('Requested a live evaluation with Tacotron-2, Wavenet will not be used!')
83 | if args.mode == 'synthesis':
84 | raise ValueError('I don\'t recommend running WaveNet on the entire dataset.. The world might end before the synthesis :) (only eval allowed)')
85 |
86 | taco_checkpoint, wave_checkpoint, hparams = prepare_run(args)
87 | sentences = get_sentences(args)
88 |
89 | if args.model == 'Tacotron':
90 | _ = tacotron_synthesize(args, hparams, taco_checkpoint, sentences)
91 | elif args.model == 'WaveNet':
92 | wavenet_synthesize(args, hparams, wave_checkpoint)
93 | elif args.model == 'Tacotron-2':
94 | synthesize(args, hparams, taco_checkpoint, wave_checkpoint, sentences)
95 | else:
96 | raise ValueError('Model provided {} unknown! {}'.format(args.model, accepted_models))
97 |
98 |
99 | if __name__ == '__main__':
100 | main()
101 |
--------------------------------------------------------------------------------
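For reference, prepare_run in synthesize.py above derives the checkpoint folders from the run name. A minimal sketch (not part of the repo) of where the script looks with the default flags (--model Tacotron-2, --checkpoint pretrained/, no --name):

import os

name, tacotron_name, wavenet_name, model = None, None, None, 'Tacotron-2'
checkpoint = 'pretrained/'

taco_checkpoint = os.path.join('logs-' + (name or tacotron_name or model), 'taco_' + checkpoint)
wave_checkpoint = os.path.join('logs-' + (name or wavenet_name or model), 'wave_' + checkpoint)
print(taco_checkpoint)  # logs-Tacotron-2/taco_pretrained/
print(wave_checkpoint)  # logs-Tacotron-2/wave_pretrained/
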
/tacotron/__init__.py:
--------------------------------------------------------------------------------
1 | #
--------------------------------------------------------------------------------
/tacotron/feeder.py:
--------------------------------------------------------------------------------
1 | import os
2 | import threading
3 | import time
4 | import traceback
5 |
6 | import numpy as np
7 | import tensorflow as tf
8 | from infolog import log
9 | from sklearn.model_selection import train_test_split
10 | from tacotron.utils.text import text_to_sequence
11 |
12 | _batches_per_group = 64
13 |
14 | class Feeder:
15 | """
16 | Feeds batches of data into queue on a background thread.
17 | """
18 |
19 | def __init__(self, coordinator, metadata_filename, hparams):
20 | super(Feeder, self).__init__()
21 | self._coord = coordinator
22 | self._hparams = hparams
23 | self._cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
24 | self._train_offset = 0
25 | self._test_offset = 0
26 |
27 | # Load metadata
28 | self._mel_dir = os.path.join(os.path.dirname(metadata_filename), 'mels')
29 | self._linear_dir = os.path.join(os.path.dirname(metadata_filename), 'linear')
30 | with open(metadata_filename, encoding='utf-8') as f:
31 | self._metadata = [line.strip().split('|') for line in f]
32 | frame_shift_ms = hparams.hop_size / hparams.sample_rate
33 | hours = sum([int(x[4]) for x in self._metadata]) * frame_shift_ms / (3600)
34 | log('Loaded metadata for {} examples ({:.2f} hours)'.format(len(self._metadata), hours))
35 |
36 | #Train test split
37 | if hparams.tacotron_test_size is None:
38 | assert hparams.tacotron_test_batches is not None
39 |
40 | test_size = (hparams.tacotron_test_size if hparams.tacotron_test_size is not None
41 | else hparams.tacotron_test_batches * hparams.tacotron_batch_size)
42 | indices = np.arange(len(self._metadata))
43 | train_indices, test_indices = train_test_split(indices,
44 | test_size=test_size, random_state=hparams.tacotron_data_random_state)
45 |
46 | #Make sure test_indices is a multiple of batch_size else round down
47 | len_test_indices = self._round_down(len(test_indices), hparams.tacotron_batch_size)
48 | extra_test = test_indices[len_test_indices:]
49 | test_indices = test_indices[:len_test_indices]
50 | train_indices = np.concatenate([train_indices, extra_test])
51 |
52 | self._train_meta = list(np.array(self._metadata)[train_indices])
53 | self._test_meta = list(np.array(self._metadata)[test_indices])
54 |
55 | self.test_steps = len(self._test_meta) // hparams.tacotron_batch_size
56 |
57 | if hparams.tacotron_test_size is None:
58 | assert hparams.tacotron_test_batches == self.test_steps
59 |
60 | #Pad input sequences with 0 (the '_' padding symbol)
61 | self._pad = 0
62 | #Explicitly set the padding to a value that doesn't originally exist in the spectrogram
63 | #to avoid any possible conflicts, without affecting the output range of the model too much
64 | if hparams.symmetric_mels:
65 | self._target_pad = -hparams.max_abs_value
66 | else:
67 | self._target_pad = 0.
68 | #Mark finished sequences with 1s
69 | self._token_pad = 1.
70 |
71 | with tf.device('/cpu:0'):
72 | # Create placeholders for inputs and targets. Don't specify batch size because we want
73 | # to be able to feed different batch sizes at eval time.
74 | self._placeholders = [
75 | tf.placeholder(tf.int32, shape=(None, None), name='inputs'),
76 | tf.placeholder(tf.int32, shape=(None, ), name='input_lengths'),
77 | tf.placeholder(tf.float32, shape=(None, None, hparams.num_mels), name='mel_targets'),
78 | tf.placeholder(tf.float32, shape=(None, None), name='token_targets'),
79 | tf.placeholder(tf.float32, shape=(None, None, hparams.num_freq), name='linear_targets'),
80 | tf.placeholder(tf.int32, shape=(None, ), name='targets_lengths'),
81 | tf.placeholder(tf.int32, shape=(hparams.tacotron_num_gpus, None), name='split_infos'),
82 | ]
83 |
84 | # Create queue for buffering data
85 | queue = tf.FIFOQueue(8, [tf.int32, tf.int32, tf.float32, tf.float32, tf.float32, tf.int32, tf.int32], name='input_queue')
86 | self._enqueue_op = queue.enqueue(self._placeholders)
87 | self.inputs, self.input_lengths, self.mel_targets, self.token_targets, self.linear_targets, self.targets_lengths, self.split_infos = queue.dequeue()
88 |
89 | self.inputs.set_shape(self._placeholders[0].shape)
90 | self.input_lengths.set_shape(self._placeholders[1].shape)
91 | self.mel_targets.set_shape(self._placeholders[2].shape)
92 | self.token_targets.set_shape(self._placeholders[3].shape)
93 | self.linear_targets.set_shape(self._placeholders[4].shape)
94 | self.targets_lengths.set_shape(self._placeholders[5].shape)
95 | self.split_infos.set_shape(self._placeholders[6].shape)
96 |
97 | # Create eval queue for buffering eval data
98 | eval_queue = tf.FIFOQueue(1, [tf.int32, tf.int32, tf.float32, tf.float32, tf.float32, tf.int32, tf.int32], name='eval_queue')
99 | self._eval_enqueue_op = eval_queue.enqueue(self._placeholders)
100 | self.eval_inputs, self.eval_input_lengths, self.eval_mel_targets, self.eval_token_targets, \
101 | self.eval_linear_targets, self.eval_targets_lengths, self.eval_split_infos = eval_queue.dequeue()
102 |
103 | self.eval_inputs.set_shape(self._placeholders[0].shape)
104 | self.eval_input_lengths.set_shape(self._placeholders[1].shape)
105 | self.eval_mel_targets.set_shape(self._placeholders[2].shape)
106 | self.eval_token_targets.set_shape(self._placeholders[3].shape)
107 | self.eval_linear_targets.set_shape(self._placeholders[4].shape)
108 | self.eval_targets_lengths.set_shape(self._placeholders[5].shape)
109 | self.eval_split_infos.set_shape(self._placeholders[6].shape)
110 |
111 | def start_threads(self, session):
112 | self._session = session
113 | thread = threading.Thread(name='background', target=self._enqueue_next_train_group)
114 | thread.daemon = True #Thread will close when parent quits
115 | thread.start()
116 |
117 | thread = threading.Thread(name='background', target=self._enqueue_next_test_group)
118 | thread.daemon = True #Thread will close when parent quits
119 | thread.start()
120 |
121 | def _get_test_groups(self):
122 | meta = self._test_meta[self._test_offset]
123 | self._test_offset += 1
124 |
125 | text = meta[5]
126 |
127 | input_data = np.asarray(text_to_sequence(text, self._cleaner_names), dtype=np.int32)
128 | mel_target = np.load(os.path.join(self._mel_dir, meta[1]))
129 | #Create parallel sequences containing zeros to represent a non-finished sequence
130 | token_target = np.asarray([0.] * (len(mel_target) - 1))
131 | linear_target = np.load(os.path.join(self._linear_dir, meta[2]))
132 | return (input_data, mel_target, token_target, linear_target, len(mel_target))
133 |
134 | def make_test_batches(self):
135 | start = time.time()
136 |
137 | # Read a group of examples
138 | n = self._hparams.tacotron_batch_size
139 | r = self._hparams.outputs_per_step
140 |
141 | #Test on entire test set
142 | examples = [self._get_test_groups() for i in range(len(self._test_meta))]
143 |
144 | # Bucket examples based on similar output sequence length for efficiency
145 | examples.sort(key=lambda x: x[-1])
146 | batches = [examples[i: i+n] for i in range(0, len(examples), n)]
147 | np.random.shuffle(batches)
148 |
149 | log('\nGenerated {} test batches of size {} in {:.3f} sec'.format(len(batches), n, time.time() - start))
150 | return batches, r
151 |
152 | def _enqueue_next_train_group(self):
153 | while not self._coord.should_stop():
154 | start = time.time()
155 |
156 | # Read a group of examples
157 | n = self._hparams.tacotron_batch_size
158 | r = self._hparams.outputs_per_step
159 | examples = [self._get_next_example() for i in range(n * _batches_per_group)]
160 |
161 | # Bucket examples based on similar output sequence length for efficiency
162 | examples.sort(key=lambda x: x[-1])
163 | batches = [examples[i: i+n] for i in range(0, len(examples), n)]
164 | np.random.shuffle(batches)
165 |
166 | log('\nGenerated {} train batches of size {} in {:.3f} sec'.format(len(batches), n, time.time() - start))
167 | for batch in batches:
168 | feed_dict = dict(zip(self._placeholders, self._prepare_batch(batch, r)))
169 | self._session.run(self._enqueue_op, feed_dict=feed_dict)
170 |
171 | def _enqueue_next_test_group(self):
172 | #Create test batches once and evaluate on them for all test steps
173 | test_batches, r = self.make_test_batches()
174 | while not self._coord.should_stop():
175 | for batch in test_batches:
176 | feed_dict = dict(zip(self._placeholders, self._prepare_batch(batch, r)))
177 | self._session.run(self._eval_enqueue_op, feed_dict=feed_dict)
178 |
179 | def _get_next_example(self):
180 | """Gets a single example (input, mel_target, token_target, linear_target, mel_length) from_ disk
181 | """
182 | if self._train_offset >= len(self._train_meta):
183 | self._train_offset = 0
184 | np.random.shuffle(self._train_meta)
185 |
186 | meta = self._train_meta[self._train_offset]
187 | self._train_offset += 1
188 |
189 | text = meta[5]
190 |
191 | input_data = np.asarray(text_to_sequence(text, self._cleaner_names), dtype=np.int32)
192 | mel_target = np.load(os.path.join(self._mel_dir, meta[1]))
193 | #Create parallel sequences containing zeros to represent a non-finished sequence
194 | token_target = np.asarray([0.] * (len(mel_target) - 1))
195 | linear_target = np.load(os.path.join(self._linear_dir, meta[2]))
196 | return (input_data, mel_target, token_target, linear_target, len(mel_target))
197 |
198 | def _prepare_batch(self, batches, outputs_per_step):
199 | assert 0 == len(batches) % self._hparams.tacotron_num_gpus
200 | size_per_device = int(len(batches) / self._hparams.tacotron_num_gpus)
201 | np.random.shuffle(batches)
202 |
203 | inputs = None
204 | mel_targets = None
205 | token_targets = None
206 | linear_targets = None
207 | targets_lengths = None
208 | split_infos = []
209 |
210 | targets_lengths = np.asarray([x[-1] for x in batches], dtype=np.int32) #Used to mask loss
211 | input_lengths = np.asarray([len(x[0]) for x in batches], dtype=np.int32)
212 |
213 | #Produce inputs/targets of variable lengths for the different GPUs
214 | for i in range(self._hparams.tacotron_num_gpus):
215 | batch = batches[size_per_device * i: size_per_device * (i + 1)]
216 | input_cur_device, input_max_len = self._prepare_inputs([x[0] for x in batch])
217 | inputs = np.concatenate((inputs, input_cur_device), axis=1) if inputs is not None else input_cur_device
218 | mel_target_cur_device, mel_target_max_len = self._prepare_targets([x[1] for x in batch], outputs_per_step)
219 | mel_targets = np.concatenate(( mel_targets, mel_target_cur_device), axis=1) if mel_targets is not None else mel_target_cur_device
220 |
221 | #Pad token target sequences with 1 to mark that the sequence is done
222 | token_target_cur_device, token_target_max_len = self._prepare_token_targets([x[2] for x in batch], outputs_per_step)
223 | token_targets = np.concatenate((token_targets, token_target_cur_device),axis=1) if token_targets is not None else token_target_cur_device
224 | linear_targets_cur_device, linear_target_max_len = self._prepare_targets([x[3] for x in batch], outputs_per_step)
225 | linear_targets = np.concatenate((linear_targets, linear_targets_cur_device), axis=1) if linear_targets is not None else linear_targets_cur_device
226 | split_infos.append([input_max_len, mel_target_max_len, token_target_max_len, linear_target_max_len])
227 |
228 | split_infos = np.asarray(split_infos, dtype=np.int32)
229 | return (inputs, input_lengths, mel_targets, token_targets, linear_targets, targets_lengths, split_infos)
230 |
231 | def _prepare_inputs(self, inputs):
232 | max_len = max([len(x) for x in inputs])
233 | return np.stack([self._pad_input(x, max_len) for x in inputs]), max_len
234 |
235 | def _prepare_targets(self, targets, alignment):
236 | max_len = max([len(t) for t in targets])
237 | data_len = self._round_up(max_len, alignment)
238 | return np.stack([self._pad_target(t, data_len) for t in targets]), data_len
239 |
240 | def _prepare_token_targets(self, targets, alignment):
241 | max_len = max([len(t) for t in targets]) + 1
242 | data_len = self._round_up(max_len, alignment)
243 | return np.stack([self._pad_token_target(t, data_len) for t in targets]), data_len
244 |
245 | def _pad_input(self, x, length):
246 | return np.pad(x, (0, length - x.shape[0]), mode='constant', constant_values=self._pad)
247 |
248 | def _pad_target(self, t, length):
249 | return np.pad(t, [(0, length - t.shape[0]), (0, 0)], mode='constant', constant_values=self._target_pad)
250 |
251 | def _pad_token_target(self, t, length):
252 | return np.pad(t, (0, length - t.shape[0]), mode='constant', constant_values=self._token_pad)
253 |
254 | def _round_up(self, x, multiple):
255 | remainder = x % multiple
256 | return x if remainder == 0 else x + multiple - remainder
257 |
258 | def _round_down(self, x, multiple):
259 | remainder = x % multiple
260 | return x if remainder == 0 else x - remainder
261 |
--------------------------------------------------------------------------------
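The Feeder above rounds every mel/linear target length up to the next multiple of outputs_per_step (r) so the decoder can emit r frames per step, and pads with _target_pad. A minimal NumPy sketch (not part of the repo; shapes and the pad value are illustrative, with -4. standing in for -hparams.max_abs_value when symmetric_mels is enabled):

import numpy as np

def round_up(x, multiple):
    remainder = x % multiple
    return x if remainder == 0 else x + multiple - remainder

r = 2                                            # outputs_per_step
mel = np.random.rand(37, 80).astype(np.float32)  # [frames, num_mels]
data_len = round_up(len(mel), r)                 # 38
padded = np.pad(mel, [(0, data_len - len(mel)), (0, 0)],
                mode='constant', constant_values=-4.)
print(padded.shape)  # (38, 80)
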
/tacotron/models/Architecture_wrappers.py:
--------------------------------------------------------------------------------
1 | """A set of wrappers usefull for tacotron 2 architecture
2 | All notations and variable names were used in concordance with originial tensorflow implementation
3 | """
4 | import collections
5 |
6 | import numpy as np
7 | import tensorflow as tf
8 | from tacotron.models.attention import _compute_attention
9 | from tensorflow.contrib.rnn import RNNCell
10 | from tensorflow.python.framework import ops, tensor_shape
11 | from tensorflow.python.ops import array_ops, check_ops, rnn_cell_impl, tensor_array_ops
12 | from tensorflow.python.util import nest
13 |
14 | _zero_state_tensors = rnn_cell_impl._zero_state_tensors
15 |
16 |
17 |
18 | class TacotronEncoderCell(RNNCell):
19 | """Tacotron 2 Encoder Cell
20 | Passes inputs through a stack of convolutional layers then through a bidirectional LSTM
21 | layer to predict the hidden representation vector (or memory)
22 | """
23 |
24 | def __init__(self, convolutional_layers, lstm_layer):
25 | """Initialize encoder parameters
26 |
27 | Args:
28 | convolutional_layers: Encoder convolutional block class
29 | lstm_layer: encoder bidirectional lstm layer class
30 | """
31 | super(TacotronEncoderCell, self).__init__()
32 | #Initialize encoder layers
33 | self._convolutions = convolutional_layers
34 | self._cell = lstm_layer
35 |
36 | def __call__(self, inputs, input_lengths=None):
37 | #Pass input sequence through a stack of convolutional layers
38 | conv_output = self._convolutions(inputs)
39 |
40 | #Extract hidden representation from encoder lstm cells
41 | hidden_representation = self._cell(conv_output, input_lengths)
42 |
43 | #For shape visualization
44 | self.conv_output_shape = conv_output.shape
45 | return hidden_representation
46 |
47 |
48 | class TacotronDecoderCellState(
49 | collections.namedtuple("TacotronDecoderCellState",
50 | ("cell_state", "attention", "time", "alignments",
51 | "alignment_history", "max_attentions"))):
52 | """`namedtuple` storing the state of a `TacotronDecoderCell`.
53 | Contains:
54 | - `cell_state`: The state of the wrapped `RNNCell` at the previous time
55 | step.
56 | - `attention`: The attention emitted at the previous time step.
57 | - `time`: int32 scalar containing the current time step.
58 | - `alignments`: A single or tuple of `Tensor`(s) containing the alignments
59 | emitted at the previous time step for each attention mechanism.
60 | - `alignment_history`: a single or tuple of `TensorArray`(s)
61 | containing alignment matrices from all time steps for each attention
62 | mechanism. Call `stack()` on each to convert to a `Tensor`.
63 | """
64 | def replace(self, **kwargs):
65 | """Clones the current state while overwriting components provided by kwargs.
66 | """
67 | return super(TacotronDecoderCellState, self)._replace(**kwargs)
68 |
69 | class TacotronDecoderCell(RNNCell):
70 | """Tactron 2 Decoder Cell
71 | Decodes encoder output and previous mel frames into next r frames
72 |
73 | Decoder Step i:
74 | 1) Prenet to compress last output information
75 | 2) Concat compressed inputs with previous context vector (input feeding) *
76 | 3) Decoder RNN (actual decoding) to predict current state s_{i} *
77 | 4) Compute new context vector c_{i} based on s_{i} and a cumulative sum of previous alignments *
78 | 5) Predict new output y_{i} using s_{i} and c_{i} (concatenated)
79 | 6) Predict stop token ys_{i} using s_{i} and c_{i} (concatenated)
80 |
81 | * : This typically amounts to taking a vanilla LSTM, wrapping it with tensorflow's attention wrapper,
82 | then wrapping that with the prenet (for input feeding) and with the prediction layer
83 | that projects the RNN states onto the output space. The actions marked with (*) could be replaced with
84 | tensorflow's attention wrapper call if it used cumulative alignments instead of previous alignments only.
85 | """
86 |
87 | def __init__(self, prenet, attention_mechanism, rnn_cell, frame_projection, stop_projection):
88 | """Initialize decoder parameters
89 |
90 | Args:
91 | prenet: A tensorflow fully connected layer acting as the decoder pre-net
92 | attention_mechanism: A _BaseAttentionMechanism instance, useful to
93 | learn encoder-decoder alignments
94 | rnn_cell: Instance of RNNCell, main body of the decoder
95 | frame_projection: tensorflow fully connected layer with r * num_mels output units
96 | stop_projection: tensorflow fully connected layer, expected to project to a scalar
97 | and through a sigmoid activation
98 | mask_finished: Boolean, whether to mask decoder frames once the sequence is finished
99 | """
100 | super(TacotronDecoderCell, self).__init__()
101 | #Initialize decoder layers
102 | self._prenet = prenet
103 | self._attention_mechanism = attention_mechanism
104 | self._cell = rnn_cell
105 | self._frame_projection = frame_projection
106 | self._stop_projection = stop_projection
107 |
108 | self._attention_layer_size = self._attention_mechanism.values.get_shape()[-1].value
109 |
110 | def _batch_size_checks(self, batch_size, error_message):
111 | return [check_ops.assert_equal(batch_size,
112 | self._attention_mechanism.batch_size,
113 | message=error_message)]
114 |
115 | @property
116 | def output_size(self):
117 | return self._frame_projection.shape
118 |
119 | @property
120 | def state_size(self):
121 | """The `state_size` property of `TacotronDecoderCell`.
122 |
123 | Returns:
124 | A `TacotronDecoderCellState` tuple containing shapes used by this object.
125 | """
126 | return TacotronDecoderCellState(
127 | cell_state=self._cell._cell.state_size,
128 | time=tensor_shape.TensorShape([]),
129 | attention=self._attention_layer_size,
130 | alignments=self._attention_mechanism.alignments_size,
131 | alignment_history=(),
132 | max_attentions=())
133 |
134 | def zero_state(self, batch_size, dtype):
135 | """Return an initial (zero) state tuple for this `AttentionWrapper`.
136 |
137 | Args:
138 | batch_size: `0D` integer tensor: the batch size.
139 | dtype: The internal state data type.
140 | Returns:
141 | A `TacotronDecoderCellState` tuple containing zeroed out tensors and,
142 | possibly, empty `TensorArray` objects.
143 | Raises:
144 | ValueError: (or, possibly at runtime, InvalidArgument), if
145 | `batch_size` does not match the output size of the encoder passed
146 | to the wrapper object at initialization time.
147 | """
148 | with ops.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]):
149 | cell_state = self._cell._cell.zero_state(batch_size, dtype)
150 | error_message = (
151 | "When calling zero_state of TacotronDecoderCell %s: " % self._base_name +
152 | "Non-matching batch sizes between the memory "
153 | "(encoder output) and the requested batch size.")
154 | with ops.control_dependencies(
155 | self._batch_size_checks(batch_size, error_message)):
156 | cell_state = nest.map_structure(
157 | lambda s: array_ops.identity(s, name="checked_cell_state"),
158 | cell_state)
159 | return TacotronDecoderCellState(
160 | cell_state=cell_state,
161 | time=array_ops.zeros([], dtype=tf.int32),
162 | attention=_zero_state_tensors(self._attention_layer_size, batch_size,
163 | dtype),
164 | alignments=self._attention_mechanism.initial_alignments(batch_size, dtype),
165 | alignment_history=tensor_array_ops.TensorArray(dtype=dtype, size=0,
166 | dynamic_size=True),
167 | max_attentions=tf.zeros((batch_size, ), dtype=tf.int32))
168 |
169 | def __call__(self, inputs, state):
170 | #Information bottleneck (essential for learning attention)
171 | prenet_output = self._prenet(inputs)
172 |
173 | #Concat context vector and prenet output to form LSTM cells input (input feeding)
174 | LSTM_input = tf.concat([prenet_output, state.attention], axis=-1)
175 |
176 | #Unidirectional LSTM layers
177 | LSTM_output, next_cell_state = self._cell(LSTM_input, state.cell_state)
178 |
179 |
180 | #Compute the attention (context) vector and alignments using
181 | #the new decoder cell hidden state as query vector
182 | #and cumulative alignments to extract location features
183 | #The choice of the new cell hidden state (s_{i}) of the last
184 | #decoder RNN Cell is based on Luong et Al. (2015):
185 | #https://arxiv.org/pdf/1508.04025.pdf
186 | previous_alignments = state.alignments
187 | previous_alignment_history = state.alignment_history
188 | context_vector, alignments, cumulated_alignments, max_attentions = _compute_attention(self._attention_mechanism,
189 | LSTM_output,
190 | previous_alignments,
191 | attention_layer=None,
192 | prev_max_attentions=state.max_attentions)
193 |
194 | #Concat LSTM outputs and context vector to form projections inputs
195 | projections_input = tf.concat([LSTM_output, context_vector], axis=-1)
196 |
197 | #Compute predicted frames and predicted stop tokens
198 | cell_outputs = self._frame_projection(projections_input)
199 | stop_tokens = self._stop_projection(projections_input)
200 |
201 | #Save alignment history
202 | alignment_history = previous_alignment_history.write(state.time, alignments)
203 |
204 | #Prepare next decoder state
205 | next_state = TacotronDecoderCellState(
206 | time=state.time + 1,
207 | cell_state=next_cell_state,
208 | attention=context_vector,
209 | alignments=cumulated_alignments,
210 | alignment_history=alignment_history,
211 | max_attentions=max_attentions)
212 |
213 | return (cell_outputs, stop_tokens), next_state
214 |
--------------------------------------------------------------------------------
/tacotron/models/__init__.py:
--------------------------------------------------------------------------------
1 | from .tacotron import Tacotron
2 |
3 |
4 | def create_model(name, hparams):
5 | if name == 'Tacotron':
6 | return Tacotron(hparams)
7 | else:
8 | raise Exception('Unknown model: ' + name)
9 |
--------------------------------------------------------------------------------
/tacotron/models/attention.py:
--------------------------------------------------------------------------------
1 | """Attention file for location based attention (compatible with tensorflow attention wrapper)"""
2 |
3 | import tensorflow as tf
4 | from tensorflow.contrib.seq2seq.python.ops.attention_wrapper import BahdanauAttention
5 | from tensorflow.python.layers import core as layers_core
6 | from tensorflow.python.ops import array_ops, math_ops, nn_ops, variable_scope
7 |
8 |
9 | #From https://github.com/tensorflow/tensorflow/blob/r1.7/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
10 | def _compute_attention(attention_mechanism, cell_output, attention_state,
11 | attention_layer, prev_max_attentions):
12 | """Computes the attention and alignments for a given attention_mechanism."""
13 | alignments, next_attention_state, max_attentions = attention_mechanism(
14 | cell_output, state=attention_state, prev_max_attentions=prev_max_attentions)
15 |
16 | # Reshape from [batch_size, memory_time] to [batch_size, 1, memory_time]
17 | expanded_alignments = array_ops.expand_dims(alignments, 1)
18 | # Context is the inner product of alignments and values along the
19 | # memory time dimension.
20 | # alignments shape is
21 | # [batch_size, 1, memory_time]
22 | # attention_mechanism.values shape is
23 | # [batch_size, memory_time, memory_size]
24 | # the batched matmul is over memory_time, so the output shape is
25 | # [batch_size, 1, memory_size].
26 | # we then squeeze out the singleton dim.
27 | context = math_ops.matmul(expanded_alignments, attention_mechanism.values)
28 | context = array_ops.squeeze(context, [1])
29 |
30 | if attention_layer is not None:
31 | attention = attention_layer(array_ops.concat([cell_output, context], 1))
32 | else:
33 | attention = context
34 |
35 | return attention, alignments, next_attention_state, max_attentions
36 |
37 |
38 | def _location_sensitive_score(W_query, W_fil, W_keys):
39 | """Impelements Bahdanau-style (cumulative) scoring function.
40 | This attention is described in:
41 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben-
42 | gio, “Attention-based models for speech recognition,” in Ad-
43 | vances in Neural Information Processing Systems, 2015, pp.
44 | 577–585.
45 |
46 | #############################################################################
47 | hybrid attention (content-based + location-based)
48 | f = F * α_{i-1}
49 | energy = dot(v_a, tanh(W_keys(h_enc) + W_query(h_dec) + W_fil(f) + b_a))
50 | #############################################################################
51 |
52 | Args:
53 | W_query: Tensor, shape '[batch_size, 1, attention_dim]' to compare to location features.
54 | W_fil: previous alignments processed into location features, shape '[batch_size, max_time, attention_dim]'
55 | W_keys: Tensor, shape '[batch_size, max_time, attention_dim]', typically the encoder outputs.
56 | Returns:
57 | A '[batch_size, max_time]' attention score (energy)
58 | """
59 | # Get the number of hidden units from the trailing dimension of keys
60 | dtype = W_query.dtype
61 | num_units = W_keys.shape[-1].value or array_ops.shape(W_keys)[-1]
62 |
63 | v_a = tf.get_variable(
64 | 'attention_variable_projection', shape=[num_units], dtype=dtype,
65 | initializer=tf.contrib.layers.xavier_initializer())
66 | b_a = tf.get_variable(
67 | 'attention_bias', shape=[num_units], dtype=dtype,
68 | initializer=tf.zeros_initializer())
69 |
70 | return tf.reduce_sum(v_a * tf.tanh(W_keys + W_query + W_fil + b_a), [2])
71 |
72 | def _smoothing_normalization(e):
73 | """Applies a smoothing normalization function instead of softmax
74 | Introduced in:
75 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben-
76 | gio, “Attention-based models for speech recognition,” in Ad-
77 | vances in Neural Information Processing Systems, 2015, pp.
78 | 577–585.
79 |
80 | ############################################################################
81 | Smoothing normalization function
82 | a_{i, j} = sigmoid(e_{i, j}) / sum_j(sigmoid(e_{i, j}))
83 | ############################################################################
84 |
85 | Args:
86 | e: matrix [batch_size, max_time(memory_time)]: expected to be energy (score)
87 | values of an attention mechanism
88 | Returns:
89 | matrix [batch_size, max_time]: [0, 1] normalized alignments with possible
90 | attendance to multiple memory time steps.
91 | """
92 | return tf.nn.sigmoid(e) / tf.reduce_sum(tf.nn.sigmoid(e), axis=-1, keepdims=True)
93 |
94 |
95 | class LocationSensitiveAttention(BahdanauAttention):
96 | """Impelements Bahdanau-style (cumulative) scoring function.
97 | Usually referred to as "hybrid" attention (content-based + location-based)
98 | Extends the additive attention described in:
99 | "D. Bahdanau, K. Cho, and Y. Bengio, “Neural machine transla-
100 | tion by jointly learning to align and translate,” in Proceedings
101 | of ICLR, 2015."
102 | to use previous alignments as additional location features.
103 |
104 | This attention is described in:
105 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben-
106 | gio, “Attention-based models for speech recognition,” in Ad-
107 | vances in Neural Information Processing Systems, 2015, pp.
108 | 577–585.
109 | """
110 |
111 | def __init__(self,
112 | num_units,
113 | memory,
114 | hparams,
115 | is_training,
116 | mask_encoder=True,
117 | memory_sequence_length=None,
118 | smoothing=False,
119 | cumulate_weights=True,
120 | name='LocationSensitiveAttention'):
121 | """Construct the Attention mechanism.
122 | Args:
123 | num_units: The depth of the query mechanism.
124 | memory: The memory to query; usually the output of an RNN encoder. This
125 | tensor should be shaped `[batch_size, max_time, ...]`.
126 | mask_encoder (optional): Boolean, whether to mask encoder paddings.
127 | memory_sequence_length (optional): Sequence lengths for the batch entries
128 | in memory. If provided, the memory tensor rows are masked with zeros
129 | for values past the respective sequence lengths. Only relevant if mask_encoder = True.
130 | smoothing (optional): Boolean. Determines which normalization function to use.
131 | Default normalization function (probability_fn) is softmax. If smoothing is
132 | enabled, we replace softmax with:
133 | a_{i, j} = sigmoid(e_{i, j}) / sum_j(sigmoid(e_{i, j}))
134 | Introduced in:
135 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben-
136 | gio, “Attention-based models for speech recognition,” in Ad-
137 | vances in Neural Information Processing Systems, 2015, pp.
138 | 577–585.
139 | This is mainly used if the model wants to attend to multiple input parts
140 | at the same decoding step. We probably won't be using it since multiple sound
141 | frames may depend on the same character/phone, probably not the other way around.
142 | Note:
143 | We still keep it implemented in case we want to test it. They used it in the
144 | paper in the context of speech recognition, where one phoneme may depend on
145 | multiple subsequent sound frames.
146 | name: Name to use when creating ops.
147 | """
148 | #Create normalization function
149 | #Setting it to None defaults in using softmax
150 | normalization_function = _smoothing_normalization if (smoothing == True) else None
151 | memory_length = memory_sequence_length if (mask_encoder==True) else None
152 | super(LocationSensitiveAttention, self).__init__(
153 | num_units=num_units,
154 | memory=memory,
155 | memory_sequence_length=memory_length,
156 | probability_fn=normalization_function,
157 | name=name)
158 |
159 | self.location_convolution = tf.layers.Conv1D(filters=hparams.attention_filters,
160 | kernel_size=hparams.attention_kernel, padding='same', use_bias=True,
161 | bias_initializer=tf.zeros_initializer(), name='location_features_convolution')
162 | self.location_layer = tf.layers.Dense(units=num_units, use_bias=False,
163 | dtype=tf.float32, name='location_features_layer')
164 | self._cumulate = cumulate_weights
165 | self.synthesis_constraint = hparams.synthesis_constraint and not is_training
166 | self.attention_win_size = tf.convert_to_tensor(hparams.attention_win_size, dtype=tf.int32)
167 | self.constraint_type = hparams.synthesis_constraint_type
168 |
169 | def __call__(self, query, state, prev_max_attentions):
170 | """Score the query based on the keys and values.
171 | Args:
172 | query: Tensor of dtype matching `self.values` and shape
173 | `[batch_size, query_depth]`.
174 | state (previous alignments): Tensor of dtype matching `self.values` and shape
175 | `[batch_size, alignments_size]`
176 | (`alignments_size` is memory's `max_time`).
177 | Returns:
178 | alignments: Tensor of dtype matching `self.values` and shape
179 | `[batch_size, alignments_size]` (`alignments_size` is memory's
180 | `max_time`).
181 | """
182 | previous_alignments = state
183 | with variable_scope.variable_scope(None, "Location_Sensitive_Attention", [query]):
184 |
185 | # processed_query shape [batch_size, query_depth] -> [batch_size, attention_dim]
186 | processed_query = self.query_layer(query) if self.query_layer else query
187 | # -> [batch_size, 1, attention_dim]
188 | processed_query = tf.expand_dims(processed_query, 1)
189 |
190 | # processed_location_features shape [batch_size, max_time, attention dimension]
191 | # [batch_size, max_time] -> [batch_size, max_time, 1]
192 | expanded_alignments = tf.expand_dims(previous_alignments, axis=2)
193 | # location features [batch_size, max_time, filters]
194 | f = self.location_convolution(expanded_alignments)
195 | # Projected location features [batch_size, max_time, attention_dim]
196 | processed_location_features = self.location_layer(f)
197 |
198 | # energy shape [batch_size, max_time]
199 | energy = _location_sensitive_score(processed_query, processed_location_features, self.keys)
200 |
201 | if self.synthesis_constraint:
202 | Tx = tf.shape(energy)[-1]
203 | # prev_max_attentions = tf.squeeze(prev_max_attentions, [-1])
204 | if self.constraint_type == 'monotonic':
205 | key_masks = tf.sequence_mask(prev_max_attentions, Tx)
206 | reverse_masks = tf.sequence_mask(Tx - self.attention_win_size - prev_max_attentions, Tx)[:, ::-1]
207 | else:
208 | assert self.constraint_type == 'window'
209 | key_masks = tf.sequence_mask(prev_max_attentions - (self.attention_win_size // 2 + (self.attention_win_size % 2 != 0)), Tx)
210 | reverse_masks = tf.sequence_mask(Tx - (self.attention_win_size // 2) - prev_max_attentions, Tx)[:, ::-1]
211 |
212 | masks = tf.logical_or(key_masks, reverse_masks)
213 | paddings = tf.ones_like(energy) * (-2 ** 32 + 1) # (N, Ty/r, Tx)
214 | energy = tf.where(tf.equal(masks, False), energy, paddings)
215 |
216 | # alignments shape = energy shape = [batch_size, max_time]
217 | alignments = self._probability_fn(energy, previous_alignments)
218 | max_attentions = tf.argmax(alignments, -1, output_type=tf.int32) # (N, Ty/r)
219 |
220 | # Cumulate alignments
221 | if self._cumulate:
222 | next_state = alignments + previous_alignments
223 | else:
224 | next_state = alignments
225 |
226 | return alignments, next_state, max_attentions
227 |
--------------------------------------------------------------------------------
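The _smoothing_normalization option above replaces softmax with sigmoid scores renormalized to sum to one, which lets the alignment spread probability mass over several memory steps. A minimal NumPy sketch (not part of the repo) of the same formula on toy energies:

import numpy as np

def sigmoid(x):
    return 1. / (1. + np.exp(-x))

e = np.array([[1.0, 2.0, 0.5]])  # [batch_size, max_time] attention energies
smoothed = sigmoid(e) / sigmoid(e).sum(axis=-1, keepdims=True)
softmax = np.exp(e) / np.exp(e).sum(axis=-1, keepdims=True)
print(smoothed.round(3), softmax.round(3))  # both rows sum to 1; the smoothed version is flatter
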
/tacotron/models/custom_decoder.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import, division, print_function
2 |
3 | import collections
4 |
5 | import tensorflow as tf
6 | from tacotron.models.helpers import TacoTestHelper, TacoTrainingHelper
7 | from tensorflow.contrib.seq2seq.python.ops import decoder
8 | from tensorflow.contrib.seq2seq.python.ops import helper as helper_py
9 | from tensorflow.python.framework import ops, tensor_shape
10 | from tensorflow.python.layers import base as layers_base
11 | from tensorflow.python.ops import rnn_cell_impl
12 | from tensorflow.python.util import nest
13 |
14 |
15 | class CustomDecoderOutput(
16 | collections.namedtuple("CustomDecoderOutput", ("rnn_output", "token_output", "sample_id"))):
17 | pass
18 |
19 |
20 | class CustomDecoder(decoder.Decoder):
21 | """Custom sampling decoder.
22 |
23 | Allows for stop token prediction at inference time
24 | and returns the equivalent loss at training time.
25 |
26 | Note:
27 | Only use this decoder with Tacotron 2 as it only accepts tacotron custom helpers
28 | """
29 |
30 | def __init__(self, cell, helper, initial_state, output_layer=None):
31 | """Initialize CustomDecoder.
32 | Args:
33 | cell: An `RNNCell` instance.
34 | helper: A `Helper` instance.
35 | initial_state: A (possibly nested tuple of...) tensors and TensorArrays.
36 | The initial state of the RNNCell.
37 | output_layer: (Optional) An instance of `tf.layers.Layer`, i.e.,
38 | `tf.layers.Dense`. Optional layer to apply to the RNN output prior
39 | to storing the result or sampling.
40 | Raises:
41 | TypeError: if `cell`, `helper` or `output_layer` have an incorrect type.
42 | """
43 | rnn_cell_impl.assert_like_rnncell(type(cell), cell)
44 | if not isinstance(helper, helper_py.Helper):
45 | raise TypeError("helper must be a Helper, received: %s" % type(helper))
46 | if (output_layer is not None
47 | and not isinstance(output_layer, layers_base.Layer)):
48 | raise TypeError(
49 | "output_layer must be a Layer, received: %s" % type(output_layer))
50 | self._cell = cell
51 | self._helper = helper
52 | self._initial_state = initial_state
53 | self._output_layer = output_layer
54 |
55 | @property
56 | def batch_size(self):
57 | return self._helper.batch_size
58 |
59 | def _rnn_output_size(self):
60 | size = self._cell.output_size
61 | if self._output_layer is None:
62 | return size
63 | else:
64 | # To use layer's compute_output_shape, we need to convert the
65 | # RNNCell's output_size entries into shapes with an unknown
66 | # batch size. We then pass this through the layer's
67 | # compute_output_shape and read off all but the first (batch)
68 | # dimensions to get the output size of the rnn with the layer
69 | # applied to the top.
70 | output_shape_with_unknown_batch = nest.map_structure(
71 | lambda s: tensor_shape.TensorShape([None]).concatenate(s),
72 | size)
73 | layer_output_shape = self._output_layer._compute_output_shape( # pylint: disable=protected-access
74 | output_shape_with_unknown_batch)
75 | return nest.map_structure(lambda s: s[1:], layer_output_shape)
76 |
77 | @property
78 | def output_size(self):
79 | # Return the cell output and the id
80 | return CustomDecoderOutput(
81 | rnn_output=self._rnn_output_size(),
82 | token_output=self._helper.token_output_size,
83 | sample_id=self._helper.sample_ids_shape)
84 |
85 | @property
86 | def output_dtype(self):
87 | # Assume the dtype of the cell is the output_size structure
88 | # containing the input_state's first component's dtype.
89 | # Return that structure and the sample_ids_dtype from the helper.
90 | dtype = nest.flatten(self._initial_state)[0].dtype
91 | return CustomDecoderOutput(
92 | nest.map_structure(lambda _: dtype, self._rnn_output_size()),
93 | tf.float32,
94 | self._helper.sample_ids_dtype)
95 |
96 | def initialize(self, name=None):
97 | """Initialize the decoder.
98 | Args:
99 | name: Name scope for any created operations.
100 | Returns:
101 | `(finished, first_inputs, initial_state)`.
102 | """
103 | return self._helper.initialize() + (self._initial_state,)
104 |
105 | def step(self, time, inputs, state, name=None):
106 | """Perform a custom decoding step.
107 | Enables dynamic prediction
108 | Args:
109 | time: scalar `int32` tensor.
110 | inputs: A (structure of) input tensors.
111 | state: A (structure of) state tensors and TensorArrays.
112 | name: Name scope for any created operations.
113 | Returns:
114 | `(outputs, next_state, next_inputs, finished)`.
115 | """
116 | with ops.name_scope(name, "CustomDecoderStep", (time, inputs, state)):
117 | #Call outputprojection wrapper cell
118 | (cell_outputs, stop_token), cell_state = self._cell(inputs, state)
119 |
120 | #Apply output_layer (if it exists)
121 | if self._output_layer is not None:
122 | cell_outputs = self._output_layer(cell_outputs)
123 | sample_ids = self._helper.sample(
124 | time=time, outputs=cell_outputs, state=cell_state)
125 |
126 | (finished, next_inputs, next_state) = self._helper.next_inputs(
127 | time=time,
128 | outputs=cell_outputs,
129 | state=cell_state,
130 | sample_ids=sample_ids,
131 | stop_token_prediction=stop_token)
132 |
133 | outputs = CustomDecoderOutput(cell_outputs, stop_token, sample_ids)
134 | return (outputs, next_state, next_inputs, finished)
135 |
--------------------------------------------------------------------------------
/tacotron/models/helpers.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 | from tensorflow.contrib.seq2seq import Helper
4 |
5 |
6 | class TacoTestHelper(Helper):
7 | def __init__(self, batch_size, hparams):
8 | with tf.name_scope('TacoTestHelper'):
9 | self._batch_size = batch_size
10 | self._output_dim = hparams.num_mels
11 | self._reduction_factor = hparams.outputs_per_step
12 | self.stop_at_any = hparams.stop_at_any
13 |
14 | @property
15 | def batch_size(self):
16 | return self._batch_size
17 |
18 | @property
19 | def token_output_size(self):
20 | return self._reduction_factor
21 |
22 | @property
23 | def sample_ids_shape(self):
24 | return tf.TensorShape([])
25 |
26 | @property
27 | def sample_ids_dtype(self):
28 | return np.int32
29 |
30 | def initialize(self, name=None):
31 | return (tf.tile([False], [self._batch_size]), _go_frames(self._batch_size, self._output_dim))
32 |
33 | def sample(self, time, outputs, state, name=None):
34 | return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them
35 |
36 | def next_inputs(self, time, outputs, state, sample_ids, stop_token_prediction, name=None):
37 | '''Stop on EOS. Otherwise, pass the last output as the next input and pass through state.'''
38 | with tf.name_scope('TacoTestHelper'):
39 | #A sequence is finished when the output probability is > 0.5
40 | finished = tf.cast(tf.round(stop_token_prediction), tf.bool)
41 |
42 | #Since we are predicting r frames at each step, two modes are
43 | #then possible:
44 | # Stop when the model outputs a p > 0.5 for any of the r frames (Recommended)
45 | # Stop when the model outputs a p > 0.5 for all r frames (Safer)
46 | #Note:
47 | # With enough training steps, the model should be able to predict when to stop correctly
48 | # and the use of stop_at_any = True would be recommended. If however the model didn't
49 | # learn to stop correctly yet, (stops too soon) one could choose to use the safer option
50 | # to get a correct synthesis
51 | if self.stop_at_any:
52 | finished = tf.reduce_any(tf.reduce_all(finished, axis=0)) #Recommended
53 | else:
54 | finished = tf.reduce_all(tf.reduce_all(finished, axis=0)) #Safer option
55 |
56 | # Feed last output frame as next input. outputs is [N, output_dim * r]
57 | next_inputs = outputs[:, -self._output_dim:]
58 | next_state = state
59 | return (finished, next_inputs, next_state)
60 |
61 |
62 | class TacoTrainingHelper(Helper):
63 | def __init__(self, batch_size, targets, hparams, gta, evaluating, global_step):
64 | # inputs is [N, T_in], targets is [N, T_out, D]
65 | with tf.name_scope('TacoTrainingHelper'):
66 | self._batch_size = batch_size
67 | self._output_dim = hparams.num_mels
68 | self._reduction_factor = hparams.outputs_per_step
69 | self._ratio = tf.convert_to_tensor(hparams.tacotron_teacher_forcing_ratio)
70 | self.gta = gta
71 | self.eval = evaluating
72 | self._hparams = hparams
73 | self.global_step = global_step
74 |
75 | r = self._reduction_factor
76 | # Feed every r-th target frame as input
77 | self._targets = targets[:, r-1::r, :]
78 |
79 | #Maximal sequence length
80 | self._lengths = tf.tile([tf.shape(self._targets)[1]], [self._batch_size])
81 |
82 | @property
83 | def batch_size(self):
84 | return self._batch_size
85 |
86 | @property
87 | def token_output_size(self):
88 | return self._reduction_factor
89 |
90 | @property
91 | def sample_ids_shape(self):
92 | return tf.TensorShape([])
93 |
94 | @property
95 | def sample_ids_dtype(self):
96 | return np.int32
97 |
98 | def initialize(self, name=None):
99 | #Compute teacher forcing ratio for this global step.
100 | #In GTA mode, override teacher forcing scheme to work with full teacher forcing
101 | if self.gta:
102 | self._ratio = tf.convert_to_tensor(1.) #Force GTA model to always feed ground-truth
103 | elif self.eval and self._hparams.tacotron_natural_eval:
104 | self._ratio = tf.convert_to_tensor(0.) #Force eval model to always feed predictions
105 | else:
106 | if self._hparams.tacotron_teacher_forcing_mode == 'scheduled':
107 | self._ratio = _teacher_forcing_ratio_decay(self._hparams.tacotron_teacher_forcing_init_ratio,
108 | self.global_step, self._hparams)
109 |
110 | return (tf.tile([False], [self._batch_size]), _go_frames(self._batch_size, self._output_dim))
111 |
112 | def sample(self, time, outputs, state, name=None):
113 | return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them
114 |
115 | def next_inputs(self, time, outputs, state, sample_ids, stop_token_prediction, name=None):
116 | with tf.name_scope(name or 'TacoTrainingHelper'):
117 | #synthesis stop (we let the model see paddings as we mask them when computing loss functions)
118 | finished = (time + 1 >= self._lengths)
119 |
120 | #Pick previous outputs randomly with respect to teacher forcing ratio
121 | next_inputs = tf.cond(
122 | tf.less(tf.random_uniform([], minval=0, maxval=1, dtype=tf.float32), self._ratio),
123 | lambda: self._targets[:, time, :], #Teacher-forcing: return true frame
124 | lambda: outputs[:,-self._output_dim:])
125 |
126 | #Pass on state
127 | next_state = state
128 | return (finished, next_inputs, next_state)
129 |
130 |
131 | def _go_frames(batch_size, output_dim):
132 | '''Returns all-zero frames for a given batch size and output dimension'''
133 | return tf.tile([[0.0]], [batch_size, output_dim])
134 |
135 | def _teacher_forcing_ratio_decay(init_tfr, global_step, hparams):
136 | #################################################################
137 | # Narrow Cosine Decay:
138 |
139 | # Phase 1: tfr = init
140 | # We only start teacher forcing ratio decay after 10k steps
141 |
142 | # Phase 2: tfr in ]init, final[
143 | # decay reaches its minimal value at step ~40k
144 |
145 | # Phase 3: tfr = final
146 | # clip by minimal teacher forcing ratio value (step >~ 40k)
147 | #################################################################
148 | #Pick final teacher forcing rate value
149 | if hparams.tacotron_teacher_forcing_final_ratio is not None:
150 | alpha = float(hparams.tacotron_teacher_forcing_final_ratio / hparams.tacotron_teacher_forcing_init_ratio)
151 |
152 | else:
153 | assert hparams.tacotron_teacher_forcing_decay_alpha is not None
154 | alpha = hparams.tacotron_teacher_forcing_decay_alpha
155 |
156 | #Compute natural cosine decay
157 | tfr = tf.train.cosine_decay(init_tfr,
158 | global_step=global_step - hparams.tacotron_teacher_forcing_start_decay, #tfr ~= init at step 10k
159 | decay_steps=hparams.tacotron_teacher_forcing_decay_steps, #tfr ~= final at step ~40k
160 | alpha=alpha, #tfr = alpha% of init_tfr as final value
161 | name='tfr_cosine_decay')
162 |
163 | #force teacher forcing ratio to take initial value when global step < start decay step.
164 | narrow_tfr = tf.cond(
165 | tf.less(global_step, tf.convert_to_tensor(hparams.tacotron_teacher_forcing_start_decay)),
166 | lambda: tf.convert_to_tensor(init_tfr),
167 | lambda: tfr)
168 |
169 | return narrow_tfr
--------------------------------------------------------------------------------
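The _teacher_forcing_ratio_decay schedule above is a plain cosine decay shifted to start at tacotron_teacher_forcing_start_decay and clamped to the initial ratio before that point. A minimal NumPy sketch (not part of the repo; the step constants and final ratio below are illustrative, not the repo's hparam defaults):

import numpy as np

def narrow_cosine_tfr(step, init_tfr=1.0, final_tfr=0.0, start_decay=10000, decay_steps=30000):
    # Mirrors tf.train.cosine_decay applied to (step - start_decay), clipped to [0, decay_steps]
    if step < start_decay:
        return init_tfr
    alpha = final_tfr / init_tfr
    s = min(step - start_decay, decay_steps)
    cosine = 0.5 * (1. + np.cos(np.pi * s / decay_steps))
    return init_tfr * ((1. - alpha) * cosine + alpha)

for step in (0, 10000, 25000, 40000, 100000):
    print(step, round(narrow_cosine_tfr(step), 3))  # 1.0, 1.0, 0.5, 0.0, 0.0
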
/tacotron/synthesize.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import re
4 | import time
5 | from time import sleep
6 |
7 | import tensorflow as tf
8 | from hparams import hparams, hparams_debug_string
9 | from infolog import log
10 | from tacotron.synthesizer import Synthesizer
11 | from tqdm import tqdm
12 |
13 |
14 | def generate_fast(model, text):
15 | model.synthesize([text], None, None, None, None)
16 |
17 |
18 | def run_live(args, checkpoint_path, hparams):
19 | #Log to Terminal without keeping any records in files
20 | log(hparams_debug_string())
21 | synth = Synthesizer()
22 | synth.load(checkpoint_path, hparams)
23 |
24 | #Generate fast greeting message
25 | greetings = 'Hello, Welcome to the Live testing tool. Please type a message and I will try to read it!'
26 | log(greetings)
27 | generate_fast(synth, greetings)
28 |
29 | #Interaction loop
30 | while True:
31 | try:
32 | text = input()
33 | generate_fast(synth, text)
34 |
35 | except KeyboardInterrupt:
36 | leave = 'Thank you for testing our features. see you soon.'
37 | log(leave)
38 | generate_fast(synth, leave)
39 | sleep(2)
40 | break
41 |
42 | def run_eval(args, checkpoint_path, output_dir, hparams, sentences):
43 | eval_dir = os.path.join(output_dir, 'eval')
44 | log_dir = os.path.join(output_dir, 'logs-eval')
45 |
46 | if args.model == 'Tacotron-2':
47 | assert os.path.normpath(eval_dir) == os.path.normpath(args.mels_dir)
48 |
49 | #Create output path if it doesn't exist
50 | os.makedirs(eval_dir, exist_ok=True)
51 | os.makedirs(log_dir, exist_ok=True)
52 | os.makedirs(os.path.join(log_dir, 'wavs'), exist_ok=True)
53 | os.makedirs(os.path.join(log_dir, 'plots'), exist_ok=True)
54 |
55 | log(hparams_debug_string())
56 | synth = Synthesizer()
57 | synth.load(checkpoint_path, hparams)
58 |
59 | #Set inputs batch wise
60 | sentences = [sentences[i: i+hparams.tacotron_synthesis_batch_size] for i in range(0, len(sentences), hparams.tacotron_synthesis_batch_size)]
61 |
62 | log('Starting Synthesis')
63 | with open(os.path.join(eval_dir, 'map.txt'), 'w') as file:
64 | for i, texts in enumerate(tqdm(sentences)):
65 | start = time.time()
66 | basenames = ['batch_{}_sentence_{}'.format(i, j) for j in range(len(texts))]
67 | mel_filenames, speaker_ids = synth.synthesize(texts, basenames, eval_dir, log_dir, None)
68 |
69 | for elems in zip(texts, mel_filenames, speaker_ids):
70 | file.write('|'.join([str(x) for x in elems]) + '\n')
71 | log('synthesized mel spectrograms at {}'.format(eval_dir))
72 | return eval_dir
73 |
74 | def run_synthesis(args, checkpoint_path, output_dir, hparams):
75 | GTA = (args.GTA == 'True')
76 | if GTA:
77 | synth_dir = os.path.join(output_dir, 'gta')
78 |
79 | #Create output path if it doesn't exist
80 | os.makedirs(synth_dir, exist_ok=True)
81 | else:
82 | synth_dir = os.path.join(output_dir, 'natural')
83 |
84 | #Create output path if it doesn't exist
85 | os.makedirs(synth_dir, exist_ok=True)
86 |
87 |
88 | metadata_filename = os.path.join(args.input_dir, 'train.txt')
89 | log(hparams_debug_string())
90 | synth = Synthesizer()
91 | synth.load(checkpoint_path, hparams, gta=GTA)
92 | with open(metadata_filename, encoding='utf-8') as f:
93 | metadata = [line.strip().split('|') for line in f]
94 | frame_shift_ms = hparams.hop_size / hparams.sample_rate
95 | hours = sum([int(x[4]) for x in metadata]) * frame_shift_ms / (3600)
96 | log('Loaded metadata for {} examples ({:.2f} hours)'.format(len(metadata), hours))
97 |
98 | #Set inputs batch wise
99 | metadata = [metadata[i: i+hparams.tacotron_synthesis_batch_size] for i in range(0, len(metadata), hparams.tacotron_synthesis_batch_size)]
100 |
101 | log('Starting Synthesis')
102 | mel_dir = os.path.join(args.input_dir, 'mels')
103 | wav_dir = os.path.join(args.input_dir, 'audio')
104 | with open(os.path.join(synth_dir, 'map.txt'), 'w') as file:
105 | for i, meta in enumerate(tqdm(metadata)):
106 | texts = [m[5] for m in meta]
107 | mel_filenames = [os.path.join(mel_dir, m[1]) for m in meta]
108 | wav_filenames = [os.path.join(wav_dir, m[0]) for m in meta]
109 | basenames = [os.path.basename(m).replace('.npy', '').replace('mel-', '') for m in mel_filenames]
110 | mel_output_filenames, speaker_ids = synth.synthesize(texts, basenames, synth_dir, None, mel_filenames)
111 |
112 | for elems in zip(wav_filenames, mel_filenames, mel_output_filenames, speaker_ids, texts):
113 | file.write('|'.join([str(x) for x in elems]) + '\n')
114 | log('synthesized mel spectrograms at {}'.format(synth_dir))
115 | return os.path.join(synth_dir, 'map.txt')
116 |
117 | def tacotron_synthesize(args, hparams, checkpoint, sentences=None):
118 | output_dir = 'tacotron_' + args.output_dir
119 |
120 | try:
121 | checkpoint_path = tf.train.get_checkpoint_state(checkpoint).model_checkpoint_path
122 | log('loaded model at {}'.format(checkpoint_path))
123 | except:
124 | raise RuntimeError('Failed to load checkpoint at {}'.format(checkpoint))
125 |
126 | if hparams.tacotron_synthesis_batch_size < hparams.tacotron_num_gpus:
127 | raise ValueError('Defined synthesis batch size {} is smaller than minimum required {} (num_gpus)! Please verify your synthesis batch size choice.'.format(
128 | hparams.tacotron_synthesis_batch_size, hparams.tacotron_num_gpus))
129 |
130 | if hparams.tacotron_synthesis_batch_size % hparams.tacotron_num_gpus != 0:
131 | raise ValueError('Defined synthesis batch size {} is not a multiple of {} (num_gpus)! Please verify your synthesis batch size choice!'.format(
132 | hparams.tacotron_synthesis_batch_size, hparams.tacotron_num_gpus))
133 |
134 | if args.mode == 'eval':
135 | return run_eval(args, checkpoint_path, output_dir, hparams, sentences)
136 | elif args.mode == 'synthesis':
137 | return run_synthesis(args, checkpoint_path, output_dir, hparams)
138 | else:
139 | run_live(args, checkpoint_path, hparams)
140 |
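
For reference, run_eval above writes an eval_dir/map.txt with one 'text|mel_filename|speaker_id' line per synthesized sentence. A minimal standalone sketch (not part of the repo) of parsing that file downstream; the path assumes the default --output_dir of 'output':

	# Minimal sketch (assumption: default output layout 'tacotron_output/eval').
	import os

	def read_eval_map(eval_dir='tacotron_output/eval'):
		entries = []
		with open(os.path.join(eval_dir, 'map.txt'), encoding='utf-8') as f:
			for line in f:
				text, mel_filename, speaker_id = line.rstrip('\n').split('|')
				entries.append({'text': text, 'mel': mel_filename, 'speaker_id': speaker_id})
		return entries
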
--------------------------------------------------------------------------------
/tacotron/synthesizer.py:
--------------------------------------------------------------------------------
1 | import os
2 | import platform
3 | from datetime import datetime
4 |
5 | import numpy as np
6 | import pyaudio
7 | import sounddevice as sd
8 | import tensorflow as tf
9 | from datasets import audio
10 | from infolog import log
11 | from librosa import effects
12 | from tacotron.models import create_model
13 | from tacotron.utils import plot
14 | from tacotron.utils.text import text_to_sequence
15 |
16 |
17 | class Synthesizer:
18 | def load(self, checkpoint_path, hparams, gta=False, model_name='Tacotron'):
19 | log('Constructing model: %s' % model_name)
20 | #Force the batch size to be known in order to use attention masking in batch synthesis
21 | inputs = tf.placeholder(tf.int32, (None, None), name='inputs')
22 | 		input_lengths = tf.placeholder(tf.int32, (None,), name='input_lengths')
23 | targets = tf.placeholder(tf.float32, (None, None, hparams.num_mels), name='mel_targets')
24 | split_infos = tf.placeholder(tf.int32, shape=(hparams.tacotron_num_gpus, None), name='split_infos')
25 | with tf.variable_scope('Tacotron_model', reuse=tf.AUTO_REUSE) as scope:
26 | self.model = create_model(model_name, hparams)
27 | if gta:
28 | self.model.initialize(inputs, input_lengths, targets, gta=gta, split_infos=split_infos)
29 | else:
30 | self.model.initialize(inputs, input_lengths, split_infos=split_infos)
31 |
32 | self.mel_outputs = self.model.tower_mel_outputs
33 | self.linear_outputs = self.model.tower_linear_outputs if (hparams.predict_linear and not gta) else None
34 | self.alignments = self.model.tower_alignments
35 | self.stop_token_prediction = self.model.tower_stop_token_prediction
36 | self.targets = targets
37 |
38 | if hparams.GL_on_GPU:
39 | self.GLGPU_mel_inputs = tf.placeholder(tf.float32, (None, hparams.num_mels), name='GLGPU_mel_inputs')
40 | self.GLGPU_lin_inputs = tf.placeholder(tf.float32, (None, hparams.num_freq), name='GLGPU_lin_inputs')
41 |
42 | self.GLGPU_mel_outputs = audio.inv_mel_spectrogram_tensorflow(self.GLGPU_mel_inputs, hparams)
43 | self.GLGPU_lin_outputs = audio.inv_linear_spectrogram_tensorflow(self.GLGPU_lin_inputs, hparams)
44 |
45 | self.gta = gta
46 | self._hparams = hparams
47 | 		#pad input sequences with 0 (the id of the '_' padding symbol)
48 | 		self._pad = 0
49 | 		#explicitly set the padding to a value that doesn't originally exist in the spectrogram
50 | 		#to avoid any possible conflicts, without affecting the output range of the model too much
51 | if hparams.symmetric_mels:
52 | self._target_pad = -hparams.max_abs_value
53 | else:
54 | self._target_pad = 0.
55 |
56 | self.inputs = inputs
57 | self.input_lengths = input_lengths
58 | self.targets = targets
59 | self.split_infos = split_infos
60 |
61 | log('Loading checkpoint: %s' % checkpoint_path)
62 | #Memory allocation on the GPUs as needed
63 | config = tf.ConfigProto()
64 | config.gpu_options.allow_growth = True
65 | config.allow_soft_placement = True
66 |
67 | self.session = tf.Session(config=config)
68 | self.session.run(tf.global_variables_initializer())
69 |
70 | saver = tf.train.Saver()
71 | saver.restore(self.session, checkpoint_path)
72 |
73 |
74 | def synthesize(self, texts, basenames, out_dir, log_dir, mel_filenames):
75 | hparams = self._hparams
76 | cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
77 | #[-max, max] or [0,max]
78 | T2_output_range = (-hparams.max_abs_value, hparams.max_abs_value) if hparams.symmetric_mels else (0, hparams.max_abs_value)
79 |
80 | 		#Repeat the last sample until the number of samples is divisible by the synthesis batch size (handles the final, smaller batch)
81 | while len(texts) % hparams.tacotron_synthesis_batch_size != 0:
82 | texts.append(texts[-1])
83 | basenames.append(basenames[-1])
84 | if mel_filenames is not None:
85 | mel_filenames.append(mel_filenames[-1])
86 |
87 | assert 0 == len(texts) % self._hparams.tacotron_num_gpus
88 | seqs = [np.asarray(text_to_sequence(text, cleaner_names)) for text in texts]
89 | input_lengths = [len(seq) for seq in seqs]
90 |
91 | size_per_device = len(seqs) // self._hparams.tacotron_num_gpus
92 |
93 | #Pad inputs according to each GPU max length
94 | input_seqs = None
95 | split_infos = []
96 | for i in range(self._hparams.tacotron_num_gpus):
97 | device_input = seqs[size_per_device*i: size_per_device*(i+1)]
98 | device_input, max_seq_len = self._prepare_inputs(device_input)
99 | input_seqs = np.concatenate((input_seqs, device_input), axis=1) if input_seqs is not None else device_input
100 | split_infos.append([max_seq_len, 0, 0, 0])
101 |
102 | feed_dict = {
103 | self.inputs: input_seqs,
104 | self.input_lengths: np.asarray(input_lengths, dtype=np.int32),
105 | }
106 |
107 | if self.gta:
108 | np_targets = [np.load(mel_filename) for mel_filename in mel_filenames]
109 | target_lengths = [len(np_target) for np_target in np_targets]
110 |
111 | #pad targets according to each GPU max length
112 | target_seqs = None
113 | for i in range(self._hparams.tacotron_num_gpus):
114 | device_target = np_targets[size_per_device*i: size_per_device*(i+1)]
115 | device_target, max_target_len = self._prepare_targets(device_target, self._hparams.outputs_per_step)
116 | target_seqs = np.concatenate((target_seqs, device_target), axis=1) if target_seqs is not None else device_target
117 | 				split_infos[i][1] = max_target_len #Not strictly needed here; kept in case of future development
118 |
119 | feed_dict[self.targets] = target_seqs
120 | assert len(np_targets) == len(texts)
121 |
122 | feed_dict[self.split_infos] = np.asarray(split_infos, dtype=np.int32)
123 |
124 | if self.gta or not hparams.predict_linear:
125 | mels, alignments, stop_tokens = self.session.run([self.mel_outputs, self.alignments, self.stop_token_prediction], feed_dict=feed_dict)
126 |
127 | #Linearize outputs (n_gpus -> 1D)
128 | mels = [mel for gpu_mels in mels for mel in gpu_mels]
129 | alignments = [align for gpu_aligns in alignments for align in gpu_aligns]
130 | stop_tokens = [token for gpu_token in stop_tokens for token in gpu_token]
131 |
132 | if not self.gta:
133 | #Natural batch synthesis
134 | #Get Mel lengths for the entire batch from stop_tokens predictions
135 | target_lengths = self._get_output_lengths(stop_tokens)
136 |
137 | #Take off the batch wise padding
138 | mels = [mel[:target_length, :] for mel, target_length in zip(mels, target_lengths)]
139 | assert len(mels) == len(texts)
140 |
141 | else:
142 | linears, mels, alignments, stop_tokens = self.session.run([self.linear_outputs, self.mel_outputs, self.alignments, self.stop_token_prediction], feed_dict=feed_dict)
143 |
144 | #Linearize outputs (1D arrays)
145 | linears = [linear for gpu_linear in linears for linear in gpu_linear]
146 | mels = [mel for gpu_mels in mels for mel in gpu_mels]
147 | alignments = [align for gpu_aligns in alignments for align in gpu_aligns]
148 | stop_tokens = [token for gpu_token in stop_tokens for token in gpu_token]
149 |
150 | #Natural batch synthesis
151 | #Get Mel/Linear lengths for the entire batch from stop_tokens predictions
152 | target_lengths = self._get_output_lengths(stop_tokens)
153 |
154 | #Take off the batch wise padding
155 | mels = [mel[:target_length, :] for mel, target_length in zip(mels, target_lengths)]
156 | linears = [linear[:target_length, :] for linear, target_length in zip(linears, target_lengths)]
157 | linears = np.clip(linears, T2_output_range[0], T2_output_range[1])
158 | assert len(mels) == len(linears) == len(texts)
159 |
160 | mels = np.clip(mels, T2_output_range[0], T2_output_range[1])
161 |
162 | if basenames is None:
163 | #Generate wav and read it
164 | if hparams.GL_on_GPU:
165 | wav = self.session.run(self.GLGPU_mel_outputs, feed_dict={self.GLGPU_mel_inputs: mels[0]})
166 | wav = audio.inv_preemphasis(wav, hparams.preemphasis, hparams.preemphasize)
167 | else:
168 | wav = audio.inv_mel_spectrogram(mels[0].T, hparams)
169 | audio.save_wav(wav, 'temp.wav', sr=hparams.sample_rate) #Find a better way
170 |
171 | if platform.system() == 'Linux':
172 | #Linux wav reader
173 | os.system('aplay temp.wav')
174 |
175 | elif platform.system() == 'Windows':
176 | #windows wav reader
177 | os.system('start /min mplay32 /play /close temp.wav')
178 |
179 | else:
180 | raise RuntimeError('Your OS type is not supported yet, please add it to "tacotron/synthesizer.py, line-165" and feel free to make a Pull Request ;) Thanks!')
181 |
182 | return
183 |
184 |
185 | saved_mels_paths = []
186 | speaker_ids = []
187 | for i, mel in enumerate(mels):
188 | #Get speaker id for global conditioning (only used with GTA generally)
189 | if hparams.gin_channels > 0:
190 | 				raise RuntimeError('Please set the speaker_id rule below in tacotron/synthesizer.py (Synthesizer.synthesize) to allow for global condition usage later.')
191 | speaker_id = '' #set the rule to determine speaker id. By using the file basename maybe? (basenames are inside "basenames" variable)
192 | speaker_ids.append(speaker_id) #finish by appending the speaker id. (allows for different speakers per batch if your model is multispeaker)
193 | else:
194 | speaker_id = ''
195 | speaker_ids.append(speaker_id)
196 |
197 | # Write the spectrogram to disk
198 | # Note: outputs mel-spectrogram files and target ones have same names, just different folders
199 | mel_filename = os.path.join(out_dir, 'mel-{}.npy'.format(basenames[i]))
200 | np.save(mel_filename, mel, allow_pickle=False)
201 | saved_mels_paths.append(mel_filename)
202 |
203 | if log_dir is not None:
204 | #save wav (mel -> wav)
205 | if hparams.GL_on_GPU:
206 | wav = self.session.run(self.GLGPU_mel_outputs, feed_dict={self.GLGPU_mel_inputs: mel})
207 | wav = audio.inv_preemphasis(wav, hparams.preemphasis, hparams.preemphasize)
208 | else:
209 | wav = audio.inv_mel_spectrogram(mel.T, hparams)
210 | audio.save_wav(wav, os.path.join(log_dir, 'wavs/wav-{}-mel.wav'.format(basenames[i])), sr=hparams.sample_rate)
211 |
212 | #save alignments
213 | plot.plot_alignment(alignments[i], os.path.join(log_dir, 'plots/alignment-{}.png'.format(basenames[i])),
214 | title='{}'.format(texts[i]), split_title=True, max_len=target_lengths[i])
215 |
216 | #save mel spectrogram plot
217 | plot.plot_spectrogram(mel, os.path.join(log_dir, 'plots/mel-{}.png'.format(basenames[i])),
218 | title='{}'.format(texts[i]), split_title=True)
219 |
220 | if hparams.predict_linear:
221 | #save wav (linear -> wav)
222 | if hparams.GL_on_GPU:
223 | wav = self.session.run(self.GLGPU_lin_outputs, feed_dict={self.GLGPU_lin_inputs: linears[i]})
224 | wav = audio.inv_preemphasis(wav, hparams.preemphasis, hparams.preemphasize)
225 | else:
226 | wav = audio.inv_linear_spectrogram(linears[i].T, hparams)
227 | audio.save_wav(wav, os.path.join(log_dir, 'wavs/wav-{}-linear.wav'.format(basenames[i])), sr=hparams.sample_rate)
228 |
229 | #save linear spectrogram plot
230 | plot.plot_spectrogram(linears[i], os.path.join(log_dir, 'plots/linear-{}.png'.format(basenames[i])),
231 | title='{}'.format(texts[i]), split_title=True, auto_aspect=True)
232 |
233 | return saved_mels_paths, speaker_ids
234 |
235 | def _round_up(self, x, multiple):
236 | remainder = x % multiple
237 | return x if remainder == 0 else x + multiple - remainder
238 |
239 | def _prepare_inputs(self, inputs):
240 | max_len = max([len(x) for x in inputs])
241 | return np.stack([self._pad_input(x, max_len) for x in inputs]), max_len
242 |
243 | def _pad_input(self, x, length):
244 | return np.pad(x, (0, length - x.shape[0]), mode='constant', constant_values=self._pad)
245 |
246 | def _prepare_targets(self, targets, alignment):
247 | max_len = max([len(t) for t in targets])
248 | data_len = self._round_up(max_len, alignment)
249 | return np.stack([self._pad_target(t, data_len) for t in targets]), data_len
250 |
251 | def _pad_target(self, t, length):
252 | return np.pad(t, [(0, length - t.shape[0]), (0, 0)], mode='constant', constant_values=self._target_pad)
253 |
254 | def _get_output_lengths(self, stop_tokens):
255 | 		#Determine each mel length from the stop token predictions. (len = first occurrence of 1 in stop_tokens, row wise)
256 | output_lengths = [row.index(1) if 1 in row else len(row) for row in np.round(stop_tokens).tolist()]
257 | return output_lengths
258 |
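
The natural-synthesis branch above trims the batch padding with _get_output_lengths, which takes each mel's length as the first frame where the rounded stop-token prediction reaches 1 (or the full length if it never does). A small self-contained sketch of the same rule on made-up values:

	# Self-contained illustration of the stop-token length rule; values are made up.
	import numpy as np

	stop_tokens = np.array([
		[0.1, 0.2, 0.4, 0.9, 0.95],   # first rounded 1 at index 3 -> length 3
		[0.0, 0.1, 0.2, 0.3, 0.4],    # never reaches 1 -> full length 5
	])

	lengths = [row.index(1) if 1 in row else len(row) for row in np.round(stop_tokens).tolist()]
	print(lengths)  # [3, 5]
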
--------------------------------------------------------------------------------
/tacotron/train.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import subprocess
4 | import time
5 | import traceback
6 | from datetime import datetime
7 |
8 | import infolog
9 | import numpy as np
10 | import tensorflow as tf
11 | from datasets import audio
12 | from hparams import hparams_debug_string
13 | from tacotron.feeder import Feeder
14 | from tacotron.models import create_model
15 | from tacotron.utils import ValueWindow, plot
16 | from tacotron.utils.text import sequence_to_text
17 | from tacotron.utils.symbols import symbols
18 | from tqdm import tqdm
19 |
20 | log = infolog.log
21 |
22 |
23 | def time_string():
24 | return datetime.now().strftime('%Y-%m-%d %H:%M')
25 |
26 | def add_embedding_stats(summary_writer, embedding_names, paths_to_meta, checkpoint_path):
27 | #Create tensorboard projector
28 | config = tf.contrib.tensorboard.plugins.projector.ProjectorConfig()
29 | config.model_checkpoint_path = checkpoint_path
30 |
31 | for embedding_name, path_to_meta in zip(embedding_names, paths_to_meta):
32 | #Initialize config
33 | embedding = config.embeddings.add()
34 | 		#Specify the embedding variable and the metadata
35 | embedding.tensor_name = embedding_name
36 | embedding.metadata_path = path_to_meta
37 |
38 | 	#Register the embeddings with the TensorBoard projector for visualization
39 | tf.contrib.tensorboard.plugins.projector.visualize_embeddings(summary_writer, config)
40 |
41 | def add_train_stats(model, hparams):
42 | with tf.variable_scope('stats') as scope:
43 | for i in range(hparams.tacotron_num_gpus):
44 | tf.summary.histogram('mel_outputs %d' % i, model.tower_mel_outputs[i])
45 | tf.summary.histogram('mel_targets %d' % i, model.tower_mel_targets[i])
46 | tf.summary.scalar('before_loss', model.before_loss)
47 | tf.summary.scalar('after_loss', model.after_loss)
48 |
49 | if hparams.predict_linear:
50 | tf.summary.scalar('linear_loss', model.linear_loss)
51 | for i in range(hparams.tacotron_num_gpus):
52 | tf.summary.histogram('linear_outputs %d' % i, model.tower_linear_outputs[i])
53 | tf.summary.histogram('linear_targets %d' % i, model.tower_linear_targets[i])
54 |
55 | tf.summary.scalar('regularization_loss', model.regularization_loss)
56 | tf.summary.scalar('stop_token_loss', model.stop_token_loss)
57 | tf.summary.scalar('loss', model.loss)
58 | tf.summary.scalar('learning_rate', model.learning_rate) #Control learning rate decay speed
59 | if hparams.tacotron_teacher_forcing_mode == 'scheduled':
60 | tf.summary.scalar('teacher_forcing_ratio', model.ratio) #Control teacher forcing ratio decay when mode = 'scheduled'
61 | gradient_norms = [tf.norm(grad) for grad in model.gradients]
62 | tf.summary.histogram('gradient_norm', gradient_norms)
63 | tf.summary.scalar('max_gradient_norm', tf.reduce_max(gradient_norms)) #visualize gradients (in case of explosion)
64 | return tf.summary.merge_all()
65 |
66 | def add_eval_stats(summary_writer, step, linear_loss, before_loss, after_loss, stop_token_loss, loss):
67 | values = [
68 | tf.Summary.Value(tag='Tacotron_eval_model/eval_stats/eval_before_loss', simple_value=before_loss),
69 | tf.Summary.Value(tag='Tacotron_eval_model/eval_stats/eval_after_loss', simple_value=after_loss),
70 | tf.Summary.Value(tag='Tacotron_eval_model/eval_stats/stop_token_loss', simple_value=stop_token_loss),
71 | tf.Summary.Value(tag='Tacotron_eval_model/eval_stats/eval_loss', simple_value=loss),
72 | ]
73 | if linear_loss is not None:
74 | values.append(tf.Summary.Value(tag='Tacotron_eval_model/eval_stats/eval_linear_loss', simple_value=linear_loss))
75 | test_summary = tf.Summary(value=values)
76 | summary_writer.add_summary(test_summary, step)
77 |
78 | def model_train_mode(args, feeder, hparams, global_step):
79 | with tf.variable_scope('Tacotron_model', reuse=tf.AUTO_REUSE) as scope:
80 | model_name = None
81 | if args.model == 'Tacotron-2':
82 | model_name = 'Tacotron'
83 | model = create_model(model_name or args.model, hparams)
84 | if hparams.predict_linear:
85 | model.initialize(feeder.inputs, feeder.input_lengths, feeder.mel_targets, feeder.token_targets, linear_targets=feeder.linear_targets,
86 | targets_lengths=feeder.targets_lengths, global_step=global_step,
87 | is_training=True, split_infos=feeder.split_infos)
88 | else:
89 | model.initialize(feeder.inputs, feeder.input_lengths, feeder.mel_targets, feeder.token_targets,
90 | targets_lengths=feeder.targets_lengths, global_step=global_step,
91 | is_training=True, split_infos=feeder.split_infos)
92 | model.add_loss()
93 | model.add_optimizer(global_step)
94 | stats = add_train_stats(model, hparams)
95 | return model, stats
96 |
97 | def model_test_mode(args, feeder, hparams, global_step):
98 | with tf.variable_scope('Tacotron_model', reuse=tf.AUTO_REUSE) as scope:
99 | model_name = None
100 | if args.model == 'Tacotron-2':
101 | model_name = 'Tacotron'
102 | model = create_model(model_name or args.model, hparams)
103 | if hparams.predict_linear:
104 | model.initialize(feeder.eval_inputs, feeder.eval_input_lengths, feeder.eval_mel_targets, feeder.eval_token_targets,
105 | linear_targets=feeder.eval_linear_targets, targets_lengths=feeder.eval_targets_lengths, global_step=global_step,
106 | is_training=False, is_evaluating=True, split_infos=feeder.eval_split_infos)
107 | else:
108 | model.initialize(feeder.eval_inputs, feeder.eval_input_lengths, feeder.eval_mel_targets, feeder.eval_token_targets,
109 | targets_lengths=feeder.eval_targets_lengths, global_step=global_step, is_training=False, is_evaluating=True,
110 | split_infos=feeder.eval_split_infos)
111 | model.add_loss()
112 | return model
113 |
114 | def train(log_dir, args, hparams):
115 | save_dir = os.path.join(log_dir, 'taco_pretrained')
116 | plot_dir = os.path.join(log_dir, 'plots')
117 | wav_dir = os.path.join(log_dir, 'wavs')
118 | mel_dir = os.path.join(log_dir, 'mel-spectrograms')
119 | eval_dir = os.path.join(log_dir, 'eval-dir')
120 | eval_plot_dir = os.path.join(eval_dir, 'plots')
121 | eval_wav_dir = os.path.join(eval_dir, 'wavs')
122 | tensorboard_dir = os.path.join(log_dir, 'tacotron_events')
123 | meta_folder = os.path.join(log_dir, 'metas')
124 | os.makedirs(save_dir, exist_ok=True)
125 | os.makedirs(plot_dir, exist_ok=True)
126 | os.makedirs(wav_dir, exist_ok=True)
127 | os.makedirs(mel_dir, exist_ok=True)
128 | os.makedirs(eval_dir, exist_ok=True)
129 | os.makedirs(eval_plot_dir, exist_ok=True)
130 | os.makedirs(eval_wav_dir, exist_ok=True)
131 | os.makedirs(tensorboard_dir, exist_ok=True)
132 | os.makedirs(meta_folder, exist_ok=True)
133 |
134 | checkpoint_path = os.path.join(save_dir, 'tacotron_model.ckpt')
135 | input_path = os.path.join(args.base_dir, args.tacotron_input)
136 |
137 | if hparams.predict_linear:
138 | linear_dir = os.path.join(log_dir, 'linear-spectrograms')
139 | os.makedirs(linear_dir, exist_ok=True)
140 |
141 | log('Checkpoint path: {}'.format(checkpoint_path))
142 | log('Loading training data from: {}'.format(input_path))
143 | log('Using model: {}'.format(args.model))
144 | log(hparams_debug_string())
145 |
146 | #Start by setting a seed for repeatability
147 | tf.set_random_seed(hparams.tacotron_random_seed)
148 |
149 | #Set up data feeder
150 | coord = tf.train.Coordinator()
151 | with tf.variable_scope('datafeeder') as scope:
152 | feeder = Feeder(coord, input_path, hparams)
153 |
154 | #Set up model:
155 | global_step = tf.Variable(0, name='global_step', trainable=False)
156 | model, stats = model_train_mode(args, feeder, hparams, global_step)
157 | eval_model = model_test_mode(args, feeder, hparams, global_step)
158 |
159 | #Embeddings metadata
160 | char_embedding_meta = os.path.join(meta_folder, 'CharacterEmbeddings.tsv')
161 | if not os.path.isfile(char_embedding_meta):
162 | with open(char_embedding_meta, 'w', encoding='utf-8') as f:
163 | for symbol in symbols:
164 | if symbol == ' ':
165 | symbol = '\\s' #For visual purposes, swap space with \s
166 |
167 | f.write('{}\n'.format(symbol))
168 |
169 | char_embedding_meta = char_embedding_meta.replace(log_dir, '..')
170 |
171 | #Potential Griffin-Lim GPU setup
172 | if hparams.GL_on_GPU:
173 | GLGPU_mel_inputs = tf.placeholder(tf.float32, (None, hparams.num_mels), name='GLGPU_mel_inputs')
174 | GLGPU_lin_inputs = tf.placeholder(tf.float32, (None, hparams.num_freq), name='GLGPU_lin_inputs')
175 |
176 | GLGPU_mel_outputs = audio.inv_mel_spectrogram_tensorflow(GLGPU_mel_inputs, hparams)
177 | GLGPU_lin_outputs = audio.inv_linear_spectrogram_tensorflow(GLGPU_lin_inputs, hparams)
178 |
179 | #Book keeping
180 | step = 0
181 | time_window = ValueWindow(100)
182 | loss_window = ValueWindow(100)
183 | saver = tf.train.Saver(max_to_keep=20)
184 |
185 | log('Tacotron training set to a maximum of {} steps'.format(args.tacotron_train_steps))
186 |
187 | #Memory allocation on the GPU as needed
188 | config = tf.ConfigProto()
189 | config.gpu_options.allow_growth = True
190 | config.allow_soft_placement = True
191 |
192 | #Train
193 | with tf.Session(config=config) as sess:
194 | try:
195 | summary_writer = tf.summary.FileWriter(tensorboard_dir, sess.graph)
196 |
197 | sess.run(tf.global_variables_initializer())
198 |
199 | #saved model restoring
200 | if args.restore:
201 | # Restore saved model if the user requested it, default = True
202 | try:
203 | checkpoint_state = tf.train.get_checkpoint_state(save_dir)
204 |
205 | if (checkpoint_state and checkpoint_state.model_checkpoint_path):
206 | log('Loading checkpoint {}'.format(checkpoint_state.model_checkpoint_path), slack=True)
207 | saver.restore(sess, checkpoint_state.model_checkpoint_path)
208 |
209 | else:
210 | log('No model to load at {}'.format(save_dir), slack=True)
211 | saver.save(sess, checkpoint_path, global_step=global_step)
212 |
213 | except tf.errors.OutOfRangeError as e:
214 | log('Cannot restore checkpoint: {}'.format(e), slack=True)
215 | else:
216 | log('Starting new training!', slack=True)
217 | saver.save(sess, checkpoint_path, global_step=global_step)
218 |
219 | #initializing feeder
220 | feeder.start_threads(sess)
221 |
222 | #Training loop
223 | while not coord.should_stop() and step < args.tacotron_train_steps:
224 | start_time = time.time()
225 | step, loss, opt = sess.run([global_step, model.loss, model.optimize])
226 | time_window.append(time.time() - start_time)
227 | loss_window.append(loss)
228 | message = 'Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]'.format(
229 | step, time_window.average, loss, loss_window.average)
230 | log(message, end='\r', slack=(step % args.checkpoint_interval == 0))
231 |
232 | if np.isnan(loss) or loss > 100.:
233 | log('Loss exploded to {:.5f} at step {}'.format(loss, step))
234 | raise Exception('Loss exploded')
235 |
236 | if step % args.summary_interval == 0:
237 | log('\nWriting summary at step {}'.format(step))
238 | summary_writer.add_summary(sess.run(stats), step)
239 |
240 | if step % args.eval_interval == 0:
241 | #Run eval and save eval stats
242 | log('\nRunning evaluation at step {}'.format(step))
243 |
244 | eval_losses = []
245 | before_losses = []
246 | after_losses = []
247 | stop_token_losses = []
248 | linear_losses = []
249 | linear_loss = None
250 |
251 | if hparams.predict_linear:
252 | for i in tqdm(range(feeder.test_steps)):
253 | eloss, before_loss, after_loss, stop_token_loss, linear_loss, mel_p, mel_t, t_len, align, lin_p, lin_t = sess.run([
254 | eval_model.tower_loss[0], eval_model.tower_before_loss[0], eval_model.tower_after_loss[0],
255 | eval_model.tower_stop_token_loss[0], eval_model.tower_linear_loss[0], eval_model.tower_mel_outputs[0][0],
256 | eval_model.tower_mel_targets[0][0], eval_model.tower_targets_lengths[0][0],
257 | eval_model.tower_alignments[0][0], eval_model.tower_linear_outputs[0][0],
258 | eval_model.tower_linear_targets[0][0],
259 | ])
260 | eval_losses.append(eloss)
261 | before_losses.append(before_loss)
262 | after_losses.append(after_loss)
263 | stop_token_losses.append(stop_token_loss)
264 | linear_losses.append(linear_loss)
265 | linear_loss = sum(linear_losses) / len(linear_losses)
266 |
267 | if hparams.GL_on_GPU:
268 | wav = sess.run(GLGPU_lin_outputs, feed_dict={GLGPU_lin_inputs: lin_p})
269 | wav = audio.inv_preemphasis(wav, hparams.preemphasis, hparams.preemphasize)
270 | else:
271 | wav = audio.inv_linear_spectrogram(lin_p.T, hparams)
272 | audio.save_wav(wav, os.path.join(eval_wav_dir, 'step-{}-eval-wave-from-linear.wav'.format(step)), sr=hparams.sample_rate)
273 |
274 | else:
275 | for i in tqdm(range(feeder.test_steps)):
276 | eloss, before_loss, after_loss, stop_token_loss, mel_p, mel_t, t_len, align = sess.run([
277 | eval_model.tower_loss[0], eval_model.tower_before_loss[0], eval_model.tower_after_loss[0],
278 | eval_model.tower_stop_token_loss[0], eval_model.tower_mel_outputs[0][0], eval_model.tower_mel_targets[0][0],
279 | eval_model.tower_targets_lengths[0][0], eval_model.tower_alignments[0][0]
280 | ])
281 | eval_losses.append(eloss)
282 | before_losses.append(before_loss)
283 | after_losses.append(after_loss)
284 | stop_token_losses.append(stop_token_loss)
285 |
286 | eval_loss = sum(eval_losses) / len(eval_losses)
287 | before_loss = sum(before_losses) / len(before_losses)
288 | after_loss = sum(after_losses) / len(after_losses)
289 | stop_token_loss = sum(stop_token_losses) / len(stop_token_losses)
290 |
291 | log('Saving eval log to {}..'.format(eval_dir))
292 | #Save some log to monitor model improvement on same unseen sequence
293 | if hparams.GL_on_GPU:
294 | wav = sess.run(GLGPU_mel_outputs, feed_dict={GLGPU_mel_inputs: mel_p})
295 | wav = audio.inv_preemphasis(wav, hparams.preemphasis, hparams.preemphasize)
296 | else:
297 | wav = audio.inv_mel_spectrogram(mel_p.T, hparams)
298 | audio.save_wav(wav, os.path.join(eval_wav_dir, 'step-{}-eval-wave-from-mel.wav'.format(step)), sr=hparams.sample_rate)
299 |
300 | plot.plot_alignment(align, os.path.join(eval_plot_dir, 'step-{}-eval-align.png'.format(step)),
301 | title='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, eval_loss),
302 | max_len=t_len // hparams.outputs_per_step)
303 | plot.plot_spectrogram(mel_p, os.path.join(eval_plot_dir, 'step-{}-eval-mel-spectrogram.png'.format(step)),
304 | title='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, eval_loss), target_spectrogram=mel_t,
305 | max_len=t_len)
306 |
307 | if hparams.predict_linear:
308 | plot.plot_spectrogram(lin_p, os.path.join(eval_plot_dir, 'step-{}-eval-linear-spectrogram.png'.format(step)),
309 | title='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, eval_loss), target_spectrogram=lin_t,
310 | max_len=t_len, auto_aspect=True)
311 |
312 | log('Eval loss for global step {}: {:.3f}'.format(step, eval_loss))
313 | log('Writing eval summary!')
314 | add_eval_stats(summary_writer, step, linear_loss, before_loss, after_loss, stop_token_loss, eval_loss)
315 |
316 |
317 | if step % args.checkpoint_interval == 0 or step == args.tacotron_train_steps or step == 300:
318 | #Save model and current global step
319 | saver.save(sess, checkpoint_path, global_step=global_step)
320 |
321 | log('\nSaving alignment, Mel-Spectrograms and griffin-lim inverted waveform..')
322 | if hparams.predict_linear:
323 | input_seq, mel_prediction, linear_prediction, alignment, target, target_length, linear_target = sess.run([
324 | model.tower_inputs[0][0],
325 | model.tower_mel_outputs[0][0],
326 | model.tower_linear_outputs[0][0],
327 | model.tower_alignments[0][0],
328 | model.tower_mel_targets[0][0],
329 | model.tower_targets_lengths[0][0],
330 | model.tower_linear_targets[0][0],
331 | ])
332 |
333 | #save predicted linear spectrogram to disk (debug)
334 | linear_filename = 'linear-prediction-step-{}.npy'.format(step)
335 | np.save(os.path.join(linear_dir, linear_filename), linear_prediction.T, allow_pickle=False)
336 |
337 | #save griffin lim inverted wav for debug (linear -> wav)
338 | if hparams.GL_on_GPU:
339 | wav = sess.run(GLGPU_lin_outputs, feed_dict={GLGPU_lin_inputs: linear_prediction})
340 | wav = audio.inv_preemphasis(wav, hparams.preemphasis, hparams.preemphasize)
341 | else:
342 | wav = audio.inv_linear_spectrogram(linear_prediction.T, hparams)
343 | audio.save_wav(wav, os.path.join(wav_dir, 'step-{}-wave-from-linear.wav'.format(step)), sr=hparams.sample_rate)
344 |
345 | #Save real and predicted linear-spectrogram plot to disk (control purposes)
346 | plot.plot_spectrogram(linear_prediction, os.path.join(plot_dir, 'step-{}-linear-spectrogram.png'.format(step)),
347 | title='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, loss), target_spectrogram=linear_target,
348 | max_len=target_length, auto_aspect=True)
349 |
350 | else:
351 | input_seq, mel_prediction, alignment, target, target_length = sess.run([
352 | model.tower_inputs[0][0],
353 | model.tower_mel_outputs[0][0],
354 | model.tower_alignments[0][0],
355 | model.tower_mel_targets[0][0],
356 | model.tower_targets_lengths[0][0],
357 | ])
358 |
359 | #save predicted mel spectrogram to disk (debug)
360 | mel_filename = 'mel-prediction-step-{}.npy'.format(step)
361 | np.save(os.path.join(mel_dir, mel_filename), mel_prediction.T, allow_pickle=False)
362 |
363 | #save griffin lim inverted wav for debug (mel -> wav)
364 | if hparams.GL_on_GPU:
365 | wav = sess.run(GLGPU_mel_outputs, feed_dict={GLGPU_mel_inputs: mel_prediction})
366 | wav = audio.inv_preemphasis(wav, hparams.preemphasis, hparams.preemphasize)
367 | else:
368 | wav = audio.inv_mel_spectrogram(mel_prediction.T, hparams)
369 | audio.save_wav(wav, os.path.join(wav_dir, 'step-{}-wave-from-mel.wav'.format(step)), sr=hparams.sample_rate)
370 |
371 | #save alignment plot to disk (control purposes)
372 | plot.plot_alignment(alignment, os.path.join(plot_dir, 'step-{}-align.png'.format(step)),
373 | title='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, loss),
374 | max_len=target_length // hparams.outputs_per_step)
375 | #save real and predicted mel-spectrogram plot to disk (control purposes)
376 | plot.plot_spectrogram(mel_prediction, os.path.join(plot_dir, 'step-{}-mel-spectrogram.png'.format(step)),
377 | title='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, loss), target_spectrogram=target,
378 | max_len=target_length)
379 | log('Input at step {}: {}'.format(step, sequence_to_text(input_seq)))
380 |
381 | if step % args.embedding_interval == 0 or step == args.tacotron_train_steps or step == 1:
382 | #Get current checkpoint state
383 | checkpoint_state = tf.train.get_checkpoint_state(save_dir)
384 |
385 | #Update Projector
386 | log('\nSaving Model Character Embeddings visualization..')
387 | add_embedding_stats(summary_writer, [model.embedding_table.name], [char_embedding_meta], checkpoint_state.model_checkpoint_path)
388 | log('Tacotron Character embeddings have been updated on tensorboard!')
389 |
390 | log('Tacotron training complete after {} global steps!'.format(args.tacotron_train_steps), slack=True)
391 | return save_dir
392 |
393 | except Exception as e:
394 | log('Exiting due to exception: {}'.format(e), slack=True)
395 | traceback.print_exc()
396 | coord.request_stop(e)
397 |
398 | def tacotron_train(args, log_dir, hparams):
399 | return train(log_dir, args, hparams)
400 |
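
At every checkpoint the training loop above dumps the first predicted mel of the batch to <log_dir>/mel-spectrograms/mel-prediction-step-<step>.npy, stored transposed as (num_mels, frames). A hedged sketch of loading one back for inspection; the log directory name and step number are assumptions for illustration:

	# Hedged sketch: inspect a mel prediction saved by the training loop.
	# 'logs-Tacotron-2' and step 2500 are assumptions, not fixed by the code.
	import numpy as np

	mel = np.load('logs-Tacotron-2/mel-spectrograms/mel-prediction-step-2500.npy')
	print(mel.shape)  # (num_mels, frames) -- saved transposed in train()
	mel = mel.T       # back to (frames, num_mels) as used elsewhere in the repo
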
--------------------------------------------------------------------------------
/tacotron/utils/__init__.py:
--------------------------------------------------------------------------------
1 | class ValueWindow():
2 | def __init__(self, window_size=100):
3 | self._window_size = window_size
4 | self._values = []
5 |
6 | def append(self, x):
7 | self._values = self._values[-(self._window_size - 1):] + [x]
8 |
9 | @property
10 | def sum(self):
11 | return sum(self._values)
12 |
13 | @property
14 | def count(self):
15 | return len(self._values)
16 |
17 | @property
18 | def average(self):
19 | return self.sum / max(1, self.count)
20 |
21 | def reset(self):
22 | self._values = []
23 |
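
ValueWindow keeps only the most recent window_size values (default 100) and exposes their running sum, count, and average; a quick usage sketch:

	# Quick usage sketch of ValueWindow with a window of 3.
	from tacotron.utils import ValueWindow

	loss_window = ValueWindow(window_size=3)
	for loss in [4.0, 3.0, 2.0, 1.0]:
		loss_window.append(loss)
	print(loss_window.count)    # 3 -- only the last three values are kept
	print(loss_window.average)  # 2.0 -> (3.0 + 2.0 + 1.0) / 3
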
--------------------------------------------------------------------------------
/tacotron/utils/cleaners.py:
--------------------------------------------------------------------------------
1 | '''
2 | Cleaners are transformations that run over the input text at both training and eval time.
3 |
4 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
5 | hyperparameter. Some cleaners are English-specific. You'll typically want to use:
6 | 1. "english_cleaners" for English text
7 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
8 | the Unidecode library (https://pypi.python.org/pypi/Unidecode)
9 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
10 | the symbols in symbols.py to match your data).
11 | '''
12 |
13 | import re
14 |
15 | from unidecode import unidecode
16 |
17 | from .numbers import normalize_numbers
18 |
19 | # Regular expression matching whitespace:
20 | _whitespace_re = re.compile(r'\s+')
21 |
22 | # List of (regular expression, replacement) pairs for abbreviations:
23 | _abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
24 | ('mrs', 'misess'),
25 | ('mr', 'mister'),
26 | ('dr', 'doctor'),
27 | ('st', 'saint'),
28 | ('co', 'company'),
29 | ('jr', 'junior'),
30 | ('maj', 'major'),
31 | ('gen', 'general'),
32 | ('drs', 'doctors'),
33 | ('rev', 'reverend'),
34 | ('lt', 'lieutenant'),
35 | ('hon', 'honorable'),
36 | ('sgt', 'sergeant'),
37 | ('capt', 'captain'),
38 | ('esq', 'esquire'),
39 | ('ltd', 'limited'),
40 | ('col', 'colonel'),
41 | ('ft', 'fort'),
42 | ]]
43 |
44 |
45 | def expand_abbreviations(text):
46 | for regex, replacement in _abbreviations:
47 | text = re.sub(regex, replacement, text)
48 | return text
49 |
50 |
51 | def expand_numbers(text):
52 | return normalize_numbers(text)
53 |
54 |
55 | def lowercase(text):
56 | '''lowercase input tokens.
57 | '''
58 | return text.lower()
59 |
60 |
61 | def collapse_whitespace(text):
62 | return re.sub(_whitespace_re, ' ', text)
63 |
64 |
65 | def convert_to_ascii(text):
66 | return unidecode(text)
67 |
68 |
69 | def basic_cleaners(text):
70 | '''Basic pipeline that lowercases and collapses whitespace without transliteration.'''
71 | text = lowercase(text)
72 | text = collapse_whitespace(text)
73 | return text
74 |
75 |
76 | def transliteration_cleaners(text):
77 | '''Pipeline for non-English text that transliterates to ASCII.'''
78 | text = convert_to_ascii(text)
79 | text = lowercase(text)
80 | text = collapse_whitespace(text)
81 | return text
82 |
83 |
84 | def english_cleaners(text):
85 | '''Pipeline for English text, including number and abbreviation expansion.'''
86 | text = convert_to_ascii(text)
87 | # text = lowercase(text)
88 | text = expand_numbers(text)
89 | text = expand_abbreviations(text)
90 | text = collapse_whitespace(text)
91 | return text
92 |
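
As the module docstring explains, cleaners are selected by name through the "cleaners" hyperparameter; english_cleaners is the full English pipeline (ASCII conversion, number and abbreviation expansion, whitespace collapsing). A small sketch on a sample sentence; the expected output in the comment is approximate:

	# Small sketch of the English cleaning pipeline; output shown is approximate.
	from tacotron.utils.cleaners import english_cleaners

	text = 'Dr. Smith paid  $12.50 on  May 1st.'
	print(english_cleaners(text))
	# roughly: 'doctor Smith paid twelve dollars, fifty cents on May first.'
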
--------------------------------------------------------------------------------
/tacotron/utils/cmudict.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | valid_symbols = [
4 | 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2',
5 | 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2',
6 | 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY',
7 | 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1',
8 | 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0',
9 | 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW',
10 | 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH'
11 | ]
12 |
13 | _valid_symbol_set = set(valid_symbols)
14 |
15 |
16 | class CMUDict:
17 | '''Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict'''
18 | def __init__(self, file_or_path, keep_ambiguous=True):
19 | if isinstance(file_or_path, str):
20 | with open(file_or_path, encoding='latin-1') as f:
21 | entries = _parse_cmudict(f)
22 | else:
23 | entries = _parse_cmudict(file_or_path)
24 | if not keep_ambiguous:
25 | entries = {word: pron for word, pron in entries.items() if len(pron) == 1}
26 | self._entries = entries
27 |
28 |
29 | def __len__(self):
30 | return len(self._entries)
31 |
32 |
33 | def lookup(self, word):
34 | '''Returns list of ARPAbet pronunciations of the given word.'''
35 | return self._entries.get(word.upper())
36 |
37 |
38 |
39 | _alt_re = re.compile(r'\([0-9]+\)')
40 |
41 |
42 | def _parse_cmudict(file):
43 | cmudict = {}
44 | for line in file:
45 | if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"):
46 | 			parts = line.split('  ')  #word and pronunciation are separated by two spaces in CMUDict
47 | word = re.sub(_alt_re, '', parts[0])
48 | pronunciation = _get_pronunciation(parts[1])
49 | if pronunciation:
50 | if word in cmudict:
51 | cmudict[word].append(pronunciation)
52 | else:
53 | cmudict[word] = [pronunciation]
54 | return cmudict
55 |
56 |
57 | def _get_pronunciation(s):
58 | parts = s.strip().split(' ')
59 | for part in parts:
60 | if part not in _valid_symbol_set:
61 | return None
62 | return ' '.join(parts)
63 |
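
CMUDict wraps a pronouncing-dictionary file and returns ARPAbet pronunciations on lookup. A hedged usage sketch; the filename 'cmudict-0.7b' is an assumed local copy of the CMU dictionary, not something shipped with the repo:

	# Hedged usage sketch; 'cmudict-0.7b' is an assumed local dictionary file.
	from tacotron.utils.cmudict import CMUDict

	cmu = CMUDict('cmudict-0.7b', keep_ambiguous=True)
	print(len(cmu))                 # number of words loaded
	print(cmu.lookup('pronounce'))  # e.g. ['P R AH0 N AW1 N S'], or None if missing
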
--------------------------------------------------------------------------------
/tacotron/utils/numbers.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | import inflect
4 |
5 | _inflect = inflect.engine()
6 | _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
7 | _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
8 | _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
9 | _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
10 | _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
11 | _number_re = re.compile(r'[0-9]+')
12 |
13 |
14 | def _remove_commas(m):
15 | return m.group(1).replace(',', '')
16 |
17 |
18 | def _expand_decimal_point(m):
19 | return m.group(1).replace('.', ' point ')
20 |
21 |
22 | def _expand_dollars(m):
23 | match = m.group(1)
24 | parts = match.split('.')
25 | if len(parts) > 2:
26 | return match + ' dollars' # Unexpected format
27 | dollars = int(parts[0]) if parts[0] else 0
28 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
29 | if dollars and cents:
30 | dollar_unit = 'dollar' if dollars == 1 else 'dollars'
31 | cent_unit = 'cent' if cents == 1 else 'cents'
32 | return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit)
33 | elif dollars:
34 | dollar_unit = 'dollar' if dollars == 1 else 'dollars'
35 | return '%s %s' % (dollars, dollar_unit)
36 | elif cents:
37 | cent_unit = 'cent' if cents == 1 else 'cents'
38 | return '%s %s' % (cents, cent_unit)
39 | else:
40 | return 'zero dollars'
41 |
42 |
43 | def _expand_ordinal(m):
44 | return _inflect.number_to_words(m.group(0))
45 |
46 |
47 | def _expand_number(m):
48 | num = int(m.group(0))
49 | if num > 1000 and num < 3000:
50 | if num == 2000:
51 | return 'two thousand'
52 | elif num > 2000 and num < 2010:
53 | return 'two thousand ' + _inflect.number_to_words(num % 100)
54 | elif num % 100 == 0:
55 | return _inflect.number_to_words(num // 100) + ' hundred'
56 | else:
57 | return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ')
58 | else:
59 | return _inflect.number_to_words(num, andword='')
60 |
61 |
62 | def normalize_numbers(text):
63 | text = re.sub(_comma_number_re, _remove_commas, text)
64 | text = re.sub(_pounds_re, r'\1 pounds', text)
65 | text = re.sub(_dollars_re, _expand_dollars, text)
66 | text = re.sub(_decimal_number_re, _expand_decimal_point, text)
67 | text = re.sub(_ordinal_re, _expand_ordinal, text)
68 | text = re.sub(_number_re, _expand_number, text)
69 | return text
70 |
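
normalize_numbers applies the substitutions above in order (commas, pounds, dollars, decimals, ordinals, plain numbers). A small sketch; the outputs in the comments are approximate:

	# Small sketch of number normalization; outputs shown are approximate.
	from tacotron.utils.numbers import normalize_numbers

	print(normalize_numbers('It cost $250 in 1984.'))
	# roughly: 'It cost two hundred fifty dollars in nineteen eighty-four.'
	print(normalize_numbers('She finished 3rd out of 1,000.'))
	# roughly: 'She finished third out of one thousand.'
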
--------------------------------------------------------------------------------
/tacotron/utils/plot.py:
--------------------------------------------------------------------------------
1 | import matplotlib
2 | matplotlib.use('Agg')
3 | import matplotlib.pyplot as plt
4 |
5 | import numpy as np
6 |
7 |
8 | def split_title_line(title_text, max_words=5):
9 | """
10 | A function that splits any string based on specific character
11 | (returning it with the string), with maximum number of words on it
12 | """
13 | seq = title_text.split()
14 | return '\n'.join([' '.join(seq[i:i + max_words]) for i in range(0, len(seq), max_words)])
15 |
16 | def plot_alignment(alignment, path, title=None, split_title=False, max_len=None):
17 | if max_len is not None:
18 | alignment = alignment[:, :max_len]
19 |
20 | fig = plt.figure(figsize=(8, 6))
21 | ax = fig.add_subplot(111)
22 |
23 | im = ax.imshow(
24 | alignment,
25 | aspect='auto',
26 | origin='lower',
27 | interpolation='none')
28 | fig.colorbar(im, ax=ax)
29 | xlabel = 'Decoder timestep'
30 |
31 | if split_title:
32 | title = split_title_line(title)
33 |
34 | plt.xlabel(xlabel)
35 | plt.title(title)
36 | plt.ylabel('Encoder timestep')
37 | plt.tight_layout()
38 | plt.savefig(path, format='png')
39 | plt.close()
40 |
41 |
42 | def plot_spectrogram(pred_spectrogram, path, title=None, split_title=False, target_spectrogram=None, max_len=None, auto_aspect=False):
43 | if max_len is not None:
44 | 		target_spectrogram = target_spectrogram[:max_len] if target_spectrogram is not None else None
45 | pred_spectrogram = pred_spectrogram[:max_len]
46 |
47 | if split_title:
48 | title = split_title_line(title)
49 |
50 | fig = plt.figure(figsize=(10, 8))
51 | # Set common labels
52 | fig.text(0.5, 0.18, title, horizontalalignment='center', fontsize=16)
53 |
54 | #target spectrogram subplot
55 | if target_spectrogram is not None:
56 | ax1 = fig.add_subplot(311)
57 | ax2 = fig.add_subplot(312)
58 |
59 | if auto_aspect:
60 | im = ax1.imshow(np.rot90(target_spectrogram), aspect='auto', interpolation='none')
61 | else:
62 | im = ax1.imshow(np.rot90(target_spectrogram), interpolation='none')
63 | ax1.set_title('Target Mel-Spectrogram')
64 | fig.colorbar(mappable=im, shrink=0.65, orientation='horizontal', ax=ax1)
65 | ax2.set_title('Predicted Mel-Spectrogram')
66 | else:
67 | ax2 = fig.add_subplot(211)
68 |
69 | if auto_aspect:
70 | im = ax2.imshow(np.rot90(pred_spectrogram), aspect='auto', interpolation='none')
71 | else:
72 | im = ax2.imshow(np.rot90(pred_spectrogram), interpolation='none')
73 | fig.colorbar(mappable=im, shrink=0.65, orientation='horizontal', ax=ax2)
74 |
75 | plt.tight_layout()
76 | plt.savefig(path, format='png')
77 | plt.close()
78 |
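
plot_spectrogram expects spectrograms shaped (frames, num_mels) (they are rotated with np.rot90 before display) and writes a PNG to the given path. A minimal sketch with random data; the array sizes and output filename are arbitrary:

	# Minimal sketch with random data; sizes and filename are arbitrary.
	import numpy as np
	from tacotron.utils import plot

	pred = np.random.rand(200, 80)    # (frames, num_mels)
	target = np.random.rand(200, 80)
	plot.plot_spectrogram(pred, 'example-mel.png', title='random example',
		target_spectrogram=target, max_len=150, auto_aspect=True)
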
--------------------------------------------------------------------------------
/tacotron/utils/symbols.py:
--------------------------------------------------------------------------------
1 | '''
2 | Defines the set of symbols used in text input to the model.
3 |
4 | The default is a set of ASCII characters that works well for English or text that has been run
5 | through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details.
6 | '''
7 | from . import cmudict
8 |
9 | _pad = '_'
10 | _eos = '~'
11 | _characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'\"(),-.:;? '
12 |
13 | # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
14 | #_arpabet = ['@' + s for s in cmudict.valid_symbols]
15 |
16 | # Export all symbols:
17 | symbols = [_pad, _eos] + list(_characters) #+ _arpabet
18 |
--------------------------------------------------------------------------------
/tacotron/utils/text.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | from . import cleaners
4 | from .symbols import symbols
5 |
6 | # Mappings from symbol to numeric ID and vice versa:
7 | _symbol_to_id = {s: i for i, s in enumerate(symbols)}
8 | _id_to_symbol = {i: s for i, s in enumerate(symbols)}
9 |
10 | # Regular expression matching text enclosed in curly braces:
11 | _curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)')
12 |
13 |
14 | def text_to_sequence(text, cleaner_names):
15 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
16 |
17 | The text can optionally have ARPAbet sequences enclosed in curly braces embedded
18 | in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street."
19 |
20 | Args:
21 | text: string to convert to a sequence
22 | cleaner_names: names of the cleaner functions to run the text through
23 |
24 | Returns:
25 | List of integers corresponding to the symbols in the text
26 | '''
27 | sequence = []
28 |
29 | # Check for curly braces and treat their contents as ARPAbet:
30 | while len(text):
31 | m = _curly_re.match(text)
32 | if not m:
33 | sequence += _symbols_to_sequence(_clean_text(text, cleaner_names))
34 | break
35 | sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names))
36 | sequence += _arpabet_to_sequence(m.group(2))
37 | text = m.group(3)
38 |
39 | # Append EOS token
40 | sequence.append(_symbol_to_id['~'])
41 | return sequence
42 |
43 |
44 | def sequence_to_text(sequence):
45 | '''Converts a sequence of IDs back to a string'''
46 | result = ''
47 | for symbol_id in sequence:
48 | if symbol_id in _id_to_symbol:
49 | s = _id_to_symbol[symbol_id]
50 | # Enclose ARPAbet back in curly braces:
51 | if len(s) > 1 and s[0] == '@':
52 | s = '{%s}' % s[1:]
53 | result += s
54 | return result.replace('}{', ' ')
55 |
56 |
57 | def _clean_text(text, cleaner_names):
58 | for name in cleaner_names:
59 | cleaner = getattr(cleaners, name)
60 | if not cleaner:
61 | raise Exception('Unknown cleaner: %s' % name)
62 | text = cleaner(text)
63 | return text
64 |
65 |
66 | def _symbols_to_sequence(symbols):
67 | return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)]
68 |
69 |
70 | def _arpabet_to_sequence(text):
71 | return _symbols_to_sequence(['@' + s for s in text.split()])
72 |
73 |
74 | def _should_keep_symbol(s):
75 | 	return s in _symbol_to_id and s not in ('_', '~')
76 |
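
text_to_sequence runs the named cleaners, maps each remaining symbol to its ID, and appends the EOS symbol '~'; sequence_to_text reverses the mapping. A quick round-trip sketch (note that with the default symbols.py the ARPAbet set is commented out, so curly-brace phoneme input would be dropped):

	# Quick round-trip sketch using the default character symbol set.
	from tacotron.utils.text import text_to_sequence, sequence_to_text

	seq = text_to_sequence('Hello, world!', ['english_cleaners'])
	print(seq)                    # list of symbol IDs, ending with the EOS id for '~'
	print(sequence_to_text(seq))  # 'Hello, world!~' -- EOS is kept when mapping back
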
--------------------------------------------------------------------------------
/test_wavenet_feeder.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import os
3 | import argparse
4 | from hparams import hparams
5 | from datasets import audio
6 | from tqdm import tqdm
7 |
8 |
9 |
10 | def _limit_time(hparams):
11 | '''Limit time resolution to save GPU memory.
12 | '''
13 | if hparams.max_time_sec is not None:
14 | return int(hparams.max_time_sec * hparams.sample_rate)
15 | elif hparams.max_time_steps is not None:
16 | return hparams.max_time_steps
17 | else:
18 | return None
19 |
20 |
21 | def get_groups(args, hparams, meta, local_condition):
22 | if hparams.train_with_GTA:
23 | mel_file = meta[2]
24 | else:
25 | mel_file = meta[1]
26 | audio_file = meta[0]
27 |
28 | input_data = np.load(os.path.join(args.base_dir, audio_file))
29 |
30 | if local_condition:
31 | local_condition_features = np.load(os.path.join(args.base_dir, mel_file))
32 | else:
33 | local_condition_features = None
34 |
35 | return (input_data, local_condition_features, None, len(input_data))
36 |
37 | def _adjust_time_resolution(hparams, batch, local_condition, max_time_steps):
38 | '''Adjust time resolution between audio and local condition
39 | '''
40 | if local_condition:
41 | new_batch = []
42 | for b in batch:
43 | x, c, g, l = b
44 | _assert_ready_for_upsample(hparams, x, c)
45 | if max_time_steps is not None:
46 | max_steps = _ensure_divisible(max_time_steps, audio.get_hop_size(hparams), True)
47 | if len(x) > max_time_steps:
48 | max_time_frames = max_steps // audio.get_hop_size(hparams)
49 | start = np.random.randint(0, len(c) - max_time_frames)
50 | time_start = start * audio.get_hop_size(hparams)
51 | x = x[time_start: time_start + max_time_frames * audio.get_hop_size(hparams)]
52 | c = c[start: start + max_time_frames, :]
53 | _assert_ready_for_upsample(hparams, x, c)
54 |
55 | new_batch.append((x, c, g, l))
56 | return new_batch
57 | else:
58 | new_batch = []
59 | for b in batch:
60 | x, c, g, l = b
61 | x = audio.trim_silence(x, hparams)
62 | if max_time_steps is not None and len(x) > max_time_steps:
63 | 				start = np.random.randint(0, len(x) - max_time_steps)
64 | x = x[start: start + max_time_steps]
65 | new_batch.append((x, c, g, l))
66 | return new_batch
67 |
68 | def _assert_ready_for_upsample(hparams, x, c):
69 | assert len(x) % len(c) == 0 and len(x) // len(c) == audio.get_hop_size(hparams)
70 |
71 | def check_time_alignment(hparams, batch, local_condition):
72 | #No need to check beyond this step when preparing data
73 | #Limit time steps to save GPU Memory usage
74 | max_time_steps = _limit_time(hparams)
75 | #Adjust time resolution for upsampling
76 | batch = _adjust_time_resolution(hparams, batch, local_condition, max_time_steps)
77 |
78 | def _ensure_divisible(length, divisible_by=256, lower=True):
79 | if length % divisible_by == 0:
80 | return length
81 | if lower:
82 | return length - length % divisible_by
83 | else:
84 | return length + (divisible_by - length % divisible_by)
85 |
86 | def run(args, hparams):
87 | with open(args.metadata, 'r') as file:
88 | metadata = [line.strip().split('|') for line in file]
89 |
90 | local_condition = hparams.cin_channels > 0
91 |
92 | examples = [get_groups(args, hparams, meta, local_condition) for meta in metadata]
93 | batches = [examples[i: i+hparams.wavenet_batch_size] for i in range(0, len(examples), hparams.wavenet_batch_size)]
94 |
95 | for batch in tqdm(batches):
96 | check_time_alignment(hparams, batch, local_condition)
97 |
98 |
99 |
100 | def main():
101 | parser = argparse.ArgumentParser()
102 | parser.add_argument('--base_dir', default='')
103 | parser.add_argument('--hparams', default='',
104 | help='Hyperparameter overrides as a comma-separated list of name=value pairs')
105 | parser.add_argument('--metadata', default='tacotron_output/gta/map.txt')
106 | args = parser.parse_args()
107 |
108 | modified_hparams = hparams.parse(args.hparams)
109 | run(args, modified_hparams)
110 |
111 |
112 | if __name__ == '__main__':
113 | main()
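
This script replays the WaveNet feeder's time-alignment logic over a GTA map file (typically: python test_wavenet_feeder.py --metadata tacotron_output/gta/map.txt) and asserts, for each pair, that len(audio) is an exact multiple of len(mel) equal to the hop size. The cropping step relies on _ensure_divisible; a standalone sketch of the same rounding rule:

	# Standalone sketch mirroring _ensure_divisible (not imported to avoid pulling in hparams).
	def ensure_divisible(length, divisible_by=256, lower=True):
		if length % divisible_by == 0:
			return length
		return length - length % divisible_by if lower else length + (divisible_by - length % divisible_by)

	print(ensure_divisible(11000))               # 10752 -- rounds down to a multiple of 256
	print(ensure_divisible(11000, lower=False))  # 11008 -- rounds up
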
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | from time import sleep
4 |
5 | import infolog
6 | import tensorflow as tf
7 | from hparams import hparams
8 | from infolog import log
9 | from tacotron.synthesize import tacotron_synthesize
10 | from tacotron.train import tacotron_train
11 | from wavenet_vocoder.train import wavenet_train
12 |
13 | log = infolog.log
14 |
15 |
16 | def save_seq(file, sequence, input_path):
17 | '''Save Tacotron-2 training state to disk. (To skip for future runs)
18 | '''
19 | sequence = [str(int(s)) for s in sequence] + [input_path]
20 | with open(file, 'w') as f:
21 | f.write('|'.join(sequence))
22 |
23 | def read_seq(file):
24 | '''Load Tacotron-2 training state from disk. (To skip if not first run)
25 | '''
26 | if os.path.isfile(file):
27 | with open(file, 'r') as f:
28 | sequence = f.read().split('|')
29 |
30 | return [bool(int(s)) for s in sequence[:-1]], sequence[-1]
31 | else:
32 | return [0, 0, 0], ''
33 |
34 | def prepare_run(args):
35 | modified_hp = hparams.parse(args.hparams)
36 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = str(args.tf_log_level)
37 | run_name = args.name or args.model
38 | log_dir = os.path.join(args.base_dir, 'logs-{}'.format(run_name))
39 | os.makedirs(log_dir, exist_ok=True)
40 | infolog.init(os.path.join(log_dir, 'Terminal_train_log'), run_name, args.slack_url)
41 | return log_dir, modified_hp
42 |
43 | def train(args, log_dir, hparams):
44 | state_file = os.path.join(log_dir, 'state_log')
45 | #Get training states
46 | (taco_state, GTA_state, wave_state), input_path = read_seq(state_file)
47 |
48 | if not taco_state:
49 | log('\n#############################################################\n')
50 | log('Tacotron Train\n')
51 | log('###########################################################\n')
52 | checkpoint = tacotron_train(args, log_dir, hparams)
53 | tf.reset_default_graph()
54 | 		#Sleep 1/2 second to let the previous graph close and avoid error messages during synthesis
55 | sleep(0.5)
56 | if checkpoint is None:
57 | 			raise RuntimeError('Error occurred while training Tacotron, exiting!')
58 | taco_state = 1
59 | save_seq(state_file, [taco_state, GTA_state, wave_state], input_path)
60 | else:
61 | checkpoint = os.path.join(log_dir, 'taco_pretrained/')
62 |
63 | if not GTA_state:
64 | log('\n#############################################################\n')
65 | log('Tacotron GTA Synthesis\n')
66 | log('###########################################################\n')
67 | input_path = tacotron_synthesize(args, hparams, checkpoint)
68 | tf.reset_default_graph()
69 | #Sleep 1/2 second to let previous graph close and avoid error messages while Wavenet is training
70 | sleep(0.5)
71 | GTA_state = 1
72 | save_seq(state_file, [taco_state, GTA_state, wave_state], input_path)
73 | else:
74 | input_path = os.path.join('tacotron_' + args.output_dir, 'gta', 'map.txt')
75 |
76 | if input_path == '' or input_path is None:
77 | 		raise RuntimeError('input_path has an invalid value -> {}'.format(input_path))
78 |
79 | if not wave_state:
80 | log('\n#############################################################\n')
81 | log('Wavenet Train\n')
82 | log('###########################################################\n')
83 | checkpoint = wavenet_train(args, log_dir, hparams, input_path)
84 | if checkpoint is None:
85 | 			raise RuntimeError('Error occurred while training Wavenet, exiting!')
86 | wave_state = 1
87 | save_seq(state_file, [taco_state, GTA_state, wave_state], input_path)
88 |
89 | if wave_state and GTA_state and taco_state:
90 | log('TRAINING IS ALREADY COMPLETE!!')
91 |
92 | def main():
93 | parser = argparse.ArgumentParser()
94 | parser.add_argument('--base_dir', default='')
95 | parser.add_argument('--hparams', default='',
96 | help='Hyperparameter overrides as a comma-separated list of name=value pairs')
97 | parser.add_argument('--tacotron_input', default='training_data/train.txt')
98 | parser.add_argument('--wavenet_input', default='tacotron_output/gta/map.txt')
99 | parser.add_argument('--name', help='Name of logging directory.')
100 | parser.add_argument('--model', default='Tacotron-2')
101 | parser.add_argument('--input_dir', default='training_data', help='folder to contain inputs sentences/targets')
102 | parser.add_argument('--output_dir', default='output', help='folder to contain synthesized mel spectrograms')
103 | parser.add_argument('--mode', default='synthesis', help='mode for synthesis of tacotron after training')
104 | parser.add_argument('--GTA', default='True', help='Ground truth aligned synthesis, defaults to True, only considered in Tacotron synthesis mode')
105 | parser.add_argument('--restore', type=bool, default=True, help='Set this to False to do a fresh training')
106 | parser.add_argument('--summary_interval', type=int, default=250,
107 | help='Steps between running summary ops')
108 | parser.add_argument('--embedding_interval', type=int, default=5000,
109 | help='Steps between updating embeddings projection visualization')
110 | parser.add_argument('--checkpoint_interval', type=int, default=2500,
111 | help='Steps between writing checkpoints')
112 | parser.add_argument('--eval_interval', type=int, default=5000,
113 | help='Steps between eval on test data')
114 | parser.add_argument('--tacotron_train_steps', type=int, default=100000, help='total number of tacotron training steps')
115 | parser.add_argument('--wavenet_train_steps', type=int, default=500000, help='total number of wavenet training steps')
116 | parser.add_argument('--tf_log_level', type=int, default=1, help='Tensorflow C++ log level.')
117 | parser.add_argument('--slack_url', default=None, help='slack webhook notification destination link')
118 | args = parser.parse_args()
119 |
120 | accepted_models = ['Tacotron', 'WaveNet', 'Tacotron-2']
121 |
122 | if args.model not in accepted_models:
123 | raise ValueError('please enter a valid model to train: {}'.format(accepted_models))
124 |
125 | log_dir, hparams = prepare_run(args)
126 |
127 | if args.model == 'Tacotron':
128 | tacotron_train(args, log_dir, hparams)
129 | elif args.model == 'WaveNet':
130 | wavenet_train(args, log_dir, hparams, args.wavenet_input)
131 | elif args.model == 'Tacotron-2':
132 | train(args, log_dir, hparams)
133 | else:
134 | raise ValueError('Model provided {} unknown! {}'.format(args.model, accepted_models))
135 |
136 |
137 | if __name__ == '__main__':
138 | main()
139 |
--------------------------------------------------------------------------------
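Note on the '--hparams' flag used throughout this script: prepare_run() hands the comma-separated override string to hparams.parse(). Below is a minimal sketch of how such a string is applied, assuming hparams is a tf.contrib.training.HParams instance as defined in hparams.py; the two parameter names are placeholders for this example, not guaranteed members of the real hparams.

# Illustrative only (TF 1.x): parsing a '--hparams' style override string.
import tensorflow as tf

hp = tf.contrib.training.HParams(tacotron_batch_size=32, outputs_per_step=2)  #placeholder names
hp.parse('tacotron_batch_size=16,outputs_per_step=1')  #same 'name=value,name=value' format as --hparams
print(hp.tacotron_batch_size, hp.outputs_per_step)  #16 1
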
/wavenet_preprocess.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | from multiprocessing import cpu_count
4 |
5 | from datasets import wavenet_preprocessor
6 | from hparams import hparams
7 | from tqdm import tqdm
8 |
9 |
10 | def preprocess(args, input_dir, out_dir, hparams):
11 | mel_dir = os.path.join(out_dir, 'mels')
12 | wav_dir = os.path.join(out_dir, 'audio')
13 | os.makedirs(mel_dir, exist_ok=True)
14 | os.makedirs(wav_dir, exist_ok=True)
15 | metadata = wavenet_preprocessor.build_from_path(hparams, input_dir, mel_dir, wav_dir, args.n_jobs, tqdm=tqdm)
16 | write_metadata(metadata, out_dir)
17 |
18 | def write_metadata(metadata, out_dir):
19 | with open(os.path.join(out_dir, 'map.txt'), 'w', encoding='utf-8') as f:
20 | for m in metadata:
21 | f.write('|'.join([str(x) for x in m]) + '\n')
22 | mel_frames = sum([int(m[5]) for m in metadata])
23 | timesteps = sum([int(m[4]) for m in metadata])
24 | sr = hparams.sample_rate
25 | hours = timesteps / sr / 3600
26 | 	print('Wrote {} utterances, {} mel frames, {} audio timesteps, ({:.2f} hours)'.format(
27 | 		len(metadata), mel_frames, timesteps, hours))
28 | print('Max mel frames length: {}'.format(max(int(m[5]) for m in metadata)))
29 | 	print('Max audio timesteps length: {}'.format(max(int(m[4]) for m in metadata)))
30 |
31 | def run_preprocess(args, hparams):
32 | output_folder = os.path.join(args.base_dir, args.output)
33 |
34 | preprocess(args, args.input_dir, output_folder, hparams)
35 |
36 | def main():
37 | print('initializing preprocessing..')
38 | parser = argparse.ArgumentParser()
39 | parser.add_argument('--base_dir', default='')
40 | parser.add_argument('--hparams', default='',
41 | help='Hyperparameter overrides as a comma-separated list of name=value pairs')
42 | parser.add_argument('--input_dir', default='LJSpeech-1.1/wavs')
43 | parser.add_argument('--output', default='tacotron_output/gta/')
44 | parser.add_argument('--n_jobs', type=int, default=cpu_count())
45 | args = parser.parse_args()
46 |
47 | modified_hp = hparams.parse(args.hparams)
48 |
49 | run_preprocess(args, modified_hp)
50 |
51 | if __name__ == '__main__':
52 | main()
53 |
--------------------------------------------------------------------------------
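write_metadata() above joins each metadata tuple with '|' into map.txt; the only columns this script itself relies on are index 4 (audio timesteps) and index 5 (mel frames). A hedged sketch of a downstream reader under that assumption follows; the remaining columns come from wavenet_preprocessor.build_from_path and are left opaque here.

# Sketch of reading the map.txt written above; only fields 4 and 5 are assumed.
import os

def summarize_map(out_dir, sample_rate):
	with open(os.path.join(out_dir, 'map.txt'), encoding='utf-8') as f:
		rows = [line.strip().split('|') for line in f]
	timesteps = sum(int(r[4]) for r in rows)
	mel_frames = sum(int(r[5]) for r in rows)
	print('{} utterances, {} mel frames, {:.2f} hours'.format(
		len(rows), mel_frames, timesteps / sample_rate / 3600))
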
/wavenet_vocoder/__init__.py:
--------------------------------------------------------------------------------
1 | #
--------------------------------------------------------------------------------
/wavenet_vocoder/feeder.py:
--------------------------------------------------------------------------------
1 | import os
2 | import threading
3 | import time
4 |
5 | import numpy as np
6 | import tensorflow as tf
7 | from datasets import audio
8 | from infolog import log
9 | from keras.utils import np_utils
10 | from sklearn.model_selection import train_test_split
11 |
12 | from .util import is_mulaw_quantize, is_scalar_input
13 |
14 |
15 |
16 | _batches_per_group = 64
17 |
18 |
19 | class Feeder:
20 | """
21 | Feeds batches of data into queue in a background thread.
22 | """
23 | def __init__(self, coordinator, metadata_filename, base_dir, hparams):
24 | super(Feeder, self).__init__()
25 |
26 | self._coord = coordinator
27 | self._hparams = hparams
28 | self._train_offset = 0
29 | self._test_offset = 0
30 |
31 | if hparams.symmetric_mels:
32 | self._spec_pad = -hparams.max_abs_value
33 | else:
34 | self._spec_pad = 0.
35 |
36 | #Base directory of the project (to map files from different locations)
37 | self._base_dir = base_dir
38 |
39 | #Load metadata
40 | self._data_dir = os.path.dirname(metadata_filename)
41 | with open(metadata_filename, 'r') as f:
42 | self._metadata = [line.strip().split('|') for line in f]
43 |
44 | #Train test split
45 | if hparams.wavenet_test_size is None:
46 | assert hparams.wavenet_test_batches is not None
47 |
48 | test_size = (hparams.wavenet_test_size if hparams.wavenet_test_size is not None
49 | else hparams.wavenet_test_batches * hparams.wavenet_batch_size)
50 | indices = np.arange(len(self._metadata))
51 | train_indices, test_indices = train_test_split(indices,
52 | test_size=test_size, random_state=hparams.wavenet_data_random_state)
53 |
54 | #Make sure test size is a multiple of batch size else round up
55 | len_test_indices = _round_down(len(test_indices), hparams.wavenet_batch_size)
56 | extra_test = test_indices[len_test_indices:]
57 | test_indices = test_indices[:len_test_indices]
58 | train_indices = np.concatenate([train_indices, extra_test])
59 |
60 | self._train_meta = list(np.array(self._metadata)[train_indices])
61 | self._test_meta = list(np.array(self._metadata)[test_indices])
62 |
63 | self.test_steps = len(self._test_meta) // hparams.wavenet_batch_size
64 |
65 | if hparams.wavenet_test_size is None:
66 | assert hparams.wavenet_test_batches == self.test_steps
67 |
68 | #Get conditioning status
69 | self.local_condition, self.global_condition = self._check_conditions()
70 |
71 | with tf.device('/cpu:0'):
72 | # Create placeholders for inputs and targets. Don't specify batch size because we want
73 | # to be able to feed different batch sizes at eval time.
74 | if is_scalar_input(hparams.input_type):
75 | input_placeholder = tf.placeholder(tf.float32, shape=(None, 1, None), name='audio_inputs')
76 | target_placeholder = tf.placeholder(tf.float32, shape=(None, None, 1), name='audio_targets')
77 | target_type = tf.float32
78 | else:
79 | input_placeholder = tf.placeholder(tf.float32, shape=(None, hparams.quantize_channels, None), name='audio_inputs')
80 | target_placeholder = tf.placeholder(tf.int32, shape=(None, None, 1), name='audio_targets')
81 | target_type = tf.int32
82 |
83 | self._placeholders = [
84 | input_placeholder,
85 | target_placeholder,
86 | tf.placeholder(tf.int32, shape=(None, ), name='input_lengths'),
87 | ]
88 |
89 | queue_types = [tf.float32, target_type, tf.int32]
90 |
91 | if self.local_condition:
92 | self._placeholders.append(tf.placeholder(tf.float32, shape=(None, hparams.num_mels, None), name='local_condition_features'))
93 | queue_types.append(tf.float32)
94 | if self.global_condition:
95 | self._placeholders.append(tf.placeholder(tf.int32, shape=(None, 1), name='global_condition_features'))
96 | queue_types.append(tf.int32)
97 |
98 | # Create queue for buffering data
99 | queue = tf.FIFOQueue(8, queue_types, name='input_queue')
100 | self._enqueue_op = queue.enqueue(self._placeholders)
101 | variables = queue.dequeue()
102 |
103 | self.inputs = variables[0]
104 | self.inputs.set_shape(self._placeholders[0].shape)
105 | self.targets = variables[1]
106 | self.targets.set_shape(self._placeholders[1].shape)
107 | self.input_lengths = variables[2]
108 | self.input_lengths.set_shape(self._placeholders[2].shape)
109 |
110 | idx = 3
111 |
112 | #If local conditioning disabled override c inputs with None
113 | if hparams.cin_channels < 0:
114 | self.local_condition_features = None
115 | else:
116 | self.local_condition_features = variables[idx]
117 | self.local_condition_features.set_shape(self._placeholders[idx].shape)
118 | idx += 1
119 |
120 | #If global conditioning disabled override g inputs with None
121 | if hparams.gin_channels < 0:
122 | self.global_condition_features = None
123 | else:
124 | self.global_condition_features = variables[idx]
125 | self.global_condition_features.set_shape(self._placeholders[idx].shape)
126 |
127 | # Create queue for buffering eval data
128 | eval_queue = tf.FIFOQueue(1, queue_types, name='eval_queue')
129 | self._eval_enqueue_op = eval_queue.enqueue(self._placeholders)
130 | eval_variables = eval_queue.dequeue()
131 |
132 | self.eval_inputs = eval_variables[0]
133 | self.eval_inputs.set_shape(self._placeholders[0].shape)
134 | self.eval_targets = eval_variables[1]
135 | self.eval_targets.set_shape(self._placeholders[1].shape)
136 | self.eval_input_lengths = eval_variables[2]
137 | self.eval_input_lengths.set_shape(self._placeholders[2].shape)
138 |
139 | eval_idx = 3
140 |
141 | #If local conditioning disabled override c inputs with None
142 | if hparams.cin_channels < 0:
143 | self.eval_local_condition_features = None
144 | else:
145 | self.eval_local_condition_features = eval_variables[eval_idx]
146 | self.eval_local_condition_features.set_shape(self._placeholders[eval_idx].shape)
147 | eval_idx += 1
148 |
149 | #If global conditioning disabled override g inputs with None
150 | if hparams.gin_channels < 0:
151 | self.eval_global_condition_features = None
152 | else:
153 | self.eval_global_condition_features = eval_variables[eval_idx]
154 | self.eval_global_condition_features.set_shape(self._placeholders[eval_idx].shape)
155 |
156 |
157 | def start_threads(self, session):
158 | self._session = session
159 | thread = threading.Thread(name='background', target=self._enqueue_next_train_group)
160 | thread.daemon = True #Thread will close when parent quits
161 | thread.start()
162 |
163 | thread = threading.Thread(name='background', target=self._enqueue_next_test_group)
164 | thread.daemon = True #Thread will close when parent quits
165 | thread.start()
166 |
167 | def _get_test_groups(self):
168 | meta = self._test_meta[self._test_offset]
169 | self._test_offset += 1
170 |
171 | if self._hparams.train_with_GTA:
172 | mel_file = meta[2]
173 | else:
174 | mel_file = meta[1]
175 | audio_file = meta[0]
176 |
177 | input_data = np.load(os.path.join(self._base_dir, audio_file))
178 |
179 | if self.local_condition:
180 | local_condition_features = np.load(os.path.join(self._base_dir, mel_file))
181 | else:
182 | local_condition_features = None
183 |
184 | if self.global_condition:
185 | global_condition_features = meta[3]
186 | if global_condition_features == '':
187 | raise RuntimeError('Please redo the wavenet preprocessing (or GTA synthesis) to assign global condition features!')
188 | else:
189 | global_condition_features = None
190 |
191 | return (input_data, local_condition_features, global_condition_features, len(input_data))
192 |
193 | def make_test_batches(self):
194 | start = time.time()
195 |
196 | #Read one example for evaluation
197 | n = 1
198 |
199 | #Test on entire test set (one sample at an evaluation step)
200 | examples = [self._get_test_groups() for i in range(len(self._test_meta))]
201 | batches = [examples[i: i+n] for i in range(0, len(examples), n)]
202 | np.random.shuffle(batches)
203 |
204 | log('\nGenerated {} test batches of size {} in {:.3f} sec'.format(len(batches), n, time.time() - start))
205 | return batches
206 |
207 | def _enqueue_next_train_group(self):
208 | while not self._coord.should_stop():
209 | start = time.time()
210 |
211 | # Read a group of examples
212 | n = self._hparams.wavenet_batch_size
213 | examples = [self._get_next_example() for i in range(n * _batches_per_group)]
214 |
215 | 			# Bucket examples based on similar output length for efficiency
216 | examples.sort(key=lambda x: x[-1])
217 | batches = [examples[i: i+n] for i in range(0, len(examples), n)]
218 | np.random.shuffle(batches)
219 |
220 | log('\nGenerated {} train batches of size {} in {:.3f} sec'.format(len(batches), n, time.time() - start))
221 | for batch in batches:
222 | feed_dict = dict(zip(self._placeholders, self._prepare_batch(batch)))
223 | self._session.run(self._enqueue_op, feed_dict=feed_dict)
224 |
225 | def _enqueue_next_test_group(self):
226 | test_batches = self.make_test_batches()
227 | while not self._coord.should_stop():
228 | for batch in test_batches:
229 | feed_dict = dict(zip(self._placeholders, self._prepare_batch(batch)))
230 | self._session.run(self._eval_enqueue_op, feed_dict=feed_dict)
231 |
232 | def _get_next_example(self):
233 | '''Get a single example (input, output, len_output) from disk
234 | '''
235 | if self._train_offset >= len(self._train_meta):
236 | self._train_offset = 0
237 | np.random.shuffle(self._train_meta)
238 | meta = self._train_meta[self._train_offset]
239 | self._train_offset += 1
240 |
241 | if self._hparams.train_with_GTA:
242 | mel_file = meta[2]
243 | if 'linear' in mel_file:
244 | raise RuntimeError('Linear spectrogram files selected instead of GTA mels, did you specify the wrong metadata?')
245 | else:
246 | mel_file = meta[1]
247 | audio_file = meta[0]
248 |
249 | input_data = np.load(os.path.join(self._base_dir, audio_file))
250 |
251 | if self.local_condition:
252 | local_condition_features = np.load(os.path.join(self._base_dir, mel_file))
253 | else:
254 | local_condition_features = None
255 |
256 | if self.global_condition:
257 | global_condition_features = meta[3]
258 | if global_condition_features == '':
259 | raise RuntimeError('Please redo the wavenet preprocessing (or GTA synthesis) to assign global condition features!')
260 | else:
261 | global_condition_features = None
262 |
263 | return (input_data, local_condition_features, global_condition_features, len(input_data))
264 |
265 |
266 | def _prepare_batch(self, batches):
267 | assert 0 == len(batches) % self._hparams.wavenet_num_gpus
268 | size_per_device = int(len(batches) / self._hparams.wavenet_num_gpus)
269 | np.random.shuffle(batches)
270 |
271 | #Limit time steps to save GPU Memory usage
272 | max_time_steps = self._limit_time()
273 | #Adjust time resolution for upsampling
274 | batches = self._adjust_time_resolution(batches, self.local_condition, max_time_steps)
275 |
276 | #time lengths
277 | input_lengths = np.asarray([len(x[0]) for x in batches], np.int32)
278 | max_input_length = max(input_lengths)
279 |
280 | #Since all inputs/targets will have the same lengths for all GPUs, we can simply treat all GPUs batches as one big batch and stack all data. (fixed length)
281 | inputs = self._prepare_inputs([x[0] for x in batches], max_input_length)
282 | targets = self._prepare_targets([x[0] for x in batches], max_input_length)
283 | local_condition_features = self._prepare_local_conditions(self.local_condition, [x[1] for x in batches])
284 | global_condition_features = self._prepare_global_conditions(self.global_condition, [x[2] for x in batches])
285 |
286 | #Create final batches
287 | new_batches = (inputs, targets, input_lengths)
288 | if local_condition_features is not None:
289 | new_batches += (local_condition_features, )
290 | if global_condition_features is not None:
291 | new_batches += (global_condition_features, )
292 |
293 | return new_batches
294 |
295 | def _prepare_inputs(self, inputs, maxlen):
296 | if is_mulaw_quantize(self._hparams.input_type):
297 | #[batch_size, time_steps, quantize_channels]
298 | x_batch = np.stack([_pad_inputs(np_utils.to_categorical(
299 | x, num_classes=self._hparams.quantize_channels), maxlen) for x in inputs]).astype(np.float32)
300 | else:
301 | #[batch_size, time_steps, 1]
302 | x_batch = np.stack([_pad_inputs(x.reshape(-1, 1), maxlen) for x in inputs]).astype(np.float32)
303 | assert len(x_batch.shape) == 3
304 | #Convert to channels first [batch_size, quantize_channels (or 1), time_steps]
305 | x_batch = np.transpose(x_batch, (0, 2, 1))
306 | return x_batch
307 |
308 | def _prepare_targets(self, targets, maxlen):
309 | #[batch_size, time_steps]
310 | if is_mulaw_quantize(self._hparams.input_type):
311 | y_batch = np.stack([_pad_targets(x, maxlen) for x in targets]).astype(np.int32)
312 | else:
313 | y_batch = np.stack([_pad_targets(x, maxlen) for x in targets]).astype(np.float32)
314 | assert len(y_batch.shape) == 2
315 | #Add extra axis (make 3 dimension)
316 | y_batch = np.expand_dims(y_batch, axis=-1)
317 | return y_batch
318 |
319 | def _prepare_local_conditions(self, local_condition, c_features):
320 | if local_condition:
321 | maxlen = max([len(x) for x in c_features])
322 | #[-max, max] or [0,max]
323 | T2_output_range = (-self._hparams.max_abs_value, self._hparams.max_abs_value) if self._hparams.symmetric_mels else (0, self._hparams.max_abs_value)
324 |
325 | if self._hparams.clip_for_wavenet:
326 | c_features = [np.clip(x, T2_output_range[0], T2_output_range[1]) for x in c_features]
327 |
328 | c_batch = np.stack([_pad_inputs(x, maxlen, _pad=T2_output_range[0]) for x in c_features]).astype(np.float32)
329 | assert len(c_batch.shape) == 3
330 | #[batch_size, c_channels, time_steps]
331 | c_batch = np.transpose(c_batch, (0, 2, 1))
332 |
333 | if self._hparams.normalize_for_wavenet:
334 | #rerange to [0, 1]
335 | c_batch = _interp(c_batch, T2_output_range).astype(np.float32)
336 |
337 | else:
338 | c_batch = None
339 |
340 | return c_batch
341 |
342 | def _prepare_global_conditions(self, global_condition, g_features):
343 | if global_condition:
344 | g_batch = np.array(g_features).astype(np.int32).reshape(-1, 1)
345 |
346 | else:
347 | g_batch = None
348 |
349 | return g_batch
350 |
351 | def _check_conditions(self):
352 | local_condition = self._hparams.cin_channels > 0
353 | global_condition = self._hparams.gin_channels > 0
354 | return local_condition, global_condition
355 |
356 | def _limit_time(self):
357 | '''Limit time resolution to save GPU memory.
358 | '''
359 | if self._hparams.max_time_sec is not None:
360 | return int(self._hparams.max_time_sec * self._hparams.sample_rate)
361 |
362 | elif self._hparams.max_time_steps is not None:
363 | return self._hparams.max_time_steps
364 |
365 | else:
366 | return None
367 |
368 | def _adjust_time_resolution(self, batch, local_condition, max_time_steps):
369 | '''Adjust time resolution between audio and local condition
370 | '''
371 | if local_condition:
372 | new_batch = []
373 | for b in batch:
374 | x, c, g, l = b
375 | self._assert_ready_for_upsample(x, c)
376 | if max_time_steps is not None:
377 | max_steps = _ensure_divisible(max_time_steps, audio.get_hop_size(self._hparams), True)
378 | if len(x) > max_time_steps:
379 | max_time_frames = max_steps // audio.get_hop_size(self._hparams)
380 | start = np.random.randint(0, len(c) - max_time_frames)
381 | time_start = start * audio.get_hop_size(self._hparams)
382 | x = x[time_start: time_start + max_time_frames * audio.get_hop_size(self._hparams)]
383 | c = c[start: start + max_time_frames, :]
384 | self._assert_ready_for_upsample(x, c)
385 |
386 | new_batch.append((x, c, g, l))
387 | return new_batch
388 |
389 | else:
390 | new_batch = []
391 | for b in batch:
392 | x, c, g, l = b
393 | 				x = audio.trim_silence(x, self._hparams)
394 | if max_time_steps is not None and len(x) > max_time_steps:
395 | 					start = np.random.randint(0, len(x) - max_time_steps) #no local conditions here (c is None), so index on the audio length
396 | x = x[start: start + max_time_steps]
397 | new_batch.append((x, c, g, l))
398 | return new_batch
399 |
400 | def _assert_ready_for_upsample(self, x, c):
401 | assert len(x) % len(c) == 0 and len(x) // len(c) == audio.get_hop_size(self._hparams)
402 |
403 |
404 | def _pad_inputs(x, maxlen, _pad=0):
405 | return np.pad(x, [(0, maxlen - len(x)), (0, 0)], mode='constant', constant_values=_pad)
406 |
407 | def _pad_targets(x, maxlen, _pad=0):
408 | return np.pad(x, (0, maxlen - len(x)), mode='constant', constant_values=_pad)
409 |
410 | def _round_up(x, multiple):
411 | remainder = x % multiple
412 | return x if remainder == 0 else x + multiple - remainder
413 |
414 | def _round_down(x, multiple):
415 | remainder = x % multiple
416 | return x if remainder == 0 else x - remainder
417 |
418 | def _ensure_divisible(length, divisible_by=256, lower=True):
419 | if length % divisible_by == 0:
420 | return length
421 | if lower:
422 | return length - length % divisible_by
423 | else:
424 | return length + (divisible_by - length % divisible_by)
425 |
426 | def _interp(feats, in_range):
427 | #rescales from [-max, max] (or [0, max]) to [0, 1]
428 | return (feats - in_range[0]) / (in_range[1] - in_range[0])
429 |
--------------------------------------------------------------------------------
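The module-level helpers at the bottom of the feeder drive the test-set sizing, hop-size alignment and mel rescaling used above. A few illustrative sanity checks, assuming the module is importable as wavenet_vocoder.feeder (the values are made up):

# Illustrative checks for the feeder helpers; not part of the training pipeline.
import numpy as np
from wavenet_vocoder.feeder import _round_down, _ensure_divisible, _pad_targets, _interp

print(_round_down(37, 8))                            #32: test set size floored to a multiple of the batch size
print(_ensure_divisible(1000, 256, lower=True))      #768: max_time_steps floored to a multiple of the hop size
print(_pad_targets(np.ones(3), 5))                   #[1. 1. 1. 0. 0.]: right-pad targets to the batch max length
print(_interp(np.array([-4., 0., 4.]), (-4., 4.)))   #[0.  0.5 1. ]: rerange symmetric mels to [0, 1]
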
/wavenet_vocoder/models/__init__.py:
--------------------------------------------------------------------------------
1 | from .wavenet import WaveNet
2 | from warnings import warn
3 | from wavenet_vocoder.util import is_mulaw_quantize
4 |
5 | def create_model(name, hparams, init=False):
6 | if is_mulaw_quantize(hparams.input_type):
7 | if hparams.out_channels != hparams.quantize_channels:
8 | raise RuntimeError(
9 | 				"out_channels must equal quantize_channels if input_type is 'mulaw-quantize'")
10 |
11 | if name == 'WaveNet':
12 | return WaveNet(hparams, init)
13 | else:
14 | 		raise Exception('Unknown model: {}'.format(name))
15 |
--------------------------------------------------------------------------------
/wavenet_vocoder/models/gaussian.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 |
4 |
5 | def gaussian_maximum_likelihood_estimation_loss(y_hat, y, log_scale_min_gauss, num_classes, use_cdf=True, reduce=True):
6 | '''compute the gaussian MLE loss'''
7 | with tf.control_dependencies([tf.assert_equal(tf.shape(y_hat)[1], 2), tf.assert_equal(tf.rank(y_hat), 3)]):
8 | #[batch_size, time_steps, channels]
9 | y_hat = tf.transpose(y_hat, [0, 2, 1])
10 |
11 | #Unpack parameters: mean and log_scale outputs
12 | mean = y_hat[:, :, 0]
13 | log_scale = tf.maximum(y_hat[:, :, 1], log_scale_min_gauss)
14 | y = tf.squeeze(y, [-1])
15 |
16 | if use_cdf:
17 | #Compute log_probs using CDF trick (Normalized loss value and more stable training than with natural log prob)
18 | #Instantiate a Normal distribution with model outputs
19 | gaussian = tf.contrib.distributions.Normal(loc=mean, scale=tf.exp(log_scale))
20 |
21 | #Draw CDF+ and CDF- neighbors to the true sample y
22 | cdf_plus = gaussian.cdf(y + 1. / (num_classes - 1))
23 | cdf_min = gaussian.cdf(y - 1. / (num_classes - 1))
24 |
25 | #Maximize the difference between CDF+ and CDF- (or its log)
26 | log_prob = tf.log(tf.maximum(cdf_plus - cdf_min, 1e-12))
27 |
28 | else:
29 | #Get log probability of each sample under this distribution in a computationally stable fashion
30 | #This is the log(PDF)
31 | log_prob = -0.5 * (np.log(2. * np.pi) + 2. * log_scale + tf.square(y - mean) * tf.exp(-2. * log_scale))
32 |
33 | #Loss (Maximize log probability by minimizing its negative)
34 | if reduce:
35 | return -tf.reduce_sum(log_prob)
36 | else:
37 | return -tf.expand_dims(log_prob, [-1])
38 |
39 | def sample_from_gaussian(y, log_scale_min_gauss):
40 | '''sample from learned gaussian distribution'''
41 | with tf.control_dependencies([tf.assert_equal(tf.shape(y)[1], 2)]):
42 | #[batch_size, time_length, channels]
43 | y = tf.transpose(y, [0, 2, 1])
44 |
45 | mean = y[:, :, 0]
46 | log_scale = tf.maximum(y[:, :, 1], log_scale_min_gauss)
47 | scale = tf.exp(log_scale)
48 |
49 | gaussian_dist = tf.contrib.distributions.Normal(loc=mean, scale=scale, allow_nan_stats=False)
50 | x = gaussian_dist.sample()
51 |
52 | return tf.minimum(tf.maximum(x, -1.), 1.)
53 |
--------------------------------------------------------------------------------
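The use_cdf branch above scores each sample by the probability mass its quantization bin receives under the predicted Gaussian (a CDF difference) instead of the raw log-density. Below is a NumPy sketch of the same computation for a single scalar sample; num_classes=2**16 is an assumed quantization depth for the example, not a value taken from hparams.

# NumPy sketch of the CDF-difference negative log-likelihood (scalar case, illustrative values).
import numpy as np
from scipy.stats import norm

def gaussian_mle_nll(y, mean, log_scale, num_classes=2**16):
	half_bin = 1. / (num_classes - 1)
	cdf_plus = norm.cdf(y + half_bin, loc=mean, scale=np.exp(log_scale))
	cdf_min = norm.cdf(y - half_bin, loc=mean, scale=np.exp(log_scale))
	return -np.log(np.maximum(cdf_plus - cdf_min, 1e-12))

print(gaussian_mle_nll(y=0.1, mean=0.09, log_scale=-5.))
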
/wavenet_vocoder/models/mixture.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 |
4 |
5 | def log_sum_exp(x):
6 | """ numerically stable log_sum_exp implementation that prevents overflow """
7 | axis = len(x.get_shape())-1
8 | m = tf.reduce_max(x, axis)
9 | m2 = tf.reduce_max(x, axis, keepdims=True)
10 | return m + tf.log(tf.reduce_sum(tf.exp(x-m2), axis))
11 |
12 | def log_prob_from_logits(x):
13 | """ numerically stable log_softmax implementation that prevents overflow """
14 | axis = len(x.get_shape())-1
15 | m = tf.reduce_max(x, axis, keepdims=True)
16 | return x - m - tf.log(tf.reduce_sum(tf.exp(x-m), axis, keepdims=True))
17 |
18 | def discretized_mix_logistic_loss(y_hat, y, num_classes=256,
19 | log_scale_min=-7.0, reduce=True):
20 | '''Discretized mix of logistic distributions loss.
21 |
22 | Note that it is assumed that input is scaled to [-1, 1]
23 |
24 | Args:
25 | y_hat: Tensor [batch_size, channels, time_length], predicted output.
26 | y: Tensor [batch_size, time_length, 1], Target.
27 | Returns:
28 | Tensor loss
29 | '''
30 | with tf.control_dependencies([tf.assert_equal(tf.mod(tf.shape(y_hat)[1], 3), 0), tf.assert_equal(tf.rank(y_hat), 3)]):
31 | nr_mix = tf.shape(y_hat)[1] // 3
32 |
33 | #[Batch_size, time_length, channels]
34 | y_hat = tf.transpose(y_hat, [0, 2, 1])
35 |
36 | #unpack parameters. [batch_size, time_length, num_mixtures] x 3
37 | logit_probs = y_hat[:, :, :nr_mix]
38 | means = y_hat[:, :, nr_mix:2 * nr_mix]
39 | log_scales = tf.maximum(y_hat[:, :, 2* nr_mix: 3 * nr_mix], log_scale_min)
40 |
41 | #[batch_size, time_length, 1] -> [batch_size, time_length, num_mixtures]
42 | y = y * tf.ones(shape=[1, 1, nr_mix], dtype=tf.float32)
43 |
44 | centered_y = y - means
45 | inv_stdv = tf.exp(-log_scales)
46 | plus_in = inv_stdv * (centered_y + 1. / (num_classes - 1))
47 | cdf_plus = tf.nn.sigmoid(plus_in)
48 | min_in = inv_stdv * (centered_y - 1. / (num_classes - 1))
49 | cdf_min = tf.nn.sigmoid(min_in)
50 |
51 | log_cdf_plus = plus_in - tf.nn.softplus(plus_in) # log probability for edge case of 0 (before scaling)
52 | log_one_minus_cdf_min = -tf.nn.softplus(min_in) # log probability for edge case of 255 (before scaling)
53 |
54 | #probability for all other cases
55 | cdf_delta = cdf_plus - cdf_min
56 |
57 | mid_in = inv_stdv * centered_y
58 | #log probability in the center of the bin, to be used in extreme cases
59 | #(not actually used in this code)
60 | log_pdf_mid = mid_in - log_scales - 2. * tf.nn.softplus(mid_in)
61 |
62 | log_probs = tf.where(y < -0.999, log_cdf_plus,
63 | tf.where(y > 0.999, log_one_minus_cdf_min,
64 | tf.where(cdf_delta > 1e-5,
65 | tf.log(tf.maximum(cdf_delta, 1e-12)),
66 | log_pdf_mid - np.log((num_classes - 1) / 2))))
67 |
68 | #log_probs = log_probs + tf.nn.log_softmax(logit_probs, -1)
69 | log_probs = log_probs + log_prob_from_logits(logit_probs)
70 |
71 | if reduce:
72 | return -tf.reduce_sum(log_sum_exp(log_probs))
73 | else:
74 | return -tf.expand_dims(log_sum_exp(log_probs), [-1])
75 |
76 | def sample_from_discretized_mix_logistic(y, log_scale_min=-7.):
77 | '''
78 | Args:
79 | y: Tensor, [batch_size, channels, time_length]
80 | Returns:
81 | Tensor: sample in range of [-1, 1]
82 | '''
83 | with tf.control_dependencies([tf.assert_equal(tf.mod(tf.shape(y)[1], 3), 0)]):
84 | nr_mix = tf.shape(y)[1] // 3
85 |
86 | #[batch_size, time_length, channels]
87 | y = tf.transpose(y, [0, 2, 1])
88 | logit_probs = y[:, :, :nr_mix]
89 |
90 | #sample mixture indicator from softmax
91 | temp = tf.random_uniform(tf.shape(logit_probs), minval=1e-5, maxval=1. - 1e-5)
92 | temp = logit_probs - tf.log(-tf.log(temp))
93 | argmax = tf.argmax(temp, -1)
94 |
95 | #[batch_size, time_length] -> [batch_size, time_length, nr_mix]
96 | one_hot = tf.one_hot(argmax, depth=nr_mix, dtype=tf.float32)
97 | #select logistic parameters
98 | means = tf.reduce_sum(y[:, :, nr_mix:2 * nr_mix] * one_hot, axis=-1)
99 | log_scales = tf.maximum(tf.reduce_sum(
100 | y[:, :, 2 * nr_mix:3 * nr_mix] * one_hot, axis=-1), log_scale_min)
101 |
102 | #sample from logistic & clip to interval
103 | #we don't actually round to the nearest 8-bit value when sampling
104 | u = tf.random_uniform(tf.shape(means), minval=1e-5, maxval=1. - 1e-5)
105 | 	x = means + tf.exp(log_scales) * (tf.log(u) - tf.log(1. - u))
106 |
107 | return tf.minimum(tf.maximum(x, -1.), 1.)
108 |
--------------------------------------------------------------------------------
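sample_from_discretized_mix_logistic picks a mixture component with the Gumbel-max trick, then draws from the chosen logistic by an inverse-CDF transform. The scalar NumPy equivalent of that last step (with illustrative parameter values):

# NumPy sketch mirroring x = means + exp(log_scales) * (log(u) - log(1 - u)) above.
import numpy as np

mean, log_scale = 0.0, -3.0                    #illustrative component parameters
u = np.random.uniform(1e-5, 1. - 1e-5)         #uniform noise kept away from {0, 1}
x = mean + np.exp(log_scale) * (np.log(u) - np.log(1. - u))
x = np.clip(x, -1., 1.)                        #keep the sample in the waveform range
print(x)
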
/wavenet_vocoder/synthesize.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 |
4 | import numpy as np
5 | import tensorflow as tf
6 | from hparams import hparams, hparams_debug_string
7 | from infolog import log
8 | from tqdm import tqdm
9 | from wavenet_vocoder.synthesizer import Synthesizer
10 |
11 |
12 | def run_synthesis(args, checkpoint_path, output_dir, hparams):
13 | log_dir = os.path.join(output_dir, 'plots')
14 | wav_dir = os.path.join(output_dir, 'wavs')
15 |
16 | #We suppose user will provide correct folder depending on training method
17 | log(hparams_debug_string())
18 | synth = Synthesizer()
19 | synth.load(checkpoint_path, hparams)
20 |
21 | if args.model == 'Tacotron-2':
22 | #If running all Tacotron-2, synthesize audio from evaluated mels
23 | metadata_filename = os.path.join(args.mels_dir, 'map.txt')
24 | with open(metadata_filename, encoding='utf-8') as f:
25 | metadata = np.array([line.strip().split('|') for line in f])
26 |
27 | speaker_ids = metadata[:, 2]
28 | mel_files = metadata[:, 1]
29 | texts = metadata[:, 0]
30 |
31 | speaker_ids = None if (speaker_ids == '').all() else speaker_ids
32 | else:
33 | #else Get all npy files in input_dir (supposing they are mels)
34 | mel_files = sorted([os.path.join(args.mels_dir, f) for f in os.listdir(args.mels_dir) if f.split('.')[-1] == 'npy'])
35 | speaker_ids = None if args.speaker_id is None else args.speaker_id.replace(' ', '').split(',')
36 | if speaker_ids is not None:
37 | assert len(speaker_ids) == len(mel_files)
38 |
39 | texts = None
40 |
41 | log('Starting synthesis! (this will take a while..)')
42 | os.makedirs(log_dir, exist_ok=True)
43 | os.makedirs(wav_dir, exist_ok=True)
44 |
45 | mel_files = [mel_files[i: i+hparams.wavenet_synthesis_batch_size] for i in range(0, len(mel_files), hparams.wavenet_synthesis_batch_size)]
46 | speaker_ids = None if speaker_ids is None else [speaker_ids[i: i+hparams.wavenet_synthesis_batch_size] for i in range(0, len(speaker_ids), hparams.wavenet_synthesis_batch_size)]
47 | texts = None if texts is None else [texts[i: i+hparams.wavenet_synthesis_batch_size] for i in range(0, len(texts), hparams.wavenet_synthesis_batch_size)]
48 |
49 | with open(os.path.join(wav_dir, 'map.txt'), 'w') as file:
50 | for i, mel_batch in enumerate(tqdm(mel_files)):
51 | mel_spectros = [np.load(mel_file) for mel_file in mel_batch]
52 |
53 | basenames = [os.path.basename(mel_file).replace('.npy', '') for mel_file in mel_batch]
54 | speaker_id_batch = None if speaker_ids is None else speaker_ids[i]
55 | audio_files = synth.synthesize(mel_spectros, speaker_id_batch, basenames, wav_dir, log_dir)
56 |
57 | speaker_logs = [''] * len(mel_batch) if speaker_id_batch is None else speaker_id_batch
58 |
59 | for j, mel_file in enumerate(mel_batch):
60 | if texts is None:
61 | 					file.write('{}|{}|{}\n'.format(mel_file, audio_files[j], speaker_logs[j]))
62 | else:
63 | 					file.write('{}|{}|{}|{}\n'.format(texts[i][j], mel_file, audio_files[j], speaker_logs[j]))
64 |
65 | log('synthesized audio waveforms at {}'.format(wav_dir))
66 |
67 |
68 |
69 | def wavenet_synthesize(args, hparams, checkpoint):
70 | output_dir = 'wavenet_' + args.output_dir
71 |
72 | try:
73 | checkpoint_path = tf.train.get_checkpoint_state(checkpoint).model_checkpoint_path
74 | log('loaded model at {}'.format(checkpoint_path))
75 | 	except Exception:
76 | raise RuntimeError('Failed to load checkpoint at {}'.format(checkpoint))
77 |
78 | run_synthesis(args, checkpoint_path, output_dir, hparams)
79 |
--------------------------------------------------------------------------------
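run_synthesis slices the mel file list (and the optional speaker ids and texts) into fixed-size groups before calling the synthesizer. A toy illustration of that grouping, with batch_size standing in for hparams.wavenet_synthesis_batch_size:

# Toy illustration of the batching comprehension used in run_synthesis.
batch_size = 3                                 #stand-in for hparams.wavenet_synthesis_batch_size
mel_files = ['mel-{:02d}.npy'.format(i) for i in range(7)]
batches = [mel_files[i: i + batch_size] for i in range(0, len(mel_files), batch_size)]
print([len(b) for b in batches])               #[3, 3, 1] -> the trailing batch is simply smaller
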
/wavenet_vocoder/synthesizer.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import numpy as np
4 | import tensorflow as tf
5 | from datasets.audio import save_wavenet_wav, get_hop_size, melspectrogram
6 | from infolog import log
7 | from wavenet_vocoder.models import create_model
8 | from wavenet_vocoder.train import create_shadow_saver, load_averaged_model
9 | from wavenet_vocoder.feeder import _interp
10 |
11 | from . import util
12 |
13 |
14 | class Synthesizer:
15 | def load(self, checkpoint_path, hparams, model_name='WaveNet'):
16 | log('Constructing model: {}'.format(model_name))
17 | self._hparams = hparams
18 | local_cond, global_cond = self._check_conditions()
19 |
20 | self.local_conditions = tf.placeholder(tf.float32, shape=(None, None, hparams.num_mels), name='local_condition_features') if local_cond else None
21 | self.global_conditions = tf.placeholder(tf.int32, shape=(None, 1), name='global_condition_features') if global_cond else None
22 | self.synthesis_length = tf.placeholder(tf.int32, shape=(), name='synthesis_length') if not local_cond else None
23 | self.targets = tf.placeholder(tf.float32, shape=(1, None, 1), name='audio_targets') if hparams.wavenet_synth_debug else None #Debug only with 1 wav
24 | self.input_lengths = tf.placeholder(tf.int32, shape=(1, ), name='input_lengths') if hparams.wavenet_synth_debug else None
25 | self.synth_debug = hparams.wavenet_synth_debug
26 |
27 | with tf.variable_scope('WaveNet_model') as scope:
28 | self.model = create_model(model_name, hparams)
29 | self.model.initialize(y=None, c=self.local_conditions, g=self.global_conditions,
30 | input_lengths=self.input_lengths, synthesis_length=self.synthesis_length, test_inputs=self.targets)
31 |
32 | self._hparams = hparams
33 | sh_saver = create_shadow_saver(self.model)
34 |
35 | log('Loading checkpoint: {}'.format(checkpoint_path))
36 | #Memory allocation on the GPU as needed
37 | config = tf.ConfigProto()
38 | config.gpu_options.allow_growth = True
39 | config.allow_soft_placement = True
40 |
41 | self.session = tf.Session(config=config)
42 | self.session.run(tf.global_variables_initializer())
43 |
44 | load_averaged_model(self.session, sh_saver, checkpoint_path)
45 |
46 | def synthesize(self, mel_spectrograms, speaker_ids, basenames, out_dir, log_dir):
47 | hparams = self._hparams
48 | local_cond, global_cond = self._check_conditions()
49 |
50 | #Switch mels in case of debug
51 | if self.synth_debug:
52 | assert len(hparams.wavenet_debug_mels) == len(hparams.wavenet_debug_wavs)
53 | mel_spectrograms = [np.load(mel_file) for mel_file in hparams.wavenet_debug_mels]
54 |
55 | #Get True length of audio to be synthesized: audio_len = mel_len * hop_size
56 | audio_lengths = [len(x) * get_hop_size(self._hparams) for x in mel_spectrograms]
57 |
58 | #Prepare local condition batch
59 | maxlen = max([len(x) for x in mel_spectrograms])
60 | #[-max, max] or [0,max]
61 | T2_output_range = (-self._hparams.max_abs_value, self._hparams.max_abs_value) if self._hparams.symmetric_mels else (0, self._hparams.max_abs_value)
62 |
63 | if self._hparams.clip_for_wavenet:
64 | mel_spectrograms = [np.clip(x, T2_output_range[0], T2_output_range[1]) for x in mel_spectrograms]
65 |
66 | c_batch = np.stack([_pad_inputs(x, maxlen, _pad=T2_output_range[0]) for x in mel_spectrograms]).astype(np.float32)
67 |
68 | if self._hparams.normalize_for_wavenet:
69 | #rerange to [0, 1]
70 | c_batch = _interp(c_batch, T2_output_range).astype(np.float32)
71 |
72 | g = None if speaker_ids is None else np.asarray(speaker_ids, dtype=np.int32).reshape(len(c_batch), 1)
73 | feed_dict = {}
74 |
75 | if local_cond:
76 | feed_dict[self.local_conditions] = c_batch
77 | else:
78 | feed_dict[self.synthesis_length] = 100
79 |
80 | if global_cond:
81 | feed_dict[self.global_conditions] = g
82 |
83 | if self.synth_debug:
84 | debug_wavs = hparams.wavenet_debug_wavs
85 | assert len(debug_wavs) % hparams.wavenet_num_gpus == 0
86 | test_wavs = [np.load(debug_wav).reshape(-1, 1) for debug_wav in debug_wavs]
87 |
88 | #pad wavs to same length
89 | max_test_len = max([len(x) for x in test_wavs])
90 | test_wavs = np.stack([_pad_inputs(x, max_test_len) for x in test_wavs]).astype(np.float32)
91 |
92 | assert len(test_wavs) == len(debug_wavs)
93 | feed_dict[self.targets] = test_wavs.reshape(len(test_wavs), max_test_len, 1)
94 | feed_dict[self.input_lengths] = np.asarray([test_wavs.shape[1]])
95 |
96 | #Generate wavs and clip extra padding to select Real speech parts
97 | generated_wavs, upsampled_features = self.session.run([self.model.tower_y_hat, self.model.tower_synth_upsampled_local_features], feed_dict=feed_dict)
98 |
99 | #Linearize outputs (n_gpus -> 1D)
100 | generated_wavs = [wav for gpu_wavs in generated_wavs for wav in gpu_wavs]
101 | upsampled_features = [feat for gpu_feats in upsampled_features for feat in gpu_feats]
102 |
103 | generated_wavs = [generated_wav[:length] for generated_wav, length in zip(generated_wavs, audio_lengths)]
104 | upsampled_features = [upsampled_feature[:, :length] for upsampled_feature, length in zip(upsampled_features, audio_lengths)]
105 |
106 | audio_filenames = []
107 | for i, (generated_wav, input_mel, upsampled_feature) in enumerate(zip(generated_wavs, mel_spectrograms, upsampled_features)):
108 | #Save wav to disk
109 | audio_filename = os.path.join(out_dir, 'wavenet-audio-{}.wav'.format(basenames[i]))
110 | save_wavenet_wav(generated_wav, audio_filename, sr=hparams.sample_rate, inv_preemphasize=hparams.preemphasize, k=hparams.preemphasis)
111 | audio_filenames.append(audio_filename)
112 |
113 | #Compare generated wav mel with original input mel to evaluate wavenet audio reconstruction performance
114 | #Both mels should match on low frequency information, wavenet mel should contain more high frequency detail when compared to Tacotron mels.
115 | generated_mel = melspectrogram(generated_wav, hparams).T
116 | util.plot_spectrogram(generated_mel, os.path.join(log_dir, 'wavenet-mel-spectrogram-{}.png'.format(basenames[i])),
117 | title='Local Condition vs Reconstructed Audio Mel-Spectrogram analysis', target_spectrogram=input_mel)
118 | #Save upsampled features to visualize checkerboard artifacts.
119 | util.plot_spectrogram(upsampled_feature.T, os.path.join(log_dir, 'wavenet-upsampled_features-{}.png'.format(basenames[i])),
120 | 				title='Upsampled Local Condition features', auto_aspect=True)
121 |
122 | #Save waveplot to disk
123 | if log_dir is not None:
124 | plot_filename = os.path.join(log_dir, 'wavenet-waveplot-{}.png'.format(basenames[i]))
125 | util.waveplot(plot_filename, generated_wav, None, hparams, title='WaveNet generated Waveform.')
126 |
127 | return audio_filenames
128 |
129 | def _check_conditions(self):
130 | local_condition = self._hparams.cin_channels > 0
131 | global_condition = self._hparams.gin_channels > 0
132 | return local_condition, global_condition
133 |
134 |
135 | def _pad_inputs(x, maxlen, _pad=0):
136 | return np.pad(x, [(0, maxlen - len(x)), (0, 0)], mode='constant', constant_values=_pad)
137 |
--------------------------------------------------------------------------------
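Both the feeder and this synthesizer prepare local conditions the same way: clip the Tacotron mels to the training output range, pad every spectrogram to the batch maximum, then optionally rescale to [0, 1]. A NumPy sketch of that pipeline follows; max_abs_value=4 and symmetric mels are assumed hparams settings, and the spectrograms are random stand-ins.

# NumPy sketch of the local-condition preparation (clip -> pad -> rerange).
import numpy as np

max_abs_value = 4.                                         #assumed hparams.max_abs_value
out_range = (-max_abs_value, max_abs_value)                #symmetric_mels case
mels = [np.random.uniform(-5., 5., (200, 80)),             #fake [frames, num_mels] spectrograms
        np.random.uniform(-5., 5., (150, 80))]

maxlen = max(len(m) for m in mels)
mels = [np.clip(m, *out_range) for m in mels]              #clip_for_wavenet
c_batch = np.stack([np.pad(m, [(0, maxlen - len(m)), (0, 0)], mode='constant',
                           constant_values=out_range[0]) for m in mels])
c_batch = (c_batch - out_range[0]) / (out_range[1] - out_range[0])   #normalize_for_wavenet
print(c_batch.shape, c_batch.min(), c_batch.max())         #(2, 200, 80) 0.0 1.0
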
/wavenet_vocoder/train.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import sys
4 | import time
5 | import traceback
6 | from datetime import datetime
7 |
8 | import infolog
9 | import librosa
10 | import numpy as np
11 | import tensorflow as tf
12 | from hparams import hparams_debug_string
13 | from datasets.audio import save_wavenet_wav, melspectrogram
14 | from tacotron.utils import ValueWindow
15 | from wavenet_vocoder.feeder import Feeder, _interp
16 | from wavenet_vocoder.models import create_model
17 |
18 | from . import util
19 |
20 | log = infolog.log
21 |
22 |
23 | def time_string():
24 | return datetime.now().strftime('%Y-%m-%d %H:%M')
25 |
26 | def add_embedding_stats(summary_writer, embedding_names, paths_to_meta, checkpoint_path):
27 | #Create tensorboard projector
28 | config = tf.contrib.tensorboard.plugins.projector.ProjectorConfig()
29 | config.model_checkpoint_path = checkpoint_path
30 |
31 | for embedding_name, path_to_meta in zip(embedding_names, paths_to_meta):
32 | #Initialize config
33 | embedding = config.embeddings.add()
34 | 		#Specify the embedding variable and the metadata
35 | embedding.tensor_name = embedding_name
36 | embedding.metadata_path = path_to_meta
37 |
38 | 	#Project the embeddings to a lower dimensional space for visualization
39 | tf.contrib.tensorboard.plugins.projector.visualize_embeddings(summary_writer, config)
40 |
41 | def add_train_stats(model, hparams):
42 | with tf.variable_scope('stats') as scope:
43 | for i in range(hparams.wavenet_num_gpus):
44 | tf.summary.histogram('wav_outputs %d' % i, model.tower_y_hat_log[i])
45 | tf.summary.histogram('wav_targets %d' % i, model.tower_y_log[i])
46 | if model.tower_means[i] is not None:
47 | tf.summary.histogram('gaussian_means %d' % i, model.tower_means[i])
48 | tf.summary.histogram('gaussian_log_scales %d' % i, model.tower_log_scales[i])
49 |
50 | tf.summary.scalar('wavenet_learning_rate', model.learning_rate)
51 | tf.summary.scalar('wavenet_loss', model.loss)
52 |
53 | gradient_norms = [tf.norm(grad) for grad in model.gradients if grad is not None]
54 | tf.summary.histogram('gradient_norm', gradient_norms)
55 | tf.summary.scalar('max_gradient_norm', tf.reduce_max(gradient_norms)) #visualize gradients (in case of explosion)
56 | return tf.summary.merge_all()
57 |
58 | def add_test_stats(summary_writer, step, eval_loss, hparams):
59 | values = [
60 | tf.Summary.Value(tag='Wavenet_eval_model/eval_stats/wavenet_eval_loss', simple_value=eval_loss),
61 | ]
62 |
63 | test_summary = tf.Summary(value=values)
64 | summary_writer.add_summary(test_summary, step)
65 |
66 |
67 | def create_shadow_saver(model, global_step=None):
68 | '''Load shadow variables of saved model.
69 |
70 | Inspired by: https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage
71 |
72 | Can also use: shadow_dict = model.ema.variables_to_restore()
73 | '''
74 | #Add global step to saved variables to save checkpoints correctly
75 | shadow_variables = [model.ema.average_name(v) for v in model.variables]
76 | variables = model.variables
77 |
78 | if global_step is not None:
79 | shadow_variables += ['global_step']
80 | variables += [global_step]
81 |
82 | shadow_dict = dict(zip(shadow_variables, variables)) #dict(zip(keys, values)) -> {key1: value1, key2: value2, ...}
83 | return tf.train.Saver(shadow_dict, max_to_keep=20)
84 |
85 | def load_averaged_model(sess, sh_saver, checkpoint_path):
86 | sh_saver.restore(sess, checkpoint_path)
87 |
88 |
89 | def eval_step(sess, global_step, model, plot_dir, wav_dir, summary_writer, hparams, model_name):
90 | '''Evaluate model during training.
91 | Supposes that model variables are averaged.
92 | '''
93 | start_time = time.time()
94 | y_hat, y_target, loss, input_mel, upsampled_features = sess.run([model.tower_y_hat[0], model.tower_y_target[0],
95 | model.eval_loss, model.tower_eval_c[0], model.tower_eval_upsampled_local_features[0]])
96 | duration = time.time() - start_time
97 | log('Time Evaluation: Generation of {} audio frames took {:.3f} sec ({:.3f} frames/sec)'.format(
98 | len(y_target), duration, len(y_target)/duration))
99 |
100 | #Make audio and plot paths
101 | pred_wav_path = os.path.join(wav_dir, 'step-{}-pred.wav'.format(global_step))
102 | target_wav_path = os.path.join(wav_dir, 'step-{}-real.wav'.format(global_step))
103 | plot_path = os.path.join(plot_dir, 'step-{}-waveplot.png'.format(global_step))
104 | mel_path = os.path.join(plot_dir, 'step-{}-reconstruction-mel-spectrogram.png'.format(global_step))
105 | upsampled_path = os.path.join(plot_dir, 'step-{}-upsampled-features.png'.format(global_step))
106 |
107 | #Save figure
108 | util.waveplot(plot_path, y_hat, y_target, model._hparams, title='{}, {}, step={}, loss={:.5f}'.format(model_name, time_string(), global_step, loss))
109 | log('Eval loss for global step {}: {:.3f}'.format(global_step, loss))
110 |
111 | #Compare generated wav mel with original input mel to evaluate wavenet audio reconstruction performance
112 | #Both mels should match on low frequency information, wavenet mel should contain more high frequency detail when compared to Tacotron mels.
113 | T2_output_range = (-hparams.max_abs_value, hparams.max_abs_value) if hparams.symmetric_mels else (0, hparams.max_abs_value)
114 | generated_mel = _interp(melspectrogram(y_hat, hparams).T, T2_output_range)
115 | util.plot_spectrogram(generated_mel, mel_path, title='Local Condition vs Reconst. Mel-Spectrogram, step={}, loss={:.5f}'.format(
116 | global_step, loss), target_spectrogram=input_mel.T)
117 | util.plot_spectrogram(upsampled_features.T, upsampled_path, title='Upsampled Local Condition features, step={}, loss={:.5f}'.format(
118 | global_step, loss), auto_aspect=True)
119 |
120 | #Save Audio
121 | save_wavenet_wav(y_hat, pred_wav_path, sr=hparams.sample_rate, inv_preemphasize=hparams.preemphasize, k=hparams.preemphasis)
122 | save_wavenet_wav(y_target, target_wav_path, sr=hparams.sample_rate, inv_preemphasize=hparams.preemphasize, k=hparams.preemphasis)
123 |
124 | #Write eval summary to tensorboard
125 | log('Writing eval summary!')
126 | add_test_stats(summary_writer, global_step, loss, hparams=hparams)
127 |
128 | def save_log(sess, global_step, model, plot_dir, wav_dir, hparams, model_name):
129 | log('\nSaving intermediate states at step {}'.format(global_step))
130 | idx = 0
131 | y_hat, y, loss, length, input_mel, upsampled_features = sess.run([model.tower_y_hat_log[0][idx],
132 | model.tower_y_log[0][idx],
133 | model.loss,
134 | model.tower_input_lengths[0][idx],
135 | model.tower_c[0][idx], model.tower_upsampled_local_features[0][idx]])
136 |
137 | #mask by length
138 | y_hat[length:] = 0
139 | y[length:] = 0
140 |
141 | #Make audio and plot paths
142 | pred_wav_path = os.path.join(wav_dir, 'step-{}-pred.wav'.format(global_step))
143 | target_wav_path = os.path.join(wav_dir, 'step-{}-real.wav'.format(global_step))
144 | plot_path = os.path.join(plot_dir, 'step-{}-waveplot.png'.format(global_step))
145 | mel_path = os.path.join(plot_dir, 'step-{}-reconstruction-mel-spectrogram.png'.format(global_step))
146 | upsampled_path = os.path.join(plot_dir, 'step-{}-upsampled-features.png'.format(global_step))
147 |
148 | #Save figure
149 | util.waveplot(plot_path, y_hat, y, hparams, title='{}, {}, step={}, loss={:.5f}'.format(model_name, time_string(), global_step, loss))
150 |
151 | #Compare generated wav mel with original input mel to evaluate wavenet audio reconstruction performance
152 | #Both mels should match on low frequency information, wavenet mel should contain more high frequency detail when compared to Tacotron mels.
153 | T2_output_range = (-hparams.max_abs_value, hparams.max_abs_value) if hparams.symmetric_mels else (0, hparams.max_abs_value)
154 | generated_mel = _interp(melspectrogram(y_hat, hparams).T, T2_output_range)
155 | util.plot_spectrogram(generated_mel, mel_path, title='Local Condition vs Reconst. Mel-Spectrogram, step={}, loss={:.5f}'.format(
156 | global_step, loss), target_spectrogram=input_mel.T)
157 | util.plot_spectrogram(upsampled_features.T, upsampled_path, title='Upsampled Local Condition features, step={}, loss={:.5f}'.format(
158 | global_step, loss), auto_aspect=True)
159 |
160 | #Save audio
161 | save_wavenet_wav(y_hat, pred_wav_path, sr=hparams.sample_rate, inv_preemphasize=hparams.preemphasize, k=hparams.preemphasis)
162 | save_wavenet_wav(y, target_wav_path, sr=hparams.sample_rate, inv_preemphasize=hparams.preemphasize, k=hparams.preemphasis)
163 |
164 | def save_checkpoint(sess, saver, checkpoint_path, global_step):
165 | saver.save(sess, checkpoint_path, global_step=global_step)
166 |
167 |
168 | def model_train_mode(args, feeder, hparams, global_step, init=False):
169 | with tf.variable_scope('WaveNet_model', reuse=tf.AUTO_REUSE) as scope:
170 | model_name = None
171 | if args.model == 'Tacotron-2':
172 | model_name = 'WaveNet'
173 | model = create_model(model_name or args.model, hparams, init)
174 | #initialize model to train mode
175 | model.initialize(feeder.targets, feeder.local_condition_features, feeder.global_condition_features,
176 | feeder.input_lengths, x=feeder.inputs)
177 | model.add_loss()
178 | model.add_optimizer(global_step)
179 | stats = add_train_stats(model, hparams)
180 | return model, stats
181 |
182 | def model_test_mode(args, feeder, hparams, global_step):
183 | with tf.variable_scope('WaveNet_model', reuse=tf.AUTO_REUSE) as scope:
184 | model_name = None
185 | if args.model == 'Tacotron-2':
186 | model_name = 'WaveNet'
187 | model = create_model(model_name or args.model, hparams)
188 | #initialize model to test mode
189 | model.initialize(feeder.eval_targets, feeder.eval_local_condition_features, feeder.eval_global_condition_features,
190 | feeder.eval_input_lengths)
191 | model.add_loss()
192 | return model
193 |
194 | def train(log_dir, args, hparams, input_path):
195 | save_dir = os.path.join(log_dir, 'wave_pretrained')
196 | plot_dir = os.path.join(log_dir, 'plots')
197 | wav_dir = os.path.join(log_dir, 'wavs')
198 | eval_dir = os.path.join(log_dir, 'eval-dir')
199 | eval_plot_dir = os.path.join(eval_dir, 'plots')
200 | eval_wav_dir = os.path.join(eval_dir, 'wavs')
201 | tensorboard_dir = os.path.join(log_dir, 'wavenet_events')
202 | meta_folder = os.path.join(log_dir, 'metas')
203 | os.makedirs(save_dir, exist_ok=True)
204 | os.makedirs(plot_dir, exist_ok=True)
205 | os.makedirs(wav_dir, exist_ok=True)
206 | os.makedirs(eval_dir, exist_ok=True)
207 | os.makedirs(eval_plot_dir, exist_ok=True)
208 | os.makedirs(eval_wav_dir, exist_ok=True)
209 | os.makedirs(tensorboard_dir, exist_ok=True)
210 | os.makedirs(meta_folder, exist_ok=True)
211 |
212 | checkpoint_path = os.path.join(save_dir, 'wavenet_model.ckpt')
213 | input_path = os.path.join(args.base_dir, input_path)
214 |
215 | log('Checkpoint_path: {}'.format(checkpoint_path))
216 | log('Loading training data from: {}'.format(input_path))
217 | log('Using model: {}'.format(args.model))
218 | log(hparams_debug_string())
219 |
220 | #Start by setting a seed for repeatability
221 | tf.set_random_seed(hparams.wavenet_random_seed)
222 |
223 | #Set up data feeder
224 | coord = tf.train.Coordinator()
225 | with tf.variable_scope('datafeeder') as scope:
226 | feeder = Feeder(coord, input_path, args.base_dir, hparams)
227 |
228 | #Set up model
229 | global_step = tf.Variable(0, name='global_step', trainable=False)
230 | model, stats = model_train_mode(args, feeder, hparams, global_step)
231 | eval_model = model_test_mode(args, feeder, hparams, global_step)
232 |
233 | #Speaker Embeddings metadata
234 | if hparams.speakers_path is not None:
235 | speaker_embedding_meta = hparams.speakers_path
236 |
237 | else:
238 | speaker_embedding_meta = os.path.join(meta_folder, 'SpeakerEmbeddings.tsv')
239 | if not os.path.isfile(speaker_embedding_meta):
240 | with open(speaker_embedding_meta, 'w', encoding='utf-8') as f:
241 | for speaker in hparams.speakers:
242 | f.write('{}\n'.format(speaker))
243 |
244 | speaker_embedding_meta = speaker_embedding_meta.replace(log_dir, '..')
245 |
246 | #book keeping
247 | step = 0
248 | time_window = ValueWindow(100)
249 | loss_window = ValueWindow(100)
250 | sh_saver = create_shadow_saver(model, global_step)
251 |
252 | log('Wavenet training set to a maximum of {} steps'.format(args.wavenet_train_steps))
253 |
254 | 	#Memory allocation on the GPU as needed
255 | config = tf.ConfigProto()
256 | config.gpu_options.allow_growth = True
257 | config.allow_soft_placement = True
258 | run_init = False
259 |
260 | #Train
261 | with tf.Session(config=config) as sess:
262 | try:
263 | summary_writer = tf.summary.FileWriter(tensorboard_dir, sess.graph)
264 | sess.run(tf.global_variables_initializer())
265 |
266 | #saved model restoring
267 | if args.restore:
268 | # Restore saved model if the user requested it, default = True
269 | try:
270 | checkpoint_state = tf.train.get_checkpoint_state(save_dir)
271 |
272 | if (checkpoint_state and checkpoint_state.model_checkpoint_path):
273 | log('Loading checkpoint {}'.format(checkpoint_state.model_checkpoint_path), slack=True)
274 | load_averaged_model(sess, sh_saver, checkpoint_state.model_checkpoint_path)
275 | else:
276 | log('No model to load at {}'.format(save_dir), slack=True)
277 | if hparams.wavenet_weight_normalization:
278 | run_init = True
279 |
280 | except tf.errors.OutOfRangeError as e:
281 | log('Cannot restore checkpoint: {}'.format(e), slack=True)
282 | else:
283 | log('Starting new training!', slack=True)
284 | if hparams.wavenet_weight_normalization:
285 | run_init = True
286 |
287 | if run_init:
288 | log('\nApplying Weight normalization in fresh training. Applying data dependent initialization forward pass..')
289 | #Create init_model
290 | init_model, _ = model_train_mode(args, feeder, hparams, global_step, init=True)
291 |
292 | #initializing feeder
293 | feeder.start_threads(sess)
294 |
295 | if run_init:
296 | #Run one forward pass for model parameters initialization (make prediction on init_batch)
297 | _ = sess.run(init_model.tower_y_hat)
298 | log('Data dependent initialization done. Starting training!')
299 |
300 | #Training loop
301 | while not coord.should_stop() and step < args.wavenet_train_steps:
302 | start_time = time.time()
303 | step, loss, opt = sess.run([global_step, model.loss, model.optimize])
304 | time_window.append(time.time() - start_time)
305 | loss_window.append(loss)
306 |
307 | message = 'Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]'.format(
308 | step, time_window.average, loss, loss_window.average)
309 | log(message, end='\r', slack=(step % args.checkpoint_interval == 0))
310 |
311 | if np.isnan(loss) or loss > 100:
312 | log('Loss exploded to {:.5f} at step {}'.format(loss, step))
313 | raise Exception('Loss exploded')
314 |
315 | if step % args.summary_interval == 0:
316 | log('\nWriting summary at step {}'.format(step))
317 | summary_writer.add_summary(sess.run(stats), step)
318 |
319 | if step % args.checkpoint_interval == 0 or step == args.wavenet_train_steps:
320 | save_log(sess, step, model, plot_dir, wav_dir, hparams=hparams, model_name=args.model)
321 | save_checkpoint(sess, sh_saver, checkpoint_path, global_step)
322 |
323 | if step % args.eval_interval == 0:
324 | log('\nEvaluating at step {}'.format(step))
325 | 					eval_step(sess, step, eval_model, eval_plot_dir, eval_wav_dir, summary_writer=summary_writer, hparams=model._hparams, model_name=args.model)
326 |
327 | if hparams.gin_channels > 0 and (step % args.embedding_interval == 0 or step == args.wavenet_train_steps or step == 1):
328 | #Get current checkpoint state
329 | checkpoint_state = tf.train.get_checkpoint_state(save_dir)
330 |
331 | #Update Projector
332 | log('\nSaving Model Speaker Embeddings visualization..')
333 | add_embedding_stats(summary_writer, [model.embedding_table.name], [speaker_embedding_meta], checkpoint_state.model_checkpoint_path)
334 | log('WaveNet Speaker embeddings have been updated on tensorboard!')
335 |
336 | log('Wavenet training complete after {} global steps'.format(args.wavenet_train_steps), slack=True)
337 | return save_dir
338 |
339 | except Exception as e:
340 | log('Exiting due to exception: {}'.format(e), slack=True)
341 | traceback.print_exc()
342 | coord.request_stop(e)
343 |
344 |
345 | def wavenet_train(args, log_dir, hparams, input_path):
346 | return train(log_dir, args, hparams, input_path)
347 |
--------------------------------------------------------------------------------
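create_shadow_saver maps each variable's exponential-moving-average name to the live variable, so that load_averaged_model restores the averaged weights at evaluation/synthesis time. A minimal TF 1.x sketch of that name mapping on a toy variable (not the WaveNet graph):

# Minimal TF 1.x sketch of the shadow-name mapping behind create_shadow_saver.
import tensorflow as tf

w = tf.Variable(1.0, name='w')
ema = tf.train.ExponentialMovingAverage(decay=0.9999)
update_op = ema.apply([w])              #creates the shadow variable holding the running average
shadow_dict = {ema.average_name(w): w}  #{'w/ExponentialMovingAverage': w}, same structure as create_shadow_saver
saver = tf.train.Saver(shadow_dict, max_to_keep=20)
#Restoring through this saver writes the averaged checkpoint values into the live variable,
#which is what load_averaged_model relies on before synthesis.
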
/wavenet_vocoder/util.py:
--------------------------------------------------------------------------------
1 | import matplotlib
2 | matplotlib.use('Agg')
3 | import matplotlib.pyplot as plt
4 |
5 | import librosa.display as dsp
6 | import numpy as np
7 | import tensorflow as tf
8 |
9 |
10 | def _assert_valid_input_type(s):
11 | assert s in ('mulaw-quantize', 'mulaw', 'raw'), 'Invalid input type: {}'.format(s)
12 |
13 | def is_mulaw_quantize(s):
14 | _assert_valid_input_type(s)
15 | return s == 'mulaw-quantize'
16 |
17 | def is_mulaw(s):
18 | _assert_valid_input_type(s)
19 | return s == 'mulaw'
20 |
21 | def is_raw(s):
22 | _assert_valid_input_type(s)
23 | return s == 'raw'
24 |
25 | def is_scalar_input(s):
26 | return is_raw(s) or is_mulaw(s)
27 |
28 |
29 | #From https://github.com/r9y9/nnmnkwii/blob/master/nnmnkwii/preprocessing/generic.py
30 | def mulaw(x, mu=256):
31 | """Mu-Law companding
32 | Method described in paper [1]_.
33 | .. math::
34 | f(x) = sign(x) ln (1 + mu |x|) / ln (1 + mu)
35 | Args:
36 | x (array-like): Input signal. Each value of input signal must be in
37 | range of [-1, 1].
38 | mu (number): Compression parameter ``μ``.
39 | Returns:
40 | array-like: Compressed signal ([-1, 1])
41 | See also:
42 | :func:`nnmnkwii.preprocessing.inv_mulaw`
43 | :func:`nnmnkwii.preprocessing.mulaw_quantize`
44 | :func:`nnmnkwii.preprocessing.inv_mulaw_quantize`
45 | .. [1] Brokish, Charles W., and Michele Lewis. "A-law and mu-law companding
46 | implementations using the tms320c54x." SPRA163 (1997).
47 | """
48 | mu = 255 #the mu argument is ignored; hard-coded to 2**8 - 1 so outputs pair with 256 quantization classes
49 | return _sign(x) * _log1p(mu * _abs(x)) / _log1p(mu)
50 |
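    | #Usage sketch (assumes a numpy/scalar input already scaled to [-1, 1]):
    | #  mulaw(0.0)  -> 0.0
    | #  mulaw(0.5)  -> ~0.8757 with the hard-coded mu=255 (ln(1 + 127.5) / ln(256))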
51 |
52 | def inv_mulaw(y, mu=256):
53 | """Inverse of mu-law companding (mu-law expansion)
54 | .. math::
55 | f^{-1}(y) = sign(y) (1 / mu) ((1 + mu)^{|y|} - 1)
56 | Args:
57 | y (array-like): Compressed signal. Each value of input signal must be in
58 | range of [-1, 1].
59 | mu (number): Compression parameter ``μ``.
60 | Returns:
61 | array-like: Uncompressed signal ([-1, 1])
62 | See also:
63 | :func:`nnmnkwii.preprocessing.mulaw`
64 | :func:`nnmnkwii.preprocessing.mulaw_quantize`
65 | :func:`nnmnkwii.preprocessing.inv_mulaw_quantize`
66 | """
67 | mu = 255
68 | return _sign(y) * (1.0 / mu) * ((1.0 + mu)**_abs(y) - 1.0)
69 |
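    | #Usage sketch: inv_mulaw undoes mulaw for inputs in [-1, 1],
    | #e.g. inv_mulaw(mulaw(0.5)) -> ~0.5 (up to floating point rounding).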
70 |
71 | def mulaw_quantize(x, mu=256):
72 | """Mu-Law companding + quantize
73 | Args:
74 | x (array-like): Input signal. Each value of input signal must be in
75 | range of [-1, 1].
76 | mu (number): Compression parameter ``μ``.
77 | Returns:
78 | array-like: Quantized signal (dtype=int)
79 | - y ∈ [0, mu] if x ∈ [-1, 1]
80 | - y ∈ [0, mu) if x ∈ [-1, 1)
81 | .. note::
82 | If you want to get quantized values of range [0, mu) (not [0, mu]),
83 | then you need to provide input signal of range [-1, 1).
84 | Examples:
85 | >>> from scipy.io import wavfile
86 | >>> import pysptk
87 | >>> import numpy as np
88 | >>> from nnmnkwii import preprocessing as P
89 | >>> fs, x = wavfile.read(pysptk.util.example_audio_file())
90 | >>> x = (x / 32768.0).astype(np.float32)
91 | >>> y = P.mulaw_quantize(x)
92 | >>> print(y.min(), y.max(), y.dtype)
93 | 15 246 int64
94 | See also:
95 | :func:`nnmnkwii.preprocessing.mulaw`
96 | :func:`nnmnkwii.preprocessing.inv_mulaw`
97 | :func:`nnmnkwii.preprocessing.inv_mulaw_quantize`
98 | """
99 | mu = 255
100 | y = mulaw(x, mu)
101 | # scale [-1, 1] to [0, mu]
102 | return _asint((y + 1) / 2 * mu)
103 |
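    | #Worked example with the hard-coded mu=255:
    | #  mulaw(0.5) ~ 0.8757, so (0.8757 + 1) / 2 * 255 ~ 239.2, truncated to class 239.
    | #  Silence maps to the middle of the range: mulaw_quantize(0.0) -> 127.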
104 |
105 | def inv_mulaw_quantize(y, mu=256):
106 | """Inverse of mu-law companding + quantize
107 | Args:
108 | y (array-like): Quantized signal (∈ [0, mu]).
109 | mu (number): Compression parameter ``μ``.
110 | Returns:
111 | array-like: Uncompressed signal ([-1, 1])
112 | Examples:
113 | >>> from scipy.io import wavfile
114 | >>> import pysptk
115 | >>> import numpy as np
116 | >>> from nnmnkwii import preprocessing as P
117 | >>> fs, x = wavfile.read(pysptk.util.example_audio_file())
118 | >>> x = (x / 32768.0).astype(np.float32)
119 | >>> x_hat = P.inv_mulaw_quantize(P.mulaw_quantize(x))
120 | >>> x_hat = (x_hat * 32768).astype(np.int16)
121 | See also:
122 | :func:`nnmnkwii.preprocessing.mulaw`
123 | :func:`nnmnkwii.preprocessing.inv_mulaw`
124 | :func:`nnmnkwii.preprocessing.mulaw_quantize`
125 | """
126 | # scale [0, mu] back to [-1, 1]
127 | mu = 255
128 | y = 2 * _asfloat(y) / mu - 1
129 | return inv_mulaw(y, mu)
130 |
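    | #Round-trip sketch: mulaw_quantize(0.5) -> 239 and inv_mulaw_quantize(239) -> ~0.497,
    | #i.e. the original value up to the error introduced by 8-bit quantization.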
131 | def _sign(x):
132 | #wrapper to support tensorflow tensors/numpy arrays
133 | isnumpy = isinstance(x, np.ndarray)
134 | isscalar = np.isscalar(x)
135 | return np.sign(x) if (isnumpy or isscalar) else tf.sign(x)
136 |
137 |
138 | def _log1p(x):
139 | #wrapper to support tensorflow tensors/numpy arrays
140 | isnumpy = isinstance(x, np.ndarray)
141 | isscalar = np.isscalar(x)
142 | return np.log1p(x) if (isnumpy or isscalar) else tf.log1p(x)
143 |
144 |
145 | def _abs(x):
146 | #wrapper to support tensorflow tensors/numpy arrays
147 | isnumpy = isinstance(x, np.ndarray)
148 | isscalar = np.isscalar(x)
149 | return np.abs(x) if (isnumpy or isscalar) else tf.abs(x)
150 |
151 |
152 | def _asint(x):
153 | #wrapper to support tensorflow tensors/numpy arrays
154 | isnumpy = isinstance(x, np.ndarray)
155 | isscalar = np.isscalar(x)
156 | return x.astype(np.int64) if isnumpy else int(x) if isscalar else tf.cast(x, tf.int32) #np.int is deprecated; use a fixed-width integer type
157 |
158 |
159 | def _asfloat(x):
160 | #wrapper to support tensorflow tensors/numpy arrays
161 | isnumpy = isinstance(x, np.ndarray)
162 | isscalar = np.isscalar(x)
163 | return x.astype(np.float32) if isnumpy else float(x) if isscalar else tf.cast(x, tf.float32)
164 |
165 | def sequence_mask(input_lengths, max_len=None, expand=True):
166 | if max_len is None:
167 | max_len = tf.reduce_max(input_lengths)
168 |
169 | if expand:
170 | return tf.expand_dims(tf.sequence_mask(input_lengths, max_len, dtype=tf.float32), axis=-1)
171 | return tf.sequence_mask(input_lengths, max_len, dtype=tf.float32)
172 |
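    | #Shape sketch: for input_lengths [2, 4] and max_len 4 the mask is
    | #[[1, 1, 0, 0], [1, 1, 1, 1]]; with expand=True a trailing axis is added so the
    | #result (shape [2, 4, 1] here) broadcasts against [batch, time, channels] tensors.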
173 |
174 | def waveplot(path, y_hat, y_target, hparams, title=None):
175 | sr = hparams.sample_rate
176 |
177 | fig = plt.figure(figsize=(12, 4))
178 | if y_target is not None:
179 | ax = plt.subplot(3, 1, 1)
180 | dsp.waveplot(y_target, sr=sr)
181 | ax.set_title('Target waveform')
182 | ax = plt.subplot(3, 1, 2)
183 | dsp.waveplot(y_hat, sr=sr)
184 | ax.set_title('Predicted waveform')
185 | else:
186 | ax = plt.subplot(2, 1, 1)
187 | dsp.waveplot(y_hat, sr=sr)
188 | ax.set_title('Generated waveform')
189 |
190 | if title is not None:
191 | # Set common labels
192 | fig.text(0.5, 0.18, title, horizontalalignment='center', fontsize=16)
193 |
194 | plt.tight_layout()
195 | plt.savefig(path, format="png")
196 | plt.close()
197 |
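    | #Usage sketch (hypothetical names; assumes 1-D float waveforms and an hparams
    | #object exposing sample_rate, as passed in by the training loop):
    | #  waveplot('step-1000-wave.png', y_hat=pred_audio, y_target=target_audio,
    | #           hparams=hparams, title='step 1000')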
198 | def plot_spectrogram(pred_spectrogram, path, title=None, split_title=False, target_spectrogram=None, max_len=None, auto_aspect=False):
199 | if max_len is not None:
200 | target_spectrogram = target_spectrogram[:max_len] if target_spectrogram is not None else None
201 | pred_spectrogram = pred_spectrogram[:max_len]
202 |
203 | if split_title:
204 | title = split_title_line(title)
205 |
206 | fig = plt.figure(figsize=(10, 8))
207 | # Set common labels
208 | fig.text(0.5, 0.18, title, horizontalalignment='center', fontsize=16)
209 |
210 | #target spectrogram subplot
211 | if target_spectrogram is not None:
212 | ax1 = fig.add_subplot(311)
213 | ax2 = fig.add_subplot(312)
214 |
215 | if auto_aspect:
216 | im = ax1.imshow(np.rot90(target_spectrogram), aspect='auto', interpolation='none')
217 | else:
218 | im = ax1.imshow(np.rot90(target_spectrogram), interpolation='none')
219 | ax1.set_title('Target Mel-Spectrogram')
220 | fig.colorbar(mappable=im, shrink=0.65, orientation='horizontal', ax=ax1)
221 | ax2.set_title('Predicted Mel-Spectrogram')
222 | else:
223 | ax2 = fig.add_subplot(211)
224 |
225 | if auto_aspect:
226 | im = ax2.imshow(np.rot90(pred_spectrogram), aspect='auto', interpolation='none')
227 | else:
228 | im = ax2.imshow(np.rot90(pred_spectrogram), interpolation='none')
229 | fig.colorbar(mappable=im, shrink=0.65, orientation='horizontal', ax=ax2)
230 |
231 | plt.tight_layout()
232 | plt.savefig(path, format='png')
233 | plt.close()
234 |
--------------------------------------------------------------------------------