├── DeepVoice3_multi_speaker_TTS_en_demo.ipynb
├── DeepVoice3_single_speaker_TTS_en_demo.ipynb
├── Neural_network_based_singing_voice_synthesis_demo_using_kiritan_singing_database_(Japanese).ipynb
├── README.md
└── Tacotron2_and_WaveNet_text_to_speech_demo.ipynb
/DeepVoice3_multi_speaker_TTS_en_demo.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "name": "DeepVoice3 multi-speaker TTS en demo.ipynb",
7 | "version": "0.3.2",
8 | "provenance": [],
9 | "private_outputs": true,
10 | "collapsed_sections": [],
11 | "toc_visible": true
12 | },
13 | "kernelspec": {
14 | "name": "python3",
15 | "display_name": "Python 3"
16 | },
17 | "accelerator": "GPU"
18 | },
19 | "cells": [
20 | {
21 | "cell_type": "markdown",
22 | "metadata": {
23 | "id": "view-in-github",
24 | "colab_type": "text"
25 | },
26 | "source": [
27 | "[View in Colaboratory](https://colab.research.google.com/github/r9y9/Colaboratory/blob/master/DeepVoice3_multi_speaker_TTS_en_demo.ipynb)"
28 | ]
29 | },
30 | {
31 | "metadata": {
32 | "id": "Za124iWvdMsZ",
33 | "colab_type": "text"
34 | },
35 | "cell_type": "markdown",
36 | "source": [
37 | "# DeepVoice3: Multi-speaker text-to-speech demo\n",
38 | "\n",
39 | "In this notebook, you can try DeepVoice3-based multi-speaker text-to-speech (en) using a model trained on the [VCTK dataset](http://homepages.inf.ed.ac.uk/jyamagis/page3/page58/page58.html). The notebook is supposed to be executed on [Google Colab](https://colab.research.google.com), so you don't have to set up your machine locally.\n",
40 | "\n",
41 | "**Estimated time to complete**: 5 minutes.\n",
42 | "\n",
43 | "- Code: https://github.com/r9y9/deepvoice3_pytorch\n",
44 | "- Audio samples: https://r9y9.github.io/deepvoice3_pytorch/"
45 | ]
46 | },
47 | {
48 | "metadata": {
49 | "id": "ml6wOhwqhGiI",
50 | "colab_type": "text"
51 | },
52 | "cell_type": "markdown",
53 | "source": [
54 | "## Setup"
55 | ]
56 | },
57 | {
58 | "metadata": {
59 | "id": "QjindPTItq75",
60 | "colab_type": "text"
61 | },
62 | "cell_type": "markdown",
63 | "source": [
64 | "### Install dependencies"
65 | ]
66 | },
67 | {
68 | "metadata": {
69 | "id": "kemMMs6pg9Rv",
70 | "colab_type": "code",
71 | "colab": {}
72 | },
73 | "cell_type": "code",
74 | "source": [
75 | "import os\n",
76 | "from os.path import exists, join, expanduser\n",
77 | "\n",
78 | "# Clone\n",
79 | "name = \"deepvoice3_pytorch\"\n",
80 | "if not exists(name):\n",
81 | " ! git clone https://github.com/r9y9/$name"
82 | ],
83 | "execution_count": 0,
84 | "outputs": []
85 | },
86 | {
87 | "metadata": {
88 | "id": "ntBxf7b6DCqT",
89 | "colab_type": "code",
90 | "colab": {}
91 | },
92 | "cell_type": "code",
93 | "source": [
94 | "# Change working directory to the project dir \n",
95 | "os.chdir(join(expanduser(\"~\"), name))\n",
96 | "\n",
97 | "# Use pytorch v0.3.1\n",
98 | "!pip install -q torch==0.3.1"
99 | ],
100 | "execution_count": 0,
101 | "outputs": []
102 | },
103 | {
104 | "metadata": {
105 | "id": "X6VFmDe-ideo",
106 | "colab_type": "code",
107 | "colab": {}
108 | },
109 | "cell_type": "code",
110 | "source": [
111 | "%pylab inline\n",
112 | "! pip install -q librosa nltk\n",
113 | "\n",
114 | "import torch\n",
115 | "import numpy as np\n",
116 | "import librosa\n",
117 | "import librosa.display\n",
118 | "import IPython\n",
119 | "from IPython.display import Audio\n",
120 | "# need this for English text processing frontend\n",
121 | "import nltk\n",
122 | "! python -m nltk.downloader cmudict"
123 | ],
124 | "execution_count": 0,
125 | "outputs": []
126 | },
127 | {
128 | "metadata": {
129 | "id": "_l1Gd2SStt0E",
130 | "colab_type": "text"
131 | },
132 | "cell_type": "markdown",
133 | "source": [
134 | "### Download a pre-trained model"
135 | ]
136 | },
137 | {
138 | "metadata": {
139 | "id": "42Zwjr4UjNn_",
140 | "colab_type": "code",
141 | "colab": {}
142 | },
143 | "cell_type": "code",
144 | "source": [
145 | "checkpoint_path = \"20171222_deepvoice3_vctk108_checkpoint_step000300000.pth\""
146 | ],
147 | "execution_count": 0,
148 | "outputs": []
149 | },
150 | {
151 | "metadata": {
152 | "id": "45Wrp8INj6Xu",
153 | "colab_type": "code",
154 | "colab": {}
155 | },
156 | "cell_type": "code",
157 | "source": [
158 | "if not exists(checkpoint_path):\n",
159 | " !curl -O -L \"https://www.dropbox.com/s/uzmtzgcedyu531k/20171222_deepvoice3_vctk108_checkpoint_step000300000.pth\""
160 | ],
161 | "execution_count": 0,
162 | "outputs": []
163 | },
164 | {
165 | "metadata": {
166 | "id": "pbN0Kuo43G4U",
167 | "colab_type": "text"
168 | },
169 | "cell_type": "markdown",
170 | "source": [
171 | "### git checkout to the working commit"
172 | ]
173 | },
174 | {
175 | "metadata": {
176 | "id": "Pqu6dICruu56",
177 | "colab_type": "code",
178 | "colab": {}
179 | },
180 | "cell_type": "code",
181 | "source": [
182 | "# Copy preset file (json) from master\n",
183 | "# The preset file describes hyper parameters\n",
184 | "! git checkout master --quiet\n",
185 | "preset = \"./presets/deepvoice3_vctk.json\"\n",
186 | "! cp -v $preset .\n",
187 | "preset = \"./deepvoice3_vctk.json\"\n",
188 | "\n",
189 | "# And then git checkout to the working commit\n",
190 | "# This is because the model was trained a few months ago and is not compatible\n",
191 | "# with the current master. \n",
192 | "! git checkout 0421749 --quiet\n",
193 | "! pip install -q -e '.[train]'"
194 | ],
195 | "execution_count": 0,
196 | "outputs": []
197 | },
198 | {
199 | "metadata": {
200 | "id": "_yJ90ESZiT_S",
201 | "colab_type": "text"
202 | },
203 | "cell_type": "markdown",
204 | "source": [
205 | "## Synthesis"
206 | ]
207 | },
208 | {
209 | "metadata": {
210 | "id": "FUyhiJg03dj6",
211 | "colab_type": "text"
212 | },
213 | "cell_type": "markdown",
214 | "source": [
215 | "### Set up hyperparameters"
216 | ]
217 | },
218 | {
219 | "metadata": {
220 | "id": "E9sLuYgcnbZb",
221 | "colab_type": "code",
222 | "colab": {}
223 | },
224 | "cell_type": "code",
225 | "source": [
226 | "import hparams\n",
227 | "import json\n",
228 | "\n",
229 | "# Newly added params. Need to inject dummy values\n",
230 | "for dummy, v in [(\"fmin\", 0), (\"fmax\", 0), (\"rescaling\", False),\n",
231 | " (\"rescaling_max\", 0.999), \n",
232 | " (\"allow_clipping_in_normalization\", False)]:\n",
233 | " if hparams.hparams.get(dummy) is None:\n",
234 | " hparams.hparams.add_hparam(dummy, v)\n",
235 | " \n",
236 | "# Load parameters from preset\n",
237 | "with open(preset) as f:\n",
238 | " hparams.hparams.parse_json(f.read())\n",
239 | "\n",
240 | "# Indicate that we are using the multi-speaker DeepVoice3 builder\n",
241 | "hparams.hparams.builder = \"deepvoice3_multispeaker\"\n",
242 | " \n",
243 | "# Inject frontend text processor\n",
244 | "import synthesis\n",
245 | "import train\n",
246 | "from deepvoice3_pytorch import frontend\n",
247 | "synthesis._frontend = getattr(frontend, \"en\")\n",
248 | "train._frontend = getattr(frontend, \"en\")\n",
249 | "\n",
250 | "# aliases\n",
251 | "fs = hparams.hparams.sample_rate\n",
252 | "hop_length = hparams.hparams.hop_size"
253 | ],
254 | "execution_count": 0,
255 | "outputs": []
256 | },
257 | {
258 | "metadata": {
259 | "id": "C4NOldY83wG1",
260 | "colab_type": "text"
261 | },
262 | "cell_type": "markdown",
263 | "source": [
264 | "### Define utility functions"
265 | ]
266 | },
267 | {
268 | "metadata": {
269 | "id": "xRbelGLjiSfA",
270 | "colab_type": "code",
271 | "colab": {}
272 | },
273 | "cell_type": "code",
274 | "source": [
275 | "def tts(model, text, p=0, speaker_id=0, fast=True, figures=True):\n",
276 | " from synthesis import tts as _tts\n",
277 | " waveform, alignment, spectrogram, mel = _tts(model, text, p, speaker_id, fast)\n",
278 | " if figures:\n",
279 | " visualize(alignment, spectrogram)\n",
280 | " IPython.display.display(Audio(waveform, rate=fs))\n",
281 | " \n",
282 | "def visualize(alignment, spectrogram):\n",
283 | " label_fontsize = 16\n",
284 | " figure(figsize=(16,16))\n",
285 | "\n",
286 | " subplot(2,1,1)\n",
287 | " imshow(alignment.T, aspect=\"auto\", origin=\"lower\", interpolation=None)\n",
288 | " xlabel(\"Decoder timestamp\", fontsize=label_fontsize)\n",
289 | " ylabel(\"Encoder timestamp\", fontsize=label_fontsize)\n",
290 | " colorbar()\n",
291 | "\n",
292 | " subplot(2,1,2)\n",
293 | " librosa.display.specshow(spectrogram.T, sr=fs, \n",
294 | " hop_length=hop_length, x_axis=\"time\", y_axis=\"linear\")\n",
295 | " xlabel(\"Time\", fontsize=label_fontsize)\n",
296 | " ylabel(\"Hz\", fontsize=label_fontsize)\n",
297 | " tight_layout()\n",
298 | " colorbar()"
299 | ],
300 | "execution_count": 0,
301 | "outputs": []
302 | },
303 | {
304 | "metadata": {
305 | "id": "m2jmbSD430Ws",
306 | "colab_type": "text"
307 | },
308 | "cell_type": "markdown",
309 | "source": [
310 | "### Load the model checkpoint"
311 | ]
312 | },
313 | {
314 | "metadata": {
315 | "id": "lr8pgqtYhvav",
316 | "colab_type": "code",
317 | "colab": {}
318 | },
319 | "cell_type": "code",
320 | "source": [
321 | "from train import build_model\n",
322 | "from train import restore_parts, load_checkpoint\n",
323 | "\n",
324 | "model = build_model()\n",
325 | "model = load_checkpoint(checkpoint_path, model, None, True)"
326 | ],
327 | "execution_count": 0,
328 | "outputs": []
329 | },
330 | {
331 | "metadata": {
332 | "id": "DOJ3miW63ywA",
333 | "colab_type": "text"
334 | },
335 | "cell_type": "markdown",
336 | "source": [
337 | "### Generate speech"
338 | ]
339 | },
340 | {
341 | "metadata": {
342 | "id": "GR1XRy-ykbz_",
343 | "colab_type": "code",
344 | "colab": {}
345 | },
346 | "cell_type": "code",
347 | "source": [
348 | "# Try your favorite sentences :)\n",
349 | "text = \"Some have accepted this as a miracle without any physical explanation\"\n",
350 | "N = 15\n",
351 | "print(\"Synthesizing \\\"{}\\\" with {} different speakers\".format(text, N))\n",
352 | "for speaker_id in range(N):\n",
353 | " print(speaker_id)\n",
354 | " tts(model, text, speaker_id=speaker_id, figures=False)"
355 | ],
356 | "execution_count": 0,
357 | "outputs": []
358 | },
359 | {
360 | "metadata": {
361 | "id": "nirMEf2J5Roy",
362 | "colab_type": "code",
363 | "colab": {}
364 | },
365 | "cell_type": "code",
366 | "source": [
367 | "# With attention plot\n",
368 | "tts(model, text, speaker_id=0, figures=True)"
369 | ],
370 | "execution_count": 0,
371 | "outputs": []
372 | },
373 | {
374 | "metadata": {
375 | "id": "ArQspYbs5Aoo",
376 | "colab_type": "text"
377 | },
378 | "cell_type": "markdown",
379 | "source": [
380 | "For details, please visit https://github.com/r9y9/deepvoice3_pytorch"
381 | ]
382 | }
383 | ]
384 | }
--------------------------------------------------------------------------------
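A small optional extension to the multi-speaker demo above: the sketch below saves one WAV file per speaker instead of only playing audio inline. It is not part of the original notebook; it assumes the `synthesis.tts` signature used there (returning `(waveform, alignment, spectrogram, mel)`), that `model` and `fs` are defined by the cells above, and that the waveform is a float NumPy array.

```python
# Hypothetical helper (not in the original notebook): save one WAV per speaker.
# Assumes the notebook cells above have been run, so `model` and `fs` exist and
# synthesis.tts returns (waveform, alignment, spectrogram, mel).
import os
import numpy as np
from scipy.io import wavfile
from synthesis import tts as _tts

def save_speaker_samples(model, text, speaker_ids, out_dir="speaker_samples"):
    os.makedirs(out_dir, exist_ok=True)
    for speaker_id in speaker_ids:
        waveform, alignment, spectrogram, mel = _tts(model, text, 0, speaker_id, True)
        path = os.path.join(out_dir, "speaker_{:03d}.wav".format(speaker_id))
        wavfile.write(path, fs, waveform.astype(np.float32))  # 32-bit float WAV
        print("Wrote", path)

# Example usage, assuming `text` from the synthesis cell above:
# save_speaker_samples(model, text, range(5))
```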
/DeepVoice3_single_speaker_TTS_en_demo.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "name": "DeepVoice3 single-speaker TTS en demo.ipynb",
7 | "version": "0.3.2",
8 | "provenance": [],
9 | "private_outputs": true,
10 | "collapsed_sections": [],
11 | "toc_visible": true
12 | },
13 | "kernelspec": {
14 | "name": "python3",
15 | "display_name": "Python 3"
16 | },
17 | "accelerator": "GPU"
18 | },
19 | "cells": [
20 | {
21 | "cell_type": "markdown",
22 | "metadata": {
23 | "id": "view-in-github",
24 | "colab_type": "text"
25 | },
26 | "source": [
27 | "[View in Colaboratory](https://colab.research.google.com/github/r9y9/Colaboratory/blob/master/DeepVoice3_single_speaker_TTS_en_demo.ipynb)"
28 | ]
29 | },
30 | {
31 | "metadata": {
32 | "id": "Za124iWvdMsZ",
33 | "colab_type": "text"
34 | },
35 | "cell_type": "markdown",
36 | "source": [
37 | "# DeepVoice3: Single-speaker text-to-speech demo\n",
38 | "\n",
39 | "In this notebook, you can try DeepVoice3-based single-speaker text-to-speech (en) using a model trained on the [LJSpeech dataset](https://keithito.com/LJ-Speech-Dataset/). The notebook is supposed to be executed on [Google Colab](https://colab.research.google.com), so you don't have to set up your machine locally.\n",
40 | "\n",
41 | "**Estimated time to complete**: 5 minutes.\n",
42 | "\n",
43 | "- Code: https://github.com/r9y9/deepvoice3_pytorch\n",
44 | "- Audio samples: https://r9y9.github.io/deepvoice3_pytorch/"
45 | ]
46 | },
47 | {
48 | "metadata": {
49 | "id": "ml6wOhwqhGiI",
50 | "colab_type": "text"
51 | },
52 | "cell_type": "markdown",
53 | "source": [
54 | "## Setup"
55 | ]
56 | },
57 | {
58 | "metadata": {
59 | "id": "QjindPTItq75",
60 | "colab_type": "text"
61 | },
62 | "cell_type": "markdown",
63 | "source": [
64 | "### Install dependencies"
65 | ]
66 | },
67 | {
68 | "metadata": {
69 | "id": "kemMMs6pg9Rv",
70 | "colab_type": "code",
71 | "colab": {}
72 | },
73 | "cell_type": "code",
74 | "source": [
75 | "import os\n",
76 | "from os.path import exists, join, expanduser\n",
77 | "\n",
78 | "# Clone\n",
79 | "name = \"deepvoice3_pytorch\"\n",
80 | "if not exists(name):\n",
81 | " ! git clone https://github.com/r9y9/$name"
82 | ],
83 | "execution_count": 0,
84 | "outputs": []
85 | },
86 | {
87 | "metadata": {
88 | "id": "ntBxf7b6DCqT",
89 | "colab_type": "code",
90 | "colab": {}
91 | },
92 | "cell_type": "code",
93 | "source": [
94 | "# Change working directory to the project dir \n",
95 | "os.chdir(join(expanduser(\"~\"), name))\n",
96 | "\n",
97 | "!git checkout 7a10ac6763eda92595e257543494b6a95f64229b --quiet\n",
98 | "\n",
99 | "# Install dependencies\n",
100 | "!pip install -q -e '.[bin]'"
101 | ],
102 | "execution_count": 0,
103 | "outputs": []
104 | },
105 | {
106 | "metadata": {
107 | "id": "X6VFmDe-ideo",
108 | "colab_type": "code",
109 | "colab": {}
110 | },
111 | "cell_type": "code",
112 | "source": [
113 | "%pylab inline\n",
114 | "! pip install -q librosa nltk\n",
115 | "\n",
116 | "import torch\n",
117 | "import numpy as np\n",
118 | "import librosa\n",
119 | "import librosa.display\n",
120 | "import IPython\n",
121 | "from IPython.display import Audio\n",
122 | "# need this for English text processing frontend\n",
123 | "import nltk\n",
124 | "! python -m nltk.downloader cmudict"
125 | ],
126 | "execution_count": 0,
127 | "outputs": []
128 | },
129 | {
130 | "metadata": {
131 | "id": "_l1Gd2SStt0E",
132 | "colab_type": "text"
133 | },
134 | "cell_type": "markdown",
135 | "source": [
136 | "### Download a pre-trained model"
137 | ]
138 | },
139 | {
140 | "metadata": {
141 | "id": "42Zwjr4UjNn_",
142 | "colab_type": "code",
143 | "colab": {}
144 | },
145 | "cell_type": "code",
146 | "source": [
147 | "preset = \"20180505_deepvoice3_ljspeech.json\"\n",
148 | "checkpoint_path = \"20180505_deepvoice3_checkpoint_step000640000.pth\""
149 | ],
150 | "execution_count": 0,
151 | "outputs": []
152 | },
153 | {
154 | "metadata": {
155 | "id": "45Wrp8INj6Xu",
156 | "colab_type": "code",
157 | "colab": {}
158 | },
159 | "cell_type": "code",
160 | "source": [
161 | "if not exists(preset):\n",
162 | " !curl -O -L \"https://www.dropbox.com/s/0ck82unm0bo0rxd/20180505_deepvoice3_ljspeech.json\"\n",
163 | "if not exists(checkpoint_path):\n",
164 | " !curl -O -L \"https://www.dropbox.com/s/5ucl9remrwy5oeg/20180505_deepvoice3_checkpoint_step000640000.pth\""
165 | ],
166 | "execution_count": 0,
167 | "outputs": []
168 | },
169 | {
170 | "metadata": {
171 | "id": "_yJ90ESZiT_S",
172 | "colab_type": "text"
173 | },
174 | "cell_type": "markdown",
175 | "source": [
176 | "## Synthesis"
177 | ]
178 | },
179 | {
180 | "metadata": {
181 | "id": "FUyhiJg03dj6",
182 | "colab_type": "text"
183 | },
184 | "cell_type": "markdown",
185 | "source": [
186 | "### Set up hyperparameters"
187 | ]
188 | },
189 | {
190 | "metadata": {
191 | "id": "E9sLuYgcnbZb",
192 | "colab_type": "code",
193 | "colab": {}
194 | },
195 | "cell_type": "code",
196 | "source": [
197 | "import hparams\n",
198 | "import json\n",
199 | "\n",
200 | " \n",
201 | "# Load parameters from preset\n",
202 | "with open(preset) as f:\n",
203 | " hparams.hparams.parse_json(f.read())\n",
204 | " \n",
205 | "# Inject frontend text processor\n",
206 | "import synthesis\n",
207 | "import train\n",
208 | "from deepvoice3_pytorch import frontend\n",
209 | "synthesis._frontend = getattr(frontend, \"en\")\n",
210 | "train._frontend = getattr(frontend, \"en\")\n",
211 | "\n",
212 | "# aliases\n",
213 | "fs = hparams.hparams.sample_rate\n",
214 | "hop_length = hparams.hparams.hop_size"
215 | ],
216 | "execution_count": 0,
217 | "outputs": []
218 | },
219 | {
220 | "metadata": {
221 | "id": "C4NOldY83wG1",
222 | "colab_type": "text"
223 | },
224 | "cell_type": "markdown",
225 | "source": [
226 | "### Define utility functions"
227 | ]
228 | },
229 | {
230 | "metadata": {
231 | "id": "xRbelGLjiSfA",
232 | "colab_type": "code",
233 | "colab": {}
234 | },
235 | "cell_type": "code",
236 | "source": [
237 | "def tts(model, text, p=0, speaker_id=None, fast=True, figures=True):\n",
238 | " from synthesis import tts as _tts\n",
239 | " waveform, alignment, spectrogram, mel = _tts(model, text, p, speaker_id, fast)\n",
240 | " if figures:\n",
241 | " visualize(alignment, spectrogram)\n",
242 | " IPython.display.display(Audio(waveform, rate=fs))\n",
243 | " \n",
244 | "def visualize(alignment, spectrogram):\n",
245 | " label_fontsize = 16\n",
246 | " figure(figsize=(16,16))\n",
247 | "\n",
248 | " subplot(2,1,1)\n",
249 | " imshow(alignment.T, aspect=\"auto\", origin=\"lower\", interpolation=None)\n",
250 | " xlabel(\"Decoder timestamp\", fontsize=label_fontsize)\n",
251 | " ylabel(\"Encoder timestamp\", fontsize=label_fontsize)\n",
252 | " colorbar()\n",
253 | "\n",
254 | " subplot(2,1,2)\n",
255 | " librosa.display.specshow(spectrogram.T, sr=fs, \n",
256 | " hop_length=hop_length, x_axis=\"time\", y_axis=\"linear\")\n",
257 | " xlabel(\"Time\", fontsize=label_fontsize)\n",
258 | " ylabel(\"Hz\", fontsize=label_fontsize)\n",
259 | " tight_layout()\n",
260 | " colorbar()"
261 | ],
262 | "execution_count": 0,
263 | "outputs": []
264 | },
265 | {
266 | "metadata": {
267 | "id": "m2jmbSD430Ws",
268 | "colab_type": "text"
269 | },
270 | "cell_type": "markdown",
271 | "source": [
272 | "### Load the model checkpoint"
273 | ]
274 | },
275 | {
276 | "metadata": {
277 | "id": "lr8pgqtYhvav",
278 | "colab_type": "code",
279 | "colab": {}
280 | },
281 | "cell_type": "code",
282 | "source": [
283 | "from train import build_model\n",
284 | "from train import restore_parts, load_checkpoint\n",
285 | "\n",
286 | "model = build_model()\n",
287 | "model = load_checkpoint(checkpoint_path, model, None, True)"
288 | ],
289 | "execution_count": 0,
290 | "outputs": []
291 | },
292 | {
293 | "metadata": {
294 | "id": "DOJ3miW63ywA",
295 | "colab_type": "text"
296 | },
297 | "cell_type": "markdown",
298 | "source": [
299 | "### Generate speech"
300 | ]
301 | },
302 | {
303 | "metadata": {
304 | "id": "GR1XRy-ykbz_",
305 | "colab_type": "code",
306 | "colab": {}
307 | },
308 | "cell_type": "code",
309 | "source": [
310 | "# Try your favorite sentences :)\n",
311 | "texts = [\n",
312 | " \"Scientists at the CERN laboratory say they have discovered a new particle.\",\n",
313 | " \"There's a way to measure the acute emotional intelligence that has never gone out of style.\",\n",
314 | " \"President Trump met with other leaders at the Group of 20 conference.\",\n",
315 | " \"The Senate's bill to repeal and replace the Affordable Care Act is now imperiled.\",\n",
316 | " \"Generative adversarial network or variational auto-encoder.\",\n",
317 | " \"The buses aren't the problem, they actually provide a solution.\",\n",
318 | " \"peter piper picked a peck of pickled peppers how many peppers did peter piper pick.\",\n",
319 | " \"Some have accepted this as a miracle without any physical explanation.\",\n",
320 | "]\n",
321 | "\n",
322 | "for idx, text in enumerate(texts):\n",
323 | " print(idx, text)\n",
324 | " tts(model, text, figures=False)"
325 | ],
326 | "execution_count": 0,
327 | "outputs": []
328 | },
329 | {
330 | "metadata": {
331 | "id": "nirMEf2J5Roy",
332 | "colab_type": "code",
333 | "colab": {}
334 | },
335 | "cell_type": "code",
336 | "source": [
337 | "# With attention plot\n",
338 | "text = \"Generative adversarial network or variational auto-encoder.\"\n",
339 | "tts(model, text, figures=True)"
340 | ],
341 | "execution_count": 0,
342 | "outputs": []
343 | },
344 | {
345 | "metadata": {
346 | "id": "ArQspYbs5Aoo",
347 | "colab_type": "text"
348 | },
349 | "cell_type": "markdown",
350 | "source": [
351 | "For details, please visit https://github.com/r9y9/deepvoice3_pytorch"
352 | ]
353 | }
354 | ]
355 | }
--------------------------------------------------------------------------------
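As with the multi-speaker demo, a small optional extension: a variant of the `tts()` helper that trims leading and trailing silence with `librosa.effects.trim` before playback and returns the trimmed waveform. This is my addition, not part of the original notebook; it assumes `model` and `fs` are defined by the cells above and that librosa is installed as in the setup cell.

```python
# Hypothetical variant of the notebook's tts() helper (not in the original):
# trims leading/trailing silence before playback and returns the waveform.
import librosa
import IPython
from IPython.display import Audio
from synthesis import tts as _tts

def tts_trimmed(model, text, p=0, speaker_id=None, fast=True, top_db=40):
    waveform, alignment, spectrogram, mel = _tts(model, text, p, speaker_id, fast)
    trimmed, _ = librosa.effects.trim(waveform, top_db=top_db)  # drop near-silent edges
    IPython.display.display(Audio(trimmed, rate=fs))
    return trimmed

# Example:
# wav = tts_trimmed(model, "Some have accepted this as a miracle.")
```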
/Neural_network_based_singing_voice_synthesis_demo_using_kiritan_singing_database_(Japanese).ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "name": "Neural network-based singing voice synthesis demo using kiritan_singing database (Japanese).ipynb",
7 | "provenance": [],
8 | "private_outputs": true,
9 | "collapsed_sections": [
10 | "VWhj3SHGRShX"
11 | ],
12 | "toc_visible": true,
13 | "authorship_tag": "ABX9TyOi40QwOl8FHzLkwHYwmsUP",
14 | "include_colab_link": true
15 | },
16 | "kernelspec": {
17 | "name": "python3",
18 | "display_name": "Python 3"
19 | },
20 | "accelerator": "GPU"
21 | },
22 | "cells": [
23 | {
24 | "cell_type": "markdown",
25 | "metadata": {
26 | "id": "view-in-github",
27 | "colab_type": "text"
28 | },
29 | "source": [
30 | "[View in Colaboratory](https://colab.research.google.com/github/r9y9/Colaboratory/blob/master/Neural_network_based_singing_voice_synthesis_demo_using_kiritan_singing_database_(Japanese).ipynb)"
31 | ]
32 | },
33 | {
34 | "cell_type": "markdown",
35 | "metadata": {
36 | "id": "MQjZmX2nQLfR",
37 | "colab_type": "text"
38 | },
39 | "source": [
40 | "# Neural network-based singing voice synthesis demo using kiritan_singing database (Japanese)\n",
41 | "\n",
42 | "This is a demo of a singing voice synthesis system trained on the [kiritan_singing database (*Japanese*)](https://zunko.jp/kiridev/login.php). Given a musicxml file, the system generates a waveform.\n",
43 | "\n",
44 | "All the models were trained using https://github.com/r9y9/nnsvs/. Recipes to reproduce experiments are included in the repository: https://github.com/r9y9/nnsvs/tree/master/egs/kiritan_singing.\n",
45 | "\n",
46 | "Estimated time to run: 5 minutes.\n",
47 | "\n",
48 | "\n",
49 | "## Notice\n",
50 | "\n",
51 | "This is an alpha version of the demo, and the singing voice quality is not very high yet (this is expected). Major updates and improvements are coming soon. More details on this project can be found at https://github.com/r9y9/nnsvs/issues/1."
52 | ]
53 | },
54 | {
55 | "cell_type": "markdown",
56 | "metadata": {
57 | "id": "Oi7WL3T1-H9w",
58 | "colab_type": "text"
59 | },
60 | "source": [
61 | "## Download musicxml files"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "metadata": {
67 | "id": "t4UVgGL4-aOv",
68 | "colab_type": "code",
69 | "colab": {}
70 | },
71 | "source": [
72 | "! git clone -q https://github.com/r9y9/kiritan_singing"
73 | ],
74 | "execution_count": 0,
75 | "outputs": []
76 | },
77 | {
78 | "cell_type": "markdown",
79 | "metadata": {
80 | "id": "VWhj3SHGRShX",
81 | "colab_type": "text"
82 | },
83 | "source": [
84 | "## Install requirements\n",
85 | "\n",
86 | "At the moment, nnsvs depends on sinsy (a C++ library) for the musicxml-to-context-feature conversion. Installing the binary dependencies is a bit involved, but the complete setup follows."
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "metadata": {
92 | "id": "BzyX0v8HRrCS",
93 | "colab_type": "code",
94 | "colab": {}
95 | },
96 | "source": [
97 | "! pip install -q -U numpy cython\n",
98 | "! rm -rf hts_engine_API sinsy pysinsy nnmnkwii nnsvs"
99 | ],
100 | "execution_count": 0,
101 | "outputs": []
102 | },
103 | {
104 | "cell_type": "code",
105 | "metadata": {
106 | "id": "bh5_HHAdPyUA",
107 | "colab_type": "code",
108 | "colab": {}
109 | },
110 | "source": [
111 | "# Binary dependencies\n",
112 | "! git clone -q https://github.com/r9y9/hts_engine_API\n",
113 | "! cd hts_engine_API/src && ./waf configure --prefix=/usr/ && sudo ./waf build > /dev/null 2>&1 && ./waf install\n",
114 | "! git clone -q https://github.com/r9y9/sinsy\n",
115 | "! cd sinsy/src/ && mkdir -p build && cd build && cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=ON -DCMAKE_INSTALL_PREFIX=/usr/ .. && make -j > /dev/null 2>&1 && sudo make install\n"
116 | ],
117 | "execution_count": 0,
118 | "outputs": []
119 | },
120 | {
121 | "cell_type": "code",
122 | "metadata": {
123 | "id": "VACHstV2RjAm",
124 | "colab_type": "code",
125 | "colab": {}
126 | },
127 | "source": [
128 | "# Python dependencies\n",
129 | "! git clone -q https://github.com/r9y9/pysinsy\n",
130 | "! cd pysinsy && export SINSY_INSTALL_PREFIX=/usr/ && pip install -q .\n",
131 | "! git clone -q https://github.com/r9y9/nnmnkwii\n",
132 | "! cd nnmnkwii && pip install -q .\n",
133 | "! git clone -q https://github.com/r9y9/nnsvs \n",
134 | "! cd nnsvs && pip install -q ."
135 | ],
136 | "execution_count": 0,
137 | "outputs": []
138 | },
139 | {
140 | "cell_type": "markdown",
141 | "metadata": {
142 | "id": "3p5KDOHy8OnC",
143 | "colab_type": "text"
144 | },
145 | "source": [
146 | "## Python imports"
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "metadata": {
152 | "id": "FgpDInEI0tVS",
153 | "colab_type": "code",
154 | "colab": {}
155 | },
156 | "source": [
157 | "%pylab inline\n",
158 | "rcParams[\"figure.figsize\"] = (16,5)"
159 | ],
160 | "execution_count": 0,
161 | "outputs": []
162 | },
163 | {
164 | "cell_type": "code",
165 | "metadata": {
166 | "id": "qX8FC_XlSPlh",
167 | "colab_type": "code",
168 | "colab": {}
169 | },
170 | "source": [
171 | "import hydra\n",
172 | "from omegaconf import DictConfig, OmegaConf\n",
173 | "import numpy as np\n",
174 | "import joblib\n",
175 | "import torch\n",
176 | "from os.path import join, basename, exists\n",
177 | "import os\n",
178 | "import pysptk\n",
179 | "import pyworld\n",
180 | "import librosa\n",
181 | "import librosa.display\n",
182 | "import IPython\n",
183 | "from IPython.display import Audio\n",
184 | "from nnmnkwii.io import hts\n",
185 | "from nnmnkwii import paramgen\n",
186 | "from nnmnkwii.preprocessing.f0 import interp1d\n",
187 | "from nnmnkwii.frontend import merlin as fe\n",
188 | "\n",
189 | "from nnsvs.multistream import multi_stream_mlpg, split_streams\n",
190 | "from nnsvs.gen import (\n",
191 | " predict_timelag, predict_duration, predict_acoustic, postprocess_duration,\n",
192 | " gen_waveform, get_windows)\n",
193 | "from nnsvs.frontend.ja import xml2lab, _lazy_init\n",
194 | "from nnsvs.gen import _midi_to_hz\n",
195 | "\n",
196 | "_lazy_init(dic_dir=\"/usr/lib/sinsy/dic\")"
197 | ],
198 | "execution_count": 0,
199 | "outputs": []
200 | },
201 | {
202 | "cell_type": "code",
203 | "metadata": {
204 | "id": "03R6EYp4ULok",
205 | "colab_type": "code",
206 | "colab": {}
207 | },
208 | "source": [
209 | "sample_rate = 48000\n",
210 | "frame_period = 5\n",
211 | "fftlen = pyworld.get_cheaptrick_fft_size(sample_rate)\n",
212 | "alpha = pysptk.util.mcepalpha(sample_rate)\n",
213 | "hop_length = int(0.001 * frame_period * sample_rate)"
214 | ],
215 | "execution_count": 0,
216 | "outputs": []
217 | },
218 | {
219 | "cell_type": "markdown",
220 | "metadata": {
221 | "id": "LdCvZzUt0xPy",
222 | "colab_type": "text"
223 | },
224 | "source": [
225 | "## Setup models"
226 | ]
227 | },
228 | {
229 | "cell_type": "code",
230 | "metadata": {
231 | "id": "4lfMonh6Vi2z",
232 | "colab_type": "code",
233 | "colab": {}
234 | },
235 | "source": [
236 | "! curl -q -LO https://www.dropbox.com/s/pctlausq00eecqp/20200502_kiritan_singing-00-svs-world.zip\n",
237 | "! unzip -qq -o 20200502_kiritan_singing-00-svs-world.zip\n",
238 | "\n",
239 | "model_dir = \"./20200502_kiritan_singing-00-svs-world\""
240 | ],
241 | "execution_count": 0,
242 | "outputs": []
243 | },
244 | {
245 | "cell_type": "code",
246 | "metadata": {
247 | "id": "jTnJXxz4VjZu",
248 | "colab_type": "code",
249 | "colab": {}
250 | },
251 | "source": [
252 | "use_cuda = True\n",
253 | "if use_cuda and torch.cuda.is_available():\n",
254 | " device = torch.device(\"cuda\")\n",
255 | "else:\n",
256 | " device = torch.device(\"cpu\")"
257 | ],
258 | "execution_count": 0,
259 | "outputs": []
260 | },
261 | {
262 | "cell_type": "markdown",
263 | "metadata": {
264 | "id": "ZSluQysG06pZ",
265 | "colab_type": "text"
266 | },
267 | "source": [
268 | "### Time-lag model"
269 | ]
270 | },
271 | {
272 | "cell_type": "code",
273 | "metadata": {
274 | "id": "RvS-2DXxZ5SI",
275 | "colab_type": "code",
276 | "colab": {}
277 | },
278 | "source": [
279 | "timelag_config = OmegaConf.load(join(model_dir, \"timelag\", \"model.yaml\"))\n",
280 | "timelag_model = hydra.utils.instantiate(timelag_config.netG).to(device)\n",
281 | "checkpoint = torch.load(join(model_dir, \"timelag\", \"latest.pth\"), map_location=lambda storage, loc: storage)\n",
282 | "timelag_model.load_state_dict(checkpoint[\"state_dict\"])\n",
283 | "timelag_in_scaler = joblib.load(join(model_dir, \"in_timelag_scaler.joblib\"))\n",
284 | "timelag_out_scaler = joblib.load(join(model_dir, \"out_timelag_scaler.joblib\"))\n",
285 | "timelag_model.eval();"
286 | ],
287 | "execution_count": 0,
288 | "outputs": []
289 | },
290 | {
291 | "cell_type": "markdown",
292 | "metadata": {
293 | "id": "3VoPGoAR3kpV",
294 | "colab_type": "text"
295 | },
296 | "source": [
297 | "### Duration model"
298 | ]
299 | },
300 | {
301 | "cell_type": "code",
302 | "metadata": {
303 | "id": "Pl8Q5DgP1eIg",
304 | "colab_type": "code",
305 | "colab": {}
306 | },
307 | "source": [
308 | "duration_config = OmegaConf.load(join(model_dir, \"duration\", \"model.yaml\"))\n",
309 | "duration_model = hydra.utils.instantiate(duration_config.netG).to(device)\n",
310 | "checkpoint = torch.load(join(model_dir, \"duration\", \"latest.pth\"), map_location=lambda storage, loc: storage)\n",
311 | "duration_model.load_state_dict(checkpoint[\"state_dict\"])\n",
312 | "duration_in_scaler = joblib.load(join(model_dir, \"in_duration_scaler.joblib\"))\n",
313 | "duration_out_scaler = joblib.load(join(model_dir, \"out_duration_scaler.joblib\"))\n",
314 | "duration_model.eval();"
315 | ],
316 | "execution_count": 0,
317 | "outputs": []
318 | },
319 | {
320 | "cell_type": "markdown",
321 | "metadata": {
322 | "id": "O2flghYb3q-o",
323 | "colab_type": "text"
324 | },
325 | "source": [
326 | "### Acoustic model"
327 | ]
328 | },
329 | {
330 | "cell_type": "code",
331 | "metadata": {
332 | "id": "bwmBKAE83pYG",
333 | "colab_type": "code",
334 | "colab": {}
335 | },
336 | "source": [
337 | "acoustic_config = OmegaConf.load(join(model_dir, \"acoustic\", \"model.yaml\"))\n",
338 | "acoustic_model = hydra.utils.instantiate(acoustic_config.netG).to(device)\n",
339 | "checkpoint = torch.load(join(model_dir, \"acoustic\", \"latest.pth\"), map_location=lambda storage, loc: storage)\n",
340 | "acoustic_model.load_state_dict(checkpoint[\"state_dict\"])\n",
341 | "acoustic_in_scaler = joblib.load(join(model_dir, \"in_acoustic_scaler.joblib\"))\n",
342 | "acoustic_out_scaler = joblib.load(join(model_dir, \"out_acoustic_scaler.joblib\"))\n",
343 | "acoustic_model.eval();"
344 | ],
345 | "execution_count": 0,
346 | "outputs": []
347 | },
348 | {
349 | "cell_type": "markdown",
350 | "metadata": {
351 | "id": "omkQ2G4i4DZN",
352 | "colab_type": "text"
353 | },
354 | "source": [
355 | "## Synthesis"
356 | ]
357 | },
358 | {
359 | "cell_type": "markdown",
360 | "metadata": {
361 | "id": "D7b9Yc05CQSu",
362 | "colab_type": "text"
363 | },
364 | "source": [
365 | "### Choose your favorite musicxml file here!"
366 | ]
367 | },
368 | {
369 | "cell_type": "code",
370 | "metadata": {
371 | "id": "_V1yQdEe4MTq",
372 | "colab_type": "code",
373 | "colab": {}
374 | },
375 | "source": [
376 | "# NOTE: 01.xml and 02.xml were not included in the training data\n",
377 | "# 03.xml - 37.xml were used for training.\n",
378 | "labels = xml2lab(\"kiritan_singing/musicxml/01.xml\").round_()"
379 | ],
380 | "execution_count": 0,
381 | "outputs": []
382 | },
383 | {
384 | "cell_type": "code",
385 | "metadata": {
386 | "id": "Q-HvucB14TFS",
387 | "colab_type": "code",
388 | "colab": {}
389 | },
390 | "source": [
391 | "question_path = join(model_dir, \"jp_qst001_nnsvs.hed\")\n",
392 | "binary_dict, continuous_dict = hts.load_question_set(question_path, append_hat_for_LL=False)"
393 | ],
394 | "execution_count": 0,
395 | "outputs": []
396 | },
397 | {
398 | "cell_type": "code",
399 | "metadata": {
400 | "id": "q8yAgiw36fCZ",
401 | "colab_type": "code",
402 | "colab": {}
403 | },
404 | "source": [
405 | "# pitch indices in the input features\n",
406 | "pitch_idx = len(binary_dict) + 1\n",
407 | "pitch_indices = np.arange(len(binary_dict), len(binary_dict)+3)\n",
408 | "log_f0_conditioning = True"
409 | ],
410 | "execution_count": 0,
411 | "outputs": []
412 | },
413 | {
414 | "cell_type": "markdown",
415 | "metadata": {
416 | "id": "4q5INSdh6lVj",
417 | "colab_type": "text"
418 | },
419 | "source": [
420 | "### Predict time-lag"
421 | ]
422 | },
423 | {
424 | "cell_type": "code",
425 | "metadata": {
426 | "id": "SXzv451B6jS6",
427 | "colab_type": "code",
428 | "colab": {}
429 | },
430 | "source": [
431 | "lag = predict_timelag(device, labels, timelag_model, timelag_in_scaler,\n",
432 | " timelag_out_scaler, binary_dict, continuous_dict, pitch_indices,\n",
433 | " log_f0_conditioning)\n",
434 | "lag.shape"
435 | ],
436 | "execution_count": 0,
437 | "outputs": []
438 | },
439 | {
440 | "cell_type": "code",
441 | "metadata": {
442 | "id": "EQr9u_uv6lzg",
443 | "colab_type": "code",
444 | "colab": {}
445 | },
446 | "source": [
447 | "plot(lag / 50000, label=\"Timelag (in frames) for note onsets\", linewidth=2)\n",
448 | "xlabel(\"Time index in musical note\")\n",
449 | "ylabel(\"Timelag\")\n",
450 | "legend();"
451 | ],
452 | "execution_count": 0,
453 | "outputs": []
454 | },
455 | {
456 | "cell_type": "markdown",
457 | "metadata": {
458 | "id": "9OALA70D6sHy",
459 | "colab_type": "text"
460 | },
461 | "source": [
462 | "### Predict phoneme durations"
463 | ]
464 | },
465 | {
466 | "cell_type": "code",
467 | "metadata": {
468 | "id": "gffqkEYU6o5F",
469 | "colab_type": "code",
470 | "colab": {}
471 | },
472 | "source": [
473 | "durations = predict_duration(device, labels, duration_model,\n",
474 | " duration_in_scaler, duration_out_scaler, lag, binary_dict, continuous_dict,\n",
475 | " pitch_indices, log_f0_conditioning)\n",
476 | "durations.shape"
477 | ],
478 | "execution_count": 0,
479 | "outputs": []
480 | },
481 | {
482 | "cell_type": "code",
483 | "metadata": {
484 | "id": "QnoRw3Ah6tX1",
485 | "colab_type": "code",
486 | "colab": {}
487 | },
488 | "source": [
489 | "plot(durations, label=\"Phoneme durations in frames\")\n",
490 | "xlabel(\"Time index in phone\")\n",
491 | "ylabel(\"Duration\")\n",
492 | "legend();"
493 | ],
494 | "execution_count": 0,
495 | "outputs": []
496 | },
497 | {
498 | "cell_type": "code",
499 | "metadata": {
500 | "id": "oIBV5ZJj6xPv",
501 | "colab_type": "code",
502 | "colab": {}
503 | },
504 | "source": [
505 | "# Normalize phoneme durations to satisfy the constraints imposed by the musical score\n",
506 | "duration_modified_labels = postprocess_duration(labels, durations, lag)"
507 | ],
508 | "execution_count": 0,
509 | "outputs": []
510 | },
511 | {
512 | "cell_type": "markdown",
513 | "metadata": {
514 | "id": "C0_rgqQg6v77",
515 | "colab_type": "text"
516 | },
517 | "source": [
518 | "### Predict acoustic features"
519 | ]
520 | },
521 | {
522 | "cell_type": "code",
523 | "metadata": {
524 | "id": "WAuA45p26uP_",
525 | "colab_type": "code",
526 | "colab": {}
527 | },
528 | "source": [
529 | "acoustic_features = predict_acoustic(device, duration_modified_labels, acoustic_model,\n",
530 | " acoustic_in_scaler, acoustic_out_scaler, binary_dict, continuous_dict,\n",
531 | " \"coarse_coding\", pitch_indices, log_f0_conditioning)\n",
532 | "acoustic_features.shape"
533 | ],
534 | "execution_count": 0,
535 | "outputs": []
536 | },
537 | {
538 | "cell_type": "markdown",
539 | "metadata": {
540 | "id": "R0dkvEJK6318",
541 | "colab_type": "text"
542 | },
543 | "source": [
544 | "### Visualize acoustic features\n",
545 | "\n",
546 | "Before generating a waveform, let's visualize the acoustic features to understand how the acoustic model works. Since the acoustic features contain multiple different streams (*multi-stream*, e.g., mgc, lf0, vuv and bap), let us first split them."
547 | ]
548 | },
549 | {
550 | "cell_type": "code",
551 | "metadata": {
552 | "id": "coYE3tqF61tQ",
553 | "colab_type": "code",
554 | "colab": {}
555 | },
556 | "source": [
557 | "stream_sizes = acoustic_config.stream_sizes\n",
558 | "has_dynamic_features = acoustic_config.has_dynamic_features\n",
559 | "# (mgc, lf0, vuv, bap) with delta and delta-delta except for vuv\n",
560 | "stream_sizes, has_dynamic_features"
561 | ],
562 | "execution_count": 0,
563 | "outputs": []
564 | },
565 | {
566 | "cell_type": "code",
567 | "metadata": {
568 | "id": "LuCrC7GY65Ls",
569 | "colab_type": "code",
570 | "colab": {}
571 | },
572 | "source": [
573 | "feats = multi_stream_mlpg(\n",
574 | " acoustic_features, acoustic_out_scaler.var_, get_windows(3), stream_sizes,\n",
575 | " has_dynamic_features)\n",
576 | "# get static features\n",
577 | "mgc, diff_lf0, vuv, bap = split_streams(feats, [60, 1, 1, 5])"
578 | ],
579 | "execution_count": 0,
580 | "outputs": []
581 | },
582 | {
583 | "cell_type": "markdown",
584 | "metadata": {
585 | "id": "ma-6HPn068Kb",
586 | "colab_type": "text"
587 | },
588 | "source": [
589 | "#### Visualize F0"
590 | ]
591 | },
592 | {
593 | "cell_type": "code",
594 | "metadata": {
595 | "id": "Z0OPYJoT6511",
596 | "colab_type": "code",
597 | "colab": {}
598 | },
599 | "source": [
600 | "# relative f0 -> absolute f0\n",
601 | "# need to extract pitch sequence from the musical score\n",
602 | "linguistic_features = fe.linguistic_features(duration_modified_labels,\n",
603 | " binary_dict, continuous_dict,\n",
604 | " add_frame_features=True,\n",
605 | " subphone_features=\"coarse_coding\")\n",
606 | "f0_score = _midi_to_hz(linguistic_features, pitch_idx, False)[:, None]\n",
607 | "lf0_score = f0_score.copy()\n",
608 | "nonzero_indices = np.nonzero(lf0_score)\n",
609 | "lf0_score[nonzero_indices] = np.log(f0_score[nonzero_indices])\n",
610 | "lf0_score = interp1d(lf0_score, kind=\"slinear\")\n",
611 | "\n",
612 | "f0 = diff_lf0 + lf0_score\n",
613 | "f0[vuv < 0.5] = 0\n",
614 | "f0[np.nonzero(f0)] = np.exp(f0[np.nonzero(f0)])"
615 | ],
616 | "execution_count": 0,
617 | "outputs": []
618 | },
619 | {
620 | "cell_type": "code",
621 | "metadata": {
622 | "id": "dwqgIQfp69QJ",
623 | "colab_type": "code",
624 | "colab": {}
625 | },
626 | "source": [
627 | "plot(f0[-2500:, :], linewidth=2, label=\"F0 contour (in Hz)\")\n",
628 | "plot((vuv[-2500:, :] > 0.5)*100, linewidth=2, label=\"Voiced/unvoiced flag\")\n",
629 | "legend()\n",
630 | "xlabel(\"Frame\")\n",
631 | "ylabel(\"F0 (in Hz)\");"
632 | ],
633 | "execution_count": 0,
634 | "outputs": []
635 | },
636 | {
637 | "cell_type": "markdown",
638 | "metadata": {
639 | "id": "smlsejRv7ByF",
640 | "colab_type": "text"
641 | },
642 | "source": [
643 | "#### Visualize spectrogram"
644 | ]
645 | },
646 | {
647 | "cell_type": "code",
648 | "metadata": {
649 | "id": "mdsyUi3u6-Rt",
650 | "colab_type": "code",
651 | "colab": {}
652 | },
653 | "source": [
654 | "# Trim and visualize (to save memory and time)\n",
655 | "logsp = np.log(pysptk.mc2sp(mgc[-2500:, :], alpha=alpha, fftlen=fftlen))\n",
656 | "librosa.display.specshow(logsp.T, sr=sample_rate, hop_length=hop_length, x_axis=\"time\", y_axis=\"linear\", cmap=\"jet\");"
657 | ],
658 | "execution_count": 0,
659 | "outputs": []
660 | },
661 | {
662 | "cell_type": "markdown",
663 | "metadata": {
664 | "id": "GpnhK_Um7FPX",
665 | "colab_type": "text"
666 | },
667 | "source": [
668 | "#### Visualize aperiodicity"
669 | ]
670 | },
671 | {
672 | "cell_type": "code",
673 | "metadata": {
674 | "id": "OUCWh5Er7DAU",
675 | "colab_type": "code",
676 | "colab": {}
677 | },
678 | "source": [
679 | "aperiodicity = pyworld.decode_aperiodicity(bap[-2500:, :].astype(np.float64), sample_rate, fftlen)\n",
680 | "librosa.display.specshow(aperiodicity.T, sr=sample_rate, hop_length=hop_length, x_axis=\"time\", y_axis=\"linear\", cmap=\"jet\");"
681 | ],
682 | "execution_count": 0,
683 | "outputs": []
684 | },
685 | {
686 | "cell_type": "markdown",
687 | "metadata": {
688 | "id": "_i8cFqle7Ibm",
689 | "colab_type": "text"
690 | },
691 | "source": [
692 | "### Generate waveform\n",
693 | "\n",
694 | "Finally, let's generate a waveform and listen to the sample."
695 | ]
696 | },
697 | {
698 | "cell_type": "code",
699 | "metadata": {
700 | "id": "BLMUQa2c7Gei",
701 | "colab_type": "code",
702 | "colab": {}
703 | },
704 | "source": [
705 | "generated_waveform = gen_waveform(\n",
706 | " duration_modified_labels, acoustic_features, acoustic_out_scaler,\n",
707 | " binary_dict, continuous_dict, acoustic_config.stream_sizes,\n",
708 | " acoustic_config.has_dynamic_features,\n",
709 | " \"coarse_coding\", log_f0_conditioning,\n",
710 | " pitch_idx, num_windows=3,\n",
711 | " post_filter=True, sample_rate=sample_rate, frame_period=frame_period,\n",
712 | " relative_f0=True)\n",
713 | "\n",
714 | "# trim leading/trailing silence for convenience\n",
715 | "generated_waveform = librosa.effects.trim(generated_waveform)[0]"
716 | ],
717 | "execution_count": 0,
718 | "outputs": []
719 | },
720 | {
721 | "cell_type": "markdown",
722 | "metadata": {
723 | "id": "XYSyTQylDvIR",
724 | "colab_type": "text"
725 | },
726 | "source": [
727 | "## Listen to the generated sample"
728 | ]
729 | },
730 | {
731 | "cell_type": "code",
732 | "metadata": {
733 | "id": "l-qNFhnu7J1T",
734 | "colab_type": "code",
735 | "colab": {}
736 | },
737 | "source": [
738 | "librosa.display.waveplot(generated_waveform, sample_rate, x_axis=\"time\")\n",
739 | "IPython.display.display(Audio(generated_waveform, rate=sample_rate))"
740 | ],
741 | "execution_count": 0,
742 | "outputs": []
743 | },
744 | {
745 | "cell_type": "markdown",
746 | "metadata": {
747 | "id": "5a2Q8N487MPZ",
748 | "colab_type": "text"
749 | },
750 | "source": [
751 | "## Summary\n",
752 | "\n",
753 | "A demo of a singing voice synthesis system based on neural networks. Full code is available at https://github.com/r9y9/nnsvs."
754 | ]
755 | },
756 | {
757 | "cell_type": "markdown",
758 | "metadata": {
759 | "id": "5XQ_diETC6FB",
760 | "colab_type": "text"
761 | },
762 | "source": [
763 | "\n",
764 | "## References\n",
765 | "\n",
766 | "- Kiritan database: https://zunko.jp/kiridev/login.php\n",
767 | "- Code to reproduce: https://github.com/r9y9/nnsvs"
768 | ]
769 | }
770 | ]
771 | }
--------------------------------------------------------------------------------
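For reference, here is a condensed sketch of the score-to-waveform pipeline that the notebook above walks through step by step (time-lag → duration → acoustic features → waveform). It simply chains the same nnsvs calls with the same arguments as the notebook cells, so treat it as a summary of those cells rather than an official nnsvs API; it assumes all models, scalers, and question-set dictionaries from the notebook are already loaded.

```python
# Condensed sketch of the pipeline above (musicxml -> waveform), reusing the
# models, scalers, and question-set dictionaries loaded by the notebook cells.
from nnsvs.frontend.ja import xml2lab
from nnsvs.gen import (predict_timelag, predict_duration, predict_acoustic,
                       postprocess_duration, gen_waveform)

def synthesize_from_xml(xml_path):
    labels = xml2lab(xml_path).round_()
    # Note-onset time-lags
    lag = predict_timelag(device, labels, timelag_model, timelag_in_scaler,
                          timelag_out_scaler, binary_dict, continuous_dict,
                          pitch_indices, log_f0_conditioning)
    # Phoneme durations, then align them with the score
    durations = predict_duration(device, labels, duration_model,
                                 duration_in_scaler, duration_out_scaler, lag,
                                 binary_dict, continuous_dict,
                                 pitch_indices, log_f0_conditioning)
    modified_labels = postprocess_duration(labels, durations, lag)
    # Acoustic features and final waveform
    acoustic = predict_acoustic(device, modified_labels, acoustic_model,
                                acoustic_in_scaler, acoustic_out_scaler,
                                binary_dict, continuous_dict, "coarse_coding",
                                pitch_indices, log_f0_conditioning)
    return gen_waveform(modified_labels, acoustic, acoustic_out_scaler,
                        binary_dict, continuous_dict,
                        acoustic_config.stream_sizes,
                        acoustic_config.has_dynamic_features,
                        "coarse_coding", log_f0_conditioning, pitch_idx,
                        num_windows=3, post_filter=True,
                        sample_rate=sample_rate, frame_period=frame_period,
                        relative_f0=True)

# Example (after running the notebook cells above):
# wav = synthesize_from_xml("kiritan_singing/musicxml/02.xml")
```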
/README.md:
--------------------------------------------------------------------------------
1 | # Colaboratory
2 |
3 | Notebooks intended to be run on [Google Colaboratory](https://colab.research.google.com).
4 |
--------------------------------------------------------------------------------
/Tacotron2_and_WaveNet_text_to_speech_demo.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "name": "Tacotron2 and WaveNet text-to-speech demo.ipynb",
7 | "provenance": [],
8 | "private_outputs": true,
9 | "collapsed_sections": [],
10 | "toc_visible": true,
11 | "include_colab_link": true
12 | },
13 | "kernelspec": {
14 | "name": "python3",
15 | "display_name": "Python 3"
16 | },
17 | "accelerator": "GPU"
18 | },
19 | "cells": [
20 | {
21 | "cell_type": "markdown",
22 | "metadata": {
23 | "id": "view-in-github",
24 | "colab_type": "text"
25 | },
26 | "source": [
27 | "[View in Colaboratory](https://colab.research.google.com/github/r9y9/Colaboratory/blob/master/Tacotron2_and_WaveNet_text_to_speech_demo.ipynb)"
28 | ]
29 | },
30 | {
31 | "cell_type": "markdown",
32 | "metadata": {
33 | "id": "syjMmwOEa-uk",
34 | "colab_type": "text"
35 | },
36 | "source": [
37 | "# Tacotron2: WaveNet-based text-to-speech demo\n",
38 | "\n",
39 | "- Tacotron2 (mel-spectrogram prediction part): https://github.com/Rayhane-mamah/Tacotron-2\n",
40 | "- WaveNet: https://github.com/r9y9/wavenet_vocoder\n",
41 | "\n",
42 | "This is a proof of concept for Tacotron2 text-to-speech synthesis. Models used here were trained on the [LJSpeech dataset](https://keithito.com/LJ-Speech-Dataset/).\n",
43 | "\n",
44 | "**Notice**: The waveform generation is very slow since it uses naive autoregressive generation. It doesn't use the parallel generation method described in [Parallel WaveNet](https://arxiv.org/abs/1711.10433).\n",
45 | "\n",
46 | "**Estimated time to complete**: 2 ~ 3 hours."
47 | ]
48 | },
49 | {
50 | "cell_type": "markdown",
51 | "metadata": {
52 | "id": "m7R_1MpFc3Za",
53 | "colab_type": "text"
54 | },
55 | "source": [
56 | "## Setup\n",
57 | "\n",
58 | "### Install dependencies"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "metadata": {
64 | "id": "NlLC7Q7Us8go",
65 | "colab_type": "code",
66 | "colab": {}
67 | },
68 | "source": [
69 | "import os\n",
70 | "from os.path import exists, join, expanduser\n",
71 | "\n",
72 | "os.chdir(expanduser(\"~\"))\n",
73 | "\n",
74 | "wavenet_dir = \"wavenet_vocoder\"\n",
75 | "if not exists(wavenet_dir):\n",
76 | " ! git clone https://github.com/r9y9/$wavenet_dir\n",
77 | " ! cd wavenet_vocoder && git checkout v0.1.1 && cd -\n",
78 | " \n",
79 | "taco2_dir = \"Tacotron-2\"\n",
80 | "if not exists(taco2_dir):\n",
81 | " ! git clone https://github.com/r9y9/$taco2_dir\n",
82 | " ! cd $taco2_dir && git checkout -B wavenet3 origin/wavenet3"
83 | ],
84 | "execution_count": 0,
85 | "outputs": []
86 | },
87 | {
88 | "cell_type": "code",
89 | "metadata": {
90 | "id": "KBFfji_Avluz",
91 | "colab_type": "code",
92 | "colab": {}
93 | },
94 | "source": [
95 | "# Install dependencies\n",
96 | "! pip install -q -U \"tensorflow<=1.9.0\"\n",
97 | "! pip install -q -U \"keras==2.2.4\"\n",
98 | "! pip install -q -U \"numpy<1.16\"\n",
99 | "! pip install -q -U \"pysptk<=0.1.14\"\n",
100 | "\n",
101 | "os.chdir(join(expanduser(\"~\"), taco2_dir))\n",
102 | "! pip install -q -r requirements.txt\n",
103 | "\n",
104 | "os.chdir(join(expanduser(\"~\"), wavenet_dir))\n",
105 | "! pip install -q -e '.[train]'"
106 | ],
107 | "execution_count": 0,
108 | "outputs": []
109 | },
110 | {
111 | "cell_type": "code",
112 | "metadata": {
113 | "id": "15p8phXx6nxe",
114 | "colab_type": "code",
115 | "colab": {}
116 | },
117 | "source": [
118 | "import torch\n",
119 | "import tensorflow\n",
120 | "import pysptk\n",
121 | "import numpy as np\n",
122 | "tensorflow.__version__, pysptk.__version__, np.__version__"
123 | ],
124 | "execution_count": 0,
125 | "outputs": []
126 | },
127 | {
128 | "cell_type": "markdown",
129 | "metadata": {
130 | "id": "_fZo1X7ac_Tp",
131 | "colab_type": "text"
132 | },
133 | "source": [
134 | "### Download pretrained models\n",
135 | "\n",
136 | "#### Tacotron2 (mel-spectrogram prediction part)"
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "metadata": {
142 | "id": "Sau06KhizkoD",
143 | "colab_type": "code",
144 | "colab": {}
145 | },
146 | "source": [
147 | "os.chdir(join(expanduser(\"~\"), taco2_dir))\n",
148 | "! mkdir -p logs-Tacotron\n",
149 | "if not exists(\"logs-Tacotron/pretrained\"):\n",
150 | " ! curl -O -L \"https://www.dropbox.com/s/vx7y4qqs732sqgg/pretrained.tar.gz\"\n",
151 | " ! tar xzvf pretrained.tar.gz\n",
152 | " ! mv pretrained logs-Tacotron"
153 | ],
154 | "execution_count": 0,
155 | "outputs": []
156 | },
157 | {
158 | "cell_type": "markdown",
159 | "metadata": {
160 | "id": "D4tWl_hfdXdh",
161 | "colab_type": "text"
162 | },
163 | "source": [
164 | "#### WaveNet"
165 | ]
166 | },
167 | {
168 | "cell_type": "code",
169 | "metadata": {
170 | "id": "q2kwJ-t_ykXZ",
171 | "colab_type": "code",
172 | "colab": {}
173 | },
174 | "source": [
175 | "os.chdir(join(expanduser(\"~\"), wavenet_dir))\n",
176 | "wn_preset = \"20180510_mixture_lj_checkpoint_step000320000_ema.json\"\n",
177 | "wn_checkpoint_path = \"20180510_mixture_lj_checkpoint_step000320000_ema.pth\"\n",
178 | "\n",
179 | "if not exists(wn_preset):\n",
180 | " !curl -O -L \"https://www.dropbox.com/s/0vsd7973w20eskz/20180510_mixture_lj_checkpoint_step000320000_ema.json\"\n",
181 | "if not exists(wn_checkpoint_path):\n",
182 | " !curl -O -L \"https://www.dropbox.com/s/zdbfprugbagfp2w/20180510_mixture_lj_checkpoint_step000320000_ema.pth\""
183 | ],
184 | "execution_count": 0,
185 | "outputs": []
186 | },
187 | {
188 | "cell_type": "markdown",
189 | "metadata": {
190 | "id": "km1SAASEcIL6",
191 | "colab_type": "text"
192 | },
193 | "source": [
194 | "## Input texts to be synthesized\n",
195 | "\n",
196 | "Choose your favorite sentences :)"
197 | ]
198 | },
199 | {
200 | "cell_type": "code",
201 | "metadata": {
202 | "id": "4LeTMHHFdcmS",
203 | "colab_type": "code",
204 | "colab": {}
205 | },
206 | "source": [
207 | "os.chdir(join(expanduser(\"~\"), taco2_dir))"
208 | ],
209 | "execution_count": 0,
210 | "outputs": []
211 | },
212 | {
213 | "cell_type": "code",
214 | "metadata": {
215 | "id": "tU1lz6PcbXut",
216 | "colab_type": "code",
217 | "colab": {}
218 | },
219 | "source": [
220 | "%%bash\n",
221 | "cat << EOS > text_list.txt\n",
222 | "This is really awesome!\n",
223 | "This is text-to-speech online demonstration by Tacotron 2 and WaveNet.\n",
224 | "Thanks for your patience.\n",
225 | "EOS\n",
226 | "\n",
227 | "cat text_list.txt"
228 | ],
229 | "execution_count": 0,
230 | "outputs": []
231 | },
232 | {
233 | "cell_type": "markdown",
234 | "metadata": {
235 | "id": "K9akhzMhbWe0",
236 | "colab_type": "text"
237 | },
238 | "source": [
239 | "## Mel-spectrogram prediction by Tacotron2"
240 | ]
241 | },
242 | {
243 | "cell_type": "code",
244 | "metadata": {
245 | "id": "0n4h5aa51dHS",
246 | "colab_type": "code",
247 | "colab": {}
248 | },
249 | "source": [
250 | "# Remove old files if exist\n",
251 | "! rm -rf tacotron_output\n",
252 | "! python synthesize.py --model='Tacotron' --mode='eval' \\\n",
253 | " --hparams='symmetric_mels=False,max_abs_value=4.0,power=1.1,outputs_per_step=1' \\\n",
254 | " --text_list=./text_list.txt"
255 | ],
256 | "execution_count": 0,
257 | "outputs": []
258 | },
259 | {
260 | "cell_type": "markdown",
261 | "metadata": {
262 | "id": "FF1mh1Jvdp0a",
263 | "colab_type": "text"
264 | },
265 | "source": [
266 | "## Waveform synthesis by WaveNet"
267 | ]
268 | },
269 | {
270 | "cell_type": "code",
271 | "metadata": {
272 | "id": "rY_MfE0m8Ese",
273 | "colab_type": "code",
274 | "colab": {}
275 | },
276 | "source": [
277 | "import librosa.display\n",
278 | "import IPython\n",
279 | "from IPython.display import Audio\n",
280 | "import numpy as np\n",
281 | "import torch"
282 | ],
283 | "execution_count": 0,
284 | "outputs": []
285 | },
286 | {
287 | "cell_type": "code",
288 | "metadata": {
289 | "id": "vTmp0T0G3lU0",
290 | "colab_type": "code",
291 | "colab": {}
292 | },
293 | "source": [
294 | "os.chdir(join(expanduser(\"~\"), wavenet_dir))\n",
295 | "\n",
296 | "# Setup WaveNet vocoder hparams\n",
297 | "from hparams import hparams\n",
298 | "with open(wn_preset) as f:\n",
299 | " hparams.parse_json(f.read())\n",
300 | "\n",
301 | "# Setup WaveNet vocoder\n",
302 | "from train import build_model\n",
303 | "from synthesis import wavegen\n",
304 | "import torch\n",
305 | "\n",
306 | "use_cuda = torch.cuda.is_available()\n",
307 | "device = torch.device(\"cuda\" if use_cuda else \"cpu\")\n",
308 | "\n",
309 | "model = build_model().to(device)\n",
310 | "\n",
311 | "print(\"Load checkpoint from {}\".format(wn_checkpoint_path))\n",
312 | "checkpoint = torch.load(wn_checkpoint_path)\n",
313 | "model.load_state_dict(checkpoint[\"state_dict\"])"
314 | ],
315 | "execution_count": 0,
316 | "outputs": []
317 | },
318 | {
319 | "cell_type": "code",
320 | "metadata": {
321 | "id": "334X6oFK6Vf9",
322 | "colab_type": "code",
323 | "colab": {}
324 | },
325 | "source": [
326 | "from glob import glob\n",
327 | "from tqdm import tqdm\n",
328 | "\n",
329 | "with open(\"../Tacotron-2/tacotron_output/eval/map.txt\") as f:\n",
330 | " maps = f.readlines()\n",
331 | "maps = list(map(lambda x:x[:-1].split(\"|\"), maps))\n",
332 | "# filter out invalid ones\n",
333 | "maps = list(filter(lambda x:len(x) == 2, maps))\n",
334 | "\n",
335 | "print(\"List of texts to be synthesized\")\n",
336 | "for idx, (text,_) in enumerate(maps):\n",
337 | " print(idx, text)"
338 | ],
339 | "execution_count": 0,
340 | "outputs": []
341 | },
342 | {
343 | "cell_type": "markdown",
344 | "metadata": {
345 | "id": "yaleFjoyiND_",
346 | "colab_type": "text"
347 | },
348 | "source": [
349 | "### Waveform generation\n",
350 | "\n",
351 | "**Note**: This will take hours to finish, depending on the number and length of the texts. Try short sentences first if you would like to hear samples quickly."
352 | ]
353 | },
354 | {
355 | "cell_type": "code",
356 | "metadata": {
357 | "id": "j9BO7IES7Htp",
358 | "colab_type": "code",
359 | "colab": {}
360 | },
361 | "source": [
362 | "waveforms = []\n",
363 | "\n",
364 | "for idx, (text, mel) in enumerate(maps):\n",
365 | " print(\"\\n\", idx, text)\n",
366 | " mel_path = join(\"../Tacotron-2\", mel)\n",
367 | " c = np.load(mel_path)\n",
368 | " if c.shape[1] != hparams.num_mels:\n",
369 | "    c = np.swapaxes(c, 0, 1)\n",
370 | " # Range [0, 4] was used for training Tacotron2 but WaveNet vocoder assumes [0, 1]\n",
371 | " c = np.interp(c, (0, 4), (0, 1))\n",
372 | " \n",
373 | " # Generate\n",
374 | " waveform = wavegen(model, c=c, fast=True, tqdm=tqdm)\n",
375 | " \n",
376 | " waveforms.append(waveform)\n",
377 | "\n",
378 | " # Audio\n",
379 | " IPython.display.display(Audio(waveform, rate=hparams.sample_rate))"
380 | ],
381 | "execution_count": 0,
382 | "outputs": []
383 | },
384 | {
385 | "cell_type": "markdown",
386 | "metadata": {
387 | "id": "hNG8oI4OiJkJ",
388 | "colab_type": "text"
389 | },
390 | "source": [
391 | "## Summary: audio samples"
392 | ]
393 | },
394 | {
395 | "cell_type": "code",
396 | "metadata": {
397 | "id": "OIyfhn0v9Ntg",
398 | "colab_type": "code",
399 | "colab": {}
400 | },
401 | "source": [
402 | "for idx, (text, mel) in enumerate(maps):\n",
403 | " print(idx, text)\n",
404 | " IPython.display.display(Audio(waveforms[idx], rate=hparams.sample_rate))"
405 | ],
406 | "execution_count": 0,
407 | "outputs": []
408 | },
409 | {
410 | "cell_type": "markdown",
411 | "metadata": {
412 | "id": "O0hc4ah-gMUa",
413 | "colab_type": "text"
414 | },
415 | "source": [
416 | "For more information, please visit https://github.com/r9y9/wavenet_vocoder. More samples can be found at https://r9y9.github.io/wavenet_vocoder/. "
417 | ]
418 | }
419 | ]
420 | }
--------------------------------------------------------------------------------
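One optional follow-up to the Tacotron2 + WaveNet demo above: the generation loop only plays each sample inline, so a short sketch for writing the results to disk may be handy. It is not part of the original notebook; it assumes `waveforms`, `maps`, and `hparams` exist as produced by the cells above and that each waveform is a float NumPy array at `hparams.sample_rate`.

```python
# Hypothetical post-processing step (not in the original notebook): save each
# generated WaveNet sample to a WAV file, keeping the text/waveform pairing.
import os
import numpy as np
from scipy.io import wavfile

out_dir = "generated_wavs"
os.makedirs(out_dir, exist_ok=True)
for idx, (text, mel) in enumerate(maps):
    path = os.path.join(out_dir, "sample_{:02d}.wav".format(idx))
    wavfile.write(path, hparams.sample_rate, waveforms[idx].astype(np.float32))
    print(path, "<-", text)
```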