├── .gitignore ├── LICENSE ├── README.md ├── demo_streamlit.ipynb ├── demo_voice.py ├── encoder ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-37.pyc │ ├── audio.cpython-37.pyc │ ├── inference.cpython-37.pyc │ ├── model.cpython-37.pyc │ ├── params_data.cpython-37.pyc │ └── params_model.cpython-37.pyc ├── audio.py ├── config.py ├── data_objects │ ├── __init__.py │ ├── random_cycler.py │ ├── speaker.py │ ├── speaker_batch.py │ ├── speaker_verification_dataset.py │ └── utterance.py ├── inference.py ├── model.py ├── params_data.py ├── params_model.py ├── preprocess.py ├── saved_models │ └── .gitkeep ├── train.py └── visualizations.py ├── helper.py ├── requirements_demo.txt ├── samples ├── .DS_Store ├── 1320_00000.mp3 ├── 3575_00000.mp3 ├── 8230_00000.mp3 ├── README.md ├── VCTK.txt ├── myvoice.mp3 ├── p240_00000.mp3 └── p260_00000.mp3 ├── slides.pdf ├── synthesizer ├── LICENSE.txt ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-37.pyc │ ├── audio.cpython-37.pyc │ ├── hparams.cpython-37.pyc │ ├── inference.cpython-37.pyc │ ├── infolog.cpython-37.pyc │ └── tacotron2.cpython-37.pyc ├── audio.py ├── feeder.py ├── hparams.py ├── inference.py ├── infolog.py ├── models │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ ├── architecture_wrappers.cpython-37.pyc │ │ ├── attention.cpython-37.pyc │ │ ├── custom_decoder.cpython-37.pyc │ │ ├── helpers.cpython-37.pyc │ │ ├── modules.cpython-37.pyc │ │ └── tacotron.cpython-37.pyc │ ├── architecture_wrappers.py │ ├── attention.py │ ├── custom_decoder.py │ ├── helpers.py │ ├── modules.py │ └── tacotron.py ├── preprocess.py ├── synthesize.py ├── tacotron2.py ├── train.py └── utils │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-37.pyc │ ├── cleaners.cpython-37.pyc │ ├── numbers.cpython-37.pyc │ ├── plot.cpython-37.pyc │ ├── symbols.cpython-37.pyc │ └── text.cpython-37.pyc │ ├── _cmudict.py │ ├── cleaners.py │ ├── numbers.py │ ├── plot.py │ ├── symbols.py │ └── text.py └── vocoder ├── LICENSE.txt ├── __pycache__ ├── audio.cpython-37.pyc ├── display.cpython-37.pyc ├── distribution.cpython-37.pyc ├── hparams.cpython-37.pyc └── inference.cpython-37.pyc ├── audio.py ├── display.py ├── distribution.py ├── gen_wavernn.py ├── hparams.py ├── inference.py ├── models ├── __pycache__ │ └── fatchord_version.cpython-37.pyc ├── deepmind_version.py └── fatchord_version.py ├── saved_models └── .gitkeep ├── train.py └── vocoder_dataset.py /.gitignore: -------------------------------------------------------------------------------- 1 | # don't upload wave files and pretrained models 2 | *.wav 3 | *.pt 4 | *.pyc 5 | *data-00000-of-00001 6 | synthesizer/saved_models/logs-pretrained/taco_pretrained/tacotron_model.ckpt-278000.data-00000-of-00001 7 | synthesizer/saved_models/* 8 | __pycache__/ 9 | 10 | # pipfiles 11 | Pipfile* 12 | 13 | # exception to the rule 14 | !saved_models/.gitkeep -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 dataroots 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the 
following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Streamlit Demo: Real-Time Voice Cloning 2 | 3 | This repository demonstrates how a simple voice transfer app can be created using [Streamlit](https://www.streamlit.io/). The code for this demo is based on the repository for [Real-Time-Voice-Cloning](https://github.com/CorentinJ/Real-Time-Voice-Cloning). 4 | 5 | This app allows you to: 6 | * Record your voice 7 | * Visualize the embedding of the speaker 8 | * Synthesize speech based on the recorded voice 9 | 10 | 11 | ## Setup 12 | 13 | ### 1. Install Requirements 14 | **Python 3.6 or 3.7** is needed 15 | 16 | * Create your virtual environment (e.g. [pipenv](https://pipenv.pypa.io/en/latest/), [poetry](https://python-poetry.org/) or [venv](https://docs.python.org/3/library/venv.html)). 17 | * Install [PyTorch](https://pytorch.org/get-started/locally/) (>=1.0.1). 18 | * Install [ffmpeg](https://ffmpeg.org/download.html#get-packages). 19 | * Run `pip install -r requirements_demo.txt` to install the remaining necessary packages. 20 | 21 | ### 2. Download Pretrained Models 22 | Download the latest [here](https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Pretrained-models). 23 | 24 | ### 3. Launch streamlit demo 25 | 26 | * `streamlit run demo_voice.py` -------------------------------------------------------------------------------- /demo_streamlit.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from synthesizer.inference import Synthesizer\n", 10 | "from encoder import inference as encoder\n", 11 | "from vocoder import inference as vocoder\n", 12 | "from pathlib import Path\n", 13 | "import numpy as np\n", 14 | "import soundfile as sf\n", 15 | "import os\n", 16 | "import librosa\n", 17 | "import sounddevice as sd\n", 18 | "import wavio\n", 19 | "import glob\n", 20 | "from helper import draw_embed, create_spectrogram, read_audio\n", 21 | "%matplotlib inline" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "# 1. 
Record your own voice" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "print(\"Recording...\")\n", 38 | "duration = 5 # seconds\n", 39 | "fs = 48000\n", 40 | "sd.default.samplerate = fs\n", 41 | "sd.default.channels = 1\n", 42 | "myrecording = sd.rec(int(duration * fs))\n", 43 | "sd.wait(duration)\n", 44 | "print(\"Saving sample as myvoice.mp3\")\n", 45 | "path_myrecording = \"./samples/myvoice.mp3\"\n", 46 | "wavio.write(path_myrecording, myrecording, fs, sampwidth=2)\n", 47 | "sd.play(myrecording, fs) #st\n", 48 | "print(\"Done! Saved sample as myvoice.mp3\")" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "fig = create_spectrogram(path_myrecording)" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "# 2. Load your pretrained models" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "print(\"Loading pretrained models...\")\n", 74 | "seed = 42\n", 75 | "low_mem = False\n", 76 | "num_generated = 0\n", 77 | "enc_model_fpath = Path(\"encoder/saved_models/pretrained.pt\")\n", 78 | "syn_model_dir = Path(\"synthesizer/saved_models/logs-pretrained/\")\n", 79 | "voc_model_fpath = Path(\"vocoder/saved_models/pretrained/pretrained.pt\")\n", 80 | "encoder.load_model(enc_model_fpath)\n", 81 | "synthesizer = Synthesizer(\n", 82 | " syn_model_dir.joinpath(\"taco_pretrained\"), low_mem=low_mem, seed=seed\n", 83 | ")\n", 84 | "vocoder.load_model(voc_model_fpath)\n", 85 | "print(\"Loaded pretrained models!\")" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "# 3. Choose a recording" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "audio_folder = \"samples\"\n", 102 | "filenames = glob.glob(os.path.join(audio_folder, \"*.mp3\"))\n", 103 | "print(filenames)\n", 104 | "\n", 105 | "selected_filename = 'samples/myvoice.mp3'\n", 106 | "in_fpath = Path(selected_filename.replace('\"', \"\").replace(\"'\", \"\"))" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "# 4. Start preprocessing" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "original_wav, sampling_rate = librosa.load(str(in_fpath))\n", 123 | "preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)\n", 124 | "print(\"Loaded file succesfully!\")\n", 125 | "embed = encoder.embed_utterance(preprocessed_wav)\n", 126 | "sd.play(original_wav, sampling_rate) #st\n", 127 | "print(\"Created the embedding\")" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "fig = draw_embed(embed, \"myembedding\", None)" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": [ 143 | "# 5. 
Synthesize the text you like to hear" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "## Generating the spectrogram\n", 153 | "text = input(\"Write a sentence (+-20 words) to be synthesized:\\n\")" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "if text != \"\":\n", 163 | " texts = [text]\n", 164 | " embeds = [embed]\n", 165 | " # If you know what the attention layer alignments are,\n", 166 | " # you can retrieve them here by passing return_alignments=True\n", 167 | " specs = synthesizer.synthesize_spectrograms(texts, embeds)\n", 168 | " spec = specs[0]\n", 169 | " print(\"Created the mel spectrogram\")\n", 170 | "\n", 171 | " # Generating the waveform\n", 172 | " print(\"Synthesizing the waveform:\")\n", 173 | "\n", 174 | " generated_wav = vocoder.infer_waveform(spec)\n", 175 | "\n", 176 | " # Post-generation\n", 177 | " # There's a bug with sounddevice that makes the audio cut one\n", 178 | " # second earlier, so we pad it.\n", 179 | " generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode=\"constant\")\n", 180 | "\n", 181 | " # Trim excess silences to compensate for gaps in spectrograms (issue #53)\n", 182 | " generated_wav = encoder.preprocess_wav(generated_wav)\n", 183 | "\n", 184 | " # Play the audio (non-blocking)\n", 185 | " try:\n", 186 | " sd.stop()\n", 187 | " sd.play(generated_wav, synthesizer.sample_rate)\n", 188 | " except sd.PortAudioError as e:\n", 189 | " print(\"\\nCaught exception: %s\" % repr(e))\n", 190 | " print(\n", 191 | " 'Continuing without audio playback. Suppress this message with \\\n", 192 | " the \"--no_sound\" flag.\\n'\n", 193 | " )\n", 194 | "\n", 195 | " # Save it on the disk\n", 196 | " filename = \"demo_output_%02d.wav\" % num_generated\n", 197 | " sf.write(filename, generated_wav.astype(np.float32), synthesizer.sample_rate)\n", 198 | " num_generated += 1\n", 199 | " print(\"\\nSaved output as %s\\n\\n\" % filename)" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [] 208 | } 209 | ], 210 | "metadata": { 211 | "kernelspec": { 212 | "display_name": "Python 3", 213 | "language": "python", 214 | "name": "python3" 215 | }, 216 | "language_info": { 217 | "codemirror_mode": { 218 | "name": "ipython", 219 | "version": 3 220 | }, 221 | "file_extension": ".py", 222 | "mimetype": "text/x-python", 223 | "name": "python", 224 | "nbconvert_exporter": "python", 225 | "pygments_lexer": "ipython3", 226 | "version": "3.7.6" 227 | } 228 | }, 229 | "nbformat": 4, 230 | "nbformat_minor": 4 231 | } 232 | -------------------------------------------------------------------------------- /demo_voice.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | from synthesizer.inference import Synthesizer 3 | from encoder import inference as encoder 4 | from vocoder import inference as vocoder 5 | from pathlib import Path 6 | import numpy as np 7 | import soundfile as sf 8 | import os 9 | import librosa 10 | import glob 11 | from helper import draw_embed, create_spectrogram, read_audio, record, save_record 12 | 13 | "# Streamlit showcase" 14 | 15 | model_load_state = st.text("Loading pretrained models...") 16 | 17 | seed = 42 18 | low_mem = False 19 | num_generated = 0 20 | enc_model_fpath = 
Path("encoder/saved_models/pretrained.pt") 21 | syn_model_dir = Path("synthesizer/saved_models/logs-pretrained/") 22 | voc_model_fpath = Path("vocoder/saved_models/pretrained/pretrained.pt") 23 | encoder.load_model(enc_model_fpath) 24 | synthesizer = Synthesizer( 25 | syn_model_dir.joinpath("taco_pretrained"), low_mem=low_mem, seed=seed 26 | ) 27 | vocoder.load_model(voc_model_fpath) 28 | 29 | model_load_state.text("Loaded pretrained models!") 30 | 31 | st.header("1. Record your own voice") 32 | 33 | filename = st.text_input("Choose a filename: ") 34 | 35 | if st.button(f"Click to Record"): 36 | if filename == "": 37 | st.warning("Choose a filename.") 38 | else: 39 | record_state = st.text("Recording...") 40 | duration = 5 # seconds 41 | fs = 48000 42 | myrecording = record(duration, fs) 43 | record_state.text(f"Saving sample as {filename}.mp3") 44 | 45 | path_myrecording = f"./samples/{filename}.mp3" 46 | 47 | save_record(path_myrecording, myrecording, fs) 48 | record_state.text(f"Done! Saved sample as {filename}.mp3") 49 | 50 | st.audio(read_audio(path_myrecording)) 51 | 52 | fig = create_spectrogram(path_myrecording) 53 | st.pyplot(fig) 54 | 55 | "## 2. Choose an audio record" 56 | 57 | audio_folder = "samples" 58 | filenames = glob.glob(os.path.join(audio_folder, "*.mp3")) 59 | selected_filename = st.selectbox("Select a file", filenames) 60 | 61 | if selected_filename is not None: 62 | # Create embedding 63 | in_fpath = Path(selected_filename.replace('"', "").replace("'", "")) 64 | original_wav, sampling_rate = librosa.load(str(in_fpath)) 65 | preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate) 66 | embed = encoder.embed_utterance(preprocessed_wav) 67 | st.success("Created the embedding") 68 | 69 | st.audio(read_audio(in_fpath)) 70 | 71 | if st.sidebar.checkbox("Do you want to change your embedding?"): 72 | height = int(np.sqrt(len(embed))) 73 | shape = (height, -1) 74 | matrix_embed = np.round(embed, 2).reshape(shape) 75 | matrix_embed = [list(row) for row in matrix_embed] 76 | a = st.text_area("Change your embedding:", value=str(matrix_embed).replace("],", "],\n")) 77 | 78 | matrix = [[float(x) for x in row.strip("[] \n").split(",")] for row in a.split("],")] 79 | embed = np.array(matrix).flatten() 80 | 81 | fig = draw_embed(embed, "myembedding", None) 82 | st.pyplot(fig) 83 | 84 | 85 | "## 3. Synthesize text." 
86 | text = st.text_input("Write a sentence (+-20 words) to be synthesized:") 87 | 88 | 89 | def pgbar(i, seq_len, b_size, gen_rate): 90 | mybar.progress(i / seq_len) 91 | 92 | 93 | if st.button("Click to synthesize"): 94 | texts = [text] 95 | embeds = [embed] 96 | 97 | # generate waveform 98 | with st.spinner("Generating your speech..."): 99 | specs = synthesizer.synthesize_spectrograms(texts, embeds) 100 | spec = specs[0] 101 | synthesize_state = st.text("Created the mel spectrogram") 102 | synthesize_state.text("Generating the waveform...") 103 | mybar = st.progress(0) 104 | generated_wav = vocoder.infer_waveform(spec, progress_callback=pgbar) 105 | generated_wav = np.pad( 106 | generated_wav, (0, synthesizer.sample_rate), mode="constant" 107 | ) 108 | generated_wav = encoder.preprocess_wav(generated_wav) 109 | synthesize_state.text("Synthesized the waveform") 110 | st.success("Done!") 111 | 112 | # Save it on the disk 113 | filename = "demo_output_%02d.wav" % num_generated 114 | sf.write(filename, generated_wav.astype(np.float32), synthesizer.sample_rate) 115 | num_generated += 1 116 | synthesize_state.text("\nSaved output as %s\n\n" % filename) 117 | st.audio(read_audio(filename)) 118 | -------------------------------------------------------------------------------- /encoder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/encoder/__init__.py -------------------------------------------------------------------------------- /encoder/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/encoder/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /encoder/__pycache__/audio.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/encoder/__pycache__/audio.cpython-37.pyc -------------------------------------------------------------------------------- /encoder/__pycache__/inference.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/encoder/__pycache__/inference.cpython-37.pyc -------------------------------------------------------------------------------- /encoder/__pycache__/model.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/encoder/__pycache__/model.cpython-37.pyc -------------------------------------------------------------------------------- /encoder/__pycache__/params_data.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/encoder/__pycache__/params_data.cpython-37.pyc -------------------------------------------------------------------------------- /encoder/__pycache__/params_model.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/encoder/__pycache__/params_model.cpython-37.pyc -------------------------------------------------------------------------------- /encoder/audio.py: -------------------------------------------------------------------------------- 1 | from scipy.ndimage.morphology import binary_dilation 2 | from encoder.params_data import * 3 | from pathlib import Path 4 | from typing import Optional, Union 5 | from warnings import warn 6 | import numpy as np 7 | import librosa 8 | import struct 9 | 10 | try: 11 | import webrtcvad 12 | except: 13 | warn("Unable to import 'webrtcvad'. This package enables noise removal and is recommended.") 14 | webrtcvad=None 15 | 16 | int16_max = (2 ** 15) - 1 17 | 18 | 19 | def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray], 20 | source_sr: Optional[int] = None): 21 | """ 22 | Applies the preprocessing operations used in training the Speaker Encoder to a waveform 23 | either on disk or in memory. The waveform will be resampled to match the data hyperparameters. 24 | 25 | :param fpath_or_wav: either a filepath to an audio file (many extensions are supported, not 26 | just .wav), either the waveform as a numpy array of floats. 27 | :param source_sr: if passing an audio waveform, the sampling rate of the waveform before 28 | preprocessing. After preprocessing, the waveform's sampling rate will match the data 29 | hyperparameters. If passing a filepath, the sampling rate will be automatically detected and 30 | this argument will be ignored. 31 | """ 32 | # Load the wav from disk if needed 33 | if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path): 34 | wav, source_sr = librosa.load(str(fpath_or_wav), sr=None) 35 | else: 36 | wav = fpath_or_wav 37 | 38 | # Resample the wav if needed 39 | if source_sr is not None and source_sr != sampling_rate: 40 | wav = librosa.resample(wav, source_sr, sampling_rate) 41 | 42 | # Apply the preprocessing: normalize volume and shorten long silences 43 | wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True) 44 | if webrtcvad: 45 | wav = trim_long_silences(wav) 46 | 47 | return wav 48 | 49 | 50 | def wav_to_mel_spectrogram(wav): 51 | """ 52 | Derives a mel spectrogram ready to be used by the encoder from a preprocessed audio waveform. 53 | Note: this not a log-mel spectrogram. 54 | """ 55 | frames = librosa.feature.melspectrogram( 56 | wav, 57 | sampling_rate, 58 | n_fft=int(sampling_rate * mel_window_length / 1000), 59 | hop_length=int(sampling_rate * mel_window_step / 1000), 60 | n_mels=mel_n_channels 61 | ) 62 | return frames.astype(np.float32).T 63 | 64 | 65 | def trim_long_silences(wav): 66 | """ 67 | Ensures that segments without voice in the waveform remain no longer than a 68 | threshold determined by the VAD parameters in params.py. 
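    Note (derived from the defaults in params_data.py): with vad_window_length = 30 ms and
    sampling_rate = 16000 Hz, each VAD window spans 480 samples; the waveform is packed as
    16-bit PCM (2 bytes per sample), which is why the byte-level slicing below multiplies
    the sample indices by 2.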
69 | 70 | :param wav: the raw waveform as a numpy array of floats 71 | :return: the same waveform with silences trimmed away (length <= original wav length) 72 | """ 73 | # Compute the voice detection window size 74 | samples_per_window = (vad_window_length * sampling_rate) // 1000 75 | 76 | # Trim the end of the audio to have a multiple of the window size 77 | wav = wav[:len(wav) - (len(wav) % samples_per_window)] 78 | 79 | # Convert the float waveform to 16-bit mono PCM 80 | pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16)) 81 | 82 | # Perform voice activation detection 83 | voice_flags = [] 84 | vad = webrtcvad.Vad(mode=3) 85 | for window_start in range(0, len(wav), samples_per_window): 86 | window_end = window_start + samples_per_window 87 | voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2], 88 | sample_rate=sampling_rate)) 89 | voice_flags = np.array(voice_flags) 90 | 91 | # Smooth the voice detection with a moving average 92 | def moving_average(array, width): 93 | array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2))) 94 | ret = np.cumsum(array_padded, dtype=float) 95 | ret[width:] = ret[width:] - ret[:-width] 96 | return ret[width - 1:] / width 97 | 98 | audio_mask = moving_average(voice_flags, vad_moving_average_width) 99 | audio_mask = np.round(audio_mask).astype(np.bool) 100 | 101 | # Dilate the voiced regions 102 | audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1)) 103 | audio_mask = np.repeat(audio_mask, samples_per_window) 104 | 105 | return wav[audio_mask == True] 106 | 107 | 108 | def normalize_volume(wav, target_dBFS, increase_only=False, decrease_only=False): 109 | if increase_only and decrease_only: 110 | raise ValueError("Both increase only and decrease only are set") 111 | dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav ** 2)) 112 | if (dBFS_change < 0 and increase_only) or (dBFS_change > 0 and decrease_only): 113 | return wav 114 | return wav * (10 ** (dBFS_change / 20)) 115 | -------------------------------------------------------------------------------- /encoder/config.py: -------------------------------------------------------------------------------- 1 | librispeech_datasets = { 2 | "train": { 3 | "clean": ["LibriSpeech/train-clean-100", "LibriSpeech/train-clean-360"], 4 | "other": ["LibriSpeech/train-other-500"] 5 | }, 6 | "test": { 7 | "clean": ["LibriSpeech/test-clean"], 8 | "other": ["LibriSpeech/test-other"] 9 | }, 10 | "dev": { 11 | "clean": ["LibriSpeech/dev-clean"], 12 | "other": ["LibriSpeech/dev-other"] 13 | }, 14 | } 15 | libritts_datasets = { 16 | "train": { 17 | "clean": ["LibriTTS/train-clean-100", "LibriTTS/train-clean-360"], 18 | "other": ["LibriTTS/train-other-500"] 19 | }, 20 | "test": { 21 | "clean": ["LibriTTS/test-clean"], 22 | "other": ["LibriTTS/test-other"] 23 | }, 24 | "dev": { 25 | "clean": ["LibriTTS/dev-clean"], 26 | "other": ["LibriTTS/dev-other"] 27 | }, 28 | } 29 | voxceleb_datasets = { 30 | "voxceleb1" : { 31 | "train": ["VoxCeleb1/wav"], 32 | "test": ["VoxCeleb1/test_wav"] 33 | }, 34 | "voxceleb2" : { 35 | "train": ["VoxCeleb2/dev/aac"], 36 | "test": ["VoxCeleb2/test_wav"] 37 | } 38 | } 39 | 40 | other_datasets = [ 41 | "LJSpeech-1.1", 42 | "VCTK-Corpus/wav48", 43 | ] 44 | 45 | anglophone_nationalites = ["australia", "canada", "ireland", "uk", "usa"] 46 | -------------------------------------------------------------------------------- /encoder/data_objects/__init__.py: 
-------------------------------------------------------------------------------- 1 | from encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset 2 | from encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataLoader 3 | -------------------------------------------------------------------------------- /encoder/data_objects/random_cycler.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | class RandomCycler: 4 | """ 5 | Creates an internal copy of a sequence and allows access to its items in a constrained random 6 | order. For a source sequence of n items and one or several consecutive queries of a total 7 | of m items, the following guarantees hold (one implies the other): 8 | - Each item will be returned between m // n and ((m - 1) // n) + 1 times. 9 | - Between two appearances of the same item, there may be at most 2 * (n - 1) other items. 10 | """ 11 | 12 | def __init__(self, source): 13 | if len(source) == 0: 14 | raise Exception("Can't create RandomCycler from an empty collection") 15 | self.all_items = list(source) 16 | self.next_items = [] 17 | 18 | def sample(self, count: int): 19 | shuffle = lambda l: random.sample(l, len(l)) 20 | 21 | out = [] 22 | while count > 0: 23 | if count >= len(self.all_items): 24 | out.extend(shuffle(list(self.all_items))) 25 | count -= len(self.all_items) 26 | continue 27 | n = min(count, len(self.next_items)) 28 | out.extend(self.next_items[:n]) 29 | count -= n 30 | self.next_items = self.next_items[n:] 31 | if len(self.next_items) == 0: 32 | self.next_items = shuffle(list(self.all_items)) 33 | return out 34 | 35 | def __next__(self): 36 | return self.sample(1)[0] 37 | 38 | -------------------------------------------------------------------------------- /encoder/data_objects/speaker.py: -------------------------------------------------------------------------------- 1 | from encoder.data_objects.random_cycler import RandomCycler 2 | from encoder.data_objects.utterance import Utterance 3 | from pathlib import Path 4 | 5 | # Contains the set of utterances of a single speaker 6 | class Speaker: 7 | def __init__(self, root: Path): 8 | self.root = root 9 | self.name = root.name 10 | self.utterances = None 11 | self.utterance_cycler = None 12 | 13 | def _load_utterances(self): 14 | with self.root.joinpath("_sources.txt").open("r") as sources_file: 15 | sources = [l.split(",") for l in sources_file] 16 | sources = {frames_fname: wave_fpath for frames_fname, wave_fpath in sources} 17 | self.utterances = [Utterance(self.root.joinpath(f), w) for f, w in sources.items()] 18 | self.utterance_cycler = RandomCycler(self.utterances) 19 | 20 | def random_partial(self, count, n_frames): 21 | """ 22 | Samples a batch of unique partial utterances from the disk in a way that all 23 | utterances come up at least once every two cycles and in a random order every time. 24 | 25 | :param count: The number of partial utterances to sample from the set of utterances from 26 | that speaker. Utterances are guaranteed not to be repeated if is not larger than 27 | the number of utterances available. 28 | :param n_frames: The number of frames in the partial utterance. 29 | :return: A list of tuples (utterance, frames, range) where utterance is an Utterance, 30 | frames are the frames of the partial utterances and range is the range of the partial 31 | utterance with regard to the complete utterance. 
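        Illustrative example (the directory name is hypothetical; shapes assume the defaults
        in params_data.py, i.e. 160-frame partials of 40 mel channels):

            >>> speaker = Speaker(Path("encoder_preprocessed/LibriSpeech_train-other-500_14"))
            >>> utterance, frames, (start, end) = speaker.random_partial(count=1, n_frames=160)[0]
            >>> frames.shape, end - start
            ((160, 40), 160)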
32 | """ 33 | if self.utterances is None: 34 | self._load_utterances() 35 | 36 | utterances = self.utterance_cycler.sample(count) 37 | 38 | a = [(u,) + u.random_partial(n_frames) for u in utterances] 39 | 40 | return a 41 | -------------------------------------------------------------------------------- /encoder/data_objects/speaker_batch.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from typing import List 3 | from encoder.data_objects.speaker import Speaker 4 | 5 | class SpeakerBatch: 6 | def __init__(self, speakers: List[Speaker], utterances_per_speaker: int, n_frames: int): 7 | self.speakers = speakers 8 | self.partials = {s: s.random_partial(utterances_per_speaker, n_frames) for s in speakers} 9 | 10 | # Array of shape (n_speakers * n_utterances, n_frames, mel_n), e.g. for 3 speakers with 11 | # 4 utterances each of 160 frames of 40 mel coefficients: (12, 160, 40) 12 | self.data = np.array([frames for s in speakers for _, frames, _ in self.partials[s]]) 13 | -------------------------------------------------------------------------------- /encoder/data_objects/speaker_verification_dataset.py: -------------------------------------------------------------------------------- 1 | from encoder.data_objects.random_cycler import RandomCycler 2 | from encoder.data_objects.speaker_batch import SpeakerBatch 3 | from encoder.data_objects.speaker import Speaker 4 | from encoder.params_data import partials_n_frames 5 | from torch.utils.data import Dataset, DataLoader 6 | from pathlib import Path 7 | 8 | # TODO: improve with a pool of speakers for data efficiency 9 | 10 | class SpeakerVerificationDataset(Dataset): 11 | def __init__(self, datasets_root: Path): 12 | self.root = datasets_root 13 | speaker_dirs = [f for f in self.root.glob("*") if f.is_dir()] 14 | if len(speaker_dirs) == 0: 15 | raise Exception("No speakers found. 
Make sure you are pointing to the directory " 16 | "containing all preprocessed speaker directories.") 17 | self.speakers = [Speaker(speaker_dir) for speaker_dir in speaker_dirs] 18 | self.speaker_cycler = RandomCycler(self.speakers) 19 | 20 | def __len__(self): 21 | return int(1e10) 22 | 23 | def __getitem__(self, index): 24 | return next(self.speaker_cycler) 25 | 26 | def get_logs(self): 27 | log_string = "" 28 | for log_fpath in self.root.glob("*.txt"): 29 | with log_fpath.open("r") as log_file: 30 | log_string += "".join(log_file.readlines()) 31 | return log_string 32 | 33 | 34 | class SpeakerVerificationDataLoader(DataLoader): 35 | def __init__(self, dataset, speakers_per_batch, utterances_per_speaker, sampler=None, 36 | batch_sampler=None, num_workers=0, pin_memory=False, timeout=0, 37 | worker_init_fn=None): 38 | self.utterances_per_speaker = utterances_per_speaker 39 | 40 | super().__init__( 41 | dataset=dataset, 42 | batch_size=speakers_per_batch, 43 | shuffle=False, 44 | sampler=sampler, 45 | batch_sampler=batch_sampler, 46 | num_workers=num_workers, 47 | collate_fn=self.collate, 48 | pin_memory=pin_memory, 49 | drop_last=False, 50 | timeout=timeout, 51 | worker_init_fn=worker_init_fn 52 | ) 53 | 54 | def collate(self, speakers): 55 | return SpeakerBatch(speakers, self.utterances_per_speaker, partials_n_frames) 56 | -------------------------------------------------------------------------------- /encoder/data_objects/utterance.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class Utterance: 5 | def __init__(self, frames_fpath, wave_fpath): 6 | self.frames_fpath = frames_fpath 7 | self.wave_fpath = wave_fpath 8 | 9 | def get_frames(self): 10 | return np.load(self.frames_fpath) 11 | 12 | def random_partial(self, n_frames): 13 | """ 14 | Crops the frames into a partial utterance of n_frames 15 | 16 | :param n_frames: The number of frames of the partial utterance 17 | :return: the partial utterance frames and a tuple indicating the start and end of the 18 | partial utterance in the complete utterance. 19 | """ 20 | frames = self.get_frames() 21 | if frames.shape[0] == n_frames: 22 | start = 0 23 | else: 24 | start = np.random.randint(0, frames.shape[0] - n_frames) 25 | end = start + n_frames 26 | return frames[start:end], (start, end) -------------------------------------------------------------------------------- /encoder/inference.py: -------------------------------------------------------------------------------- 1 | from encoder.params_data import * 2 | from encoder.model import SpeakerEncoder 3 | from encoder.audio import preprocess_wav # We want to expose this function from here 4 | from matplotlib import cm 5 | from encoder import audio 6 | from pathlib import Path 7 | import matplotlib.pyplot as plt 8 | import numpy as np 9 | import torch 10 | 11 | _model = None # type: SpeakerEncoder 12 | _device = None # type: torch.device 13 | 14 | 15 | def load_model(weights_fpath: Path, device=None): 16 | """ 17 | Loads the model in memory. If this function is not explicitely called, it will be run on the 18 | first call to embed_frames() with the default weights file. 19 | 20 | :param weights_fpath: the path to saved model weights. 21 | :param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda"). The 22 | model will be loaded and will run on this device. Outputs will however always be on the cpu. 23 | If None, will default to your GPU if it"s available, otherwise your CPU. 
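    Illustrative usage (the path matches the one used in demo_voice.py; nothing is returned,
    the loaded model is kept in a module-level global):

        >>> load_model(Path("encoder/saved_models/pretrained.pt"))
        >>> is_loaded()
        True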
24 | """ 25 | # TODO: I think the slow loading of the encoder might have something to do with the device it 26 | # was saved on. Worth investigating. 27 | global _model, _device 28 | if device is None: 29 | _device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 30 | elif isinstance(device, str): 31 | _device = torch.device(device) 32 | _model = SpeakerEncoder(_device, torch.device("cpu")) 33 | checkpoint = torch.load(weights_fpath, _device) 34 | _model.load_state_dict(checkpoint["model_state"]) 35 | _model.eval() 36 | print("Loaded encoder \"%s\" trained to step %d" % (weights_fpath.name, checkpoint["step"])) 37 | 38 | 39 | def is_loaded(): 40 | return _model is not None 41 | 42 | 43 | def embed_frames_batch(frames_batch): 44 | """ 45 | Computes embeddings for a batch of mel spectrogram. 46 | 47 | :param frames_batch: a batch mel of spectrogram as a numpy array of float32 of shape 48 | (batch_size, n_frames, n_channels) 49 | :return: the embeddings as a numpy array of float32 of shape (batch_size, model_embedding_size) 50 | """ 51 | if _model is None: 52 | raise Exception("Model was not loaded. Call load_model() before inference.") 53 | 54 | frames = torch.from_numpy(frames_batch).to(_device) 55 | embed = _model.forward(frames).detach().cpu().numpy() 56 | return embed 57 | 58 | 59 | def compute_partial_slices(n_samples, partial_utterance_n_frames=partials_n_frames, 60 | min_pad_coverage=0.75, overlap=0.5): 61 | """ 62 | Computes where to split an utterance waveform and its corresponding mel spectrogram to obtain 63 | partial utterances of each. Both the waveform and the mel 64 | spectrogram slices are returned, so as to make each partial utterance waveform correspond to 65 | its spectrogram. This function assumes that the mel spectrogram parameters used are those 66 | defined in params_data.py. 67 | 68 | The returned ranges may be indexing further than the length of the waveform. It is 69 | recommended that you pad the waveform with zeros up to wave_slices[-1].stop. 70 | 71 | :param n_samples: the number of samples in the waveform 72 | :param partial_utterance_n_frames: the number of mel spectrogram frames in each partial 73 | utterance 74 | :param min_pad_coverage: when reaching the last partial utterance, it may or may not have 75 | enough frames. If at least of are present, 76 | then the last partial utterance will be considered, as if we padded the audio. Otherwise, 77 | it will be discarded, as if we trimmed the audio. If there aren't enough frames for 1 partial 78 | utterance, this parameter is ignored so that the function always returns at least 1 slice. 79 | :param overlap: by how much the partial utterance should overlap. If set to 0, the partial 80 | utterances are entirely disjoint. 81 | :return: the waveform slices and mel spectrogram slices as lists of array slices. Index 82 | respectively the waveform and the mel spectrogram with these slices to obtain the partial 83 | utterances. 
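    Worked example with the defaults from params_data.py (sampling_rate = 16000 Hz,
    mel_window_step = 10 ms) and the default arguments: each mel frame covers 160 samples,
    each partial utterance covers 160 frames = 25600 samples (1.6 s), and with overlap = 0.5
    consecutive partials start 80 frames (0.8 s) apart.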
84 | """ 85 | assert 0 <= overlap < 1 86 | assert 0 < min_pad_coverage <= 1 87 | 88 | samples_per_frame = int((sampling_rate * mel_window_step / 1000)) 89 | n_frames = int(np.ceil((n_samples + 1) / samples_per_frame)) 90 | frame_step = max(int(np.round(partial_utterance_n_frames * (1 - overlap))), 1) 91 | 92 | # Compute the slices 93 | wav_slices, mel_slices = [], [] 94 | steps = max(1, n_frames - partial_utterance_n_frames + frame_step + 1) 95 | for i in range(0, steps, frame_step): 96 | mel_range = np.array([i, i + partial_utterance_n_frames]) 97 | wav_range = mel_range * samples_per_frame 98 | mel_slices.append(slice(*mel_range)) 99 | wav_slices.append(slice(*wav_range)) 100 | 101 | # Evaluate whether extra padding is warranted or not 102 | last_wav_range = wav_slices[-1] 103 | coverage = (n_samples - last_wav_range.start) / (last_wav_range.stop - last_wav_range.start) 104 | if coverage < min_pad_coverage and len(mel_slices) > 1: 105 | mel_slices = mel_slices[:-1] 106 | wav_slices = wav_slices[:-1] 107 | 108 | return wav_slices, mel_slices 109 | 110 | 111 | def embed_utterance(wav, using_partials=True, return_partials=False, **kwargs): 112 | """ 113 | Computes an embedding for a single utterance. 114 | 115 | # TODO: handle multiple wavs to benefit from batching on GPU 116 | :param wav: a preprocessed (see audio.py) utterance waveform as a numpy array of float32 117 | :param using_partials: if True, then the utterance is split in partial utterances of 118 | frames and the utterance embedding is computed from their 119 | normalized average. If False, the utterance is instead computed from feeding the entire 120 | spectogram to the network. 121 | :param return_partials: if True, the partial embeddings will also be returned along with the 122 | wav slices that correspond to the partial embeddings. 123 | :param kwargs: additional arguments to compute_partial_splits() 124 | :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If 125 | is True, the partial utterances as a numpy array of float32 of shape 126 | (n_partials, model_embedding_size) and the wav partials as a list of slices will also be 127 | returned. If is simultaneously set to False, both these values will be None 128 | instead. 
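    Illustrative usage (assumes load_model() was called and wav is a preprocessed waveform;
    256 is the default model_embedding_size):

        >>> embed = embed_utterance(wav)                       # unit-norm vector of shape (256,)
        >>> embed, partials, slices = embed_utterance(wav, return_partials=True)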
129 | """ 130 | # Process the entire utterance if not using partials 131 | if not using_partials: 132 | frames = audio.wav_to_mel_spectrogram(wav) 133 | embed = embed_frames_batch(frames[None, ...])[0] 134 | if return_partials: 135 | return embed, None, None 136 | return embed 137 | 138 | # Compute where to split the utterance into partials and pad if necessary 139 | wave_slices, mel_slices = compute_partial_slices(len(wav), **kwargs) 140 | max_wave_length = wave_slices[-1].stop 141 | if max_wave_length >= len(wav): 142 | wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant") 143 | 144 | # Split the utterance into partials 145 | frames = audio.wav_to_mel_spectrogram(wav) 146 | frames_batch = np.array([frames[s] for s in mel_slices]) 147 | partial_embeds = embed_frames_batch(frames_batch) 148 | 149 | # Compute the utterance embedding from the partial embeddings 150 | raw_embed = np.mean(partial_embeds, axis=0) 151 | embed = raw_embed / np.linalg.norm(raw_embed, 2) 152 | 153 | if return_partials: 154 | return embed, partial_embeds, wave_slices 155 | return embed 156 | 157 | 158 | def embed_speaker(wavs, **kwargs): 159 | raise NotImplemented() 160 | 161 | 162 | def plot_embedding_as_heatmap(embed, ax=None, title="", shape=None, color_range=(0, 0.30)): 163 | if ax is None: 164 | ax = plt.gca() 165 | 166 | if shape is None: 167 | height = int(np.sqrt(len(embed))) 168 | shape = (height, -1) 169 | embed = embed.reshape(shape) 170 | 171 | cmap = cm.get_cmap() 172 | mappable = ax.imshow(embed, cmap=cmap) 173 | cbar = plt.colorbar(mappable, ax=ax, fraction=0.046, pad=0.04) 174 | mappable.set_clim(*color_range) 175 | 176 | 177 | ax.set_xticks([]), ax.set_yticks([]) 178 | ax.set_title(title) 179 | -------------------------------------------------------------------------------- /encoder/model.py: -------------------------------------------------------------------------------- 1 | from encoder.params_model import * 2 | from encoder.params_data import * 3 | from scipy.interpolate import interp1d 4 | from sklearn.metrics import roc_curve 5 | from torch.nn.utils import clip_grad_norm_ 6 | from scipy.optimize import brentq 7 | from torch import nn 8 | import numpy as np 9 | import torch 10 | 11 | 12 | class SpeakerEncoder(nn.Module): 13 | def __init__(self, device, loss_device): 14 | super().__init__() 15 | self.loss_device = loss_device 16 | 17 | # Network defition 18 | self.lstm = nn.LSTM(input_size=mel_n_channels, 19 | hidden_size=model_hidden_size, 20 | num_layers=model_num_layers, 21 | batch_first=True).to(device) 22 | self.linear = nn.Linear(in_features=model_hidden_size, 23 | out_features=model_embedding_size).to(device) 24 | self.relu = torch.nn.ReLU().to(device) 25 | 26 | # Cosine similarity scaling (with fixed initial parameter values) 27 | self.similarity_weight = nn.Parameter(torch.tensor([10.])).to(loss_device) 28 | self.similarity_bias = nn.Parameter(torch.tensor([-5.])).to(loss_device) 29 | 30 | # Loss 31 | self.loss_fn = nn.CrossEntropyLoss().to(loss_device) 32 | 33 | def do_gradient_ops(self): 34 | # Gradient scale 35 | self.similarity_weight.grad *= 0.01 36 | self.similarity_bias.grad *= 0.01 37 | 38 | # Gradient clipping 39 | clip_grad_norm_(self.parameters(), 3, norm_type=2) 40 | 41 | def forward(self, utterances, hidden_init=None): 42 | """ 43 | Computes the embeddings of a batch of utterance spectrograms. 
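        For the default training batch (64 speakers x 10 utterances of 160 frames x 40 mel
        channels) the input has shape (640, 160, 40) and the output has shape (640, 256):
        only the final hidden state of the last LSTM layer is kept, then projected, passed
        through a ReLU and L2-normalised.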
44 | 45 | :param utterances: batch of mel-scale filterbanks of same duration as a tensor of shape 46 | (batch_size, n_frames, n_channels) 47 | :param hidden_init: initial hidden state of the LSTM as a tensor of shape (num_layers, 48 | batch_size, hidden_size). Will default to a tensor of zeros if None. 49 | :return: the embeddings as a tensor of shape (batch_size, embedding_size) 50 | """ 51 | # Pass the input through the LSTM layers and retrieve all outputs, the final hidden state 52 | # and the final cell state. 53 | out, (hidden, cell) = self.lstm(utterances, hidden_init) 54 | 55 | # We take only the hidden state of the last layer 56 | embeds_raw = self.relu(self.linear(hidden[-1])) 57 | 58 | # L2-normalize it 59 | embeds = embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True) 60 | 61 | return embeds 62 | 63 | def similarity_matrix(self, embeds): 64 | """ 65 | Computes the similarity matrix according the section 2.1 of GE2E. 66 | 67 | :param embeds: the embeddings as a tensor of shape (speakers_per_batch, 68 | utterances_per_speaker, embedding_size) 69 | :return: the similarity matrix as a tensor of shape (speakers_per_batch, 70 | utterances_per_speaker, speakers_per_batch) 71 | """ 72 | speakers_per_batch, utterances_per_speaker = embeds.shape[:2] 73 | 74 | # Inclusive centroids (1 per speaker). Cloning is needed for reverse differentiation 75 | centroids_incl = torch.mean(embeds, dim=1, keepdim=True) 76 | centroids_incl = centroids_incl.clone() / torch.norm(centroids_incl, dim=2, keepdim=True) 77 | 78 | # Exclusive centroids (1 per utterance) 79 | centroids_excl = (torch.sum(embeds, dim=1, keepdim=True) - embeds) 80 | centroids_excl /= (utterances_per_speaker - 1) 81 | centroids_excl = centroids_excl.clone() / torch.norm(centroids_excl, dim=2, keepdim=True) 82 | 83 | # Similarity matrix. The cosine similarity of already 2-normed vectors is simply the dot 84 | # product of these vectors (which is just an element-wise multiplication reduced by a sum). 85 | # We vectorize the computation for efficiency. 86 | sim_matrix = torch.zeros(speakers_per_batch, utterances_per_speaker, 87 | speakers_per_batch).to(self.loss_device) 88 | mask_matrix = 1 - np.eye(speakers_per_batch, dtype=np.int) 89 | for j in range(speakers_per_batch): 90 | mask = np.where(mask_matrix[j])[0] 91 | sim_matrix[mask, :, j] = (embeds[mask] * centroids_incl[j]).sum(dim=2) 92 | sim_matrix[j, :, j] = (embeds[j] * centroids_excl[j]).sum(dim=1) 93 | 94 | ## Even more vectorized version (slower maybe because of transpose) 95 | # sim_matrix2 = torch.zeros(speakers_per_batch, speakers_per_batch, utterances_per_speaker 96 | # ).to(self.loss_device) 97 | # eye = np.eye(speakers_per_batch, dtype=np.int) 98 | # mask = np.where(1 - eye) 99 | # sim_matrix2[mask] = (embeds[mask[0]] * centroids_incl[mask[1]]).sum(dim=2) 100 | # mask = np.where(eye) 101 | # sim_matrix2[mask] = (embeds * centroids_excl).sum(dim=2) 102 | # sim_matrix2 = sim_matrix2.transpose(1, 2) 103 | 104 | sim_matrix = sim_matrix * self.similarity_weight + self.similarity_bias 105 | return sim_matrix 106 | 107 | def loss(self, embeds): 108 | """ 109 | Computes the softmax loss according the section 2.1 of GE2E. 110 | 111 | :param embeds: the embeddings as a tensor of shape (speakers_per_batch, 112 | utterances_per_speaker, embedding_size) 113 | :return: the loss and the EER for this batch of embeddings. 
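        With the default training parameters (speakers_per_batch = 64, utterances_per_speaker = 10),
        the similarity matrix is reshaped to (640, 64) and compared against 640 target speaker
        indices (each index repeated 10 times), so the loss reduces to a standard cross-entropy
        over "which speaker produced this utterance".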
114 | """ 115 | speakers_per_batch, utterances_per_speaker = embeds.shape[:2] 116 | 117 | # Loss 118 | sim_matrix = self.similarity_matrix(embeds) 119 | sim_matrix = sim_matrix.reshape((speakers_per_batch * utterances_per_speaker, 120 | speakers_per_batch)) 121 | ground_truth = np.repeat(np.arange(speakers_per_batch), utterances_per_speaker) 122 | target = torch.from_numpy(ground_truth).long().to(self.loss_device) 123 | loss = self.loss_fn(sim_matrix, target) 124 | 125 | # EER (not backpropagated) 126 | with torch.no_grad(): 127 | inv_argmax = lambda i: np.eye(1, speakers_per_batch, i, dtype=np.int)[0] 128 | labels = np.array([inv_argmax(i) for i in ground_truth]) 129 | preds = sim_matrix.detach().cpu().numpy() 130 | 131 | # Snippet from https://yangcha.github.io/EER-ROC/ 132 | fpr, tpr, thresholds = roc_curve(labels.flatten(), preds.flatten()) 133 | eer = brentq(lambda x: 1. - x - interp1d(fpr, tpr)(x), 0., 1.) 134 | 135 | return loss, eer -------------------------------------------------------------------------------- /encoder/params_data.py: -------------------------------------------------------------------------------- 1 | 2 | ## Mel-filterbank 3 | mel_window_length = 25 # In milliseconds 4 | mel_window_step = 10 # In milliseconds 5 | mel_n_channels = 40 6 | 7 | 8 | ## Audio 9 | sampling_rate = 16000 10 | # Number of spectrogram frames in a partial utterance 11 | partials_n_frames = 160 # 1600 ms 12 | # Number of spectrogram frames at inference 13 | inference_n_frames = 80 # 800 ms 14 | 15 | 16 | ## Voice Activation Detection 17 | # Window size of the VAD. Must be either 10, 20 or 30 milliseconds. 18 | # This sets the granularity of the VAD. Should not need to be changed. 19 | vad_window_length = 30 # In milliseconds 20 | # Number of frames to average together when performing the moving average smoothing. 21 | # The larger this value, the larger the VAD variations must be to not get smoothed out. 22 | vad_moving_average_width = 8 23 | # Maximum number of consecutive silent frames a segment can have. 24 | vad_max_silence_length = 6 25 | 26 | 27 | ## Audio volume normalization 28 | audio_norm_target_dBFS = -30 29 | 30 | -------------------------------------------------------------------------------- /encoder/params_model.py: -------------------------------------------------------------------------------- 1 | 2 | ## Model parameters 3 | model_hidden_size = 256 4 | model_embedding_size = 256 5 | model_num_layers = 3 6 | 7 | 8 | ## Training parameters 9 | learning_rate_init = 1e-4 10 | speakers_per_batch = 64 11 | utterances_per_speaker = 10 12 | -------------------------------------------------------------------------------- /encoder/preprocess.py: -------------------------------------------------------------------------------- 1 | from multiprocess.pool import ThreadPool 2 | from encoder.params_data import * 3 | from encoder.config import librispeech_datasets, anglophone_nationalites 4 | from datetime import datetime 5 | from encoder import audio 6 | from pathlib import Path 7 | from tqdm import tqdm 8 | import numpy as np 9 | 10 | 11 | class DatasetLog: 12 | """ 13 | Registers metadata about the dataset in a text file. 
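    For example, preprocessing "LibriSpeech/train-other-500" writes
    "Log_LibriSpeech_train-other-500.txt" to the output directory, recording the parameter
    values from params_data.py and, on finalize(), min/max/mean/median statistics of the
    processed sample durations.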
14 | """ 15 | def __init__(self, root, name): 16 | self.text_file = open(Path(root, "Log_%s.txt" % name.replace("/", "_")), "w") 17 | self.sample_data = dict() 18 | 19 | start_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M")) 20 | self.write_line("Creating dataset %s on %s" % (name, start_time)) 21 | self.write_line("-----") 22 | self._log_params() 23 | 24 | def _log_params(self): 25 | from encoder import params_data 26 | self.write_line("Parameter values:") 27 | for param_name in (p for p in dir(params_data) if not p.startswith("__")): 28 | value = getattr(params_data, param_name) 29 | self.write_line("\t%s: %s" % (param_name, value)) 30 | self.write_line("-----") 31 | 32 | def write_line(self, line): 33 | self.text_file.write("%s\n" % line) 34 | 35 | def add_sample(self, **kwargs): 36 | for param_name, value in kwargs.items(): 37 | if not param_name in self.sample_data: 38 | self.sample_data[param_name] = [] 39 | self.sample_data[param_name].append(value) 40 | 41 | def finalize(self): 42 | self.write_line("Statistics:") 43 | for param_name, values in self.sample_data.items(): 44 | self.write_line("\t%s:" % param_name) 45 | self.write_line("\t\tmin %.3f, max %.3f" % (np.min(values), np.max(values))) 46 | self.write_line("\t\tmean %.3f, median %.3f" % (np.mean(values), np.median(values))) 47 | self.write_line("-----") 48 | end_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M")) 49 | self.write_line("Finished on %s" % end_time) 50 | self.text_file.close() 51 | 52 | 53 | def _init_preprocess_dataset(dataset_name, datasets_root, out_dir) -> (Path, DatasetLog): 54 | dataset_root = datasets_root.joinpath(dataset_name) 55 | if not dataset_root.exists(): 56 | print("Couldn\'t find %s, skipping this dataset." % dataset_root) 57 | return None, None 58 | return dataset_root, DatasetLog(out_dir, dataset_name) 59 | 60 | 61 | def _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, extension, 62 | skip_existing, logger): 63 | print("%s: Preprocessing data for %d speakers." % (dataset_name, len(speaker_dirs))) 64 | 65 | # Function to preprocess utterances for one speaker 66 | def preprocess_speaker(speaker_dir: Path): 67 | # Give a name to the speaker that includes its dataset 68 | speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts) 69 | 70 | # Create an output directory with that name, as well as a txt file containing a 71 | # reference to each source file. 72 | speaker_out_dir = out_dir.joinpath(speaker_name) 73 | speaker_out_dir.mkdir(exist_ok=True) 74 | sources_fpath = speaker_out_dir.joinpath("_sources.txt") 75 | 76 | # There's a possibility that the preprocessing was interrupted earlier, check if 77 | # there already is a sources file. 
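        # Each line of _sources.txt is "<frames_fname>,<wave_fpath>" (the format parsed by
        # Speaker._load_utterances), so existing entries can be skipped when skip_existing is set.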
78 | if sources_fpath.exists(): 79 | try: 80 | with sources_fpath.open("r") as sources_file: 81 | existing_fnames = {line.split(",")[0] for line in sources_file} 82 | except: 83 | existing_fnames = {} 84 | else: 85 | existing_fnames = {} 86 | 87 | # Gather all audio files for that speaker recursively 88 | sources_file = sources_fpath.open("a" if skip_existing else "w") 89 | for in_fpath in speaker_dir.glob("**/*.%s" % extension): 90 | # Check if the target output file already exists 91 | out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts) 92 | out_fname = out_fname.replace(".%s" % extension, ".npy") 93 | if skip_existing and out_fname in existing_fnames: 94 | continue 95 | 96 | # Load and preprocess the waveform 97 | wav = audio.preprocess_wav(in_fpath) 98 | if len(wav) == 0: 99 | continue 100 | 101 | # Create the mel spectrogram, discard those that are too short 102 | frames = audio.wav_to_mel_spectrogram(wav) 103 | if len(frames) < partials_n_frames: 104 | continue 105 | 106 | out_fpath = speaker_out_dir.joinpath(out_fname) 107 | np.save(out_fpath, frames) 108 | logger.add_sample(duration=len(wav) / sampling_rate) 109 | sources_file.write("%s,%s\n" % (out_fname, in_fpath)) 110 | 111 | sources_file.close() 112 | 113 | # Process the utterances for each speaker 114 | with ThreadPool(8) as pool: 115 | list(tqdm(pool.imap(preprocess_speaker, speaker_dirs), dataset_name, len(speaker_dirs), 116 | unit="speakers")) 117 | logger.finalize() 118 | print("Done preprocessing %s.\n" % dataset_name) 119 | 120 | 121 | def preprocess_librispeech(datasets_root: Path, out_dir: Path, skip_existing=False): 122 | for dataset_name in librispeech_datasets["train"]["other"]: 123 | # Initialize the preprocessing 124 | dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir) 125 | if not dataset_root: 126 | return 127 | 128 | # Preprocess all speakers 129 | speaker_dirs = list(dataset_root.glob("*")) 130 | _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "flac", 131 | skip_existing, logger) 132 | 133 | 134 | def preprocess_voxceleb1(datasets_root: Path, out_dir: Path, skip_existing=False): 135 | # Initialize the preprocessing 136 | dataset_name = "VoxCeleb1" 137 | dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir) 138 | if not dataset_root: 139 | return 140 | 141 | # Get the contents of the meta file 142 | with dataset_root.joinpath("vox1_meta.csv").open("r") as metafile: 143 | metadata = [line.split("\t") for line in metafile][1:] 144 | 145 | # Select the ID and the nationality, filter out non-anglophone speakers 146 | nationalities = {line[0]: line[3] for line in metadata} 147 | keep_speaker_ids = [speaker_id for speaker_id, nationality in nationalities.items() if 148 | nationality.lower() in anglophone_nationalites] 149 | print("VoxCeleb1: using samples from %d (presumed anglophone) speakers out of %d." % 150 | (len(keep_speaker_ids), len(nationalities))) 151 | 152 | # Get the speaker directories for anglophone speakers only 153 | speaker_dirs = dataset_root.joinpath("wav").glob("*") 154 | speaker_dirs = [speaker_dir for speaker_dir in speaker_dirs if 155 | speaker_dir.name in keep_speaker_ids] 156 | print("VoxCeleb1: found %d anglophone speakers on the disk, %d missing (this is normal)." 
% 157 | (len(speaker_dirs), len(keep_speaker_ids) - len(speaker_dirs))) 158 | 159 | # Preprocess all speakers 160 | _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "wav", 161 | skip_existing, logger) 162 | 163 | 164 | def preprocess_voxceleb2(datasets_root: Path, out_dir: Path, skip_existing=False): 165 | # Initialize the preprocessing 166 | dataset_name = "VoxCeleb2" 167 | dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir) 168 | if not dataset_root: 169 | return 170 | 171 | # Get the speaker directories 172 | # Preprocess all speakers 173 | speaker_dirs = list(dataset_root.joinpath("dev", "aac").glob("*")) 174 | _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "m4a", 175 | skip_existing, logger) 176 | -------------------------------------------------------------------------------- /encoder/saved_models/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/encoder/saved_models/.gitkeep -------------------------------------------------------------------------------- /encoder/train.py: -------------------------------------------------------------------------------- 1 | from encoder.visualizations import Visualizations 2 | from encoder.data_objects import SpeakerVerificationDataLoader, SpeakerVerificationDataset 3 | from encoder.params_model import * 4 | from encoder.model import SpeakerEncoder 5 | from utils.profiler import Profiler 6 | from pathlib import Path 7 | import torch 8 | 9 | def sync(device: torch.device): 10 | # For correct profiling (cuda operations are async) 11 | if device.type == "cuda": 12 | torch.cuda.synchronize(device) 13 | 14 | 15 | def train(run_id: str, clean_data_root: Path, models_dir: Path, umap_every: int, save_every: int, 16 | backup_every: int, vis_every: int, force_restart: bool, visdom_server: str, 17 | no_visdom: bool): 18 | # Create a dataset and a dataloader 19 | dataset = SpeakerVerificationDataset(clean_data_root) 20 | loader = SpeakerVerificationDataLoader( 21 | dataset, 22 | speakers_per_batch, 23 | utterances_per_speaker, 24 | num_workers=8, 25 | ) 26 | 27 | # Setup the device on which to run the forward pass and the loss. These can be different, 28 | # because the forward pass is faster on the GPU whereas the loss is often (depending on your 29 | # hyperparameters) faster on the CPU. 30 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 31 | # FIXME: currently, the gradient is None if loss_device is cuda 32 | loss_device = torch.device("cpu") 33 | 34 | # Create the model and the optimizer 35 | model = SpeakerEncoder(device, loss_device) 36 | optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate_init) 37 | init_step = 1 38 | 39 | # Configure file path for the model 40 | state_fpath = models_dir.joinpath(run_id + ".pt") 41 | backup_dir = models_dir.joinpath(run_id + "_backups") 42 | 43 | # Load any existing model 44 | if not force_restart: 45 | if state_fpath.exists(): 46 | print("Found existing model \"%s\", loading it and resuming training." % run_id) 47 | checkpoint = torch.load(state_fpath) 48 | init_step = checkpoint["step"] 49 | model.load_state_dict(checkpoint["model_state"]) 50 | optimizer.load_state_dict(checkpoint["optimizer_state"]) 51 | optimizer.param_groups[0]["lr"] = learning_rate_init 52 | else: 53 | print("No model \"%s\" found, starting training from scratch." 
% run_id) 54 | else: 55 | print("Starting the training from scratch.") 56 | model.train() 57 | 58 | # Initialize the visualization environment 59 | vis = Visualizations(run_id, vis_every, server=visdom_server, disabled=no_visdom) 60 | vis.log_dataset(dataset) 61 | vis.log_params() 62 | device_name = str(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU") 63 | vis.log_implementation({"Device": device_name}) 64 | 65 | # Training loop 66 | profiler = Profiler(summarize_every=10, disabled=False) 67 | for step, speaker_batch in enumerate(loader, init_step): 68 | profiler.tick("Blocking, waiting for batch (threaded)") 69 | 70 | # Forward pass 71 | inputs = torch.from_numpy(speaker_batch.data).to(device) 72 | sync(device) 73 | profiler.tick("Data to %s" % device) 74 | embeds = model(inputs) 75 | sync(device) 76 | profiler.tick("Forward pass") 77 | embeds_loss = embeds.view((speakers_per_batch, utterances_per_speaker, -1)).to(loss_device) 78 | loss, eer = model.loss(embeds_loss) 79 | sync(loss_device) 80 | profiler.tick("Loss") 81 | 82 | # Backward pass 83 | model.zero_grad() 84 | loss.backward() 85 | profiler.tick("Backward pass") 86 | model.do_gradient_ops() 87 | optimizer.step() 88 | profiler.tick("Parameter update") 89 | 90 | # Update visualizations 91 | # learning_rate = optimizer.param_groups[0]["lr"] 92 | vis.update(loss.item(), eer, step) 93 | 94 | # Draw projections and save them to the backup folder 95 | if umap_every != 0 and step % umap_every == 0: 96 | print("Drawing and saving projections (step %d)" % step) 97 | backup_dir.mkdir(exist_ok=True) 98 | projection_fpath = backup_dir.joinpath("%s_umap_%06d.png" % (run_id, step)) 99 | embeds = embeds.detach().cpu().numpy() 100 | vis.draw_projections(embeds, utterances_per_speaker, step, projection_fpath) 101 | vis.save() 102 | 103 | # Overwrite the latest version of the model 104 | if save_every != 0 and step % save_every == 0: 105 | print("Saving the model (step %d)" % step) 106 | torch.save({ 107 | "step": step + 1, 108 | "model_state": model.state_dict(), 109 | "optimizer_state": optimizer.state_dict(), 110 | }, state_fpath) 111 | 112 | # Make a backup 113 | if backup_every != 0 and step % backup_every == 0: 114 | print("Making a backup (step %d)" % step) 115 | backup_dir.mkdir(exist_ok=True) 116 | backup_fpath = backup_dir.joinpath("%s_bak_%06d.pt" % (run_id, step)) 117 | torch.save({ 118 | "step": step + 1, 119 | "model_state": model.state_dict(), 120 | "optimizer_state": optimizer.state_dict(), 121 | }, backup_fpath) 122 | 123 | profiler.tick("Extras (visualizations, saving)") 124 | -------------------------------------------------------------------------------- /encoder/visualizations.py: -------------------------------------------------------------------------------- 1 | from encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset 2 | from datetime import datetime 3 | from time import perf_counter as timer 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | # import webbrowser 7 | import visdom 8 | import umap 9 | 10 | colormap = np.array([ 11 | [76, 255, 0], 12 | [0, 127, 70], 13 | [255, 0, 0], 14 | [255, 217, 38], 15 | [0, 135, 255], 16 | [165, 0, 165], 17 | [255, 167, 255], 18 | [0, 255, 255], 19 | [255, 96, 38], 20 | [142, 76, 0], 21 | [33, 0, 127], 22 | [0, 0, 0], 23 | [183, 183, 183], 24 | ], dtype=np.float) / 255 25 | 26 | 27 | class Visualizations: 28 | def __init__(self, env_name=None, update_every=10, server="http://localhost", disabled=False): 29 | # Tracking data 
30 | self.last_update_timestamp = timer() 31 | self.update_every = update_every 32 | self.step_times = [] 33 | self.losses = [] 34 | self.eers = [] 35 | print("Updating the visualizations every %d steps." % update_every) 36 | 37 | # If visdom is disabled TODO: use a better paradigm for that 38 | self.disabled = disabled 39 | if self.disabled: 40 | return 41 | 42 | # Set the environment name 43 | now = str(datetime.now().strftime("%d-%m %Hh%M")) 44 | if env_name is None: 45 | self.env_name = now 46 | else: 47 | self.env_name = "%s (%s)" % (env_name, now) 48 | 49 | # Connect to visdom and open the corresponding window in the browser 50 | try: 51 | self.vis = visdom.Visdom(server, env=self.env_name, raise_exceptions=True) 52 | except ConnectionError: 53 | raise Exception("No visdom server detected. Run the command \"visdom\" in your CLI to " 54 | "start it.") 55 | # webbrowser.open("http://localhost:8097/env/" + self.env_name) 56 | 57 | # Create the windows 58 | self.loss_win = None 59 | self.eer_win = None 60 | # self.lr_win = None 61 | self.implementation_win = None 62 | self.projection_win = None 63 | self.implementation_string = "" 64 | 65 | def log_params(self): 66 | if self.disabled: 67 | return 68 | from encoder import params_data 69 | from encoder import params_model 70 | param_string = "Model parameters:
" 71 | for param_name in (p for p in dir(params_model) if not p.startswith("__")): 72 | value = getattr(params_model, param_name) 73 | param_string += "\t%s: %s
" % (param_name, value) 74 | param_string += "Data parameters:
" 75 | for param_name in (p for p in dir(params_data) if not p.startswith("__")): 76 | value = getattr(params_data, param_name) 77 | param_string += "\t%s: %s
" % (param_name, value) 78 | self.vis.text(param_string, opts={"title": "Parameters"}) 79 | 80 | def log_dataset(self, dataset: SpeakerVerificationDataset): 81 | if self.disabled: 82 | return 83 | dataset_string = "" 84 | dataset_string += "Speakers: %s\n" % len(dataset.speakers) 85 | dataset_string += "\n" + dataset.get_logs() 86 | dataset_string = dataset_string.replace("\n", "
") 87 | self.vis.text(dataset_string, opts={"title": "Dataset"}) 88 | 89 | def log_implementation(self, params): 90 | if self.disabled: 91 | return 92 | implementation_string = "" 93 | for param, value in params.items(): 94 | implementation_string += "%s: %s\n" % (param, value) 95 | implementation_string = implementation_string.replace("\n", "
") 96 | self.implementation_string = implementation_string 97 | self.implementation_win = self.vis.text( 98 | implementation_string, 99 | opts={"title": "Training implementation"} 100 | ) 101 | 102 | def update(self, loss, eer, step): 103 | # Update the tracking data 104 | now = timer() 105 | self.step_times.append(1000 * (now - self.last_update_timestamp)) 106 | self.last_update_timestamp = now 107 | self.losses.append(loss) 108 | self.eers.append(eer) 109 | print(".", end="") 110 | 111 | # Update the plots every steps 112 | if step % self.update_every != 0: 113 | return 114 | time_string = "Step time: mean: %5dms std: %5dms" % \ 115 | (int(np.mean(self.step_times)), int(np.std(self.step_times))) 116 | print("\nStep %6d Loss: %.4f EER: %.4f %s" % 117 | (step, np.mean(self.losses), np.mean(self.eers), time_string)) 118 | if not self.disabled: 119 | self.loss_win = self.vis.line( 120 | [np.mean(self.losses)], 121 | [step], 122 | win=self.loss_win, 123 | update="append" if self.loss_win else None, 124 | opts=dict( 125 | legend=["Avg. loss"], 126 | xlabel="Step", 127 | ylabel="Loss", 128 | title="Loss", 129 | ) 130 | ) 131 | self.eer_win = self.vis.line( 132 | [np.mean(self.eers)], 133 | [step], 134 | win=self.eer_win, 135 | update="append" if self.eer_win else None, 136 | opts=dict( 137 | legend=["Avg. EER"], 138 | xlabel="Step", 139 | ylabel="EER", 140 | title="Equal error rate" 141 | ) 142 | ) 143 | if self.implementation_win is not None: 144 | self.vis.text( 145 | self.implementation_string + ("%s" % time_string), 146 | win=self.implementation_win, 147 | opts={"title": "Training implementation"}, 148 | ) 149 | 150 | # Reset the tracking 151 | self.losses.clear() 152 | self.eers.clear() 153 | self.step_times.clear() 154 | 155 | def draw_projections(self, embeds, utterances_per_speaker, step, out_fpath=None, 156 | max_speakers=10): 157 | max_speakers = min(max_speakers, len(colormap)) 158 | embeds = embeds[:max_speakers * utterances_per_speaker] 159 | 160 | n_speakers = len(embeds) // utterances_per_speaker 161 | ground_truth = np.repeat(np.arange(n_speakers), utterances_per_speaker) 162 | colors = [colormap[i] for i in ground_truth] 163 | 164 | reducer = umap.UMAP() 165 | projected = reducer.fit_transform(embeds) 166 | plt.scatter(projected[:, 0], projected[:, 1], c=colors) 167 | plt.gca().set_aspect("equal", "datalim") 168 | plt.title("UMAP projection (step %d)" % step) 169 | if not self.disabled: 170 | self.projection_win = self.vis.matplot(plt, win=self.projection_win) 171 | if out_fpath is not None: 172 | plt.savefig(out_fpath) 173 | plt.clf() 174 | 175 | def save(self): 176 | if not self.disabled: 177 | self.vis.save([self.env_name]) 178 | -------------------------------------------------------------------------------- /helper.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import librosa 3 | from pathlib import Path 4 | from encoder.inference import plot_embedding_as_heatmap 5 | import sounddevice as sd 6 | import wavio 7 | 8 | def draw_embed(embed, name, which): 9 | """ 10 | Draws an embedding. 
11 | 12 | Parameters: 13 | embed (np.array): array of embedding 14 | 15 | name (str): title of plot 16 | 17 | 18 | Return: 19 | fig: matplotlib figure 20 | """ 21 | fig, embed_ax = plt.subplots() 22 | plot_embedding_as_heatmap(embed) 23 | embed_ax.set_title(name) 24 | embed_ax.set_aspect("equal", "datalim") 25 | embed_ax.set_xticks([]) 26 | embed_ax.set_yticks([]) 27 | embed_ax.figure.canvas.draw() 28 | return fig 29 | 30 | 31 | def create_spectrogram(voice_sample): 32 | """ 33 | Creates and saves a spectrogram plot for a sound sample. 34 | 35 | Parameters: 36 | voice_sample (str): path to sample of sound 37 | 38 | Return: 39 | fig 40 | """ 41 | 42 | in_fpath = Path(voice_sample.replace('"', "").replace("'", "")) 43 | original_wav, sampling_rate = librosa.load(str(in_fpath)) 44 | 45 | # Plot the signal read from wav file 46 | fig = plt.figure() 47 | plt.subplot(211) 48 | plt.title(f"Spectrogram of file {voice_sample}") 49 | 50 | plt.plot(original_wav) 51 | plt.xlabel("Sample") 52 | plt.ylabel("Amplitude") 53 | 54 | plt.subplot(212) 55 | plt.specgram(original_wav, Fs=sampling_rate) 56 | plt.xlabel("Time") 57 | plt.ylabel("Frequency") 58 | # plt.savefig(voice_sample.split(".")[0] + "_spectogram.png") 59 | return fig 60 | 61 | def read_audio(file): 62 | with open(file, "rb") as audio_file: 63 | audio_bytes = audio_file.read() 64 | return audio_bytes 65 | 66 | def record(duration=5, fs=48000): 67 | sd.default.samplerate = fs 68 | sd.default.channels = 1 69 | myrecording = sd.rec(int(duration * fs)) 70 | sd.wait(duration) 71 | return myrecording 72 | 73 | def save_record(path_myrecording, myrecording, fs): 74 | wavio.write(path_myrecording, myrecording, fs, sampwidth=2) 75 | return None 76 | -------------------------------------------------------------------------------- /requirements_demo.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.10.0 2 | altair==4.1.0 3 | appdirs==1.4.4 4 | appnope==0.1.0 5 | argon2-cffi==20.1.0 6 | astor==0.8.1 7 | async-generator==1.10 8 | attrs==20.2.0 9 | audioread==2.1.8 10 | backcall==0.2.0 11 | base58==2.0.1 12 | bleach==3.2.1 13 | blinker==1.4 14 | boto3==1.15.7 15 | botocore==1.18.7 16 | cachetools==4.1.1 17 | certifi==2020.6.20 18 | cffi==1.14.3 19 | chardet==3.0.4 20 | click==7.1.2 21 | cycler==0.10.0 22 | decorator==4.4.2 23 | defusedxml==0.6.0 24 | dill==0.3.2 25 | entrypoints==0.3 26 | enum-compat==0.0.3 27 | ffmpeg==1.4 28 | future==0.18.2 29 | gast==0.2.2 30 | google-pasta==0.2.0 31 | grpcio==1.32.0 32 | h5py==2.10.0 33 | idna==2.10 34 | importlib-metadata==2.0.0 35 | inflect==4.1.0 36 | ipykernel==5.3.4 37 | ipython==7.18.1 38 | ipython-genutils==0.2.0 39 | ipywidgets==7.5.1 40 | jedi==0.17.2 41 | Jinja2==2.11.2 42 | jmespath==0.10.0 43 | joblib==0.16.0 44 | jsonpatch==1.26 45 | jsonpointer==2.0 46 | jsonschema==3.2.0 47 | jupyter-client==6.1.7 48 | jupyter-core==4.6.3 49 | jupyterlab-pygments==0.1.2 50 | Keras-Applications==1.0.8 51 | Keras-Preprocessing==1.1.2 52 | kiwisolver==1.2.0 53 | librosa==0.8.0 54 | llvmlite==0.31.0 55 | Markdown==3.2.2 56 | MarkupSafe==1.1.1 57 | matplotlib==3.2.2 58 | mistune==0.8.4 59 | multiprocess==0.70.10 60 | nbclient==0.5.0 61 | nbconvert==6.0.6 62 | nbformat==5.0.7 63 | nest-asyncio==1.4.1 64 | notebook==6.1.4 65 | numba==0.48.0 66 | numpy==1.19.2 67 | opt-einsum==3.3.0 68 | packaging==20.4 69 | pandas==1.1.2 70 | pandocfilters==1.4.2 71 | parso==0.7.1 72 | pathtools==0.1.2 73 | pexpect==4.8.0 74 | pickleshare==0.7.5 75 | Pillow==7.2.0 76 | pooch==1.2.0 77 | 
prometheus-client==0.8.0 78 | prompt-toolkit==3.0.7 79 | protobuf==3.13.0 80 | ptyprocess==0.6.0 81 | pyarrow==1.0.1 82 | pycparser==2.20 83 | pydeck==0.5.0b1 84 | Pygments==2.7.1 85 | pyparsing==2.4.7 86 | PyQt5==5.15.1 87 | PyQt5-sip==12.8.1 88 | pyrsistent==0.17.3 89 | python-dateutil==2.8.1 90 | pytz==2020.1 91 | pyzmq==19.0.2 92 | requests==2.24.0 93 | resampy==0.2.2 94 | s3transfer==0.3.3 95 | scikit-learn==0.23.2 96 | scipy==1.5.2 97 | Send2Trash==1.5.0 98 | six==1.15.0 99 | sounddevice==0.4.1 100 | SoundFile==0.10.3.post1 101 | streamlit==0.67.1 102 | tensorboard==1.15.0 103 | tensorflow==1.15.0 104 | tensorflow-estimator==1.15.1 105 | termcolor==1.1.0 106 | terminado==0.9.1 107 | testpath==0.4.4 108 | threadpoolctl==2.1.0 109 | toml==0.10.1 110 | toolz==0.11.1 111 | torch==1.6.0 112 | torchfile==0.1.0 113 | torchvision==0.7.0 114 | tornado==6.0.4 115 | tqdm==4.50.0 116 | traitlets==5.0.4 117 | tzlocal==2.1 118 | umap-learn==0.4.6 119 | Unidecode==1.1.1 120 | urllib3==1.25.10 121 | validators==0.18.1 122 | visdom==0.1.8.9 123 | watchdog==0.10.3 124 | wavio==0.0.4 125 | wcwidth==0.2.5 126 | webencodings==0.5.1 127 | webrtcvad==2.0.10 128 | websocket-client==0.57.0 129 | Werkzeug==1.0.1 130 | widgetsnbextension==3.5.1 131 | wrapt==1.12.1 132 | zipp==3.2.0 133 | -------------------------------------------------------------------------------- /samples/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/samples/.DS_Store -------------------------------------------------------------------------------- /samples/1320_00000.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/samples/1320_00000.mp3 -------------------------------------------------------------------------------- /samples/3575_00000.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/samples/3575_00000.mp3 -------------------------------------------------------------------------------- /samples/8230_00000.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/samples/8230_00000.mp3 -------------------------------------------------------------------------------- /samples/README.md: -------------------------------------------------------------------------------- 1 | The audio files in this folder are provided for toolbox testing and 2 | benchmarking purposes. These are the same reference utterances 3 | used by the SV2TTS authors to generate the audio samples located at: 4 | https://google.github.io/tacotron/publications/speaker_adaptation/index.html 5 | 6 | The `p240_00000.mp3` and `p260_00000.mp3` files are compressed 7 | versions of audios from the VCTK corpus available at: 8 | https://datashare.is.ed.ac.uk/handle/10283/3443 9 | VCTK.txt contains the copyright notices and licensing information. 
10 | 11 | The `1320_00000.mp3`, `3575_00000.mp3`, `6829_00000.mp3` 12 | and `8230_00000.mp3` files are compressed versions of audios 13 | from the LibriSpeech dataset available at: https://openslr.org/12 14 | For these files, the following notice applies: 15 | ``` 16 | LibriSpeech (c) 2014 by Vassil Panayotov 17 | 18 | LibriSpeech ASR corpus is licensed under a 19 | Creative Commons Attribution 4.0 International License. 20 | 21 | See . 22 | ``` 23 | -------------------------------------------------------------------------------- /samples/VCTK.txt: -------------------------------------------------------------------------------- 1 | --------------------------------------------------------------------- 2 | CSTR VCTK Corpus 3 | English Multi-speaker Corpus for CSTR Voice Cloning Toolkit 4 | 5 | (Version 0.92) 6 | RELEASE September 2019 7 | The Centre for Speech Technology Research 8 | University of Edinburgh 9 | Copyright (c) 2019 10 | 11 | Junichi Yamagishi 12 | jyamagis@inf.ed.ac.uk 13 | --------------------------------------------------------------------- 14 | 15 | Overview 16 | 17 | This CSTR VCTK Corpus includes speech data uttered by 110 English 18 | speakers with various accents. Each speaker reads out about 400 19 | sentences, which were selected from a newspaper, the rainbow passage 20 | and an elicitation paragraph used for the speech accent archive. 21 | 22 | The newspaper texts were taken from Herald Glasgow, with permission 23 | from Herald & Times Group. Each speaker has a different set of the 24 | newspaper texts selected based a greedy algorithm that increases the 25 | contextual and phonetic coverage. The details of the text selection 26 | algorithms are described in the following paper: 27 | 28 | C. Veaux, J. Yamagishi and S. King, 29 | "The voice bank corpus: Design, collection and data analysis of 30 | a large regional accent speech database," 31 | https://doi.org/10.1109/ICSDA.2013.6709856 32 | 33 | The rainbow passage and elicitation paragraph are the same for all 34 | speakers. The rainbow passage can be found at International Dialects 35 | of English Archive: 36 | (http://web.ku.edu/~idea/readings/rainbow.htm). The elicitation 37 | paragraph is identical to the one used for the speech accent archive 38 | (http://accent.gmu.edu). The details of the the speech accent archive 39 | can be found at 40 | http://www.ualberta.ca/~aacl2009/PDFs/WeinbergerKunath2009AACL.pdf 41 | 42 | All speech data was recorded using an identical recording setup: an 43 | omni-directional microphone (DPA 4035) and a small diaphragm condenser 44 | microphone with very wide bandwidth (Sennheiser MKH 800), 96kHz 45 | sampling frequency at 24 bits and in a hemi-anechoic chamber of 46 | the University of Edinburgh. (However, two speakers, p280 and p315 47 | had technical issues of the audio recordings using MKH 800). 48 | All recordings were converted into 16 bits, were downsampled to 49 | 48 kHz, and were manually end-pointed. 50 | 51 | This corpus was originally aimed for HMM-based text-to-speech synthesis 52 | systems, especially for speaker-adaptive HMM-based speech synthesis 53 | that uses average voice models trained on multiple speakers and speaker 54 | adaptation technologies. This corpus is also suitable for DNN-based 55 | multi-speaker text-to-speech synthesis systems and waveform modeling. 
56 | 57 | COPYING 58 | 59 | This corpus is licensed under the Creative Commons License: Attribution 4.0 International 60 | http://creativecommons.org/licenses/by/4.0/legalcode 61 | 62 | VCTK VARIANTS 63 | There are several variants of the VCTK corpus: 64 | Speech enhancement 65 | - Noisy speech database for training speech enhancement algorithms and TTS models where we added various types of noises to VCTK artificially: http://dx.doi.org/10.7488/ds/2117 66 | - Reverberant speech database for training speech dereverberation algorithms and TTS models where we added various types of reverberantion to VCTK artificially http://dx.doi.org/10.7488/ds/1425 67 | - Noisy reverberant speech database for training speech enhancement algorithms and TTS models http://dx.doi.org/10.7488/ds/2139 68 | - Device Recorded VCTK where speech signals of the VCTK corpus were played back and re-recorded in office environments using relatively inexpensive consumer devices http://dx.doi.org/10.7488/ds/2316 69 | - The Microsoft Scalable Noisy Speech Dataset (MS-SNSD) https://github.com/microsoft/MS-SNSD 70 | 71 | ASV and anti-spoofing 72 | - Spoofing and Anti-Spoofing (SAS) corpus, which is a collection of synthetic speech signals produced by nine techniques, two of which are speech synthesis, and seven are voice conversion. All of them were built using the VCTK corpus. http://dx.doi.org/10.7488/ds/252 73 | - Automatic Speaker Verification Spoofing and Countermeasures Challenge (ASVspoof 2015) Database. This database consists of synthetic speech signals produced by ten techniques and this has been used in the first Automatic Speaker Verification Spoofing and Countermeasures Challenge (ASVspoof 2015) http://dx.doi.org/10.7488/ds/298 74 | - ASVspoof 2019: The 3rd Automatic Speaker Verification Spoofing and Countermeasures Challenge database. This database has been used in the 3rd Automatic Speaker Verification Spoofing and Countermeasures Challenge (ASVspoof 2019) https://doi.org/10.7488/ds/2555 75 | 76 | 77 | ACKNOWLEDGEMENTS 78 | 79 | The CSTR VCTK Corpus was constructed by: 80 | 81 | Christophe Veaux (University of Edinburgh) 82 | Junichi Yamagishi (University of Edinburgh) 83 | Kirsten MacDonald 84 | 85 | The research leading to these results was partly funded from EPSRC 86 | grants EP/I031022/1 (NST) and EP/J002526/1 (CAF), from the RSE-NSFC 87 | grant (61111130120), and from the JST CREST (uDialogue). 
88 | 89 | Please cite this corpus as follows: 90 | Christophe Veaux, Junichi Yamagishi, Kirsten MacDonald, 91 | "CSTR VCTK Corpus: English Multi-speaker Corpus for CSTR Voice Cloning Toolkit", 92 | The Centre for Speech Technology Research (CSTR), 93 | University of Edinburgh 94 | 95 | -------------------------------------------------------------------------------- /samples/myvoice.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/samples/myvoice.mp3 -------------------------------------------------------------------------------- /samples/p240_00000.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/samples/p240_00000.mp3 -------------------------------------------------------------------------------- /samples/p260_00000.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/samples/p260_00000.mp3 -------------------------------------------------------------------------------- /slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/slides.pdf -------------------------------------------------------------------------------- /synthesizer/LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Original work Copyright (c) 2018 Rayhane Mama (https://github.com/Rayhane-mamah) 4 | Modified work Copyright (c) 2019 Corentin Jemine (https://github.com/CorentinJ) 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 
23 | -------------------------------------------------------------------------------- /synthesizer/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------- /synthesizer/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/synthesizer/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /synthesizer/__pycache__/audio.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/synthesizer/__pycache__/audio.cpython-37.pyc -------------------------------------------------------------------------------- /synthesizer/__pycache__/hparams.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/synthesizer/__pycache__/hparams.cpython-37.pyc -------------------------------------------------------------------------------- /synthesizer/__pycache__/inference.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/synthesizer/__pycache__/inference.cpython-37.pyc -------------------------------------------------------------------------------- /synthesizer/__pycache__/infolog.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/synthesizer/__pycache__/infolog.cpython-37.pyc -------------------------------------------------------------------------------- /synthesizer/__pycache__/tacotron2.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/synthesizer/__pycache__/tacotron2.cpython-37.pyc -------------------------------------------------------------------------------- /synthesizer/audio.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import librosa.filters 3 | import numpy as np 4 | import tensorflow as tf 5 | from scipy import signal 6 | from scipy.io import wavfile 7 | 8 | 9 | def load_wav(path, sr): 10 | return librosa.core.load(path, sr=sr)[0] 11 | 12 | def save_wav(wav, path, sr): 13 | wav *= 32767 / max(0.01, np.max(np.abs(wav))) 14 | #proposed by @dsmiller 15 | wavfile.write(path, sr, wav.astype(np.int16)) 16 | 17 | def save_wavenet_wav(wav, path, sr): 18 | librosa.output.write_wav(path, wav, sr=sr) 19 | 20 | def preemphasis(wav, k, preemphasize=True): 21 | if preemphasize: 22 | return signal.lfilter([1, -k], [1], wav) 23 | return wav 24 | 25 | def inv_preemphasis(wav, k, inv_preemphasize=True): 26 | if inv_preemphasize: 27 | return signal.lfilter([1], [1, -k], wav) 28 | return wav 29 | 30 | #From https://github.com/r9y9/wavenet_vocoder/blob/master/audio.py 31 | def start_and_end_indices(quantized, silence_threshold=2): 32 | for start 
in range(quantized.size): 33 | if abs(quantized[start] - 127) > silence_threshold: 34 | break 35 | for end in range(quantized.size - 1, 1, -1): 36 | if abs(quantized[end] - 127) > silence_threshold: 37 | break 38 | 39 | assert abs(quantized[start] - 127) > silence_threshold 40 | assert abs(quantized[end] - 127) > silence_threshold 41 | 42 | return start, end 43 | 44 | def get_hop_size(hparams): 45 | hop_size = hparams.hop_size 46 | if hop_size is None: 47 | assert hparams.frame_shift_ms is not None 48 | hop_size = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate) 49 | return hop_size 50 | 51 | def linearspectrogram(wav, hparams): 52 | D = _stft(preemphasis(wav, hparams.preemphasis, hparams.preemphasize), hparams) 53 | S = _amp_to_db(np.abs(D), hparams) - hparams.ref_level_db 54 | 55 | if hparams.signal_normalization: 56 | return _normalize(S, hparams) 57 | return S 58 | 59 | def melspectrogram(wav, hparams): 60 | D = _stft(preemphasis(wav, hparams.preemphasis, hparams.preemphasize), hparams) 61 | S = _amp_to_db(_linear_to_mel(np.abs(D), hparams), hparams) - hparams.ref_level_db 62 | 63 | if hparams.signal_normalization: 64 | return _normalize(S, hparams) 65 | return S 66 | 67 | def inv_linear_spectrogram(linear_spectrogram, hparams): 68 | """Converts linear spectrogram to waveform using librosa""" 69 | if hparams.signal_normalization: 70 | D = _denormalize(linear_spectrogram, hparams) 71 | else: 72 | D = linear_spectrogram 73 | 74 | S = _db_to_amp(D + hparams.ref_level_db) #Convert back to linear 75 | 76 | if hparams.use_lws: 77 | processor = _lws_processor(hparams) 78 | D = processor.run_lws(S.astype(np.float64).T ** hparams.power) 79 | y = processor.istft(D).astype(np.float32) 80 | return inv_preemphasis(y, hparams.preemphasis, hparams.preemphasize) 81 | else: 82 | return inv_preemphasis(_griffin_lim(S ** hparams.power, hparams), hparams.preemphasis, hparams.preemphasize) 83 | 84 | def inv_mel_spectrogram(mel_spectrogram, hparams): 85 | """Converts mel spectrogram to waveform using librosa""" 86 | if hparams.signal_normalization: 87 | D = _denormalize(mel_spectrogram, hparams) 88 | else: 89 | D = mel_spectrogram 90 | 91 | S = _mel_to_linear(_db_to_amp(D + hparams.ref_level_db), hparams) # Convert back to linear 92 | 93 | if hparams.use_lws: 94 | processor = _lws_processor(hparams) 95 | D = processor.run_lws(S.astype(np.float64).T ** hparams.power) 96 | y = processor.istft(D).astype(np.float32) 97 | return inv_preemphasis(y, hparams.preemphasis, hparams.preemphasize) 98 | else: 99 | return inv_preemphasis(_griffin_lim(S ** hparams.power, hparams), hparams.preemphasis, hparams.preemphasize) 100 | 101 | def _lws_processor(hparams): 102 | import lws 103 | return lws.lws(hparams.n_fft, get_hop_size(hparams), fftsize=hparams.win_size, mode="speech") 104 | 105 | def _griffin_lim(S, hparams): 106 | """librosa implementation of Griffin-Lim 107 | Based on https://github.com/librosa/librosa/issues/434 108 | """ 109 | angles = np.exp(2j * np.pi * np.random.rand(*S.shape)) 110 | S_complex = np.abs(S).astype(np.complex) 111 | y = _istft(S_complex * angles, hparams) 112 | for i in range(hparams.griffin_lim_iters): 113 | angles = np.exp(1j * np.angle(_stft(y, hparams))) 114 | y = _istft(S_complex * angles, hparams) 115 | return y 116 | 117 | def _stft(y, hparams): 118 | if hparams.use_lws: 119 | return _lws_processor(hparams).stft(y).T 120 | else: 121 | return librosa.stft(y=y, n_fft=hparams.n_fft, hop_length=get_hop_size(hparams), win_length=hparams.win_size) 122 | 123 | def _istft(y, hparams): 
124 | return librosa.istft(y, hop_length=get_hop_size(hparams), win_length=hparams.win_size) 125 | 126 | ########################################################## 127 | #Those are only correct when using lws!!! (This was messing with Wavenet quality for a long time!) 128 | def num_frames(length, fsize, fshift): 129 | """Compute number of time frames of spectrogram 130 | """ 131 | pad = (fsize - fshift) 132 | if length % fshift == 0: 133 | M = (length + pad * 2 - fsize) // fshift + 1 134 | else: 135 | M = (length + pad * 2 - fsize) // fshift + 2 136 | return M 137 | 138 | 139 | def pad_lr(x, fsize, fshift): 140 | """Compute left and right padding 141 | """ 142 | M = num_frames(len(x), fsize, fshift) 143 | pad = (fsize - fshift) 144 | T = len(x) + 2 * pad 145 | r = (M - 1) * fshift + fsize - T 146 | return pad, pad + r 147 | ########################################################## 148 | #Librosa correct padding 149 | def librosa_pad_lr(x, fsize, fshift): 150 | return 0, (x.shape[0] // fshift + 1) * fshift - x.shape[0] 151 | 152 | # Conversions 153 | _mel_basis = None 154 | _inv_mel_basis = None 155 | 156 | def _linear_to_mel(spectogram, hparams): 157 | global _mel_basis 158 | if _mel_basis is None: 159 | _mel_basis = _build_mel_basis(hparams) 160 | return np.dot(_mel_basis, spectogram) 161 | 162 | def _mel_to_linear(mel_spectrogram, hparams): 163 | global _inv_mel_basis 164 | if _inv_mel_basis is None: 165 | _inv_mel_basis = np.linalg.pinv(_build_mel_basis(hparams)) 166 | return np.maximum(1e-10, np.dot(_inv_mel_basis, mel_spectrogram)) 167 | 168 | def _build_mel_basis(hparams): 169 | assert hparams.fmax <= hparams.sample_rate // 2 170 | return librosa.filters.mel(hparams.sample_rate, hparams.n_fft, n_mels=hparams.num_mels, 171 | fmin=hparams.fmin, fmax=hparams.fmax) 172 | 173 | def _amp_to_db(x, hparams): 174 | min_level = np.exp(hparams.min_level_db / 20 * np.log(10)) 175 | return 20 * np.log10(np.maximum(min_level, x)) 176 | 177 | def _db_to_amp(x): 178 | return np.power(10.0, (x) * 0.05) 179 | 180 | def _normalize(S, hparams): 181 | if hparams.allow_clipping_in_normalization: 182 | if hparams.symmetric_mels: 183 | return np.clip((2 * hparams.max_abs_value) * ((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value, 184 | -hparams.max_abs_value, hparams.max_abs_value) 185 | else: 186 | return np.clip(hparams.max_abs_value * ((S - hparams.min_level_db) / (-hparams.min_level_db)), 0, hparams.max_abs_value) 187 | 188 | assert S.max() <= 0 and S.min() - hparams.min_level_db >= 0 189 | if hparams.symmetric_mels: 190 | return (2 * hparams.max_abs_value) * ((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value 191 | else: 192 | return hparams.max_abs_value * ((S - hparams.min_level_db) / (-hparams.min_level_db)) 193 | 194 | def _denormalize(D, hparams): 195 | if hparams.allow_clipping_in_normalization: 196 | if hparams.symmetric_mels: 197 | return (((np.clip(D, -hparams.max_abs_value, 198 | hparams.max_abs_value) + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value)) 199 | + hparams.min_level_db) 200 | else: 201 | return ((np.clip(D, 0, hparams.max_abs_value) * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db) 202 | 203 | if hparams.symmetric_mels: 204 | return (((D + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value)) + hparams.min_level_db) 205 | else: 206 | return ((D * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db) 207 | 
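The spectrogram helpers above are driven entirely by the `hparams` object (FFT size, hop size, mel filterbank and normalization range), so a waveform can be turned into a normalized mel spectrogram and approximately inverted back without extra configuration. A minimal round-trip sketch, assuming `synthesizer.hparams` is importable from the repository root; the wav file names below are placeholders, not audio shipped with this repo:

```
# Round-trip sketch using the audio helpers above.
# "sample.wav" and the output path are hypothetical file names.
from synthesizer import audio
from synthesizer.hparams import hparams

wav = audio.load_wav("sample.wav", sr=hparams.sample_rate)   # load at the synthesizer's sample rate
mel = audio.melspectrogram(wav, hparams)                     # normalized mel spectrogram, shape (num_mels, frames)
recon = audio.inv_mel_spectrogram(mel, hparams)              # Griffin-Lim (or LWS) inversion back to a waveform
audio.save_wav(recon, "sample_reconstructed.wav", sr=hparams.sample_rate)
```

Note that `inv_mel_spectrogram` falls back to Griffin-Lim unless `hparams.use_lws` is set, so the reconstruction only approximates the original phase.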
-------------------------------------------------------------------------------- /synthesizer/feeder.py: -------------------------------------------------------------------------------- 1 | from sklearn.model_selection import train_test_split 2 | from synthesizer.utils.text import text_to_sequence 3 | from synthesizer.infolog import log 4 | import tensorflow as tf 5 | import numpy as np 6 | import threading 7 | import time 8 | import os 9 | 10 | _batches_per_group = 64 11 | 12 | class Feeder: 13 | """ 14 | Feeds batches of data into queue on a background thread. 15 | """ 16 | 17 | def __init__(self, coordinator, metadata_filename, hparams): 18 | super(Feeder, self).__init__() 19 | self._coord = coordinator 20 | self._hparams = hparams 21 | self._cleaner_names = [x.strip() for x in hparams.cleaners.split(",")] 22 | self._train_offset = 0 23 | self._test_offset = 0 24 | 25 | # Load metadata 26 | self._mel_dir = os.path.join(os.path.dirname(metadata_filename), "mels") 27 | self._embed_dir = os.path.join(os.path.dirname(metadata_filename), "embeds") 28 | with open(metadata_filename, encoding="utf-8") as f: 29 | self._metadata = [line.strip().split("|") for line in f] 30 | frame_shift_ms = hparams.hop_size / hparams.sample_rate 31 | hours = sum([int(x[4]) for x in self._metadata]) * frame_shift_ms / (3600) 32 | log("Loaded metadata for {} examples ({:.2f} hours)".format(len(self._metadata), hours)) 33 | 34 | #Train test split 35 | if hparams.tacotron_test_size is None: 36 | assert hparams.tacotron_test_batches is not None 37 | 38 | test_size = (hparams.tacotron_test_size if hparams.tacotron_test_size is not None 39 | else hparams.tacotron_test_batches * hparams.tacotron_batch_size) 40 | indices = np.arange(len(self._metadata)) 41 | train_indices, test_indices = train_test_split(indices, 42 | test_size=test_size, random_state=hparams.tacotron_data_random_state) 43 | 44 | #Make sure test_indices is a multiple of batch_size else round up 45 | len_test_indices = self._round_down(len(test_indices), hparams.tacotron_batch_size) 46 | extra_test = test_indices[len_test_indices:] 47 | test_indices = test_indices[:len_test_indices] 48 | train_indices = np.concatenate([train_indices, extra_test]) 49 | 50 | self._train_meta = list(np.array(self._metadata)[train_indices]) 51 | self._test_meta = list(np.array(self._metadata)[test_indices]) 52 | 53 | self.test_steps = len(self._test_meta) // hparams.tacotron_batch_size 54 | 55 | if hparams.tacotron_test_size is None: 56 | assert hparams.tacotron_test_batches == self.test_steps 57 | 58 | #pad input sequences with the 0 ( _ ) 59 | self._pad = 0 60 | #explicitely setting the padding to a value that doesn"t originally exist in the spectogram 61 | #to avoid any possible conflicts, without affecting the output range of the model too much 62 | if hparams.symmetric_mels: 63 | self._target_pad = -hparams.max_abs_value 64 | else: 65 | self._target_pad = 0. 66 | #Mark finished sequences with 1s 67 | self._token_pad = 1. 68 | 69 | with tf.device("/cpu:0"): 70 | # Create placeholders for inputs and targets. Don"t specify batch size because we want 71 | # to be able to feed different batch sizes at eval time. 
72 | self._placeholders = [ 73 | tf.compat.v1.placeholder(tf.int32, shape=(None, None), name="inputs"), 74 | tf.compat.v1.placeholder(tf.int32, shape=(None, ), name="input_lengths"), 75 | tf.compat.v1.placeholder(tf.float32, shape=(None, None, hparams.num_mels), 76 | name="mel_targets"), 77 | tf.compat.v1.placeholder(tf.float32, shape=(None, None), name="token_targets"), 78 | tf.compat.v1.placeholder(tf.int32, shape=(None, ), name="targets_lengths"), 79 | tf.compat.v1.placeholder(tf.int32, shape=(hparams.tacotron_num_gpus, None), 80 | name="split_infos"), 81 | 82 | # SV2TTS 83 | tf.compat.v1.placeholder(tf.float32, shape=(None, hparams.speaker_embedding_size), 84 | name="speaker_embeddings") 85 | ] 86 | 87 | # Create queue for buffering data 88 | queue = tf.queue.FIFOQueue(8, [tf.int32, tf.int32, tf.float32, tf.float32, 89 | tf.int32, tf.int32, tf.float32], name="input_queue") 90 | self._enqueue_op = queue.enqueue(self._placeholders) 91 | self.inputs, self.input_lengths, self.mel_targets, self.token_targets, \ 92 | self.targets_lengths, self.split_infos, self.speaker_embeddings = queue.dequeue() 93 | 94 | self.inputs.set_shape(self._placeholders[0].shape) 95 | self.input_lengths.set_shape(self._placeholders[1].shape) 96 | self.mel_targets.set_shape(self._placeholders[2].shape) 97 | self.token_targets.set_shape(self._placeholders[3].shape) 98 | self.targets_lengths.set_shape(self._placeholders[4].shape) 99 | self.split_infos.set_shape(self._placeholders[5].shape) 100 | self.speaker_embeddings.set_shape(self._placeholders[6].shape) 101 | 102 | # Create eval queue for buffering eval data 103 | eval_queue = tf.queue.FIFOQueue(1, [tf.int32, tf.int32, tf.float32, tf.float32, 104 | tf.int32, tf.int32, tf.float32], name="eval_queue") 105 | self._eval_enqueue_op = eval_queue.enqueue(self._placeholders) 106 | self.eval_inputs, self.eval_input_lengths, self.eval_mel_targets, \ 107 | self.eval_token_targets, self.eval_targets_lengths, \ 108 | self.eval_split_infos, self.eval_speaker_embeddings = eval_queue.dequeue() 109 | 110 | self.eval_inputs.set_shape(self._placeholders[0].shape) 111 | self.eval_input_lengths.set_shape(self._placeholders[1].shape) 112 | self.eval_mel_targets.set_shape(self._placeholders[2].shape) 113 | self.eval_token_targets.set_shape(self._placeholders[3].shape) 114 | self.eval_targets_lengths.set_shape(self._placeholders[4].shape) 115 | self.eval_split_infos.set_shape(self._placeholders[5].shape) 116 | self.eval_speaker_embeddings.set_shape(self._placeholders[6].shape) 117 | 118 | 119 | def start_threads(self, session): 120 | self._session = session 121 | thread = threading.Thread(name="background", target=self._enqueue_next_train_group) 122 | thread.daemon = True #Thread will close when parent quits 123 | thread.start() 124 | 125 | thread = threading.Thread(name="background", target=self._enqueue_next_test_group) 126 | thread.daemon = True #Thread will close when parent quits 127 | thread.start() 128 | 129 | def _get_test_groups(self): 130 | meta = self._test_meta[self._test_offset] 131 | self._test_offset += 1 132 | 133 | text = meta[5] 134 | 135 | input_data = np.asarray(text_to_sequence(text, self._cleaner_names), dtype=np.int32) 136 | mel_target = np.load(os.path.join(self._mel_dir, meta[1])) 137 | #Create parallel sequences containing zeros to represent a non finished sequence 138 | token_target = np.asarray([0.] 
* (len(mel_target) - 1)) 139 | embed_target = np.load(os.path.join(self._embed_dir, meta[2])) 140 | return input_data, mel_target, token_target, embed_target, len(mel_target) 141 | 142 | def make_test_batches(self): 143 | start = time.time() 144 | 145 | # Read a group of examples 146 | n = self._hparams.tacotron_batch_size 147 | r = self._hparams.outputs_per_step 148 | 149 | #Test on entire test set 150 | examples = [self._get_test_groups() for i in range(len(self._test_meta))] 151 | 152 | # Bucket examples based on similar output sequence length for efficiency 153 | examples.sort(key=lambda x: x[-1]) 154 | batches = [examples[i: i+n] for i in range(0, len(examples), n)] 155 | np.random.shuffle(batches) 156 | 157 | log("\nGenerated %d test batches of size %d in %.3f sec" % (len(batches), n, time.time() - start)) 158 | return batches, r 159 | 160 | def _enqueue_next_train_group(self): 161 | while not self._coord.should_stop(): 162 | start = time.time() 163 | 164 | # Read a group of examples 165 | n = self._hparams.tacotron_batch_size 166 | r = self._hparams.outputs_per_step 167 | examples = [self._get_next_example() for i in range(n * _batches_per_group)] 168 | 169 | # Bucket examples based on similar output sequence length for efficiency 170 | examples.sort(key=lambda x: x[-1]) 171 | batches = [examples[i: i+n] for i in range(0, len(examples), n)] 172 | np.random.shuffle(batches) 173 | 174 | log("\nGenerated {} train batches of size {} in {:.3f} sec".format(len(batches), n, time.time() - start)) 175 | for batch in batches: 176 | feed_dict = dict(zip(self._placeholders, self._prepare_batch(batch, r))) 177 | self._session.run(self._enqueue_op, feed_dict=feed_dict) 178 | 179 | def _enqueue_next_test_group(self): 180 | #Create test batches once and evaluate on them for all test steps 181 | test_batches, r = self.make_test_batches() 182 | while not self._coord.should_stop(): 183 | for batch in test_batches: 184 | feed_dict = dict(zip(self._placeholders, self._prepare_batch(batch, r))) 185 | self._session.run(self._eval_enqueue_op, feed_dict=feed_dict) 186 | 187 | def _get_next_example(self): 188 | """Gets a single example (input, mel_target, token_target, linear_target, mel_length) from_ disk 189 | """ 190 | if self._train_offset >= len(self._train_meta): 191 | self._train_offset = 0 192 | np.random.shuffle(self._train_meta) 193 | 194 | meta = self._train_meta[self._train_offset] 195 | self._train_offset += 1 196 | 197 | text = meta[5] 198 | 199 | input_data = np.asarray(text_to_sequence(text, self._cleaner_names), dtype=np.int32) 200 | mel_target = np.load(os.path.join(self._mel_dir, meta[1])) 201 | #Create parallel sequences containing zeros to represent a non finished sequence 202 | token_target = np.asarray([0.] 
* (len(mel_target) - 1)) 203 | embed_target = np.load(os.path.join(self._embed_dir, meta[2])) 204 | return input_data, mel_target, token_target, embed_target, len(mel_target) 205 | 206 | def _prepare_batch(self, batches, outputs_per_step): 207 | assert 0 == len(batches) % self._hparams.tacotron_num_gpus 208 | size_per_device = int(len(batches) / self._hparams.tacotron_num_gpus) 209 | np.random.shuffle(batches) 210 | 211 | inputs = None 212 | mel_targets = None 213 | token_targets = None 214 | targets_lengths = None 215 | split_infos = [] 216 | 217 | targets_lengths = np.asarray([x[-1] for x in batches], dtype=np.int32) #Used to mask loss 218 | input_lengths = np.asarray([len(x[0]) for x in batches], dtype=np.int32) 219 | 220 | for i in range(self._hparams.tacotron_num_gpus): 221 | batch = batches[size_per_device*i:size_per_device*(i+1)] 222 | input_cur_device, input_max_len = self._prepare_inputs([x[0] for x in batch]) 223 | inputs = np.concatenate((inputs, input_cur_device), axis=1) if inputs is not None else input_cur_device 224 | mel_target_cur_device, mel_target_max_len = self._prepare_targets([x[1] for x in batch], outputs_per_step) 225 | mel_targets = np.concatenate(( mel_targets, mel_target_cur_device), axis=1) if mel_targets is not None else mel_target_cur_device 226 | 227 | #Pad sequences with 1 to infer that the sequence is done 228 | token_target_cur_device, token_target_max_len = self._prepare_token_targets([x[2] for x in batch], outputs_per_step) 229 | token_targets = np.concatenate((token_targets, token_target_cur_device),axis=1) if token_targets is not None else token_target_cur_device 230 | split_infos.append([input_max_len, mel_target_max_len, token_target_max_len]) 231 | 232 | split_infos = np.asarray(split_infos, dtype=np.int32) 233 | 234 | ### SV2TTS ### 235 | 236 | embed_targets = np.asarray([x[3] for x in batches]) 237 | 238 | ############## 239 | 240 | return inputs, input_lengths, mel_targets, token_targets, targets_lengths, \ 241 | split_infos, embed_targets 242 | 243 | def _prepare_inputs(self, inputs): 244 | max_len = max([len(x) for x in inputs]) 245 | return np.stack([self._pad_input(x, max_len) for x in inputs]), max_len 246 | 247 | def _prepare_targets(self, targets, alignment): 248 | max_len = max([len(t) for t in targets]) 249 | data_len = self._round_up(max_len, alignment) 250 | return np.stack([self._pad_target(t, data_len) for t in targets]), data_len 251 | 252 | def _prepare_token_targets(self, targets, alignment): 253 | max_len = max([len(t) for t in targets]) + 1 254 | data_len = self._round_up(max_len, alignment) 255 | return np.stack([self._pad_token_target(t, data_len) for t in targets]), data_len 256 | 257 | def _pad_input(self, x, length): 258 | return np.pad(x, (0, length - x.shape[0]), mode="constant", constant_values=self._pad) 259 | 260 | def _pad_target(self, t, length): 261 | return np.pad(t, [(0, length - t.shape[0]), (0, 0)], mode="constant", constant_values=self._target_pad) 262 | 263 | def _pad_token_target(self, t, length): 264 | return np.pad(t, (0, length - t.shape[0]), mode="constant", constant_values=self._token_pad) 265 | 266 | def _round_up(self, x, multiple): 267 | remainder = x % multiple 268 | return x if remainder == 0 else x + multiple - remainder 269 | 270 | def _round_down(self, x, multiple): 271 | remainder = x % multiple 272 | return x if remainder == 0 else x - remainder 273 | -------------------------------------------------------------------------------- /synthesizer/inference.py: 
-------------------------------------------------------------------------------- 1 | from synthesizer.tacotron2 import Tacotron2 2 | from synthesizer.hparams import hparams 3 | from multiprocess.pool import Pool # You're free to use either one 4 | #from multiprocessing import Pool # 5 | from synthesizer import audio 6 | from pathlib import Path 7 | from typing import Union, List 8 | import tensorflow as tf 9 | import numpy as np 10 | import numba.cuda 11 | import librosa 12 | 13 | 14 | class Synthesizer: 15 | sample_rate = hparams.sample_rate 16 | hparams = hparams 17 | 18 | def __init__(self, checkpoints_dir: Path, verbose=True, low_mem=False, seed=None): 19 | """ 20 | Creates a synthesizer ready for inference. The actual model isn't loaded in memory until 21 | needed or until load() is called. 22 | 23 | :param checkpoints_dir: path to the directory containing the checkpoint file as well as the 24 | weight files (.data, .index and .meta files) 25 | :param verbose: if False, only tensorflow's output will be printed TODO: suppress them too 26 | :param low_mem: if True, the model will be loaded in a separate process and its resources 27 | will be released after each usage. Adds a large overhead, only recommended if your GPU 28 | memory is low (<= 2gb) 29 | :param seed: optional integer for seeding random number generators when initializing model 30 | This makes the synthesizer output consistent for a given embedding and input text. 31 | However, it requires the model to be reloaded whenever a text is synthesized. 32 | """ 33 | self.verbose = verbose 34 | self._low_mem = low_mem 35 | self._seed = seed 36 | 37 | # Prepare the model 38 | self._model = None # type: Tacotron2 39 | checkpoint_state = tf.train.get_checkpoint_state(checkpoints_dir) 40 | if checkpoint_state is None: 41 | raise Exception("Could not find any synthesizer weights under %s" % checkpoints_dir) 42 | self.checkpoint_fpath = checkpoint_state.model_checkpoint_path 43 | if verbose: 44 | model_name = checkpoints_dir.parent.name.replace("logs-", "") 45 | step = int(self.checkpoint_fpath[self.checkpoint_fpath.rfind('-') + 1:]) 46 | print("Found synthesizer \"%s\" trained to step %d" % (model_name, step)) 47 | 48 | def set_seed(self, new_seed): 49 | """ 50 | Updates the seed that initializes random number generators associated with Tacotron2. 51 | Returns the new seed state as confirmation. 52 | """ 53 | try: 54 | self._seed = int(new_seed) 55 | except: 56 | self._seed = None 57 | 58 | return self._seed 59 | 60 | def is_loaded(self): 61 | """ 62 | Whether the model is loaded in GPU memory. 63 | """ 64 | return self._model is not None 65 | 66 | def load(self): 67 | """ 68 | Effectively loads the model to GPU memory given the weights file that was passed in the 69 | constructor. 70 | """ 71 | if self._low_mem: 72 | raise Exception("Cannot load the synthesizer permanently in low mem mode") 73 | tf.compat.v1.reset_default_graph() 74 | self._model = Tacotron2(self.checkpoint_fpath, hparams, seed=self._seed) 75 | 76 | def synthesize_spectrograms(self, texts: List[str], 77 | embeddings: Union[np.ndarray, List[np.ndarray]], 78 | return_alignments=False): 79 | """ 80 | Synthesizes mel spectrograms from texts and speaker embeddings. 
81 | 82 | :param texts: a list of N text prompts to be synthesized 83 | :param embeddings: a numpy array or list of speaker embeddings of shape (N, 256) 84 | :param return_alignments: if True, a matrix representing the alignments between the 85 | characters 86 | and each decoder output step will be returned for each spectrogram 87 | :return: a list of N melspectrograms as numpy arrays of shape (80, Mi), where Mi is the 88 | sequence length of spectrogram i, and possibly the alignments. 89 | """ 90 | if not self._low_mem: 91 | # Usual inference mode: load the model on the first request and keep it loaded. 92 | # Reload it every time for deterministic operation if seed specified. 93 | if not self.is_loaded() or self._seed is not None: 94 | self.load() 95 | specs, alignments = self._model.my_synthesize(embeddings, texts) 96 | else: 97 | # Low memory inference mode: load the model upon every request. The model has to be 98 | # loaded in a separate process to be able to release GPU memory (a simple workaround 99 | # to tensorflow's intricacies) 100 | specs, alignments = Pool(1).starmap(Synthesizer._one_shot_synthesize_spectrograms, 101 | [(self.checkpoint_fpath, embeddings, texts)])[0] 102 | 103 | return (specs, alignments) if return_alignments else specs 104 | 105 | @staticmethod 106 | def _one_shot_synthesize_spectrograms(checkpoint_fpath, embeddings, texts): 107 | # Load the model and forward the inputs 108 | tf.compat.v1.reset_default_graph() 109 | model = Tacotron2(checkpoint_fpath, hparams, seed=self._seed) 110 | specs, alignments = model.my_synthesize(embeddings, texts) 111 | 112 | # Detach the outputs (not doing so will cause the process to hang) 113 | specs, alignments = [spec.copy() for spec in specs], alignments.copy() 114 | 115 | # Close cuda for this process 116 | model.session.close() 117 | numba.cuda.select_device(0) 118 | numba.cuda.close() 119 | 120 | return specs, alignments 121 | 122 | @staticmethod 123 | def load_preprocess_wav(fpath): 124 | """ 125 | Loads and preprocesses an audio file under the same conditions the audio files were used to 126 | train the synthesizer. 127 | """ 128 | wav = librosa.load(str(fpath), hparams.sample_rate)[0] 129 | if hparams.rescale: 130 | wav = wav / np.abs(wav).max() * hparams.rescaling_max 131 | return wav 132 | 133 | @staticmethod 134 | def make_spectrogram(fpath_or_wav: Union[str, Path, np.ndarray]): 135 | """ 136 | Creates a mel spectrogram from an audio file in the same manner as the mel spectrograms that 137 | were fed to the synthesizer when training. 138 | """ 139 | if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path): 140 | wav = Synthesizer.load_preprocess_wav(fpath_or_wav) 141 | else: 142 | wav = fpath_or_wav 143 | 144 | mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32) 145 | return mel_spectrogram 146 | 147 | @staticmethod 148 | def griffin_lim(mel): 149 | """ 150 | Inverts a mel spectrogram using Griffin-Lim. The mel spectrogram is expected to have been built 151 | with the same parameters present in hparams.py. 
152 | """ 153 | return audio.inv_mel_spectrogram(mel, hparams) 154 | -------------------------------------------------------------------------------- /synthesizer/infolog.py: -------------------------------------------------------------------------------- 1 | import atexit 2 | import json 3 | from datetime import datetime 4 | from threading import Thread 5 | from urllib.request import Request, urlopen 6 | 7 | _format = "%Y-%m-%d %H:%M:%S.%f" 8 | _file = None 9 | _run_name = None 10 | _slack_url = None 11 | 12 | 13 | def init(filename, run_name, slack_url=None): 14 | global _file, _run_name, _slack_url 15 | _close_logfile() 16 | _file = open(filename, "a") 17 | _file = open(filename, "a") 18 | _file.write("\n-----------------------------------------------------------------\n") 19 | _file.write("Starting new {} training run\n".format(run_name)) 20 | _file.write("-----------------------------------------------------------------\n") 21 | _run_name = run_name 22 | _slack_url = slack_url 23 | 24 | 25 | def log(msg, end="\n", slack=False): 26 | print(msg, end=end) 27 | if _file is not None: 28 | _file.write("[%s] %s\n" % (datetime.now().strftime(_format)[:-3], msg)) 29 | if slack and _slack_url is not None: 30 | Thread(target=_send_slack, args=(msg,)).start() 31 | 32 | 33 | def _close_logfile(): 34 | global _file 35 | if _file is not None: 36 | _file.close() 37 | _file = None 38 | 39 | 40 | def _send_slack(msg): 41 | req = Request(_slack_url) 42 | req.add_header("Content-Type", "application/json") 43 | urlopen(req, json.dumps({ 44 | "username": "tacotron", 45 | "icon_emoji": ":taco:", 46 | "text": "*%s*: %s" % (_run_name, msg) 47 | }).encode()) 48 | 49 | 50 | atexit.register(_close_logfile) 51 | -------------------------------------------------------------------------------- /synthesizer/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .tacotron import Tacotron 2 | 3 | 4 | def create_model(name, hparams): 5 | if name == "Tacotron": 6 | return Tacotron(hparams) 7 | else: 8 | raise Exception("Unknown model: " + name) 9 | -------------------------------------------------------------------------------- /synthesizer/models/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/synthesizer/models/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /synthesizer/models/__pycache__/architecture_wrappers.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/synthesizer/models/__pycache__/architecture_wrappers.cpython-37.pyc -------------------------------------------------------------------------------- /synthesizer/models/__pycache__/attention.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/synthesizer/models/__pycache__/attention.cpython-37.pyc -------------------------------------------------------------------------------- /synthesizer/models/__pycache__/custom_decoder.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/synthesizer/models/__pycache__/custom_decoder.cpython-37.pyc -------------------------------------------------------------------------------- /synthesizer/models/__pycache__/helpers.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/synthesizer/models/__pycache__/helpers.cpython-37.pyc -------------------------------------------------------------------------------- /synthesizer/models/__pycache__/modules.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/synthesizer/models/__pycache__/modules.cpython-37.pyc -------------------------------------------------------------------------------- /synthesizer/models/__pycache__/tacotron.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/synthesizer/models/__pycache__/tacotron.cpython-37.pyc -------------------------------------------------------------------------------- /synthesizer/models/architecture_wrappers.py: -------------------------------------------------------------------------------- 1 | """A set of wrappers useful for tacotron 2 architecture 2 | All notations and variable names were used in concordance with originial tensorflow implementation 3 | """ 4 | import collections 5 | import tensorflow as tf 6 | from synthesizer.models.attention import _compute_attention 7 | from tensorflow.contrib.rnn import RNNCell 8 | from tensorflow.python.framework import ops, tensor_shape 9 | from tensorflow.python.ops import array_ops, check_ops, rnn_cell_impl, tensor_array_ops 10 | from tensorflow.python.util import nest 11 | 12 | _zero_state_tensors = rnn_cell_impl._zero_state_tensors 13 | 14 | 15 | 16 | class TacotronEncoderCell(RNNCell): 17 | """Tacotron 2 Encoder Cell 18 | Passes inputs through a stack of convolutional layers then through a bidirectional LSTM 19 | layer to predict the hidden representation vector (or memory) 20 | """ 21 | 22 | def __init__(self, convolutional_layers, lstm_layer): 23 | """Initialize encoder parameters 24 | 25 | Args: 26 | convolutional_layers: Encoder convolutional block class 27 | lstm_layer: encoder bidirectional lstm layer class 28 | """ 29 | super(TacotronEncoderCell, self).__init__() 30 | #Initialize encoder layers 31 | self._convolutions = convolutional_layers 32 | self._cell = lstm_layer 33 | 34 | def __call__(self, inputs, input_lengths=None): 35 | #Pass input sequence through a stack of convolutional layers 36 | conv_output = self._convolutions(inputs) 37 | 38 | #Extract hidden representation from encoder lstm cells 39 | hidden_representation = self._cell(conv_output, input_lengths) 40 | 41 | #For shape visualization 42 | self.conv_output_shape = conv_output.shape 43 | return hidden_representation 44 | 45 | 46 | class TacotronDecoderCellState( 47 | collections.namedtuple("TacotronDecoderCellState", 48 | ("cell_state", "attention", "time", "alignments", 49 | "alignment_history"))): 50 | """`namedtuple` storing the state of a `TacotronDecoderCell`. 
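A hedged sketch of how the encoder cell above is typically wired (EncoderConvolutions and EncoderRNN stand in for the convolutional stack and bidirectional-LSTM blocks, which are defined elsewhere; their names and arguments here are assumptions):

conv_stack = EncoderConvolutions(is_training, hparams)          # assumed convolutional block class
bi_lstm = EncoderRNN(is_training, size=256, zoneout=0.1)        # assumed bidirectional LSTM class
encoder_cell = TacotronEncoderCell(conv_stack, bi_lstm)
encoder_outputs = encoder_cell(embedded_inputs, input_lengths)  # the "memory" later queried by attention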
51 | Contains: 52 | - `cell_state`: The state of the wrapped `RNNCell` at the previous time 53 | step. 54 | - `attention`: The attention emitted at the previous time step. 55 | - `time`: int32 scalar containing the current time step. 56 | - `alignments`: A single or tuple of `Tensor`(s) containing the alignments 57 | emitted at the previous time step for each attention mechanism. 58 | - `alignment_history`: a single or tuple of `TensorArray`(s) 59 | containing alignment matrices from all time steps for each attention 60 | mechanism. Call `stack()` on each to convert to a `Tensor`. 61 | """ 62 | def replace(self, **kwargs): 63 | """Clones the current state while overwriting components provided by kwargs. 64 | """ 65 | return super(TacotronDecoderCellState, self)._replace(**kwargs) 66 | 67 | class TacotronDecoderCell(RNNCell): 68 | """Tactron 2 Decoder Cell 69 | Decodes encoder output and previous mel frames into next r frames 70 | 71 | Decoder Step i: 72 | 1) Prenet to compress last output information 73 | 2) Concat compressed inputs with previous context vector (input feeding) * 74 | 3) Decoder RNN (actual decoding) to predict current state s_{i} * 75 | 4) Compute new context vector c_{i} based on s_{i} and a cumulative sum of previous alignments * 76 | 5) Predict new output y_{i} using s_{i} and c_{i} (concatenated) 77 | 6) Predict output ys_{i} using s_{i} and c_{i} (concatenated) 78 | 79 | * : This is typically taking a vanilla LSTM, wrapping it using tensorflow"s attention wrapper, 80 | and wrap that with the prenet before doing an input feeding, and with the prediction layer 81 | that uses RNN states to project on output space. Actions marked with (*) can be replaced with 82 | tensorflow"s attention wrapper call if it was using cumulative alignments instead of previous alignments only. 83 | """ 84 | 85 | def __init__(self, prenet, attention_mechanism, rnn_cell, frame_projection, stop_projection): 86 | """Initialize decoder parameters 87 | 88 | Args: 89 | prenet: A tensorflow fully connected layer acting as the decoder pre-net 90 | attention_mechanism: A _BaseAttentionMechanism instance, usefull to 91 | learn encoder-decoder alignments 92 | rnn_cell: Instance of RNNCell, main body of the decoder 93 | frame_projection: tensorflow fully connected layer with r * num_mels output units 94 | stop_projection: tensorflow fully connected layer, expected to project to a scalar 95 | and through a sigmoid activation 96 | mask_finished: Boolean, Whether to mask decoder frames after the 97 | """ 98 | super(TacotronDecoderCell, self).__init__() 99 | #Initialize decoder layers 100 | self._prenet = prenet 101 | self._attention_mechanism = attention_mechanism 102 | self._cell = rnn_cell 103 | self._frame_projection = frame_projection 104 | self._stop_projection = stop_projection 105 | 106 | self._attention_layer_size = self._attention_mechanism.values.get_shape()[-1].value 107 | 108 | def _batch_size_checks(self, batch_size, error_message): 109 | return [check_ops.assert_equal(batch_size, 110 | self._attention_mechanism.batch_size, 111 | message=error_message)] 112 | 113 | @property 114 | def output_size(self): 115 | return self._frame_projection.shape 116 | 117 | @property 118 | def state_size(self): 119 | """The `state_size` property of `TacotronDecoderCell`. 120 | 121 | Returns: 122 | An `TacotronDecoderCell` tuple containing shapes used by this object. 
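Example (illustrative): the five constructor arguments are the entire decoder. Prenet, DecoderRNN, FrameProjection and StopProjection are assumed helper layers defined elsewhere; the attention mechanism is the LocationSensitiveAttention implemented in attention.py below.

decoder_cell = TacotronDecoderCell(
    prenet=Prenet(is_training, layers_sizes=[256, 256]),                           # assumed helper layer
    attention_mechanism=attention_mechanism,                                       # LocationSensitiveAttention instance
    rnn_cell=DecoderRNN(is_training, layers=2, size=1024),                         # assumed helper layer
    frame_projection=FrameProjection(hparams.num_mels * hparams.outputs_per_step),
    stop_projection=StopProjection(is_training, shape=hparams.outputs_per_step))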
123 | """ 124 | return TacotronDecoderCellState( 125 | cell_state=self._cell._cell.state_size, 126 | time=tensor_shape.TensorShape([]), 127 | attention=self._attention_layer_size, 128 | alignments=self._attention_mechanism.alignments_size, 129 | alignment_history=()) 130 | 131 | def zero_state(self, batch_size, dtype): 132 | """Return an initial (zero) state tuple for this `AttentionWrapper`. 133 | 134 | Args: 135 | batch_size: `0D` integer tensor: the batch size. 136 | dtype: The internal state data type. 137 | Returns: 138 | An `TacotronDecoderCellState` tuple containing zeroed out tensors and, 139 | possibly, empty `TensorArray` objects. 140 | Raises: 141 | ValueError: (or, possibly at runtime, InvalidArgument), if 142 | `batch_size` does not match the output size of the encoder passed 143 | to the wrapper object at initialization time. 144 | """ 145 | with ops.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]): 146 | cell_state = self._cell._cell.zero_state(batch_size, dtype) 147 | error_message = ( 148 | "When calling zero_state of TacotronDecoderCell %s: " % self._base_name + 149 | "Non-matching batch sizes between the memory " 150 | "(encoder output) and the requested batch size.") 151 | with ops.control_dependencies( 152 | self._batch_size_checks(batch_size, error_message)): 153 | cell_state = nest.map_structure( 154 | lambda s: array_ops.identity(s, name="checked_cell_state"), 155 | cell_state) 156 | return TacotronDecoderCellState( 157 | cell_state=cell_state, 158 | time=array_ops.zeros([], dtype=tf.int32), 159 | attention=_zero_state_tensors(self._attention_layer_size, batch_size, 160 | dtype), 161 | alignments=self._attention_mechanism.initial_alignments(batch_size, dtype), 162 | alignment_history=tensor_array_ops.TensorArray(dtype=dtype, size=0, 163 | dynamic_size=True)) 164 | 165 | def __call__(self, inputs, state): 166 | #Information bottleneck (essential for learning attention) 167 | prenet_output = self._prenet(inputs) 168 | 169 | #Concat context vector and prenet output to form LSTM cells input (input feeding) 170 | LSTM_input = tf.concat([prenet_output, state.attention], axis=-1) 171 | 172 | #Unidirectional LSTM layers 173 | LSTM_output, next_cell_state = self._cell(LSTM_input, state.cell_state) 174 | 175 | 176 | #Compute the attention (context) vector and alignments using 177 | #the new decoder cell hidden state as query vector 178 | #and cumulative alignments to extract location features 179 | #The choice of the new cell hidden state (s_{i}) of the last 180 | #decoder RNN Cell is based on Luong et Al. 
(2015): 181 | #https://arxiv.org/pdf/1508.04025.pdf 182 | previous_alignments = state.alignments 183 | previous_alignment_history = state.alignment_history 184 | context_vector, alignments, cumulated_alignments = _compute_attention(self._attention_mechanism, 185 | LSTM_output, 186 | previous_alignments, 187 | attention_layer=None) 188 | 189 | #Concat LSTM outputs and context vector to form projections inputs 190 | projections_input = tf.concat([LSTM_output, context_vector], axis=-1) 191 | 192 | #Compute predicted frames and predicted 193 | cell_outputs = self._frame_projection(projections_input) 194 | stop_tokens = self._stop_projection(projections_input) 195 | 196 | #Save alignment history 197 | alignment_history = previous_alignment_history.write(state.time, alignments) 198 | 199 | #Prepare next decoder state 200 | next_state = TacotronDecoderCellState( 201 | time=state.time + 1, 202 | cell_state=next_cell_state, 203 | attention=context_vector, 204 | alignments=cumulated_alignments, 205 | alignment_history=alignment_history) 206 | 207 | return (cell_outputs, stop_tokens), next_state 208 | -------------------------------------------------------------------------------- /synthesizer/models/attention.py: -------------------------------------------------------------------------------- 1 | """Attention file for location based attention (compatible with tensorflow attention wrapper)""" 2 | 3 | import tensorflow as tf 4 | from tensorflow.contrib.seq2seq.python.ops.attention_wrapper import BahdanauAttention 5 | from tensorflow.python.layers import core as layers_core 6 | from tensorflow.python.ops import array_ops, math_ops, nn_ops, variable_scope 7 | 8 | 9 | #From https://github.com/tensorflow/tensorflow/blob/r1.7/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py 10 | def _compute_attention(attention_mechanism, cell_output, attention_state, 11 | attention_layer): 12 | """Computes the attention and alignments for a given attention_mechanism.""" 13 | alignments, next_attention_state = attention_mechanism( 14 | cell_output, state=attention_state) 15 | 16 | # Reshape from [batch_size, memory_time] to [batch_size, 1, memory_time] 17 | expanded_alignments = array_ops.expand_dims(alignments, 1) 18 | # Context is the inner product of alignments and values along the 19 | # memory time dimension. 20 | # alignments shape is 21 | # [batch_size, 1, memory_time] 22 | # attention_mechanism.values shape is 23 | # [batch_size, memory_time, memory_size] 24 | # the batched matmul is over memory_time, so the output shape is 25 | # [batch_size, 1, memory_size]. 26 | # we then squeeze out the singleton dim. 27 | context = math_ops.matmul(expanded_alignments, attention_mechanism.values) 28 | context = array_ops.squeeze(context, [1]) 29 | 30 | if attention_layer is not None: 31 | attention = attention_layer(array_ops.concat([cell_output, context], 1)) 32 | else: 33 | attention = context 34 | 35 | return attention, alignments, next_attention_state 36 | 37 | 38 | def _location_sensitive_score(W_query, W_fil, W_keys): 39 | """Impelements Bahdanau-style (cumulative) scoring function. 40 | This attention is described in: 41 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben- 42 | gio, “Attention-based models for speech recognition,” in Ad- 43 | vances in Neural Information Processing Systems, 2015, pp. 44 | 577–585. 
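As a shape check, the hybrid score computed below can be mimicked in plain NumPy (sizes are illustrative):

import numpy as np

B, T, D = 2, 7, 128                     # batch, memory (encoder) time, attention depth
W_query = np.random.randn(B, 1, D)      # processed decoder query, broadcast over time
W_fil = np.random.randn(B, T, D)        # processed location features f = F * alpha_{i-1}
W_keys = np.random.randn(B, T, D)       # processed encoder outputs
v_a, b_a = np.random.randn(D), np.zeros(D)
energy = np.sum(v_a * np.tanh(W_keys + W_query + W_fil + b_a), axis=2)  # shape (B, T)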
45 | 46 | ############################################################################# 47 | hybrid attention (content-based + location-based) 48 | f = F * α_{i-1} 49 | energy = dot(v_a, tanh(W_keys(h_enc) + W_query(h_dec) + W_fil(f) + b_a)) 50 | ############################################################################# 51 | 52 | Args: 53 | W_query: Tensor, shape "[batch_size, 1, attention_dim]" to compare to location features. 54 | W_location: processed previous alignments into location features, shape "[batch_size, max_time, attention_dim]" 55 | W_keys: Tensor, shape "[batch_size, max_time, attention_dim]", typically the encoder outputs. 56 | Returns: 57 | A "[batch_size, max_time]" attention score (energy) 58 | """ 59 | # Get the number of hidden units from the trailing dimension of keys 60 | dtype = W_query.dtype 61 | num_units = W_keys.shape[-1].value or array_ops.shape(W_keys)[-1] 62 | 63 | v_a = tf.compat.v1.get_variable( 64 | "attention_variable_projection", shape=[num_units], dtype=dtype, 65 | initializer=tf.contrib.layers.xavier_initializer()) 66 | b_a = tf.compat.v1.get_variable( 67 | "attention_bias", shape=[num_units], dtype=dtype, 68 | initializer=tf.zeros_initializer()) 69 | 70 | return tf.reduce_sum(v_a * tf.tanh(W_keys + W_query + W_fil + b_a), [2]) 71 | 72 | def _smoothing_normalization(e): 73 | """Applies a smoothing normalization function instead of softmax 74 | Introduced in: 75 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben- 76 | gio, “Attention-based models for speech recognition,” in Ad- 77 | vances in Neural Information Processing Systems, 2015, pp. 78 | 577–585. 79 | 80 | ############################################################################ 81 | Smoothing normalization function 82 | a_{i, j} = sigmoid(e_{i, j}) / sum_j(sigmoid(e_{i, j})) 83 | ############################################################################ 84 | 85 | Args: 86 | e: matrix [batch_size, max_time(memory_time)]: expected to be energy (score) 87 | values of an attention mechanism 88 | Returns: 89 | matrix [batch_size, max_time]: [0, 1] normalized alignments with possible 90 | attendance to multiple memory time steps. 91 | """ 92 | return tf.nn.sigmoid(e) / tf.reduce_sum(tf.nn.sigmoid(e), axis=-1, keepdims=True) 93 | 94 | 95 | class LocationSensitiveAttention(BahdanauAttention): 96 | """Impelements Bahdanau-style (cumulative) scoring function. 97 | Usually referred to as "hybrid" attention (content-based + location-based) 98 | Extends the additive attention described in: 99 | "D. Bahdanau, K. Cho, and Y. Bengio, “Neural machine transla- 100 | tion by jointly learning to align and translate,” in Proceedings 101 | of ICLR, 2015." 102 | to use previous alignments as additional location features. 103 | 104 | This attention is described in: 105 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben- 106 | gio, “Attention-based models for speech recognition,” in Ad- 107 | vances in Neural Information Processing Systems, 2015, pp. 108 | 577–585. 109 | """ 110 | 111 | def __init__(self, 112 | num_units, 113 | memory, 114 | hparams, 115 | mask_encoder=True, 116 | memory_sequence_length=None, 117 | smoothing=False, 118 | cumulate_weights=True, 119 | name="LocationSensitiveAttention"): 120 | """Construct the Attention mechanism. 121 | Args: 122 | num_units: The depth of the query mechanism. 123 | memory: The memory to query; usually the output of an RNN encoder. This 124 | tensor should be shaped `[batch_size, max_time, ...]`. 
125 | mask_encoder (optional): Boolean, whether to mask encoder paddings. 126 | memory_sequence_length (optional): Sequence lengths for the batch entries 127 | in memory. If provided, the memory tensor rows are masked with zeros 128 | for values past the respective sequence lengths. Only relevant if mask_encoder = True. 129 | smoothing (optional): Boolean. Determines which normalization function to use. 130 | Default normalization function (probablity_fn) is softmax. If smoothing is 131 | enabled, we replace softmax with: 132 | a_{i, j} = sigmoid(e_{i, j}) / sum_j(sigmoid(e_{i, j})) 133 | Introduced in: 134 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben- 135 | gio, “Attention-based models for speech recognition,” in Ad- 136 | vances in Neural Information Processing Systems, 2015, pp. 137 | 577–585. 138 | This is mainly used if the model wants to attend to multiple input parts 139 | at the same decoding step. We probably won"t be using it since multiple sound 140 | frames may depend on the same character/phone, probably not the way around. 141 | Note: 142 | We still keep it implemented in case we want to test it. They used it in the 143 | paper in the context of speech recognition, where one phoneme may depend on 144 | multiple subsequent sound frames. 145 | name: Name to use when creating ops. 146 | """ 147 | #Create normalization function 148 | #Setting it to None defaults in using softmax 149 | normalization_function = _smoothing_normalization if (smoothing == True) else None 150 | memory_length = memory_sequence_length if (mask_encoder==True) else None 151 | super(LocationSensitiveAttention, self).__init__( 152 | num_units=num_units, 153 | memory=memory, 154 | memory_sequence_length=memory_length, 155 | probability_fn=normalization_function, 156 | name=name) 157 | 158 | self.location_convolution = tf.compat.v1.layers.Conv1D(filters=hparams.attention_filters, 159 | kernel_size=hparams.attention_kernel, padding="same", use_bias=True, 160 | bias_initializer=tf.zeros_initializer(), name="location_features_convolution") 161 | self.location_layer = tf.compat.v1.layers.Dense(units=num_units, use_bias=False, 162 | dtype=tf.float32, name="location_features_layer") 163 | self._cumulate = cumulate_weights 164 | 165 | def __call__(self, query, state): 166 | """Score the query based on the keys and values. 167 | Args: 168 | query: Tensor of dtype matching `self.values` and shape 169 | `[batch_size, query_depth]`. 170 | state (previous alignments): Tensor of dtype matching `self.values` and shape 171 | `[batch_size, alignments_size]` 172 | (`alignments_size` is memory"s `max_time`). 173 | Returns: 174 | alignments: Tensor of dtype matching `self.values` and shape 175 | `[batch_size, alignments_size]` (`alignments_size` is memory's 176 | `max_time`). 
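Example (illustrative): constructing the mechanism only requires the encoder memory and the hparams object carrying attention_filters / attention_kernel; encoder_outputs and input_lengths are assumed tensors produced by the encoder.

attention_mechanism = LocationSensitiveAttention(
    num_units=128,                          # depth of the query/location projections
    memory=encoder_outputs,                 # [batch_size, max_time, encoder_dim]
    hparams=hparams,                        # provides attention_filters and attention_kernel
    memory_sequence_length=input_lengths,   # masks encoder paddings (mask_encoder=True by default)
    smoothing=False,                        # keep the default softmax normalization
    cumulate_weights=True)                  # accumulate alignments across decoder steps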
177 | """ 178 | previous_alignments = state 179 | with variable_scope.variable_scope(None, "Location_Sensitive_Attention", [query]): 180 | 181 | # processed_query shape [batch_size, query_depth] -> [batch_size, attention_dim] 182 | processed_query = self.query_layer(query) if self.query_layer else query 183 | # -> [batch_size, 1, attention_dim] 184 | processed_query = tf.expand_dims(processed_query, 1) 185 | 186 | # processed_location_features shape [batch_size, max_time, attention dimension] 187 | # [batch_size, max_time] -> [batch_size, max_time, 1] 188 | expanded_alignments = tf.expand_dims(previous_alignments, axis=2) 189 | # location features [batch_size, max_time, filters] 190 | f = self.location_convolution(expanded_alignments) 191 | # Projected location features [batch_size, max_time, attention_dim] 192 | processed_location_features = self.location_layer(f) 193 | 194 | # energy shape [batch_size, max_time] 195 | energy = _location_sensitive_score(processed_query, processed_location_features, self.keys) 196 | 197 | 198 | # alignments shape = energy shape = [batch_size, max_time] 199 | alignments = self._probability_fn(energy, previous_alignments) 200 | 201 | # Cumulate alignments 202 | if self._cumulate: 203 | next_state = alignments + previous_alignments 204 | else: 205 | next_state = alignments 206 | 207 | return alignments, next_state 208 | -------------------------------------------------------------------------------- /synthesizer/models/custom_decoder.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | import collections 3 | import tensorflow as tf 4 | from synthesizer.models.helpers import TacoTestHelper, TacoTrainingHelper 5 | from tensorflow.contrib.seq2seq.python.ops import decoder 6 | from tensorflow.contrib.seq2seq.python.ops import helper as helper_py 7 | from tensorflow.python.framework import ops, tensor_shape 8 | from tensorflow.python.layers import base as layers_base 9 | from tensorflow.python.ops import rnn_cell_impl 10 | from tensorflow.python.util import nest 11 | 12 | 13 | class CustomDecoderOutput( 14 | collections.namedtuple("CustomDecoderOutput", ("rnn_output", "token_output", "sample_id"))): 15 | pass 16 | 17 | 18 | class CustomDecoder(decoder.Decoder): 19 | """Custom sampling decoder. 20 | 21 | Allows for stop token prediction at inference time 22 | and returns equivalent loss in training time. 23 | 24 | Note: 25 | Only use this decoder with Tacotron 2 as it only accepts tacotron custom helpers 26 | """ 27 | 28 | def __init__(self, cell, helper, initial_state, output_layer=None): 29 | """Initialize CustomDecoder. 30 | Args: 31 | cell: An `RNNCell` instance. 32 | helper: A `Helper` instance. 33 | initial_state: A (possibly nested tuple of...) tensors and TensorArrays. 34 | The initial state of the RNNCell. 35 | output_layer: (Optional) An instance of `tf.layers.Layer`, i.e., 36 | `tf.layers.Dense`. Optional layer to apply to the RNN output prior 37 | to storing the result or sampling. 38 | Raises: 39 | TypeError: if `cell`, `helper` or `output_layer` have an incorrect type. 
40 | """ 41 | rnn_cell_impl.assert_like_rnncell(type(cell), cell) 42 | if not isinstance(helper, helper_py.Helper): 43 | raise TypeError("helper must be a Helper, received: %s" % type(helper)) 44 | if (output_layer is not None 45 | and not isinstance(output_layer, layers_base.Layer)): 46 | raise TypeError( 47 | "output_layer must be a Layer, received: %s" % type(output_layer)) 48 | self._cell = cell 49 | self._helper = helper 50 | self._initial_state = initial_state 51 | self._output_layer = output_layer 52 | 53 | @property 54 | def batch_size(self): 55 | return self._helper.batch_size 56 | 57 | def _rnn_output_size(self): 58 | size = self._cell.output_size 59 | if self._output_layer is None: 60 | return size 61 | else: 62 | # To use layer"s compute_output_shape, we need to convert the 63 | # RNNCell"s output_size entries into shapes with an unknown 64 | # batch size. We then pass this through the layer"s 65 | # compute_output_shape and read off all but the first (batch) 66 | # dimensions to get the output size of the rnn with the layer 67 | # applied to the top. 68 | output_shape_with_unknown_batch = nest.map_structure( 69 | lambda s: tensor_shape.TensorShape([None]).concatenate(s), 70 | size) 71 | layer_output_shape = self._output_layer._compute_output_shape( # pylint: disable=protected-access 72 | output_shape_with_unknown_batch) 73 | return nest.map_structure(lambda s: s[1:], layer_output_shape) 74 | 75 | @property 76 | def output_size(self): 77 | # Return the cell output and the id 78 | return CustomDecoderOutput( 79 | rnn_output=self._rnn_output_size(), 80 | token_output=self._helper.token_output_size, 81 | sample_id=self._helper.sample_ids_shape) 82 | 83 | @property 84 | def output_dtype(self): 85 | # Assume the dtype of the cell is the output_size structure 86 | # containing the input_state"s first component's dtype. 87 | # Return that structure and the sample_ids_dtype from the helper. 88 | dtype = nest.flatten(self._initial_state)[0].dtype 89 | return CustomDecoderOutput( 90 | nest.map_structure(lambda _: dtype, self._rnn_output_size()), 91 | tf.float32, 92 | self._helper.sample_ids_dtype) 93 | 94 | def initialize(self, name=None): 95 | """Initialize the decoder. 96 | Args: 97 | name: Name scope for any created operations. 98 | Returns: 99 | `(finished, first_inputs, initial_state)`. 100 | """ 101 | return self._helper.initialize() + (self._initial_state,) 102 | 103 | def step(self, time, inputs, state, name=None): 104 | """Perform a custom decoding step. 105 | Enables for dyanmic prediction 106 | Args: 107 | time: scalar `int32` tensor. 108 | inputs: A (structure of) input tensors. 109 | state: A (structure of) state tensors and TensorArrays. 110 | name: Name scope for any created operations. 111 | Returns: 112 | `(outputs, next_state, next_inputs, finished)`. 
113 | """ 114 | with ops.name_scope(name, "CustomDecoderStep", (time, inputs, state)): 115 | #Call outputprojection wrapper cell 116 | (cell_outputs, stop_token), cell_state = self._cell(inputs, state) 117 | 118 | #apply output_layer (if existant) 119 | if self._output_layer is not None: 120 | cell_outputs = self._output_layer(cell_outputs) 121 | sample_ids = self._helper.sample( 122 | time=time, outputs=cell_outputs, state=cell_state) 123 | 124 | (finished, next_inputs, next_state) = self._helper.next_inputs( 125 | time=time, 126 | outputs=cell_outputs, 127 | state=cell_state, 128 | sample_ids=sample_ids, 129 | stop_token_prediction=stop_token) 130 | 131 | outputs = CustomDecoderOutput(cell_outputs, stop_token, sample_ids) 132 | return (outputs, next_state, next_inputs, finished) 133 | -------------------------------------------------------------------------------- /synthesizer/models/helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from tensorflow.contrib.seq2seq import Helper 4 | 5 | 6 | class TacoTestHelper(Helper): 7 | def __init__(self, batch_size, hparams): 8 | with tf.name_scope("TacoTestHelper"): 9 | self._batch_size = batch_size 10 | self._output_dim = hparams.num_mels 11 | self._reduction_factor = hparams.outputs_per_step 12 | self.stop_at_any = hparams.stop_at_any 13 | 14 | @property 15 | def batch_size(self): 16 | return self._batch_size 17 | 18 | @property 19 | def token_output_size(self): 20 | return self._reduction_factor 21 | 22 | @property 23 | def sample_ids_shape(self): 24 | return tf.TensorShape([]) 25 | 26 | @property 27 | def sample_ids_dtype(self): 28 | return np.int32 29 | 30 | def initialize(self, name=None): 31 | return (tf.tile([False], [self._batch_size]), _go_frames(self._batch_size, self._output_dim)) 32 | 33 | def sample(self, time, outputs, state, name=None): 34 | return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them 35 | 36 | def next_inputs(self, time, outputs, state, sample_ids, stop_token_prediction, name=None): 37 | """Stop on EOS. Otherwise, pass the last output as the next input and pass through state.""" 38 | with tf.name_scope("TacoTestHelper"): 39 | #A sequence is finished when the output probability is > 0.5 40 | finished = tf.cast(tf.round(stop_token_prediction), tf.bool) 41 | 42 | #Since we are predicting r frames at each step, two modes are 43 | #then possible: 44 | # Stop when the model outputs a p > 0.5 for any frame between r frames (Recommended) 45 | # Stop when the model outputs a p > 0.5 for all r frames (Safer) 46 | #Note: 47 | # With enough training steps, the model should be able to predict when to stop correctly 48 | # and the use of stop_at_any = True would be recommended. If however the model didn"t 49 | # learn to stop correctly yet, (stops too soon) one could choose to use the safer option 50 | # to get a correct synthesis 51 | if self.stop_at_any: 52 | finished = tf.reduce_any(tf.reduce_all(finished, axis=0)) #Recommended 53 | else: 54 | finished = tf.reduce_all(tf.reduce_all(finished, axis=0)) #Safer option 55 | 56 | # Feed last output frame as next input. 
outputs is [N, output_dim * r] 57 | next_inputs = outputs[:, -self._output_dim:] 58 | next_state = state 59 | return (finished, next_inputs, next_state) 60 | 61 | 62 | class TacoTrainingHelper(Helper): 63 | def __init__(self, batch_size, targets, hparams, gta, evaluating, global_step): 64 | # inputs is [N, T_in], targets is [N, T_out, D] 65 | with tf.name_scope("TacoTrainingHelper"): 66 | self._batch_size = batch_size 67 | self._output_dim = hparams.num_mels 68 | self._reduction_factor = hparams.outputs_per_step 69 | self._ratio = tf.convert_to_tensor(hparams.tacotron_teacher_forcing_ratio) 70 | self.gta = gta 71 | self.eval = evaluating 72 | self._hparams = hparams 73 | self.global_step = global_step 74 | 75 | r = self._reduction_factor 76 | # Feed every r-th target frame as input 77 | self._targets = targets[:, r-1::r, :] 78 | 79 | #Maximal sequence length 80 | self._lengths = tf.tile([tf.shape(self._targets)[1]], [self._batch_size]) 81 | 82 | @property 83 | def batch_size(self): 84 | return self._batch_size 85 | 86 | @property 87 | def token_output_size(self): 88 | return self._reduction_factor 89 | 90 | @property 91 | def sample_ids_shape(self): 92 | return tf.TensorShape([]) 93 | 94 | @property 95 | def sample_ids_dtype(self): 96 | return np.int32 97 | 98 | def initialize(self, name=None): 99 | #Compute teacher forcing ratio for this global step. 100 | #In GTA mode, override teacher forcing scheme to work with full teacher forcing 101 | if self.gta: 102 | self._ratio = tf.convert_to_tensor(1.) #Force GTA model to always feed ground-truth 103 | elif self.eval and self._hparams.natural_eval: 104 | self._ratio = tf.convert_to_tensor(0.) #Force eval model to always feed predictions 105 | else: 106 | if self._hparams.tacotron_teacher_forcing_mode == "scheduled": 107 | self._ratio = _teacher_forcing_ratio_decay(self._hparams.tacotron_teacher_forcing_init_ratio, 108 | self.global_step, self._hparams) 109 | 110 | return (tf.tile([False], [self._batch_size]), _go_frames(self._batch_size, self._output_dim)) 111 | 112 | def sample(self, time, outputs, state, name=None): 113 | return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them 114 | 115 | def next_inputs(self, time, outputs, state, sample_ids, stop_token_prediction, name=None): 116 | with tf.name_scope(name or "TacoTrainingHelper"): 117 | #synthesis stop (we let the model see paddings as we mask them when computing loss functions) 118 | finished = (time + 1 >= self._lengths) 119 | 120 | #Pick previous outputs randomly with respect to teacher forcing ratio 121 | next_inputs = tf.cond( 122 | tf.less(tf.random.uniform([], minval=0, maxval=1, dtype=tf.float32), self._ratio), 123 | lambda: self._targets[:, time, :], #Teacher-forcing: return true frame 124 | lambda: outputs[:,-self._output_dim:]) 125 | 126 | #Pass on state 127 | next_state = state 128 | return (finished, next_inputs, next_state) 129 | 130 | 131 | def _go_frames(batch_size, output_dim): 132 | """Returns all-zero frames for a given batch size and output dimension""" 133 | return tf.tile([[0.0]], [batch_size, output_dim]) 134 | 135 | def _teacher_forcing_ratio_decay(init_tfr, global_step, hparams): 136 | ################################################################# 137 | # Narrow Cosine Decay: 138 | 139 | # Phase 1: tfr = 1 140 | # We only start learning rate decay after 10k steps 141 | 142 | # Phase 2: tfr in ]0, 1[ 143 | # decay reach minimal value at step ~280k 144 | 145 | # Phase 3: tfr = 0 146 | # clip by minimal teacher forcing ratio value (step >~ 280k) 147 
| ################################################################# 148 | #Compute natural cosine decay 149 | tfr = tf.train.cosine_decay(init_tfr, 150 | global_step=global_step - hparams.tacotron_teacher_forcing_start_decay, #tfr = 1 at step 10k 151 | decay_steps=hparams.tacotron_teacher_forcing_decay_steps, #tfr = 0 at step ~280k 152 | alpha=hparams.tacotron_teacher_forcing_decay_alpha, #tfr = 0% of init_tfr as final value 153 | name="tfr_cosine_decay") 154 | 155 | #force teacher forcing ratio to take initial value when global step < start decay step. 156 | narrow_tfr = tf.cond( 157 | tf.less(global_step, tf.convert_to_tensor(hparams.tacotron_teacher_forcing_start_decay)), 158 | lambda: tf.convert_to_tensor(init_tfr), 159 | lambda: tfr) 160 | 161 | return narrow_tfr -------------------------------------------------------------------------------- /synthesizer/synthesize.py: -------------------------------------------------------------------------------- 1 | from synthesizer.tacotron2 import Tacotron2 2 | from synthesizer.hparams import hparams_debug_string 3 | from synthesizer.infolog import log 4 | import tensorflow as tf 5 | from tqdm import tqdm 6 | import time 7 | import os 8 | 9 | 10 | def run_eval(args, checkpoint_path, output_dir, hparams, sentences): 11 | eval_dir = os.path.join(output_dir, "eval") 12 | log_dir = os.path.join(output_dir, "logs-eval") 13 | 14 | #Create output path if it doesn"t exist 15 | os.makedirs(eval_dir, exist_ok=True) 16 | os.makedirs(log_dir, exist_ok=True) 17 | os.makedirs(os.path.join(log_dir, "wavs"), exist_ok=True) 18 | os.makedirs(os.path.join(log_dir, "plots"), exist_ok=True) 19 | 20 | log(hparams_debug_string()) 21 | synth = Tacotron2(checkpoint_path, hparams) 22 | 23 | #Set inputs batch wise 24 | sentences = [sentences[i: i+hparams.tacotron_synthesis_batch_size] for i 25 | in range(0, len(sentences), hparams.tacotron_synthesis_batch_size)] 26 | 27 | log("Starting Synthesis") 28 | with open(os.path.join(eval_dir, "map.txt"), "w") as file: 29 | for i, texts in enumerate(tqdm(sentences)): 30 | start = time.time() 31 | basenames = ["batch_{}_sentence_{}".format(i, j) for j in range(len(texts))] 32 | mel_filenames, speaker_ids = synth.synthesize(texts, basenames, eval_dir, log_dir, None) 33 | 34 | for elems in zip(texts, mel_filenames, speaker_ids): 35 | file.write("|".join([str(x) for x in elems]) + "\n") 36 | log("synthesized mel spectrograms at {}".format(eval_dir)) 37 | return eval_dir 38 | 39 | def run_synthesis(in_dir, out_dir, model_dir, hparams): 40 | synth_dir = os.path.join(out_dir, "mels_gta") 41 | os.makedirs(synth_dir, exist_ok=True) 42 | metadata_filename = os.path.join(in_dir, "train.txt") 43 | print(hparams_debug_string()) 44 | 45 | # Load the model in memory 46 | weights_dir = os.path.join(model_dir, "taco_pretrained") 47 | checkpoint_fpath = tf.train.get_checkpoint_state(weights_dir).model_checkpoint_path 48 | synth = Tacotron2(checkpoint_fpath, hparams, gta=True) 49 | 50 | # Load the metadata 51 | with open(metadata_filename, encoding="utf-8") as f: 52 | metadata = [line.strip().split("|") for line in f] 53 | frame_shift_ms = hparams.hop_size / hparams.sample_rate 54 | hours = sum([int(x[4]) for x in metadata]) * frame_shift_ms / 3600 55 | print("Loaded metadata for {} examples ({:.2f} hours)".format(len(metadata), hours)) 56 | 57 | #Set inputs batch wise 58 | metadata = [metadata[i: i + hparams.tacotron_synthesis_batch_size] for i in 59 | range(0, len(metadata), hparams.tacotron_synthesis_batch_size)] 60 | # TODO: come on big boy, fix 
this 61 | # Quick and dirty fix to make sure that all batches have the same size 62 | metadata = metadata[:-1] 63 | 64 | print("Starting Synthesis") 65 | mel_dir = os.path.join(in_dir, "mels") 66 | embed_dir = os.path.join(in_dir, "embeds") 67 | meta_out_fpath = os.path.join(out_dir, "synthesized.txt") 68 | with open(meta_out_fpath, "w") as file: 69 | for i, meta in enumerate(tqdm(metadata)): 70 | texts = [m[5] for m in meta] 71 | mel_filenames = [os.path.join(mel_dir, m[1]) for m in meta] 72 | embed_filenames = [os.path.join(embed_dir, m[2]) for m in meta] 73 | basenames = [os.path.basename(m).replace(".npy", "").replace("mel-", "") 74 | for m in mel_filenames] 75 | synth.synthesize(texts, basenames, synth_dir, None, mel_filenames, embed_filenames) 76 | 77 | for elems in meta: 78 | file.write("|".join([str(x) for x in elems]) + "\n") 79 | 80 | print("Synthesized mel spectrograms at {}".format(synth_dir)) 81 | return meta_out_fpath 82 | 83 | -------------------------------------------------------------------------------- /synthesizer/tacotron2.py: -------------------------------------------------------------------------------- 1 | from synthesizer.utils.text import text_to_sequence 2 | from synthesizer.infolog import log 3 | from synthesizer.models import create_model 4 | from synthesizer.utils import plot 5 | from synthesizer import audio 6 | import tensorflow as tf 7 | import numpy as np 8 | import os 9 | 10 | 11 | class Tacotron2: 12 | def __init__(self, checkpoint_path, hparams, gta=False, model_name="Tacotron", seed=None): 13 | log("Constructing model: %s" % model_name) 14 | 15 | # Initialize tensorflow random number seed for deterministic operation if provided 16 | if seed is not None: 17 | tf.compat.v1.set_random_seed(seed) 18 | 19 | #Force the batch size to be known in order to use attention masking in batch synthesis 20 | inputs = tf.compat.v1.placeholder(tf.int32, (None, None), name="inputs") 21 | input_lengths = tf.compat.v1.placeholder(tf.int32, (None,), name="input_lengths") 22 | speaker_embeddings = tf.compat.v1.placeholder(tf.float32, (None, hparams.speaker_embedding_size), 23 | name="speaker_embeddings") 24 | targets = tf.compat.v1.placeholder(tf.float32, (None, None, hparams.num_mels), name="mel_targets") 25 | split_infos = tf.compat.v1.placeholder(tf.int32, shape=(hparams.tacotron_num_gpus, None), name="split_infos") 26 | with tf.compat.v1.variable_scope("Tacotron_model") as scope: 27 | self.model = create_model(model_name, hparams) 28 | if gta: 29 | self.model.initialize(inputs, input_lengths, speaker_embeddings, targets, gta=gta, 30 | split_infos=split_infos) 31 | else: 32 | self.model.initialize(inputs, input_lengths, speaker_embeddings, 33 | split_infos=split_infos) 34 | 35 | self.mel_outputs = self.model.tower_mel_outputs 36 | self.linear_outputs = self.model.tower_linear_outputs if (hparams.predict_linear and not gta) else None 37 | self.alignments = self.model.tower_alignments 38 | self.stop_token_prediction = self.model.tower_stop_token_prediction 39 | self.targets = targets 40 | 41 | self.gta = gta 42 | self._hparams = hparams 43 | #pad input sequences with the 0 ( _ ) 44 | self._pad = 0 45 | #explicitely setting the padding to a value that doesn"t originally exist in the spectogram 46 | #to avoid any possible conflicts, without affecting the output range of the model too much 47 | if hparams.symmetric_mels: 48 | self._target_pad = -hparams.max_abs_value 49 | else: 50 | self._target_pad = 0. 
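# Feed layout for the placeholders created above, for a batch of N prompts (illustrative):
#   inputs             int32   [N, max_seq_len]                     padded character-ID sequences
#   input_lengths      int32   [N]                                  unpadded sequence lengths
#   speaker_embeddings float32 [N, hparams.speaker_embedding_size]  one embedding per prompt
#   mel_targets        float32 [N, max_target_len, num_mels]        only fed in GTA mode
#   split_infos        int32   [tacotron_num_gpus, 4]               per-GPU max lengths, as fed by my_synthesize/synthesize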
51 | 52 | self.inputs = inputs 53 | self.input_lengths = input_lengths 54 | self.speaker_embeddings = speaker_embeddings 55 | self.targets = targets 56 | self.split_infos = split_infos 57 | 58 | log("Loading checkpoint: %s" % checkpoint_path) 59 | #Memory allocation on the GPUs as needed 60 | config = tf.compat.v1.ConfigProto() 61 | config.gpu_options.allow_growth = True 62 | config.allow_soft_placement = True 63 | 64 | self.session = tf.compat.v1.Session(config=config) 65 | self.session.run(tf.compat.v1.global_variables_initializer()) 66 | 67 | saver = tf.compat.v1.train.Saver() 68 | saver.restore(self.session, checkpoint_path) 69 | 70 | def my_synthesize(self, speaker_embeds, texts): 71 | """ 72 | Lighter synthesis function that directly returns the mel spectrograms. 73 | """ 74 | 75 | # Prepare the input 76 | cleaner_names = [x.strip() for x in self._hparams.cleaners.split(",")] 77 | seqs = [np.asarray(text_to_sequence(text, cleaner_names)) for text in texts] 78 | input_lengths = [len(seq) for seq in seqs] 79 | input_seqs, max_seq_len = self._prepare_inputs(seqs) 80 | split_infos = [[max_seq_len, 0, 0, 0]] 81 | feed_dict = { 82 | self.inputs: input_seqs, 83 | self.input_lengths: np.asarray(input_lengths, dtype=np.int32), 84 | self.split_infos: np.asarray(split_infos, dtype=np.int32), 85 | self.speaker_embeddings: speaker_embeds 86 | } 87 | 88 | # Forward it 89 | mels, alignments, stop_tokens = self.session.run( 90 | [self.mel_outputs, self.alignments, self.stop_token_prediction], 91 | feed_dict=feed_dict) 92 | mels, alignments, stop_tokens = list(mels[0]), alignments[0], stop_tokens[0] 93 | 94 | # Trim the output 95 | for i in range(len(mels)): 96 | try: 97 | target_length = list(np.round(stop_tokens[i])).index(1) 98 | mels[i] = mels[i][:target_length, :] 99 | except ValueError: 100 | # If no token is generated, we simply do not trim the output 101 | continue 102 | 103 | return [mel.T for mel in mels], alignments 104 | 105 | def synthesize(self, texts, basenames, out_dir, log_dir, mel_filenames, embed_filenames): 106 | hparams = self._hparams 107 | cleaner_names = [x.strip() for x in hparams.cleaners.split(",")] 108 | 109 | assert 0 == len(texts) % self._hparams.tacotron_num_gpus 110 | seqs = [np.asarray(text_to_sequence(text, cleaner_names)) for text in texts] 111 | input_lengths = [len(seq) for seq in seqs] 112 | 113 | size_per_device = len(seqs) // self._hparams.tacotron_num_gpus 114 | 115 | #Pad inputs according to each GPU max length 116 | input_seqs = None 117 | split_infos = [] 118 | for i in range(self._hparams.tacotron_num_gpus): 119 | device_input = seqs[size_per_device*i: size_per_device*(i+1)] 120 | device_input, max_seq_len = self._prepare_inputs(device_input) 121 | input_seqs = np.concatenate((input_seqs, device_input), axis=1) if input_seqs is not None else device_input 122 | split_infos.append([max_seq_len, 0, 0, 0]) 123 | 124 | feed_dict = { 125 | self.inputs: input_seqs, 126 | self.input_lengths: np.asarray(input_lengths, dtype=np.int32), 127 | } 128 | 129 | if self.gta: 130 | np_targets = [np.load(mel_filename) for mel_filename in mel_filenames] 131 | target_lengths = [len(np_target) for np_target in np_targets] 132 | 133 | #pad targets according to each GPU max length 134 | target_seqs = None 135 | for i in range(self._hparams.tacotron_num_gpus): 136 | device_target = np_targets[size_per_device*i: size_per_device*(i+1)] 137 | device_target, max_target_len = self._prepare_targets(device_target, self._hparams.outputs_per_step) 138 | target_seqs = 
np.concatenate((target_seqs, device_target), axis=1) if target_seqs is not None else device_target 139 | split_infos[i][1] = max_target_len #Not really used but setting it in case for future development maybe? 140 | 141 | feed_dict[self.targets] = target_seqs 142 | assert len(np_targets) == len(texts) 143 | 144 | feed_dict[self.split_infos] = np.asarray(split_infos, dtype=np.int32) 145 | feed_dict[self.speaker_embeddings] = [np.load(f) for f in embed_filenames] 146 | 147 | if self.gta or not hparams.predict_linear: 148 | mels, alignments, stop_tokens = self.session.run( 149 | [self.mel_outputs, self.alignments, self.stop_token_prediction], 150 | feed_dict=feed_dict) 151 | #Linearize outputs (1D arrays) 152 | mels = [mel for gpu_mels in mels for mel in gpu_mels] 153 | alignments = [align for gpu_aligns in alignments for align in gpu_aligns] 154 | stop_tokens = [token for gpu_token in stop_tokens for token in gpu_token] 155 | 156 | if not self.gta: 157 | #Natural batch synthesis 158 | #Get Mel lengths for the entire batch from stop_tokens predictions 159 | target_lengths = self._get_output_lengths(stop_tokens) 160 | 161 | #Take off the batch wise padding 162 | mels = [mel[:target_length, :] for mel, target_length in zip(mels, target_lengths)] 163 | assert len(mels) == len(texts) 164 | 165 | else: 166 | linears, mels, alignments, stop_tokens = self.session.run( 167 | [self.linear_outputs, self.mel_outputs, self.alignments, 168 | self.stop_token_prediction], 169 | feed_dict=feed_dict) 170 | #Linearize outputs (1D arrays) 171 | linears = [linear for gpu_linear in linears for linear in gpu_linear] 172 | mels = [mel for gpu_mels in mels for mel in gpu_mels] 173 | alignments = [align for gpu_aligns in alignments for align in gpu_aligns] 174 | stop_tokens = [token for gpu_token in stop_tokens for token in gpu_token] 175 | 176 | #Natural batch synthesis 177 | #Get Mel/Linear lengths for the entire batch from stop_tokens predictions 178 | # target_lengths = self._get_output_lengths(stop_tokens) 179 | target_lengths = [9999] 180 | 181 | #Take off the batch wise padding 182 | mels = [mel[:target_length, :] for mel, target_length in zip(mels, target_lengths)] 183 | linears = [linear[:target_length, :] for linear, target_length in zip(linears, target_lengths)] 184 | assert len(mels) == len(linears) == len(texts) 185 | 186 | if basenames is None: 187 | raise NotImplemented() 188 | 189 | saved_mels_paths = [] 190 | for i, mel in enumerate(mels): 191 | # Write the spectrogram to disk 192 | # Note: outputs mel-spectrogram files and target ones have same names, just different folders 193 | mel_filename = os.path.join(out_dir, "mel-{}.npy".format(basenames[i])) 194 | np.save(mel_filename, mel, allow_pickle=False) 195 | saved_mels_paths.append(mel_filename) 196 | 197 | if log_dir is not None: 198 | #save wav (mel -> wav) 199 | wav = audio.inv_mel_spectrogram(mel.T, hparams) 200 | audio.save_wav(wav, os.path.join(log_dir, "wavs/wav-{}-mel.wav".format(basenames[i])), sr=hparams.sample_rate) 201 | 202 | #save alignments 203 | plot.plot_alignment(alignments[i], os.path.join(log_dir, "plots/alignment-{}.png".format(basenames[i])), 204 | title="{}".format(texts[i]), split_title=True, max_len=target_lengths[i]) 205 | 206 | #save mel spectrogram plot 207 | plot.plot_spectrogram(mel, os.path.join(log_dir, "plots/mel-{}.png".format(basenames[i])), 208 | title="{}".format(texts[i]), split_title=True) 209 | 210 | if hparams.predict_linear: 211 | #save wav (linear -> wav) 212 | wav = 
audio.inv_linear_spectrogram(linears[i].T, hparams) 213 | audio.save_wav(wav, os.path.join(log_dir, "wavs/wav-{}-linear.wav".format(basenames[i])), sr=hparams.sample_rate) 214 | 215 | #save linear spectrogram plot 216 | plot.plot_spectrogram(linears[i], os.path.join(log_dir, "plots/linear-{}.png".format(basenames[i])), 217 | title="{}".format(texts[i]), split_title=True, auto_aspect=True) 218 | 219 | return saved_mels_paths 220 | 221 | def _round_up(self, x, multiple): 222 | remainder = x % multiple 223 | return x if remainder == 0 else x + multiple - remainder 224 | 225 | def _prepare_inputs(self, inputs): 226 | max_len = max([len(x) for x in inputs]) 227 | return np.stack([self._pad_input(x, max_len) for x in inputs]), max_len 228 | 229 | def _pad_input(self, x, length): 230 | return np.pad(x, (0, length - x.shape[0]), mode="constant", constant_values=self._pad) 231 | 232 | def _prepare_targets(self, targets, alignment): 233 | max_len = max([len(t) for t in targets]) 234 | data_len = self._round_up(max_len, alignment) 235 | return np.stack([self._pad_target(t, data_len) for t in targets]), data_len 236 | 237 | def _pad_target(self, t, length): 238 | return np.pad(t, [(0, length - t.shape[0]), (0, 0)], mode="constant", constant_values=self._target_pad) 239 | 240 | def _get_output_lengths(self, stop_tokens): 241 | #Determine each mel length by the stop token predictions. (len = first occurence of 1 in stop_tokens row wise) 242 | output_lengths = [row.index(1) for row in np.round(stop_tokens).tolist()] 243 | return output_lengths 244 | -------------------------------------------------------------------------------- /synthesizer/utils/__init__.py: -------------------------------------------------------------------------------- 1 | class ValueWindow(): 2 | def __init__(self, window_size=100): 3 | self._window_size = window_size 4 | self._values = [] 5 | 6 | def append(self, x): 7 | self._values = self._values[-(self._window_size - 1):] + [x] 8 | 9 | @property 10 | def sum(self): 11 | return sum(self._values) 12 | 13 | @property 14 | def count(self): 15 | return len(self._values) 16 | 17 | @property 18 | def average(self): 19 | return self.sum / max(1, self.count) 20 | 21 | def reset(self): 22 | self._values = [] 23 | -------------------------------------------------------------------------------- /synthesizer/utils/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/synthesizer/utils/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /synthesizer/utils/__pycache__/cleaners.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/synthesizer/utils/__pycache__/cleaners.cpython-37.pyc -------------------------------------------------------------------------------- /synthesizer/utils/__pycache__/numbers.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/synthesizer/utils/__pycache__/numbers.cpython-37.pyc -------------------------------------------------------------------------------- /synthesizer/utils/__pycache__/plot.cpython-37.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/synthesizer/utils/__pycache__/plot.cpython-37.pyc -------------------------------------------------------------------------------- /synthesizer/utils/__pycache__/symbols.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/synthesizer/utils/__pycache__/symbols.cpython-37.pyc -------------------------------------------------------------------------------- /synthesizer/utils/__pycache__/text.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/synthesizer/utils/__pycache__/text.cpython-37.pyc -------------------------------------------------------------------------------- /synthesizer/utils/_cmudict.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | valid_symbols = [ 4 | "AA", "AA0", "AA1", "AA2", "AE", "AE0", "AE1", "AE2", "AH", "AH0", "AH1", "AH2", 5 | "AO", "AO0", "AO1", "AO2", "AW", "AW0", "AW1", "AW2", "AY", "AY0", "AY1", "AY2", 6 | "B", "CH", "D", "DH", "EH", "EH0", "EH1", "EH2", "ER", "ER0", "ER1", "ER2", "EY", 7 | "EY0", "EY1", "EY2", "F", "G", "HH", "IH", "IH0", "IH1", "IH2", "IY", "IY0", "IY1", 8 | "IY2", "JH", "K", "L", "M", "N", "NG", "OW", "OW0", "OW1", "OW2", "OY", "OY0", 9 | "OY1", "OY2", "P", "R", "S", "SH", "T", "TH", "UH", "UH0", "UH1", "UH2", "UW", 10 | "UW0", "UW1", "UW2", "V", "W", "Y", "Z", "ZH" 11 | ] 12 | 13 | _valid_symbol_set = set(valid_symbols) 14 | 15 | 16 | class CMUDict: 17 | """Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict""" 18 | def __init__(self, file_or_path, keep_ambiguous=True): 19 | if isinstance(file_or_path, str): 20 | with open(file_or_path, encoding="latin-1") as f: 21 | entries = _parse_cmudict(f) 22 | else: 23 | entries = _parse_cmudict(file_or_path) 24 | if not keep_ambiguous: 25 | entries = {word: pron for word, pron in entries.items() if len(pron) == 1} 26 | self._entries = entries 27 | 28 | 29 | def __len__(self): 30 | return len(self._entries) 31 | 32 | 33 | def lookup(self, word): 34 | """Returns list of ARPAbet pronunciations of the given word.""" 35 | return self._entries.get(word.upper()) 36 | 37 | 38 | 39 | _alt_re = re.compile(r"\([0-9]+\)") 40 | 41 | 42 | def _parse_cmudict(file): 43 | cmudict = {} 44 | for line in file: 45 | if len(line) and (line[0] >= "A" and line[0] <= "Z" or line[0] == "'"): 46 | parts = line.split(" ") 47 | word = re.sub(_alt_re, "", parts[0]) 48 | pronunciation = _get_pronunciation(parts[1]) 49 | if pronunciation: 50 | if word in cmudict: 51 | cmudict[word].append(pronunciation) 52 | else: 53 | cmudict[word] = [pronunciation] 54 | return cmudict 55 | 56 | 57 | def _get_pronunciation(s): 58 | parts = s.strip().split(" ") 59 | for part in parts: 60 | if part not in _valid_symbol_set: 61 | return None 62 | return " ".join(parts) 63 | -------------------------------------------------------------------------------- /synthesizer/utils/cleaners.py: -------------------------------------------------------------------------------- 1 | """ 2 | Cleaners are transformations that run over the input text at both training and eval time. 
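A concrete example of the default English pipeline defined below (the exact expansion may vary slightly with the installed unidecode/inflect versions):

from synthesizer.utils.cleaners import english_cleaners

print(english_cleaners("Dr. Müller bought 2 books for $3.50."))
# -> roughly: "doctor muller bought two books for three dollars, fifty cents."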
3 | 4 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" 5 | hyperparameter. Some cleaners are English-specific. You"ll typically want to use: 6 | 1. "english_cleaners" for English text 7 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using 8 | the Unidecode library (https://pypi.python.org/pypi/Unidecode) 9 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update 10 | the symbols in symbols.py to match your data). 11 | """ 12 | 13 | import re 14 | from unidecode import unidecode 15 | from .numbers import normalize_numbers 16 | 17 | # Regular expression matching whitespace: 18 | _whitespace_re = re.compile(r"\s+") 19 | 20 | # List of (regular expression, replacement) pairs for abbreviations: 21 | _abbreviations = [(re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) for x in [ 22 | ("mrs", "misess"), 23 | ("mr", "mister"), 24 | ("dr", "doctor"), 25 | ("st", "saint"), 26 | ("co", "company"), 27 | ("jr", "junior"), 28 | ("maj", "major"), 29 | ("gen", "general"), 30 | ("drs", "doctors"), 31 | ("rev", "reverend"), 32 | ("lt", "lieutenant"), 33 | ("hon", "honorable"), 34 | ("sgt", "sergeant"), 35 | ("capt", "captain"), 36 | ("esq", "esquire"), 37 | ("ltd", "limited"), 38 | ("col", "colonel"), 39 | ("ft", "fort"), 40 | ]] 41 | 42 | 43 | def expand_abbreviations(text): 44 | for regex, replacement in _abbreviations: 45 | text = re.sub(regex, replacement, text) 46 | return text 47 | 48 | 49 | def expand_numbers(text): 50 | return normalize_numbers(text) 51 | 52 | 53 | def lowercase(text): 54 | """lowercase input tokens.""" 55 | return text.lower() 56 | 57 | 58 | def collapse_whitespace(text): 59 | return re.sub(_whitespace_re, " ", text) 60 | 61 | 62 | def convert_to_ascii(text): 63 | return unidecode(text) 64 | 65 | 66 | def basic_cleaners(text): 67 | """Basic pipeline that lowercases and collapses whitespace without transliteration.""" 68 | text = lowercase(text) 69 | text = collapse_whitespace(text) 70 | return text 71 | 72 | 73 | def transliteration_cleaners(text): 74 | """Pipeline for non-English text that transliterates to ASCII.""" 75 | text = convert_to_ascii(text) 76 | text = lowercase(text) 77 | text = collapse_whitespace(text) 78 | return text 79 | 80 | 81 | def english_cleaners(text): 82 | """Pipeline for English text, including number and abbreviation expansion.""" 83 | text = convert_to_ascii(text) 84 | text = lowercase(text) 85 | text = expand_numbers(text) 86 | text = expand_abbreviations(text) 87 | text = collapse_whitespace(text) 88 | return text 89 | -------------------------------------------------------------------------------- /synthesizer/utils/numbers.py: -------------------------------------------------------------------------------- 1 | import re 2 | import inflect 3 | 4 | _inflect = inflect.engine() 5 | _comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])") 6 | _decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)") 7 | _pounds_re = re.compile(r"£([0-9\,]*[0-9]+)") 8 | _dollars_re = re.compile(r"\$([0-9\.\,]*[0-9]+)") 9 | _ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)") 10 | _number_re = re.compile(r"[0-9]+") 11 | 12 | 13 | def _remove_commas(m): 14 | return m.group(1).replace(",", "") 15 | 16 | 17 | def _expand_decimal_point(m): 18 | return m.group(1).replace(".", " point ") 19 | 20 | 21 | def _expand_dollars(m): 22 | match = m.group(1) 23 | parts = match.split(".") 24 | if len(parts) > 2: 25 | return match + " dollars" # Unexpected format 26 | 
dollars = int(parts[0]) if parts[0] else 0 27 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 28 | if dollars and cents: 29 | dollar_unit = "dollar" if dollars == 1 else "dollars" 30 | cent_unit = "cent" if cents == 1 else "cents" 31 | return "%s %s, %s %s" % (dollars, dollar_unit, cents, cent_unit) 32 | elif dollars: 33 | dollar_unit = "dollar" if dollars == 1 else "dollars" 34 | return "%s %s" % (dollars, dollar_unit) 35 | elif cents: 36 | cent_unit = "cent" if cents == 1 else "cents" 37 | return "%s %s" % (cents, cent_unit) 38 | else: 39 | return "zero dollars" 40 | 41 | 42 | def _expand_ordinal(m): 43 | return _inflect.number_to_words(m.group(0)) 44 | 45 | 46 | def _expand_number(m): 47 | num = int(m.group(0)) 48 | if num > 1000 and num < 3000: 49 | if num == 2000: 50 | return "two thousand" 51 | elif num > 2000 and num < 2010: 52 | return "two thousand " + _inflect.number_to_words(num % 100) 53 | elif num % 100 == 0: 54 | return _inflect.number_to_words(num // 100) + " hundred" 55 | else: 56 | return _inflect.number_to_words(num, andword="", zero="oh", group=2).replace(", ", " ") 57 | else: 58 | return _inflect.number_to_words(num, andword="") 59 | 60 | 61 | def normalize_numbers(text): 62 | text = re.sub(_comma_number_re, _remove_commas, text) 63 | text = re.sub(_pounds_re, r"\1 pounds", text) 64 | text = re.sub(_dollars_re, _expand_dollars, text) 65 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 66 | text = re.sub(_ordinal_re, _expand_ordinal, text) 67 | text = re.sub(_number_re, _expand_number, text) 68 | return text 69 | -------------------------------------------------------------------------------- /synthesizer/utils/plot.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | matplotlib.use("Agg") 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | 6 | 7 | def split_title_line(title_text, max_words=5): 8 | """ 9 | A function that splits any string based on specific character 10 | (returning it with the string), with maximum number of words on it 11 | """ 12 | seq = title_text.split() 13 | return "\n".join([" ".join(seq[i:i + max_words]) for i in range(0, len(seq), max_words)]) 14 | 15 | def plot_alignment(alignment, path, title=None, split_title=False, max_len=None): 16 | if max_len is not None: 17 | alignment = alignment[:, :max_len] 18 | 19 | fig = plt.figure(figsize=(8, 6)) 20 | ax = fig.add_subplot(111) 21 | 22 | im = ax.imshow( 23 | alignment, 24 | aspect="auto", 25 | origin="lower", 26 | interpolation="none") 27 | fig.colorbar(im, ax=ax) 28 | xlabel = "Decoder timestep" 29 | 30 | if split_title: 31 | title = split_title_line(title) 32 | 33 | plt.xlabel(xlabel) 34 | plt.title(title) 35 | plt.ylabel("Encoder timestep") 36 | plt.tight_layout() 37 | plt.savefig(path, format="png") 38 | plt.close() 39 | 40 | 41 | def plot_spectrogram(pred_spectrogram, path, title=None, split_title=False, target_spectrogram=None, max_len=None, auto_aspect=False): 42 | if max_len is not None: 43 | target_spectrogram = target_spectrogram[:max_len] 44 | pred_spectrogram = pred_spectrogram[:max_len] 45 | 46 | if split_title: 47 | title = split_title_line(title) 48 | 49 | fig = plt.figure(figsize=(10, 8)) 50 | # Set common labels 51 | fig.text(0.5, 0.18, title, horizontalalignment="center", fontsize=16) 52 | 53 | #target spectrogram subplot 54 | if target_spectrogram is not None: 55 | ax1 = fig.add_subplot(311) 56 | ax2 = fig.add_subplot(312) 57 | 58 | if auto_aspect: 59 | im = 
ax1.imshow(np.rot90(target_spectrogram), aspect="auto", interpolation="none") 60 | else: 61 | im = ax1.imshow(np.rot90(target_spectrogram), interpolation="none") 62 | ax1.set_title("Target Mel-Spectrogram") 63 | fig.colorbar(mappable=im, shrink=0.65, orientation="horizontal", ax=ax1) 64 | ax2.set_title("Predicted Mel-Spectrogram") 65 | else: 66 | ax2 = fig.add_subplot(211) 67 | 68 | if auto_aspect: 69 | im = ax2.imshow(np.rot90(pred_spectrogram), aspect="auto", interpolation="none") 70 | else: 71 | im = ax2.imshow(np.rot90(pred_spectrogram), interpolation="none") 72 | fig.colorbar(mappable=im, shrink=0.65, orientation="horizontal", ax=ax2) 73 | 74 | plt.tight_layout() 75 | plt.savefig(path, format="png") 76 | plt.close() 77 | -------------------------------------------------------------------------------- /synthesizer/utils/symbols.py: -------------------------------------------------------------------------------- 1 | """ 2 | Defines the set of symbols used in text input to the model. 3 | 4 | The default is a set of ASCII characters that works well for English or text that has been run 5 | through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. 6 | """ 7 | # from . import cmudict 8 | 9 | _pad = "_" 10 | _eos = "~" 11 | _characters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'\"(),-.:;? " 12 | 13 | # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): 14 | #_arpabet = ["@' + s for s in cmudict.valid_symbols] 15 | 16 | # Export all symbols: 17 | symbols = [_pad, _eos] + list(_characters) #+ _arpabet 18 | -------------------------------------------------------------------------------- /synthesizer/utils/text.py: -------------------------------------------------------------------------------- 1 | from .symbols import symbols 2 | from . import cleaners 3 | import re 4 | 5 | # Mappings from symbol to numeric ID and vice versa: 6 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 7 | _id_to_symbol = {i: s for i, s in enumerate(symbols)} 8 | 9 | # Regular expression matching text enclosed in curly braces: 10 | _curly_re = re.compile(r"(.*?)\{(.+?)\}(.*)") 11 | 12 | 13 | def text_to_sequence(text, cleaner_names): 14 | """Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 15 | 16 | The text can optionally have ARPAbet sequences enclosed in curly braces embedded 17 | in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street." 
18 | 19 | Args: 20 | text: string to convert to a sequence 21 | cleaner_names: names of the cleaner functions to run the text through 22 | 23 | Returns: 24 | List of integers corresponding to the symbols in the text 25 | """ 26 | sequence = [] 27 | 28 | # Check for curly braces and treat their contents as ARPAbet: 29 | while len(text): 30 | m = _curly_re.match(text) 31 | if not m: 32 | sequence += _symbols_to_sequence(_clean_text(text, cleaner_names)) 33 | break 34 | sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names)) 35 | sequence += _arpabet_to_sequence(m.group(2)) 36 | text = m.group(3) 37 | 38 | # Append EOS token 39 | sequence.append(_symbol_to_id["~"]) 40 | return sequence 41 | 42 | 43 | def sequence_to_text(sequence): 44 | """Converts a sequence of IDs back to a string""" 45 | result = "" 46 | for symbol_id in sequence: 47 | if symbol_id in _id_to_symbol: 48 | s = _id_to_symbol[symbol_id] 49 | # Enclose ARPAbet back in curly braces: 50 | if len(s) > 1 and s[0] == "@": 51 | s = "{%s}" % s[1:] 52 | result += s 53 | return result.replace("}{", " ") 54 | 55 | 56 | def _clean_text(text, cleaner_names): 57 | for name in cleaner_names: 58 | cleaner = getattr(cleaners, name) 59 | if not cleaner: 60 | raise Exception("Unknown cleaner: %s" % name) 61 | text = cleaner(text) 62 | return text 63 | 64 | 65 | def _symbols_to_sequence(symbols): 66 | return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)] 67 | 68 | 69 | def _arpabet_to_sequence(text): 70 | return _symbols_to_sequence(["@" + s for s in text.split()]) 71 | 72 | 73 | def _should_keep_symbol(s): 74 | return s in _symbol_to_id and s not in ("_", "~") 75 | -------------------------------------------------------------------------------- /vocoder/LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Original work Copyright (c) 2019 fatchord (https://github.com/fatchord) 4 | Modified work Copyright (c) 2019 Corentin Jemine (https://github.com/CorentinJ) 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 
23 | -------------------------------------------------------------------------------- /vocoder/__pycache__/audio.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/vocoder/__pycache__/audio.cpython-37.pyc -------------------------------------------------------------------------------- /vocoder/__pycache__/display.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/vocoder/__pycache__/display.cpython-37.pyc -------------------------------------------------------------------------------- /vocoder/__pycache__/distribution.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/vocoder/__pycache__/distribution.cpython-37.pyc -------------------------------------------------------------------------------- /vocoder/__pycache__/hparams.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/vocoder/__pycache__/hparams.cpython-37.pyc -------------------------------------------------------------------------------- /vocoder/__pycache__/inference.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/vocoder/__pycache__/inference.cpython-37.pyc -------------------------------------------------------------------------------- /vocoder/audio.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import librosa 4 | import vocoder.hparams as hp 5 | from scipy.signal import lfilter 6 | 7 | 8 | def label_2_float(x, bits) : 9 | return 2 * x / (2**bits - 1.) - 1. 10 | 11 | 12 | def float_2_label(x, bits) : 13 | assert abs(x).max() <= 1.0 14 | x = (x + 1.) 
* (2**bits - 1) / 2 15 | return x.clip(0, 2**bits - 1) 16 | 17 | 18 | def load_wav(path) : 19 | return librosa.load(str(path), sr=hp.sample_rate)[0] 20 | 21 | 22 | def save_wav(x, path) : 23 | librosa.output.write_wav(path, x.astype(np.float32), sr=hp.sample_rate) 24 | 25 | 26 | def split_signal(x) : 27 | unsigned = x + 2**15 28 | coarse = unsigned // 256 29 | fine = unsigned % 256 30 | return coarse, fine 31 | 32 | 33 | def combine_signal(coarse, fine) : 34 | return coarse * 256 + fine - 2**15 35 | 36 | 37 | def encode_16bits(x) : 38 | return np.clip(x * 2**15, -2**15, 2**15 - 1).astype(np.int16) 39 | 40 | 41 | mel_basis = None 42 | 43 | 44 | def linear_to_mel(spectrogram): 45 | global mel_basis 46 | if mel_basis is None: 47 | mel_basis = build_mel_basis() 48 | return np.dot(mel_basis, spectrogram) 49 | 50 | 51 | def build_mel_basis(): 52 | return librosa.filters.mel(hp.sample_rate, hp.n_fft, n_mels=hp.num_mels, fmin=hp.fmin) 53 | 54 | 55 | def normalize(S): 56 | return np.clip((S - hp.min_level_db) / -hp.min_level_db, 0, 1) 57 | 58 | 59 | def denormalize(S): 60 | return (np.clip(S, 0, 1) * -hp.min_level_db) + hp.min_level_db 61 | 62 | 63 | def amp_to_db(x): 64 | return 20 * np.log10(np.maximum(1e-5, x)) 65 | 66 | 67 | def db_to_amp(x): 68 | return np.power(10.0, x * 0.05) 69 | 70 | 71 | def spectrogram(y): 72 | D = stft(y) 73 | S = amp_to_db(np.abs(D)) - hp.ref_level_db 74 | return normalize(S) 75 | 76 | 77 | def melspectrogram(y): 78 | D = stft(y) 79 | S = amp_to_db(linear_to_mel(np.abs(D))) 80 | return normalize(S) 81 | 82 | 83 | def stft(y): 84 | return librosa.stft(y=y, n_fft=hp.n_fft, hop_length=hp.hop_length, win_length=hp.win_length) 85 | 86 | 87 | def pre_emphasis(x): 88 | return lfilter([1, -hp.preemphasis], [1], x) 89 | 90 | 91 | def de_emphasis(x): 92 | return lfilter([1], [1, -hp.preemphasis], x) 93 | 94 | 95 | def encode_mu_law(x, mu) : 96 | mu = mu - 1 97 | fx = np.sign(x) * np.log(1 + mu * np.abs(x)) / np.log(1 + mu) 98 | return np.floor((fx + 1) / 2 * mu + 0.5) 99 | 100 | 101 | def decode_mu_law(y, mu, from_labels=True) : 102 | if from_labels: 103 | y = label_2_float(y, math.log2(mu)) 104 | mu = mu - 1 105 | x = np.sign(y) / mu * ((1 + mu) ** np.abs(y) - 1) 106 | return x 107 | 108 | -------------------------------------------------------------------------------- /vocoder/display.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import time 3 | import numpy as np 4 | import sys 5 | 6 | 7 | def progbar(i, n, size=16): 8 | done = (i * size) // n 9 | bar = '' 10 | for i in range(size): 11 | bar += '█' if i <= done else '░' 12 | return bar 13 | 14 | 15 | def stream(message) : 16 | try: 17 | sys.stdout.write("\r{%s}" % message) 18 | except: 19 | #Remove non-ASCII characters from message 20 | message = ''.join(i for i in message if ord(i)<128) 21 | sys.stdout.write("\r{%s}" % message) 22 | 23 | 24 | def simple_table(item_tuples) : 25 | 26 | border_pattern = '+---------------------------------------' 27 | whitespace = ' ' 28 | 29 | headings, cells, = [], [] 30 | 31 | for item in item_tuples : 32 | 33 | heading, cell = str(item[0]), str(item[1]) 34 | 35 | pad_head = True if len(heading) < len(cell) else False 36 | 37 | pad = abs(len(heading) - len(cell)) 38 | pad = whitespace[:pad] 39 | 40 | pad_left = pad[:len(pad)//2] 41 | pad_right = pad[len(pad)//2:] 42 | 43 | if pad_head : 44 | heading = pad_left + heading + pad_right 45 | else : 46 | cell = pad_left + cell + pad_right 47 | 48 | headings += [heading] 
49 | cells += [cell] 50 | 51 | border, head, body = '', '', '' 52 | 53 | for i in range(len(item_tuples)) : 54 | 55 | temp_head = f'| {headings[i]} ' 56 | temp_body = f'| {cells[i]} ' 57 | 58 | border += border_pattern[:len(temp_head)] 59 | head += temp_head 60 | body += temp_body 61 | 62 | if i == len(item_tuples) - 1 : 63 | head += '|' 64 | body += '|' 65 | border += '+' 66 | 67 | print(border) 68 | print(head) 69 | print(border) 70 | print(body) 71 | print(border) 72 | print(' ') 73 | 74 | 75 | def time_since(started) : 76 | elapsed = time.time() - started 77 | m = int(elapsed // 60) 78 | s = int(elapsed % 60) 79 | if m >= 60 : 80 | h = int(m // 60) 81 | m = m % 60 82 | return f'{h}h {m}m {s}s' 83 | else : 84 | return f'{m}m {s}s' 85 | 86 | 87 | def save_attention(attn, path) : 88 | fig = plt.figure(figsize=(12, 6)) 89 | plt.imshow(attn.T, interpolation='nearest', aspect='auto') 90 | fig.savefig(f'{path}.png', bbox_inches='tight') 91 | plt.close(fig) 92 | 93 | 94 | def save_spectrogram(M, path, length=None) : 95 | M = np.flip(M, axis=0) 96 | if length : M = M[:, :length] 97 | fig = plt.figure(figsize=(12, 6)) 98 | plt.imshow(M, interpolation='nearest', aspect='auto') 99 | fig.savefig(f'{path}.png', bbox_inches='tight') 100 | plt.close(fig) 101 | 102 | 103 | def plot(array) : 104 | fig = plt.figure(figsize=(30, 5)) 105 | ax = fig.add_subplot(111) 106 | ax.xaxis.label.set_color('grey') 107 | ax.yaxis.label.set_color('grey') 108 | ax.xaxis.label.set_fontsize(23) 109 | ax.yaxis.label.set_fontsize(23) 110 | ax.tick_params(axis='x', colors='grey', labelsize=23) 111 | ax.tick_params(axis='y', colors='grey', labelsize=23) 112 | plt.plot(array) 113 | 114 | 115 | def plot_spec(M) : 116 | M = np.flip(M, axis=0) 117 | plt.figure(figsize=(18,4)) 118 | plt.imshow(M, interpolation='nearest', aspect='auto') 119 | plt.show() 120 | 121 | -------------------------------------------------------------------------------- /vocoder/distribution.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn.functional as F 4 | 5 | 6 | def log_sum_exp(x): 7 | """ numerically stable log_sum_exp implementation that prevents overflow """ 8 | # TF ordering 9 | axis = len(x.size()) - 1 10 | m, _ = torch.max(x, dim=axis) 11 | m2, _ = torch.max(x, dim=axis, keepdim=True) 12 | return m + torch.log(torch.sum(torch.exp(x - m2), dim=axis)) 13 | 14 | 15 | # It is adapted from https://github.com/r9y9/wavenet_vocoder/blob/master/wavenet_vocoder/mixture.py 16 | def discretized_mix_logistic_loss(y_hat, y, num_classes=65536, 17 | log_scale_min=None, reduce=True): 18 | if log_scale_min is None: 19 | log_scale_min = float(np.log(1e-14)) 20 | y_hat = y_hat.permute(0,2,1) 21 | assert y_hat.dim() == 3 22 | assert y_hat.size(1) % 3 == 0 23 | nr_mix = y_hat.size(1) // 3 24 | 25 | # (B x T x C) 26 | y_hat = y_hat.transpose(1, 2) 27 | 28 | # unpack parameters. (B, T, num_mixtures) x 3 29 | logit_probs = y_hat[:, :, :nr_mix] 30 | means = y_hat[:, :, nr_mix:2 * nr_mix] 31 | log_scales = torch.clamp(y_hat[:, :, 2 * nr_mix:3 * nr_mix], min=log_scale_min) 32 | 33 | # B x T x 1 -> B x T x num_mixtures 34 | y = y.expand_as(means) 35 | 36 | centered_y = y - means 37 | inv_stdv = torch.exp(-log_scales) 38 | plus_in = inv_stdv * (centered_y + 1. / (num_classes - 1)) 39 | cdf_plus = torch.sigmoid(plus_in) 40 | min_in = inv_stdv * (centered_y - 1. 
/ (num_classes - 1)) 41 | cdf_min = torch.sigmoid(min_in) 42 | 43 | # log probability for edge case of 0 (before scaling) 44 | # equivalent: torch.log(F.sigmoid(plus_in)) 45 | log_cdf_plus = plus_in - F.softplus(plus_in) 46 | 47 | # log probability for edge case of 255 (before scaling) 48 | # equivalent: (1 - F.sigmoid(min_in)).log() 49 | log_one_minus_cdf_min = -F.softplus(min_in) 50 | 51 | # probability for all other cases 52 | cdf_delta = cdf_plus - cdf_min 53 | 54 | mid_in = inv_stdv * centered_y 55 | # log probability in the center of the bin, to be used in extreme cases 56 | # (not actually used in our code) 57 | log_pdf_mid = mid_in - log_scales - 2. * F.softplus(mid_in) 58 | 59 | # tf equivalent 60 | """ 61 | log_probs = tf.where(x < -0.999, log_cdf_plus, 62 | tf.where(x > 0.999, log_one_minus_cdf_min, 63 | tf.where(cdf_delta > 1e-5, 64 | tf.log(tf.maximum(cdf_delta, 1e-12)), 65 | log_pdf_mid - np.log(127.5)))) 66 | """ 67 | # TODO: cdf_delta <= 1e-5 actually can happen. How can we choose the value 68 | # for num_classes=65536 case? 1e-7? not sure.. 69 | inner_inner_cond = (cdf_delta > 1e-5).float() 70 | 71 | inner_inner_out = inner_inner_cond * \ 72 | torch.log(torch.clamp(cdf_delta, min=1e-12)) + \ 73 | (1. - inner_inner_cond) * (log_pdf_mid - np.log((num_classes - 1) / 2)) 74 | inner_cond = (y > 0.999).float() 75 | inner_out = inner_cond * log_one_minus_cdf_min + (1. - inner_cond) * inner_inner_out 76 | cond = (y < -0.999).float() 77 | log_probs = cond * log_cdf_plus + (1. - cond) * inner_out 78 | 79 | log_probs = log_probs + F.log_softmax(logit_probs, -1) 80 | 81 | if reduce: 82 | return -torch.mean(log_sum_exp(log_probs)) 83 | else: 84 | return -log_sum_exp(log_probs).unsqueeze(-1) 85 | 86 | 87 | def sample_from_discretized_mix_logistic(y, log_scale_min=None): 88 | """ 89 | Sample from discretized mixture of logistic distributions 90 | Args: 91 | y (Tensor): B x C x T 92 | log_scale_min (float): Log scale minimum value 93 | Returns: 94 | Tensor: sample in range of [-1, 1]. 95 | """ 96 | if log_scale_min is None: 97 | log_scale_min = float(np.log(1e-14)) 98 | assert y.size(1) % 3 == 0 99 | nr_mix = y.size(1) // 3 100 | 101 | # B x T x C 102 | y = y.transpose(1, 2) 103 | logit_probs = y[:, :, :nr_mix] 104 | 105 | # sample mixture indicator from softmax 106 | temp = logit_probs.data.new(logit_probs.size()).uniform_(1e-5, 1.0 - 1e-5) 107 | temp = logit_probs.data - torch.log(- torch.log(temp)) 108 | _, argmax = temp.max(dim=-1) 109 | 110 | # (B, T) -> (B, T, nr_mix) 111 | one_hot = to_one_hot(argmax, nr_mix) 112 | # select logistic parameters 113 | means = torch.sum(y[:, :, nr_mix:2 * nr_mix] * one_hot, dim=-1) 114 | log_scales = torch.clamp(torch.sum( 115 | y[:, :, 2 * nr_mix:3 * nr_mix] * one_hot, dim=-1), min=log_scale_min) 116 | # sample from logistic & clip to interval 117 | # we don't actually round to the nearest 8bit value when sampling 118 | u = means.data.new(means.size()).uniform_(1e-5, 1.0 - 1e-5) 119 | x = means + torch.exp(log_scales) * (torch.log(u) - torch.log(1. - u)) 120 | 121 | x = torch.clamp(torch.clamp(x, min=-1.), max=1.) 
122 | 123 | return x 124 | 125 | 126 | def to_one_hot(tensor, n, fill_with=1.): 127 | # we perform one hot encore with respect to the last axis 128 | one_hot = torch.FloatTensor(tensor.size() + (n,)).zero_() 129 | if tensor.is_cuda: 130 | one_hot = one_hot.cuda() 131 | one_hot.scatter_(len(tensor.size()), tensor.unsqueeze(-1), fill_with) 132 | return one_hot 133 | -------------------------------------------------------------------------------- /vocoder/gen_wavernn.py: -------------------------------------------------------------------------------- 1 | from vocoder.models.fatchord_version import WaveRNN 2 | from vocoder.audio import * 3 | 4 | 5 | def gen_testset(model: WaveRNN, test_set, samples, batched, target, overlap, save_path): 6 | k = model.get_step() // 1000 7 | 8 | for i, (m, x) in enumerate(test_set, 1): 9 | if i > samples: 10 | break 11 | 12 | print('\n| Generating: %i/%i' % (i, samples)) 13 | 14 | x = x[0].numpy() 15 | 16 | bits = 16 if hp.voc_mode == 'MOL' else hp.bits 17 | 18 | if hp.mu_law and hp.voc_mode != 'MOL' : 19 | x = decode_mu_law(x, 2**bits, from_labels=True) 20 | else : 21 | x = label_2_float(x, bits) 22 | 23 | save_wav(x, save_path.joinpath("%dk_steps_%d_target.wav" % (k, i))) 24 | 25 | batch_str = "gen_batched_target%d_overlap%d" % (target, overlap) if batched else \ 26 | "gen_not_batched" 27 | save_str = save_path.joinpath("%dk_steps_%d_%s.wav" % (k, i, batch_str)) 28 | 29 | wav = model.generate(m, batched, target, overlap, hp.mu_law) 30 | save_wav(wav, save_str) 31 | 32 | -------------------------------------------------------------------------------- /vocoder/hparams.py: -------------------------------------------------------------------------------- 1 | from synthesizer.hparams import hparams as _syn_hp 2 | 3 | 4 | # Audio settings------------------------------------------------------------------------ 5 | # Match the values of the synthesizer 6 | sample_rate = _syn_hp.sample_rate 7 | n_fft = _syn_hp.n_fft 8 | num_mels = _syn_hp.num_mels 9 | hop_length = _syn_hp.hop_size 10 | win_length = _syn_hp.win_size 11 | fmin = _syn_hp.fmin 12 | min_level_db = _syn_hp.min_level_db 13 | ref_level_db = _syn_hp.ref_level_db 14 | mel_max_abs_value = _syn_hp.max_abs_value 15 | preemphasis = _syn_hp.preemphasis 16 | apply_preemphasis = _syn_hp.preemphasize 17 | 18 | bits = 9 # bit depth of signal 19 | mu_law = True # Recommended to suppress noise if using raw bits in hp.voc_mode 20 | # below 21 | 22 | 23 | # WAVERNN / VOCODER -------------------------------------------------------------------------------- 24 | voc_mode = 'RAW' # either 'RAW' (softmax on raw bits) or 'MOL' (sample from 25 | # mixture of logistics) 26 | voc_upsample_factors = (5, 5, 8) # NB - this needs to correctly factorise hop_length 27 | voc_rnn_dims = 512 28 | voc_fc_dims = 512 29 | voc_compute_dims = 128 30 | voc_res_out_dims = 128 31 | voc_res_blocks = 10 32 | 33 | # Training 34 | voc_batch_size = 100 35 | voc_lr = 1e-4 36 | voc_gen_at_checkpoint = 5 # number of samples to generate at each checkpoint 37 | voc_pad = 2 # this will pad the input so that the resnet can 'see' wider 38 | # than input length 39 | voc_seq_len = hop_length * 5 # must be a multiple of hop_length 40 | 41 | # Generating / Synthesizing 42 | voc_gen_batched = True # very fast (realtime+) single utterance batched generation 43 | voc_target = 8000 # target number of samples to be generated in each batch entry 44 | voc_overlap = 400 # number of samples for crossfading between batches 45 | 
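Note on the vocoder hparams above: they are tightly coupled to the synthesizer settings they import. Two constraints are worth making explicit: the product of voc_upsample_factors must equal hop_length (vocoder/train.py asserts exactly this before building WaveRNN), and voc_seq_len is defined as hop_length * 5 so every training sequence covers whole mel frames. A minimal sanity-check sketch, not a file from the repository, assuming it is run from the repository root so that vocoder.hparams (and hence synthesizer.hparams) imports cleanly:

import numpy as np
import vocoder.hparams as hp

# WaveRNN's upsampling network must expand one mel frame into exactly one hop
# of audio samples: prod(voc_upsample_factors) == hop_length. With the default
# (5, 5, 8) the product is 200, so the synthesizer's hop_size must also be 200.
assert np.cumprod(hp.voc_upsample_factors)[-1] == hp.hop_length

# voc_seq_len = hop_length * 5, so each training window spans whole frames.
assert hp.voc_seq_len % hp.hop_length == 0

print("hop_length:", hp.hop_length)
print("upsample factors:", hp.voc_upsample_factors)
print("sequence length:", hp.voc_seq_len)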
-------------------------------------------------------------------------------- /vocoder/inference.py: -------------------------------------------------------------------------------- 1 | from vocoder.models.fatchord_version import WaveRNN 2 | from vocoder import hparams as hp 3 | import torch 4 | 5 | 6 | _model = None # type: WaveRNN 7 | 8 | def load_model(weights_fpath, verbose=True): 9 | global _model, _device 10 | 11 | if verbose: 12 | print("Building Wave-RNN") 13 | _model = WaveRNN( 14 | rnn_dims=hp.voc_rnn_dims, 15 | fc_dims=hp.voc_fc_dims, 16 | bits=hp.bits, 17 | pad=hp.voc_pad, 18 | upsample_factors=hp.voc_upsample_factors, 19 | feat_dims=hp.num_mels, 20 | compute_dims=hp.voc_compute_dims, 21 | res_out_dims=hp.voc_res_out_dims, 22 | res_blocks=hp.voc_res_blocks, 23 | hop_length=hp.hop_length, 24 | sample_rate=hp.sample_rate, 25 | mode=hp.voc_mode 26 | ) 27 | 28 | if torch.cuda.is_available(): 29 | _model = _model.cuda() 30 | _device = torch.device('cuda') 31 | else: 32 | _device = torch.device('cpu') 33 | 34 | if verbose: 35 | print("Loading model weights at %s" % weights_fpath) 36 | checkpoint = torch.load(weights_fpath, _device) 37 | _model.load_state_dict(checkpoint['model_state']) 38 | _model.eval() 39 | 40 | 41 | def is_loaded(): 42 | return _model is not None 43 | 44 | 45 | def infer_waveform(mel, normalize=True, batched=True, target=8000, overlap=800, 46 | progress_callback=None): 47 | """ 48 | Infers the waveform of a mel spectrogram output by the synthesizer (the format must match 49 | that of the synthesizer!) 50 | 51 | :param normalize: 52 | :param batched: 53 | :param target: 54 | :param overlap: 55 | :return: 56 | """ 57 | if _model is None: 58 | raise Exception("Please load Wave-RNN in memory before using it") 59 | 60 | if normalize: 61 | mel = mel / hp.mel_max_abs_value 62 | mel = torch.from_numpy(mel[None, ...]) 63 | wav = _model.generate(mel, batched, target, overlap, hp.mu_law, progress_callback) 64 | return wav 65 | -------------------------------------------------------------------------------- /vocoder/models/__pycache__/fatchord_version.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/vocoder/models/__pycache__/fatchord_version.cpython-37.pyc -------------------------------------------------------------------------------- /vocoder/models/deepmind_version.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from utils.display import * 5 | from utils.dsp import * 6 | 7 | 8 | class WaveRNN(nn.Module) : 9 | def __init__(self, hidden_size=896, quantisation=256) : 10 | super(WaveRNN, self).__init__() 11 | 12 | self.hidden_size = hidden_size 13 | self.split_size = hidden_size // 2 14 | 15 | # The main matmul 16 | self.R = nn.Linear(self.hidden_size, 3 * self.hidden_size, bias=False) 17 | 18 | # Output fc layers 19 | self.O1 = nn.Linear(self.split_size, self.split_size) 20 | self.O2 = nn.Linear(self.split_size, quantisation) 21 | self.O3 = nn.Linear(self.split_size, self.split_size) 22 | self.O4 = nn.Linear(self.split_size, quantisation) 23 | 24 | # Input fc layers 25 | self.I_coarse = nn.Linear(2, 3 * self.split_size, bias=False) 26 | self.I_fine = nn.Linear(3, 3 * self.split_size, bias=False) 27 | 28 | # biases for the gates 29 | self.bias_u = nn.Parameter(torch.zeros(self.hidden_size)) 30 | 
self.bias_r = nn.Parameter(torch.zeros(self.hidden_size)) 31 | self.bias_e = nn.Parameter(torch.zeros(self.hidden_size)) 32 | 33 | # display num params 34 | self.num_params() 35 | 36 | 37 | def forward(self, prev_y, prev_hidden, current_coarse) : 38 | 39 | # Main matmul - the projection is split 3 ways 40 | R_hidden = self.R(prev_hidden) 41 | R_u, R_r, R_e, = torch.split(R_hidden, self.hidden_size, dim=1) 42 | 43 | # Project the prev input 44 | coarse_input_proj = self.I_coarse(prev_y) 45 | I_coarse_u, I_coarse_r, I_coarse_e = \ 46 | torch.split(coarse_input_proj, self.split_size, dim=1) 47 | 48 | # Project the prev input and current coarse sample 49 | fine_input = torch.cat([prev_y, current_coarse], dim=1) 50 | fine_input_proj = self.I_fine(fine_input) 51 | I_fine_u, I_fine_r, I_fine_e = \ 52 | torch.split(fine_input_proj, self.split_size, dim=1) 53 | 54 | # concatenate for the gates 55 | I_u = torch.cat([I_coarse_u, I_fine_u], dim=1) 56 | I_r = torch.cat([I_coarse_r, I_fine_r], dim=1) 57 | I_e = torch.cat([I_coarse_e, I_fine_e], dim=1) 58 | 59 | # Compute all gates for coarse and fine 60 | u = F.sigmoid(R_u + I_u + self.bias_u) 61 | r = F.sigmoid(R_r + I_r + self.bias_r) 62 | e = F.tanh(r * R_e + I_e + self.bias_e) 63 | hidden = u * prev_hidden + (1. - u) * e 64 | 65 | # Split the hidden state 66 | hidden_coarse, hidden_fine = torch.split(hidden, self.split_size, dim=1) 67 | 68 | # Compute outputs 69 | out_coarse = self.O2(F.relu(self.O1(hidden_coarse))) 70 | out_fine = self.O4(F.relu(self.O3(hidden_fine))) 71 | 72 | return out_coarse, out_fine, hidden 73 | 74 | 75 | def generate(self, seq_len): 76 | with torch.no_grad(): 77 | # First split up the biases for the gates 78 | b_coarse_u, b_fine_u = torch.split(self.bias_u, self.split_size) 79 | b_coarse_r, b_fine_r = torch.split(self.bias_r, self.split_size) 80 | b_coarse_e, b_fine_e = torch.split(self.bias_e, self.split_size) 81 | 82 | # Lists for the two output seqs 83 | c_outputs, f_outputs = [], [] 84 | 85 | # Some initial inputs 86 | out_coarse = torch.LongTensor([0]).cuda() 87 | out_fine = torch.LongTensor([0]).cuda() 88 | 89 | # We'll meed a hidden state 90 | hidden = self.init_hidden() 91 | 92 | # Need a clock for display 93 | start = time.time() 94 | 95 | # Loop for generation 96 | for i in range(seq_len) : 97 | 98 | # Split into two hidden states 99 | hidden_coarse, hidden_fine = \ 100 | torch.split(hidden, self.split_size, dim=1) 101 | 102 | # Scale and concat previous predictions 103 | out_coarse = out_coarse.unsqueeze(0).float() / 127.5 - 1. 104 | out_fine = out_fine.unsqueeze(0).float() / 127.5 - 1. 105 | prev_outputs = torch.cat([out_coarse, out_fine], dim=1) 106 | 107 | # Project input 108 | coarse_input_proj = self.I_coarse(prev_outputs) 109 | I_coarse_u, I_coarse_r, I_coarse_e = \ 110 | torch.split(coarse_input_proj, self.split_size, dim=1) 111 | 112 | # Project hidden state and split 6 ways 113 | R_hidden = self.R(hidden) 114 | R_coarse_u , R_fine_u, \ 115 | R_coarse_r, R_fine_r, \ 116 | R_coarse_e, R_fine_e = torch.split(R_hidden, self.split_size, dim=1) 117 | 118 | # Compute the coarse gates 119 | u = F.sigmoid(R_coarse_u + I_coarse_u + b_coarse_u) 120 | r = F.sigmoid(R_coarse_r + I_coarse_r + b_coarse_r) 121 | e = F.tanh(r * R_coarse_e + I_coarse_e + b_coarse_e) 122 | hidden_coarse = u * hidden_coarse + (1. 
- u) * e 123 | 124 | # Compute the coarse output 125 | out_coarse = self.O2(F.relu(self.O1(hidden_coarse))) 126 | posterior = F.softmax(out_coarse, dim=1) 127 | distrib = torch.distributions.Categorical(posterior) 128 | out_coarse = distrib.sample() 129 | c_outputs.append(out_coarse) 130 | 131 | # Project the [prev outputs and predicted coarse sample] 132 | coarse_pred = out_coarse.float() / 127.5 - 1. 133 | fine_input = torch.cat([prev_outputs, coarse_pred.unsqueeze(0)], dim=1) 134 | fine_input_proj = self.I_fine(fine_input) 135 | I_fine_u, I_fine_r, I_fine_e = \ 136 | torch.split(fine_input_proj, self.split_size, dim=1) 137 | 138 | # Compute the fine gates 139 | u = F.sigmoid(R_fine_u + I_fine_u + b_fine_u) 140 | r = F.sigmoid(R_fine_r + I_fine_r + b_fine_r) 141 | e = F.tanh(r * R_fine_e + I_fine_e + b_fine_e) 142 | hidden_fine = u * hidden_fine + (1. - u) * e 143 | 144 | # Compute the fine output 145 | out_fine = self.O4(F.relu(self.O3(hidden_fine))) 146 | posterior = F.softmax(out_fine, dim=1) 147 | distrib = torch.distributions.Categorical(posterior) 148 | out_fine = distrib.sample() 149 | f_outputs.append(out_fine) 150 | 151 | # Put the hidden state back together 152 | hidden = torch.cat([hidden_coarse, hidden_fine], dim=1) 153 | 154 | # Display progress 155 | speed = (i + 1) / (time.time() - start) 156 | stream('Gen: %i/%i -- Speed: %i', (i + 1, seq_len, speed)) 157 | 158 | coarse = torch.stack(c_outputs).squeeze(1).cpu().data.numpy() 159 | fine = torch.stack(f_outputs).squeeze(1).cpu().data.numpy() 160 | output = combine_signal(coarse, fine) 161 | 162 | return output, coarse, fine 163 | 164 | def init_hidden(self, batch_size=1) : 165 | return torch.zeros(batch_size, self.hidden_size).cuda() 166 | 167 | def num_params(self) : 168 | parameters = filter(lambda p: p.requires_grad, self.parameters()) 169 | parameters = sum([np.prod(p.size()) for p in parameters]) / 1_000_000 170 | print('Trainable Parameters: %.3f million' % parameters) -------------------------------------------------------------------------------- /vocoder/saved_models/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/vocoder/saved_models/.gitkeep -------------------------------------------------------------------------------- /vocoder/train.py: -------------------------------------------------------------------------------- 1 | from vocoder.models.fatchord_version import WaveRNN 2 | from vocoder.vocoder_dataset import VocoderDataset, collate_vocoder 3 | from vocoder.distribution import discretized_mix_logistic_loss 4 | from vocoder.display import stream, simple_table 5 | from vocoder.gen_wavernn import gen_testset 6 | from torch.utils.data import DataLoader 7 | from pathlib import Path 8 | from torch import optim 9 | import torch.nn.functional as F 10 | import vocoder.hparams as hp 11 | import numpy as np 12 | import time 13 | import torch 14 | 15 | 16 | def train(run_id: str, syn_dir: Path, voc_dir: Path, models_dir: Path, ground_truth: bool, 17 | save_every: int, backup_every: int, force_restart: bool): 18 | # Check to make sure the hop length is correctly factorised 19 | assert np.cumprod(hp.voc_upsample_factors)[-1] == hp.hop_length 20 | 21 | # Instantiate the model 22 | print("Initializing the model...") 23 | model = WaveRNN( 24 | rnn_dims=hp.voc_rnn_dims, 25 | fc_dims=hp.voc_fc_dims, 26 | bits=hp.bits, 27 | pad=hp.voc_pad, 28 | 
upsample_factors=hp.voc_upsample_factors, 29 | feat_dims=hp.num_mels, 30 | compute_dims=hp.voc_compute_dims, 31 | res_out_dims=hp.voc_res_out_dims, 32 | res_blocks=hp.voc_res_blocks, 33 | hop_length=hp.hop_length, 34 | sample_rate=hp.sample_rate, 35 | mode=hp.voc_mode 36 | ) 37 | 38 | if torch.cuda.is_available(): 39 | model = model.cuda() 40 | device = torch.device('cuda') 41 | else: 42 | device = torch.device('cpu') 43 | 44 | # Initialize the optimizer 45 | optimizer = optim.Adam(model.parameters()) 46 | for p in optimizer.param_groups: 47 | p["lr"] = hp.voc_lr 48 | loss_func = F.cross_entropy if model.mode == "RAW" else discretized_mix_logistic_loss 49 | 50 | # Load the weights 51 | model_dir = models_dir.joinpath(run_id) 52 | model_dir.mkdir(exist_ok=True) 53 | weights_fpath = model_dir.joinpath(run_id + ".pt") 54 | if force_restart or not weights_fpath.exists(): 55 | print("\nStarting the training of WaveRNN from scratch\n") 56 | model.save(weights_fpath, optimizer) 57 | else: 58 | print("\nLoading weights at %s" % weights_fpath) 59 | model.load(weights_fpath, optimizer) 60 | print("WaveRNN weights loaded from step %d" % model.step) 61 | 62 | # Initialize the dataset 63 | metadata_fpath = syn_dir.joinpath("train.txt") if ground_truth else \ 64 | voc_dir.joinpath("synthesized.txt") 65 | mel_dir = syn_dir.joinpath("mels") if ground_truth else voc_dir.joinpath("mels_gta") 66 | wav_dir = syn_dir.joinpath("audio") 67 | dataset = VocoderDataset(metadata_fpath, mel_dir, wav_dir) 68 | test_loader = DataLoader(dataset, 69 | batch_size=1, 70 | shuffle=True, 71 | pin_memory=True) 72 | 73 | # Begin the training 74 | simple_table([('Batch size', hp.voc_batch_size), 75 | ('LR', hp.voc_lr), 76 | ('Sequence Len', hp.voc_seq_len)]) 77 | 78 | for epoch in range(1, 350): 79 | data_loader = DataLoader(dataset, 80 | collate_fn=collate_vocoder, 81 | batch_size=hp.voc_batch_size, 82 | num_workers=2, 83 | shuffle=True, 84 | pin_memory=True) 85 | start = time.time() 86 | running_loss = 0. 
87 | 88 | for i, (x, y, m) in enumerate(data_loader, 1): 89 | if torch.cuda.is_available(): 90 | x, m, y = x.cuda(), m.cuda(), y.cuda() 91 | 92 | # Forward pass 93 | y_hat = model(x, m) 94 | if model.mode == 'RAW': 95 | y_hat = y_hat.transpose(1, 2).unsqueeze(-1) 96 | elif model.mode == 'MOL': 97 | y = y.float() 98 | y = y.unsqueeze(-1) 99 | 100 | # Backward pass 101 | loss = loss_func(y_hat, y) 102 | optimizer.zero_grad() 103 | loss.backward() 104 | optimizer.step() 105 | 106 | running_loss += loss.item() 107 | speed = i / (time.time() - start) 108 | avg_loss = running_loss / i 109 | 110 | step = model.get_step() 111 | k = step // 1000 112 | 113 | if backup_every != 0 and step % backup_every == 0 : 114 | model.checkpoint(model_dir, optimizer) 115 | 116 | if save_every != 0 and step % save_every == 0 : 117 | model.save(weights_fpath, optimizer) 118 | 119 | msg = f"| Epoch: {epoch} ({i}/{len(data_loader)}) | " \ 120 | f"Loss: {avg_loss:.4f} | {speed:.1f} " \ 121 | f"steps/s | Step: {k}k | " 122 | stream(msg) 123 | 124 | 125 | gen_testset(model, test_loader, hp.voc_gen_at_checkpoint, hp.voc_gen_batched, 126 | hp.voc_target, hp.voc_overlap, model_dir) 127 | print("") 128 | -------------------------------------------------------------------------------- /vocoder/vocoder_dataset.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import Dataset 2 | from pathlib import Path 3 | from vocoder import audio 4 | import vocoder.hparams as hp 5 | import numpy as np 6 | import torch 7 | 8 | 9 | class VocoderDataset(Dataset): 10 | def __init__(self, metadata_fpath: Path, mel_dir: Path, wav_dir: Path): 11 | print("Using inputs from:\n\t%s\n\t%s\n\t%s" % (metadata_fpath, mel_dir, wav_dir)) 12 | 13 | with metadata_fpath.open("r") as metadata_file: 14 | metadata = [line.split("|") for line in metadata_file] 15 | 16 | gta_fnames = [x[1] for x in metadata if int(x[4])] 17 | gta_fpaths = [mel_dir.joinpath(fname) for fname in gta_fnames] 18 | wav_fnames = [x[0] for x in metadata if int(x[4])] 19 | wav_fpaths = [wav_dir.joinpath(fname) for fname in wav_fnames] 20 | self.samples_fpaths = list(zip(gta_fpaths, wav_fpaths)) 21 | 22 | print("Found %d samples" % len(self.samples_fpaths)) 23 | 24 | def __getitem__(self, index): 25 | mel_path, wav_path = self.samples_fpaths[index] 26 | 27 | # Load the mel spectrogram and adjust its range to [-1, 1] 28 | mel = np.load(mel_path).T.astype(np.float32) / hp.mel_max_abs_value 29 | 30 | # Load the wav 31 | wav = np.load(wav_path) 32 | if hp.apply_preemphasis: 33 | wav = audio.pre_emphasis(wav) 34 | wav = np.clip(wav, -1, 1) 35 | 36 | # Fix for missing padding # TODO: settle on whether this is any useful 37 | r_pad = (len(wav) // hp.hop_length + 1) * hp.hop_length - len(wav) 38 | wav = np.pad(wav, (0, r_pad), mode='constant') 39 | assert len(wav) >= mel.shape[1] * hp.hop_length 40 | wav = wav[:mel.shape[1] * hp.hop_length] 41 | assert len(wav) % hp.hop_length == 0 42 | 43 | # Quantize the wav 44 | if hp.voc_mode == 'RAW': 45 | if hp.mu_law: 46 | quant = audio.encode_mu_law(wav, mu=2 ** hp.bits) 47 | else: 48 | quant = audio.float_2_label(wav, bits=hp.bits) 49 | elif hp.voc_mode == 'MOL': 50 | quant = audio.float_2_label(wav, bits=16) 51 | 52 | return mel.astype(np.float32), quant.astype(np.int64) 53 | 54 | def __len__(self): 55 | return len(self.samples_fpaths) 56 | 57 | 58 | def collate_vocoder(batch): 59 | mel_win = hp.voc_seq_len // hp.hop_length + 2 * hp.voc_pad 60 | max_offsets = [x[0].shape[-1] -2 - (mel_win + 2 * 
hp.voc_pad) for x in batch] 61 | mel_offsets = [np.random.randint(0, offset) for offset in max_offsets] 62 | sig_offsets = [(offset + hp.voc_pad) * hp.hop_length for offset in mel_offsets] 63 | 64 | mels = [x[0][:, mel_offsets[i]:mel_offsets[i] + mel_win] for i, x in enumerate(batch)] 65 | 66 | labels = [x[1][sig_offsets[i]:sig_offsets[i] + hp.voc_seq_len + 1] for i, x in enumerate(batch)] 67 | 68 | mels = np.stack(mels).astype(np.float32) 69 | labels = np.stack(labels).astype(np.int64) 70 | 71 | mels = torch.tensor(mels) 72 | labels = torch.tensor(labels).long() 73 | 74 | x = labels[:, :hp.voc_seq_len] 75 | y = labels[:, 1:] 76 | 77 | bits = 16 if hp.voc_mode == 'MOL' else hp.bits 78 | 79 | x = audio.label_2_float(x.float(), bits) 80 | 81 | if hp.voc_mode == 'MOL' : 82 | y = audio.label_2_float(y.float(), bits) 83 | 84 | return x, y, mels --------------------------------------------------------------------------------
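
For reference, a minimal usage sketch of VocoderDataset and collate_vocoder, wired together the same way vocoder/train.py does for ground-truth training. This is not part of the repository; the directory path is an assumption (point it at wherever the synthesizer preprocessing wrote train.txt, mels/ and audio/):

from pathlib import Path
from torch.utils.data import DataLoader

import vocoder.hparams as hp
from vocoder.vocoder_dataset import VocoderDataset, collate_vocoder

# Hypothetical location of the preprocessed synthesizer output.
syn_dir = Path("datasets/SV2TTS/synthesizer")

dataset = VocoderDataset(metadata_fpath=syn_dir.joinpath("train.txt"),
                         mel_dir=syn_dir.joinpath("mels"),
                         wav_dir=syn_dir.joinpath("audio"))

loader = DataLoader(dataset,
                    collate_fn=collate_vocoder,
                    batch_size=hp.voc_batch_size,
                    shuffle=True)

# x: (batch, voc_seq_len) float input samples, y: (batch, voc_seq_len) target
# samples shifted by one step, mels: (batch, num_mels, mel_win) conditioning slices.
x, y, mels = next(iter(loader))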