├── .gitignore ├── LICENSE ├── README.md ├── demo_streamlit.ipynb ├── demo_voice.py ├── encoder ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-37.pyc │ ├── audio.cpython-37.pyc │ ├── inference.cpython-37.pyc │ ├── model.cpython-37.pyc │ ├── params_data.cpython-37.pyc │ └── params_model.cpython-37.pyc ├── audio.py ├── config.py ├── data_objects │ ├── __init__.py │ ├── random_cycler.py │ ├── speaker.py │ ├── speaker_batch.py │ ├── speaker_verification_dataset.py │ └── utterance.py ├── inference.py ├── model.py ├── params_data.py ├── params_model.py ├── preprocess.py ├── saved_models │ └── .gitkeep ├── train.py └── visualizations.py ├── helper.py ├── requirements_demo.txt ├── samples ├── .DS_Store ├── 1320_00000.mp3 ├── 3575_00000.mp3 ├── 8230_00000.mp3 ├── README.md ├── VCTK.txt ├── myvoice.mp3 ├── p240_00000.mp3 └── p260_00000.mp3 ├── slides.pdf ├── synthesizer ├── LICENSE.txt ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-37.pyc │ ├── audio.cpython-37.pyc │ ├── hparams.cpython-37.pyc │ ├── inference.cpython-37.pyc │ ├── infolog.cpython-37.pyc │ └── tacotron2.cpython-37.pyc ├── audio.py ├── feeder.py ├── hparams.py ├── inference.py ├── infolog.py ├── models │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ ├── architecture_wrappers.cpython-37.pyc │ │ ├── attention.cpython-37.pyc │ │ ├── custom_decoder.cpython-37.pyc │ │ ├── helpers.cpython-37.pyc │ │ ├── modules.cpython-37.pyc │ │ └── tacotron.cpython-37.pyc │ ├── architecture_wrappers.py │ ├── attention.py │ ├── custom_decoder.py │ ├── helpers.py │ ├── modules.py │ └── tacotron.py ├── preprocess.py ├── synthesize.py ├── tacotron2.py ├── train.py └── utils │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-37.pyc │ ├── cleaners.cpython-37.pyc │ ├── numbers.cpython-37.pyc │ ├── plot.cpython-37.pyc │ ├── symbols.cpython-37.pyc │ └── text.cpython-37.pyc │ ├── _cmudict.py │ ├── cleaners.py │ ├── numbers.py │ ├── plot.py │ ├── symbols.py │ └── text.py └── vocoder ├── LICENSE.txt ├── __pycache__ ├── audio.cpython-37.pyc ├── display.cpython-37.pyc ├── distribution.cpython-37.pyc ├── hparams.cpython-37.pyc └── inference.cpython-37.pyc ├── audio.py ├── display.py ├── distribution.py ├── gen_wavernn.py ├── hparams.py ├── inference.py ├── models ├── __pycache__ │ └── fatchord_version.cpython-37.pyc ├── deepmind_version.py └── fatchord_version.py ├── saved_models └── .gitkeep ├── train.py └── vocoder_dataset.py /.gitignore: -------------------------------------------------------------------------------- 1 | # don't upload wave files and pretrained models 2 | *.wav 3 | *.pt 4 | *.pyc 5 | *data-00000-of-00001 6 | synthesizer/saved_models/logs-pretrained/taco_pretrained/tacotron_model.ckpt-278000.data-00000-of-00001 7 | synthesizer/saved_models/* 8 | __pycache__/ 9 | 10 | # pipfiles 11 | Pipfile* 12 | 13 | # exception to the rule 14 | !saved_models/.gitkeep -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 dataroots 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the 
following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Streamlit Demo: Real-Time Voice Cloning 2 | 3 | This repository demonstrates how a simple voice transfer app can be created using [Streamlit](https://www.streamlit.io/). The code for this demo is based on the repository for [Real-Time-Voice-Cloning](https://github.com/CorentinJ/Real-Time-Voice-Cloning). 4 | 5 | This app allows you to: 6 | * Record your voice 7 | * Visualize the embedding of the speaker 8 | * Synthesize speech based on the recorded voice 9 | 10 | 11 | ## Setup 12 | 13 | ### 1. Install Requirements 14 | **Python 3.6 or 3.7** is needed 15 | 16 | * Create your virtual environment (e.g. [pipenv](https://pipenv.pypa.io/en/latest/), [poetry](https://python-poetry.org/) or [venv](https://docs.python.org/3/library/venv.html)). 17 | * Install [PyTorch](https://pytorch.org/get-started/locally/) (>=1.0.1). 18 | * Install [ffmpeg](https://ffmpeg.org/download.html#get-packages). 19 | * Run `pip install -r requirements_demo.txt` to install the remaining necessary packages. 20 | 21 | ### 2. Download Pretrained Models 22 | Download the latest [here](https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Pretrained-models). 23 | 24 | ### 3. Launch streamlit demo 25 | 26 | * `streamlit run demo_voice.py` -------------------------------------------------------------------------------- /demo_streamlit.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from synthesizer.inference import Synthesizer\n", 10 | "from encoder import inference as encoder\n", 11 | "from vocoder import inference as vocoder\n", 12 | "from pathlib import Path\n", 13 | "import numpy as np\n", 14 | "import soundfile as sf\n", 15 | "import os\n", 16 | "import librosa\n", 17 | "import sounddevice as sd\n", 18 | "import wavio\n", 19 | "import glob\n", 20 | "from helper import draw_embed, create_spectrogram, read_audio\n", 21 | "%matplotlib inline" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "# 1. 
Record your own voice" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "print(\"Recording...\")\n", 38 | "duration = 5 # seconds\n", 39 | "fs = 48000\n", 40 | "sd.default.samplerate = fs\n", 41 | "sd.default.channels = 1\n", 42 | "myrecording = sd.rec(int(duration * fs))\n", 43 | "sd.wait(duration)\n", 44 | "print(\"Saving sample as myvoice.mp3\")\n", 45 | "path_myrecording = \"./samples/myvoice.mp3\"\n", 46 | "wavio.write(path_myrecording, myrecording, fs, sampwidth=2)\n", 47 | "sd.play(myrecording, fs) #st\n", 48 | "print(\"Done! Saved sample as myvoice.mp3\")" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "fig = create_spectrogram(path_myrecording)" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "# 2. Load your pretrained models" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "print(\"Loading pretrained models...\")\n", 74 | "seed = 42\n", 75 | "low_mem = False\n", 76 | "num_generated = 0\n", 77 | "enc_model_fpath = Path(\"encoder/saved_models/pretrained.pt\")\n", 78 | "syn_model_dir = Path(\"synthesizer/saved_models/logs-pretrained/\")\n", 79 | "voc_model_fpath = Path(\"vocoder/saved_models/pretrained/pretrained.pt\")\n", 80 | "encoder.load_model(enc_model_fpath)\n", 81 | "synthesizer = Synthesizer(\n", 82 | " syn_model_dir.joinpath(\"taco_pretrained\"), low_mem=low_mem, seed=seed\n", 83 | ")\n", 84 | "vocoder.load_model(voc_model_fpath)\n", 85 | "print(\"Loaded pretrained models!\")" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "# 3. Choose a recording" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "audio_folder = \"samples\"\n", 102 | "filenames = glob.glob(os.path.join(audio_folder, \"*.mp3\"))\n", 103 | "print(filenames)\n", 104 | "\n", 105 | "selected_filename = 'samples/myvoice.mp3'\n", 106 | "in_fpath = Path(selected_filename.replace('\"', \"\").replace(\"'\", \"\"))" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "# 4. Start preprocessing" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "original_wav, sampling_rate = librosa.load(str(in_fpath))\n", 123 | "preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)\n", 124 | "print(\"Loaded file succesfully!\")\n", 125 | "embed = encoder.embed_utterance(preprocessed_wav)\n", 126 | "sd.play(original_wav, sampling_rate) #st\n", 127 | "print(\"Created the embedding\")" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "fig = draw_embed(embed, \"myembedding\", None)" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": [ 143 | "# 5. 
Synthesize the text you like to hear" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "## Generating the spectrogram\n", 153 | "text = input(\"Write a sentence (+-20 words) to be synthesized:\\n\")" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "if text != \"\":\n", 163 | " texts = [text]\n", 164 | " embeds = [embed]\n", 165 | " # If you know what the attention layer alignments are,\n", 166 | " # you can retrieve them here by passing return_alignments=True\n", 167 | " specs = synthesizer.synthesize_spectrograms(texts, embeds)\n", 168 | " spec = specs[0]\n", 169 | " print(\"Created the mel spectrogram\")\n", 170 | "\n", 171 | " # Generating the waveform\n", 172 | " print(\"Synthesizing the waveform:\")\n", 173 | "\n", 174 | " generated_wav = vocoder.infer_waveform(spec)\n", 175 | "\n", 176 | " # Post-generation\n", 177 | " # There's a bug with sounddevice that makes the audio cut one\n", 178 | " # second earlier, so we pad it.\n", 179 | " generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode=\"constant\")\n", 180 | "\n", 181 | " # Trim excess silences to compensate for gaps in spectrograms (issue #53)\n", 182 | " generated_wav = encoder.preprocess_wav(generated_wav)\n", 183 | "\n", 184 | " # Play the audio (non-blocking)\n", 185 | " try:\n", 186 | " sd.stop()\n", 187 | " sd.play(generated_wav, synthesizer.sample_rate)\n", 188 | " except sd.PortAudioError as e:\n", 189 | " print(\"\\nCaught exception: %s\" % repr(e))\n", 190 | " print(\n", 191 | " 'Continuing without audio playback. Suppress this message with \\\n", 192 | " the \"--no_sound\" flag.\\n'\n", 193 | " )\n", 194 | "\n", 195 | " # Save it on the disk\n", 196 | " filename = \"demo_output_%02d.wav\" % num_generated\n", 197 | " sf.write(filename, generated_wav.astype(np.float32), synthesizer.sample_rate)\n", 198 | " num_generated += 1\n", 199 | " print(\"\\nSaved output as %s\\n\\n\" % filename)" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [] 208 | } 209 | ], 210 | "metadata": { 211 | "kernelspec": { 212 | "display_name": "Python 3", 213 | "language": "python", 214 | "name": "python3" 215 | }, 216 | "language_info": { 217 | "codemirror_mode": { 218 | "name": "ipython", 219 | "version": 3 220 | }, 221 | "file_extension": ".py", 222 | "mimetype": "text/x-python", 223 | "name": "python", 224 | "nbconvert_exporter": "python", 225 | "pygments_lexer": "ipython3", 226 | "version": "3.7.6" 227 | } 228 | }, 229 | "nbformat": 4, 230 | "nbformat_minor": 4 231 | } 232 | -------------------------------------------------------------------------------- /demo_voice.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | from synthesizer.inference import Synthesizer 3 | from encoder import inference as encoder 4 | from vocoder import inference as vocoder 5 | from pathlib import Path 6 | import numpy as np 7 | import soundfile as sf 8 | import os 9 | import librosa 10 | import glob 11 | from helper import draw_embed, create_spectrogram, read_audio, record, save_record 12 | 13 | "# Streamlit showcase" 14 | 15 | model_load_state = st.text("Loading pretrained models...") 16 | 17 | seed = 42 18 | low_mem = False 19 | num_generated = 0 20 | enc_model_fpath = 
Path("encoder/saved_models/pretrained.pt") 21 | syn_model_dir = Path("synthesizer/saved_models/logs-pretrained/") 22 | voc_model_fpath = Path("vocoder/saved_models/pretrained/pretrained.pt") 23 | encoder.load_model(enc_model_fpath) 24 | synthesizer = Synthesizer( 25 | syn_model_dir.joinpath("taco_pretrained"), low_mem=low_mem, seed=seed 26 | ) 27 | vocoder.load_model(voc_model_fpath) 28 | 29 | model_load_state.text("Loaded pretrained models!") 30 | 31 | st.header("1. Record your own voice") 32 | 33 | filename = st.text_input("Choose a filename: ") 34 | 35 | if st.button(f"Click to Record"): 36 | if filename == "": 37 | st.warning("Choose a filename.") 38 | else: 39 | record_state = st.text("Recording...") 40 | duration = 5 # seconds 41 | fs = 48000 42 | myrecording = record(duration, fs) 43 | record_state.text(f"Saving sample as {filename}.mp3") 44 | 45 | path_myrecording = f"./samples/{filename}.mp3" 46 | 47 | save_record(path_myrecording, myrecording, fs) 48 | record_state.text(f"Done! Saved sample as {filename}.mp3") 49 | 50 | st.audio(read_audio(path_myrecording)) 51 | 52 | fig = create_spectrogram(path_myrecording) 53 | st.pyplot(fig) 54 | 55 | "## 2. Choose an audio record" 56 | 57 | audio_folder = "samples" 58 | filenames = glob.glob(os.path.join(audio_folder, "*.mp3")) 59 | selected_filename = st.selectbox("Select a file", filenames) 60 | 61 | if selected_filename is not None: 62 | # Create embedding 63 | in_fpath = Path(selected_filename.replace('"', "").replace("'", "")) 64 | original_wav, sampling_rate = librosa.load(str(in_fpath)) 65 | preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate) 66 | embed = encoder.embed_utterance(preprocessed_wav) 67 | st.success("Created the embedding") 68 | 69 | st.audio(read_audio(in_fpath)) 70 | 71 | if st.sidebar.checkbox("Do you want to change your embedding?"): 72 | height = int(np.sqrt(len(embed))) 73 | shape = (height, -1) 74 | matrix_embed = np.round(embed, 2).reshape(shape) 75 | matrix_embed = [list(row) for row in matrix_embed] 76 | a = st.text_area("Change your embedding:", value=str(matrix_embed).replace("],", "],\n")) 77 | 78 | matrix = [[float(x) for x in row.strip("[] \n").split(",")] for row in a.split("],")] 79 | embed = np.array(matrix).flatten() 80 | 81 | fig = draw_embed(embed, "myembedding", None) 82 | st.pyplot(fig) 83 | 84 | 85 | "## 3. Synthesize text." 
86 | text = st.text_input("Write a sentence (+-20 words) to be synthesized:") 87 | 88 | 89 | def pgbar(i, seq_len, b_size, gen_rate): 90 | mybar.progress(i / seq_len) 91 | 92 | 93 | if st.button("Click to synthesize"): 94 | texts = [text] 95 | embeds = [embed] 96 | 97 | # generate waveform 98 | with st.spinner("Generating your speech..."): 99 | specs = synthesizer.synthesize_spectrograms(texts, embeds) 100 | spec = specs[0] 101 | synthesize_state = st.text("Created the mel spectrogram") 102 | synthesize_state.text("Generating the waveform...") 103 | mybar = st.progress(0) 104 | generated_wav = vocoder.infer_waveform(spec, progress_callback=pgbar) 105 | generated_wav = np.pad( 106 | generated_wav, (0, synthesizer.sample_rate), mode="constant" 107 | ) 108 | generated_wav = encoder.preprocess_wav(generated_wav) 109 | synthesize_state.text("Synthesized the waveform") 110 | st.success("Done!") 111 | 112 | # Save it on the disk 113 | filename = "demo_output_%02d.wav" % num_generated 114 | sf.write(filename, generated_wav.astype(np.float32), synthesizer.sample_rate) 115 | num_generated += 1 116 | synthesize_state.text("\nSaved output as %s\n\n" % filename) 117 | st.audio(read_audio(filename)) 118 | -------------------------------------------------------------------------------- /encoder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/encoder/__init__.py -------------------------------------------------------------------------------- /encoder/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/encoder/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /encoder/__pycache__/audio.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/encoder/__pycache__/audio.cpython-37.pyc -------------------------------------------------------------------------------- /encoder/__pycache__/inference.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/encoder/__pycache__/inference.cpython-37.pyc -------------------------------------------------------------------------------- /encoder/__pycache__/model.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/encoder/__pycache__/model.cpython-37.pyc -------------------------------------------------------------------------------- /encoder/__pycache__/params_data.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/encoder/__pycache__/params_data.cpython-37.pyc -------------------------------------------------------------------------------- /encoder/__pycache__/params_model.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/encoder/__pycache__/params_model.cpython-37.pyc -------------------------------------------------------------------------------- /encoder/audio.py: -------------------------------------------------------------------------------- 1 | from scipy.ndimage.morphology import binary_dilation 2 | from encoder.params_data import * 3 | from pathlib import Path 4 | from typing import Optional, Union 5 | from warnings import warn 6 | import numpy as np 7 | import librosa 8 | import struct 9 | 10 | try: 11 | import webrtcvad 12 | except: 13 | warn("Unable to import 'webrtcvad'. This package enables noise removal and is recommended.") 14 | webrtcvad=None 15 | 16 | int16_max = (2 ** 15) - 1 17 | 18 | 19 | def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray], 20 | source_sr: Optional[int] = None): 21 | """ 22 | Applies the preprocessing operations used in training the Speaker Encoder to a waveform 23 | either on disk or in memory. The waveform will be resampled to match the data hyperparameters. 24 | 25 | :param fpath_or_wav: either a filepath to an audio file (many extensions are supported, not 26 | just .wav), either the waveform as a numpy array of floats. 27 | :param source_sr: if passing an audio waveform, the sampling rate of the waveform before 28 | preprocessing. After preprocessing, the waveform's sampling rate will match the data 29 | hyperparameters. If passing a filepath, the sampling rate will be automatically detected and 30 | this argument will be ignored. 31 | """ 32 | # Load the wav from disk if needed 33 | if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path): 34 | wav, source_sr = librosa.load(str(fpath_or_wav), sr=None) 35 | else: 36 | wav = fpath_or_wav 37 | 38 | # Resample the wav if needed 39 | if source_sr is not None and source_sr != sampling_rate: 40 | wav = librosa.resample(wav, source_sr, sampling_rate) 41 | 42 | # Apply the preprocessing: normalize volume and shorten long silences 43 | wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True) 44 | if webrtcvad: 45 | wav = trim_long_silences(wav) 46 | 47 | return wav 48 | 49 | 50 | def wav_to_mel_spectrogram(wav): 51 | """ 52 | Derives a mel spectrogram ready to be used by the encoder from a preprocessed audio waveform. 53 | Note: this not a log-mel spectrogram. 54 | """ 55 | frames = librosa.feature.melspectrogram( 56 | wav, 57 | sampling_rate, 58 | n_fft=int(sampling_rate * mel_window_length / 1000), 59 | hop_length=int(sampling_rate * mel_window_step / 1000), 60 | n_mels=mel_n_channels 61 | ) 62 | return frames.astype(np.float32).T 63 | 64 | 65 | def trim_long_silences(wav): 66 | """ 67 | Ensures that segments without voice in the waveform remain no longer than a 68 | threshold determined by the VAD parameters in params.py. 
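    Note (derived from the defaults in params_data.py): with vad_window_length = 30 ms and
    sampling_rate = 16000 Hz, each VAD window spans 480 samples; the waveform is packed as
    16-bit PCM (2 bytes per sample), which is why the byte-level slicing below multiplies
    the sample indices by 2.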
69 | 70 | :param wav: the raw waveform as a numpy array of floats 71 | :return: the same waveform with silences trimmed away (length <= original wav length) 72 | """ 73 | # Compute the voice detection window size 74 | samples_per_window = (vad_window_length * sampling_rate) // 1000 75 | 76 | # Trim the end of the audio to have a multiple of the window size 77 | wav = wav[:len(wav) - (len(wav) % samples_per_window)] 78 | 79 | # Convert the float waveform to 16-bit mono PCM 80 | pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16)) 81 | 82 | # Perform voice activation detection 83 | voice_flags = [] 84 | vad = webrtcvad.Vad(mode=3) 85 | for window_start in range(0, len(wav), samples_per_window): 86 | window_end = window_start + samples_per_window 87 | voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2], 88 | sample_rate=sampling_rate)) 89 | voice_flags = np.array(voice_flags) 90 | 91 | # Smooth the voice detection with a moving average 92 | def moving_average(array, width): 93 | array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2))) 94 | ret = np.cumsum(array_padded, dtype=float) 95 | ret[width:] = ret[width:] - ret[:-width] 96 | return ret[width - 1:] / width 97 | 98 | audio_mask = moving_average(voice_flags, vad_moving_average_width) 99 | audio_mask = np.round(audio_mask).astype(np.bool) 100 | 101 | # Dilate the voiced regions 102 | audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1)) 103 | audio_mask = np.repeat(audio_mask, samples_per_window) 104 | 105 | return wav[audio_mask == True] 106 | 107 | 108 | def normalize_volume(wav, target_dBFS, increase_only=False, decrease_only=False): 109 | if increase_only and decrease_only: 110 | raise ValueError("Both increase only and decrease only are set") 111 | dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav ** 2)) 112 | if (dBFS_change < 0 and increase_only) or (dBFS_change > 0 and decrease_only): 113 | return wav 114 | return wav * (10 ** (dBFS_change / 20)) 115 | -------------------------------------------------------------------------------- /encoder/config.py: -------------------------------------------------------------------------------- 1 | librispeech_datasets = { 2 | "train": { 3 | "clean": ["LibriSpeech/train-clean-100", "LibriSpeech/train-clean-360"], 4 | "other": ["LibriSpeech/train-other-500"] 5 | }, 6 | "test": { 7 | "clean": ["LibriSpeech/test-clean"], 8 | "other": ["LibriSpeech/test-other"] 9 | }, 10 | "dev": { 11 | "clean": ["LibriSpeech/dev-clean"], 12 | "other": ["LibriSpeech/dev-other"] 13 | }, 14 | } 15 | libritts_datasets = { 16 | "train": { 17 | "clean": ["LibriTTS/train-clean-100", "LibriTTS/train-clean-360"], 18 | "other": ["LibriTTS/train-other-500"] 19 | }, 20 | "test": { 21 | "clean": ["LibriTTS/test-clean"], 22 | "other": ["LibriTTS/test-other"] 23 | }, 24 | "dev": { 25 | "clean": ["LibriTTS/dev-clean"], 26 | "other": ["LibriTTS/dev-other"] 27 | }, 28 | } 29 | voxceleb_datasets = { 30 | "voxceleb1" : { 31 | "train": ["VoxCeleb1/wav"], 32 | "test": ["VoxCeleb1/test_wav"] 33 | }, 34 | "voxceleb2" : { 35 | "train": ["VoxCeleb2/dev/aac"], 36 | "test": ["VoxCeleb2/test_wav"] 37 | } 38 | } 39 | 40 | other_datasets = [ 41 | "LJSpeech-1.1", 42 | "VCTK-Corpus/wav48", 43 | ] 44 | 45 | anglophone_nationalites = ["australia", "canada", "ireland", "uk", "usa"] 46 | -------------------------------------------------------------------------------- /encoder/data_objects/__init__.py: 
-------------------------------------------------------------------------------- 1 | from encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset 2 | from encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataLoader 3 | -------------------------------------------------------------------------------- /encoder/data_objects/random_cycler.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | class RandomCycler: 4 | """ 5 | Creates an internal copy of a sequence and allows access to its items in a constrained random 6 | order. For a source sequence of n items and one or several consecutive queries of a total 7 | of m items, the following guarantees hold (one implies the other): 8 | - Each item will be returned between m // n and ((m - 1) // n) + 1 times. 9 | - Between two appearances of the same item, there may be at most 2 * (n - 1) other items. 10 | """ 11 | 12 | def __init__(self, source): 13 | if len(source) == 0: 14 | raise Exception("Can't create RandomCycler from an empty collection") 15 | self.all_items = list(source) 16 | self.next_items = [] 17 | 18 | def sample(self, count: int): 19 | shuffle = lambda l: random.sample(l, len(l)) 20 | 21 | out = [] 22 | while count > 0: 23 | if count >= len(self.all_items): 24 | out.extend(shuffle(list(self.all_items))) 25 | count -= len(self.all_items) 26 | continue 27 | n = min(count, len(self.next_items)) 28 | out.extend(self.next_items[:n]) 29 | count -= n 30 | self.next_items = self.next_items[n:] 31 | if len(self.next_items) == 0: 32 | self.next_items = shuffle(list(self.all_items)) 33 | return out 34 | 35 | def __next__(self): 36 | return self.sample(1)[0] 37 | 38 | -------------------------------------------------------------------------------- /encoder/data_objects/speaker.py: -------------------------------------------------------------------------------- 1 | from encoder.data_objects.random_cycler import RandomCycler 2 | from encoder.data_objects.utterance import Utterance 3 | from pathlib import Path 4 | 5 | # Contains the set of utterances of a single speaker 6 | class Speaker: 7 | def __init__(self, root: Path): 8 | self.root = root 9 | self.name = root.name 10 | self.utterances = None 11 | self.utterance_cycler = None 12 | 13 | def _load_utterances(self): 14 | with self.root.joinpath("_sources.txt").open("r") as sources_file: 15 | sources = [l.split(",") for l in sources_file] 16 | sources = {frames_fname: wave_fpath for frames_fname, wave_fpath in sources} 17 | self.utterances = [Utterance(self.root.joinpath(f), w) for f, w in sources.items()] 18 | self.utterance_cycler = RandomCycler(self.utterances) 19 | 20 | def random_partial(self, count, n_frames): 21 | """ 22 | Samples a batch of unique partial utterances from the disk in a way that all 23 | utterances come up at least once every two cycles and in a random order every time. 24 | 25 | :param count: The number of partial utterances to sample from the set of utterances from 26 | that speaker. Utterances are guaranteed not to be repeated if is not larger than 27 | the number of utterances available. 28 | :param n_frames: The number of frames in the partial utterance. 29 | :return: A list of tuples (utterance, frames, range) where utterance is an Utterance, 30 | frames are the frames of the partial utterances and range is the range of the partial 31 | utterance with regard to the complete utterance. 
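        Illustrative example (the directory name is hypothetical; shapes assume the defaults
        in params_data.py, i.e. 160-frame partials of 40 mel channels):

            >>> speaker = Speaker(Path("encoder_preprocessed/LibriSpeech_train-other-500_14"))
            >>> utterance, frames, (start, end) = speaker.random_partial(count=1, n_frames=160)[0]
            >>> frames.shape, end - start
            ((160, 40), 160)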
32 | """ 33 | if self.utterances is None: 34 | self._load_utterances() 35 | 36 | utterances = self.utterance_cycler.sample(count) 37 | 38 | a = [(u,) + u.random_partial(n_frames) for u in utterances] 39 | 40 | return a 41 | -------------------------------------------------------------------------------- /encoder/data_objects/speaker_batch.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from typing import List 3 | from encoder.data_objects.speaker import Speaker 4 | 5 | class SpeakerBatch: 6 | def __init__(self, speakers: List[Speaker], utterances_per_speaker: int, n_frames: int): 7 | self.speakers = speakers 8 | self.partials = {s: s.random_partial(utterances_per_speaker, n_frames) for s in speakers} 9 | 10 | # Array of shape (n_speakers * n_utterances, n_frames, mel_n), e.g. for 3 speakers with 11 | # 4 utterances each of 160 frames of 40 mel coefficients: (12, 160, 40) 12 | self.data = np.array([frames for s in speakers for _, frames, _ in self.partials[s]]) 13 | -------------------------------------------------------------------------------- /encoder/data_objects/speaker_verification_dataset.py: -------------------------------------------------------------------------------- 1 | from encoder.data_objects.random_cycler import RandomCycler 2 | from encoder.data_objects.speaker_batch import SpeakerBatch 3 | from encoder.data_objects.speaker import Speaker 4 | from encoder.params_data import partials_n_frames 5 | from torch.utils.data import Dataset, DataLoader 6 | from pathlib import Path 7 | 8 | # TODO: improve with a pool of speakers for data efficiency 9 | 10 | class SpeakerVerificationDataset(Dataset): 11 | def __init__(self, datasets_root: Path): 12 | self.root = datasets_root 13 | speaker_dirs = [f for f in self.root.glob("*") if f.is_dir()] 14 | if len(speaker_dirs) == 0: 15 | raise Exception("No speakers found. 
Make sure you are pointing to the directory " 16 | "containing all preprocessed speaker directories.") 17 | self.speakers = [Speaker(speaker_dir) for speaker_dir in speaker_dirs] 18 | self.speaker_cycler = RandomCycler(self.speakers) 19 | 20 | def __len__(self): 21 | return int(1e10) 22 | 23 | def __getitem__(self, index): 24 | return next(self.speaker_cycler) 25 | 26 | def get_logs(self): 27 | log_string = "" 28 | for log_fpath in self.root.glob("*.txt"): 29 | with log_fpath.open("r") as log_file: 30 | log_string += "".join(log_file.readlines()) 31 | return log_string 32 | 33 | 34 | class SpeakerVerificationDataLoader(DataLoader): 35 | def __init__(self, dataset, speakers_per_batch, utterances_per_speaker, sampler=None, 36 | batch_sampler=None, num_workers=0, pin_memory=False, timeout=0, 37 | worker_init_fn=None): 38 | self.utterances_per_speaker = utterances_per_speaker 39 | 40 | super().__init__( 41 | dataset=dataset, 42 | batch_size=speakers_per_batch, 43 | shuffle=False, 44 | sampler=sampler, 45 | batch_sampler=batch_sampler, 46 | num_workers=num_workers, 47 | collate_fn=self.collate, 48 | pin_memory=pin_memory, 49 | drop_last=False, 50 | timeout=timeout, 51 | worker_init_fn=worker_init_fn 52 | ) 53 | 54 | def collate(self, speakers): 55 | return SpeakerBatch(speakers, self.utterances_per_speaker, partials_n_frames) 56 | -------------------------------------------------------------------------------- /encoder/data_objects/utterance.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class Utterance: 5 | def __init__(self, frames_fpath, wave_fpath): 6 | self.frames_fpath = frames_fpath 7 | self.wave_fpath = wave_fpath 8 | 9 | def get_frames(self): 10 | return np.load(self.frames_fpath) 11 | 12 | def random_partial(self, n_frames): 13 | """ 14 | Crops the frames into a partial utterance of n_frames 15 | 16 | :param n_frames: The number of frames of the partial utterance 17 | :return: the partial utterance frames and a tuple indicating the start and end of the 18 | partial utterance in the complete utterance. 19 | """ 20 | frames = self.get_frames() 21 | if frames.shape[0] == n_frames: 22 | start = 0 23 | else: 24 | start = np.random.randint(0, frames.shape[0] - n_frames) 25 | end = start + n_frames 26 | return frames[start:end], (start, end) -------------------------------------------------------------------------------- /encoder/inference.py: -------------------------------------------------------------------------------- 1 | from encoder.params_data import * 2 | from encoder.model import SpeakerEncoder 3 | from encoder.audio import preprocess_wav # We want to expose this function from here 4 | from matplotlib import cm 5 | from encoder import audio 6 | from pathlib import Path 7 | import matplotlib.pyplot as plt 8 | import numpy as np 9 | import torch 10 | 11 | _model = None # type: SpeakerEncoder 12 | _device = None # type: torch.device 13 | 14 | 15 | def load_model(weights_fpath: Path, device=None): 16 | """ 17 | Loads the model in memory. If this function is not explicitely called, it will be run on the 18 | first call to embed_frames() with the default weights file. 19 | 20 | :param weights_fpath: the path to saved model weights. 21 | :param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda"). The 22 | model will be loaded and will run on this device. Outputs will however always be on the cpu. 23 | If None, will default to your GPU if it"s available, otherwise your CPU. 
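    Illustrative usage (the path matches the one used in demo_voice.py; nothing is returned,
    the loaded model is kept in a module-level global):

        >>> load_model(Path("encoder/saved_models/pretrained.pt"))
        >>> is_loaded()
        True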
24 | """ 25 | # TODO: I think the slow loading of the encoder might have something to do with the device it 26 | # was saved on. Worth investigating. 27 | global _model, _device 28 | if device is None: 29 | _device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 30 | elif isinstance(device, str): 31 | _device = torch.device(device) 32 | _model = SpeakerEncoder(_device, torch.device("cpu")) 33 | checkpoint = torch.load(weights_fpath, _device) 34 | _model.load_state_dict(checkpoint["model_state"]) 35 | _model.eval() 36 | print("Loaded encoder \"%s\" trained to step %d" % (weights_fpath.name, checkpoint["step"])) 37 | 38 | 39 | def is_loaded(): 40 | return _model is not None 41 | 42 | 43 | def embed_frames_batch(frames_batch): 44 | """ 45 | Computes embeddings for a batch of mel spectrogram. 46 | 47 | :param frames_batch: a batch mel of spectrogram as a numpy array of float32 of shape 48 | (batch_size, n_frames, n_channels) 49 | :return: the embeddings as a numpy array of float32 of shape (batch_size, model_embedding_size) 50 | """ 51 | if _model is None: 52 | raise Exception("Model was not loaded. Call load_model() before inference.") 53 | 54 | frames = torch.from_numpy(frames_batch).to(_device) 55 | embed = _model.forward(frames).detach().cpu().numpy() 56 | return embed 57 | 58 | 59 | def compute_partial_slices(n_samples, partial_utterance_n_frames=partials_n_frames, 60 | min_pad_coverage=0.75, overlap=0.5): 61 | """ 62 | Computes where to split an utterance waveform and its corresponding mel spectrogram to obtain 63 | partial utterances of each. Both the waveform and the mel 64 | spectrogram slices are returned, so as to make each partial utterance waveform correspond to 65 | its spectrogram. This function assumes that the mel spectrogram parameters used are those 66 | defined in params_data.py. 67 | 68 | The returned ranges may be indexing further than the length of the waveform. It is 69 | recommended that you pad the waveform with zeros up to wave_slices[-1].stop. 70 | 71 | :param n_samples: the number of samples in the waveform 72 | :param partial_utterance_n_frames: the number of mel spectrogram frames in each partial 73 | utterance 74 | :param min_pad_coverage: when reaching the last partial utterance, it may or may not have 75 | enough frames. If at least of are present, 76 | then the last partial utterance will be considered, as if we padded the audio. Otherwise, 77 | it will be discarded, as if we trimmed the audio. If there aren't enough frames for 1 partial 78 | utterance, this parameter is ignored so that the function always returns at least 1 slice. 79 | :param overlap: by how much the partial utterance should overlap. If set to 0, the partial 80 | utterances are entirely disjoint. 81 | :return: the waveform slices and mel spectrogram slices as lists of array slices. Index 82 | respectively the waveform and the mel spectrogram with these slices to obtain the partial 83 | utterances. 
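    Worked example with the defaults from params_data.py (sampling_rate = 16000 Hz,
    mel_window_step = 10 ms) and the default arguments: each mel frame covers 160 samples,
    each partial utterance covers 160 frames = 25600 samples (1.6 s), and with overlap = 0.5
    consecutive partials start 80 frames (0.8 s) apart.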
84 | """ 85 | assert 0 <= overlap < 1 86 | assert 0 < min_pad_coverage <= 1 87 | 88 | samples_per_frame = int((sampling_rate * mel_window_step / 1000)) 89 | n_frames = int(np.ceil((n_samples + 1) / samples_per_frame)) 90 | frame_step = max(int(np.round(partial_utterance_n_frames * (1 - overlap))), 1) 91 | 92 | # Compute the slices 93 | wav_slices, mel_slices = [], [] 94 | steps = max(1, n_frames - partial_utterance_n_frames + frame_step + 1) 95 | for i in range(0, steps, frame_step): 96 | mel_range = np.array([i, i + partial_utterance_n_frames]) 97 | wav_range = mel_range * samples_per_frame 98 | mel_slices.append(slice(*mel_range)) 99 | wav_slices.append(slice(*wav_range)) 100 | 101 | # Evaluate whether extra padding is warranted or not 102 | last_wav_range = wav_slices[-1] 103 | coverage = (n_samples - last_wav_range.start) / (last_wav_range.stop - last_wav_range.start) 104 | if coverage < min_pad_coverage and len(mel_slices) > 1: 105 | mel_slices = mel_slices[:-1] 106 | wav_slices = wav_slices[:-1] 107 | 108 | return wav_slices, mel_slices 109 | 110 | 111 | def embed_utterance(wav, using_partials=True, return_partials=False, **kwargs): 112 | """ 113 | Computes an embedding for a single utterance. 114 | 115 | # TODO: handle multiple wavs to benefit from batching on GPU 116 | :param wav: a preprocessed (see audio.py) utterance waveform as a numpy array of float32 117 | :param using_partials: if True, then the utterance is split in partial utterances of 118 | frames and the utterance embedding is computed from their 119 | normalized average. If False, the utterance is instead computed from feeding the entire 120 | spectogram to the network. 121 | :param return_partials: if True, the partial embeddings will also be returned along with the 122 | wav slices that correspond to the partial embeddings. 123 | :param kwargs: additional arguments to compute_partial_splits() 124 | :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If 125 | is True, the partial utterances as a numpy array of float32 of shape 126 | (n_partials, model_embedding_size) and the wav partials as a list of slices will also be 127 | returned. If is simultaneously set to False, both these values will be None 128 | instead. 
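    Illustrative usage (assumes load_model() was called and wav is a preprocessed waveform;
    256 is the default model_embedding_size):

        >>> embed = embed_utterance(wav)                       # unit-norm vector of shape (256,)
        >>> embed, partials, slices = embed_utterance(wav, return_partials=True)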
129 | """ 130 | # Process the entire utterance if not using partials 131 | if not using_partials: 132 | frames = audio.wav_to_mel_spectrogram(wav) 133 | embed = embed_frames_batch(frames[None, ...])[0] 134 | if return_partials: 135 | return embed, None, None 136 | return embed 137 | 138 | # Compute where to split the utterance into partials and pad if necessary 139 | wave_slices, mel_slices = compute_partial_slices(len(wav), **kwargs) 140 | max_wave_length = wave_slices[-1].stop 141 | if max_wave_length >= len(wav): 142 | wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant") 143 | 144 | # Split the utterance into partials 145 | frames = audio.wav_to_mel_spectrogram(wav) 146 | frames_batch = np.array([frames[s] for s in mel_slices]) 147 | partial_embeds = embed_frames_batch(frames_batch) 148 | 149 | # Compute the utterance embedding from the partial embeddings 150 | raw_embed = np.mean(partial_embeds, axis=0) 151 | embed = raw_embed / np.linalg.norm(raw_embed, 2) 152 | 153 | if return_partials: 154 | return embed, partial_embeds, wave_slices 155 | return embed 156 | 157 | 158 | def embed_speaker(wavs, **kwargs): 159 | raise NotImplemented() 160 | 161 | 162 | def plot_embedding_as_heatmap(embed, ax=None, title="", shape=None, color_range=(0, 0.30)): 163 | if ax is None: 164 | ax = plt.gca() 165 | 166 | if shape is None: 167 | height = int(np.sqrt(len(embed))) 168 | shape = (height, -1) 169 | embed = embed.reshape(shape) 170 | 171 | cmap = cm.get_cmap() 172 | mappable = ax.imshow(embed, cmap=cmap) 173 | cbar = plt.colorbar(mappable, ax=ax, fraction=0.046, pad=0.04) 174 | mappable.set_clim(*color_range) 175 | 176 | 177 | ax.set_xticks([]), ax.set_yticks([]) 178 | ax.set_title(title) 179 | -------------------------------------------------------------------------------- /encoder/model.py: -------------------------------------------------------------------------------- 1 | from encoder.params_model import * 2 | from encoder.params_data import * 3 | from scipy.interpolate import interp1d 4 | from sklearn.metrics import roc_curve 5 | from torch.nn.utils import clip_grad_norm_ 6 | from scipy.optimize import brentq 7 | from torch import nn 8 | import numpy as np 9 | import torch 10 | 11 | 12 | class SpeakerEncoder(nn.Module): 13 | def __init__(self, device, loss_device): 14 | super().__init__() 15 | self.loss_device = loss_device 16 | 17 | # Network defition 18 | self.lstm = nn.LSTM(input_size=mel_n_channels, 19 | hidden_size=model_hidden_size, 20 | num_layers=model_num_layers, 21 | batch_first=True).to(device) 22 | self.linear = nn.Linear(in_features=model_hidden_size, 23 | out_features=model_embedding_size).to(device) 24 | self.relu = torch.nn.ReLU().to(device) 25 | 26 | # Cosine similarity scaling (with fixed initial parameter values) 27 | self.similarity_weight = nn.Parameter(torch.tensor([10.])).to(loss_device) 28 | self.similarity_bias = nn.Parameter(torch.tensor([-5.])).to(loss_device) 29 | 30 | # Loss 31 | self.loss_fn = nn.CrossEntropyLoss().to(loss_device) 32 | 33 | def do_gradient_ops(self): 34 | # Gradient scale 35 | self.similarity_weight.grad *= 0.01 36 | self.similarity_bias.grad *= 0.01 37 | 38 | # Gradient clipping 39 | clip_grad_norm_(self.parameters(), 3, norm_type=2) 40 | 41 | def forward(self, utterances, hidden_init=None): 42 | """ 43 | Computes the embeddings of a batch of utterance spectrograms. 
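        For the default training batch (64 speakers x 10 utterances of 160 frames x 40 mel
        channels) the input has shape (640, 160, 40) and the output has shape (640, 256):
        only the final hidden state of the last LSTM layer is kept, then projected, passed
        through a ReLU and L2-normalised.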
44 | 45 | :param utterances: batch of mel-scale filterbanks of same duration as a tensor of shape 46 | (batch_size, n_frames, n_channels) 47 | :param hidden_init: initial hidden state of the LSTM as a tensor of shape (num_layers, 48 | batch_size, hidden_size). Will default to a tensor of zeros if None. 49 | :return: the embeddings as a tensor of shape (batch_size, embedding_size) 50 | """ 51 | # Pass the input through the LSTM layers and retrieve all outputs, the final hidden state 52 | # and the final cell state. 53 | out, (hidden, cell) = self.lstm(utterances, hidden_init) 54 | 55 | # We take only the hidden state of the last layer 56 | embeds_raw = self.relu(self.linear(hidden[-1])) 57 | 58 | # L2-normalize it 59 | embeds = embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True) 60 | 61 | return embeds 62 | 63 | def similarity_matrix(self, embeds): 64 | """ 65 | Computes the similarity matrix according the section 2.1 of GE2E. 66 | 67 | :param embeds: the embeddings as a tensor of shape (speakers_per_batch, 68 | utterances_per_speaker, embedding_size) 69 | :return: the similarity matrix as a tensor of shape (speakers_per_batch, 70 | utterances_per_speaker, speakers_per_batch) 71 | """ 72 | speakers_per_batch, utterances_per_speaker = embeds.shape[:2] 73 | 74 | # Inclusive centroids (1 per speaker). Cloning is needed for reverse differentiation 75 | centroids_incl = torch.mean(embeds, dim=1, keepdim=True) 76 | centroids_incl = centroids_incl.clone() / torch.norm(centroids_incl, dim=2, keepdim=True) 77 | 78 | # Exclusive centroids (1 per utterance) 79 | centroids_excl = (torch.sum(embeds, dim=1, keepdim=True) - embeds) 80 | centroids_excl /= (utterances_per_speaker - 1) 81 | centroids_excl = centroids_excl.clone() / torch.norm(centroids_excl, dim=2, keepdim=True) 82 | 83 | # Similarity matrix. The cosine similarity of already 2-normed vectors is simply the dot 84 | # product of these vectors (which is just an element-wise multiplication reduced by a sum). 85 | # We vectorize the computation for efficiency. 86 | sim_matrix = torch.zeros(speakers_per_batch, utterances_per_speaker, 87 | speakers_per_batch).to(self.loss_device) 88 | mask_matrix = 1 - np.eye(speakers_per_batch, dtype=np.int) 89 | for j in range(speakers_per_batch): 90 | mask = np.where(mask_matrix[j])[0] 91 | sim_matrix[mask, :, j] = (embeds[mask] * centroids_incl[j]).sum(dim=2) 92 | sim_matrix[j, :, j] = (embeds[j] * centroids_excl[j]).sum(dim=1) 93 | 94 | ## Even more vectorized version (slower maybe because of transpose) 95 | # sim_matrix2 = torch.zeros(speakers_per_batch, speakers_per_batch, utterances_per_speaker 96 | # ).to(self.loss_device) 97 | # eye = np.eye(speakers_per_batch, dtype=np.int) 98 | # mask = np.where(1 - eye) 99 | # sim_matrix2[mask] = (embeds[mask[0]] * centroids_incl[mask[1]]).sum(dim=2) 100 | # mask = np.where(eye) 101 | # sim_matrix2[mask] = (embeds * centroids_excl).sum(dim=2) 102 | # sim_matrix2 = sim_matrix2.transpose(1, 2) 103 | 104 | sim_matrix = sim_matrix * self.similarity_weight + self.similarity_bias 105 | return sim_matrix 106 | 107 | def loss(self, embeds): 108 | """ 109 | Computes the softmax loss according the section 2.1 of GE2E. 110 | 111 | :param embeds: the embeddings as a tensor of shape (speakers_per_batch, 112 | utterances_per_speaker, embedding_size) 113 | :return: the loss and the EER for this batch of embeddings. 
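        With the default training parameters (speakers_per_batch = 64, utterances_per_speaker = 10),
        the similarity matrix is reshaped to (640, 64) and compared against 640 target speaker
        indices (each index repeated 10 times), so the loss reduces to a standard cross-entropy
        over "which speaker produced this utterance".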
114 | """ 115 | speakers_per_batch, utterances_per_speaker = embeds.shape[:2] 116 | 117 | # Loss 118 | sim_matrix = self.similarity_matrix(embeds) 119 | sim_matrix = sim_matrix.reshape((speakers_per_batch * utterances_per_speaker, 120 | speakers_per_batch)) 121 | ground_truth = np.repeat(np.arange(speakers_per_batch), utterances_per_speaker) 122 | target = torch.from_numpy(ground_truth).long().to(self.loss_device) 123 | loss = self.loss_fn(sim_matrix, target) 124 | 125 | # EER (not backpropagated) 126 | with torch.no_grad(): 127 | inv_argmax = lambda i: np.eye(1, speakers_per_batch, i, dtype=np.int)[0] 128 | labels = np.array([inv_argmax(i) for i in ground_truth]) 129 | preds = sim_matrix.detach().cpu().numpy() 130 | 131 | # Snippet from https://yangcha.github.io/EER-ROC/ 132 | fpr, tpr, thresholds = roc_curve(labels.flatten(), preds.flatten()) 133 | eer = brentq(lambda x: 1. - x - interp1d(fpr, tpr)(x), 0., 1.) 134 | 135 | return loss, eer -------------------------------------------------------------------------------- /encoder/params_data.py: -------------------------------------------------------------------------------- 1 | 2 | ## Mel-filterbank 3 | mel_window_length = 25 # In milliseconds 4 | mel_window_step = 10 # In milliseconds 5 | mel_n_channels = 40 6 | 7 | 8 | ## Audio 9 | sampling_rate = 16000 10 | # Number of spectrogram frames in a partial utterance 11 | partials_n_frames = 160 # 1600 ms 12 | # Number of spectrogram frames at inference 13 | inference_n_frames = 80 # 800 ms 14 | 15 | 16 | ## Voice Activation Detection 17 | # Window size of the VAD. Must be either 10, 20 or 30 milliseconds. 18 | # This sets the granularity of the VAD. Should not need to be changed. 19 | vad_window_length = 30 # In milliseconds 20 | # Number of frames to average together when performing the moving average smoothing. 21 | # The larger this value, the larger the VAD variations must be to not get smoothed out. 22 | vad_moving_average_width = 8 23 | # Maximum number of consecutive silent frames a segment can have. 24 | vad_max_silence_length = 6 25 | 26 | 27 | ## Audio volume normalization 28 | audio_norm_target_dBFS = -30 29 | 30 | -------------------------------------------------------------------------------- /encoder/params_model.py: -------------------------------------------------------------------------------- 1 | 2 | ## Model parameters 3 | model_hidden_size = 256 4 | model_embedding_size = 256 5 | model_num_layers = 3 6 | 7 | 8 | ## Training parameters 9 | learning_rate_init = 1e-4 10 | speakers_per_batch = 64 11 | utterances_per_speaker = 10 12 | -------------------------------------------------------------------------------- /encoder/preprocess.py: -------------------------------------------------------------------------------- 1 | from multiprocess.pool import ThreadPool 2 | from encoder.params_data import * 3 | from encoder.config import librispeech_datasets, anglophone_nationalites 4 | from datetime import datetime 5 | from encoder import audio 6 | from pathlib import Path 7 | from tqdm import tqdm 8 | import numpy as np 9 | 10 | 11 | class DatasetLog: 12 | """ 13 | Registers metadata about the dataset in a text file. 
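    For example, preprocessing "LibriSpeech/train-other-500" writes
    "Log_LibriSpeech_train-other-500.txt" to the output directory, recording the parameter
    values from params_data.py and, on finalize(), min/max/mean/median statistics of the
    processed sample durations.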
14 | """ 15 | def __init__(self, root, name): 16 | self.text_file = open(Path(root, "Log_%s.txt" % name.replace("/", "_")), "w") 17 | self.sample_data = dict() 18 | 19 | start_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M")) 20 | self.write_line("Creating dataset %s on %s" % (name, start_time)) 21 | self.write_line("-----") 22 | self._log_params() 23 | 24 | def _log_params(self): 25 | from encoder import params_data 26 | self.write_line("Parameter values:") 27 | for param_name in (p for p in dir(params_data) if not p.startswith("__")): 28 | value = getattr(params_data, param_name) 29 | self.write_line("\t%s: %s" % (param_name, value)) 30 | self.write_line("-----") 31 | 32 | def write_line(self, line): 33 | self.text_file.write("%s\n" % line) 34 | 35 | def add_sample(self, **kwargs): 36 | for param_name, value in kwargs.items(): 37 | if not param_name in self.sample_data: 38 | self.sample_data[param_name] = [] 39 | self.sample_data[param_name].append(value) 40 | 41 | def finalize(self): 42 | self.write_line("Statistics:") 43 | for param_name, values in self.sample_data.items(): 44 | self.write_line("\t%s:" % param_name) 45 | self.write_line("\t\tmin %.3f, max %.3f" % (np.min(values), np.max(values))) 46 | self.write_line("\t\tmean %.3f, median %.3f" % (np.mean(values), np.median(values))) 47 | self.write_line("-----") 48 | end_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M")) 49 | self.write_line("Finished on %s" % end_time) 50 | self.text_file.close() 51 | 52 | 53 | def _init_preprocess_dataset(dataset_name, datasets_root, out_dir) -> (Path, DatasetLog): 54 | dataset_root = datasets_root.joinpath(dataset_name) 55 | if not dataset_root.exists(): 56 | print("Couldn\'t find %s, skipping this dataset." % dataset_root) 57 | return None, None 58 | return dataset_root, DatasetLog(out_dir, dataset_name) 59 | 60 | 61 | def _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, extension, 62 | skip_existing, logger): 63 | print("%s: Preprocessing data for %d speakers." % (dataset_name, len(speaker_dirs))) 64 | 65 | # Function to preprocess utterances for one speaker 66 | def preprocess_speaker(speaker_dir: Path): 67 | # Give a name to the speaker that includes its dataset 68 | speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts) 69 | 70 | # Create an output directory with that name, as well as a txt file containing a 71 | # reference to each source file. 72 | speaker_out_dir = out_dir.joinpath(speaker_name) 73 | speaker_out_dir.mkdir(exist_ok=True) 74 | sources_fpath = speaker_out_dir.joinpath("_sources.txt") 75 | 76 | # There's a possibility that the preprocessing was interrupted earlier, check if 77 | # there already is a sources file. 
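        # Each line of _sources.txt is "<frames_fname>,<wave_fpath>" (the format parsed by
        # Speaker._load_utterances), so existing entries can be skipped when skip_existing is set.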
78 | if sources_fpath.exists(): 79 | try: 80 | with sources_fpath.open("r") as sources_file: 81 | existing_fnames = {line.split(",")[0] for line in sources_file} 82 | except: 83 | existing_fnames = {} 84 | else: 85 | existing_fnames = {} 86 | 87 | # Gather all audio files for that speaker recursively 88 | sources_file = sources_fpath.open("a" if skip_existing else "w") 89 | for in_fpath in speaker_dir.glob("**/*.%s" % extension): 90 | # Check if the target output file already exists 91 | out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts) 92 | out_fname = out_fname.replace(".%s" % extension, ".npy") 93 | if skip_existing and out_fname in existing_fnames: 94 | continue 95 | 96 | # Load and preprocess the waveform 97 | wav = audio.preprocess_wav(in_fpath) 98 | if len(wav) == 0: 99 | continue 100 | 101 | # Create the mel spectrogram, discard those that are too short 102 | frames = audio.wav_to_mel_spectrogram(wav) 103 | if len(frames) < partials_n_frames: 104 | continue 105 | 106 | out_fpath = speaker_out_dir.joinpath(out_fname) 107 | np.save(out_fpath, frames) 108 | logger.add_sample(duration=len(wav) / sampling_rate) 109 | sources_file.write("%s,%s\n" % (out_fname, in_fpath)) 110 | 111 | sources_file.close() 112 | 113 | # Process the utterances for each speaker 114 | with ThreadPool(8) as pool: 115 | list(tqdm(pool.imap(preprocess_speaker, speaker_dirs), dataset_name, len(speaker_dirs), 116 | unit="speakers")) 117 | logger.finalize() 118 | print("Done preprocessing %s.\n" % dataset_name) 119 | 120 | 121 | def preprocess_librispeech(datasets_root: Path, out_dir: Path, skip_existing=False): 122 | for dataset_name in librispeech_datasets["train"]["other"]: 123 | # Initialize the preprocessing 124 | dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir) 125 | if not dataset_root: 126 | return 127 | 128 | # Preprocess all speakers 129 | speaker_dirs = list(dataset_root.glob("*")) 130 | _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "flac", 131 | skip_existing, logger) 132 | 133 | 134 | def preprocess_voxceleb1(datasets_root: Path, out_dir: Path, skip_existing=False): 135 | # Initialize the preprocessing 136 | dataset_name = "VoxCeleb1" 137 | dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir) 138 | if not dataset_root: 139 | return 140 | 141 | # Get the contents of the meta file 142 | with dataset_root.joinpath("vox1_meta.csv").open("r") as metafile: 143 | metadata = [line.split("\t") for line in metafile][1:] 144 | 145 | # Select the ID and the nationality, filter out non-anglophone speakers 146 | nationalities = {line[0]: line[3] for line in metadata} 147 | keep_speaker_ids = [speaker_id for speaker_id, nationality in nationalities.items() if 148 | nationality.lower() in anglophone_nationalites] 149 | print("VoxCeleb1: using samples from %d (presumed anglophone) speakers out of %d." % 150 | (len(keep_speaker_ids), len(nationalities))) 151 | 152 | # Get the speaker directories for anglophone speakers only 153 | speaker_dirs = dataset_root.joinpath("wav").glob("*") 154 | speaker_dirs = [speaker_dir for speaker_dir in speaker_dirs if 155 | speaker_dir.name in keep_speaker_ids] 156 | print("VoxCeleb1: found %d anglophone speakers on the disk, %d missing (this is normal)." 
% 157 | (len(speaker_dirs), len(keep_speaker_ids) - len(speaker_dirs))) 158 | 159 | # Preprocess all speakers 160 | _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "wav", 161 | skip_existing, logger) 162 | 163 | 164 | def preprocess_voxceleb2(datasets_root: Path, out_dir: Path, skip_existing=False): 165 | # Initialize the preprocessing 166 | dataset_name = "VoxCeleb2" 167 | dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir) 168 | if not dataset_root: 169 | return 170 | 171 | # Get the speaker directories 172 | # Preprocess all speakers 173 | speaker_dirs = list(dataset_root.joinpath("dev", "aac").glob("*")) 174 | _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "m4a", 175 | skip_existing, logger) 176 | -------------------------------------------------------------------------------- /encoder/saved_models/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/encoder/saved_models/.gitkeep -------------------------------------------------------------------------------- /encoder/train.py: -------------------------------------------------------------------------------- 1 | from encoder.visualizations import Visualizations 2 | from encoder.data_objects import SpeakerVerificationDataLoader, SpeakerVerificationDataset 3 | from encoder.params_model import * 4 | from encoder.model import SpeakerEncoder 5 | from utils.profiler import Profiler 6 | from pathlib import Path 7 | import torch 8 | 9 | def sync(device: torch.device): 10 | # For correct profiling (cuda operations are async) 11 | if device.type == "cuda": 12 | torch.cuda.synchronize(device) 13 | 14 | 15 | def train(run_id: str, clean_data_root: Path, models_dir: Path, umap_every: int, save_every: int, 16 | backup_every: int, vis_every: int, force_restart: bool, visdom_server: str, 17 | no_visdom: bool): 18 | # Create a dataset and a dataloader 19 | dataset = SpeakerVerificationDataset(clean_data_root) 20 | loader = SpeakerVerificationDataLoader( 21 | dataset, 22 | speakers_per_batch, 23 | utterances_per_speaker, 24 | num_workers=8, 25 | ) 26 | 27 | # Setup the device on which to run the forward pass and the loss. These can be different, 28 | # because the forward pass is faster on the GPU whereas the loss is often (depending on your 29 | # hyperparameters) faster on the CPU. 30 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 31 | # FIXME: currently, the gradient is None if loss_device is cuda 32 | loss_device = torch.device("cpu") 33 | 34 | # Create the model and the optimizer 35 | model = SpeakerEncoder(device, loss_device) 36 | optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate_init) 37 | init_step = 1 38 | 39 | # Configure file path for the model 40 | state_fpath = models_dir.joinpath(run_id + ".pt") 41 | backup_dir = models_dir.joinpath(run_id + "_backups") 42 | 43 | # Load any existing model 44 | if not force_restart: 45 | if state_fpath.exists(): 46 | print("Found existing model \"%s\", loading it and resuming training." % run_id) 47 | checkpoint = torch.load(state_fpath) 48 | init_step = checkpoint["step"] 49 | model.load_state_dict(checkpoint["model_state"]) 50 | optimizer.load_state_dict(checkpoint["optimizer_state"]) 51 | optimizer.param_groups[0]["lr"] = learning_rate_init 52 | else: 53 | print("No model \"%s\" found, starting training from scratch." 
% run_id) 54 | else: 55 | print("Starting the training from scratch.") 56 | model.train() 57 | 58 | # Initialize the visualization environment 59 | vis = Visualizations(run_id, vis_every, server=visdom_server, disabled=no_visdom) 60 | vis.log_dataset(dataset) 61 | vis.log_params() 62 | device_name = str(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU") 63 | vis.log_implementation({"Device": device_name}) 64 | 65 | # Training loop 66 | profiler = Profiler(summarize_every=10, disabled=False) 67 | for step, speaker_batch in enumerate(loader, init_step): 68 | profiler.tick("Blocking, waiting for batch (threaded)") 69 | 70 | # Forward pass 71 | inputs = torch.from_numpy(speaker_batch.data).to(device) 72 | sync(device) 73 | profiler.tick("Data to %s" % device) 74 | embeds = model(inputs) 75 | sync(device) 76 | profiler.tick("Forward pass") 77 | embeds_loss = embeds.view((speakers_per_batch, utterances_per_speaker, -1)).to(loss_device) 78 | loss, eer = model.loss(embeds_loss) 79 | sync(loss_device) 80 | profiler.tick("Loss") 81 | 82 | # Backward pass 83 | model.zero_grad() 84 | loss.backward() 85 | profiler.tick("Backward pass") 86 | model.do_gradient_ops() 87 | optimizer.step() 88 | profiler.tick("Parameter update") 89 | 90 | # Update visualizations 91 | # learning_rate = optimizer.param_groups[0]["lr"] 92 | vis.update(loss.item(), eer, step) 93 | 94 | # Draw projections and save them to the backup folder 95 | if umap_every != 0 and step % umap_every == 0: 96 | print("Drawing and saving projections (step %d)" % step) 97 | backup_dir.mkdir(exist_ok=True) 98 | projection_fpath = backup_dir.joinpath("%s_umap_%06d.png" % (run_id, step)) 99 | embeds = embeds.detach().cpu().numpy() 100 | vis.draw_projections(embeds, utterances_per_speaker, step, projection_fpath) 101 | vis.save() 102 | 103 | # Overwrite the latest version of the model 104 | if save_every != 0 and step % save_every == 0: 105 | print("Saving the model (step %d)" % step) 106 | torch.save({ 107 | "step": step + 1, 108 | "model_state": model.state_dict(), 109 | "optimizer_state": optimizer.state_dict(), 110 | }, state_fpath) 111 | 112 | # Make a backup 113 | if backup_every != 0 and step % backup_every == 0: 114 | print("Making a backup (step %d)" % step) 115 | backup_dir.mkdir(exist_ok=True) 116 | backup_fpath = backup_dir.joinpath("%s_bak_%06d.pt" % (run_id, step)) 117 | torch.save({ 118 | "step": step + 1, 119 | "model_state": model.state_dict(), 120 | "optimizer_state": optimizer.state_dict(), 121 | }, backup_fpath) 122 | 123 | profiler.tick("Extras (visualizations, saving)") 124 | -------------------------------------------------------------------------------- /encoder/visualizations.py: -------------------------------------------------------------------------------- 1 | from encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset 2 | from datetime import datetime 3 | from time import perf_counter as timer 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | # import webbrowser 7 | import visdom 8 | import umap 9 | 10 | colormap = np.array([ 11 | [76, 255, 0], 12 | [0, 127, 70], 13 | [255, 0, 0], 14 | [255, 217, 38], 15 | [0, 135, 255], 16 | [165, 0, 165], 17 | [255, 167, 255], 18 | [0, 255, 255], 19 | [255, 96, 38], 20 | [142, 76, 0], 21 | [33, 0, 127], 22 | [0, 0, 0], 23 | [183, 183, 183], 24 | ], dtype=np.float) / 255 25 | 26 | 27 | class Visualizations: 28 | def __init__(self, env_name=None, update_every=10, server="http://localhost", disabled=False): 29 | # Tracking data 
30 | self.last_update_timestamp = timer() 31 | self.update_every = update_every 32 | self.step_times = [] 33 | self.losses = [] 34 | self.eers = [] 35 | print("Updating the visualizations every %d steps." % update_every) 36 | 37 | # If visdom is disabled TODO: use a better paradigm for that 38 | self.disabled = disabled 39 | if self.disabled: 40 | return 41 | 42 | # Set the environment name 43 | now = str(datetime.now().strftime("%d-%m %Hh%M")) 44 | if env_name is None: 45 | self.env_name = now 46 | else: 47 | self.env_name = "%s (%s)" % (env_name, now) 48 | 49 | # Connect to visdom and open the corresponding window in the browser 50 | try: 51 | self.vis = visdom.Visdom(server, env=self.env_name, raise_exceptions=True) 52 | except ConnectionError: 53 | raise Exception("No visdom server detected. Run the command \"visdom\" in your CLI to " 54 | "start it.") 55 | # webbrowser.open("http://localhost:8097/env/" + self.env_name) 56 | 57 | # Create the windows 58 | self.loss_win = None 59 | self.eer_win = None 60 | # self.lr_win = None 61 | self.implementation_win = None 62 | self.projection_win = None 63 | self.implementation_string = "" 64 | 65 | def log_params(self): 66 | if self.disabled: 67 | return 68 | from encoder import params_data 69 | from encoder import params_model 70 | param_string = "Model parameters:
" 71 | for param_name in (p for p in dir(params_model) if not p.startswith("__")): 72 | value = getattr(params_model, param_name) 73 | param_string += "\t%s: %s
" % (param_name, value) 74 | param_string += "Data parameters:
" 75 | for param_name in (p for p in dir(params_data) if not p.startswith("__")): 76 | value = getattr(params_data, param_name) 77 | param_string += "\t%s: %s
" % (param_name, value) 78 | self.vis.text(param_string, opts={"title": "Parameters"}) 79 | 80 | def log_dataset(self, dataset: SpeakerVerificationDataset): 81 | if self.disabled: 82 | return 83 | dataset_string = "" 84 | dataset_string += "Speakers: %s\n" % len(dataset.speakers) 85 | dataset_string += "\n" + dataset.get_logs() 86 | dataset_string = dataset_string.replace("\n", "
") 87 | self.vis.text(dataset_string, opts={"title": "Dataset"}) 88 | 89 | def log_implementation(self, params): 90 | if self.disabled: 91 | return 92 | implementation_string = "" 93 | for param, value in params.items(): 94 | implementation_string += "%s: %s\n" % (param, value) 95 | implementation_string = implementation_string.replace("\n", "
") 96 | self.implementation_string = implementation_string 97 | self.implementation_win = self.vis.text( 98 | implementation_string, 99 | opts={"title": "Training implementation"} 100 | ) 101 | 102 | def update(self, loss, eer, step): 103 | # Update the tracking data 104 | now = timer() 105 | self.step_times.append(1000 * (now - self.last_update_timestamp)) 106 | self.last_update_timestamp = now 107 | self.losses.append(loss) 108 | self.eers.append(eer) 109 | print(".", end="") 110 | 111 | # Update the plots every steps 112 | if step % self.update_every != 0: 113 | return 114 | time_string = "Step time: mean: %5dms std: %5dms" % \ 115 | (int(np.mean(self.step_times)), int(np.std(self.step_times))) 116 | print("\nStep %6d Loss: %.4f EER: %.4f %s" % 117 | (step, np.mean(self.losses), np.mean(self.eers), time_string)) 118 | if not self.disabled: 119 | self.loss_win = self.vis.line( 120 | [np.mean(self.losses)], 121 | [step], 122 | win=self.loss_win, 123 | update="append" if self.loss_win else None, 124 | opts=dict( 125 | legend=["Avg. loss"], 126 | xlabel="Step", 127 | ylabel="Loss", 128 | title="Loss", 129 | ) 130 | ) 131 | self.eer_win = self.vis.line( 132 | [np.mean(self.eers)], 133 | [step], 134 | win=self.eer_win, 135 | update="append" if self.eer_win else None, 136 | opts=dict( 137 | legend=["Avg. EER"], 138 | xlabel="Step", 139 | ylabel="EER", 140 | title="Equal error rate" 141 | ) 142 | ) 143 | if self.implementation_win is not None: 144 | self.vis.text( 145 | self.implementation_string + ("%s" % time_string), 146 | win=self.implementation_win, 147 | opts={"title": "Training implementation"}, 148 | ) 149 | 150 | # Reset the tracking 151 | self.losses.clear() 152 | self.eers.clear() 153 | self.step_times.clear() 154 | 155 | def draw_projections(self, embeds, utterances_per_speaker, step, out_fpath=None, 156 | max_speakers=10): 157 | max_speakers = min(max_speakers, len(colormap)) 158 | embeds = embeds[:max_speakers * utterances_per_speaker] 159 | 160 | n_speakers = len(embeds) // utterances_per_speaker 161 | ground_truth = np.repeat(np.arange(n_speakers), utterances_per_speaker) 162 | colors = [colormap[i] for i in ground_truth] 163 | 164 | reducer = umap.UMAP() 165 | projected = reducer.fit_transform(embeds) 166 | plt.scatter(projected[:, 0], projected[:, 1], c=colors) 167 | plt.gca().set_aspect("equal", "datalim") 168 | plt.title("UMAP projection (step %d)" % step) 169 | if not self.disabled: 170 | self.projection_win = self.vis.matplot(plt, win=self.projection_win) 171 | if out_fpath is not None: 172 | plt.savefig(out_fpath) 173 | plt.clf() 174 | 175 | def save(self): 176 | if not self.disabled: 177 | self.vis.save([self.env_name]) 178 | -------------------------------------------------------------------------------- /helper.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import librosa 3 | from pathlib import Path 4 | from encoder.inference import plot_embedding_as_heatmap 5 | import sounddevice as sd 6 | import wavio 7 | 8 | def draw_embed(embed, name, which): 9 | """ 10 | Draws an embedding. 
11 | 12 | Parameters: 13 | embed (np.array): array of embedding 14 | 15 | name (str): title of plot 16 | 17 | 18 | Return: 19 | fig: matplotlib figure 20 | """ 21 | fig, embed_ax = plt.subplots() 22 | plot_embedding_as_heatmap(embed) 23 | embed_ax.set_title(name) 24 | embed_ax.set_aspect("equal", "datalim") 25 | embed_ax.set_xticks([]) 26 | embed_ax.set_yticks([]) 27 | embed_ax.figure.canvas.draw() 28 | return fig 29 | 30 | 31 | def create_spectrogram(voice_sample): 32 | """ 33 | Creates and saves a spectrogram plot for a sound sample. 34 | 35 | Parameters: 36 | voice_sample (str): path to sample of sound 37 | 38 | Return: 39 | fig 40 | """ 41 | 42 | in_fpath = Path(voice_sample.replace('"', "").replace("'", "")) 43 | original_wav, sampling_rate = librosa.load(str(in_fpath)) 44 | 45 | # Plot the signal read from wav file 46 | fig = plt.figure() 47 | plt.subplot(211) 48 | plt.title(f"Spectrogram of file {voice_sample}") 49 | 50 | plt.plot(original_wav) 51 | plt.xlabel("Sample") 52 | plt.ylabel("Amplitude") 53 | 54 | plt.subplot(212) 55 | plt.specgram(original_wav, Fs=sampling_rate) 56 | plt.xlabel("Time") 57 | plt.ylabel("Frequency") 58 | # plt.savefig(voice_sample.split(".")[0] + "_spectogram.png") 59 | return fig 60 | 61 | def read_audio(file): 62 | with open(file, "rb") as audio_file: 63 | audio_bytes = audio_file.read() 64 | return audio_bytes 65 | 66 | def record(duration=5, fs=48000): 67 | sd.default.samplerate = fs 68 | sd.default.channels = 1 69 | myrecording = sd.rec(int(duration * fs)) 70 | sd.wait(duration) 71 | return myrecording 72 | 73 | def save_record(path_myrecording, myrecording, fs): 74 | wavio.write(path_myrecording, myrecording, fs, sampwidth=2) 75 | return None 76 | -------------------------------------------------------------------------------- /requirements_demo.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.10.0 2 | altair==4.1.0 3 | appdirs==1.4.4 4 | appnope==0.1.0 5 | argon2-cffi==20.1.0 6 | astor==0.8.1 7 | async-generator==1.10 8 | attrs==20.2.0 9 | audioread==2.1.8 10 | backcall==0.2.0 11 | base58==2.0.1 12 | bleach==3.2.1 13 | blinker==1.4 14 | boto3==1.15.7 15 | botocore==1.18.7 16 | cachetools==4.1.1 17 | certifi==2020.6.20 18 | cffi==1.14.3 19 | chardet==3.0.4 20 | click==7.1.2 21 | cycler==0.10.0 22 | decorator==4.4.2 23 | defusedxml==0.6.0 24 | dill==0.3.2 25 | entrypoints==0.3 26 | enum-compat==0.0.3 27 | ffmpeg==1.4 28 | future==0.18.2 29 | gast==0.2.2 30 | google-pasta==0.2.0 31 | grpcio==1.32.0 32 | h5py==2.10.0 33 | idna==2.10 34 | importlib-metadata==2.0.0 35 | inflect==4.1.0 36 | ipykernel==5.3.4 37 | ipython==7.18.1 38 | ipython-genutils==0.2.0 39 | ipywidgets==7.5.1 40 | jedi==0.17.2 41 | Jinja2==2.11.2 42 | jmespath==0.10.0 43 | joblib==0.16.0 44 | jsonpatch==1.26 45 | jsonpointer==2.0 46 | jsonschema==3.2.0 47 | jupyter-client==6.1.7 48 | jupyter-core==4.6.3 49 | jupyterlab-pygments==0.1.2 50 | Keras-Applications==1.0.8 51 | Keras-Preprocessing==1.1.2 52 | kiwisolver==1.2.0 53 | librosa==0.8.0 54 | llvmlite==0.31.0 55 | Markdown==3.2.2 56 | MarkupSafe==1.1.1 57 | matplotlib==3.2.2 58 | mistune==0.8.4 59 | multiprocess==0.70.10 60 | nbclient==0.5.0 61 | nbconvert==6.0.6 62 | nbformat==5.0.7 63 | nest-asyncio==1.4.1 64 | notebook==6.1.4 65 | numba==0.48.0 66 | numpy==1.19.2 67 | opt-einsum==3.3.0 68 | packaging==20.4 69 | pandas==1.1.2 70 | pandocfilters==1.4.2 71 | parso==0.7.1 72 | pathtools==0.1.2 73 | pexpect==4.8.0 74 | pickleshare==0.7.5 75 | Pillow==7.2.0 76 | pooch==1.2.0 77 | 
prometheus-client==0.8.0 78 | prompt-toolkit==3.0.7 79 | protobuf==3.13.0 80 | ptyprocess==0.6.0 81 | pyarrow==1.0.1 82 | pycparser==2.20 83 | pydeck==0.5.0b1 84 | Pygments==2.7.1 85 | pyparsing==2.4.7 86 | PyQt5==5.15.1 87 | PyQt5-sip==12.8.1 88 | pyrsistent==0.17.3 89 | python-dateutil==2.8.1 90 | pytz==2020.1 91 | pyzmq==19.0.2 92 | requests==2.24.0 93 | resampy==0.2.2 94 | s3transfer==0.3.3 95 | scikit-learn==0.23.2 96 | scipy==1.5.2 97 | Send2Trash==1.5.0 98 | six==1.15.0 99 | sounddevice==0.4.1 100 | SoundFile==0.10.3.post1 101 | streamlit==0.67.1 102 | tensorboard==1.15.0 103 | tensorflow==1.15.0 104 | tensorflow-estimator==1.15.1 105 | termcolor==1.1.0 106 | terminado==0.9.1 107 | testpath==0.4.4 108 | threadpoolctl==2.1.0 109 | toml==0.10.1 110 | toolz==0.11.1 111 | torch==1.6.0 112 | torchfile==0.1.0 113 | torchvision==0.7.0 114 | tornado==6.0.4 115 | tqdm==4.50.0 116 | traitlets==5.0.4 117 | tzlocal==2.1 118 | umap-learn==0.4.6 119 | Unidecode==1.1.1 120 | urllib3==1.25.10 121 | validators==0.18.1 122 | visdom==0.1.8.9 123 | watchdog==0.10.3 124 | wavio==0.0.4 125 | wcwidth==0.2.5 126 | webencodings==0.5.1 127 | webrtcvad==2.0.10 128 | websocket-client==0.57.0 129 | Werkzeug==1.0.1 130 | widgetsnbextension==3.5.1 131 | wrapt==1.12.1 132 | zipp==3.2.0 133 | -------------------------------------------------------------------------------- /samples/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/samples/.DS_Store -------------------------------------------------------------------------------- /samples/1320_00000.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/samples/1320_00000.mp3 -------------------------------------------------------------------------------- /samples/3575_00000.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/samples/3575_00000.mp3 -------------------------------------------------------------------------------- /samples/8230_00000.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/samples/8230_00000.mp3 -------------------------------------------------------------------------------- /samples/README.md: -------------------------------------------------------------------------------- 1 | The audio files in this folder are provided for toolbox testing and 2 | benchmarking purposes. These are the same reference utterances 3 | used by the SV2TTS authors to generate the audio samples located at: 4 | https://google.github.io/tacotron/publications/speaker_adaptation/index.html 5 | 6 | The `p240_00000.mp3` and `p260_00000.mp3` files are compressed 7 | versions of audios from the VCTK corpus available at: 8 | https://datashare.is.ed.ac.uk/handle/10283/3443 9 | VCTK.txt contains the copyright notices and licensing information. 
10 | 11 | The `1320_00000.mp3`, `3575_00000.mp3`, `6829_00000.mp3` 12 | and `8230_00000.mp3` files are compressed versions of audios 13 | from the LibriSpeech dataset available at: https://openslr.org/12 14 | For these files, the following notice applies: 15 | ``` 16 | LibriSpeech (c) 2014 by Vassil Panayotov 17 | 18 | LibriSpeech ASR corpus is licensed under a 19 | Creative Commons Attribution 4.0 International License. 20 | 21 | See . 22 | ``` 23 | -------------------------------------------------------------------------------- /samples/VCTK.txt: -------------------------------------------------------------------------------- 1 | --------------------------------------------------------------------- 2 | CSTR VCTK Corpus 3 | English Multi-speaker Corpus for CSTR Voice Cloning Toolkit 4 | 5 | (Version 0.92) 6 | RELEASE September 2019 7 | The Centre for Speech Technology Research 8 | University of Edinburgh 9 | Copyright (c) 2019 10 | 11 | Junichi Yamagishi 12 | jyamagis@inf.ed.ac.uk 13 | --------------------------------------------------------------------- 14 | 15 | Overview 16 | 17 | This CSTR VCTK Corpus includes speech data uttered by 110 English 18 | speakers with various accents. Each speaker reads out about 400 19 | sentences, which were selected from a newspaper, the rainbow passage 20 | and an elicitation paragraph used for the speech accent archive. 21 | 22 | The newspaper texts were taken from Herald Glasgow, with permission 23 | from Herald & Times Group. Each speaker has a different set of the 24 | newspaper texts selected based a greedy algorithm that increases the 25 | contextual and phonetic coverage. The details of the text selection 26 | algorithms are described in the following paper: 27 | 28 | C. Veaux, J. Yamagishi and S. King, 29 | "The voice bank corpus: Design, collection and data analysis of 30 | a large regional accent speech database," 31 | https://doi.org/10.1109/ICSDA.2013.6709856 32 | 33 | The rainbow passage and elicitation paragraph are the same for all 34 | speakers. The rainbow passage can be found at International Dialects 35 | of English Archive: 36 | (http://web.ku.edu/~idea/readings/rainbow.htm). The elicitation 37 | paragraph is identical to the one used for the speech accent archive 38 | (http://accent.gmu.edu). The details of the the speech accent archive 39 | can be found at 40 | http://www.ualberta.ca/~aacl2009/PDFs/WeinbergerKunath2009AACL.pdf 41 | 42 | All speech data was recorded using an identical recording setup: an 43 | omni-directional microphone (DPA 4035) and a small diaphragm condenser 44 | microphone with very wide bandwidth (Sennheiser MKH 800), 96kHz 45 | sampling frequency at 24 bits and in a hemi-anechoic chamber of 46 | the University of Edinburgh. (However, two speakers, p280 and p315 47 | had technical issues of the audio recordings using MKH 800). 48 | All recordings were converted into 16 bits, were downsampled to 49 | 48 kHz, and were manually end-pointed. 50 | 51 | This corpus was originally aimed for HMM-based text-to-speech synthesis 52 | systems, especially for speaker-adaptive HMM-based speech synthesis 53 | that uses average voice models trained on multiple speakers and speaker 54 | adaptation technologies. This corpus is also suitable for DNN-based 55 | multi-speaker text-to-speech synthesis systems and waveform modeling. 
56 | 57 | COPYING 58 | 59 | This corpus is licensed under the Creative Commons License: Attribution 4.0 International 60 | http://creativecommons.org/licenses/by/4.0/legalcode 61 | 62 | VCTK VARIANTS 63 | There are several variants of the VCTK corpus: 64 | Speech enhancement 65 | - Noisy speech database for training speech enhancement algorithms and TTS models where we added various types of noises to VCTK artificially: http://dx.doi.org/10.7488/ds/2117 66 | - Reverberant speech database for training speech dereverberation algorithms and TTS models where we added various types of reverberantion to VCTK artificially http://dx.doi.org/10.7488/ds/1425 67 | - Noisy reverberant speech database for training speech enhancement algorithms and TTS models http://dx.doi.org/10.7488/ds/2139 68 | - Device Recorded VCTK where speech signals of the VCTK corpus were played back and re-recorded in office environments using relatively inexpensive consumer devices http://dx.doi.org/10.7488/ds/2316 69 | - The Microsoft Scalable Noisy Speech Dataset (MS-SNSD) https://github.com/microsoft/MS-SNSD 70 | 71 | ASV and anti-spoofing 72 | - Spoofing and Anti-Spoofing (SAS) corpus, which is a collection of synthetic speech signals produced by nine techniques, two of which are speech synthesis, and seven are voice conversion. All of them were built using the VCTK corpus. http://dx.doi.org/10.7488/ds/252 73 | - Automatic Speaker Verification Spoofing and Countermeasures Challenge (ASVspoof 2015) Database. This database consists of synthetic speech signals produced by ten techniques and this has been used in the first Automatic Speaker Verification Spoofing and Countermeasures Challenge (ASVspoof 2015) http://dx.doi.org/10.7488/ds/298 74 | - ASVspoof 2019: The 3rd Automatic Speaker Verification Spoofing and Countermeasures Challenge database. This database has been used in the 3rd Automatic Speaker Verification Spoofing and Countermeasures Challenge (ASVspoof 2019) https://doi.org/10.7488/ds/2555 75 | 76 | 77 | ACKNOWLEDGEMENTS 78 | 79 | The CSTR VCTK Corpus was constructed by: 80 | 81 | Christophe Veaux (University of Edinburgh) 82 | Junichi Yamagishi (University of Edinburgh) 83 | Kirsten MacDonald 84 | 85 | The research leading to these results was partly funded from EPSRC 86 | grants EP/I031022/1 (NST) and EP/J002526/1 (CAF), from the RSE-NSFC 87 | grant (61111130120), and from the JST CREST (uDialogue). 
88 | 89 | Please cite this corpus as follows: 90 | Christophe Veaux, Junichi Yamagishi, Kirsten MacDonald, 91 | "CSTR VCTK Corpus: English Multi-speaker Corpus for CSTR Voice Cloning Toolkit", 92 | The Centre for Speech Technology Research (CSTR), 93 | University of Edinburgh 94 | 95 | -------------------------------------------------------------------------------- /samples/myvoice.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/samples/myvoice.mp3 -------------------------------------------------------------------------------- /samples/p240_00000.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/samples/p240_00000.mp3 -------------------------------------------------------------------------------- /samples/p260_00000.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/samples/p260_00000.mp3 -------------------------------------------------------------------------------- /slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/slides.pdf -------------------------------------------------------------------------------- /synthesizer/LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Original work Copyright (c) 2018 Rayhane Mama (https://github.com/Rayhane-mamah) 4 | Modified work Copyright (c) 2019 Corentin Jemine (https://github.com/CorentinJ) 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 
23 | -------------------------------------------------------------------------------- /synthesizer/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------- /synthesizer/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/synthesizer/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /synthesizer/__pycache__/audio.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/synthesizer/__pycache__/audio.cpython-37.pyc -------------------------------------------------------------------------------- /synthesizer/__pycache__/hparams.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/synthesizer/__pycache__/hparams.cpython-37.pyc -------------------------------------------------------------------------------- /synthesizer/__pycache__/inference.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/synthesizer/__pycache__/inference.cpython-37.pyc -------------------------------------------------------------------------------- /synthesizer/__pycache__/infolog.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/synthesizer/__pycache__/infolog.cpython-37.pyc -------------------------------------------------------------------------------- /synthesizer/__pycache__/tacotron2.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/synthesizer/__pycache__/tacotron2.cpython-37.pyc -------------------------------------------------------------------------------- /synthesizer/audio.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import librosa.filters 3 | import numpy as np 4 | import tensorflow as tf 5 | from scipy import signal 6 | from scipy.io import wavfile 7 | 8 | 9 | def load_wav(path, sr): 10 | return librosa.core.load(path, sr=sr)[0] 11 | 12 | def save_wav(wav, path, sr): 13 | wav *= 32767 / max(0.01, np.max(np.abs(wav))) 14 | #proposed by @dsmiller 15 | wavfile.write(path, sr, wav.astype(np.int16)) 16 | 17 | def save_wavenet_wav(wav, path, sr): 18 | librosa.output.write_wav(path, wav, sr=sr) 19 | 20 | def preemphasis(wav, k, preemphasize=True): 21 | if preemphasize: 22 | return signal.lfilter([1, -k], [1], wav) 23 | return wav 24 | 25 | def inv_preemphasis(wav, k, inv_preemphasize=True): 26 | if inv_preemphasize: 27 | return signal.lfilter([1], [1, -k], wav) 28 | return wav 29 | 30 | #From https://github.com/r9y9/wavenet_vocoder/blob/master/audio.py 31 | def start_and_end_indices(quantized, silence_threshold=2): 32 | for start 
in range(quantized.size): 33 | if abs(quantized[start] - 127) > silence_threshold: 34 | break 35 | for end in range(quantized.size - 1, 1, -1): 36 | if abs(quantized[end] - 127) > silence_threshold: 37 | break 38 | 39 | assert abs(quantized[start] - 127) > silence_threshold 40 | assert abs(quantized[end] - 127) > silence_threshold 41 | 42 | return start, end 43 | 44 | def get_hop_size(hparams): 45 | hop_size = hparams.hop_size 46 | if hop_size is None: 47 | assert hparams.frame_shift_ms is not None 48 | hop_size = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate) 49 | return hop_size 50 | 51 | def linearspectrogram(wav, hparams): 52 | D = _stft(preemphasis(wav, hparams.preemphasis, hparams.preemphasize), hparams) 53 | S = _amp_to_db(np.abs(D), hparams) - hparams.ref_level_db 54 | 55 | if hparams.signal_normalization: 56 | return _normalize(S, hparams) 57 | return S 58 | 59 | def melspectrogram(wav, hparams): 60 | D = _stft(preemphasis(wav, hparams.preemphasis, hparams.preemphasize), hparams) 61 | S = _amp_to_db(_linear_to_mel(np.abs(D), hparams), hparams) - hparams.ref_level_db 62 | 63 | if hparams.signal_normalization: 64 | return _normalize(S, hparams) 65 | return S 66 | 67 | def inv_linear_spectrogram(linear_spectrogram, hparams): 68 | """Converts linear spectrogram to waveform using librosa""" 69 | if hparams.signal_normalization: 70 | D = _denormalize(linear_spectrogram, hparams) 71 | else: 72 | D = linear_spectrogram 73 | 74 | S = _db_to_amp(D + hparams.ref_level_db) #Convert back to linear 75 | 76 | if hparams.use_lws: 77 | processor = _lws_processor(hparams) 78 | D = processor.run_lws(S.astype(np.float64).T ** hparams.power) 79 | y = processor.istft(D).astype(np.float32) 80 | return inv_preemphasis(y, hparams.preemphasis, hparams.preemphasize) 81 | else: 82 | return inv_preemphasis(_griffin_lim(S ** hparams.power, hparams), hparams.preemphasis, hparams.preemphasize) 83 | 84 | def inv_mel_spectrogram(mel_spectrogram, hparams): 85 | """Converts mel spectrogram to waveform using librosa""" 86 | if hparams.signal_normalization: 87 | D = _denormalize(mel_spectrogram, hparams) 88 | else: 89 | D = mel_spectrogram 90 | 91 | S = _mel_to_linear(_db_to_amp(D + hparams.ref_level_db), hparams) # Convert back to linear 92 | 93 | if hparams.use_lws: 94 | processor = _lws_processor(hparams) 95 | D = processor.run_lws(S.astype(np.float64).T ** hparams.power) 96 | y = processor.istft(D).astype(np.float32) 97 | return inv_preemphasis(y, hparams.preemphasis, hparams.preemphasize) 98 | else: 99 | return inv_preemphasis(_griffin_lim(S ** hparams.power, hparams), hparams.preemphasis, hparams.preemphasize) 100 | 101 | def _lws_processor(hparams): 102 | import lws 103 | return lws.lws(hparams.n_fft, get_hop_size(hparams), fftsize=hparams.win_size, mode="speech") 104 | 105 | def _griffin_lim(S, hparams): 106 | """librosa implementation of Griffin-Lim 107 | Based on https://github.com/librosa/librosa/issues/434 108 | """ 109 | angles = np.exp(2j * np.pi * np.random.rand(*S.shape)) 110 | S_complex = np.abs(S).astype(np.complex) 111 | y = _istft(S_complex * angles, hparams) 112 | for i in range(hparams.griffin_lim_iters): 113 | angles = np.exp(1j * np.angle(_stft(y, hparams))) 114 | y = _istft(S_complex * angles, hparams) 115 | return y 116 | 117 | def _stft(y, hparams): 118 | if hparams.use_lws: 119 | return _lws_processor(hparams).stft(y).T 120 | else: 121 | return librosa.stft(y=y, n_fft=hparams.n_fft, hop_length=get_hop_size(hparams), win_length=hparams.win_size) 122 | 123 | def _istft(y, hparams): 
124 | return librosa.istft(y, hop_length=get_hop_size(hparams), win_length=hparams.win_size) 125 | 126 | ########################################################## 127 | #Those are only correct when using lws!!! (This was messing with Wavenet quality for a long time!) 128 | def num_frames(length, fsize, fshift): 129 | """Compute number of time frames of spectrogram 130 | """ 131 | pad = (fsize - fshift) 132 | if length % fshift == 0: 133 | M = (length + pad * 2 - fsize) // fshift + 1 134 | else: 135 | M = (length + pad * 2 - fsize) // fshift + 2 136 | return M 137 | 138 | 139 | def pad_lr(x, fsize, fshift): 140 | """Compute left and right padding 141 | """ 142 | M = num_frames(len(x), fsize, fshift) 143 | pad = (fsize - fshift) 144 | T = len(x) + 2 * pad 145 | r = (M - 1) * fshift + fsize - T 146 | return pad, pad + r 147 | ########################################################## 148 | #Librosa correct padding 149 | def librosa_pad_lr(x, fsize, fshift): 150 | return 0, (x.shape[0] // fshift + 1) * fshift - x.shape[0] 151 | 152 | # Conversions 153 | _mel_basis = None 154 | _inv_mel_basis = None 155 | 156 | def _linear_to_mel(spectogram, hparams): 157 | global _mel_basis 158 | if _mel_basis is None: 159 | _mel_basis = _build_mel_basis(hparams) 160 | return np.dot(_mel_basis, spectogram) 161 | 162 | def _mel_to_linear(mel_spectrogram, hparams): 163 | global _inv_mel_basis 164 | if _inv_mel_basis is None: 165 | _inv_mel_basis = np.linalg.pinv(_build_mel_basis(hparams)) 166 | return np.maximum(1e-10, np.dot(_inv_mel_basis, mel_spectrogram)) 167 | 168 | def _build_mel_basis(hparams): 169 | assert hparams.fmax <= hparams.sample_rate // 2 170 | return librosa.filters.mel(hparams.sample_rate, hparams.n_fft, n_mels=hparams.num_mels, 171 | fmin=hparams.fmin, fmax=hparams.fmax) 172 | 173 | def _amp_to_db(x, hparams): 174 | min_level = np.exp(hparams.min_level_db / 20 * np.log(10)) 175 | return 20 * np.log10(np.maximum(min_level, x)) 176 | 177 | def _db_to_amp(x): 178 | return np.power(10.0, (x) * 0.05) 179 | 180 | def _normalize(S, hparams): 181 | if hparams.allow_clipping_in_normalization: 182 | if hparams.symmetric_mels: 183 | return np.clip((2 * hparams.max_abs_value) * ((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value, 184 | -hparams.max_abs_value, hparams.max_abs_value) 185 | else: 186 | return np.clip(hparams.max_abs_value * ((S - hparams.min_level_db) / (-hparams.min_level_db)), 0, hparams.max_abs_value) 187 | 188 | assert S.max() <= 0 and S.min() - hparams.min_level_db >= 0 189 | if hparams.symmetric_mels: 190 | return (2 * hparams.max_abs_value) * ((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value 191 | else: 192 | return hparams.max_abs_value * ((S - hparams.min_level_db) / (-hparams.min_level_db)) 193 | 194 | def _denormalize(D, hparams): 195 | if hparams.allow_clipping_in_normalization: 196 | if hparams.symmetric_mels: 197 | return (((np.clip(D, -hparams.max_abs_value, 198 | hparams.max_abs_value) + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value)) 199 | + hparams.min_level_db) 200 | else: 201 | return ((np.clip(D, 0, hparams.max_abs_value) * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db) 202 | 203 | if hparams.symmetric_mels: 204 | return (((D + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value)) + hparams.min_level_db) 205 | else: 206 | return ((D * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db) 207 | 
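The spectrogram helpers above are driven entirely by the `hparams` object (FFT size, hop size, mel filterbank and normalization range), so a waveform can be turned into a normalized mel spectrogram and approximately inverted back without extra configuration. A minimal round-trip sketch, assuming `synthesizer.hparams` is importable from the repository root; the wav file names below are placeholders, not audio shipped with this repo:

```
# Round-trip sketch using the audio helpers above.
# "sample.wav" and the output path are hypothetical file names.
from synthesizer import audio
from synthesizer.hparams import hparams

wav = audio.load_wav("sample.wav", sr=hparams.sample_rate)   # load at the synthesizer's sample rate
mel = audio.melspectrogram(wav, hparams)                     # normalized mel spectrogram, shape (num_mels, frames)
recon = audio.inv_mel_spectrogram(mel, hparams)              # Griffin-Lim (or LWS) inversion back to a waveform
audio.save_wav(recon, "sample_reconstructed.wav", sr=hparams.sample_rate)
```

Note that `inv_mel_spectrogram` falls back to Griffin-Lim unless `hparams.use_lws` is set, so the reconstruction only approximates the original phase.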
-------------------------------------------------------------------------------- /synthesizer/feeder.py: -------------------------------------------------------------------------------- 1 | from sklearn.model_selection import train_test_split 2 | from synthesizer.utils.text import text_to_sequence 3 | from synthesizer.infolog import log 4 | import tensorflow as tf 5 | import numpy as np 6 | import threading 7 | import time 8 | import os 9 | 10 | _batches_per_group = 64 11 | 12 | class Feeder: 13 | """ 14 | Feeds batches of data into queue on a background thread. 15 | """ 16 | 17 | def __init__(self, coordinator, metadata_filename, hparams): 18 | super(Feeder, self).__init__() 19 | self._coord = coordinator 20 | self._hparams = hparams 21 | self._cleaner_names = [x.strip() for x in hparams.cleaners.split(",")] 22 | self._train_offset = 0 23 | self._test_offset = 0 24 | 25 | # Load metadata 26 | self._mel_dir = os.path.join(os.path.dirname(metadata_filename), "mels") 27 | self._embed_dir = os.path.join(os.path.dirname(metadata_filename), "embeds") 28 | with open(metadata_filename, encoding="utf-8") as f: 29 | self._metadata = [line.strip().split("|") for line in f] 30 | frame_shift_ms = hparams.hop_size / hparams.sample_rate 31 | hours = sum([int(x[4]) for x in self._metadata]) * frame_shift_ms / (3600) 32 | log("Loaded metadata for {} examples ({:.2f} hours)".format(len(self._metadata), hours)) 33 | 34 | #Train test split 35 | if hparams.tacotron_test_size is None: 36 | assert hparams.tacotron_test_batches is not None 37 | 38 | test_size = (hparams.tacotron_test_size if hparams.tacotron_test_size is not None 39 | else hparams.tacotron_test_batches * hparams.tacotron_batch_size) 40 | indices = np.arange(len(self._metadata)) 41 | train_indices, test_indices = train_test_split(indices, 42 | test_size=test_size, random_state=hparams.tacotron_data_random_state) 43 | 44 | #Make sure test_indices is a multiple of batch_size else round up 45 | len_test_indices = self._round_down(len(test_indices), hparams.tacotron_batch_size) 46 | extra_test = test_indices[len_test_indices:] 47 | test_indices = test_indices[:len_test_indices] 48 | train_indices = np.concatenate([train_indices, extra_test]) 49 | 50 | self._train_meta = list(np.array(self._metadata)[train_indices]) 51 | self._test_meta = list(np.array(self._metadata)[test_indices]) 52 | 53 | self.test_steps = len(self._test_meta) // hparams.tacotron_batch_size 54 | 55 | if hparams.tacotron_test_size is None: 56 | assert hparams.tacotron_test_batches == self.test_steps 57 | 58 | #pad input sequences with the 0 ( _ ) 59 | self._pad = 0 60 | #explicitely setting the padding to a value that doesn"t originally exist in the spectogram 61 | #to avoid any possible conflicts, without affecting the output range of the model too much 62 | if hparams.symmetric_mels: 63 | self._target_pad = -hparams.max_abs_value 64 | else: 65 | self._target_pad = 0. 66 | #Mark finished sequences with 1s 67 | self._token_pad = 1. 68 | 69 | with tf.device("/cpu:0"): 70 | # Create placeholders for inputs and targets. Don"t specify batch size because we want 71 | # to be able to feed different batch sizes at eval time. 
72 | self._placeholders = [ 73 | tf.compat.v1.placeholder(tf.int32, shape=(None, None), name="inputs"), 74 | tf.compat.v1.placeholder(tf.int32, shape=(None, ), name="input_lengths"), 75 | tf.compat.v1.placeholder(tf.float32, shape=(None, None, hparams.num_mels), 76 | name="mel_targets"), 77 | tf.compat.v1.placeholder(tf.float32, shape=(None, None), name="token_targets"), 78 | tf.compat.v1.placeholder(tf.int32, shape=(None, ), name="targets_lengths"), 79 | tf.compat.v1.placeholder(tf.int32, shape=(hparams.tacotron_num_gpus, None), 80 | name="split_infos"), 81 | 82 | # SV2TTS 83 | tf.compat.v1.placeholder(tf.float32, shape=(None, hparams.speaker_embedding_size), 84 | name="speaker_embeddings") 85 | ] 86 | 87 | # Create queue for buffering data 88 | queue = tf.queue.FIFOQueue(8, [tf.int32, tf.int32, tf.float32, tf.float32, 89 | tf.int32, tf.int32, tf.float32], name="input_queue") 90 | self._enqueue_op = queue.enqueue(self._placeholders) 91 | self.inputs, self.input_lengths, self.mel_targets, self.token_targets, \ 92 | self.targets_lengths, self.split_infos, self.speaker_embeddings = queue.dequeue() 93 | 94 | self.inputs.set_shape(self._placeholders[0].shape) 95 | self.input_lengths.set_shape(self._placeholders[1].shape) 96 | self.mel_targets.set_shape(self._placeholders[2].shape) 97 | self.token_targets.set_shape(self._placeholders[3].shape) 98 | self.targets_lengths.set_shape(self._placeholders[4].shape) 99 | self.split_infos.set_shape(self._placeholders[5].shape) 100 | self.speaker_embeddings.set_shape(self._placeholders[6].shape) 101 | 102 | # Create eval queue for buffering eval data 103 | eval_queue = tf.queue.FIFOQueue(1, [tf.int32, tf.int32, tf.float32, tf.float32, 104 | tf.int32, tf.int32, tf.float32], name="eval_queue") 105 | self._eval_enqueue_op = eval_queue.enqueue(self._placeholders) 106 | self.eval_inputs, self.eval_input_lengths, self.eval_mel_targets, \ 107 | self.eval_token_targets, self.eval_targets_lengths, \ 108 | self.eval_split_infos, self.eval_speaker_embeddings = eval_queue.dequeue() 109 | 110 | self.eval_inputs.set_shape(self._placeholders[0].shape) 111 | self.eval_input_lengths.set_shape(self._placeholders[1].shape) 112 | self.eval_mel_targets.set_shape(self._placeholders[2].shape) 113 | self.eval_token_targets.set_shape(self._placeholders[3].shape) 114 | self.eval_targets_lengths.set_shape(self._placeholders[4].shape) 115 | self.eval_split_infos.set_shape(self._placeholders[5].shape) 116 | self.eval_speaker_embeddings.set_shape(self._placeholders[6].shape) 117 | 118 | 119 | def start_threads(self, session): 120 | self._session = session 121 | thread = threading.Thread(name="background", target=self._enqueue_next_train_group) 122 | thread.daemon = True #Thread will close when parent quits 123 | thread.start() 124 | 125 | thread = threading.Thread(name="background", target=self._enqueue_next_test_group) 126 | thread.daemon = True #Thread will close when parent quits 127 | thread.start() 128 | 129 | def _get_test_groups(self): 130 | meta = self._test_meta[self._test_offset] 131 | self._test_offset += 1 132 | 133 | text = meta[5] 134 | 135 | input_data = np.asarray(text_to_sequence(text, self._cleaner_names), dtype=np.int32) 136 | mel_target = np.load(os.path.join(self._mel_dir, meta[1])) 137 | #Create parallel sequences containing zeros to represent a non finished sequence 138 | token_target = np.asarray([0.] 
* (len(mel_target) - 1)) 139 | embed_target = np.load(os.path.join(self._embed_dir, meta[2])) 140 | return input_data, mel_target, token_target, embed_target, len(mel_target) 141 | 142 | def make_test_batches(self): 143 | start = time.time() 144 | 145 | # Read a group of examples 146 | n = self._hparams.tacotron_batch_size 147 | r = self._hparams.outputs_per_step 148 | 149 | #Test on entire test set 150 | examples = [self._get_test_groups() for i in range(len(self._test_meta))] 151 | 152 | # Bucket examples based on similar output sequence length for efficiency 153 | examples.sort(key=lambda x: x[-1]) 154 | batches = [examples[i: i+n] for i in range(0, len(examples), n)] 155 | np.random.shuffle(batches) 156 | 157 | log("\nGenerated %d test batches of size %d in %.3f sec" % (len(batches), n, time.time() - start)) 158 | return batches, r 159 | 160 | def _enqueue_next_train_group(self): 161 | while not self._coord.should_stop(): 162 | start = time.time() 163 | 164 | # Read a group of examples 165 | n = self._hparams.tacotron_batch_size 166 | r = self._hparams.outputs_per_step 167 | examples = [self._get_next_example() for i in range(n * _batches_per_group)] 168 | 169 | # Bucket examples based on similar output sequence length for efficiency 170 | examples.sort(key=lambda x: x[-1]) 171 | batches = [examples[i: i+n] for i in range(0, len(examples), n)] 172 | np.random.shuffle(batches) 173 | 174 | log("\nGenerated {} train batches of size {} in {:.3f} sec".format(len(batches), n, time.time() - start)) 175 | for batch in batches: 176 | feed_dict = dict(zip(self._placeholders, self._prepare_batch(batch, r))) 177 | self._session.run(self._enqueue_op, feed_dict=feed_dict) 178 | 179 | def _enqueue_next_test_group(self): 180 | #Create test batches once and evaluate on them for all test steps 181 | test_batches, r = self.make_test_batches() 182 | while not self._coord.should_stop(): 183 | for batch in test_batches: 184 | feed_dict = dict(zip(self._placeholders, self._prepare_batch(batch, r))) 185 | self._session.run(self._eval_enqueue_op, feed_dict=feed_dict) 186 | 187 | def _get_next_example(self): 188 | """Gets a single example (input, mel_target, token_target, linear_target, mel_length) from_ disk 189 | """ 190 | if self._train_offset >= len(self._train_meta): 191 | self._train_offset = 0 192 | np.random.shuffle(self._train_meta) 193 | 194 | meta = self._train_meta[self._train_offset] 195 | self._train_offset += 1 196 | 197 | text = meta[5] 198 | 199 | input_data = np.asarray(text_to_sequence(text, self._cleaner_names), dtype=np.int32) 200 | mel_target = np.load(os.path.join(self._mel_dir, meta[1])) 201 | #Create parallel sequences containing zeros to represent a non finished sequence 202 | token_target = np.asarray([0.] 
* (len(mel_target) - 1)) 203 | embed_target = np.load(os.path.join(self._embed_dir, meta[2])) 204 | return input_data, mel_target, token_target, embed_target, len(mel_target) 205 | 206 | def _prepare_batch(self, batches, outputs_per_step): 207 | assert 0 == len(batches) % self._hparams.tacotron_num_gpus 208 | size_per_device = int(len(batches) / self._hparams.tacotron_num_gpus) 209 | np.random.shuffle(batches) 210 | 211 | inputs = None 212 | mel_targets = None 213 | token_targets = None 214 | targets_lengths = None 215 | split_infos = [] 216 | 217 | targets_lengths = np.asarray([x[-1] for x in batches], dtype=np.int32) #Used to mask loss 218 | input_lengths = np.asarray([len(x[0]) for x in batches], dtype=np.int32) 219 | 220 | for i in range(self._hparams.tacotron_num_gpus): 221 | batch = batches[size_per_device*i:size_per_device*(i+1)] 222 | input_cur_device, input_max_len = self._prepare_inputs([x[0] for x in batch]) 223 | inputs = np.concatenate((inputs, input_cur_device), axis=1) if inputs is not None else input_cur_device 224 | mel_target_cur_device, mel_target_max_len = self._prepare_targets([x[1] for x in batch], outputs_per_step) 225 | mel_targets = np.concatenate(( mel_targets, mel_target_cur_device), axis=1) if mel_targets is not None else mel_target_cur_device 226 | 227 | #Pad sequences with 1 to infer that the sequence is done 228 | token_target_cur_device, token_target_max_len = self._prepare_token_targets([x[2] for x in batch], outputs_per_step) 229 | token_targets = np.concatenate((token_targets, token_target_cur_device),axis=1) if token_targets is not None else token_target_cur_device 230 | split_infos.append([input_max_len, mel_target_max_len, token_target_max_len]) 231 | 232 | split_infos = np.asarray(split_infos, dtype=np.int32) 233 | 234 | ### SV2TTS ### 235 | 236 | embed_targets = np.asarray([x[3] for x in batches]) 237 | 238 | ############## 239 | 240 | return inputs, input_lengths, mel_targets, token_targets, targets_lengths, \ 241 | split_infos, embed_targets 242 | 243 | def _prepare_inputs(self, inputs): 244 | max_len = max([len(x) for x in inputs]) 245 | return np.stack([self._pad_input(x, max_len) for x in inputs]), max_len 246 | 247 | def _prepare_targets(self, targets, alignment): 248 | max_len = max([len(t) for t in targets]) 249 | data_len = self._round_up(max_len, alignment) 250 | return np.stack([self._pad_target(t, data_len) for t in targets]), data_len 251 | 252 | def _prepare_token_targets(self, targets, alignment): 253 | max_len = max([len(t) for t in targets]) + 1 254 | data_len = self._round_up(max_len, alignment) 255 | return np.stack([self._pad_token_target(t, data_len) for t in targets]), data_len 256 | 257 | def _pad_input(self, x, length): 258 | return np.pad(x, (0, length - x.shape[0]), mode="constant", constant_values=self._pad) 259 | 260 | def _pad_target(self, t, length): 261 | return np.pad(t, [(0, length - t.shape[0]), (0, 0)], mode="constant", constant_values=self._target_pad) 262 | 263 | def _pad_token_target(self, t, length): 264 | return np.pad(t, (0, length - t.shape[0]), mode="constant", constant_values=self._token_pad) 265 | 266 | def _round_up(self, x, multiple): 267 | remainder = x % multiple 268 | return x if remainder == 0 else x + multiple - remainder 269 | 270 | def _round_down(self, x, multiple): 271 | remainder = x % multiple 272 | return x if remainder == 0 else x - remainder 273 | -------------------------------------------------------------------------------- /synthesizer/inference.py: 
-------------------------------------------------------------------------------- 1 | from synthesizer.tacotron2 import Tacotron2 2 | from synthesizer.hparams import hparams 3 | from multiprocess.pool import Pool # You're free to use either one 4 | #from multiprocessing import Pool # 5 | from synthesizer import audio 6 | from pathlib import Path 7 | from typing import Union, List 8 | import tensorflow as tf 9 | import numpy as np 10 | import numba.cuda 11 | import librosa 12 | 13 | 14 | class Synthesizer: 15 | sample_rate = hparams.sample_rate 16 | hparams = hparams 17 | 18 | def __init__(self, checkpoints_dir: Path, verbose=True, low_mem=False, seed=None): 19 | """ 20 | Creates a synthesizer ready for inference. The actual model isn't loaded in memory until 21 | needed or until load() is called. 22 | 23 | :param checkpoints_dir: path to the directory containing the checkpoint file as well as the 24 | weight files (.data, .index and .meta files) 25 | :param verbose: if False, only tensorflow's output will be printed TODO: suppress them too 26 | :param low_mem: if True, the model will be loaded in a separate process and its resources 27 | will be released after each usage. Adds a large overhead, only recommended if your GPU 28 | memory is low (<= 2gb) 29 | :param seed: optional integer for seeding random number generators when initializing model 30 | This makes the synthesizer output consistent for a given embedding and input text. 31 | However, it requires the model to be reloaded whenever a text is synthesized. 32 | """ 33 | self.verbose = verbose 34 | self._low_mem = low_mem 35 | self._seed = seed 36 | 37 | # Prepare the model 38 | self._model = None # type: Tacotron2 39 | checkpoint_state = tf.train.get_checkpoint_state(checkpoints_dir) 40 | if checkpoint_state is None: 41 | raise Exception("Could not find any synthesizer weights under %s" % checkpoints_dir) 42 | self.checkpoint_fpath = checkpoint_state.model_checkpoint_path 43 | if verbose: 44 | model_name = checkpoints_dir.parent.name.replace("logs-", "") 45 | step = int(self.checkpoint_fpath[self.checkpoint_fpath.rfind('-') + 1:]) 46 | print("Found synthesizer \"%s\" trained to step %d" % (model_name, step)) 47 | 48 | def set_seed(self, new_seed): 49 | """ 50 | Updates the seed that initializes random number generators associated with Tacotron2. 51 | Returns the new seed state as confirmation. 52 | """ 53 | try: 54 | self._seed = int(new_seed) 55 | except: 56 | self._seed = None 57 | 58 | return self._seed 59 | 60 | def is_loaded(self): 61 | """ 62 | Whether the model is loaded in GPU memory. 63 | """ 64 | return self._model is not None 65 | 66 | def load(self): 67 | """ 68 | Effectively loads the model to GPU memory given the weights file that was passed in the 69 | constructor. 70 | """ 71 | if self._low_mem: 72 | raise Exception("Cannot load the synthesizer permanently in low mem mode") 73 | tf.compat.v1.reset_default_graph() 74 | self._model = Tacotron2(self.checkpoint_fpath, hparams, seed=self._seed) 75 | 76 | def synthesize_spectrograms(self, texts: List[str], 77 | embeddings: Union[np.ndarray, List[np.ndarray]], 78 | return_alignments=False): 79 | """ 80 | Synthesizes mel spectrograms from texts and speaker embeddings. 
81 | 82 | :param texts: a list of N text prompts to be synthesized 83 | :param embeddings: a numpy array or list of speaker embeddings of shape (N, 256) 84 | :param return_alignments: if True, a matrix representing the alignments between the 85 | characters 86 | and each decoder output step will be returned for each spectrogram 87 | :return: a list of N melspectrograms as numpy arrays of shape (80, Mi), where Mi is the 88 | sequence length of spectrogram i, and possibly the alignments. 89 | """ 90 | if not self._low_mem: 91 | # Usual inference mode: load the model on the first request and keep it loaded. 92 | # Reload it every time for deterministic operation if seed specified. 93 | if not self.is_loaded() or self._seed is not None: 94 | self.load() 95 | specs, alignments = self._model.my_synthesize(embeddings, texts) 96 | else: 97 | # Low memory inference mode: load the model upon every request. The model has to be 98 | # loaded in a separate process to be able to release GPU memory (a simple workaround 99 | # to tensorflow's intricacies) 100 | specs, alignments = Pool(1).starmap(Synthesizer._one_shot_synthesize_spectrograms, 101 | [(self.checkpoint_fpath, embeddings, texts)])[0] 102 | 103 | return (specs, alignments) if return_alignments else specs 104 | 105 | @staticmethod 106 | def _one_shot_synthesize_spectrograms(checkpoint_fpath, embeddings, texts): 107 | # Load the model and forward the inputs 108 | tf.compat.v1.reset_default_graph() 109 | model = Tacotron2(checkpoint_fpath, hparams, seed=self._seed) 110 | specs, alignments = model.my_synthesize(embeddings, texts) 111 | 112 | # Detach the outputs (not doing so will cause the process to hang) 113 | specs, alignments = [spec.copy() for spec in specs], alignments.copy() 114 | 115 | # Close cuda for this process 116 | model.session.close() 117 | numba.cuda.select_device(0) 118 | numba.cuda.close() 119 | 120 | return specs, alignments 121 | 122 | @staticmethod 123 | def load_preprocess_wav(fpath): 124 | """ 125 | Loads and preprocesses an audio file under the same conditions the audio files were used to 126 | train the synthesizer. 127 | """ 128 | wav = librosa.load(str(fpath), hparams.sample_rate)[0] 129 | if hparams.rescale: 130 | wav = wav / np.abs(wav).max() * hparams.rescaling_max 131 | return wav 132 | 133 | @staticmethod 134 | def make_spectrogram(fpath_or_wav: Union[str, Path, np.ndarray]): 135 | """ 136 | Creates a mel spectrogram from an audio file in the same manner as the mel spectrograms that 137 | were fed to the synthesizer when training. 138 | """ 139 | if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path): 140 | wav = Synthesizer.load_preprocess_wav(fpath_or_wav) 141 | else: 142 | wav = fpath_or_wav 143 | 144 | mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32) 145 | return mel_spectrogram 146 | 147 | @staticmethod 148 | def griffin_lim(mel): 149 | """ 150 | Inverts a mel spectrogram using Griffin-Lim. The mel spectrogram is expected to have been built 151 | with the same parameters present in hparams.py. 
152 | """ 153 | return audio.inv_mel_spectrogram(mel, hparams) 154 | -------------------------------------------------------------------------------- /synthesizer/infolog.py: -------------------------------------------------------------------------------- 1 | import atexit 2 | import json 3 | from datetime import datetime 4 | from threading import Thread 5 | from urllib.request import Request, urlopen 6 | 7 | _format = "%Y-%m-%d %H:%M:%S.%f" 8 | _file = None 9 | _run_name = None 10 | _slack_url = None 11 | 12 | 13 | def init(filename, run_name, slack_url=None): 14 | global _file, _run_name, _slack_url 15 | _close_logfile() 16 | _file = open(filename, "a") 17 | _file = open(filename, "a") 18 | _file.write("\n-----------------------------------------------------------------\n") 19 | _file.write("Starting new {} training run\n".format(run_name)) 20 | _file.write("-----------------------------------------------------------------\n") 21 | _run_name = run_name 22 | _slack_url = slack_url 23 | 24 | 25 | def log(msg, end="\n", slack=False): 26 | print(msg, end=end) 27 | if _file is not None: 28 | _file.write("[%s] %s\n" % (datetime.now().strftime(_format)[:-3], msg)) 29 | if slack and _slack_url is not None: 30 | Thread(target=_send_slack, args=(msg,)).start() 31 | 32 | 33 | def _close_logfile(): 34 | global _file 35 | if _file is not None: 36 | _file.close() 37 | _file = None 38 | 39 | 40 | def _send_slack(msg): 41 | req = Request(_slack_url) 42 | req.add_header("Content-Type", "application/json") 43 | urlopen(req, json.dumps({ 44 | "username": "tacotron", 45 | "icon_emoji": ":taco:", 46 | "text": "*%s*: %s" % (_run_name, msg) 47 | }).encode()) 48 | 49 | 50 | atexit.register(_close_logfile) 51 | -------------------------------------------------------------------------------- /synthesizer/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .tacotron import Tacotron 2 | 3 | 4 | def create_model(name, hparams): 5 | if name == "Tacotron": 6 | return Tacotron(hparams) 7 | else: 8 | raise Exception("Unknown model: " + name) 9 | -------------------------------------------------------------------------------- /synthesizer/models/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/synthesizer/models/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /synthesizer/models/__pycache__/architecture_wrappers.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/synthesizer/models/__pycache__/architecture_wrappers.cpython-37.pyc -------------------------------------------------------------------------------- /synthesizer/models/__pycache__/attention.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/synthesizer/models/__pycache__/attention.cpython-37.pyc -------------------------------------------------------------------------------- /synthesizer/models/__pycache__/custom_decoder.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/synthesizer/models/__pycache__/custom_decoder.cpython-37.pyc -------------------------------------------------------------------------------- /synthesizer/models/__pycache__/helpers.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/synthesizer/models/__pycache__/helpers.cpython-37.pyc -------------------------------------------------------------------------------- /synthesizer/models/__pycache__/modules.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/synthesizer/models/__pycache__/modules.cpython-37.pyc -------------------------------------------------------------------------------- /synthesizer/models/__pycache__/tacotron.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/synthesizer/models/__pycache__/tacotron.cpython-37.pyc -------------------------------------------------------------------------------- /synthesizer/models/architecture_wrappers.py: -------------------------------------------------------------------------------- 1 | """A set of wrappers useful for tacotron 2 architecture 2 | All notations and variable names were used in concordance with originial tensorflow implementation 3 | """ 4 | import collections 5 | import tensorflow as tf 6 | from synthesizer.models.attention import _compute_attention 7 | from tensorflow.contrib.rnn import RNNCell 8 | from tensorflow.python.framework import ops, tensor_shape 9 | from tensorflow.python.ops import array_ops, check_ops, rnn_cell_impl, tensor_array_ops 10 | from tensorflow.python.util import nest 11 | 12 | _zero_state_tensors = rnn_cell_impl._zero_state_tensors 13 | 14 | 15 | 16 | class TacotronEncoderCell(RNNCell): 17 | """Tacotron 2 Encoder Cell 18 | Passes inputs through a stack of convolutional layers then through a bidirectional LSTM 19 | layer to predict the hidden representation vector (or memory) 20 | """ 21 | 22 | def __init__(self, convolutional_layers, lstm_layer): 23 | """Initialize encoder parameters 24 | 25 | Args: 26 | convolutional_layers: Encoder convolutional block class 27 | lstm_layer: encoder bidirectional lstm layer class 28 | """ 29 | super(TacotronEncoderCell, self).__init__() 30 | #Initialize encoder layers 31 | self._convolutions = convolutional_layers 32 | self._cell = lstm_layer 33 | 34 | def __call__(self, inputs, input_lengths=None): 35 | #Pass input sequence through a stack of convolutional layers 36 | conv_output = self._convolutions(inputs) 37 | 38 | #Extract hidden representation from encoder lstm cells 39 | hidden_representation = self._cell(conv_output, input_lengths) 40 | 41 | #For shape visualization 42 | self.conv_output_shape = conv_output.shape 43 | return hidden_representation 44 | 45 | 46 | class TacotronDecoderCellState( 47 | collections.namedtuple("TacotronDecoderCellState", 48 | ("cell_state", "attention", "time", "alignments", 49 | "alignment_history"))): 50 | """`namedtuple` storing the state of a `TacotronDecoderCell`. 
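A hedged sketch of how the encoder cell above is typically wired (EncoderConvolutions and EncoderRNN stand in for the convolutional stack and bidirectional-LSTM blocks, which are defined elsewhere; their names and arguments here are assumptions):

conv_stack = EncoderConvolutions(is_training, hparams)          # assumed convolutional block class
bi_lstm = EncoderRNN(is_training, size=256, zoneout=0.1)        # assumed bidirectional LSTM class
encoder_cell = TacotronEncoderCell(conv_stack, bi_lstm)
encoder_outputs = encoder_cell(embedded_inputs, input_lengths)  # the "memory" later queried by attention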
51 | Contains: 52 | - `cell_state`: The state of the wrapped `RNNCell` at the previous time 53 | step. 54 | - `attention`: The attention emitted at the previous time step. 55 | - `time`: int32 scalar containing the current time step. 56 | - `alignments`: A single or tuple of `Tensor`(s) containing the alignments 57 | emitted at the previous time step for each attention mechanism. 58 | - `alignment_history`: a single or tuple of `TensorArray`(s) 59 | containing alignment matrices from all time steps for each attention 60 | mechanism. Call `stack()` on each to convert to a `Tensor`. 61 | """ 62 | def replace(self, **kwargs): 63 | """Clones the current state while overwriting components provided by kwargs. 64 | """ 65 | return super(TacotronDecoderCellState, self)._replace(**kwargs) 66 | 67 | class TacotronDecoderCell(RNNCell): 68 | """Tactron 2 Decoder Cell 69 | Decodes encoder output and previous mel frames into next r frames 70 | 71 | Decoder Step i: 72 | 1) Prenet to compress last output information 73 | 2) Concat compressed inputs with previous context vector (input feeding) * 74 | 3) Decoder RNN (actual decoding) to predict current state s_{i} * 75 | 4) Compute new context vector c_{i} based on s_{i} and a cumulative sum of previous alignments * 76 | 5) Predict new output y_{i} using s_{i} and c_{i} (concatenated) 77 | 6) Predict output ys_{i} using s_{i} and c_{i} (concatenated) 78 | 79 | * : This is typically taking a vanilla LSTM, wrapping it using tensorflow"s attention wrapper, 80 | and wrap that with the prenet before doing an input feeding, and with the prediction layer 81 | that uses RNN states to project on output space. Actions marked with (*) can be replaced with 82 | tensorflow"s attention wrapper call if it was using cumulative alignments instead of previous alignments only. 83 | """ 84 | 85 | def __init__(self, prenet, attention_mechanism, rnn_cell, frame_projection, stop_projection): 86 | """Initialize decoder parameters 87 | 88 | Args: 89 | prenet: A tensorflow fully connected layer acting as the decoder pre-net 90 | attention_mechanism: A _BaseAttentionMechanism instance, usefull to 91 | learn encoder-decoder alignments 92 | rnn_cell: Instance of RNNCell, main body of the decoder 93 | frame_projection: tensorflow fully connected layer with r * num_mels output units 94 | stop_projection: tensorflow fully connected layer, expected to project to a scalar 95 | and through a sigmoid activation 96 | mask_finished: Boolean, Whether to mask decoder frames after the 97 | """ 98 | super(TacotronDecoderCell, self).__init__() 99 | #Initialize decoder layers 100 | self._prenet = prenet 101 | self._attention_mechanism = attention_mechanism 102 | self._cell = rnn_cell 103 | self._frame_projection = frame_projection 104 | self._stop_projection = stop_projection 105 | 106 | self._attention_layer_size = self._attention_mechanism.values.get_shape()[-1].value 107 | 108 | def _batch_size_checks(self, batch_size, error_message): 109 | return [check_ops.assert_equal(batch_size, 110 | self._attention_mechanism.batch_size, 111 | message=error_message)] 112 | 113 | @property 114 | def output_size(self): 115 | return self._frame_projection.shape 116 | 117 | @property 118 | def state_size(self): 119 | """The `state_size` property of `TacotronDecoderCell`. 120 | 121 | Returns: 122 | An `TacotronDecoderCell` tuple containing shapes used by this object. 
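Example (illustrative): the five constructor arguments are the entire decoder. Prenet, DecoderRNN, FrameProjection and StopProjection are assumed helper layers defined elsewhere; the attention mechanism is the LocationSensitiveAttention implemented in attention.py below.

decoder_cell = TacotronDecoderCell(
    prenet=Prenet(is_training, layers_sizes=[256, 256]),                           # assumed helper layer
    attention_mechanism=attention_mechanism,                                       # LocationSensitiveAttention instance
    rnn_cell=DecoderRNN(is_training, layers=2, size=1024),                         # assumed helper layer
    frame_projection=FrameProjection(hparams.num_mels * hparams.outputs_per_step),
    stop_projection=StopProjection(is_training, shape=hparams.outputs_per_step))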
123 | """ 124 | return TacotronDecoderCellState( 125 | cell_state=self._cell._cell.state_size, 126 | time=tensor_shape.TensorShape([]), 127 | attention=self._attention_layer_size, 128 | alignments=self._attention_mechanism.alignments_size, 129 | alignment_history=()) 130 | 131 | def zero_state(self, batch_size, dtype): 132 | """Return an initial (zero) state tuple for this `AttentionWrapper`. 133 | 134 | Args: 135 | batch_size: `0D` integer tensor: the batch size. 136 | dtype: The internal state data type. 137 | Returns: 138 | An `TacotronDecoderCellState` tuple containing zeroed out tensors and, 139 | possibly, empty `TensorArray` objects. 140 | Raises: 141 | ValueError: (or, possibly at runtime, InvalidArgument), if 142 | `batch_size` does not match the output size of the encoder passed 143 | to the wrapper object at initialization time. 144 | """ 145 | with ops.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]): 146 | cell_state = self._cell._cell.zero_state(batch_size, dtype) 147 | error_message = ( 148 | "When calling zero_state of TacotronDecoderCell %s: " % self._base_name + 149 | "Non-matching batch sizes between the memory " 150 | "(encoder output) and the requested batch size.") 151 | with ops.control_dependencies( 152 | self._batch_size_checks(batch_size, error_message)): 153 | cell_state = nest.map_structure( 154 | lambda s: array_ops.identity(s, name="checked_cell_state"), 155 | cell_state) 156 | return TacotronDecoderCellState( 157 | cell_state=cell_state, 158 | time=array_ops.zeros([], dtype=tf.int32), 159 | attention=_zero_state_tensors(self._attention_layer_size, batch_size, 160 | dtype), 161 | alignments=self._attention_mechanism.initial_alignments(batch_size, dtype), 162 | alignment_history=tensor_array_ops.TensorArray(dtype=dtype, size=0, 163 | dynamic_size=True)) 164 | 165 | def __call__(self, inputs, state): 166 | #Information bottleneck (essential for learning attention) 167 | prenet_output = self._prenet(inputs) 168 | 169 | #Concat context vector and prenet output to form LSTM cells input (input feeding) 170 | LSTM_input = tf.concat([prenet_output, state.attention], axis=-1) 171 | 172 | #Unidirectional LSTM layers 173 | LSTM_output, next_cell_state = self._cell(LSTM_input, state.cell_state) 174 | 175 | 176 | #Compute the attention (context) vector and alignments using 177 | #the new decoder cell hidden state as query vector 178 | #and cumulative alignments to extract location features 179 | #The choice of the new cell hidden state (s_{i}) of the last 180 | #decoder RNN Cell is based on Luong et Al. 
(2015): 181 | #https://arxiv.org/pdf/1508.04025.pdf 182 | previous_alignments = state.alignments 183 | previous_alignment_history = state.alignment_history 184 | context_vector, alignments, cumulated_alignments = _compute_attention(self._attention_mechanism, 185 | LSTM_output, 186 | previous_alignments, 187 | attention_layer=None) 188 | 189 | #Concat LSTM outputs and context vector to form projections inputs 190 | projections_input = tf.concat([LSTM_output, context_vector], axis=-1) 191 | 192 | #Compute predicted frames and predicted 193 | cell_outputs = self._frame_projection(projections_input) 194 | stop_tokens = self._stop_projection(projections_input) 195 | 196 | #Save alignment history 197 | alignment_history = previous_alignment_history.write(state.time, alignments) 198 | 199 | #Prepare next decoder state 200 | next_state = TacotronDecoderCellState( 201 | time=state.time + 1, 202 | cell_state=next_cell_state, 203 | attention=context_vector, 204 | alignments=cumulated_alignments, 205 | alignment_history=alignment_history) 206 | 207 | return (cell_outputs, stop_tokens), next_state 208 | -------------------------------------------------------------------------------- /synthesizer/models/attention.py: -------------------------------------------------------------------------------- 1 | """Attention file for location based attention (compatible with tensorflow attention wrapper)""" 2 | 3 | import tensorflow as tf 4 | from tensorflow.contrib.seq2seq.python.ops.attention_wrapper import BahdanauAttention 5 | from tensorflow.python.layers import core as layers_core 6 | from tensorflow.python.ops import array_ops, math_ops, nn_ops, variable_scope 7 | 8 | 9 | #From https://github.com/tensorflow/tensorflow/blob/r1.7/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py 10 | def _compute_attention(attention_mechanism, cell_output, attention_state, 11 | attention_layer): 12 | """Computes the attention and alignments for a given attention_mechanism.""" 13 | alignments, next_attention_state = attention_mechanism( 14 | cell_output, state=attention_state) 15 | 16 | # Reshape from [batch_size, memory_time] to [batch_size, 1, memory_time] 17 | expanded_alignments = array_ops.expand_dims(alignments, 1) 18 | # Context is the inner product of alignments and values along the 19 | # memory time dimension. 20 | # alignments shape is 21 | # [batch_size, 1, memory_time] 22 | # attention_mechanism.values shape is 23 | # [batch_size, memory_time, memory_size] 24 | # the batched matmul is over memory_time, so the output shape is 25 | # [batch_size, 1, memory_size]. 26 | # we then squeeze out the singleton dim. 27 | context = math_ops.matmul(expanded_alignments, attention_mechanism.values) 28 | context = array_ops.squeeze(context, [1]) 29 | 30 | if attention_layer is not None: 31 | attention = attention_layer(array_ops.concat([cell_output, context], 1)) 32 | else: 33 | attention = context 34 | 35 | return attention, alignments, next_attention_state 36 | 37 | 38 | def _location_sensitive_score(W_query, W_fil, W_keys): 39 | """Impelements Bahdanau-style (cumulative) scoring function. 40 | This attention is described in: 41 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben- 42 | gio, “Attention-based models for speech recognition,” in Ad- 43 | vances in Neural Information Processing Systems, 2015, pp. 44 | 577–585. 
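As a shape check, the hybrid score computed below can be mimicked in plain NumPy (sizes are illustrative):

import numpy as np

B, T, D = 2, 7, 128                     # batch, memory (encoder) time, attention depth
W_query = np.random.randn(B, 1, D)      # processed decoder query, broadcast over time
W_fil = np.random.randn(B, T, D)        # processed location features f = F * alpha_{i-1}
W_keys = np.random.randn(B, T, D)       # processed encoder outputs
v_a, b_a = np.random.randn(D), np.zeros(D)
energy = np.sum(v_a * np.tanh(W_keys + W_query + W_fil + b_a), axis=2)  # shape (B, T)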
45 | 46 | ############################################################################# 47 | hybrid attention (content-based + location-based) 48 | f = F * α_{i-1} 49 | energy = dot(v_a, tanh(W_keys(h_enc) + W_query(h_dec) + W_fil(f) + b_a)) 50 | ############################################################################# 51 | 52 | Args: 53 | W_query: Tensor, shape "[batch_size, 1, attention_dim]" to compare to location features. 54 | W_location: processed previous alignments into location features, shape "[batch_size, max_time, attention_dim]" 55 | W_keys: Tensor, shape "[batch_size, max_time, attention_dim]", typically the encoder outputs. 56 | Returns: 57 | A "[batch_size, max_time]" attention score (energy) 58 | """ 59 | # Get the number of hidden units from the trailing dimension of keys 60 | dtype = W_query.dtype 61 | num_units = W_keys.shape[-1].value or array_ops.shape(W_keys)[-1] 62 | 63 | v_a = tf.compat.v1.get_variable( 64 | "attention_variable_projection", shape=[num_units], dtype=dtype, 65 | initializer=tf.contrib.layers.xavier_initializer()) 66 | b_a = tf.compat.v1.get_variable( 67 | "attention_bias", shape=[num_units], dtype=dtype, 68 | initializer=tf.zeros_initializer()) 69 | 70 | return tf.reduce_sum(v_a * tf.tanh(W_keys + W_query + W_fil + b_a), [2]) 71 | 72 | def _smoothing_normalization(e): 73 | """Applies a smoothing normalization function instead of softmax 74 | Introduced in: 75 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben- 76 | gio, “Attention-based models for speech recognition,” in Ad- 77 | vances in Neural Information Processing Systems, 2015, pp. 78 | 577–585. 79 | 80 | ############################################################################ 81 | Smoothing normalization function 82 | a_{i, j} = sigmoid(e_{i, j}) / sum_j(sigmoid(e_{i, j})) 83 | ############################################################################ 84 | 85 | Args: 86 | e: matrix [batch_size, max_time(memory_time)]: expected to be energy (score) 87 | values of an attention mechanism 88 | Returns: 89 | matrix [batch_size, max_time]: [0, 1] normalized alignments with possible 90 | attendance to multiple memory time steps. 91 | """ 92 | return tf.nn.sigmoid(e) / tf.reduce_sum(tf.nn.sigmoid(e), axis=-1, keepdims=True) 93 | 94 | 95 | class LocationSensitiveAttention(BahdanauAttention): 96 | """Impelements Bahdanau-style (cumulative) scoring function. 97 | Usually referred to as "hybrid" attention (content-based + location-based) 98 | Extends the additive attention described in: 99 | "D. Bahdanau, K. Cho, and Y. Bengio, “Neural machine transla- 100 | tion by jointly learning to align and translate,” in Proceedings 101 | of ICLR, 2015." 102 | to use previous alignments as additional location features. 103 | 104 | This attention is described in: 105 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben- 106 | gio, “Attention-based models for speech recognition,” in Ad- 107 | vances in Neural Information Processing Systems, 2015, pp. 108 | 577–585. 109 | """ 110 | 111 | def __init__(self, 112 | num_units, 113 | memory, 114 | hparams, 115 | mask_encoder=True, 116 | memory_sequence_length=None, 117 | smoothing=False, 118 | cumulate_weights=True, 119 | name="LocationSensitiveAttention"): 120 | """Construct the Attention mechanism. 121 | Args: 122 | num_units: The depth of the query mechanism. 123 | memory: The memory to query; usually the output of an RNN encoder. This 124 | tensor should be shaped `[batch_size, max_time, ...]`. 
125 | mask_encoder (optional): Boolean, whether to mask encoder paddings. 126 | memory_sequence_length (optional): Sequence lengths for the batch entries 127 | in memory. If provided, the memory tensor rows are masked with zeros 128 | for values past the respective sequence lengths. Only relevant if mask_encoder = True. 129 | smoothing (optional): Boolean. Determines which normalization function to use. 130 | Default normalization function (probablity_fn) is softmax. If smoothing is 131 | enabled, we replace softmax with: 132 | a_{i, j} = sigmoid(e_{i, j}) / sum_j(sigmoid(e_{i, j})) 133 | Introduced in: 134 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben- 135 | gio, “Attention-based models for speech recognition,” in Ad- 136 | vances in Neural Information Processing Systems, 2015, pp. 137 | 577–585. 138 | This is mainly used if the model wants to attend to multiple input parts 139 | at the same decoding step. We probably won"t be using it since multiple sound 140 | frames may depend on the same character/phone, probably not the way around. 141 | Note: 142 | We still keep it implemented in case we want to test it. They used it in the 143 | paper in the context of speech recognition, where one phoneme may depend on 144 | multiple subsequent sound frames. 145 | name: Name to use when creating ops. 146 | """ 147 | #Create normalization function 148 | #Setting it to None defaults in using softmax 149 | normalization_function = _smoothing_normalization if (smoothing == True) else None 150 | memory_length = memory_sequence_length if (mask_encoder==True) else None 151 | super(LocationSensitiveAttention, self).__init__( 152 | num_units=num_units, 153 | memory=memory, 154 | memory_sequence_length=memory_length, 155 | probability_fn=normalization_function, 156 | name=name) 157 | 158 | self.location_convolution = tf.compat.v1.layers.Conv1D(filters=hparams.attention_filters, 159 | kernel_size=hparams.attention_kernel, padding="same", use_bias=True, 160 | bias_initializer=tf.zeros_initializer(), name="location_features_convolution") 161 | self.location_layer = tf.compat.v1.layers.Dense(units=num_units, use_bias=False, 162 | dtype=tf.float32, name="location_features_layer") 163 | self._cumulate = cumulate_weights 164 | 165 | def __call__(self, query, state): 166 | """Score the query based on the keys and values. 167 | Args: 168 | query: Tensor of dtype matching `self.values` and shape 169 | `[batch_size, query_depth]`. 170 | state (previous alignments): Tensor of dtype matching `self.values` and shape 171 | `[batch_size, alignments_size]` 172 | (`alignments_size` is memory"s `max_time`). 173 | Returns: 174 | alignments: Tensor of dtype matching `self.values` and shape 175 | `[batch_size, alignments_size]` (`alignments_size` is memory's 176 | `max_time`). 
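Example (illustrative): constructing the mechanism only requires the encoder memory and the hparams object carrying attention_filters / attention_kernel; encoder_outputs and input_lengths are assumed tensors produced by the encoder.

attention_mechanism = LocationSensitiveAttention(
    num_units=128,                          # depth of the query/location projections
    memory=encoder_outputs,                 # [batch_size, max_time, encoder_dim]
    hparams=hparams,                        # provides attention_filters and attention_kernel
    memory_sequence_length=input_lengths,   # masks encoder paddings (mask_encoder=True by default)
    smoothing=False,                        # keep the default softmax normalization
    cumulate_weights=True)                  # accumulate alignments across decoder steps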
177 | """ 178 | previous_alignments = state 179 | with variable_scope.variable_scope(None, "Location_Sensitive_Attention", [query]): 180 | 181 | # processed_query shape [batch_size, query_depth] -> [batch_size, attention_dim] 182 | processed_query = self.query_layer(query) if self.query_layer else query 183 | # -> [batch_size, 1, attention_dim] 184 | processed_query = tf.expand_dims(processed_query, 1) 185 | 186 | # processed_location_features shape [batch_size, max_time, attention dimension] 187 | # [batch_size, max_time] -> [batch_size, max_time, 1] 188 | expanded_alignments = tf.expand_dims(previous_alignments, axis=2) 189 | # location features [batch_size, max_time, filters] 190 | f = self.location_convolution(expanded_alignments) 191 | # Projected location features [batch_size, max_time, attention_dim] 192 | processed_location_features = self.location_layer(f) 193 | 194 | # energy shape [batch_size, max_time] 195 | energy = _location_sensitive_score(processed_query, processed_location_features, self.keys) 196 | 197 | 198 | # alignments shape = energy shape = [batch_size, max_time] 199 | alignments = self._probability_fn(energy, previous_alignments) 200 | 201 | # Cumulate alignments 202 | if self._cumulate: 203 | next_state = alignments + previous_alignments 204 | else: 205 | next_state = alignments 206 | 207 | return alignments, next_state 208 | -------------------------------------------------------------------------------- /synthesizer/models/custom_decoder.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | import collections 3 | import tensorflow as tf 4 | from synthesizer.models.helpers import TacoTestHelper, TacoTrainingHelper 5 | from tensorflow.contrib.seq2seq.python.ops import decoder 6 | from tensorflow.contrib.seq2seq.python.ops import helper as helper_py 7 | from tensorflow.python.framework import ops, tensor_shape 8 | from tensorflow.python.layers import base as layers_base 9 | from tensorflow.python.ops import rnn_cell_impl 10 | from tensorflow.python.util import nest 11 | 12 | 13 | class CustomDecoderOutput( 14 | collections.namedtuple("CustomDecoderOutput", ("rnn_output", "token_output", "sample_id"))): 15 | pass 16 | 17 | 18 | class CustomDecoder(decoder.Decoder): 19 | """Custom sampling decoder. 20 | 21 | Allows for stop token prediction at inference time 22 | and returns equivalent loss in training time. 23 | 24 | Note: 25 | Only use this decoder with Tacotron 2 as it only accepts tacotron custom helpers 26 | """ 27 | 28 | def __init__(self, cell, helper, initial_state, output_layer=None): 29 | """Initialize CustomDecoder. 30 | Args: 31 | cell: An `RNNCell` instance. 32 | helper: A `Helper` instance. 33 | initial_state: A (possibly nested tuple of...) tensors and TensorArrays. 34 | The initial state of the RNNCell. 35 | output_layer: (Optional) An instance of `tf.layers.Layer`, i.e., 36 | `tf.layers.Dense`. Optional layer to apply to the RNN output prior 37 | to storing the result or sampling. 38 | Raises: 39 | TypeError: if `cell`, `helper` or `output_layer` have an incorrect type. 
40 | """ 41 | rnn_cell_impl.assert_like_rnncell(type(cell), cell) 42 | if not isinstance(helper, helper_py.Helper): 43 | raise TypeError("helper must be a Helper, received: %s" % type(helper)) 44 | if (output_layer is not None 45 | and not isinstance(output_layer, layers_base.Layer)): 46 | raise TypeError( 47 | "output_layer must be a Layer, received: %s" % type(output_layer)) 48 | self._cell = cell 49 | self._helper = helper 50 | self._initial_state = initial_state 51 | self._output_layer = output_layer 52 | 53 | @property 54 | def batch_size(self): 55 | return self._helper.batch_size 56 | 57 | def _rnn_output_size(self): 58 | size = self._cell.output_size 59 | if self._output_layer is None: 60 | return size 61 | else: 62 | # To use layer"s compute_output_shape, we need to convert the 63 | # RNNCell"s output_size entries into shapes with an unknown 64 | # batch size. We then pass this through the layer"s 65 | # compute_output_shape and read off all but the first (batch) 66 | # dimensions to get the output size of the rnn with the layer 67 | # applied to the top. 68 | output_shape_with_unknown_batch = nest.map_structure( 69 | lambda s: tensor_shape.TensorShape([None]).concatenate(s), 70 | size) 71 | layer_output_shape = self._output_layer._compute_output_shape( # pylint: disable=protected-access 72 | output_shape_with_unknown_batch) 73 | return nest.map_structure(lambda s: s[1:], layer_output_shape) 74 | 75 | @property 76 | def output_size(self): 77 | # Return the cell output and the id 78 | return CustomDecoderOutput( 79 | rnn_output=self._rnn_output_size(), 80 | token_output=self._helper.token_output_size, 81 | sample_id=self._helper.sample_ids_shape) 82 | 83 | @property 84 | def output_dtype(self): 85 | # Assume the dtype of the cell is the output_size structure 86 | # containing the input_state"s first component's dtype. 87 | # Return that structure and the sample_ids_dtype from the helper. 88 | dtype = nest.flatten(self._initial_state)[0].dtype 89 | return CustomDecoderOutput( 90 | nest.map_structure(lambda _: dtype, self._rnn_output_size()), 91 | tf.float32, 92 | self._helper.sample_ids_dtype) 93 | 94 | def initialize(self, name=None): 95 | """Initialize the decoder. 96 | Args: 97 | name: Name scope for any created operations. 98 | Returns: 99 | `(finished, first_inputs, initial_state)`. 100 | """ 101 | return self._helper.initialize() + (self._initial_state,) 102 | 103 | def step(self, time, inputs, state, name=None): 104 | """Perform a custom decoding step. 105 | Enables for dyanmic prediction 106 | Args: 107 | time: scalar `int32` tensor. 108 | inputs: A (structure of) input tensors. 109 | state: A (structure of) state tensors and TensorArrays. 110 | name: Name scope for any created operations. 111 | Returns: 112 | `(outputs, next_state, next_inputs, finished)`. 
113 | """ 114 | with ops.name_scope(name, "CustomDecoderStep", (time, inputs, state)): 115 | #Call outputprojection wrapper cell 116 | (cell_outputs, stop_token), cell_state = self._cell(inputs, state) 117 | 118 | #apply output_layer (if existant) 119 | if self._output_layer is not None: 120 | cell_outputs = self._output_layer(cell_outputs) 121 | sample_ids = self._helper.sample( 122 | time=time, outputs=cell_outputs, state=cell_state) 123 | 124 | (finished, next_inputs, next_state) = self._helper.next_inputs( 125 | time=time, 126 | outputs=cell_outputs, 127 | state=cell_state, 128 | sample_ids=sample_ids, 129 | stop_token_prediction=stop_token) 130 | 131 | outputs = CustomDecoderOutput(cell_outputs, stop_token, sample_ids) 132 | return (outputs, next_state, next_inputs, finished) 133 | -------------------------------------------------------------------------------- /synthesizer/models/helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from tensorflow.contrib.seq2seq import Helper 4 | 5 | 6 | class TacoTestHelper(Helper): 7 | def __init__(self, batch_size, hparams): 8 | with tf.name_scope("TacoTestHelper"): 9 | self._batch_size = batch_size 10 | self._output_dim = hparams.num_mels 11 | self._reduction_factor = hparams.outputs_per_step 12 | self.stop_at_any = hparams.stop_at_any 13 | 14 | @property 15 | def batch_size(self): 16 | return self._batch_size 17 | 18 | @property 19 | def token_output_size(self): 20 | return self._reduction_factor 21 | 22 | @property 23 | def sample_ids_shape(self): 24 | return tf.TensorShape([]) 25 | 26 | @property 27 | def sample_ids_dtype(self): 28 | return np.int32 29 | 30 | def initialize(self, name=None): 31 | return (tf.tile([False], [self._batch_size]), _go_frames(self._batch_size, self._output_dim)) 32 | 33 | def sample(self, time, outputs, state, name=None): 34 | return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them 35 | 36 | def next_inputs(self, time, outputs, state, sample_ids, stop_token_prediction, name=None): 37 | """Stop on EOS. Otherwise, pass the last output as the next input and pass through state.""" 38 | with tf.name_scope("TacoTestHelper"): 39 | #A sequence is finished when the output probability is > 0.5 40 | finished = tf.cast(tf.round(stop_token_prediction), tf.bool) 41 | 42 | #Since we are predicting r frames at each step, two modes are 43 | #then possible: 44 | # Stop when the model outputs a p > 0.5 for any frame between r frames (Recommended) 45 | # Stop when the model outputs a p > 0.5 for all r frames (Safer) 46 | #Note: 47 | # With enough training steps, the model should be able to predict when to stop correctly 48 | # and the use of stop_at_any = True would be recommended. If however the model didn"t 49 | # learn to stop correctly yet, (stops too soon) one could choose to use the safer option 50 | # to get a correct synthesis 51 | if self.stop_at_any: 52 | finished = tf.reduce_any(tf.reduce_all(finished, axis=0)) #Recommended 53 | else: 54 | finished = tf.reduce_all(tf.reduce_all(finished, axis=0)) #Safer option 55 | 56 | # Feed last output frame as next input. 
outputs is [N, output_dim * r] 57 | next_inputs = outputs[:, -self._output_dim:] 58 | next_state = state 59 | return (finished, next_inputs, next_state) 60 | 61 | 62 | class TacoTrainingHelper(Helper): 63 | def __init__(self, batch_size, targets, hparams, gta, evaluating, global_step): 64 | # inputs is [N, T_in], targets is [N, T_out, D] 65 | with tf.name_scope("TacoTrainingHelper"): 66 | self._batch_size = batch_size 67 | self._output_dim = hparams.num_mels 68 | self._reduction_factor = hparams.outputs_per_step 69 | self._ratio = tf.convert_to_tensor(hparams.tacotron_teacher_forcing_ratio) 70 | self.gta = gta 71 | self.eval = evaluating 72 | self._hparams = hparams 73 | self.global_step = global_step 74 | 75 | r = self._reduction_factor 76 | # Feed every r-th target frame as input 77 | self._targets = targets[:, r-1::r, :] 78 | 79 | #Maximal sequence length 80 | self._lengths = tf.tile([tf.shape(self._targets)[1]], [self._batch_size]) 81 | 82 | @property 83 | def batch_size(self): 84 | return self._batch_size 85 | 86 | @property 87 | def token_output_size(self): 88 | return self._reduction_factor 89 | 90 | @property 91 | def sample_ids_shape(self): 92 | return tf.TensorShape([]) 93 | 94 | @property 95 | def sample_ids_dtype(self): 96 | return np.int32 97 | 98 | def initialize(self, name=None): 99 | #Compute teacher forcing ratio for this global step. 100 | #In GTA mode, override teacher forcing scheme to work with full teacher forcing 101 | if self.gta: 102 | self._ratio = tf.convert_to_tensor(1.) #Force GTA model to always feed ground-truth 103 | elif self.eval and self._hparams.natural_eval: 104 | self._ratio = tf.convert_to_tensor(0.) #Force eval model to always feed predictions 105 | else: 106 | if self._hparams.tacotron_teacher_forcing_mode == "scheduled": 107 | self._ratio = _teacher_forcing_ratio_decay(self._hparams.tacotron_teacher_forcing_init_ratio, 108 | self.global_step, self._hparams) 109 | 110 | return (tf.tile([False], [self._batch_size]), _go_frames(self._batch_size, self._output_dim)) 111 | 112 | def sample(self, time, outputs, state, name=None): 113 | return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them 114 | 115 | def next_inputs(self, time, outputs, state, sample_ids, stop_token_prediction, name=None): 116 | with tf.name_scope(name or "TacoTrainingHelper"): 117 | #synthesis stop (we let the model see paddings as we mask them when computing loss functions) 118 | finished = (time + 1 >= self._lengths) 119 | 120 | #Pick previous outputs randomly with respect to teacher forcing ratio 121 | next_inputs = tf.cond( 122 | tf.less(tf.random.uniform([], minval=0, maxval=1, dtype=tf.float32), self._ratio), 123 | lambda: self._targets[:, time, :], #Teacher-forcing: return true frame 124 | lambda: outputs[:,-self._output_dim:]) 125 | 126 | #Pass on state 127 | next_state = state 128 | return (finished, next_inputs, next_state) 129 | 130 | 131 | def _go_frames(batch_size, output_dim): 132 | """Returns all-zero frames for a given batch size and output dimension""" 133 | return tf.tile([[0.0]], [batch_size, output_dim]) 134 | 135 | def _teacher_forcing_ratio_decay(init_tfr, global_step, hparams): 136 | ################################################################# 137 | # Narrow Cosine Decay: 138 | 139 | # Phase 1: tfr = 1 140 | # We only start learning rate decay after 10k steps 141 | 142 | # Phase 2: tfr in ]0, 1[ 143 | # decay reach minimal value at step ~280k 144 | 145 | # Phase 3: tfr = 0 146 | # clip by minimal teacher forcing ratio value (step >~ 280k) 147 
| ################################################################# 148 | #Compute natural cosine decay 149 | tfr = tf.train.cosine_decay(init_tfr, 150 | global_step=global_step - hparams.tacotron_teacher_forcing_start_decay, #tfr = 1 at step 10k 151 | decay_steps=hparams.tacotron_teacher_forcing_decay_steps, #tfr = 0 at step ~280k 152 | alpha=hparams.tacotron_teacher_forcing_decay_alpha, #tfr = 0% of init_tfr as final value 153 | name="tfr_cosine_decay") 154 | 155 | #force teacher forcing ratio to take initial value when global step < start decay step. 156 | narrow_tfr = tf.cond( 157 | tf.less(global_step, tf.convert_to_tensor(hparams.tacotron_teacher_forcing_start_decay)), 158 | lambda: tf.convert_to_tensor(init_tfr), 159 | lambda: tfr) 160 | 161 | return narrow_tfr -------------------------------------------------------------------------------- /synthesizer/synthesize.py: -------------------------------------------------------------------------------- 1 | from synthesizer.tacotron2 import Tacotron2 2 | from synthesizer.hparams import hparams_debug_string 3 | from synthesizer.infolog import log 4 | import tensorflow as tf 5 | from tqdm import tqdm 6 | import time 7 | import os 8 | 9 | 10 | def run_eval(args, checkpoint_path, output_dir, hparams, sentences): 11 | eval_dir = os.path.join(output_dir, "eval") 12 | log_dir = os.path.join(output_dir, "logs-eval") 13 | 14 | #Create output path if it doesn"t exist 15 | os.makedirs(eval_dir, exist_ok=True) 16 | os.makedirs(log_dir, exist_ok=True) 17 | os.makedirs(os.path.join(log_dir, "wavs"), exist_ok=True) 18 | os.makedirs(os.path.join(log_dir, "plots"), exist_ok=True) 19 | 20 | log(hparams_debug_string()) 21 | synth = Tacotron2(checkpoint_path, hparams) 22 | 23 | #Set inputs batch wise 24 | sentences = [sentences[i: i+hparams.tacotron_synthesis_batch_size] for i 25 | in range(0, len(sentences), hparams.tacotron_synthesis_batch_size)] 26 | 27 | log("Starting Synthesis") 28 | with open(os.path.join(eval_dir, "map.txt"), "w") as file: 29 | for i, texts in enumerate(tqdm(sentences)): 30 | start = time.time() 31 | basenames = ["batch_{}_sentence_{}".format(i, j) for j in range(len(texts))] 32 | mel_filenames, speaker_ids = synth.synthesize(texts, basenames, eval_dir, log_dir, None) 33 | 34 | for elems in zip(texts, mel_filenames, speaker_ids): 35 | file.write("|".join([str(x) for x in elems]) + "\n") 36 | log("synthesized mel spectrograms at {}".format(eval_dir)) 37 | return eval_dir 38 | 39 | def run_synthesis(in_dir, out_dir, model_dir, hparams): 40 | synth_dir = os.path.join(out_dir, "mels_gta") 41 | os.makedirs(synth_dir, exist_ok=True) 42 | metadata_filename = os.path.join(in_dir, "train.txt") 43 | print(hparams_debug_string()) 44 | 45 | # Load the model in memory 46 | weights_dir = os.path.join(model_dir, "taco_pretrained") 47 | checkpoint_fpath = tf.train.get_checkpoint_state(weights_dir).model_checkpoint_path 48 | synth = Tacotron2(checkpoint_fpath, hparams, gta=True) 49 | 50 | # Load the metadata 51 | with open(metadata_filename, encoding="utf-8") as f: 52 | metadata = [line.strip().split("|") for line in f] 53 | frame_shift_ms = hparams.hop_size / hparams.sample_rate 54 | hours = sum([int(x[4]) for x in metadata]) * frame_shift_ms / 3600 55 | print("Loaded metadata for {} examples ({:.2f} hours)".format(len(metadata), hours)) 56 | 57 | #Set inputs batch wise 58 | metadata = [metadata[i: i + hparams.tacotron_synthesis_batch_size] for i in 59 | range(0, len(metadata), hparams.tacotron_synthesis_batch_size)] 60 | # TODO: come on big boy, fix 
this 61 | # Quick and dirty fix to make sure that all batches have the same size 62 | metadata = metadata[:-1] 63 | 64 | print("Starting Synthesis") 65 | mel_dir = os.path.join(in_dir, "mels") 66 | embed_dir = os.path.join(in_dir, "embeds") 67 | meta_out_fpath = os.path.join(out_dir, "synthesized.txt") 68 | with open(meta_out_fpath, "w") as file: 69 | for i, meta in enumerate(tqdm(metadata)): 70 | texts = [m[5] for m in meta] 71 | mel_filenames = [os.path.join(mel_dir, m[1]) for m in meta] 72 | embed_filenames = [os.path.join(embed_dir, m[2]) for m in meta] 73 | basenames = [os.path.basename(m).replace(".npy", "").replace("mel-", "") 74 | for m in mel_filenames] 75 | synth.synthesize(texts, basenames, synth_dir, None, mel_filenames, embed_filenames) 76 | 77 | for elems in meta: 78 | file.write("|".join([str(x) for x in elems]) + "\n") 79 | 80 | print("Synthesized mel spectrograms at {}".format(synth_dir)) 81 | return meta_out_fpath 82 | 83 | -------------------------------------------------------------------------------- /synthesizer/tacotron2.py: -------------------------------------------------------------------------------- 1 | from synthesizer.utils.text import text_to_sequence 2 | from synthesizer.infolog import log 3 | from synthesizer.models import create_model 4 | from synthesizer.utils import plot 5 | from synthesizer import audio 6 | import tensorflow as tf 7 | import numpy as np 8 | import os 9 | 10 | 11 | class Tacotron2: 12 | def __init__(self, checkpoint_path, hparams, gta=False, model_name="Tacotron", seed=None): 13 | log("Constructing model: %s" % model_name) 14 | 15 | # Initialize tensorflow random number seed for deterministic operation if provided 16 | if seed is not None: 17 | tf.compat.v1.set_random_seed(seed) 18 | 19 | #Force the batch size to be known in order to use attention masking in batch synthesis 20 | inputs = tf.compat.v1.placeholder(tf.int32, (None, None), name="inputs") 21 | input_lengths = tf.compat.v1.placeholder(tf.int32, (None,), name="input_lengths") 22 | speaker_embeddings = tf.compat.v1.placeholder(tf.float32, (None, hparams.speaker_embedding_size), 23 | name="speaker_embeddings") 24 | targets = tf.compat.v1.placeholder(tf.float32, (None, None, hparams.num_mels), name="mel_targets") 25 | split_infos = tf.compat.v1.placeholder(tf.int32, shape=(hparams.tacotron_num_gpus, None), name="split_infos") 26 | with tf.compat.v1.variable_scope("Tacotron_model") as scope: 27 | self.model = create_model(model_name, hparams) 28 | if gta: 29 | self.model.initialize(inputs, input_lengths, speaker_embeddings, targets, gta=gta, 30 | split_infos=split_infos) 31 | else: 32 | self.model.initialize(inputs, input_lengths, speaker_embeddings, 33 | split_infos=split_infos) 34 | 35 | self.mel_outputs = self.model.tower_mel_outputs 36 | self.linear_outputs = self.model.tower_linear_outputs if (hparams.predict_linear and not gta) else None 37 | self.alignments = self.model.tower_alignments 38 | self.stop_token_prediction = self.model.tower_stop_token_prediction 39 | self.targets = targets 40 | 41 | self.gta = gta 42 | self._hparams = hparams 43 | #pad input sequences with the 0 ( _ ) 44 | self._pad = 0 45 | #explicitely setting the padding to a value that doesn"t originally exist in the spectogram 46 | #to avoid any possible conflicts, without affecting the output range of the model too much 47 | if hparams.symmetric_mels: 48 | self._target_pad = -hparams.max_abs_value 49 | else: 50 | self._target_pad = 0. 
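# Feed layout for the placeholders created above, for a batch of N prompts (illustrative):
#   inputs             int32   [N, max_seq_len]                     padded character-ID sequences
#   input_lengths      int32   [N]                                  unpadded sequence lengths
#   speaker_embeddings float32 [N, hparams.speaker_embedding_size]  one embedding per prompt
#   mel_targets        float32 [N, max_target_len, num_mels]        only fed in GTA mode
#   split_infos        int32   [tacotron_num_gpus, 4]               per-GPU max lengths, as fed by my_synthesize/synthesize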
51 | 52 | self.inputs = inputs 53 | self.input_lengths = input_lengths 54 | self.speaker_embeddings = speaker_embeddings 55 | self.targets = targets 56 | self.split_infos = split_infos 57 | 58 | log("Loading checkpoint: %s" % checkpoint_path) 59 | #Memory allocation on the GPUs as needed 60 | config = tf.compat.v1.ConfigProto() 61 | config.gpu_options.allow_growth = True 62 | config.allow_soft_placement = True 63 | 64 | self.session = tf.compat.v1.Session(config=config) 65 | self.session.run(tf.compat.v1.global_variables_initializer()) 66 | 67 | saver = tf.compat.v1.train.Saver() 68 | saver.restore(self.session, checkpoint_path) 69 | 70 | def my_synthesize(self, speaker_embeds, texts): 71 | """ 72 | Lighter synthesis function that directly returns the mel spectrograms. 73 | """ 74 | 75 | # Prepare the input 76 | cleaner_names = [x.strip() for x in self._hparams.cleaners.split(",")] 77 | seqs = [np.asarray(text_to_sequence(text, cleaner_names)) for text in texts] 78 | input_lengths = [len(seq) for seq in seqs] 79 | input_seqs, max_seq_len = self._prepare_inputs(seqs) 80 | split_infos = [[max_seq_len, 0, 0, 0]] 81 | feed_dict = { 82 | self.inputs: input_seqs, 83 | self.input_lengths: np.asarray(input_lengths, dtype=np.int32), 84 | self.split_infos: np.asarray(split_infos, dtype=np.int32), 85 | self.speaker_embeddings: speaker_embeds 86 | } 87 | 88 | # Forward it 89 | mels, alignments, stop_tokens = self.session.run( 90 | [self.mel_outputs, self.alignments, self.stop_token_prediction], 91 | feed_dict=feed_dict) 92 | mels, alignments, stop_tokens = list(mels[0]), alignments[0], stop_tokens[0] 93 | 94 | # Trim the output 95 | for i in range(len(mels)): 96 | try: 97 | target_length = list(np.round(stop_tokens[i])).index(1) 98 | mels[i] = mels[i][:target_length, :] 99 | except ValueError: 100 | # If no token is generated, we simply do not trim the output 101 | continue 102 | 103 | return [mel.T for mel in mels], alignments 104 | 105 | def synthesize(self, texts, basenames, out_dir, log_dir, mel_filenames, embed_filenames): 106 | hparams = self._hparams 107 | cleaner_names = [x.strip() for x in hparams.cleaners.split(",")] 108 | 109 | assert 0 == len(texts) % self._hparams.tacotron_num_gpus 110 | seqs = [np.asarray(text_to_sequence(text, cleaner_names)) for text in texts] 111 | input_lengths = [len(seq) for seq in seqs] 112 | 113 | size_per_device = len(seqs) // self._hparams.tacotron_num_gpus 114 | 115 | #Pad inputs according to each GPU max length 116 | input_seqs = None 117 | split_infos = [] 118 | for i in range(self._hparams.tacotron_num_gpus): 119 | device_input = seqs[size_per_device*i: size_per_device*(i+1)] 120 | device_input, max_seq_len = self._prepare_inputs(device_input) 121 | input_seqs = np.concatenate((input_seqs, device_input), axis=1) if input_seqs is not None else device_input 122 | split_infos.append([max_seq_len, 0, 0, 0]) 123 | 124 | feed_dict = { 125 | self.inputs: input_seqs, 126 | self.input_lengths: np.asarray(input_lengths, dtype=np.int32), 127 | } 128 | 129 | if self.gta: 130 | np_targets = [np.load(mel_filename) for mel_filename in mel_filenames] 131 | target_lengths = [len(np_target) for np_target in np_targets] 132 | 133 | #pad targets according to each GPU max length 134 | target_seqs = None 135 | for i in range(self._hparams.tacotron_num_gpus): 136 | device_target = np_targets[size_per_device*i: size_per_device*(i+1)] 137 | device_target, max_target_len = self._prepare_targets(device_target, self._hparams.outputs_per_step) 138 | target_seqs = 
np.concatenate((target_seqs, device_target), axis=1) if target_seqs is not None else device_target 139 | split_infos[i][1] = max_target_len #Not really used but setting it in case for future development maybe? 140 | 141 | feed_dict[self.targets] = target_seqs 142 | assert len(np_targets) == len(texts) 143 | 144 | feed_dict[self.split_infos] = np.asarray(split_infos, dtype=np.int32) 145 | feed_dict[self.speaker_embeddings] = [np.load(f) for f in embed_filenames] 146 | 147 | if self.gta or not hparams.predict_linear: 148 | mels, alignments, stop_tokens = self.session.run( 149 | [self.mel_outputs, self.alignments, self.stop_token_prediction], 150 | feed_dict=feed_dict) 151 | #Linearize outputs (1D arrays) 152 | mels = [mel for gpu_mels in mels for mel in gpu_mels] 153 | alignments = [align for gpu_aligns in alignments for align in gpu_aligns] 154 | stop_tokens = [token for gpu_token in stop_tokens for token in gpu_token] 155 | 156 | if not self.gta: 157 | #Natural batch synthesis 158 | #Get Mel lengths for the entire batch from stop_tokens predictions 159 | target_lengths = self._get_output_lengths(stop_tokens) 160 | 161 | #Take off the batch wise padding 162 | mels = [mel[:target_length, :] for mel, target_length in zip(mels, target_lengths)] 163 | assert len(mels) == len(texts) 164 | 165 | else: 166 | linears, mels, alignments, stop_tokens = self.session.run( 167 | [self.linear_outputs, self.mel_outputs, self.alignments, 168 | self.stop_token_prediction], 169 | feed_dict=feed_dict) 170 | #Linearize outputs (1D arrays) 171 | linears = [linear for gpu_linear in linears for linear in gpu_linear] 172 | mels = [mel for gpu_mels in mels for mel in gpu_mels] 173 | alignments = [align for gpu_aligns in alignments for align in gpu_aligns] 174 | stop_tokens = [token for gpu_token in stop_tokens for token in gpu_token] 175 | 176 | #Natural batch synthesis 177 | #Get Mel/Linear lengths for the entire batch from stop_tokens predictions 178 | # target_lengths = self._get_output_lengths(stop_tokens) 179 | target_lengths = [9999] 180 | 181 | #Take off the batch wise padding 182 | mels = [mel[:target_length, :] for mel, target_length in zip(mels, target_lengths)] 183 | linears = [linear[:target_length, :] for linear, target_length in zip(linears, target_lengths)] 184 | assert len(mels) == len(linears) == len(texts) 185 | 186 | if basenames is None: 187 | raise NotImplemented() 188 | 189 | saved_mels_paths = [] 190 | for i, mel in enumerate(mels): 191 | # Write the spectrogram to disk 192 | # Note: outputs mel-spectrogram files and target ones have same names, just different folders 193 | mel_filename = os.path.join(out_dir, "mel-{}.npy".format(basenames[i])) 194 | np.save(mel_filename, mel, allow_pickle=False) 195 | saved_mels_paths.append(mel_filename) 196 | 197 | if log_dir is not None: 198 | #save wav (mel -> wav) 199 | wav = audio.inv_mel_spectrogram(mel.T, hparams) 200 | audio.save_wav(wav, os.path.join(log_dir, "wavs/wav-{}-mel.wav".format(basenames[i])), sr=hparams.sample_rate) 201 | 202 | #save alignments 203 | plot.plot_alignment(alignments[i], os.path.join(log_dir, "plots/alignment-{}.png".format(basenames[i])), 204 | title="{}".format(texts[i]), split_title=True, max_len=target_lengths[i]) 205 | 206 | #save mel spectrogram plot 207 | plot.plot_spectrogram(mel, os.path.join(log_dir, "plots/mel-{}.png".format(basenames[i])), 208 | title="{}".format(texts[i]), split_title=True) 209 | 210 | if hparams.predict_linear: 211 | #save wav (linear -> wav) 212 | wav = 
audio.inv_linear_spectrogram(linears[i].T, hparams) 213 | audio.save_wav(wav, os.path.join(log_dir, "wavs/wav-{}-linear.wav".format(basenames[i])), sr=hparams.sample_rate) 214 | 215 | #save linear spectrogram plot 216 | plot.plot_spectrogram(linears[i], os.path.join(log_dir, "plots/linear-{}.png".format(basenames[i])), 217 | title="{}".format(texts[i]), split_title=True, auto_aspect=True) 218 | 219 | return saved_mels_paths 220 | 221 | def _round_up(self, x, multiple): 222 | remainder = x % multiple 223 | return x if remainder == 0 else x + multiple - remainder 224 | 225 | def _prepare_inputs(self, inputs): 226 | max_len = max([len(x) for x in inputs]) 227 | return np.stack([self._pad_input(x, max_len) for x in inputs]), max_len 228 | 229 | def _pad_input(self, x, length): 230 | return np.pad(x, (0, length - x.shape[0]), mode="constant", constant_values=self._pad) 231 | 232 | def _prepare_targets(self, targets, alignment): 233 | max_len = max([len(t) for t in targets]) 234 | data_len = self._round_up(max_len, alignment) 235 | return np.stack([self._pad_target(t, data_len) for t in targets]), data_len 236 | 237 | def _pad_target(self, t, length): 238 | return np.pad(t, [(0, length - t.shape[0]), (0, 0)], mode="constant", constant_values=self._target_pad) 239 | 240 | def _get_output_lengths(self, stop_tokens): 241 | #Determine each mel length by the stop token predictions. (len = first occurence of 1 in stop_tokens row wise) 242 | output_lengths = [row.index(1) for row in np.round(stop_tokens).tolist()] 243 | return output_lengths 244 | -------------------------------------------------------------------------------- /synthesizer/utils/__init__.py: -------------------------------------------------------------------------------- 1 | class ValueWindow(): 2 | def __init__(self, window_size=100): 3 | self._window_size = window_size 4 | self._values = [] 5 | 6 | def append(self, x): 7 | self._values = self._values[-(self._window_size - 1):] + [x] 8 | 9 | @property 10 | def sum(self): 11 | return sum(self._values) 12 | 13 | @property 14 | def count(self): 15 | return len(self._values) 16 | 17 | @property 18 | def average(self): 19 | return self.sum / max(1, self.count) 20 | 21 | def reset(self): 22 | self._values = [] 23 | -------------------------------------------------------------------------------- /synthesizer/utils/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/synthesizer/utils/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /synthesizer/utils/__pycache__/cleaners.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/synthesizer/utils/__pycache__/cleaners.cpython-37.pyc -------------------------------------------------------------------------------- /synthesizer/utils/__pycache__/numbers.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/synthesizer/utils/__pycache__/numbers.cpython-37.pyc -------------------------------------------------------------------------------- /synthesizer/utils/__pycache__/plot.cpython-37.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/synthesizer/utils/__pycache__/plot.cpython-37.pyc -------------------------------------------------------------------------------- /synthesizer/utils/__pycache__/symbols.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/synthesizer/utils/__pycache__/symbols.cpython-37.pyc -------------------------------------------------------------------------------- /synthesizer/utils/__pycache__/text.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/synthesizer/utils/__pycache__/text.cpython-37.pyc -------------------------------------------------------------------------------- /synthesizer/utils/_cmudict.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | valid_symbols = [ 4 | "AA", "AA0", "AA1", "AA2", "AE", "AE0", "AE1", "AE2", "AH", "AH0", "AH1", "AH2", 5 | "AO", "AO0", "AO1", "AO2", "AW", "AW0", "AW1", "AW2", "AY", "AY0", "AY1", "AY2", 6 | "B", "CH", "D", "DH", "EH", "EH0", "EH1", "EH2", "ER", "ER0", "ER1", "ER2", "EY", 7 | "EY0", "EY1", "EY2", "F", "G", "HH", "IH", "IH0", "IH1", "IH2", "IY", "IY0", "IY1", 8 | "IY2", "JH", "K", "L", "M", "N", "NG", "OW", "OW0", "OW1", "OW2", "OY", "OY0", 9 | "OY1", "OY2", "P", "R", "S", "SH", "T", "TH", "UH", "UH0", "UH1", "UH2", "UW", 10 | "UW0", "UW1", "UW2", "V", "W", "Y", "Z", "ZH" 11 | ] 12 | 13 | _valid_symbol_set = set(valid_symbols) 14 | 15 | 16 | class CMUDict: 17 | """Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict""" 18 | def __init__(self, file_or_path, keep_ambiguous=True): 19 | if isinstance(file_or_path, str): 20 | with open(file_or_path, encoding="latin-1") as f: 21 | entries = _parse_cmudict(f) 22 | else: 23 | entries = _parse_cmudict(file_or_path) 24 | if not keep_ambiguous: 25 | entries = {word: pron for word, pron in entries.items() if len(pron) == 1} 26 | self._entries = entries 27 | 28 | 29 | def __len__(self): 30 | return len(self._entries) 31 | 32 | 33 | def lookup(self, word): 34 | """Returns list of ARPAbet pronunciations of the given word.""" 35 | return self._entries.get(word.upper()) 36 | 37 | 38 | 39 | _alt_re = re.compile(r"\([0-9]+\)") 40 | 41 | 42 | def _parse_cmudict(file): 43 | cmudict = {} 44 | for line in file: 45 | if len(line) and (line[0] >= "A" and line[0] <= "Z" or line[0] == "'"): 46 | parts = line.split(" ") 47 | word = re.sub(_alt_re, "", parts[0]) 48 | pronunciation = _get_pronunciation(parts[1]) 49 | if pronunciation: 50 | if word in cmudict: 51 | cmudict[word].append(pronunciation) 52 | else: 53 | cmudict[word] = [pronunciation] 54 | return cmudict 55 | 56 | 57 | def _get_pronunciation(s): 58 | parts = s.strip().split(" ") 59 | for part in parts: 60 | if part not in _valid_symbol_set: 61 | return None 62 | return " ".join(parts) 63 | -------------------------------------------------------------------------------- /synthesizer/utils/cleaners.py: -------------------------------------------------------------------------------- 1 | """ 2 | Cleaners are transformations that run over the input text at both training and eval time. 
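A concrete example of the default English pipeline defined below (the exact expansion may vary slightly with the installed unidecode/inflect versions):

from synthesizer.utils.cleaners import english_cleaners

print(english_cleaners("Dr. Müller bought 2 books for $3.50."))
# -> roughly: "doctor muller bought two books for three dollars, fifty cents."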
3 | 4 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" 5 | hyperparameter. Some cleaners are English-specific. You"ll typically want to use: 6 | 1. "english_cleaners" for English text 7 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using 8 | the Unidecode library (https://pypi.python.org/pypi/Unidecode) 9 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update 10 | the symbols in symbols.py to match your data). 11 | """ 12 | 13 | import re 14 | from unidecode import unidecode 15 | from .numbers import normalize_numbers 16 | 17 | # Regular expression matching whitespace: 18 | _whitespace_re = re.compile(r"\s+") 19 | 20 | # List of (regular expression, replacement) pairs for abbreviations: 21 | _abbreviations = [(re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) for x in [ 22 | ("mrs", "misess"), 23 | ("mr", "mister"), 24 | ("dr", "doctor"), 25 | ("st", "saint"), 26 | ("co", "company"), 27 | ("jr", "junior"), 28 | ("maj", "major"), 29 | ("gen", "general"), 30 | ("drs", "doctors"), 31 | ("rev", "reverend"), 32 | ("lt", "lieutenant"), 33 | ("hon", "honorable"), 34 | ("sgt", "sergeant"), 35 | ("capt", "captain"), 36 | ("esq", "esquire"), 37 | ("ltd", "limited"), 38 | ("col", "colonel"), 39 | ("ft", "fort"), 40 | ]] 41 | 42 | 43 | def expand_abbreviations(text): 44 | for regex, replacement in _abbreviations: 45 | text = re.sub(regex, replacement, text) 46 | return text 47 | 48 | 49 | def expand_numbers(text): 50 | return normalize_numbers(text) 51 | 52 | 53 | def lowercase(text): 54 | """lowercase input tokens.""" 55 | return text.lower() 56 | 57 | 58 | def collapse_whitespace(text): 59 | return re.sub(_whitespace_re, " ", text) 60 | 61 | 62 | def convert_to_ascii(text): 63 | return unidecode(text) 64 | 65 | 66 | def basic_cleaners(text): 67 | """Basic pipeline that lowercases and collapses whitespace without transliteration.""" 68 | text = lowercase(text) 69 | text = collapse_whitespace(text) 70 | return text 71 | 72 | 73 | def transliteration_cleaners(text): 74 | """Pipeline for non-English text that transliterates to ASCII.""" 75 | text = convert_to_ascii(text) 76 | text = lowercase(text) 77 | text = collapse_whitespace(text) 78 | return text 79 | 80 | 81 | def english_cleaners(text): 82 | """Pipeline for English text, including number and abbreviation expansion.""" 83 | text = convert_to_ascii(text) 84 | text = lowercase(text) 85 | text = expand_numbers(text) 86 | text = expand_abbreviations(text) 87 | text = collapse_whitespace(text) 88 | return text 89 | -------------------------------------------------------------------------------- /synthesizer/utils/numbers.py: -------------------------------------------------------------------------------- 1 | import re 2 | import inflect 3 | 4 | _inflect = inflect.engine() 5 | _comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])") 6 | _decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)") 7 | _pounds_re = re.compile(r"£([0-9\,]*[0-9]+)") 8 | _dollars_re = re.compile(r"\$([0-9\.\,]*[0-9]+)") 9 | _ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)") 10 | _number_re = re.compile(r"[0-9]+") 11 | 12 | 13 | def _remove_commas(m): 14 | return m.group(1).replace(",", "") 15 | 16 | 17 | def _expand_decimal_point(m): 18 | return m.group(1).replace(".", " point ") 19 | 20 | 21 | def _expand_dollars(m): 22 | match = m.group(1) 23 | parts = match.split(".") 24 | if len(parts) > 2: 25 | return match + " dollars" # Unexpected format 26 | 
dollars = int(parts[0]) if parts[0] else 0 27 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 28 | if dollars and cents: 29 | dollar_unit = "dollar" if dollars == 1 else "dollars" 30 | cent_unit = "cent" if cents == 1 else "cents" 31 | return "%s %s, %s %s" % (dollars, dollar_unit, cents, cent_unit) 32 | elif dollars: 33 | dollar_unit = "dollar" if dollars == 1 else "dollars" 34 | return "%s %s" % (dollars, dollar_unit) 35 | elif cents: 36 | cent_unit = "cent" if cents == 1 else "cents" 37 | return "%s %s" % (cents, cent_unit) 38 | else: 39 | return "zero dollars" 40 | 41 | 42 | def _expand_ordinal(m): 43 | return _inflect.number_to_words(m.group(0)) 44 | 45 | 46 | def _expand_number(m): 47 | num = int(m.group(0)) 48 | if num > 1000 and num < 3000: 49 | if num == 2000: 50 | return "two thousand" 51 | elif num > 2000 and num < 2010: 52 | return "two thousand " + _inflect.number_to_words(num % 100) 53 | elif num % 100 == 0: 54 | return _inflect.number_to_words(num // 100) + " hundred" 55 | else: 56 | return _inflect.number_to_words(num, andword="", zero="oh", group=2).replace(", ", " ") 57 | else: 58 | return _inflect.number_to_words(num, andword="") 59 | 60 | 61 | def normalize_numbers(text): 62 | text = re.sub(_comma_number_re, _remove_commas, text) 63 | text = re.sub(_pounds_re, r"\1 pounds", text) 64 | text = re.sub(_dollars_re, _expand_dollars, text) 65 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 66 | text = re.sub(_ordinal_re, _expand_ordinal, text) 67 | text = re.sub(_number_re, _expand_number, text) 68 | return text 69 | -------------------------------------------------------------------------------- /synthesizer/utils/plot.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | matplotlib.use("Agg") 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | 6 | 7 | def split_title_line(title_text, max_words=5): 8 | """ 9 | A function that splits any string based on specific character 10 | (returning it with the string), with maximum number of words on it 11 | """ 12 | seq = title_text.split() 13 | return "\n".join([" ".join(seq[i:i + max_words]) for i in range(0, len(seq), max_words)]) 14 | 15 | def plot_alignment(alignment, path, title=None, split_title=False, max_len=None): 16 | if max_len is not None: 17 | alignment = alignment[:, :max_len] 18 | 19 | fig = plt.figure(figsize=(8, 6)) 20 | ax = fig.add_subplot(111) 21 | 22 | im = ax.imshow( 23 | alignment, 24 | aspect="auto", 25 | origin="lower", 26 | interpolation="none") 27 | fig.colorbar(im, ax=ax) 28 | xlabel = "Decoder timestep" 29 | 30 | if split_title: 31 | title = split_title_line(title) 32 | 33 | plt.xlabel(xlabel) 34 | plt.title(title) 35 | plt.ylabel("Encoder timestep") 36 | plt.tight_layout() 37 | plt.savefig(path, format="png") 38 | plt.close() 39 | 40 | 41 | def plot_spectrogram(pred_spectrogram, path, title=None, split_title=False, target_spectrogram=None, max_len=None, auto_aspect=False): 42 | if max_len is not None: 43 | target_spectrogram = target_spectrogram[:max_len] 44 | pred_spectrogram = pred_spectrogram[:max_len] 45 | 46 | if split_title: 47 | title = split_title_line(title) 48 | 49 | fig = plt.figure(figsize=(10, 8)) 50 | # Set common labels 51 | fig.text(0.5, 0.18, title, horizontalalignment="center", fontsize=16) 52 | 53 | #target spectrogram subplot 54 | if target_spectrogram is not None: 55 | ax1 = fig.add_subplot(311) 56 | ax2 = fig.add_subplot(312) 57 | 58 | if auto_aspect: 59 | im = 
ax1.imshow(np.rot90(target_spectrogram), aspect="auto", interpolation="none") 60 | else: 61 | im = ax1.imshow(np.rot90(target_spectrogram), interpolation="none") 62 | ax1.set_title("Target Mel-Spectrogram") 63 | fig.colorbar(mappable=im, shrink=0.65, orientation="horizontal", ax=ax1) 64 | ax2.set_title("Predicted Mel-Spectrogram") 65 | else: 66 | ax2 = fig.add_subplot(211) 67 | 68 | if auto_aspect: 69 | im = ax2.imshow(np.rot90(pred_spectrogram), aspect="auto", interpolation="none") 70 | else: 71 | im = ax2.imshow(np.rot90(pred_spectrogram), interpolation="none") 72 | fig.colorbar(mappable=im, shrink=0.65, orientation="horizontal", ax=ax2) 73 | 74 | plt.tight_layout() 75 | plt.savefig(path, format="png") 76 | plt.close() 77 | -------------------------------------------------------------------------------- /synthesizer/utils/symbols.py: -------------------------------------------------------------------------------- 1 | """ 2 | Defines the set of symbols used in text input to the model. 3 | 4 | The default is a set of ASCII characters that works well for English or text that has been run 5 | through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. 6 | """ 7 | # from . import cmudict 8 | 9 | _pad = "_" 10 | _eos = "~" 11 | _characters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'\"(),-.:;? " 12 | 13 | # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): 14 | #_arpabet = ["@' + s for s in cmudict.valid_symbols] 15 | 16 | # Export all symbols: 17 | symbols = [_pad, _eos] + list(_characters) #+ _arpabet 18 | -------------------------------------------------------------------------------- /synthesizer/utils/text.py: -------------------------------------------------------------------------------- 1 | from .symbols import symbols 2 | from . import cleaners 3 | import re 4 | 5 | # Mappings from symbol to numeric ID and vice versa: 6 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 7 | _id_to_symbol = {i: s for i, s in enumerate(symbols)} 8 | 9 | # Regular expression matching text enclosed in curly braces: 10 | _curly_re = re.compile(r"(.*?)\{(.+?)\}(.*)") 11 | 12 | 13 | def text_to_sequence(text, cleaner_names): 14 | """Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 15 | 16 | The text can optionally have ARPAbet sequences enclosed in curly braces embedded 17 | in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street." 
18 | 19 | Args: 20 | text: string to convert to a sequence 21 | cleaner_names: names of the cleaner functions to run the text through 22 | 23 | Returns: 24 | List of integers corresponding to the symbols in the text 25 | """ 26 | sequence = [] 27 | 28 | # Check for curly braces and treat their contents as ARPAbet: 29 | while len(text): 30 | m = _curly_re.match(text) 31 | if not m: 32 | sequence += _symbols_to_sequence(_clean_text(text, cleaner_names)) 33 | break 34 | sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names)) 35 | sequence += _arpabet_to_sequence(m.group(2)) 36 | text = m.group(3) 37 | 38 | # Append EOS token 39 | sequence.append(_symbol_to_id["~"]) 40 | return sequence 41 | 42 | 43 | def sequence_to_text(sequence): 44 | """Converts a sequence of IDs back to a string""" 45 | result = "" 46 | for symbol_id in sequence: 47 | if symbol_id in _id_to_symbol: 48 | s = _id_to_symbol[symbol_id] 49 | # Enclose ARPAbet back in curly braces: 50 | if len(s) > 1 and s[0] == "@": 51 | s = "{%s}" % s[1:] 52 | result += s 53 | return result.replace("}{", " ") 54 | 55 | 56 | def _clean_text(text, cleaner_names): 57 | for name in cleaner_names: 58 | cleaner = getattr(cleaners, name) 59 | if not cleaner: 60 | raise Exception("Unknown cleaner: %s" % name) 61 | text = cleaner(text) 62 | return text 63 | 64 | 65 | def _symbols_to_sequence(symbols): 66 | return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)] 67 | 68 | 69 | def _arpabet_to_sequence(text): 70 | return _symbols_to_sequence(["@" + s for s in text.split()]) 71 | 72 | 73 | def _should_keep_symbol(s): 74 | return s in _symbol_to_id and s not in ("_", "~") 75 | -------------------------------------------------------------------------------- /vocoder/LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Original work Copyright (c) 2019 fatchord (https://github.com/fatchord) 4 | Modified work Copyright (c) 2019 Corentin Jemine (https://github.com/CorentinJ) 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 
23 | -------------------------------------------------------------------------------- /vocoder/__pycache__/audio.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/vocoder/__pycache__/audio.cpython-37.pyc -------------------------------------------------------------------------------- /vocoder/__pycache__/display.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/vocoder/__pycache__/display.cpython-37.pyc -------------------------------------------------------------------------------- /vocoder/__pycache__/distribution.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/vocoder/__pycache__/distribution.cpython-37.pyc -------------------------------------------------------------------------------- /vocoder/__pycache__/hparams.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/vocoder/__pycache__/hparams.cpython-37.pyc -------------------------------------------------------------------------------- /vocoder/__pycache__/inference.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/vocoder/__pycache__/inference.cpython-37.pyc -------------------------------------------------------------------------------- /vocoder/audio.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import librosa 4 | import vocoder.hparams as hp 5 | from scipy.signal import lfilter 6 | 7 | 8 | def label_2_float(x, bits) : 9 | return 2 * x / (2**bits - 1.) - 1. 10 | 11 | 12 | def float_2_label(x, bits) : 13 | assert abs(x).max() <= 1.0 14 | x = (x + 1.) 
* (2**bits - 1) / 2 15 | return x.clip(0, 2**bits - 1) 16 | 17 | 18 | def load_wav(path) : 19 | return librosa.load(str(path), sr=hp.sample_rate)[0] 20 | 21 | 22 | def save_wav(x, path) : 23 | librosa.output.write_wav(path, x.astype(np.float32), sr=hp.sample_rate) 24 | 25 | 26 | def split_signal(x) : 27 | unsigned = x + 2**15 28 | coarse = unsigned // 256 29 | fine = unsigned % 256 30 | return coarse, fine 31 | 32 | 33 | def combine_signal(coarse, fine) : 34 | return coarse * 256 + fine - 2**15 35 | 36 | 37 | def encode_16bits(x) : 38 | return np.clip(x * 2**15, -2**15, 2**15 - 1).astype(np.int16) 39 | 40 | 41 | mel_basis = None 42 | 43 | 44 | def linear_to_mel(spectrogram): 45 | global mel_basis 46 | if mel_basis is None: 47 | mel_basis = build_mel_basis() 48 | return np.dot(mel_basis, spectrogram) 49 | 50 | 51 | def build_mel_basis(): 52 | return librosa.filters.mel(hp.sample_rate, hp.n_fft, n_mels=hp.num_mels, fmin=hp.fmin) 53 | 54 | 55 | def normalize(S): 56 | return np.clip((S - hp.min_level_db) / -hp.min_level_db, 0, 1) 57 | 58 | 59 | def denormalize(S): 60 | return (np.clip(S, 0, 1) * -hp.min_level_db) + hp.min_level_db 61 | 62 | 63 | def amp_to_db(x): 64 | return 20 * np.log10(np.maximum(1e-5, x)) 65 | 66 | 67 | def db_to_amp(x): 68 | return np.power(10.0, x * 0.05) 69 | 70 | 71 | def spectrogram(y): 72 | D = stft(y) 73 | S = amp_to_db(np.abs(D)) - hp.ref_level_db 74 | return normalize(S) 75 | 76 | 77 | def melspectrogram(y): 78 | D = stft(y) 79 | S = amp_to_db(linear_to_mel(np.abs(D))) 80 | return normalize(S) 81 | 82 | 83 | def stft(y): 84 | return librosa.stft(y=y, n_fft=hp.n_fft, hop_length=hp.hop_length, win_length=hp.win_length) 85 | 86 | 87 | def pre_emphasis(x): 88 | return lfilter([1, -hp.preemphasis], [1], x) 89 | 90 | 91 | def de_emphasis(x): 92 | return lfilter([1], [1, -hp.preemphasis], x) 93 | 94 | 95 | def encode_mu_law(x, mu) : 96 | mu = mu - 1 97 | fx = np.sign(x) * np.log(1 + mu * np.abs(x)) / np.log(1 + mu) 98 | return np.floor((fx + 1) / 2 * mu + 0.5) 99 | 100 | 101 | def decode_mu_law(y, mu, from_labels=True) : 102 | if from_labels: 103 | y = label_2_float(y, math.log2(mu)) 104 | mu = mu - 1 105 | x = np.sign(y) / mu * ((1 + mu) ** np.abs(y) - 1) 106 | return x 107 | 108 | -------------------------------------------------------------------------------- /vocoder/display.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import time 3 | import numpy as np 4 | import sys 5 | 6 | 7 | def progbar(i, n, size=16): 8 | done = (i * size) // n 9 | bar = '' 10 | for i in range(size): 11 | bar += '█' if i <= done else '░' 12 | return bar 13 | 14 | 15 | def stream(message) : 16 | try: 17 | sys.stdout.write("\r{%s}" % message) 18 | except: 19 | #Remove non-ASCII characters from message 20 | message = ''.join(i for i in message if ord(i)<128) 21 | sys.stdout.write("\r{%s}" % message) 22 | 23 | 24 | def simple_table(item_tuples) : 25 | 26 | border_pattern = '+---------------------------------------' 27 | whitespace = ' ' 28 | 29 | headings, cells, = [], [] 30 | 31 | for item in item_tuples : 32 | 33 | heading, cell = str(item[0]), str(item[1]) 34 | 35 | pad_head = True if len(heading) < len(cell) else False 36 | 37 | pad = abs(len(heading) - len(cell)) 38 | pad = whitespace[:pad] 39 | 40 | pad_left = pad[:len(pad)//2] 41 | pad_right = pad[len(pad)//2:] 42 | 43 | if pad_head : 44 | heading = pad_left + heading + pad_right 45 | else : 46 | cell = pad_left + cell + pad_right 47 | 48 | headings += [heading] 
49 | cells += [cell] 50 | 51 | border, head, body = '', '', '' 52 | 53 | for i in range(len(item_tuples)) : 54 | 55 | temp_head = f'| {headings[i]} ' 56 | temp_body = f'| {cells[i]} ' 57 | 58 | border += border_pattern[:len(temp_head)] 59 | head += temp_head 60 | body += temp_body 61 | 62 | if i == len(item_tuples) - 1 : 63 | head += '|' 64 | body += '|' 65 | border += '+' 66 | 67 | print(border) 68 | print(head) 69 | print(border) 70 | print(body) 71 | print(border) 72 | print(' ') 73 | 74 | 75 | def time_since(started) : 76 | elapsed = time.time() - started 77 | m = int(elapsed // 60) 78 | s = int(elapsed % 60) 79 | if m >= 60 : 80 | h = int(m // 60) 81 | m = m % 60 82 | return f'{h}h {m}m {s}s' 83 | else : 84 | return f'{m}m {s}s' 85 | 86 | 87 | def save_attention(attn, path) : 88 | fig = plt.figure(figsize=(12, 6)) 89 | plt.imshow(attn.T, interpolation='nearest', aspect='auto') 90 | fig.savefig(f'{path}.png', bbox_inches='tight') 91 | plt.close(fig) 92 | 93 | 94 | def save_spectrogram(M, path, length=None) : 95 | M = np.flip(M, axis=0) 96 | if length : M = M[:, :length] 97 | fig = plt.figure(figsize=(12, 6)) 98 | plt.imshow(M, interpolation='nearest', aspect='auto') 99 | fig.savefig(f'{path}.png', bbox_inches='tight') 100 | plt.close(fig) 101 | 102 | 103 | def plot(array) : 104 | fig = plt.figure(figsize=(30, 5)) 105 | ax = fig.add_subplot(111) 106 | ax.xaxis.label.set_color('grey') 107 | ax.yaxis.label.set_color('grey') 108 | ax.xaxis.label.set_fontsize(23) 109 | ax.yaxis.label.set_fontsize(23) 110 | ax.tick_params(axis='x', colors='grey', labelsize=23) 111 | ax.tick_params(axis='y', colors='grey', labelsize=23) 112 | plt.plot(array) 113 | 114 | 115 | def plot_spec(M) : 116 | M = np.flip(M, axis=0) 117 | plt.figure(figsize=(18,4)) 118 | plt.imshow(M, interpolation='nearest', aspect='auto') 119 | plt.show() 120 | 121 | -------------------------------------------------------------------------------- /vocoder/distribution.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn.functional as F 4 | 5 | 6 | def log_sum_exp(x): 7 | """ numerically stable log_sum_exp implementation that prevents overflow """ 8 | # TF ordering 9 | axis = len(x.size()) - 1 10 | m, _ = torch.max(x, dim=axis) 11 | m2, _ = torch.max(x, dim=axis, keepdim=True) 12 | return m + torch.log(torch.sum(torch.exp(x - m2), dim=axis)) 13 | 14 | 15 | # It is adapted from https://github.com/r9y9/wavenet_vocoder/blob/master/wavenet_vocoder/mixture.py 16 | def discretized_mix_logistic_loss(y_hat, y, num_classes=65536, 17 | log_scale_min=None, reduce=True): 18 | if log_scale_min is None: 19 | log_scale_min = float(np.log(1e-14)) 20 | y_hat = y_hat.permute(0,2,1) 21 | assert y_hat.dim() == 3 22 | assert y_hat.size(1) % 3 == 0 23 | nr_mix = y_hat.size(1) // 3 24 | 25 | # (B x T x C) 26 | y_hat = y_hat.transpose(1, 2) 27 | 28 | # unpack parameters. (B, T, num_mixtures) x 3 29 | logit_probs = y_hat[:, :, :nr_mix] 30 | means = y_hat[:, :, nr_mix:2 * nr_mix] 31 | log_scales = torch.clamp(y_hat[:, :, 2 * nr_mix:3 * nr_mix], min=log_scale_min) 32 | 33 | # B x T x 1 -> B x T x num_mixtures 34 | y = y.expand_as(means) 35 | 36 | centered_y = y - means 37 | inv_stdv = torch.exp(-log_scales) 38 | plus_in = inv_stdv * (centered_y + 1. / (num_classes - 1)) 39 | cdf_plus = torch.sigmoid(plus_in) 40 | min_in = inv_stdv * (centered_y - 1. 
/ (num_classes - 1)) 41 | cdf_min = torch.sigmoid(min_in) 42 | 43 | # log probability for edge case of 0 (before scaling) 44 | # equivalent: torch.log(F.sigmoid(plus_in)) 45 | log_cdf_plus = plus_in - F.softplus(plus_in) 46 | 47 | # log probability for edge case of 255 (before scaling) 48 | # equivalent: (1 - F.sigmoid(min_in)).log() 49 | log_one_minus_cdf_min = -F.softplus(min_in) 50 | 51 | # probability for all other cases 52 | cdf_delta = cdf_plus - cdf_min 53 | 54 | mid_in = inv_stdv * centered_y 55 | # log probability in the center of the bin, to be used in extreme cases 56 | # (not actually used in our code) 57 | log_pdf_mid = mid_in - log_scales - 2. * F.softplus(mid_in) 58 | 59 | # tf equivalent 60 | """ 61 | log_probs = tf.where(x < -0.999, log_cdf_plus, 62 | tf.where(x > 0.999, log_one_minus_cdf_min, 63 | tf.where(cdf_delta > 1e-5, 64 | tf.log(tf.maximum(cdf_delta, 1e-12)), 65 | log_pdf_mid - np.log(127.5)))) 66 | """ 67 | # TODO: cdf_delta <= 1e-5 actually can happen. How can we choose the value 68 | # for num_classes=65536 case? 1e-7? not sure.. 69 | inner_inner_cond = (cdf_delta > 1e-5).float() 70 | 71 | inner_inner_out = inner_inner_cond * \ 72 | torch.log(torch.clamp(cdf_delta, min=1e-12)) + \ 73 | (1. - inner_inner_cond) * (log_pdf_mid - np.log((num_classes - 1) / 2)) 74 | inner_cond = (y > 0.999).float() 75 | inner_out = inner_cond * log_one_minus_cdf_min + (1. - inner_cond) * inner_inner_out 76 | cond = (y < -0.999).float() 77 | log_probs = cond * log_cdf_plus + (1. - cond) * inner_out 78 | 79 | log_probs = log_probs + F.log_softmax(logit_probs, -1) 80 | 81 | if reduce: 82 | return -torch.mean(log_sum_exp(log_probs)) 83 | else: 84 | return -log_sum_exp(log_probs).unsqueeze(-1) 85 | 86 | 87 | def sample_from_discretized_mix_logistic(y, log_scale_min=None): 88 | """ 89 | Sample from discretized mixture of logistic distributions 90 | Args: 91 | y (Tensor): B x C x T 92 | log_scale_min (float): Log scale minimum value 93 | Returns: 94 | Tensor: sample in range of [-1, 1]. 95 | """ 96 | if log_scale_min is None: 97 | log_scale_min = float(np.log(1e-14)) 98 | assert y.size(1) % 3 == 0 99 | nr_mix = y.size(1) // 3 100 | 101 | # B x T x C 102 | y = y.transpose(1, 2) 103 | logit_probs = y[:, :, :nr_mix] 104 | 105 | # sample mixture indicator from softmax 106 | temp = logit_probs.data.new(logit_probs.size()).uniform_(1e-5, 1.0 - 1e-5) 107 | temp = logit_probs.data - torch.log(- torch.log(temp)) 108 | _, argmax = temp.max(dim=-1) 109 | 110 | # (B, T) -> (B, T, nr_mix) 111 | one_hot = to_one_hot(argmax, nr_mix) 112 | # select logistic parameters 113 | means = torch.sum(y[:, :, nr_mix:2 * nr_mix] * one_hot, dim=-1) 114 | log_scales = torch.clamp(torch.sum( 115 | y[:, :, 2 * nr_mix:3 * nr_mix] * one_hot, dim=-1), min=log_scale_min) 116 | # sample from logistic & clip to interval 117 | # we don't actually round to the nearest 8bit value when sampling 118 | u = means.data.new(means.size()).uniform_(1e-5, 1.0 - 1e-5) 119 | x = means + torch.exp(log_scales) * (torch.log(u) - torch.log(1. - u)) 120 | 121 | x = torch.clamp(torch.clamp(x, min=-1.), max=1.) 
122 | 123 | return x 124 | 125 | 126 | def to_one_hot(tensor, n, fill_with=1.): 127 | # we perform one hot encore with respect to the last axis 128 | one_hot = torch.FloatTensor(tensor.size() + (n,)).zero_() 129 | if tensor.is_cuda: 130 | one_hot = one_hot.cuda() 131 | one_hot.scatter_(len(tensor.size()), tensor.unsqueeze(-1), fill_with) 132 | return one_hot 133 | -------------------------------------------------------------------------------- /vocoder/gen_wavernn.py: -------------------------------------------------------------------------------- 1 | from vocoder.models.fatchord_version import WaveRNN 2 | from vocoder.audio import * 3 | 4 | 5 | def gen_testset(model: WaveRNN, test_set, samples, batched, target, overlap, save_path): 6 | k = model.get_step() // 1000 7 | 8 | for i, (m, x) in enumerate(test_set, 1): 9 | if i > samples: 10 | break 11 | 12 | print('\n| Generating: %i/%i' % (i, samples)) 13 | 14 | x = x[0].numpy() 15 | 16 | bits = 16 if hp.voc_mode == 'MOL' else hp.bits 17 | 18 | if hp.mu_law and hp.voc_mode != 'MOL' : 19 | x = decode_mu_law(x, 2**bits, from_labels=True) 20 | else : 21 | x = label_2_float(x, bits) 22 | 23 | save_wav(x, save_path.joinpath("%dk_steps_%d_target.wav" % (k, i))) 24 | 25 | batch_str = "gen_batched_target%d_overlap%d" % (target, overlap) if batched else \ 26 | "gen_not_batched" 27 | save_str = save_path.joinpath("%dk_steps_%d_%s.wav" % (k, i, batch_str)) 28 | 29 | wav = model.generate(m, batched, target, overlap, hp.mu_law) 30 | save_wav(wav, save_str) 31 | 32 | -------------------------------------------------------------------------------- /vocoder/hparams.py: -------------------------------------------------------------------------------- 1 | from synthesizer.hparams import hparams as _syn_hp 2 | 3 | 4 | # Audio settings------------------------------------------------------------------------ 5 | # Match the values of the synthesizer 6 | sample_rate = _syn_hp.sample_rate 7 | n_fft = _syn_hp.n_fft 8 | num_mels = _syn_hp.num_mels 9 | hop_length = _syn_hp.hop_size 10 | win_length = _syn_hp.win_size 11 | fmin = _syn_hp.fmin 12 | min_level_db = _syn_hp.min_level_db 13 | ref_level_db = _syn_hp.ref_level_db 14 | mel_max_abs_value = _syn_hp.max_abs_value 15 | preemphasis = _syn_hp.preemphasis 16 | apply_preemphasis = _syn_hp.preemphasize 17 | 18 | bits = 9 # bit depth of signal 19 | mu_law = True # Recommended to suppress noise if using raw bits in hp.voc_mode 20 | # below 21 | 22 | 23 | # WAVERNN / VOCODER -------------------------------------------------------------------------------- 24 | voc_mode = 'RAW' # either 'RAW' (softmax on raw bits) or 'MOL' (sample from 25 | # mixture of logistics) 26 | voc_upsample_factors = (5, 5, 8) # NB - this needs to correctly factorise hop_length 27 | voc_rnn_dims = 512 28 | voc_fc_dims = 512 29 | voc_compute_dims = 128 30 | voc_res_out_dims = 128 31 | voc_res_blocks = 10 32 | 33 | # Training 34 | voc_batch_size = 100 35 | voc_lr = 1e-4 36 | voc_gen_at_checkpoint = 5 # number of samples to generate at each checkpoint 37 | voc_pad = 2 # this will pad the input so that the resnet can 'see' wider 38 | # than input length 39 | voc_seq_len = hop_length * 5 # must be a multiple of hop_length 40 | 41 | # Generating / Synthesizing 42 | voc_gen_batched = True # very fast (realtime+) single utterance batched generation 43 | voc_target = 8000 # target number of samples to be generated in each batch entry 44 | voc_overlap = 400 # number of samples for crossfading between batches 45 | 
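Note on the vocoder hparams above: they are tightly coupled to the synthesizer settings they import. Two constraints are worth making explicit: the product of voc_upsample_factors must equal hop_length (vocoder/train.py asserts exactly this before building WaveRNN), and voc_seq_len is defined as hop_length * 5 so every training sequence covers whole mel frames. A minimal sanity-check sketch, not a file from the repository, assuming it is run from the repository root so that vocoder.hparams (and hence synthesizer.hparams) imports cleanly:

import numpy as np
import vocoder.hparams as hp

# WaveRNN's upsampling network must expand one mel frame into exactly one hop
# of audio samples: prod(voc_upsample_factors) == hop_length. With the default
# (5, 5, 8) the product is 200, so the synthesizer's hop_size must also be 200.
assert np.cumprod(hp.voc_upsample_factors)[-1] == hp.hop_length

# voc_seq_len = hop_length * 5, so each training window spans whole frames.
assert hp.voc_seq_len % hp.hop_length == 0

print("hop_length:", hp.hop_length)
print("upsample factors:", hp.voc_upsample_factors)
print("sequence length:", hp.voc_seq_len)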
-------------------------------------------------------------------------------- /vocoder/inference.py: -------------------------------------------------------------------------------- 1 | from vocoder.models.fatchord_version import WaveRNN 2 | from vocoder import hparams as hp 3 | import torch 4 | 5 | 6 | _model = None # type: WaveRNN 7 | 8 | def load_model(weights_fpath, verbose=True): 9 | global _model, _device 10 | 11 | if verbose: 12 | print("Building Wave-RNN") 13 | _model = WaveRNN( 14 | rnn_dims=hp.voc_rnn_dims, 15 | fc_dims=hp.voc_fc_dims, 16 | bits=hp.bits, 17 | pad=hp.voc_pad, 18 | upsample_factors=hp.voc_upsample_factors, 19 | feat_dims=hp.num_mels, 20 | compute_dims=hp.voc_compute_dims, 21 | res_out_dims=hp.voc_res_out_dims, 22 | res_blocks=hp.voc_res_blocks, 23 | hop_length=hp.hop_length, 24 | sample_rate=hp.sample_rate, 25 | mode=hp.voc_mode 26 | ) 27 | 28 | if torch.cuda.is_available(): 29 | _model = _model.cuda() 30 | _device = torch.device('cuda') 31 | else: 32 | _device = torch.device('cpu') 33 | 34 | if verbose: 35 | print("Loading model weights at %s" % weights_fpath) 36 | checkpoint = torch.load(weights_fpath, _device) 37 | _model.load_state_dict(checkpoint['model_state']) 38 | _model.eval() 39 | 40 | 41 | def is_loaded(): 42 | return _model is not None 43 | 44 | 45 | def infer_waveform(mel, normalize=True, batched=True, target=8000, overlap=800, 46 | progress_callback=None): 47 | """ 48 | Infers the waveform of a mel spectrogram output by the synthesizer (the format must match 49 | that of the synthesizer!) 50 | 51 | :param normalize: 52 | :param batched: 53 | :param target: 54 | :param overlap: 55 | :return: 56 | """ 57 | if _model is None: 58 | raise Exception("Please load Wave-RNN in memory before using it") 59 | 60 | if normalize: 61 | mel = mel / hp.mel_max_abs_value 62 | mel = torch.from_numpy(mel[None, ...]) 63 | wav = _model.generate(mel, batched, target, overlap, hp.mu_law, progress_callback) 64 | return wav 65 | -------------------------------------------------------------------------------- /vocoder/models/__pycache__/fatchord_version.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/vocoder/models/__pycache__/fatchord_version.cpython-37.pyc -------------------------------------------------------------------------------- /vocoder/models/deepmind_version.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from utils.display import * 5 | from utils.dsp import * 6 | 7 | 8 | class WaveRNN(nn.Module) : 9 | def __init__(self, hidden_size=896, quantisation=256) : 10 | super(WaveRNN, self).__init__() 11 | 12 | self.hidden_size = hidden_size 13 | self.split_size = hidden_size // 2 14 | 15 | # The main matmul 16 | self.R = nn.Linear(self.hidden_size, 3 * self.hidden_size, bias=False) 17 | 18 | # Output fc layers 19 | self.O1 = nn.Linear(self.split_size, self.split_size) 20 | self.O2 = nn.Linear(self.split_size, quantisation) 21 | self.O3 = nn.Linear(self.split_size, self.split_size) 22 | self.O4 = nn.Linear(self.split_size, quantisation) 23 | 24 | # Input fc layers 25 | self.I_coarse = nn.Linear(2, 3 * self.split_size, bias=False) 26 | self.I_fine = nn.Linear(3, 3 * self.split_size, bias=False) 27 | 28 | # biases for the gates 29 | self.bias_u = nn.Parameter(torch.zeros(self.hidden_size)) 30 | 
self.bias_r = nn.Parameter(torch.zeros(self.hidden_size)) 31 | self.bias_e = nn.Parameter(torch.zeros(self.hidden_size)) 32 | 33 | # display num params 34 | self.num_params() 35 | 36 | 37 | def forward(self, prev_y, prev_hidden, current_coarse) : 38 | 39 | # Main matmul - the projection is split 3 ways 40 | R_hidden = self.R(prev_hidden) 41 | R_u, R_r, R_e, = torch.split(R_hidden, self.hidden_size, dim=1) 42 | 43 | # Project the prev input 44 | coarse_input_proj = self.I_coarse(prev_y) 45 | I_coarse_u, I_coarse_r, I_coarse_e = \ 46 | torch.split(coarse_input_proj, self.split_size, dim=1) 47 | 48 | # Project the prev input and current coarse sample 49 | fine_input = torch.cat([prev_y, current_coarse], dim=1) 50 | fine_input_proj = self.I_fine(fine_input) 51 | I_fine_u, I_fine_r, I_fine_e = \ 52 | torch.split(fine_input_proj, self.split_size, dim=1) 53 | 54 | # concatenate for the gates 55 | I_u = torch.cat([I_coarse_u, I_fine_u], dim=1) 56 | I_r = torch.cat([I_coarse_r, I_fine_r], dim=1) 57 | I_e = torch.cat([I_coarse_e, I_fine_e], dim=1) 58 | 59 | # Compute all gates for coarse and fine 60 | u = F.sigmoid(R_u + I_u + self.bias_u) 61 | r = F.sigmoid(R_r + I_r + self.bias_r) 62 | e = F.tanh(r * R_e + I_e + self.bias_e) 63 | hidden = u * prev_hidden + (1. - u) * e 64 | 65 | # Split the hidden state 66 | hidden_coarse, hidden_fine = torch.split(hidden, self.split_size, dim=1) 67 | 68 | # Compute outputs 69 | out_coarse = self.O2(F.relu(self.O1(hidden_coarse))) 70 | out_fine = self.O4(F.relu(self.O3(hidden_fine))) 71 | 72 | return out_coarse, out_fine, hidden 73 | 74 | 75 | def generate(self, seq_len): 76 | with torch.no_grad(): 77 | # First split up the biases for the gates 78 | b_coarse_u, b_fine_u = torch.split(self.bias_u, self.split_size) 79 | b_coarse_r, b_fine_r = torch.split(self.bias_r, self.split_size) 80 | b_coarse_e, b_fine_e = torch.split(self.bias_e, self.split_size) 81 | 82 | # Lists for the two output seqs 83 | c_outputs, f_outputs = [], [] 84 | 85 | # Some initial inputs 86 | out_coarse = torch.LongTensor([0]).cuda() 87 | out_fine = torch.LongTensor([0]).cuda() 88 | 89 | # We'll meed a hidden state 90 | hidden = self.init_hidden() 91 | 92 | # Need a clock for display 93 | start = time.time() 94 | 95 | # Loop for generation 96 | for i in range(seq_len) : 97 | 98 | # Split into two hidden states 99 | hidden_coarse, hidden_fine = \ 100 | torch.split(hidden, self.split_size, dim=1) 101 | 102 | # Scale and concat previous predictions 103 | out_coarse = out_coarse.unsqueeze(0).float() / 127.5 - 1. 104 | out_fine = out_fine.unsqueeze(0).float() / 127.5 - 1. 105 | prev_outputs = torch.cat([out_coarse, out_fine], dim=1) 106 | 107 | # Project input 108 | coarse_input_proj = self.I_coarse(prev_outputs) 109 | I_coarse_u, I_coarse_r, I_coarse_e = \ 110 | torch.split(coarse_input_proj, self.split_size, dim=1) 111 | 112 | # Project hidden state and split 6 ways 113 | R_hidden = self.R(hidden) 114 | R_coarse_u , R_fine_u, \ 115 | R_coarse_r, R_fine_r, \ 116 | R_coarse_e, R_fine_e = torch.split(R_hidden, self.split_size, dim=1) 117 | 118 | # Compute the coarse gates 119 | u = F.sigmoid(R_coarse_u + I_coarse_u + b_coarse_u) 120 | r = F.sigmoid(R_coarse_r + I_coarse_r + b_coarse_r) 121 | e = F.tanh(r * R_coarse_e + I_coarse_e + b_coarse_e) 122 | hidden_coarse = u * hidden_coarse + (1. 
- u) * e 123 | 124 | # Compute the coarse output 125 | out_coarse = self.O2(F.relu(self.O1(hidden_coarse))) 126 | posterior = F.softmax(out_coarse, dim=1) 127 | distrib = torch.distributions.Categorical(posterior) 128 | out_coarse = distrib.sample() 129 | c_outputs.append(out_coarse) 130 | 131 | # Project the [prev outputs and predicted coarse sample] 132 | coarse_pred = out_coarse.float() / 127.5 - 1. 133 | fine_input = torch.cat([prev_outputs, coarse_pred.unsqueeze(0)], dim=1) 134 | fine_input_proj = self.I_fine(fine_input) 135 | I_fine_u, I_fine_r, I_fine_e = \ 136 | torch.split(fine_input_proj, self.split_size, dim=1) 137 | 138 | # Compute the fine gates 139 | u = F.sigmoid(R_fine_u + I_fine_u + b_fine_u) 140 | r = F.sigmoid(R_fine_r + I_fine_r + b_fine_r) 141 | e = F.tanh(r * R_fine_e + I_fine_e + b_fine_e) 142 | hidden_fine = u * hidden_fine + (1. - u) * e 143 | 144 | # Compute the fine output 145 | out_fine = self.O4(F.relu(self.O3(hidden_fine))) 146 | posterior = F.softmax(out_fine, dim=1) 147 | distrib = torch.distributions.Categorical(posterior) 148 | out_fine = distrib.sample() 149 | f_outputs.append(out_fine) 150 | 151 | # Put the hidden state back together 152 | hidden = torch.cat([hidden_coarse, hidden_fine], dim=1) 153 | 154 | # Display progress 155 | speed = (i + 1) / (time.time() - start) 156 | stream('Gen: %i/%i -- Speed: %i', (i + 1, seq_len, speed)) 157 | 158 | coarse = torch.stack(c_outputs).squeeze(1).cpu().data.numpy() 159 | fine = torch.stack(f_outputs).squeeze(1).cpu().data.numpy() 160 | output = combine_signal(coarse, fine) 161 | 162 | return output, coarse, fine 163 | 164 | def init_hidden(self, batch_size=1) : 165 | return torch.zeros(batch_size, self.hidden_size).cuda() 166 | 167 | def num_params(self) : 168 | parameters = filter(lambda p: p.requires_grad, self.parameters()) 169 | parameters = sum([np.prod(p.size()) for p in parameters]) / 1_000_000 170 | print('Trainable Parameters: %.3f million' % parameters) -------------------------------------------------------------------------------- /vocoder/saved_models/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarootsio/tutorial-streamlit-demo/97a397949c43ece1109f2b8ea2c8757b541fa45b/vocoder/saved_models/.gitkeep -------------------------------------------------------------------------------- /vocoder/train.py: -------------------------------------------------------------------------------- 1 | from vocoder.models.fatchord_version import WaveRNN 2 | from vocoder.vocoder_dataset import VocoderDataset, collate_vocoder 3 | from vocoder.distribution import discretized_mix_logistic_loss 4 | from vocoder.display import stream, simple_table 5 | from vocoder.gen_wavernn import gen_testset 6 | from torch.utils.data import DataLoader 7 | from pathlib import Path 8 | from torch import optim 9 | import torch.nn.functional as F 10 | import vocoder.hparams as hp 11 | import numpy as np 12 | import time 13 | import torch 14 | 15 | 16 | def train(run_id: str, syn_dir: Path, voc_dir: Path, models_dir: Path, ground_truth: bool, 17 | save_every: int, backup_every: int, force_restart: bool): 18 | # Check to make sure the hop length is correctly factorised 19 | assert np.cumprod(hp.voc_upsample_factors)[-1] == hp.hop_length 20 | 21 | # Instantiate the model 22 | print("Initializing the model...") 23 | model = WaveRNN( 24 | rnn_dims=hp.voc_rnn_dims, 25 | fc_dims=hp.voc_fc_dims, 26 | bits=hp.bits, 27 | pad=hp.voc_pad, 28 | 
upsample_factors=hp.voc_upsample_factors, 29 | feat_dims=hp.num_mels, 30 | compute_dims=hp.voc_compute_dims, 31 | res_out_dims=hp.voc_res_out_dims, 32 | res_blocks=hp.voc_res_blocks, 33 | hop_length=hp.hop_length, 34 | sample_rate=hp.sample_rate, 35 | mode=hp.voc_mode 36 | ) 37 | 38 | if torch.cuda.is_available(): 39 | model = model.cuda() 40 | device = torch.device('cuda') 41 | else: 42 | device = torch.device('cpu') 43 | 44 | # Initialize the optimizer 45 | optimizer = optim.Adam(model.parameters()) 46 | for p in optimizer.param_groups: 47 | p["lr"] = hp.voc_lr 48 | loss_func = F.cross_entropy if model.mode == "RAW" else discretized_mix_logistic_loss 49 | 50 | # Load the weights 51 | model_dir = models_dir.joinpath(run_id) 52 | model_dir.mkdir(exist_ok=True) 53 | weights_fpath = model_dir.joinpath(run_id + ".pt") 54 | if force_restart or not weights_fpath.exists(): 55 | print("\nStarting the training of WaveRNN from scratch\n") 56 | model.save(weights_fpath, optimizer) 57 | else: 58 | print("\nLoading weights at %s" % weights_fpath) 59 | model.load(weights_fpath, optimizer) 60 | print("WaveRNN weights loaded from step %d" % model.step) 61 | 62 | # Initialize the dataset 63 | metadata_fpath = syn_dir.joinpath("train.txt") if ground_truth else \ 64 | voc_dir.joinpath("synthesized.txt") 65 | mel_dir = syn_dir.joinpath("mels") if ground_truth else voc_dir.joinpath("mels_gta") 66 | wav_dir = syn_dir.joinpath("audio") 67 | dataset = VocoderDataset(metadata_fpath, mel_dir, wav_dir) 68 | test_loader = DataLoader(dataset, 69 | batch_size=1, 70 | shuffle=True, 71 | pin_memory=True) 72 | 73 | # Begin the training 74 | simple_table([('Batch size', hp.voc_batch_size), 75 | ('LR', hp.voc_lr), 76 | ('Sequence Len', hp.voc_seq_len)]) 77 | 78 | for epoch in range(1, 350): 79 | data_loader = DataLoader(dataset, 80 | collate_fn=collate_vocoder, 81 | batch_size=hp.voc_batch_size, 82 | num_workers=2, 83 | shuffle=True, 84 | pin_memory=True) 85 | start = time.time() 86 | running_loss = 0. 
87 | 88 | for i, (x, y, m) in enumerate(data_loader, 1): 89 | if torch.cuda.is_available(): 90 | x, m, y = x.cuda(), m.cuda(), y.cuda() 91 | 92 | # Forward pass 93 | y_hat = model(x, m) 94 | if model.mode == 'RAW': 95 | y_hat = y_hat.transpose(1, 2).unsqueeze(-1) 96 | elif model.mode == 'MOL': 97 | y = y.float() 98 | y = y.unsqueeze(-1) 99 | 100 | # Backward pass 101 | loss = loss_func(y_hat, y) 102 | optimizer.zero_grad() 103 | loss.backward() 104 | optimizer.step() 105 | 106 | running_loss += loss.item() 107 | speed = i / (time.time() - start) 108 | avg_loss = running_loss / i 109 | 110 | step = model.get_step() 111 | k = step // 1000 112 | 113 | if backup_every != 0 and step % backup_every == 0 : 114 | model.checkpoint(model_dir, optimizer) 115 | 116 | if save_every != 0 and step % save_every == 0 : 117 | model.save(weights_fpath, optimizer) 118 | 119 | msg = f"| Epoch: {epoch} ({i}/{len(data_loader)}) | " \ 120 | f"Loss: {avg_loss:.4f} | {speed:.1f} " \ 121 | f"steps/s | Step: {k}k | " 122 | stream(msg) 123 | 124 | 125 | gen_testset(model, test_loader, hp.voc_gen_at_checkpoint, hp.voc_gen_batched, 126 | hp.voc_target, hp.voc_overlap, model_dir) 127 | print("") 128 | -------------------------------------------------------------------------------- /vocoder/vocoder_dataset.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import Dataset 2 | from pathlib import Path 3 | from vocoder import audio 4 | import vocoder.hparams as hp 5 | import numpy as np 6 | import torch 7 | 8 | 9 | class VocoderDataset(Dataset): 10 | def __init__(self, metadata_fpath: Path, mel_dir: Path, wav_dir: Path): 11 | print("Using inputs from:\n\t%s\n\t%s\n\t%s" % (metadata_fpath, mel_dir, wav_dir)) 12 | 13 | with metadata_fpath.open("r") as metadata_file: 14 | metadata = [line.split("|") for line in metadata_file] 15 | 16 | gta_fnames = [x[1] for x in metadata if int(x[4])] 17 | gta_fpaths = [mel_dir.joinpath(fname) for fname in gta_fnames] 18 | wav_fnames = [x[0] for x in metadata if int(x[4])] 19 | wav_fpaths = [wav_dir.joinpath(fname) for fname in wav_fnames] 20 | self.samples_fpaths = list(zip(gta_fpaths, wav_fpaths)) 21 | 22 | print("Found %d samples" % len(self.samples_fpaths)) 23 | 24 | def __getitem__(self, index): 25 | mel_path, wav_path = self.samples_fpaths[index] 26 | 27 | # Load the mel spectrogram and adjust its range to [-1, 1] 28 | mel = np.load(mel_path).T.astype(np.float32) / hp.mel_max_abs_value 29 | 30 | # Load the wav 31 | wav = np.load(wav_path) 32 | if hp.apply_preemphasis: 33 | wav = audio.pre_emphasis(wav) 34 | wav = np.clip(wav, -1, 1) 35 | 36 | # Fix for missing padding # TODO: settle on whether this is any useful 37 | r_pad = (len(wav) // hp.hop_length + 1) * hp.hop_length - len(wav) 38 | wav = np.pad(wav, (0, r_pad), mode='constant') 39 | assert len(wav) >= mel.shape[1] * hp.hop_length 40 | wav = wav[:mel.shape[1] * hp.hop_length] 41 | assert len(wav) % hp.hop_length == 0 42 | 43 | # Quantize the wav 44 | if hp.voc_mode == 'RAW': 45 | if hp.mu_law: 46 | quant = audio.encode_mu_law(wav, mu=2 ** hp.bits) 47 | else: 48 | quant = audio.float_2_label(wav, bits=hp.bits) 49 | elif hp.voc_mode == 'MOL': 50 | quant = audio.float_2_label(wav, bits=16) 51 | 52 | return mel.astype(np.float32), quant.astype(np.int64) 53 | 54 | def __len__(self): 55 | return len(self.samples_fpaths) 56 | 57 | 58 | def collate_vocoder(batch): 59 | mel_win = hp.voc_seq_len // hp.hop_length + 2 * hp.voc_pad 60 | max_offsets = [x[0].shape[-1] -2 - (mel_win + 2 * 
hp.voc_pad) for x in batch] 61 | mel_offsets = [np.random.randint(0, offset) for offset in max_offsets] 62 | sig_offsets = [(offset + hp.voc_pad) * hp.hop_length for offset in mel_offsets] 63 | 64 | mels = [x[0][:, mel_offsets[i]:mel_offsets[i] + mel_win] for i, x in enumerate(batch)] 65 | 66 | labels = [x[1][sig_offsets[i]:sig_offsets[i] + hp.voc_seq_len + 1] for i, x in enumerate(batch)] 67 | 68 | mels = np.stack(mels).astype(np.float32) 69 | labels = np.stack(labels).astype(np.int64) 70 | 71 | mels = torch.tensor(mels) 72 | labels = torch.tensor(labels).long() 73 | 74 | x = labels[:, :hp.voc_seq_len] 75 | y = labels[:, 1:] 76 | 77 | bits = 16 if hp.voc_mode == 'MOL' else hp.bits 78 | 79 | x = audio.label_2_float(x.float(), bits) 80 | 81 | if hp.voc_mode == 'MOL' : 82 | y = audio.label_2_float(y.float(), bits) 83 | 84 | return x, y, mels --------------------------------------------------------------------------------
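
For reference, a minimal usage sketch of VocoderDataset and collate_vocoder, wired together the same way vocoder/train.py does for ground-truth training. This is not part of the repository; the directory path is an assumption (point it at wherever the synthesizer preprocessing wrote train.txt, mels/ and audio/):

from pathlib import Path
from torch.utils.data import DataLoader

import vocoder.hparams as hp
from vocoder.vocoder_dataset import VocoderDataset, collate_vocoder

# Hypothetical location of the preprocessed synthesizer output.
syn_dir = Path("datasets/SV2TTS/synthesizer")

dataset = VocoderDataset(metadata_fpath=syn_dir.joinpath("train.txt"),
                         mel_dir=syn_dir.joinpath("mels"),
                         wav_dir=syn_dir.joinpath("audio"))

loader = DataLoader(dataset,
                    collate_fn=collate_vocoder,
                    batch_size=hp.voc_batch_size,
                    shuffle=True)

# x: (batch, voc_seq_len) float input samples, y: (batch, voc_seq_len) target
# samples shifted by one step, mels: (batch, num_mels, mel_win) conditioning slices.
x, y, mels = next(iter(loader))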