├── .gitattributes ├── .github └── ISSUE_TEMPLATE │ ├── any-issue.md │ └── bug-report.md ├── .gitignore ├── LICENSE.txt ├── README.md ├── demo_cli.py ├── demo_toolbox.py ├── demo_toolbox_collab.ipynb ├── encoder ├── __init__.py ├── audio.py ├── config.py ├── data_objects │ ├── __init__.py │ ├── random_cycler.py │ ├── speaker.py │ ├── speaker_batch.py │ ├── speaker_verification_dataset.py │ └── utterance.py ├── inference.py ├── model.py ├── params_data.py ├── params_model.py ├── preprocess.py ├── train.py └── visualizations.py ├── encoder_preprocess.py ├── encoder_train.py ├── requirements.txt ├── synthesizer ├── LICENSE.txt ├── __init__.py ├── audio.py ├── feeder.py ├── hparams.py ├── inference.py ├── infolog.py ├── models │ ├── __init__.py │ ├── architecture_wrappers.py │ ├── attention.py │ ├── custom_decoder.py │ ├── helpers.py │ ├── modules.py │ └── tacotron.py ├── preprocess.py ├── synthesize.py ├── tacotron2.py ├── train.py └── utils │ ├── __init__.py │ ├── _cmudict.py │ ├── cleaners.py │ ├── numbers.py │ ├── plot.py │ ├── symbols.py │ └── text.py ├── synthesizer_preprocess_audio.py ├── synthesizer_preprocess_embeds.py ├── synthesizer_train.py ├── toolbox ├── __init__.py ├── ui.py └── utterance.py ├── utils ├── __init__.py ├── argutils.py ├── logmmse.py └── profiler.py ├── vocoder ├── LICENSE.txt ├── audio.py ├── display.py ├── distribution.py ├── gen_wavernn.py ├── hparams.py ├── inference.py ├── models │ ├── deepmind_version.py │ └── fatchord_version.py ├── train.py └── vocoder_dataset.py ├── vocoder_preprocess.py └── vocoder_train.py /.gitattributes: -------------------------------------------------------------------------------- 1 | *.ipynb linguist-vendored 2 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/any-issue.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Any issue 3 | about: Any issue 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | I am not maintaining this repo anymore (I explain why in the readme). 11 | I keep issues open only because some old ones are useful. 12 | I will not assist you in any way. 13 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug-report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: '' 3 | about: Any issue 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | I am not maintaining this repo anymore (I explain why in the readme). 11 | I keep issues open only because some old ones are useful. 12 | I will not assist you in any way. 
13 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.aux 3 | *.log 4 | *.out 5 | *.synctex.gz 6 | *.suo 7 | *__pycache__ 8 | *.idea 9 | *.ipynb_checkpoints 10 | *.pickle 11 | *.npy 12 | *.blg 13 | *.bbl 14 | *.bcf 15 | *.toc 16 | *.wav 17 | *.sh 18 | encoder/saved_models/* 19 | synthesizer/saved_models/* 20 | vocoder/saved_models/* 21 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Modified & original work Copyright (c) 2019 Corentin Jemine (https://github.com/CorentinJ) 4 | Original work Copyright (c) 2018 Rayhane Mama (https://github.com/Rayhane-mamah) 5 | Original work Copyright (c) 2019 fatchord (https://github.com/fatchord) 6 | Original work Copyright (c) 2015 braindead (https://github.com/braindead) 7 | 8 | Permission is hereby granted, free of charge, to any person obtaining a copy 9 | of this software and associated documentation files (the "Software"), to deal 10 | in the Software without restriction, including without limitation the rights 11 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | copies of the Software, and to permit persons to whom the Software is 13 | furnished to do so, subject to the following conditions: 14 | 15 | The above copyright notice and this permission notice shall be included in all 16 | copies or substantial portions of the Software. 17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | SOFTWARE. 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Real-Time Voice Cloning 2 | This repository is an implementation of [Transfer Learning from Speaker Verification to 3 | Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf) (SV2TTS) with a vocoder that works in real-time. Feel free to check [my thesis](https://matheo.uliege.be/handle/2268.2/6801) if you're curious or if you're looking for info I haven't documented yet (don't hesitate to make an issue for that too). Mostly I would recommend giving a quick look to the figures beyond the introduction. 4 | 5 | SV2TTS is a three-stage deep learning framework that allows to create a numerical representation of a voice from a few seconds of audio, and to use it to condition a text-to-speech model trained to generalize to new voices. 
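The pieces fit together roughly as sketched below — a minimal, non-authoritative condensation of `demo_cli.py` (included further down in this dump), not a separate API. It assumes the pretrained models sit at that script's default paths; `reference.wav`, the example sentence and `cloned.wav` are placeholders.

```python
# Condensed sketch of the SV2TTS pipeline, following the calls made in demo_cli.py.
# Assumptions: pretrained models at demo_cli.py's default paths; "reference.wav" is a
# placeholder for any few-second recording of the target voice.
from pathlib import Path
import numpy as np
import librosa

from encoder import inference as encoder
from synthesizer.inference import Synthesizer
from vocoder import inference as vocoder

encoder.load_model(Path("encoder/saved_models/pretrained.pt"))
synthesizer = Synthesizer(Path("synthesizer/saved_models/logs-pretrained/taco_pretrained"),
                          low_mem=False)
vocoder.load_model(Path("vocoder/saved_models/pretrained/pretrained.pt"))

# Stage 1: speaker encoder - a few seconds of audio become a fixed-size speaker embedding
wav = encoder.preprocess_wav(Path("reference.wav"))
embed = encoder.embed_utterance(wav)

# Stage 2: synthesizer - text conditioned on the embedding becomes a mel spectrogram
specs = synthesizer.synthesize_spectrograms(["This is a test sentence."], [embed])

# Stage 3: vocoder - the mel spectrogram becomes a waveform in the cloned voice
generated_wav = vocoder.infer_waveform(specs[0])
librosa.output.write_wav("cloned.wav", generated_wav.astype(np.float32),
                         synthesizer.sample_rate)
```

`demo_cli.py` wraps these same calls with argument parsing, a CUDA check, small self-tests and audio playback.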
6 | 7 | **Video demonstration** (click the picture): 8 | 9 | [![Toolbox demo](https://i.imgur.com/8lFUlgz.png)](https://www.youtube.com/watch?v=-O_hYhToKoA) 10 | 11 | 12 | 13 | ### Papers implemented 14 | | URL | Designation | Title | Implementation source | 15 | | --- | ----------- | ----- | --------------------- | 16 | |[**1806.04558**](https://arxiv.org/pdf/1806.04558.pdf) | **SV2TTS** | **Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis** | This repo | 17 | |[1802.08435](https://arxiv.org/pdf/1802.08435.pdf) | WaveRNN (vocoder) | Efficient Neural Audio Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN) | 18 | |[1712.05884](https://arxiv.org/pdf/1712.05884.pdf) | Tacotron 2 (synthesizer) | Natural TTS Synthesis by Conditioning Wavenet on Mel Spectrogram Predictions | [Rayhane-mamah/Tacotron-2](https://github.com/Rayhane-mamah/Tacotron-2) 19 | |[1710.10467](https://arxiv.org/pdf/1710.10467.pdf) | GE2E (encoder)| Generalized End-To-End Loss for Speaker Verification | This repo | 20 | 21 | ## News 22 | **13/11/19**: I'm sorry that I can't maintain this repo as much as I wish I could. I'm working full time on improving voice cloning techniques and I don't have the time to share my improvements here. Plus this repo relies on a lot of old tensorflow code and it's hard to work with. If you're a researcher, then this repo might be of use to you. **If you just want to clone your voice**, do check our demo on [Resemble.AI](https://www.resemble.ai/) - it will give much better results than this repo and will not require a complex setup. 23 | 24 | **20/08/19:** I'm working on [resemblyzer](https://github.com/resemble-ai/Resemblyzer), an independent package for the voice encoder. You can use your trained encoder models from this repo with it. 25 | 26 | **06/07/19:** Need to run within a docker container on a remote server? See [here](https://sean.lane.sh/posts/2019/07/Running-the-Real-Time-Voice-Cloning-project-in-Docker/). 27 | 28 | **25/06/19:** Experimental support for low-memory GPUs (~2gb) added for the synthesizer. Pass `--low_mem` to `demo_cli.py` or `demo_toolbox.py` to enable it. It adds a big overhead, so it's not recommended if you have enough VRAM. 29 | 30 | 31 | ## Quick start 32 | ### Requirements 33 | You will need the following whether you plan to use the toolbox only or to retrain the models. 34 | 35 | **Python 3.7**. Python 3.6 might work too, but I wouldn't go lower because I make extensive use of pathlib. 36 | 37 | Run `pip install -r requirements.txt` to install the necessary packages. Additionally you will need [PyTorch](https://pytorch.org/get-started/locally/) (>=1.0.1). 38 | 39 | A GPU is mandatory, but you don't necessarily need a high tier GPU if you only want to use the toolbox. 40 | 41 | ### Pretrained models 42 | Download the latest [here](https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Pretrained-models). 43 | 44 | ### Preliminary 45 | Before you download any dataset, you can begin by testing your configuration with: 46 | 47 | `python demo_cli.py` 48 | 49 | If all tests pass, you're good to go. 50 | 51 | ### Datasets 52 | For playing with the toolbox alone, I only recommend downloading [`LibriSpeech/train-clean-100`](http://www.openslr.org/resources/12/train-clean-100.tar.gz). Extract the contents as `/LibriSpeech/train-clean-100` where `` is a directory of your choosing. Other datasets are supported in the toolbox, see [here](https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Training#datasets). 
You're free not to download any dataset, but then you will need your own data as audio files or you will have to record it with the toolbox. 53 | 54 | ### Toolbox 55 | You can then try the toolbox: 56 | 57 | `python demo_toolbox.py -d ` 58 | or 59 | `python demo_toolbox.py` 60 | 61 | depending on whether you downloaded any datasets. If you are running an X-server or if you have the error `Aborted (core dumped)`, see [this issue](https://github.com/CorentinJ/Real-Time-Voice-Cloning/issues/11#issuecomment-504733590). 62 | 63 | ## Contributions & Issues 64 | I'm working full-time as of June 2019. I don't have time to maintain this repo nor reply to issues. Sorry. 65 | -------------------------------------------------------------------------------- /demo_cli.py: -------------------------------------------------------------------------------- 1 | from encoder.params_model import model_embedding_size as speaker_embedding_size 2 | from utils.argutils import print_args 3 | from synthesizer.inference import Synthesizer 4 | from encoder import inference as encoder 5 | from vocoder import inference as vocoder 6 | from pathlib import Path 7 | import numpy as np 8 | import librosa 9 | import argparse 10 | import torch 11 | import sys 12 | 13 | 14 | if __name__ == '__main__': 15 | ## Info & args 16 | parser = argparse.ArgumentParser( 17 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 18 | ) 19 | parser.add_argument("-e", "--enc_model_fpath", type=Path, 20 | default="encoder/saved_models/pretrained.pt", 21 | help="Path to a saved encoder") 22 | parser.add_argument("-s", "--syn_model_dir", type=Path, 23 | default="synthesizer/saved_models/logs-pretrained/", 24 | help="Directory containing the synthesizer model") 25 | parser.add_argument("-v", "--voc_model_fpath", type=Path, 26 | default="vocoder/saved_models/pretrained/pretrained.pt", 27 | help="Path to a saved vocoder") 28 | parser.add_argument("--low_mem", action="store_true", help=\ 29 | "If True, the memory used by the synthesizer will be freed after each use. Adds large " 30 | "overhead but allows to save some GPU memory for lower-end GPUs.") 31 | parser.add_argument("--no_sound", action="store_true", help=\ 32 | "If True, audio won't be played.") 33 | args = parser.parse_args() 34 | print_args(args, parser) 35 | if not args.no_sound: 36 | import sounddevice as sd 37 | 38 | 39 | ## Print some environment information (for debugging purposes) 40 | print("Running a test of your configuration...\n") 41 | if not torch.cuda.is_available(): 42 | print("Your PyTorch installation is not configured to use CUDA. If you have a GPU ready " 43 | "for deep learning, ensure that the drivers are properly installed, and that your " 44 | "CUDA version matches your PyTorch installation. CPU-only inference is currently " 45 | "not supported.", file=sys.stderr) 46 | quit(-1) 47 | device_id = torch.cuda.current_device() 48 | gpu_properties = torch.cuda.get_device_properties(device_id) 49 | print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with " 50 | "%.1fGb total memory.\n" % 51 | (torch.cuda.device_count(), 52 | device_id, 53 | gpu_properties.name, 54 | gpu_properties.major, 55 | gpu_properties.minor, 56 | gpu_properties.total_memory / 1e9)) 57 | 58 | 59 | ## Load the models one by one. 
60 | print("Preparing the encoder, the synthesizer and the vocoder...") 61 | encoder.load_model(args.enc_model_fpath) 62 | synthesizer = Synthesizer(args.syn_model_dir.joinpath("taco_pretrained"), low_mem=args.low_mem) 63 | vocoder.load_model(args.voc_model_fpath) 64 | 65 | 66 | ## Run a test 67 | print("Testing your configuration with small inputs.") 68 | # Forward an audio waveform of zeroes that lasts 1 second. Notice how we can get the encoder's 69 | # sampling rate, which may differ. 70 | # If you're unfamiliar with digital audio, know that it is encoded as an array of floats 71 | # (or sometimes integers, but mostly floats in this projects) ranging from -1 to 1. 72 | # The sampling rate is the number of values (samples) recorded per second, it is set to 73 | # 16000 for the encoder. Creating an array of length will always correspond 74 | # to an audio of 1 second. 75 | print("\tTesting the encoder...") 76 | encoder.embed_utterance(np.zeros(encoder.sampling_rate)) 77 | 78 | # Create a dummy embedding. You would normally use the embedding that encoder.embed_utterance 79 | # returns, but here we're going to make one ourselves just for the sake of showing that it's 80 | # possible. 81 | embed = np.random.rand(speaker_embedding_size) 82 | # Embeddings are L2-normalized (this isn't important here, but if you want to make your own 83 | # embeddings it will be). 84 | embed /= np.linalg.norm(embed) 85 | # The synthesizer can handle multiple inputs with batching. Let's create another embedding to 86 | # illustrate that 87 | embeds = [embed, np.zeros(speaker_embedding_size)] 88 | texts = ["test 1", "test 2"] 89 | print("\tTesting the synthesizer... (loading the model will output a lot of text)") 90 | mels = synthesizer.synthesize_spectrograms(texts, embeds) 91 | 92 | # The vocoder synthesizes one waveform at a time, but it's more efficient for long ones. We 93 | # can concatenate the mel spectrograms to a single one. 94 | mel = np.concatenate(mels, axis=1) 95 | # The vocoder can take a callback function to display the generation. More on that later. For 96 | # now we'll simply hide it like this: 97 | no_action = lambda *args: None 98 | print("\tTesting the vocoder...") 99 | # For the sake of making this test short, we'll pass a short target length. The target length 100 | # is the length of the wav segments that are processed in parallel. E.g. for audio sampled 101 | # at 16000 Hertz, a target length of 8000 means that the target audio will be cut in chunks of 102 | # 0.5 seconds which will all be generated together. The parameters here are absurdly short, and 103 | # that has a detrimental effect on the quality of the audio. The default parameters are 104 | # recommended in general. 105 | vocoder.infer_waveform(mel, target=200, overlap=50, progress_callback=no_action) 106 | 107 | print("All test passed! You can now synthesize speech.\n\n") 108 | 109 | 110 | ## Interactive speech generation 111 | print("This is a GUI-less example of interface to SV2TTS. The purpose of this script is to " 112 | "show how you can interface this project easily with your own. 
See the source code for " 113 | "an explanation of what is happening.\n") 114 | 115 | print("Interactive generation loop") 116 | num_generated = 0 117 | while True: 118 | try: 119 | # Get the reference audio filepath 120 | message = "Reference voice: enter an audio filepath of a voice to be cloned (mp3, " \ 121 | "wav, m4a, flac, ...):\n" 122 | in_fpath = Path(input(message).replace("\"", "").replace("\'", "")) 123 | 124 | 125 | ## Computing the embedding 126 | # First, we load the wav using the function that the speaker encoder provides. This is 127 | # important: there is preprocessing that must be applied. 128 | 129 | # The following two methods are equivalent: 130 | # - Directly load from the filepath: 131 | preprocessed_wav = encoder.preprocess_wav(in_fpath) 132 | # - If the wav is already loaded: 133 | original_wav, sampling_rate = librosa.load(in_fpath) 134 | preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate) 135 | print("Loaded file succesfully") 136 | 137 | # Then we derive the embedding. There are many functions and parameters that the 138 | # speaker encoder interfaces. These are mostly for in-depth research. You will typically 139 | # only use this function (with its default parameters): 140 | embed = encoder.embed_utterance(preprocessed_wav) 141 | print("Created the embedding") 142 | 143 | 144 | ## Generating the spectrogram 145 | text = input("Write a sentence (+-20 words) to be synthesized:\n") 146 | 147 | # The synthesizer works in batch, so you need to put your data in a list or numpy array 148 | texts = [text] 149 | embeds = [embed] 150 | # If you know what the attention layer alignments are, you can retrieve them here by 151 | # passing return_alignments=True 152 | specs = synthesizer.synthesize_spectrograms(texts, embeds) 153 | spec = specs[0] 154 | print("Created the mel spectrogram") 155 | 156 | 157 | ## Generating the waveform 158 | print("Synthesizing the waveform:") 159 | # Synthesizing the waveform is fairly straightforward. Remember that the longer the 160 | # spectrogram, the more time-efficient the vocoder. 161 | generated_wav = vocoder.infer_waveform(spec) 162 | 163 | 164 | ## Post-generation 165 | # There's a bug with sounddevice that makes the audio cut one second earlier, so we 166 | # pad it. 167 | generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant") 168 | 169 | # Play the audio (non-blocking) 170 | if not args.no_sound: 171 | sd.stop() 172 | sd.play(generated_wav, synthesizer.sample_rate) 173 | 174 | # Save it on the disk 175 | fpath = "demo_output_%02d.wav" % num_generated 176 | print(generated_wav.dtype) 177 | librosa.output.write_wav(fpath, generated_wav.astype(np.float32), 178 | synthesizer.sample_rate) 179 | num_generated += 1 180 | print("\nSaved output as %s\n\n" % fpath) 181 | 182 | 183 | except Exception as e: 184 | print("Caught exception: %s" % repr(e)) 185 | print("Restarting\n") 186 | -------------------------------------------------------------------------------- /demo_toolbox.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from toolbox import Toolbox 3 | from utils.argutils import print_args 4 | import argparse 5 | 6 | 7 | if __name__ == '__main__': 8 | parser = argparse.ArgumentParser( 9 | description="Runs the toolbox", 10 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 11 | ) 12 | 13 | parser.add_argument("-d", "--datasets_root", type=Path, help= \ 14 | "Path to the directory containing your datasets. 
See toolbox/__init__.py for a list of " 15 | "supported datasets. You can add your own data by created a directory named UserAudio " 16 | "in your datasets root. Supported formats are mp3, flac, wav and m4a. Each speaker should " 17 | "be inside a directory, e.g. /UserAudio/speaker_01/audio_01.wav.", 18 | default=None) 19 | parser.add_argument("-e", "--enc_models_dir", type=Path, default="encoder/saved_models", 20 | help="Directory containing saved encoder models") 21 | parser.add_argument("-s", "--syn_models_dir", type=Path, default="synthesizer/saved_models", 22 | help="Directory containing saved synthesizer models") 23 | parser.add_argument("-v", "--voc_models_dir", type=Path, default="vocoder/saved_models", 24 | help="Directory containing saved vocoder models") 25 | parser.add_argument("--low_mem", action="store_true", help=\ 26 | "If True, the memory used by the synthesizer will be freed after each use. Adds large " 27 | "overhead but allows to save some GPU memory for lower-end GPUs.") 28 | args = parser.parse_args() 29 | 30 | # Launch the toolbox 31 | print_args(args, parser) 32 | Toolbox(**vars(args)) 33 | -------------------------------------------------------------------------------- /encoder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iwater/Real-Time-Voice-Cloning-Chinese/06882b9a83247beda1d4d84baca0400457096d1b/encoder/__init__.py -------------------------------------------------------------------------------- /encoder/audio.py: -------------------------------------------------------------------------------- 1 | from scipy.ndimage.morphology import binary_dilation 2 | from encoder.params_data import * 3 | from pathlib import Path 4 | from typing import Optional, Union 5 | import numpy as np 6 | import webrtcvad 7 | import librosa 8 | import struct 9 | 10 | int16_max = (2 ** 15) - 1 11 | 12 | 13 | def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray], 14 | source_sr: Optional[int] = None): 15 | """ 16 | Applies the preprocessing operations used in training the Speaker Encoder to a waveform 17 | either on disk or in memory. The waveform will be resampled to match the data hyperparameters. 18 | 19 | :param fpath_or_wav: either a filepath to an audio file (many extensions are supported, not 20 | just .wav), either the waveform as a numpy array of floats. 21 | :param source_sr: if passing an audio waveform, the sampling rate of the waveform before 22 | preprocessing. After preprocessing, the waveform's sampling rate will match the data 23 | hyperparameters. If passing a filepath, the sampling rate will be automatically detected and 24 | this argument will be ignored. 25 | """ 26 | # Load the wav from disk if needed 27 | if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path): 28 | wav, source_sr = librosa.load(fpath_or_wav, sr=None) 29 | else: 30 | wav = fpath_or_wav 31 | 32 | # Resample the wav if needed 33 | if source_sr is not None and source_sr != sampling_rate: 34 | wav = librosa.resample(wav, source_sr, sampling_rate) 35 | 36 | # Apply the preprocessing: normalize volume and shorten long silences 37 | wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True) 38 | wav = trim_long_silences(wav) 39 | 40 | return wav 41 | 42 | 43 | def wav_to_mel_spectrogram(wav): 44 | """ 45 | Derives a mel spectrogram ready to be used by the encoder from a preprocessed audio waveform. 46 | Note: this not a log-mel spectrogram. 
47 | """ 48 | frames = librosa.feature.melspectrogram( 49 | wav, 50 | sampling_rate, 51 | n_fft=int(sampling_rate * mel_window_length / 1000), 52 | hop_length=int(sampling_rate * mel_window_step / 1000), 53 | n_mels=mel_n_channels 54 | ) 55 | return frames.astype(np.float32).T 56 | 57 | 58 | def trim_long_silences(wav): 59 | """ 60 | Ensures that segments without voice in the waveform remain no longer than a 61 | threshold determined by the VAD parameters in params.py. 62 | 63 | :param wav: the raw waveform as a numpy array of floats 64 | :return: the same waveform with silences trimmed away (length <= original wav length) 65 | """ 66 | # Compute the voice detection window size 67 | samples_per_window = (vad_window_length * sampling_rate) // 1000 68 | 69 | # Trim the end of the audio to have a multiple of the window size 70 | wav = wav[:len(wav) - (len(wav) % samples_per_window)] 71 | 72 | # Convert the float waveform to 16-bit mono PCM 73 | pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16)) 74 | 75 | # Perform voice activation detection 76 | voice_flags = [] 77 | vad = webrtcvad.Vad(mode=3) 78 | for window_start in range(0, len(wav), samples_per_window): 79 | window_end = window_start + samples_per_window 80 | voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2], 81 | sample_rate=sampling_rate)) 82 | voice_flags = np.array(voice_flags) 83 | 84 | # Smooth the voice detection with a moving average 85 | def moving_average(array, width): 86 | array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2))) 87 | ret = np.cumsum(array_padded, dtype=float) 88 | ret[width:] = ret[width:] - ret[:-width] 89 | return ret[width - 1:] / width 90 | 91 | audio_mask = moving_average(voice_flags, vad_moving_average_width) 92 | audio_mask = np.round(audio_mask).astype(np.bool) 93 | 94 | # Dilate the voiced regions 95 | audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1)) 96 | audio_mask = np.repeat(audio_mask, samples_per_window) 97 | 98 | return wav[audio_mask == True] 99 | 100 | 101 | def normalize_volume(wav, target_dBFS, increase_only=False, decrease_only=False): 102 | if increase_only and decrease_only: 103 | raise ValueError("Both increase only and decrease only are set") 104 | dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav ** 2)) 105 | if (dBFS_change < 0 and increase_only) or (dBFS_change > 0 and decrease_only): 106 | return wav 107 | return wav * (10 ** (dBFS_change / 20)) 108 | -------------------------------------------------------------------------------- /encoder/config.py: -------------------------------------------------------------------------------- 1 | librispeech_datasets = { 2 | "train": { 3 | "clean": ["LibriSpeech/train-clean-100", "LibriSpeech/train-clean-360"], 4 | "other": ["LibriSpeech/train-other-500"] 5 | }, 6 | "test": { 7 | "clean": ["LibriSpeech/test-clean"], 8 | "other": ["LibriSpeech/test-other"] 9 | }, 10 | "dev": { 11 | "clean": ["LibriSpeech/dev-clean"], 12 | "other": ["LibriSpeech/dev-other"] 13 | }, 14 | } 15 | libritts_datasets = { 16 | "train": { 17 | "clean": ["LibriTTS/train-clean-100", "LibriTTS/train-clean-360"], 18 | "other": ["LibriTTS/train-other-500"] 19 | }, 20 | "test": { 21 | "clean": ["LibriTTS/test-clean"], 22 | "other": ["LibriTTS/test-other"] 23 | }, 24 | "dev": { 25 | "clean": ["LibriTTS/dev-clean"], 26 | "other": ["LibriTTS/dev-other"] 27 | }, 28 | } 29 | voxceleb_datasets = { 30 | "voxceleb1" : { 31 | "train": ["VoxCeleb1/wav"], 32 | 
"test": ["VoxCeleb1/test_wav"] 33 | }, 34 | "voxceleb2" : { 35 | "train": ["VoxCeleb2/dev/aac"], 36 | "test": ["VoxCeleb2/test_wav"] 37 | } 38 | } 39 | aishell1_datasets = { 40 | "train": ["data_aishell/wav/train"], 41 | "dev": ["data_aishell/wav/dev"], 42 | "test": ["data_aishell/wav/test"], 43 | } 44 | 45 | magicdata_datasets = { 46 | "train": ["MagicData/train"], 47 | "dev": ["MagicData/dev"], 48 | "test": ["MagicData/test"], 49 | } 50 | 51 | aidatatang_datasets = { 52 | "train": ["aidatatang_200zh/corpus/train"], 53 | "dev": ["aidatatang_200zh/corpus/dev"], 54 | "test": ["aidatatang_200zh/corpus/test"], 55 | } 56 | 57 | thchs30_datasets = { 58 | "train": ["data_thchs30/train"], 59 | "dev": ["data_thchs30/dev"], 60 | "test": ["data_thchs30/test"], 61 | } 62 | 63 | mozilla_datasets = { 64 | "train": "Mozilla/train.tsv", 65 | "dev": "Mozilla/test.tsv", 66 | "test": "Mozilla/train.tsv", 67 | "validated": "Mozilla/validated.tsv", 68 | } 69 | 70 | stcmds_datasets = "ST-CMDS-20170001_1-OS" 71 | 72 | primewords_datasets = "primewords_md_2018_set1" 73 | 74 | other_datasets = [ 75 | "LJSpeech-1.1", 76 | "VCTK-Corpus/wav48", 77 | ] 78 | 79 | anglophone_nationalites = ["australia", "canada", "ireland", "uk", "usa"] 80 | -------------------------------------------------------------------------------- /encoder/data_objects/__init__.py: -------------------------------------------------------------------------------- 1 | from encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset 2 | from encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataLoader 3 | -------------------------------------------------------------------------------- /encoder/data_objects/random_cycler.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | class RandomCycler: 4 | """ 5 | Creates an internal copy of a sequence and allows access to its items in a constrained random 6 | order. For a source sequence of n items and one or several consecutive queries of a total 7 | of m items, the following guarantees hold (one implies the other): 8 | - Each item will be returned between m // n and ((m - 1) // n) + 1 times. 9 | - Between two appearances of the same item, there may be at most 2 * (n - 1) other items. 
10 | """ 11 | 12 | def __init__(self, source): 13 | if len(source) == 0: 14 | raise Exception("Can't create RandomCycler from an empty collection") 15 | self.all_items = list(source) 16 | self.next_items = [] 17 | 18 | def sample(self, count: int): 19 | shuffle = lambda l: random.sample(l, len(l)) 20 | 21 | out = [] 22 | while count > 0: 23 | if count >= len(self.all_items): 24 | out.extend(shuffle(list(self.all_items))) 25 | count -= len(self.all_items) 26 | continue 27 | n = min(count, len(self.next_items)) 28 | out.extend(self.next_items[:n]) 29 | count -= n 30 | self.next_items = self.next_items[n:] 31 | if len(self.next_items) == 0: 32 | self.next_items = shuffle(list(self.all_items)) 33 | return out 34 | 35 | def __next__(self): 36 | return self.sample(1)[0] 37 | 38 | -------------------------------------------------------------------------------- /encoder/data_objects/speaker.py: -------------------------------------------------------------------------------- 1 | from encoder.data_objects.random_cycler import RandomCycler 2 | from encoder.data_objects.utterance import Utterance 3 | from pathlib import Path 4 | 5 | # Contains the set of utterances of a single speaker 6 | class Speaker: 7 | def __init__(self, root: Path): 8 | self.root = root 9 | self.name = root.name 10 | self.utterances = None 11 | self.utterance_cycler = None 12 | 13 | def _load_utterances(self): 14 | with self.root.joinpath("_sources.txt").open("r") as sources_file: 15 | sources = [l.split(",") for l in sources_file] 16 | sources = {frames_fname: wave_fpath for frames_fname, wave_fpath in sources} 17 | self.utterances = [Utterance(self.root.joinpath(f), w) for f, w in sources.items()] 18 | self.utterance_cycler = RandomCycler(self.utterances) 19 | 20 | def random_partial(self, count, n_frames): 21 | """ 22 | Samples a batch of unique partial utterances from the disk in a way that all 23 | utterances come up at least once every two cycles and in a random order every time. 24 | 25 | :param count: The number of partial utterances to sample from the set of utterances from 26 | that speaker. Utterances are guaranteed not to be repeated if is not larger than 27 | the number of utterances available. 28 | :param n_frames: The number of frames in the partial utterance. 29 | :return: A list of tuples (utterance, frames, range) where utterance is an Utterance, 30 | frames are the frames of the partial utterances and range is the range of the partial 31 | utterance with regard to the complete utterance. 32 | """ 33 | if self.utterances is None: 34 | self._load_utterances() 35 | 36 | utterances = self.utterance_cycler.sample(count) 37 | 38 | a = [(u,) + u.random_partial(n_frames) for u in utterances] 39 | 40 | return a 41 | -------------------------------------------------------------------------------- /encoder/data_objects/speaker_batch.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from typing import List 3 | from encoder.data_objects.speaker import Speaker 4 | 5 | class SpeakerBatch: 6 | def __init__(self, speakers: List[Speaker], utterances_per_speaker: int, n_frames: int): 7 | self.speakers = speakers 8 | self.partials = {s: s.random_partial(utterances_per_speaker, n_frames) for s in speakers} 9 | 10 | # Array of shape (n_speakers * n_utterances, n_frames, mel_n), e.g. 
for 3 speakers with 11 | # 4 utterances each of 160 frames of 40 mel coefficients: (12, 160, 40) 12 | self.data = np.array([frames for s in speakers for _, frames, _ in self.partials[s]]) 13 | -------------------------------------------------------------------------------- /encoder/data_objects/speaker_verification_dataset.py: -------------------------------------------------------------------------------- 1 | from encoder.data_objects.random_cycler import RandomCycler 2 | from encoder.data_objects.speaker_batch import SpeakerBatch 3 | from encoder.data_objects.speaker import Speaker 4 | from encoder.params_data import partials_n_frames 5 | from torch.utils.data import Dataset, DataLoader 6 | from pathlib import Path 7 | 8 | # TODO: improve with a pool of speakers for data efficiency 9 | 10 | class SpeakerVerificationDataset(Dataset): 11 | def __init__(self, datasets_root: Path): 12 | self.root = datasets_root 13 | speaker_dirs = [f for f in self.root.glob("*") if f.is_dir()] 14 | if len(speaker_dirs) == 0: 15 | raise Exception("No speakers found. Make sure you are pointing to the directory " 16 | "containing all preprocessed speaker directories.") 17 | self.speakers = [Speaker(speaker_dir) for speaker_dir in speaker_dirs] 18 | self.speaker_cycler = RandomCycler(self.speakers) 19 | 20 | def __len__(self): 21 | return int(1e10) 22 | 23 | def __getitem__(self, index): 24 | return next(self.speaker_cycler) 25 | 26 | def get_logs(self): 27 | log_string = "" 28 | for log_fpath in self.root.glob("*.txt"): 29 | with log_fpath.open("r") as log_file: 30 | log_string += "".join(log_file.readlines()) 31 | return log_string 32 | 33 | 34 | class SpeakerVerificationDataLoader(DataLoader): 35 | def __init__(self, dataset, speakers_per_batch, utterances_per_speaker, sampler=None, 36 | batch_sampler=None, num_workers=0, pin_memory=False, timeout=0, 37 | worker_init_fn=None): 38 | self.utterances_per_speaker = utterances_per_speaker 39 | 40 | super().__init__( 41 | dataset=dataset, 42 | batch_size=speakers_per_batch, 43 | shuffle=False, 44 | sampler=sampler, 45 | batch_sampler=batch_sampler, 46 | num_workers=num_workers, 47 | collate_fn=self.collate, 48 | pin_memory=pin_memory, 49 | drop_last=False, 50 | timeout=timeout, 51 | worker_init_fn=worker_init_fn 52 | ) 53 | 54 | def collate(self, speakers): 55 | return SpeakerBatch(speakers, self.utterances_per_speaker, partials_n_frames) 56 | -------------------------------------------------------------------------------- /encoder/data_objects/utterance.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class Utterance: 5 | def __init__(self, frames_fpath, wave_fpath): 6 | self.frames_fpath = frames_fpath 7 | self.wave_fpath = wave_fpath 8 | 9 | def get_frames(self): 10 | return np.load(self.frames_fpath) 11 | 12 | def random_partial(self, n_frames): 13 | """ 14 | Crops the frames into a partial utterance of n_frames 15 | 16 | :param n_frames: The number of frames of the partial utterance 17 | :return: the partial utterance frames and a tuple indicating the start and end of the 18 | partial utterance in the complete utterance. 
19 | """ 20 | frames = self.get_frames() 21 | if frames.shape[0] == n_frames: 22 | start = 0 23 | else: 24 | start = np.random.randint(0, frames.shape[0] - n_frames) 25 | end = start + n_frames 26 | return frames[start:end], (start, end) -------------------------------------------------------------------------------- /encoder/inference.py: -------------------------------------------------------------------------------- 1 | from encoder.params_data import * 2 | from encoder.model import SpeakerEncoder 3 | from encoder.audio import preprocess_wav # We want to expose this function from here 4 | from matplotlib import cm 5 | from encoder import audio 6 | from pathlib import Path 7 | import matplotlib.pyplot as plt 8 | import numpy as np 9 | import torch 10 | 11 | _model = None # type: SpeakerEncoder 12 | _device = None # type: torch.device 13 | 14 | 15 | def load_model(weights_fpath: Path, device=None): 16 | """ 17 | Loads the model in memory. If this function is not explicitely called, it will be run on the 18 | first call to embed_frames() with the default weights file. 19 | 20 | :param weights_fpath: the path to saved model weights. 21 | :param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda"). The 22 | model will be loaded and will run on this device. Outputs will however always be on the cpu. 23 | If None, will default to your GPU if it"s available, otherwise your CPU. 24 | """ 25 | # TODO: I think the slow loading of the encoder might have something to do with the device it 26 | # was saved on. Worth investigating. 27 | global _model, _device 28 | if device is None: 29 | _device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 30 | elif isinstance(device, str): 31 | _device = torch.device(device) 32 | _model = SpeakerEncoder(_device, torch.device("cpu")) 33 | checkpoint = torch.load(weights_fpath) 34 | _model.load_state_dict(checkpoint["model_state"]) 35 | _model.eval() 36 | print("Loaded encoder \"%s\" trained to step %d" % (weights_fpath.name, checkpoint["step"])) 37 | 38 | 39 | def is_loaded(): 40 | return _model is not None 41 | 42 | 43 | def embed_frames_batch(frames_batch): 44 | """ 45 | Computes embeddings for a batch of mel spectrogram. 46 | 47 | :param frames_batch: a batch mel of spectrogram as a numpy array of float32 of shape 48 | (batch_size, n_frames, n_channels) 49 | :return: the embeddings as a numpy array of float32 of shape (batch_size, model_embedding_size) 50 | """ 51 | if _model is None: 52 | raise Exception("Model was not loaded. Call load_model() before inference.") 53 | 54 | frames = torch.from_numpy(frames_batch).to(_device) 55 | embed = _model.forward(frames).detach().cpu().numpy() 56 | return embed 57 | 58 | 59 | def compute_partial_slices(n_samples, partial_utterance_n_frames=partials_n_frames, 60 | min_pad_coverage=0.75, overlap=0.5): 61 | """ 62 | Computes where to split an utterance waveform and its corresponding mel spectrogram to obtain 63 | partial utterances of each. Both the waveform and the mel 64 | spectrogram slices are returned, so as to make each partial utterance waveform correspond to 65 | its spectrogram. This function assumes that the mel spectrogram parameters used are those 66 | defined in params_data.py. 67 | 68 | The returned ranges may be indexing further than the length of the waveform. It is 69 | recommended that you pad the waveform with zeros up to wave_slices[-1].stop. 
70 | 71 | :param n_samples: the number of samples in the waveform 72 | :param partial_utterance_n_frames: the number of mel spectrogram frames in each partial 73 | utterance 74 | :param min_pad_coverage: when reaching the last partial utterance, it may or may not have 75 | enough frames. If at least of are present, 76 | then the last partial utterance will be considered, as if we padded the audio. Otherwise, 77 | it will be discarded, as if we trimmed the audio. If there aren't enough frames for 1 partial 78 | utterance, this parameter is ignored so that the function always returns at least 1 slice. 79 | :param overlap: by how much the partial utterance should overlap. If set to 0, the partial 80 | utterances are entirely disjoint. 81 | :return: the waveform slices and mel spectrogram slices as lists of array slices. Index 82 | respectively the waveform and the mel spectrogram with these slices to obtain the partial 83 | utterances. 84 | """ 85 | assert 0 <= overlap < 1 86 | assert 0 < min_pad_coverage <= 1 87 | 88 | samples_per_frame = int((sampling_rate * mel_window_step / 1000)) 89 | n_frames = int(np.ceil((n_samples + 1) / samples_per_frame)) 90 | frame_step = max(int(np.round(partial_utterance_n_frames * (1 - overlap))), 1) 91 | 92 | # Compute the slices 93 | wav_slices, mel_slices = [], [] 94 | steps = max(1, n_frames - partial_utterance_n_frames + frame_step + 1) 95 | for i in range(0, steps, frame_step): 96 | mel_range = np.array([i, i + partial_utterance_n_frames]) 97 | wav_range = mel_range * samples_per_frame 98 | mel_slices.append(slice(*mel_range)) 99 | wav_slices.append(slice(*wav_range)) 100 | 101 | # Evaluate whether extra padding is warranted or not 102 | last_wav_range = wav_slices[-1] 103 | coverage = (n_samples - last_wav_range.start) / (last_wav_range.stop - last_wav_range.start) 104 | if coverage < min_pad_coverage and len(mel_slices) > 1: 105 | mel_slices = mel_slices[:-1] 106 | wav_slices = wav_slices[:-1] 107 | 108 | return wav_slices, mel_slices 109 | 110 | 111 | def embed_utterance(wav, using_partials=True, return_partials=False, **kwargs): 112 | """ 113 | Computes an embedding for a single utterance. 114 | 115 | # TODO: handle multiple wavs to benefit from batching on GPU 116 | :param wav: a preprocessed (see audio.py) utterance waveform as a numpy array of float32 117 | :param using_partials: if True, then the utterance is split in partial utterances of 118 | frames and the utterance embedding is computed from their 119 | normalized average. If False, the utterance is instead computed from feeding the entire 120 | spectogram to the network. 121 | :param return_partials: if True, the partial embeddings will also be returned along with the 122 | wav slices that correspond to the partial embeddings. 123 | :param kwargs: additional arguments to compute_partial_splits() 124 | :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If 125 | is True, the partial utterances as a numpy array of float32 of shape 126 | (n_partials, model_embedding_size) and the wav partials as a list of slices will also be 127 | returned. If is simultaneously set to False, both these values will be None 128 | instead. 
129 | """ 130 | # Process the entire utterance if not using partials 131 | if not using_partials: 132 | frames = audio.wav_to_mel_spectrogram(wav) 133 | embed = embed_frames_batch(frames[None, ...])[0] 134 | if return_partials: 135 | return embed, None, None 136 | return embed 137 | 138 | # Compute where to split the utterance into partials and pad if necessary 139 | wave_slices, mel_slices = compute_partial_slices(len(wav), **kwargs) 140 | max_wave_length = wave_slices[-1].stop 141 | if max_wave_length >= len(wav): 142 | wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant") 143 | 144 | # Split the utterance into partials 145 | frames = audio.wav_to_mel_spectrogram(wav) 146 | frames_batch = np.array([frames[s] for s in mel_slices]) 147 | partial_embeds = embed_frames_batch(frames_batch) 148 | 149 | # Compute the utterance embedding from the partial embeddings 150 | raw_embed = np.mean(partial_embeds, axis=0) 151 | embed = raw_embed / np.linalg.norm(raw_embed, 2) 152 | 153 | if return_partials: 154 | return embed, partial_embeds, wave_slices 155 | return embed 156 | 157 | 158 | def embed_speaker(wavs, **kwargs): 159 | raise NotImplemented() 160 | 161 | 162 | def plot_embedding_as_heatmap(embed, ax=None, title="", shape=None, color_range=(0, 0.30)): 163 | if ax is None: 164 | ax = plt.gca() 165 | 166 | if shape is None: 167 | height = int(np.sqrt(len(embed))) 168 | shape = (height, -1) 169 | embed = embed.reshape(shape) 170 | 171 | cmap = cm.get_cmap() 172 | mappable = ax.imshow(embed, cmap=cmap) 173 | cbar = plt.colorbar(mappable, ax=ax, fraction=0.046, pad=0.04) 174 | cbar.set_clim(*color_range) 175 | 176 | ax.set_xticks([]), ax.set_yticks([]) 177 | ax.set_title(title) 178 | -------------------------------------------------------------------------------- /encoder/model.py: -------------------------------------------------------------------------------- 1 | from encoder.params_model import * 2 | from encoder.params_data import * 3 | from scipy.interpolate import interp1d 4 | from sklearn.metrics import roc_curve 5 | from torch.nn.utils import clip_grad_norm_ 6 | from scipy.optimize import brentq 7 | from torch import nn 8 | import numpy as np 9 | import torch 10 | 11 | 12 | class SpeakerEncoder(nn.Module): 13 | def __init__(self, device, loss_device): 14 | super().__init__() 15 | self.loss_device = loss_device 16 | 17 | # Network defition 18 | self.lstm = nn.LSTM(input_size=mel_n_channels, 19 | hidden_size=model_hidden_size, 20 | num_layers=model_num_layers, 21 | batch_first=True).to(device) 22 | self.linear = nn.Linear(in_features=model_hidden_size, 23 | out_features=model_embedding_size).to(device) 24 | self.relu = torch.nn.ReLU().to(device) 25 | 26 | # Cosine similarity scaling (with fixed initial parameter values) 27 | self.similarity_weight = nn.Parameter(torch.tensor([10.])).to(loss_device) 28 | self.similarity_bias = nn.Parameter(torch.tensor([-5.])).to(loss_device) 29 | 30 | # Loss 31 | self.loss_fn = nn.CrossEntropyLoss().to(loss_device) 32 | 33 | def do_gradient_ops(self): 34 | # Gradient scale 35 | self.similarity_weight.grad *= 0.01 36 | self.similarity_bias.grad *= 0.01 37 | 38 | # Gradient clipping 39 | clip_grad_norm_(self.parameters(), 3, norm_type=2) 40 | 41 | def forward(self, utterances, hidden_init=None): 42 | """ 43 | Computes the embeddings of a batch of utterance spectrograms. 
44 | 45 | :param utterances: batch of mel-scale filterbanks of same duration as a tensor of shape 46 | (batch_size, n_frames, n_channels) 47 | :param hidden_init: initial hidden state of the LSTM as a tensor of shape (num_layers, 48 | batch_size, hidden_size). Will default to a tensor of zeros if None. 49 | :return: the embeddings as a tensor of shape (batch_size, embedding_size) 50 | """ 51 | # Pass the input through the LSTM layers and retrieve all outputs, the final hidden state 52 | # and the final cell state. 53 | out, (hidden, cell) = self.lstm(utterances, hidden_init) 54 | 55 | # We take only the hidden state of the last layer 56 | embeds_raw = self.relu(self.linear(hidden[-1])) 57 | 58 | # L2-normalize it 59 | embeds = embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True) 60 | 61 | return embeds 62 | 63 | def similarity_matrix(self, embeds): 64 | """ 65 | Computes the similarity matrix according the section 2.1 of GE2E. 66 | 67 | :param embeds: the embeddings as a tensor of shape (speakers_per_batch, 68 | utterances_per_speaker, embedding_size) 69 | :return: the similarity matrix as a tensor of shape (speakers_per_batch, 70 | utterances_per_speaker, speakers_per_batch) 71 | """ 72 | speakers_per_batch, utterances_per_speaker = embeds.shape[:2] 73 | 74 | # Inclusive centroids (1 per speaker). Cloning is needed for reverse differentiation 75 | centroids_incl = torch.mean(embeds, dim=1, keepdim=True) 76 | centroids_incl = centroids_incl.clone() / torch.norm(centroids_incl, dim=2, keepdim=True) 77 | 78 | # Exclusive centroids (1 per utterance) 79 | centroids_excl = (torch.sum(embeds, dim=1, keepdim=True) - embeds) 80 | centroids_excl /= (utterances_per_speaker - 1) 81 | centroids_excl = centroids_excl.clone() / torch.norm(centroids_excl, dim=2, keepdim=True) 82 | 83 | # Similarity matrix. The cosine similarity of already 2-normed vectors is simply the dot 84 | # product of these vectors (which is just an element-wise multiplication reduced by a sum). 85 | # We vectorize the computation for efficiency. 86 | sim_matrix = torch.zeros(speakers_per_batch, utterances_per_speaker, 87 | speakers_per_batch).to(self.loss_device) 88 | mask_matrix = 1 - np.eye(speakers_per_batch, dtype=np.int) 89 | for j in range(speakers_per_batch): 90 | mask = np.where(mask_matrix[j])[0] 91 | sim_matrix[mask, :, j] = (embeds[mask] * centroids_incl[j]).sum(dim=2) 92 | sim_matrix[j, :, j] = (embeds[j] * centroids_excl[j]).sum(dim=1) 93 | 94 | ## Even more vectorized version (slower maybe because of transpose) 95 | # sim_matrix2 = torch.zeros(speakers_per_batch, speakers_per_batch, utterances_per_speaker 96 | # ).to(self.loss_device) 97 | # eye = np.eye(speakers_per_batch, dtype=np.int) 98 | # mask = np.where(1 - eye) 99 | # sim_matrix2[mask] = (embeds[mask[0]] * centroids_incl[mask[1]]).sum(dim=2) 100 | # mask = np.where(eye) 101 | # sim_matrix2[mask] = (embeds * centroids_excl).sum(dim=2) 102 | # sim_matrix2 = sim_matrix2.transpose(1, 2) 103 | 104 | sim_matrix = sim_matrix * self.similarity_weight + self.similarity_bias 105 | return sim_matrix 106 | 107 | def loss(self, embeds): 108 | """ 109 | Computes the softmax loss according the section 2.1 of GE2E. 110 | 111 | :param embeds: the embeddings as a tensor of shape (speakers_per_batch, 112 | utterances_per_speaker, embedding_size) 113 | :return: the loss and the EER for this batch of embeddings. 
114 | """ 115 | speakers_per_batch, utterances_per_speaker = embeds.shape[:2] 116 | 117 | # Loss 118 | sim_matrix = self.similarity_matrix(embeds) 119 | sim_matrix = sim_matrix.reshape((speakers_per_batch * utterances_per_speaker, 120 | speakers_per_batch)) 121 | ground_truth = np.repeat(np.arange(speakers_per_batch), utterances_per_speaker) 122 | target = torch.from_numpy(ground_truth).long().to(self.loss_device) 123 | loss = self.loss_fn(sim_matrix, target) 124 | 125 | # EER (not backpropagated) 126 | with torch.no_grad(): 127 | inv_argmax = lambda i: np.eye(1, speakers_per_batch, i, dtype=np.int)[0] 128 | labels = np.array([inv_argmax(i) for i in ground_truth]) 129 | preds = sim_matrix.detach().cpu().numpy() 130 | 131 | # Snippet from https://yangcha.github.io/EER-ROC/ 132 | fpr, tpr, thresholds = roc_curve(labels.flatten(), preds.flatten()) 133 | eer = brentq(lambda x: 1. - x - interp1d(fpr, tpr)(x), 0., 1.) 134 | 135 | return loss, eer -------------------------------------------------------------------------------- /encoder/params_data.py: -------------------------------------------------------------------------------- 1 | 2 | ## Mel-filterbank 3 | mel_window_length = 25 # In milliseconds 4 | mel_window_step = 10 # In milliseconds 5 | mel_n_channels = 40 6 | 7 | 8 | ## Audio 9 | sampling_rate = 16000 10 | # Number of spectrogram frames in a partial utterance 11 | partials_n_frames = 160 # 1600 ms 12 | # Number of spectrogram frames at inference 13 | inference_n_frames = 80 # 800 ms 14 | 15 | 16 | ## Voice Activation Detection 17 | # Window size of the VAD. Must be either 10, 20 or 30 milliseconds. 18 | # This sets the granularity of the VAD. Should not need to be changed. 19 | vad_window_length = 30 # In milliseconds 20 | # Number of frames to average together when performing the moving average smoothing. 21 | # The larger this value, the larger the VAD variations must be to not get smoothed out. 22 | vad_moving_average_width = 8 23 | # Maximum number of consecutive silent frames a segment can have. 
24 | vad_max_silence_length = 6 25 | 26 | 27 | ## Audio volume normalization 28 | audio_norm_target_dBFS = -30 29 | 30 | -------------------------------------------------------------------------------- /encoder/params_model.py: -------------------------------------------------------------------------------- 1 | 2 | ## Model parameters 3 | model_hidden_size = 256 4 | model_embedding_size = 256 5 | model_num_layers = 3 6 | 7 | 8 | ## Training parameters 9 | learning_rate_init = 1e-4 10 | speakers_per_batch = 64 11 | utterances_per_speaker = 10 12 | -------------------------------------------------------------------------------- /encoder/train.py: -------------------------------------------------------------------------------- 1 | from encoder.visualizations import Visualizations 2 | from encoder.data_objects import SpeakerVerificationDataLoader, SpeakerVerificationDataset 3 | from encoder.params_model import * 4 | from encoder.model import SpeakerEncoder 5 | from utils.profiler import Profiler 6 | from pathlib import Path 7 | import torch 8 | 9 | def sync(device: torch.device): 10 | # FIXME 11 | return 12 | # For correct profiling (cuda operations are async) 13 | if device.type == "cuda": 14 | torch.cuda.synchronize(device) 15 | 16 | def train(run_id: str, clean_data_root: Path, models_dir: Path, umap_every: int, save_every: int, 17 | backup_every: int, vis_every: int, force_restart: bool, visdom_server: str, 18 | no_visdom: bool): 19 | # Create a dataset and a dataloader 20 | dataset = SpeakerVerificationDataset(clean_data_root) 21 | loader = SpeakerVerificationDataLoader( 22 | dataset, 23 | speakers_per_batch, 24 | utterances_per_speaker, 25 | num_workers=8, 26 | ) 27 | 28 | # Setup the device on which to run the forward pass and the loss. These can be different, 29 | # because the forward pass is faster on the GPU whereas the loss is often (depending on your 30 | # hyperparameters) faster on the CPU. 31 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 32 | # FIXME: currently, the gradient is None if loss_device is cuda 33 | loss_device = torch.device("cpu") 34 | 35 | # Create the model and the optimizer 36 | model = SpeakerEncoder(device, loss_device) 37 | optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate_init) 38 | init_step = 1 39 | 40 | # Configure file path for the model 41 | state_fpath = models_dir.joinpath(run_id + ".pt") 42 | backup_dir = models_dir.joinpath(run_id + "_backups") 43 | 44 | # Load any existing model 45 | if not force_restart: 46 | if state_fpath.exists(): 47 | print("Found existing model \"%s\", loading it and resuming training." % run_id) 48 | checkpoint = torch.load(state_fpath) 49 | init_step = checkpoint["step"] 50 | model.load_state_dict(checkpoint["model_state"]) 51 | optimizer.load_state_dict(checkpoint["optimizer_state"]) 52 | optimizer.param_groups[0]["lr"] = learning_rate_init 53 | else: 54 | print("No model \"%s\" found, starting training from scratch." 
% run_id) 55 | else: 56 | print("Starting the training from scratch.") 57 | model.train() 58 | 59 | # Initialize the visualization environment 60 | vis = Visualizations(run_id, vis_every, server=visdom_server, disabled=no_visdom) 61 | vis.log_dataset(dataset) 62 | vis.log_params() 63 | device_name = str(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU") 64 | vis.log_implementation({"Device": device_name}) 65 | 66 | # Training loop 67 | profiler = Profiler(summarize_every=10, disabled=False) 68 | for step, speaker_batch in enumerate(loader, init_step): 69 | profiler.tick("Blocking, waiting for batch (threaded)") 70 | 71 | # Forward pass 72 | inputs = torch.from_numpy(speaker_batch.data).to(device) 73 | sync(device) 74 | profiler.tick("Data to %s" % device) 75 | embeds = model(inputs) 76 | sync(device) 77 | profiler.tick("Forward pass") 78 | embeds_loss = embeds.view((speakers_per_batch, utterances_per_speaker, -1)).to(loss_device) 79 | loss, eer = model.loss(embeds_loss) 80 | sync(loss_device) 81 | profiler.tick("Loss") 82 | 83 | # Backward pass 84 | model.zero_grad() 85 | loss.backward() 86 | profiler.tick("Backward pass") 87 | model.do_gradient_ops() 88 | optimizer.step() 89 | profiler.tick("Parameter update") 90 | 91 | # Update visualizations 92 | # learning_rate = optimizer.param_groups[0]["lr"] 93 | vis.update(loss.item(), eer, step) 94 | 95 | # Draw projections and save them to the backup folder 96 | if umap_every != 0 and step % umap_every == 0: 97 | print("Drawing and saving projections (step %d)" % step) 98 | backup_dir.mkdir(exist_ok=True) 99 | projection_fpath = backup_dir.joinpath("%s_umap_%06d.png" % (run_id, step)) 100 | embeds = embeds.detach().cpu().numpy() 101 | vis.draw_projections(embeds, utterances_per_speaker, step, projection_fpath) 102 | vis.save() 103 | 104 | # Overwrite the latest version of the model 105 | if save_every != 0 and step % save_every == 0: 106 | print("Saving the model (step %d)" % step) 107 | torch.save({ 108 | "step": step + 1, 109 | "model_state": model.state_dict(), 110 | "optimizer_state": optimizer.state_dict(), 111 | }, state_fpath) 112 | 113 | # Make a backup 114 | if backup_every != 0 and step % backup_every == 0: 115 | print("Making a backup (step %d)" % step) 116 | backup_dir.mkdir(exist_ok=True) 117 | backup_fpath = backup_dir.joinpath("%s_bak_%06d.pt" % (run_id, step)) 118 | torch.save({ 119 | "step": step + 1, 120 | "model_state": model.state_dict(), 121 | "optimizer_state": optimizer.state_dict(), 122 | }, backup_fpath) 123 | 124 | profiler.tick("Extras (visualizations, saving)") 125 | -------------------------------------------------------------------------------- /encoder/visualizations.py: -------------------------------------------------------------------------------- 1 | from encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset 2 | from datetime import datetime 3 | from time import perf_counter as timer 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | # import webbrowser 7 | import visdom 8 | import umap 9 | 10 | colormap = np.array([ 11 | [76, 255, 0], 12 | [0, 127, 70], 13 | [255, 0, 0], 14 | [255, 217, 38], 15 | [0, 135, 255], 16 | [165, 0, 165], 17 | [255, 167, 255], 18 | [0, 255, 255], 19 | [255, 96, 38], 20 | [142, 76, 0], 21 | [33, 0, 127], 22 | [0, 0, 0], 23 | [183, 183, 183], 24 | ], dtype=np.float) / 255 25 | 26 | 27 | class Visualizations: 28 | def __init__(self, env_name=None, update_every=10, server="http://localhost", disabled=False): 29 | # Tracking data 
30 | self.last_update_timestamp = timer() 31 | self.update_every = update_every 32 | self.step_times = [] 33 | self.losses = [] 34 | self.eers = [] 35 | print("Updating the visualizations every %d steps." % update_every) 36 | 37 | # If visdom is disabled TODO: use a better paradigm for that 38 | self.disabled = disabled 39 | if self.disabled: 40 | return 41 | 42 | # Set the environment name 43 | now = str(datetime.now().strftime("%d-%m %Hh%M")) 44 | if env_name is None: 45 | self.env_name = now 46 | else: 47 | self.env_name = "%s (%s)" % (env_name, now) 48 | 49 | # Connect to visdom and open the corresponding window in the browser 50 | try: 51 | self.vis = visdom.Visdom(server, env=self.env_name, raise_exceptions=True) 52 | except ConnectionError: 53 | raise Exception("No visdom server detected. Run the command \"visdom\" in your CLI to " 54 | "start it.") 55 | # webbrowser.open("http://localhost:8097/env/" + self.env_name) 56 | 57 | # Create the windows 58 | self.loss_win = None 59 | self.eer_win = None 60 | # self.lr_win = None 61 | self.implementation_win = None 62 | self.projection_win = None 63 | self.implementation_string = "" 64 | 65 | def log_params(self): 66 | if self.disabled: 67 | return 68 | from encoder import params_data 69 | from encoder import params_model 70 | param_string = "Model parameters:
" 71 | for param_name in (p for p in dir(params_model) if not p.startswith("__")): 72 | value = getattr(params_model, param_name) 73 | param_string += "\t%s: %s
" % (param_name, value) 74 | param_string += "Data parameters:
" 75 | for param_name in (p for p in dir(params_data) if not p.startswith("__")): 76 | value = getattr(params_data, param_name) 77 | param_string += "\t%s: %s
" % (param_name, value) 78 | self.vis.text(param_string, opts={"title": "Parameters"}) 79 | 80 | def log_dataset(self, dataset: SpeakerVerificationDataset): 81 | if self.disabled: 82 | return 83 | dataset_string = "" 84 | dataset_string += "Speakers: %s\n" % len(dataset.speakers) 85 | dataset_string += "\n" + dataset.get_logs() 86 | dataset_string = dataset_string.replace("\n", "
") 87 | self.vis.text(dataset_string, opts={"title": "Dataset"}) 88 | 89 | def log_implementation(self, params): 90 | if self.disabled: 91 | return 92 | implementation_string = "" 93 | for param, value in params.items(): 94 | implementation_string += "%s: %s\n" % (param, value) 95 | implementation_string = implementation_string.replace("\n", "
") 96 | self.implementation_string = implementation_string 97 | self.implementation_win = self.vis.text( 98 | implementation_string, 99 | opts={"title": "Training implementation"} 100 | ) 101 | 102 | def update(self, loss, eer, step): 103 | # Update the tracking data 104 | now = timer() 105 | self.step_times.append(1000 * (now - self.last_update_timestamp)) 106 | self.last_update_timestamp = now 107 | self.losses.append(loss) 108 | self.eers.append(eer) 109 | print(".", end="") 110 | 111 | # Update the plots every steps 112 | if step % self.update_every != 0: 113 | return 114 | time_string = "Step time: mean: %5dms std: %5dms" % \ 115 | (int(np.mean(self.step_times)), int(np.std(self.step_times))) 116 | print("\nStep %6d Loss: %.4f EER: %.4f %s" % 117 | (step, np.mean(self.losses), np.mean(self.eers), time_string)) 118 | if not self.disabled: 119 | self.loss_win = self.vis.line( 120 | [np.mean(self.losses)], 121 | [step], 122 | win=self.loss_win, 123 | update="append" if self.loss_win else None, 124 | opts=dict( 125 | legend=["Avg. loss"], 126 | xlabel="Step", 127 | ylabel="Loss", 128 | title="Loss", 129 | ) 130 | ) 131 | self.eer_win = self.vis.line( 132 | [np.mean(self.eers)], 133 | [step], 134 | win=self.eer_win, 135 | update="append" if self.eer_win else None, 136 | opts=dict( 137 | legend=["Avg. EER"], 138 | xlabel="Step", 139 | ylabel="EER", 140 | title="Equal error rate" 141 | ) 142 | ) 143 | if self.implementation_win is not None: 144 | self.vis.text( 145 | self.implementation_string + ("%s" % time_string), 146 | win=self.implementation_win, 147 | opts={"title": "Training implementation"}, 148 | ) 149 | 150 | # Reset the tracking 151 | self.losses.clear() 152 | self.eers.clear() 153 | self.step_times.clear() 154 | 155 | def draw_projections(self, embeds, utterances_per_speaker, step, out_fpath=None, 156 | max_speakers=10): 157 | max_speakers = min(max_speakers, len(colormap)) 158 | embeds = embeds[:max_speakers * utterances_per_speaker] 159 | 160 | n_speakers = len(embeds) // utterances_per_speaker 161 | ground_truth = np.repeat(np.arange(n_speakers), utterances_per_speaker) 162 | colors = [colormap[i] for i in ground_truth] 163 | 164 | reducer = umap.UMAP() 165 | projected = reducer.fit_transform(embeds) 166 | plt.scatter(projected[:, 0], projected[:, 1], c=colors) 167 | plt.gca().set_aspect("equal", "datalim") 168 | plt.title("UMAP projection (step %d)" % step) 169 | if not self.disabled: 170 | self.projection_win = self.vis.matplot(plt, win=self.projection_win) 171 | if out_fpath is not None: 172 | plt.savefig(out_fpath) 173 | plt.clf() 174 | 175 | def save(self): 176 | if not self.disabled: 177 | self.vis.save([self.env_name]) 178 | -------------------------------------------------------------------------------- /encoder_preprocess.py: -------------------------------------------------------------------------------- 1 | from encoder.preprocess import preprocess_librispeech, preprocess_voxceleb1, preprocess_voxceleb2, preprocess_aishell1, preprocess_magicdata, preprocess_aidatatang, preprocess_thchs30, preprocess_mozilla, preprocess_primewords, preprocess_stcmds 2 | from utils.argutils import print_args 3 | from pathlib import Path 4 | import argparse 5 | 6 | 7 | if __name__ == "__main__": 8 | class MyFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter): 9 | pass 10 | 11 | parser = argparse.ArgumentParser( 12 | description="Preprocesses audio files from datasets, encodes them as mel spectrograms and " 13 | "writes them to the disk. 
This will allow you to train the encoder. The " 14 | "datasets required are at least one of VoxCeleb1, VoxCeleb2 and LibriSpeech. " 15 | "Ideally, you should have all three. You should extract them as they are " 16 | "after having downloaded them and put them in a same directory, e.g.:\n" 17 | "-[datasets_root]\n" 18 | " -LibriSpeech\n" 19 | " -train-other-500\n" 20 | " -VoxCeleb1\n" 21 | " -wav\n" 22 | " -vox1_meta.csv\n" 23 | " -VoxCeleb2\n" 24 | " -dev", 25 | formatter_class=MyFormatter 26 | ) 27 | parser.add_argument("datasets_root", type=Path, help=\ 28 | "Path to the directory containing your LibriSpeech/TTS and VoxCeleb datasets.") 29 | parser.add_argument("-o", "--out_dir", type=Path, default=argparse.SUPPRESS, help=\ 30 | "Path to the output directory that will contain the mel spectrograms. If left out, " 31 | "defaults to /SV2TTS/encoder/") 32 | parser.add_argument("-d", "--datasets", type=str, 33 | default="aishell1,magicdata,aidatatang,thchs30,mozilla,primewords,stcmds", help=\ 34 | "Comma-separated list of the name of the datasets you want to preprocess. Only the train " 35 | "set of these datasets will be used. Possible names: librispeech_other, voxceleb1, " 36 | "voxceleb2.") 37 | parser.add_argument("-s", "--skip_existing", action="store_true", help=\ 38 | "Whether to skip existing output files with the same name. Useful if this script was " 39 | "interrupted.") 40 | args = parser.parse_args() 41 | 42 | # Process the arguments 43 | args.datasets = args.datasets.split(",") 44 | if not hasattr(args, "out_dir"): 45 | args.out_dir = args.datasets_root.joinpath("SV2TTS", "encoder") 46 | assert args.datasets_root.exists() 47 | args.out_dir.mkdir(exist_ok=True, parents=True) 48 | 49 | # Preprocess the datasets 50 | print_args(args, parser) 51 | preprocess_func = { 52 | "librispeech_other": preprocess_librispeech, 53 | "voxceleb1": preprocess_voxceleb1, 54 | "voxceleb2": preprocess_voxceleb2, 55 | "aishell1": preprocess_aishell1, 56 | "magicdata": preprocess_magicdata, 57 | "aidatatang": preprocess_aidatatang, 58 | "thchs30": preprocess_thchs30, 59 | "mozilla": preprocess_mozilla, 60 | "primewords": preprocess_primewords, 61 | "stcmds": preprocess_stcmds, 62 | } 63 | args = vars(args) 64 | for dataset in args.pop("datasets"): 65 | print("Preprocessing %s" % dataset) 66 | preprocess_func[dataset](**args) 67 | -------------------------------------------------------------------------------- /encoder_train.py: -------------------------------------------------------------------------------- 1 | from utils.argutils import print_args 2 | from encoder.train import train 3 | from pathlib import Path 4 | import argparse 5 | 6 | 7 | if __name__ == "__main__": 8 | parser = argparse.ArgumentParser( 9 | description="Trains the speaker encoder. You must have run encoder_preprocess.py first.", 10 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 11 | ) 12 | 13 | parser.add_argument("run_id", type=str, help= \ 14 | "Name for this model instance. If a model state from the same run ID was previously " 15 | "saved, the training will restart from there. Pass -f to overwrite saved states and " 16 | "restart from scratch.") 17 | parser.add_argument("clean_data_root", type=Path, help= \ 18 | "Path to the output directory of encoder_preprocess.py. 
If you left the default " 19 | "output directory when preprocessing, it should be /SV2TTS/encoder/.") 20 | parser.add_argument("-m", "--models_dir", type=Path, default="encoder/saved_models/", help=\ 21 | "Path to the output directory that will contain the saved model weights, as well as " 22 | "backups of those weights and plots generated during training.") 23 | parser.add_argument("-v", "--vis_every", type=int, default=10, help= \ 24 | "Number of steps between updates of the loss and the plots.") 25 | parser.add_argument("-u", "--umap_every", type=int, default=100, help= \ 26 | "Number of steps between updates of the umap projection. Set to 0 to never update the " 27 | "projections.") 28 | parser.add_argument("-s", "--save_every", type=int, default=500, help= \ 29 | "Number of steps between updates of the model on the disk. Set to 0 to never save the " 30 | "model.") 31 | parser.add_argument("-b", "--backup_every", type=int, default=7500, help= \ 32 | "Number of steps between backups of the model. Set to 0 to never make backups of the " 33 | "model.") 34 | parser.add_argument("-f", "--force_restart", action="store_true", help= \ 35 | "Do not load any saved model.") 36 | parser.add_argument("--visdom_server", type=str, default="http://localhost") 37 | parser.add_argument("--no_visdom", action="store_true", help= \ 38 | "Disable visdom.") 39 | args = parser.parse_args() 40 | 41 | # Process the arguments 42 | args.models_dir.mkdir(exist_ok=True) 43 | 44 | # Run the training 45 | print_args(args, parser) 46 | train(**vars(args)) 47 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow-gpu>=1.10.0,<=1.14.0 2 | umap-learn 3 | visdom 4 | webrtcvad 5 | librosa>=0.5.1 6 | matplotlib>=2.0.2 7 | numpy>=1.14.0 8 | scipy>=1.0.0 9 | tqdm 10 | sounddevice 11 | Unidecode 12 | inflect 13 | PyQt5 14 | multiprocess 15 | numba 16 | pandas -------------------------------------------------------------------------------- /synthesizer/LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Original work Copyright (c) 2018 Rayhane Mama (https://github.com/Rayhane-mamah) 4 | Modified work Copyright (c) 2019 Corentin Jemine (https://github.com/CorentinJ) 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 
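A minimal command-line sketch of the encoder pipeline defined by encoder_preprocess.py and
encoder_train.py above (the dataset path is an illustrative placeholder; the flags are the
ones declared in those argument parsers, shown here with their default values):

    python encoder_preprocess.py <datasets_root> --datasets librispeech_other,voxceleb1,voxceleb2
    python encoder_train.py my_run <datasets_root>/SV2TTS/encoder/ --vis_every 10 --save_every 500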
23 | -------------------------------------------------------------------------------- /synthesizer/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------- /synthesizer/audio.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import librosa.filters 3 | import numpy as np 4 | import tensorflow as tf 5 | from scipy import signal 6 | from scipy.io import wavfile 7 | 8 | 9 | def load_wav(path, sr): 10 | return librosa.core.load(path, sr=sr)[0] 11 | 12 | def save_wav(wav, path, sr): 13 | wav *= 32767 / max(0.01, np.max(np.abs(wav))) 14 | #proposed by @dsmiller 15 | wavfile.write(path, sr, wav.astype(np.int16)) 16 | 17 | def save_wavenet_wav(wav, path, sr): 18 | librosa.output.write_wav(path, wav, sr=sr) 19 | 20 | def preemphasis(wav, k, preemphasize=True): 21 | if preemphasize: 22 | return signal.lfilter([1, -k], [1], wav) 23 | return wav 24 | 25 | def inv_preemphasis(wav, k, inv_preemphasize=True): 26 | if inv_preemphasize: 27 | return signal.lfilter([1], [1, -k], wav) 28 | return wav 29 | 30 | #From https://github.com/r9y9/wavenet_vocoder/blob/master/audio.py 31 | def start_and_end_indices(quantized, silence_threshold=2): 32 | for start in range(quantized.size): 33 | if abs(quantized[start] - 127) > silence_threshold: 34 | break 35 | for end in range(quantized.size - 1, 1, -1): 36 | if abs(quantized[end] - 127) > silence_threshold: 37 | break 38 | 39 | assert abs(quantized[start] - 127) > silence_threshold 40 | assert abs(quantized[end] - 127) > silence_threshold 41 | 42 | return start, end 43 | 44 | def get_hop_size(hparams): 45 | hop_size = hparams.hop_size 46 | if hop_size is None: 47 | assert hparams.frame_shift_ms is not None 48 | hop_size = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate) 49 | return hop_size 50 | 51 | def linearspectrogram(wav, hparams): 52 | D = _stft(preemphasis(wav, hparams.preemphasis, hparams.preemphasize), hparams) 53 | S = _amp_to_db(np.abs(D), hparams) - hparams.ref_level_db 54 | 55 | if hparams.signal_normalization: 56 | return _normalize(S, hparams) 57 | return S 58 | 59 | def melspectrogram(wav, hparams): 60 | D = _stft(preemphasis(wav, hparams.preemphasis, hparams.preemphasize), hparams) 61 | S = _amp_to_db(_linear_to_mel(np.abs(D), hparams), hparams) - hparams.ref_level_db 62 | 63 | if hparams.signal_normalization: 64 | return _normalize(S, hparams) 65 | return S 66 | 67 | def inv_linear_spectrogram(linear_spectrogram, hparams): 68 | """Converts linear spectrogram to waveform using librosa""" 69 | if hparams.signal_normalization: 70 | D = _denormalize(linear_spectrogram, hparams) 71 | else: 72 | D = linear_spectrogram 73 | 74 | S = _db_to_amp(D + hparams.ref_level_db) #Convert back to linear 75 | 76 | if hparams.use_lws: 77 | processor = _lws_processor(hparams) 78 | D = processor.run_lws(S.astype(np.float64).T ** hparams.power) 79 | y = processor.istft(D).astype(np.float32) 80 | return inv_preemphasis(y, hparams.preemphasis, hparams.preemphasize) 81 | else: 82 | return inv_preemphasis(_griffin_lim(S ** hparams.power, hparams), hparams.preemphasis, hparams.preemphasize) 83 | 84 | def inv_mel_spectrogram(mel_spectrogram, hparams): 85 | """Converts mel spectrogram to waveform using librosa""" 86 | if hparams.signal_normalization: 87 | D = _denormalize(mel_spectrogram, hparams) 88 | else: 89 | D = mel_spectrogram 90 | 91 | S = _mel_to_linear(_db_to_amp(D + 
hparams.ref_level_db), hparams) # Convert back to linear 92 | 93 | if hparams.use_lws: 94 | processor = _lws_processor(hparams) 95 | D = processor.run_lws(S.astype(np.float64).T ** hparams.power) 96 | y = processor.istft(D).astype(np.float32) 97 | return inv_preemphasis(y, hparams.preemphasis, hparams.preemphasize) 98 | else: 99 | return inv_preemphasis(_griffin_lim(S ** hparams.power, hparams), hparams.preemphasis, hparams.preemphasize) 100 | 101 | def _lws_processor(hparams): 102 | import lws 103 | return lws.lws(hparams.n_fft, get_hop_size(hparams), fftsize=hparams.win_size, mode="speech") 104 | 105 | def _griffin_lim(S, hparams): 106 | """librosa implementation of Griffin-Lim 107 | Based on https://github.com/librosa/librosa/issues/434 108 | """ 109 | angles = np.exp(2j * np.pi * np.random.rand(*S.shape)) 110 | S_complex = np.abs(S).astype(np.complex) 111 | y = _istft(S_complex * angles, hparams) 112 | for i in range(hparams.griffin_lim_iters): 113 | angles = np.exp(1j * np.angle(_stft(y, hparams))) 114 | y = _istft(S_complex * angles, hparams) 115 | return y 116 | 117 | def _stft(y, hparams): 118 | if hparams.use_lws: 119 | return _lws_processor(hparams).stft(y).T 120 | else: 121 | return librosa.stft(y=y, n_fft=hparams.n_fft, hop_length=get_hop_size(hparams), win_length=hparams.win_size) 122 | 123 | def _istft(y, hparams): 124 | return librosa.istft(y, hop_length=get_hop_size(hparams), win_length=hparams.win_size) 125 | 126 | ########################################################## 127 | #Those are only correct when using lws!!! (This was messing with Wavenet quality for a long time!) 128 | def num_frames(length, fsize, fshift): 129 | """Compute number of time frames of spectrogram 130 | """ 131 | pad = (fsize - fshift) 132 | if length % fshift == 0: 133 | M = (length + pad * 2 - fsize) // fshift + 1 134 | else: 135 | M = (length + pad * 2 - fsize) // fshift + 2 136 | return M 137 | 138 | 139 | def pad_lr(x, fsize, fshift): 140 | """Compute left and right padding 141 | """ 142 | M = num_frames(len(x), fsize, fshift) 143 | pad = (fsize - fshift) 144 | T = len(x) + 2 * pad 145 | r = (M - 1) * fshift + fsize - T 146 | return pad, pad + r 147 | ########################################################## 148 | #Librosa correct padding 149 | def librosa_pad_lr(x, fsize, fshift): 150 | return 0, (x.shape[0] // fshift + 1) * fshift - x.shape[0] 151 | 152 | # Conversions 153 | _mel_basis = None 154 | _inv_mel_basis = None 155 | 156 | def _linear_to_mel(spectogram, hparams): 157 | global _mel_basis 158 | if _mel_basis is None: 159 | _mel_basis = _build_mel_basis(hparams) 160 | return np.dot(_mel_basis, spectogram) 161 | 162 | def _mel_to_linear(mel_spectrogram, hparams): 163 | global _inv_mel_basis 164 | if _inv_mel_basis is None: 165 | _inv_mel_basis = np.linalg.pinv(_build_mel_basis(hparams)) 166 | return np.maximum(1e-10, np.dot(_inv_mel_basis, mel_spectrogram)) 167 | 168 | def _build_mel_basis(hparams): 169 | assert hparams.fmax <= hparams.sample_rate // 2 170 | return librosa.filters.mel(hparams.sample_rate, hparams.n_fft, n_mels=hparams.num_mels, 171 | fmin=hparams.fmin, fmax=hparams.fmax) 172 | 173 | def _amp_to_db(x, hparams): 174 | min_level = np.exp(hparams.min_level_db / 20 * np.log(10)) 175 | return 20 * np.log10(np.maximum(min_level, x)) 176 | 177 | def _db_to_amp(x): 178 | return np.power(10.0, (x) * 0.05) 179 | 180 | def _normalize(S, hparams): 181 | if hparams.allow_clipping_in_normalization: 182 | if hparams.symmetric_mels: 183 | return np.clip((2 * hparams.max_abs_value) * 
((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value, 184 | -hparams.max_abs_value, hparams.max_abs_value) 185 | else: 186 | return np.clip(hparams.max_abs_value * ((S - hparams.min_level_db) / (-hparams.min_level_db)), 0, hparams.max_abs_value) 187 | 188 | assert S.max() <= 0 and S.min() - hparams.min_level_db >= 0 189 | if hparams.symmetric_mels: 190 | return (2 * hparams.max_abs_value) * ((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value 191 | else: 192 | return hparams.max_abs_value * ((S - hparams.min_level_db) / (-hparams.min_level_db)) 193 | 194 | def _denormalize(D, hparams): 195 | if hparams.allow_clipping_in_normalization: 196 | if hparams.symmetric_mels: 197 | return (((np.clip(D, -hparams.max_abs_value, 198 | hparams.max_abs_value) + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value)) 199 | + hparams.min_level_db) 200 | else: 201 | return ((np.clip(D, 0, hparams.max_abs_value) * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db) 202 | 203 | if hparams.symmetric_mels: 204 | return (((D + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value)) + hparams.min_level_db) 205 | else: 206 | return ((D * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db) 207 | -------------------------------------------------------------------------------- /synthesizer/inference.py: -------------------------------------------------------------------------------- 1 | from synthesizer.tacotron2 import Tacotron2 2 | from synthesizer.hparams import hparams 3 | from multiprocess.pool import Pool # You're free to use either one 4 | #from multiprocessing import Pool # 5 | from synthesizer import audio 6 | from pathlib import Path 7 | from typing import Union, List 8 | import tensorflow as tf 9 | import numpy as np 10 | import numba.cuda 11 | import librosa 12 | 13 | 14 | class Synthesizer: 15 | sample_rate = hparams.sample_rate 16 | hparams = hparams 17 | 18 | def __init__(self, checkpoints_dir: Path, verbose=True, low_mem=False): 19 | """ 20 | Creates a synthesizer ready for inference. The actual model isn't loaded in memory until 21 | needed or until load() is called. 22 | 23 | :param checkpoints_dir: path to the directory containing the checkpoint file as well as the 24 | weight files (.data, .index and .meta files) 25 | :param verbose: if False, only tensorflow's output will be printed TODO: suppress them too 26 | :param low_mem: if True, the model will be loaded in a separate process and its resources 27 | will be released after each usage. Adds a large overhead, only recommended if your GPU 28 | memory is low (<= 2gb) 29 | """ 30 | self.verbose = verbose 31 | self._low_mem = low_mem 32 | 33 | # Prepare the model 34 | self._model = None # type: Tacotron2 35 | checkpoint_state = tf.train.get_checkpoint_state(checkpoints_dir) 36 | if checkpoint_state is None: 37 | raise Exception("Could not find any synthesizer weights under %s" % checkpoints_dir) 38 | self.checkpoint_fpath = checkpoint_state.model_checkpoint_path 39 | if verbose: 40 | model_name = checkpoints_dir.parent.name.replace("logs-", "") 41 | step = int(self.checkpoint_fpath[self.checkpoint_fpath.rfind('-') + 1:]) 42 | print("Found synthesizer \"%s\" trained to step %d" % (model_name, step)) 43 | 44 | def is_loaded(self): 45 | """ 46 | Whether the model is loaded in GPU memory. 
47 | """ 48 | return self._model is not None 49 | 50 | def load(self): 51 | """ 52 | Effectively loads the model to GPU memory given the weights file that was passed in the 53 | constructor. 54 | """ 55 | if self._low_mem: 56 | raise Exception("Cannot load the synthesizer permanently in low mem mode") 57 | tf.reset_default_graph() 58 | self._model = Tacotron2(self.checkpoint_fpath, hparams) 59 | 60 | def synthesize_spectrograms(self, texts: List[str], 61 | embeddings: Union[np.ndarray, List[np.ndarray]], 62 | return_alignments=False): 63 | """ 64 | Synthesizes mel spectrograms from texts and speaker embeddings. 65 | 66 | :param texts: a list of N text prompts to be synthesized 67 | :param embeddings: a numpy array or list of speaker embeddings of shape (N, 256) 68 | :param return_alignments: if True, a matrix representing the alignments between the 69 | characters 70 | and each decoder output step will be returned for each spectrogram 71 | :return: a list of N melspectrograms as numpy arrays of shape (80, Mi), where Mi is the 72 | sequence length of spectrogram i, and possibly the alignments. 73 | """ 74 | if not self._low_mem: 75 | # Usual inference mode: load the model on the first request and keep it loaded. 76 | if not self.is_loaded(): 77 | self.load() 78 | specs, alignments = self._model.my_synthesize(embeddings, texts) 79 | else: 80 | # Low memory inference mode: load the model upon every request. The model has to be 81 | # loaded in a separate process to be able to release GPU memory (a simple workaround 82 | # to tensorflow's intricacies) 83 | specs, alignments = Pool(1).starmap(Synthesizer._one_shot_synthesize_spectrograms, 84 | [(self.checkpoint_fpath, embeddings, texts)])[0] 85 | 86 | return (specs, alignments) if return_alignments else specs 87 | 88 | @staticmethod 89 | def _one_shot_synthesize_spectrograms(checkpoint_fpath, embeddings, texts): 90 | # Load the model and forward the inputs 91 | tf.reset_default_graph() 92 | model = Tacotron2(checkpoint_fpath, hparams) 93 | specs, alignments = model.my_synthesize(embeddings, texts) 94 | 95 | # Detach the outputs (not doing so will cause the process to hang) 96 | specs, alignments = [spec.copy() for spec in specs], alignments.copy() 97 | 98 | # Close cuda for this process 99 | model.session.close() 100 | numba.cuda.select_device(0) 101 | numba.cuda.close() 102 | 103 | return specs, alignments 104 | 105 | @staticmethod 106 | def load_preprocess_wav(fpath): 107 | """ 108 | Loads and preprocesses an audio file under the same conditions the audio files were used to 109 | train the synthesizer. 110 | """ 111 | wav = librosa.load(fpath, hparams.sample_rate)[0] 112 | if hparams.rescale: 113 | wav = wav / np.abs(wav).max() * hparams.rescaling_max 114 | return wav 115 | 116 | @staticmethod 117 | def make_spectrogram(fpath_or_wav: Union[str, Path, np.ndarray]): 118 | """ 119 | Creates a mel spectrogram from an audio file in the same manner as the mel spectrograms that 120 | were fed to the synthesizer when training. 121 | """ 122 | if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path): 123 | wav = Synthesizer.load_preprocess_wav(fpath_or_wav) 124 | else: 125 | wav = fpath_or_wav 126 | 127 | mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32) 128 | return mel_spectrogram 129 | 130 | @staticmethod 131 | def griffin_lim(mel): 132 | """ 133 | Inverts a mel spectrogram using Griffin-Lim. The mel spectrogram is expected to have been built 134 | with the same parameters present in hparams.py. 
135 | """ 136 | return audio.inv_mel_spectrogram(mel, hparams) 137 | -------------------------------------------------------------------------------- /synthesizer/infolog.py: -------------------------------------------------------------------------------- 1 | import atexit 2 | import json 3 | from datetime import datetime 4 | from threading import Thread 5 | from urllib.request import Request, urlopen 6 | 7 | _format = "%Y-%m-%d %H:%M:%S.%f" 8 | _file = None 9 | _run_name = None 10 | _slack_url = None 11 | 12 | 13 | def init(filename, run_name, slack_url=None): 14 | global _file, _run_name, _slack_url 15 | _close_logfile() 16 | _file = open(filename, "a") 17 | _file = open(filename, "a") 18 | _file.write("\n-----------------------------------------------------------------\n") 19 | _file.write("Starting new {} training run\n".format(run_name)) 20 | _file.write("-----------------------------------------------------------------\n") 21 | _run_name = run_name 22 | _slack_url = slack_url 23 | 24 | 25 | def log(msg, end="\n", slack=False): 26 | print(msg, end=end) 27 | if _file is not None: 28 | _file.write("[%s] %s\n" % (datetime.now().strftime(_format)[:-3], msg)) 29 | if slack and _slack_url is not None: 30 | Thread(target=_send_slack, args=(msg,)).start() 31 | 32 | 33 | def _close_logfile(): 34 | global _file 35 | if _file is not None: 36 | _file.close() 37 | _file = None 38 | 39 | 40 | def _send_slack(msg): 41 | req = Request(_slack_url) 42 | req.add_header("Content-Type", "application/json") 43 | urlopen(req, json.dumps({ 44 | "username": "tacotron", 45 | "icon_emoji": ":taco:", 46 | "text": "*%s*: %s" % (_run_name, msg) 47 | }).encode()) 48 | 49 | 50 | atexit.register(_close_logfile) 51 | -------------------------------------------------------------------------------- /synthesizer/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .tacotron import Tacotron 2 | 3 | 4 | def create_model(name, hparams): 5 | if name == "Tacotron": 6 | return Tacotron(hparams) 7 | else: 8 | raise Exception("Unknown model: " + name) 9 | -------------------------------------------------------------------------------- /synthesizer/models/architecture_wrappers.py: -------------------------------------------------------------------------------- 1 | """A set of wrappers useful for tacotron 2 architecture 2 | All notations and variable names were used in concordance with originial tensorflow implementation 3 | """ 4 | import collections 5 | import tensorflow as tf 6 | from synthesizer.models.attention import _compute_attention 7 | from tensorflow.contrib.rnn import RNNCell 8 | from tensorflow.python.framework import ops, tensor_shape 9 | from tensorflow.python.ops import array_ops, check_ops, rnn_cell_impl, tensor_array_ops 10 | from tensorflow.python.util import nest 11 | 12 | _zero_state_tensors = rnn_cell_impl._zero_state_tensors 13 | 14 | 15 | 16 | class TacotronEncoderCell(RNNCell): 17 | """Tacotron 2 Encoder Cell 18 | Passes inputs through a stack of convolutional layers then through a bidirectional LSTM 19 | layer to predict the hidden representation vector (or memory) 20 | """ 21 | 22 | def __init__(self, convolutional_layers, lstm_layer): 23 | """Initialize encoder parameters 24 | 25 | Args: 26 | convolutional_layers: Encoder convolutional block class 27 | lstm_layer: encoder bidirectional lstm layer class 28 | """ 29 | super(TacotronEncoderCell, self).__init__() 30 | #Initialize encoder layers 31 | self._convolutions = convolutional_layers 32 | 
self._cell = lstm_layer 33 | 34 | def __call__(self, inputs, input_lengths=None): 35 | #Pass input sequence through a stack of convolutional layers 36 | conv_output = self._convolutions(inputs) 37 | 38 | #Extract hidden representation from encoder lstm cells 39 | hidden_representation = self._cell(conv_output, input_lengths) 40 | 41 | #For shape visualization 42 | self.conv_output_shape = conv_output.shape 43 | return hidden_representation 44 | 45 | 46 | class TacotronDecoderCellState( 47 | collections.namedtuple("TacotronDecoderCellState", 48 | ("cell_state", "attention", "time", "alignments", 49 | "alignment_history"))): 50 | """`namedtuple` storing the state of a `TacotronDecoderCell`. 51 | Contains: 52 | - `cell_state`: The state of the wrapped `RNNCell` at the previous time 53 | step. 54 | - `attention`: The attention emitted at the previous time step. 55 | - `time`: int32 scalar containing the current time step. 56 | - `alignments`: A single or tuple of `Tensor`(s) containing the alignments 57 | emitted at the previous time step for each attention mechanism. 58 | - `alignment_history`: a single or tuple of `TensorArray`(s) 59 | containing alignment matrices from all time steps for each attention 60 | mechanism. Call `stack()` on each to convert to a `Tensor`. 61 | """ 62 | def replace(self, **kwargs): 63 | """Clones the current state while overwriting components provided by kwargs. 64 | """ 65 | return super(TacotronDecoderCellState, self)._replace(**kwargs) 66 | 67 | class TacotronDecoderCell(RNNCell): 68 | """Tactron 2 Decoder Cell 69 | Decodes encoder output and previous mel frames into next r frames 70 | 71 | Decoder Step i: 72 | 1) Prenet to compress last output information 73 | 2) Concat compressed inputs with previous context vector (input feeding) * 74 | 3) Decoder RNN (actual decoding) to predict current state s_{i} * 75 | 4) Compute new context vector c_{i} based on s_{i} and a cumulative sum of previous alignments * 76 | 5) Predict new output y_{i} using s_{i} and c_{i} (concatenated) 77 | 6) Predict output ys_{i} using s_{i} and c_{i} (concatenated) 78 | 79 | * : This is typically taking a vanilla LSTM, wrapping it using tensorflow"s attention wrapper, 80 | and wrap that with the prenet before doing an input feeding, and with the prediction layer 81 | that uses RNN states to project on output space. Actions marked with (*) can be replaced with 82 | tensorflow"s attention wrapper call if it was using cumulative alignments instead of previous alignments only. 
83 | """ 84 | 85 | def __init__(self, prenet, attention_mechanism, rnn_cell, frame_projection, stop_projection): 86 | """Initialize decoder parameters 87 | 88 | Args: 89 | prenet: A tensorflow fully connected layer acting as the decoder pre-net 90 | attention_mechanism: A _BaseAttentionMechanism instance, usefull to 91 | learn encoder-decoder alignments 92 | rnn_cell: Instance of RNNCell, main body of the decoder 93 | frame_projection: tensorflow fully connected layer with r * num_mels output units 94 | stop_projection: tensorflow fully connected layer, expected to project to a scalar 95 | and through a sigmoid activation 96 | mask_finished: Boolean, Whether to mask decoder frames after the 97 | """ 98 | super(TacotronDecoderCell, self).__init__() 99 | #Initialize decoder layers 100 | self._prenet = prenet 101 | self._attention_mechanism = attention_mechanism 102 | self._cell = rnn_cell 103 | self._frame_projection = frame_projection 104 | self._stop_projection = stop_projection 105 | 106 | self._attention_layer_size = self._attention_mechanism.values.get_shape()[-1].value 107 | 108 | def _batch_size_checks(self, batch_size, error_message): 109 | return [check_ops.assert_equal(batch_size, 110 | self._attention_mechanism.batch_size, 111 | message=error_message)] 112 | 113 | @property 114 | def output_size(self): 115 | return self._frame_projection.shape 116 | 117 | @property 118 | def state_size(self): 119 | """The `state_size` property of `TacotronDecoderCell`. 120 | 121 | Returns: 122 | An `TacotronDecoderCell` tuple containing shapes used by this object. 123 | """ 124 | return TacotronDecoderCellState( 125 | cell_state=self._cell._cell.state_size, 126 | time=tensor_shape.TensorShape([]), 127 | attention=self._attention_layer_size, 128 | alignments=self._attention_mechanism.alignments_size, 129 | alignment_history=()) 130 | 131 | def zero_state(self, batch_size, dtype): 132 | """Return an initial (zero) state tuple for this `AttentionWrapper`. 133 | 134 | Args: 135 | batch_size: `0D` integer tensor: the batch size. 136 | dtype: The internal state data type. 137 | Returns: 138 | An `TacotronDecoderCellState` tuple containing zeroed out tensors and, 139 | possibly, empty `TensorArray` objects. 140 | Raises: 141 | ValueError: (or, possibly at runtime, InvalidArgument), if 142 | `batch_size` does not match the output size of the encoder passed 143 | to the wrapper object at initialization time. 
144 | """ 145 | with ops.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]): 146 | cell_state = self._cell._cell.zero_state(batch_size, dtype) 147 | error_message = ( 148 | "When calling zero_state of TacotronDecoderCell %s: " % self._base_name + 149 | "Non-matching batch sizes between the memory " 150 | "(encoder output) and the requested batch size.") 151 | with ops.control_dependencies( 152 | self._batch_size_checks(batch_size, error_message)): 153 | cell_state = nest.map_structure( 154 | lambda s: array_ops.identity(s, name="checked_cell_state"), 155 | cell_state) 156 | return TacotronDecoderCellState( 157 | cell_state=cell_state, 158 | time=array_ops.zeros([], dtype=tf.int32), 159 | attention=_zero_state_tensors(self._attention_layer_size, batch_size, 160 | dtype), 161 | alignments=self._attention_mechanism.initial_alignments(batch_size, dtype), 162 | alignment_history=tensor_array_ops.TensorArray(dtype=dtype, size=0, 163 | dynamic_size=True)) 164 | 165 | def __call__(self, inputs, state): 166 | #Information bottleneck (essential for learning attention) 167 | prenet_output = self._prenet(inputs) 168 | 169 | #Concat context vector and prenet output to form LSTM cells input (input feeding) 170 | LSTM_input = tf.concat([prenet_output, state.attention], axis=-1) 171 | 172 | #Unidirectional LSTM layers 173 | LSTM_output, next_cell_state = self._cell(LSTM_input, state.cell_state) 174 | 175 | 176 | #Compute the attention (context) vector and alignments using 177 | #the new decoder cell hidden state as query vector 178 | #and cumulative alignments to extract location features 179 | #The choice of the new cell hidden state (s_{i}) of the last 180 | #decoder RNN Cell is based on Luong et Al. (2015): 181 | #https://arxiv.org/pdf/1508.04025.pdf 182 | previous_alignments = state.alignments 183 | previous_alignment_history = state.alignment_history 184 | context_vector, alignments, cumulated_alignments = _compute_attention(self._attention_mechanism, 185 | LSTM_output, 186 | previous_alignments, 187 | attention_layer=None) 188 | 189 | #Concat LSTM outputs and context vector to form projections inputs 190 | projections_input = tf.concat([LSTM_output, context_vector], axis=-1) 191 | 192 | #Compute predicted frames and predicted 193 | cell_outputs = self._frame_projection(projections_input) 194 | stop_tokens = self._stop_projection(projections_input) 195 | 196 | #Save alignment history 197 | alignment_history = previous_alignment_history.write(state.time, alignments) 198 | 199 | #Prepare next decoder state 200 | next_state = TacotronDecoderCellState( 201 | time=state.time + 1, 202 | cell_state=next_cell_state, 203 | attention=context_vector, 204 | alignments=cumulated_alignments, 205 | alignment_history=alignment_history) 206 | 207 | return (cell_outputs, stop_tokens), next_state 208 | -------------------------------------------------------------------------------- /synthesizer/models/attention.py: -------------------------------------------------------------------------------- 1 | """Attention file for location based attention (compatible with tensorflow attention wrapper)""" 2 | 3 | import tensorflow as tf 4 | from tensorflow.contrib.seq2seq.python.ops.attention_wrapper import BahdanauAttention 5 | from tensorflow.python.layers import core as layers_core 6 | from tensorflow.python.ops import array_ops, math_ops, nn_ops, variable_scope 7 | 8 | 9 | #From https://github.com/tensorflow/tensorflow/blob/r1.7/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py 10 | def 
_compute_attention(attention_mechanism, cell_output, attention_state, 11 | attention_layer): 12 | """Computes the attention and alignments for a given attention_mechanism.""" 13 | alignments, next_attention_state = attention_mechanism( 14 | cell_output, state=attention_state) 15 | 16 | # Reshape from [batch_size, memory_time] to [batch_size, 1, memory_time] 17 | expanded_alignments = array_ops.expand_dims(alignments, 1) 18 | # Context is the inner product of alignments and values along the 19 | # memory time dimension. 20 | # alignments shape is 21 | # [batch_size, 1, memory_time] 22 | # attention_mechanism.values shape is 23 | # [batch_size, memory_time, memory_size] 24 | # the batched matmul is over memory_time, so the output shape is 25 | # [batch_size, 1, memory_size]. 26 | # we then squeeze out the singleton dim. 27 | context = math_ops.matmul(expanded_alignments, attention_mechanism.values) 28 | context = array_ops.squeeze(context, [1]) 29 | 30 | if attention_layer is not None: 31 | attention = attention_layer(array_ops.concat([cell_output, context], 1)) 32 | else: 33 | attention = context 34 | 35 | return attention, alignments, next_attention_state 36 | 37 | 38 | def _location_sensitive_score(W_query, W_fil, W_keys): 39 | """Impelements Bahdanau-style (cumulative) scoring function. 40 | This attention is described in: 41 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben- 42 | gio, “Attention-based models for speech recognition,” in Ad- 43 | vances in Neural Information Processing Systems, 2015, pp. 44 | 577–585. 45 | 46 | ############################################################################# 47 | hybrid attention (content-based + location-based) 48 | f = F * α_{i-1} 49 | energy = dot(v_a, tanh(W_keys(h_enc) + W_query(h_dec) + W_fil(f) + b_a)) 50 | ############################################################################# 51 | 52 | Args: 53 | W_query: Tensor, shape "[batch_size, 1, attention_dim]" to compare to location features. 54 | W_location: processed previous alignments into location features, shape "[batch_size, max_time, attention_dim]" 55 | W_keys: Tensor, shape "[batch_size, max_time, attention_dim]", typically the encoder outputs. 56 | Returns: 57 | A "[batch_size, max_time]" attention score (energy) 58 | """ 59 | # Get the number of hidden units from the trailing dimension of keys 60 | dtype = W_query.dtype 61 | num_units = W_keys.shape[-1].value or array_ops.shape(W_keys)[-1] 62 | 63 | v_a = tf.get_variable( 64 | "attention_variable_projection", shape=[num_units], dtype=dtype, 65 | initializer=tf.contrib.layers.xavier_initializer()) 66 | b_a = tf.get_variable( 67 | "attention_bias", shape=[num_units], dtype=dtype, 68 | initializer=tf.zeros_initializer()) 69 | 70 | return tf.reduce_sum(v_a * tf.tanh(W_keys + W_query + W_fil + b_a), [2]) 71 | 72 | def _smoothing_normalization(e): 73 | """Applies a smoothing normalization function instead of softmax 74 | Introduced in: 75 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben- 76 | gio, “Attention-based models for speech recognition,” in Ad- 77 | vances in Neural Information Processing Systems, 2015, pp. 78 | 577–585. 
79 | 80 | ############################################################################ 81 | Smoothing normalization function 82 | a_{i, j} = sigmoid(e_{i, j}) / sum_j(sigmoid(e_{i, j})) 83 | ############################################################################ 84 | 85 | Args: 86 | e: matrix [batch_size, max_time(memory_time)]: expected to be energy (score) 87 | values of an attention mechanism 88 | Returns: 89 | matrix [batch_size, max_time]: [0, 1] normalized alignments with possible 90 | attendance to multiple memory time steps. 91 | """ 92 | return tf.nn.sigmoid(e) / tf.reduce_sum(tf.nn.sigmoid(e), axis=-1, keepdims=True) 93 | 94 | 95 | class LocationSensitiveAttention(BahdanauAttention): 96 | """Impelements Bahdanau-style (cumulative) scoring function. 97 | Usually referred to as "hybrid" attention (content-based + location-based) 98 | Extends the additive attention described in: 99 | "D. Bahdanau, K. Cho, and Y. Bengio, “Neural machine transla- 100 | tion by jointly learning to align and translate,” in Proceedings 101 | of ICLR, 2015." 102 | to use previous alignments as additional location features. 103 | 104 | This attention is described in: 105 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben- 106 | gio, “Attention-based models for speech recognition,” in Ad- 107 | vances in Neural Information Processing Systems, 2015, pp. 108 | 577–585. 109 | """ 110 | 111 | def __init__(self, 112 | num_units, 113 | memory, 114 | hparams, 115 | mask_encoder=True, 116 | memory_sequence_length=None, 117 | smoothing=False, 118 | cumulate_weights=True, 119 | name="LocationSensitiveAttention"): 120 | """Construct the Attention mechanism. 121 | Args: 122 | num_units: The depth of the query mechanism. 123 | memory: The memory to query; usually the output of an RNN encoder. This 124 | tensor should be shaped `[batch_size, max_time, ...]`. 125 | mask_encoder (optional): Boolean, whether to mask encoder paddings. 126 | memory_sequence_length (optional): Sequence lengths for the batch entries 127 | in memory. If provided, the memory tensor rows are masked with zeros 128 | for values past the respective sequence lengths. Only relevant if mask_encoder = True. 129 | smoothing (optional): Boolean. Determines which normalization function to use. 130 | Default normalization function (probablity_fn) is softmax. If smoothing is 131 | enabled, we replace softmax with: 132 | a_{i, j} = sigmoid(e_{i, j}) / sum_j(sigmoid(e_{i, j})) 133 | Introduced in: 134 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben- 135 | gio, “Attention-based models for speech recognition,” in Ad- 136 | vances in Neural Information Processing Systems, 2015, pp. 137 | 577–585. 138 | This is mainly used if the model wants to attend to multiple input parts 139 | at the same decoding step. We probably won"t be using it since multiple sound 140 | frames may depend on the same character/phone, probably not the way around. 141 | Note: 142 | We still keep it implemented in case we want to test it. They used it in the 143 | paper in the context of speech recognition, where one phoneme may depend on 144 | multiple subsequent sound frames. 145 | name: Name to use when creating ops. 
146 | """ 147 | #Create normalization function 148 | #Setting it to None defaults in using softmax 149 | normalization_function = _smoothing_normalization if (smoothing == True) else None 150 | memory_length = memory_sequence_length if (mask_encoder==True) else None 151 | super(LocationSensitiveAttention, self).__init__( 152 | num_units=num_units, 153 | memory=memory, 154 | memory_sequence_length=memory_length, 155 | probability_fn=normalization_function, 156 | name=name) 157 | 158 | self.location_convolution = tf.layers.Conv1D(filters=hparams.attention_filters, 159 | kernel_size=hparams.attention_kernel, padding="same", use_bias=True, 160 | bias_initializer=tf.zeros_initializer(), name="location_features_convolution") 161 | self.location_layer = tf.layers.Dense(units=num_units, use_bias=False, 162 | dtype=tf.float32, name="location_features_layer") 163 | self._cumulate = cumulate_weights 164 | 165 | def __call__(self, query, state): 166 | """Score the query based on the keys and values. 167 | Args: 168 | query: Tensor of dtype matching `self.values` and shape 169 | `[batch_size, query_depth]`. 170 | state (previous alignments): Tensor of dtype matching `self.values` and shape 171 | `[batch_size, alignments_size]` 172 | (`alignments_size` is memory"s `max_time`). 173 | Returns: 174 | alignments: Tensor of dtype matching `self.values` and shape 175 | `[batch_size, alignments_size]` (`alignments_size` is memory's 176 | `max_time`). 177 | """ 178 | previous_alignments = state 179 | with variable_scope.variable_scope(None, "Location_Sensitive_Attention", [query]): 180 | 181 | # processed_query shape [batch_size, query_depth] -> [batch_size, attention_dim] 182 | processed_query = self.query_layer(query) if self.query_layer else query 183 | # -> [batch_size, 1, attention_dim] 184 | processed_query = tf.expand_dims(processed_query, 1) 185 | 186 | # processed_location_features shape [batch_size, max_time, attention dimension] 187 | # [batch_size, max_time] -> [batch_size, max_time, 1] 188 | expanded_alignments = tf.expand_dims(previous_alignments, axis=2) 189 | # location features [batch_size, max_time, filters] 190 | f = self.location_convolution(expanded_alignments) 191 | # Projected location features [batch_size, max_time, attention_dim] 192 | processed_location_features = self.location_layer(f) 193 | 194 | # energy shape [batch_size, max_time] 195 | energy = _location_sensitive_score(processed_query, processed_location_features, self.keys) 196 | 197 | 198 | # alignments shape = energy shape = [batch_size, max_time] 199 | alignments = self._probability_fn(energy, previous_alignments) 200 | 201 | # Cumulate alignments 202 | if self._cumulate: 203 | next_state = alignments + previous_alignments 204 | else: 205 | next_state = alignments 206 | 207 | return alignments, next_state 208 | -------------------------------------------------------------------------------- /synthesizer/models/custom_decoder.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | import collections 3 | import tensorflow as tf 4 | from synthesizer.models.helpers import TacoTestHelper, TacoTrainingHelper 5 | from tensorflow.contrib.seq2seq.python.ops import decoder 6 | from tensorflow.contrib.seq2seq.python.ops import helper as helper_py 7 | from tensorflow.python.framework import ops, tensor_shape 8 | from tensorflow.python.layers import base as layers_base 9 | from tensorflow.python.ops import rnn_cell_impl 10 | from 
tensorflow.python.util import nest 11 | 12 | 13 | class CustomDecoderOutput( 14 | collections.namedtuple("CustomDecoderOutput", ("rnn_output", "token_output", "sample_id"))): 15 | pass 16 | 17 | 18 | class CustomDecoder(decoder.Decoder): 19 | """Custom sampling decoder. 20 | 21 | Allows for stop token prediction at inference time 22 | and returns equivalent loss in training time. 23 | 24 | Note: 25 | Only use this decoder with Tacotron 2 as it only accepts tacotron custom helpers 26 | """ 27 | 28 | def __init__(self, cell, helper, initial_state, output_layer=None): 29 | """Initialize CustomDecoder. 30 | Args: 31 | cell: An `RNNCell` instance. 32 | helper: A `Helper` instance. 33 | initial_state: A (possibly nested tuple of...) tensors and TensorArrays. 34 | The initial state of the RNNCell. 35 | output_layer: (Optional) An instance of `tf.layers.Layer`, i.e., 36 | `tf.layers.Dense`. Optional layer to apply to the RNN output prior 37 | to storing the result or sampling. 38 | Raises: 39 | TypeError: if `cell`, `helper` or `output_layer` have an incorrect type. 40 | """ 41 | rnn_cell_impl.assert_like_rnncell(type(cell), cell) 42 | if not isinstance(helper, helper_py.Helper): 43 | raise TypeError("helper must be a Helper, received: %s" % type(helper)) 44 | if (output_layer is not None 45 | and not isinstance(output_layer, layers_base.Layer)): 46 | raise TypeError( 47 | "output_layer must be a Layer, received: %s" % type(output_layer)) 48 | self._cell = cell 49 | self._helper = helper 50 | self._initial_state = initial_state 51 | self._output_layer = output_layer 52 | 53 | @property 54 | def batch_size(self): 55 | return self._helper.batch_size 56 | 57 | def _rnn_output_size(self): 58 | size = self._cell.output_size 59 | if self._output_layer is None: 60 | return size 61 | else: 62 | # To use layer"s compute_output_shape, we need to convert the 63 | # RNNCell"s output_size entries into shapes with an unknown 64 | # batch size. We then pass this through the layer"s 65 | # compute_output_shape and read off all but the first (batch) 66 | # dimensions to get the output size of the rnn with the layer 67 | # applied to the top. 68 | output_shape_with_unknown_batch = nest.map_structure( 69 | lambda s: tensor_shape.TensorShape([None]).concatenate(s), 70 | size) 71 | layer_output_shape = self._output_layer._compute_output_shape( # pylint: disable=protected-access 72 | output_shape_with_unknown_batch) 73 | return nest.map_structure(lambda s: s[1:], layer_output_shape) 74 | 75 | @property 76 | def output_size(self): 77 | # Return the cell output and the id 78 | return CustomDecoderOutput( 79 | rnn_output=self._rnn_output_size(), 80 | token_output=self._helper.token_output_size, 81 | sample_id=self._helper.sample_ids_shape) 82 | 83 | @property 84 | def output_dtype(self): 85 | # Assume the dtype of the cell is the output_size structure 86 | # containing the input_state"s first component's dtype. 87 | # Return that structure and the sample_ids_dtype from the helper. 88 | dtype = nest.flatten(self._initial_state)[0].dtype 89 | return CustomDecoderOutput( 90 | nest.map_structure(lambda _: dtype, self._rnn_output_size()), 91 | tf.float32, 92 | self._helper.sample_ids_dtype) 93 | 94 | def initialize(self, name=None): 95 | """Initialize the decoder. 96 | Args: 97 | name: Name scope for any created operations. 98 | Returns: 99 | `(finished, first_inputs, initial_state)`. 
100 | """ 101 | return self._helper.initialize() + (self._initial_state,) 102 | 103 | def step(self, time, inputs, state, name=None): 104 | """Perform a custom decoding step. 105 | Enables for dyanmic prediction 106 | Args: 107 | time: scalar `int32` tensor. 108 | inputs: A (structure of) input tensors. 109 | state: A (structure of) state tensors and TensorArrays. 110 | name: Name scope for any created operations. 111 | Returns: 112 | `(outputs, next_state, next_inputs, finished)`. 113 | """ 114 | with ops.name_scope(name, "CustomDecoderStep", (time, inputs, state)): 115 | #Call outputprojection wrapper cell 116 | (cell_outputs, stop_token), cell_state = self._cell(inputs, state) 117 | 118 | #apply output_layer (if existant) 119 | if self._output_layer is not None: 120 | cell_outputs = self._output_layer(cell_outputs) 121 | sample_ids = self._helper.sample( 122 | time=time, outputs=cell_outputs, state=cell_state) 123 | 124 | (finished, next_inputs, next_state) = self._helper.next_inputs( 125 | time=time, 126 | outputs=cell_outputs, 127 | state=cell_state, 128 | sample_ids=sample_ids, 129 | stop_token_prediction=stop_token) 130 | 131 | outputs = CustomDecoderOutput(cell_outputs, stop_token, sample_ids) 132 | return (outputs, next_state, next_inputs, finished) 133 | -------------------------------------------------------------------------------- /synthesizer/models/helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from tensorflow.contrib.seq2seq import Helper 4 | 5 | 6 | class TacoTestHelper(Helper): 7 | def __init__(self, batch_size, hparams): 8 | with tf.name_scope("TacoTestHelper"): 9 | self._batch_size = batch_size 10 | self._output_dim = hparams.num_mels 11 | self._reduction_factor = hparams.outputs_per_step 12 | self.stop_at_any = hparams.stop_at_any 13 | 14 | @property 15 | def batch_size(self): 16 | return self._batch_size 17 | 18 | @property 19 | def token_output_size(self): 20 | return self._reduction_factor 21 | 22 | @property 23 | def sample_ids_shape(self): 24 | return tf.TensorShape([]) 25 | 26 | @property 27 | def sample_ids_dtype(self): 28 | return np.int32 29 | 30 | def initialize(self, name=None): 31 | return (tf.tile([False], [self._batch_size]), _go_frames(self._batch_size, self._output_dim)) 32 | 33 | def sample(self, time, outputs, state, name=None): 34 | return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them 35 | 36 | def next_inputs(self, time, outputs, state, sample_ids, stop_token_prediction, name=None): 37 | """Stop on EOS. Otherwise, pass the last output as the next input and pass through state.""" 38 | with tf.name_scope("TacoTestHelper"): 39 | #A sequence is finished when the output probability is > 0.5 40 | finished = tf.cast(tf.round(stop_token_prediction), tf.bool) 41 | 42 | #Since we are predicting r frames at each step, two modes are 43 | #then possible: 44 | # Stop when the model outputs a p > 0.5 for any frame between r frames (Recommended) 45 | # Stop when the model outputs a p > 0.5 for all r frames (Safer) 46 | #Note: 47 | # With enough training steps, the model should be able to predict when to stop correctly 48 | # and the use of stop_at_any = True would be recommended. 
If however the model didn"t 49 | # learn to stop correctly yet, (stops too soon) one could choose to use the safer option 50 | # to get a correct synthesis 51 | if self.stop_at_any: 52 | finished = tf.reduce_any(tf.reduce_all(finished, axis=0)) #Recommended 53 | else: 54 | finished = tf.reduce_all(tf.reduce_all(finished, axis=0)) #Safer option 55 | 56 | # Feed last output frame as next input. outputs is [N, output_dim * r] 57 | next_inputs = outputs[:, -self._output_dim:] 58 | next_state = state 59 | return (finished, next_inputs, next_state) 60 | 61 | 62 | class TacoTrainingHelper(Helper): 63 | def __init__(self, batch_size, targets, hparams, gta, evaluating, global_step): 64 | # inputs is [N, T_in], targets is [N, T_out, D] 65 | with tf.name_scope("TacoTrainingHelper"): 66 | self._batch_size = batch_size 67 | self._output_dim = hparams.num_mels 68 | self._reduction_factor = hparams.outputs_per_step 69 | self._ratio = tf.convert_to_tensor(hparams.tacotron_teacher_forcing_ratio) 70 | self.gta = gta 71 | self.eval = evaluating 72 | self._hparams = hparams 73 | self.global_step = global_step 74 | 75 | r = self._reduction_factor 76 | # Feed every r-th target frame as input 77 | self._targets = targets[:, r-1::r, :] 78 | 79 | #Maximal sequence length 80 | self._lengths = tf.tile([tf.shape(self._targets)[1]], [self._batch_size]) 81 | 82 | @property 83 | def batch_size(self): 84 | return self._batch_size 85 | 86 | @property 87 | def token_output_size(self): 88 | return self._reduction_factor 89 | 90 | @property 91 | def sample_ids_shape(self): 92 | return tf.TensorShape([]) 93 | 94 | @property 95 | def sample_ids_dtype(self): 96 | return np.int32 97 | 98 | def initialize(self, name=None): 99 | #Compute teacher forcing ratio for this global step. 100 | #In GTA mode, override teacher forcing scheme to work with full teacher forcing 101 | if self.gta: 102 | self._ratio = tf.convert_to_tensor(1.) #Force GTA model to always feed ground-truth 103 | elif self.eval and self._hparams.natural_eval: 104 | self._ratio = tf.convert_to_tensor(0.) 
#Force eval model to always feed predictions 105 | else: 106 | if self._hparams.tacotron_teacher_forcing_mode == "scheduled": 107 | self._ratio = _teacher_forcing_ratio_decay(self._hparams.tacotron_teacher_forcing_init_ratio, 108 | self.global_step, self._hparams) 109 | 110 | return (tf.tile([False], [self._batch_size]), _go_frames(self._batch_size, self._output_dim)) 111 | 112 | def sample(self, time, outputs, state, name=None): 113 | return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them 114 | 115 | def next_inputs(self, time, outputs, state, sample_ids, stop_token_prediction, name=None): 116 | with tf.name_scope(name or "TacoTrainingHelper"): 117 | #synthesis stop (we let the model see paddings as we mask them when computing loss functions) 118 | finished = (time + 1 >= self._lengths) 119 | 120 | #Pick previous outputs randomly with respect to teacher forcing ratio 121 | next_inputs = tf.cond( 122 | tf.less(tf.random_uniform([], minval=0, maxval=1, dtype=tf.float32), self._ratio), 123 | lambda: self._targets[:, time, :], #Teacher-forcing: return true frame 124 | lambda: outputs[:,-self._output_dim:]) 125 | 126 | #Pass on state 127 | next_state = state 128 | return (finished, next_inputs, next_state) 129 | 130 | 131 | def _go_frames(batch_size, output_dim): 132 | """Returns all-zero frames for a given batch size and output dimension""" 133 | return tf.tile([[0.0]], [batch_size, output_dim]) 134 | 135 | def _teacher_forcing_ratio_decay(init_tfr, global_step, hparams): 136 | ################################################################# 137 | # Narrow Cosine Decay: 138 | 139 | # Phase 1: tfr = 1 140 | # We only start learning rate decay after 10k steps 141 | 142 | # Phase 2: tfr in ]0, 1[ 143 | # decay reach minimal value at step ~280k 144 | 145 | # Phase 3: tfr = 0 146 | # clip by minimal teacher forcing ratio value (step >~ 280k) 147 | ################################################################# 148 | #Compute natural cosine decay 149 | tfr = tf.train.cosine_decay(init_tfr, 150 | global_step=global_step - hparams.tacotron_teacher_forcing_start_decay, #tfr = 1 at step 10k 151 | decay_steps=hparams.tacotron_teacher_forcing_decay_steps, #tfr = 0 at step ~280k 152 | alpha=hparams.tacotron_teacher_forcing_decay_alpha, #tfr = 0% of init_tfr as final value 153 | name="tfr_cosine_decay") 154 | 155 | #force teacher forcing ratio to take initial value when global step < start decay step. 
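# A rough worked example of the schedule above (illustrative only, assuming the default
# hyperparameters tacotron_teacher_forcing_start_decay=10_000,
# tacotron_teacher_forcing_decay_steps=280_000 and tacotron_teacher_forcing_decay_alpha=0.):
#   global_step <= 10_000   -> tfr = init_tfr         (forced by the tf.cond below)
#   global_step = 150_000   -> tfr ~ 0.5 * init_tfr   (midpoint of the cosine)
#   global_step >= 290_000  -> tfr ~ 0.                (floor given by alpha)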
156 | narrow_tfr = tf.cond( 157 | tf.less(global_step, tf.convert_to_tensor(hparams.tacotron_teacher_forcing_start_decay)), 158 | lambda: tf.convert_to_tensor(init_tfr), 159 | lambda: tfr) 160 | 161 | return narrow_tfr -------------------------------------------------------------------------------- /synthesizer/preprocess.py: -------------------------------------------------------------------------------- 1 | from multiprocessing.pool import Pool 2 | from synthesizer import audio 3 | from functools import partial 4 | from itertools import chain 5 | from encoder import inference as encoder 6 | from pathlib import Path 7 | from utils import logmmse 8 | from tqdm import tqdm 9 | import numpy as np 10 | import librosa 11 | 12 | 13 | def preprocess_librispeech(datasets_root: Path, out_dir: Path, n_processes: int, 14 | skip_existing: bool, hparams): 15 | # Gather the input directories 16 | dataset_root = datasets_root.joinpath("LibriSpeech") 17 | input_dirs = [dataset_root.joinpath("train-clean-100"), 18 | dataset_root.joinpath("train-clean-360")] 19 | print("\n ".join(map(str, ["Using data from:"] + input_dirs))) 20 | assert all(input_dir.exists() for input_dir in input_dirs) 21 | 22 | # Create the output directories for each output file type 23 | out_dir.joinpath("mels").mkdir(exist_ok=True) 24 | out_dir.joinpath("audio").mkdir(exist_ok=True) 25 | 26 | # Create a metadata file 27 | metadata_fpath = out_dir.joinpath("train.txt") 28 | metadata_file = metadata_fpath.open("a" if skip_existing else "w", encoding="utf-8") 29 | 30 | # Preprocess the dataset 31 | speaker_dirs = list(chain.from_iterable(input_dir.glob("*") for input_dir in input_dirs)) 32 | func = partial(preprocess_speaker, out_dir=out_dir, skip_existing=skip_existing, 33 | hparams=hparams) 34 | job = Pool(n_processes).imap(func, speaker_dirs) 35 | for speaker_metadata in tqdm(job, "LibriSpeech", len(speaker_dirs), unit="speakers"): 36 | for metadatum in speaker_metadata: 37 | metadata_file.write("|".join(str(x) for x in metadatum) + "\n") 38 | metadata_file.close() 39 | 40 | # Verify the contents of the metadata file 41 | with metadata_fpath.open("r", encoding="utf-8") as metadata_file: 42 | metadata = [line.split("|") for line in metadata_file] 43 | mel_frames = sum([int(m[4]) for m in metadata]) 44 | timesteps = sum([int(m[3]) for m in metadata]) 45 | sample_rate = hparams.sample_rate 46 | hours = (timesteps / sample_rate) / 3600 47 | print("The dataset consists of %d utterances, %d mel frames, %d audio timesteps (%.2f hours)." 
% 48 | (len(metadata), mel_frames, timesteps, hours)) 49 | print("Max input length (text chars): %d" % max(len(m[5]) for m in metadata)) 50 | print("Max mel frames length: %d" % max(int(m[4]) for m in metadata)) 51 | print("Max audio timesteps length: %d" % max(int(m[3]) for m in metadata)) 52 | 53 | 54 | def preprocess_speaker(speaker_dir, out_dir: Path, skip_existing: bool, hparams): 55 | metadata = [] 56 | for book_dir in speaker_dir.glob("*"): 57 | # Gather the utterance audios and texts 58 | try: 59 | alignments_fpath = next(book_dir.glob("*.alignment.txt")) 60 | with alignments_fpath.open("r") as alignments_file: 61 | alignments = [line.rstrip().split(" ") for line in alignments_file] 62 | except StopIteration: 63 | # A few alignment files will be missing 64 | continue 65 | 66 | # Iterate over each entry in the alignments file 67 | for wav_fname, words, end_times in alignments: 68 | wav_fpath = book_dir.joinpath(wav_fname + ".flac") 69 | assert wav_fpath.exists() 70 | words = words.replace("\"", "").split(",") 71 | end_times = list(map(float, end_times.replace("\"", "").split(","))) 72 | 73 | # Process each sub-utterance 74 | wavs, texts = split_on_silences(wav_fpath, words, end_times, hparams) 75 | for i, (wav, text) in enumerate(zip(wavs, texts)): 76 | sub_basename = "%s_%02d" % (wav_fname, i) 77 | metadata.append(process_utterance(wav, text, out_dir, sub_basename, 78 | skip_existing, hparams)) 79 | 80 | return [m for m in metadata if m is not None] 81 | 82 | 83 | def split_on_silences(wav_fpath, words, end_times, hparams): 84 | # Load the audio waveform 85 | wav, _ = librosa.load(wav_fpath, hparams.sample_rate) 86 | if hparams.rescale: 87 | wav = wav / np.abs(wav).max() * hparams.rescaling_max 88 | 89 | words = np.array(words) 90 | start_times = np.array([0.0] + end_times[:-1]) 91 | end_times = np.array(end_times) 92 | assert len(words) == len(end_times) == len(start_times) 93 | assert words[0] == "" and words[-1] == "" 94 | 95 | # Find pauses that are too long 96 | mask = (words == "") & (end_times - start_times >= hparams.silence_min_duration_split) 97 | mask[0] = mask[-1] = True 98 | breaks = np.where(mask)[0] 99 | 100 | # Profile the noise from the silences and perform noise reduction on the waveform 101 | silence_times = [[start_times[i], end_times[i]] for i in breaks] 102 | silence_times = (np.array(silence_times) * hparams.sample_rate).astype(np.int) 103 | noisy_wav = np.concatenate([wav[stime[0]:stime[1]] for stime in silence_times]) 104 | if len(noisy_wav) > hparams.sample_rate * 0.02: 105 | profile = logmmse.profile_noise(noisy_wav, hparams.sample_rate) 106 | wav = logmmse.denoise(wav, profile, eta=0) 107 | 108 | # Re-attach segments that are too short 109 | segments = list(zip(breaks[:-1], breaks[1:])) 110 | segment_durations = [start_times[end] - end_times[start] for start, end in segments] 111 | i = 0 112 | while i < len(segments) and len(segments) > 1: 113 | if segment_durations[i] < hparams.utterance_min_duration: 114 | # See if the segment can be re-attached with the right or the left segment 115 | left_duration = float("inf") if i == 0 else segment_durations[i - 1] 116 | right_duration = float("inf") if i == len(segments) - 1 else segment_durations[i + 1] 117 | joined_duration = segment_durations[i] + min(left_duration, right_duration) 118 | 119 | # Do not re-attach if it causes the joined utterance to be too long 120 | if joined_duration > hparams.hop_size * hparams.max_mel_frames / hparams.sample_rate: 121 | i += 1 122 | continue 123 | 124 | # Re-attach the 
segment with the neighbour of shortest duration 125 | j = i - 1 if left_duration <= right_duration else i 126 | segments[j] = (segments[j][0], segments[j + 1][1]) 127 | segment_durations[j] = joined_duration 128 | del segments[j + 1], segment_durations[j + 1] 129 | else: 130 | i += 1 131 | 132 | # Split the utterance 133 | segment_times = [[end_times[start], start_times[end]] for start, end in segments] 134 | segment_times = (np.array(segment_times) * hparams.sample_rate).astype(np.int) 135 | wavs = [wav[segment_time[0]:segment_time[1]] for segment_time in segment_times] 136 | texts = [" ".join(words[start + 1:end]).replace(" ", " ") for start, end in segments] 137 | 138 | # # DEBUG: play the audio segments (run with -n=1) 139 | # import sounddevice as sd 140 | # if len(wavs) > 1: 141 | # print("This sentence was split in %d segments:" % len(wavs)) 142 | # else: 143 | # print("There are no silences long enough for this sentence to be split:") 144 | # for wav, text in zip(wavs, texts): 145 | # # Pad the waveform with 1 second of silence because sounddevice tends to cut them early 146 | # # when playing them. You shouldn't need to do that in your parsers. 147 | # wav = np.concatenate((wav, [0] * 16000)) 148 | # print("\t%s" % text) 149 | # sd.play(wav, 16000, blocking=True) 150 | # print("") 151 | 152 | return wavs, texts 153 | 154 | 155 | def process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str, 156 | skip_existing: bool, hparams): 157 | ## FOR REFERENCE: 158 | # For you not to lose your head if you ever wish to change things here or implement your own 159 | # synthesizer. 160 | # - Both the audios and the mel spectrograms are saved as numpy arrays 161 | # - There is no processing done to the audios that will be saved to disk beyond volume 162 | # normalization (in split_on_silences) 163 | # - However, pre-emphasis is applied to the audios before computing the mel spectrogram. This 164 | # is why we re-apply it on the audio on the side of the vocoder. 165 | # - Librosa pads the waveform before computing the mel spectrogram. Here, the waveform is saved 166 | # without extra padding. This means that you won't have an exact relation between the length 167 | # of the wav and of the mel spectrogram. See the vocoder data loader. 
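# - As a quick sketch of the resulting metadata layout (derived from the tuple returned at the
#   end of this function and from how preprocess_librispeech() reads train.txt back), each line
#   of train.txt is pipe-separated as:
#     audio filename | mel filename | embed filename | audio timesteps | mel frames | text
#   i.e. m[3] is len(wav), m[4] is the number of mel frames and m[5] is the utterance text.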
168 | 169 | 170 | # Skip existing utterances if needed 171 | mel_fpath = out_dir.joinpath("mels", "mel-%s.npy" % basename) 172 | wav_fpath = out_dir.joinpath("audio", "audio-%s.npy" % basename) 173 | if skip_existing and mel_fpath.exists() and wav_fpath.exists(): 174 | return None 175 | 176 | # Skip utterances that are too short 177 | if len(wav) < hparams.utterance_min_duration * hparams.sample_rate: 178 | return None 179 | 180 | # Compute the mel spectrogram 181 | mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32) 182 | mel_frames = mel_spectrogram.shape[1] 183 | 184 | # Skip utterances that are too long 185 | if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length: 186 | return None 187 | 188 | # Write the spectrogram, embed and audio to disk 189 | np.save(mel_fpath, mel_spectrogram.T, allow_pickle=False) 190 | np.save(wav_fpath, wav, allow_pickle=False) 191 | 192 | # Return a tuple describing this training example 193 | return wav_fpath.name, mel_fpath.name, "embed-%s.npy" % basename, len(wav), mel_frames, text 194 | 195 | 196 | def embed_utterance(fpaths, encoder_model_fpath): 197 | if not encoder.is_loaded(): 198 | encoder.load_model(encoder_model_fpath) 199 | 200 | # Compute the speaker embedding of the utterance 201 | wav_fpath, embed_fpath = fpaths 202 | wav = np.load(wav_fpath) 203 | wav = encoder.preprocess_wav(wav) 204 | embed = encoder.embed_utterance(wav) 205 | np.save(embed_fpath, embed, allow_pickle=False) 206 | 207 | 208 | def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_processes: int): 209 | wav_dir = synthesizer_root.joinpath("audio") 210 | metadata_fpath = synthesizer_root.joinpath("train.txt") 211 | assert wav_dir.exists() and metadata_fpath.exists() 212 | embed_dir = synthesizer_root.joinpath("embeds") 213 | embed_dir.mkdir(exist_ok=True) 214 | 215 | # Gather the input wave filepath and the target output embed filepath 216 | with metadata_fpath.open("r") as metadata_file: 217 | metadata = [line.split("|") for line in metadata_file] 218 | fpaths = [(wav_dir.joinpath(m[0]), embed_dir.joinpath(m[2])) for m in metadata] 219 | 220 | # TODO: improve on the multiprocessing, it's terrible. Disk I/O is the bottleneck here. 
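# One possible (untested) mitigation, sketched here only as a comment: pass a chunksize to the
# pool so each worker receives batches of file pairs instead of one pair at a time, e.g.
#   job = Pool(n_processes).imap_unordered(func, fpaths, chunksize=8)
# imap_unordered and chunksize are standard multiprocessing.pool.Pool arguments, and result
# order does not matter here since each call writes its embedding straight to disk; given that
# disk I/O is the bottleneck noted above, the gain may still be limited.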
221 | # Embed the utterances in separate threads 222 | func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath) 223 | job = Pool(n_processes).imap(func, fpaths) 224 | list(tqdm(job, "Embedding", len(fpaths), unit="utterances")) 225 | -------------------------------------------------------------------------------- /synthesizer/synthesize.py: -------------------------------------------------------------------------------- 1 | from synthesizer.tacotron2 import Tacotron2 2 | from synthesizer.hparams import hparams_debug_string 3 | from synthesizer.infolog import log 4 | import tensorflow as tf 5 | from tqdm import tqdm 6 | import time 7 | import os 8 | 9 | 10 | def run_eval(args, checkpoint_path, output_dir, hparams, sentences): 11 | eval_dir = os.path.join(output_dir, "eval") 12 | log_dir = os.path.join(output_dir, "logs-eval") 13 | 14 | #Create output path if it doesn"t exist 15 | os.makedirs(eval_dir, exist_ok=True) 16 | os.makedirs(log_dir, exist_ok=True) 17 | os.makedirs(os.path.join(log_dir, "wavs"), exist_ok=True) 18 | os.makedirs(os.path.join(log_dir, "plots"), exist_ok=True) 19 | 20 | log(hparams_debug_string()) 21 | synth = Tacotron2(checkpoint_path, hparams) 22 | 23 | #Set inputs batch wise 24 | sentences = [sentences[i: i+hparams.tacotron_synthesis_batch_size] for i 25 | in range(0, len(sentences), hparams.tacotron_synthesis_batch_size)] 26 | 27 | log("Starting Synthesis") 28 | with open(os.path.join(eval_dir, "map.txt"), "w") as file: 29 | for i, texts in enumerate(tqdm(sentences)): 30 | start = time.time() 31 | basenames = ["batch_{}_sentence_{}".format(i, j) for j in range(len(texts))] 32 | mel_filenames, speaker_ids = synth.synthesize(texts, basenames, eval_dir, log_dir, None) 33 | 34 | for elems in zip(texts, mel_filenames, speaker_ids): 35 | file.write("|".join([str(x) for x in elems]) + "\n") 36 | log("synthesized mel spectrograms at {}".format(eval_dir)) 37 | return eval_dir 38 | 39 | def run_synthesis(in_dir, out_dir, model_dir, hparams): 40 | synth_dir = os.path.join(out_dir, "mels_gta") 41 | os.makedirs(synth_dir, exist_ok=True) 42 | metadata_filename = os.path.join(in_dir, "train.txt") 43 | print(hparams_debug_string()) 44 | 45 | # Load the model in memory 46 | weights_dir = os.path.join(model_dir, "taco_pretrained") 47 | checkpoint_fpath = tf.train.get_checkpoint_state(weights_dir).model_checkpoint_path 48 | synth = Tacotron2(checkpoint_fpath, hparams, gta=True) 49 | 50 | # Load the metadata 51 | with open(metadata_filename, encoding="utf-8") as f: 52 | metadata = [line.strip().split("|") for line in f] 53 | frame_shift_ms = hparams.hop_size / hparams.sample_rate 54 | hours = sum([int(x[4]) for x in metadata]) * frame_shift_ms / 3600 55 | print("Loaded metadata for {} examples ({:.2f} hours)".format(len(metadata), hours)) 56 | 57 | #Set inputs batch wise 58 | metadata = [metadata[i: i + hparams.tacotron_synthesis_batch_size] for i in 59 | range(0, len(metadata), hparams.tacotron_synthesis_batch_size)] 60 | # TODO: come on big boy, fix this 61 | # Quick and dirty fix to make sure that all batches have the same size 62 | metadata = metadata[:-1] 63 | 64 | print("Starting Synthesis") 65 | mel_dir = os.path.join(in_dir, "mels") 66 | embed_dir = os.path.join(in_dir, "embeds") 67 | meta_out_fpath = os.path.join(out_dir, "synthesized.txt") 68 | with open(meta_out_fpath, "w") as file: 69 | for i, meta in enumerate(tqdm(metadata)): 70 | texts = [m[5] for m in meta] 71 | mel_filenames = [os.path.join(mel_dir, m[1]) for m in meta] 72 | embed_filenames = 
[os.path.join(embed_dir, m[2]) for m in meta] 73 | basenames = [os.path.basename(m).replace(".npy", "").replace("mel-", "") 74 | for m in mel_filenames] 75 | synth.synthesize(texts, basenames, synth_dir, None, mel_filenames, embed_filenames) 76 | 77 | for elems in meta: 78 | file.write("|".join([str(x) for x in elems]) + "\n") 79 | 80 | print("Synthesized mel spectrograms at {}".format(synth_dir)) 81 | return meta_out_fpath 82 | 83 | -------------------------------------------------------------------------------- /synthesizer/utils/__init__.py: -------------------------------------------------------------------------------- 1 | class ValueWindow(): 2 | def __init__(self, window_size=100): 3 | self._window_size = window_size 4 | self._values = [] 5 | 6 | def append(self, x): 7 | self._values = self._values[-(self._window_size - 1):] + [x] 8 | 9 | @property 10 | def sum(self): 11 | return sum(self._values) 12 | 13 | @property 14 | def count(self): 15 | return len(self._values) 16 | 17 | @property 18 | def average(self): 19 | return self.sum / max(1, self.count) 20 | 21 | def reset(self): 22 | self._values = [] 23 | -------------------------------------------------------------------------------- /synthesizer/utils/_cmudict.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | valid_symbols = [ 4 | "AA", "AA0", "AA1", "AA2", "AE", "AE0", "AE1", "AE2", "AH", "AH0", "AH1", "AH2", 5 | "AO", "AO0", "AO1", "AO2", "AW", "AW0", "AW1", "AW2", "AY", "AY0", "AY1", "AY2", 6 | "B", "CH", "D", "DH", "EH", "EH0", "EH1", "EH2", "ER", "ER0", "ER1", "ER2", "EY", 7 | "EY0", "EY1", "EY2", "F", "G", "HH", "IH", "IH0", "IH1", "IH2", "IY", "IY0", "IY1", 8 | "IY2", "JH", "K", "L", "M", "N", "NG", "OW", "OW0", "OW1", "OW2", "OY", "OY0", 9 | "OY1", "OY2", "P", "R", "S", "SH", "T", "TH", "UH", "UH0", "UH1", "UH2", "UW", 10 | "UW0", "UW1", "UW2", "V", "W", "Y", "Z", "ZH" 11 | ] 12 | 13 | _valid_symbol_set = set(valid_symbols) 14 | 15 | 16 | class CMUDict: 17 | """Thin wrapper around CMUDict data. 
http://www.speech.cs.cmu.edu/cgi-bin/cmudict""" 18 | def __init__(self, file_or_path, keep_ambiguous=True): 19 | if isinstance(file_or_path, str): 20 | with open(file_or_path, encoding="latin-1") as f: 21 | entries = _parse_cmudict(f) 22 | else: 23 | entries = _parse_cmudict(file_or_path) 24 | if not keep_ambiguous: 25 | entries = {word: pron for word, pron in entries.items() if len(pron) == 1} 26 | self._entries = entries 27 | 28 | 29 | def __len__(self): 30 | return len(self._entries) 31 | 32 | 33 | def lookup(self, word): 34 | """Returns list of ARPAbet pronunciations of the given word.""" 35 | return self._entries.get(word.upper()) 36 | 37 | 38 | 39 | _alt_re = re.compile(r"\([0-9]+\)") 40 | 41 | 42 | def _parse_cmudict(file): 43 | cmudict = {} 44 | for line in file: 45 | if len(line) and (line[0] >= "A" and line[0] <= "Z" or line[0] == "'"): 46 | parts = line.split(" ") 47 | word = re.sub(_alt_re, "", parts[0]) 48 | pronunciation = _get_pronunciation(parts[1]) 49 | if pronunciation: 50 | if word in cmudict: 51 | cmudict[word].append(pronunciation) 52 | else: 53 | cmudict[word] = [pronunciation] 54 | return cmudict 55 | 56 | 57 | def _get_pronunciation(s): 58 | parts = s.strip().split(" ") 59 | for part in parts: 60 | if part not in _valid_symbol_set: 61 | return None 62 | return " ".join(parts) 63 | -------------------------------------------------------------------------------- /synthesizer/utils/cleaners.py: -------------------------------------------------------------------------------- 1 | """ 2 | Cleaners are transformations that run over the input text at both training and eval time. 3 | 4 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" 5 | hyperparameter. Some cleaners are English-specific. You"ll typically want to use: 6 | 1. "english_cleaners" for English text 7 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using 8 | the Unidecode library (https://pypi.python.org/pypi/Unidecode) 9 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update 10 | the symbols in symbols.py to match your data). 11 | """ 12 | 13 | import re 14 | from unidecode import unidecode 15 | from .numbers import normalize_numbers 16 | 17 | # Regular expression matching whitespace: 18 | _whitespace_re = re.compile(r"\s+") 19 | 20 | # List of (regular expression, replacement) pairs for abbreviations: 21 | _abbreviations = [(re.compile("\\b%s\\." 
% x[0], re.IGNORECASE), x[1]) for x in [ 22 | ("mrs", "misess"), 23 | ("mr", "mister"), 24 | ("dr", "doctor"), 25 | ("st", "saint"), 26 | ("co", "company"), 27 | ("jr", "junior"), 28 | ("maj", "major"), 29 | ("gen", "general"), 30 | ("drs", "doctors"), 31 | ("rev", "reverend"), 32 | ("lt", "lieutenant"), 33 | ("hon", "honorable"), 34 | ("sgt", "sergeant"), 35 | ("capt", "captain"), 36 | ("esq", "esquire"), 37 | ("ltd", "limited"), 38 | ("col", "colonel"), 39 | ("ft", "fort"), 40 | ]] 41 | 42 | 43 | def expand_abbreviations(text): 44 | for regex, replacement in _abbreviations: 45 | text = re.sub(regex, replacement, text) 46 | return text 47 | 48 | 49 | def expand_numbers(text): 50 | return normalize_numbers(text) 51 | 52 | 53 | def lowercase(text): 54 | """lowercase input tokens.""" 55 | return text.lower() 56 | 57 | 58 | def collapse_whitespace(text): 59 | return re.sub(_whitespace_re, " ", text) 60 | 61 | 62 | def convert_to_ascii(text): 63 | return unidecode(text) 64 | 65 | 66 | def basic_cleaners(text): 67 | """Basic pipeline that lowercases and collapses whitespace without transliteration.""" 68 | text = lowercase(text) 69 | text = collapse_whitespace(text) 70 | return text 71 | 72 | 73 | def transliteration_cleaners(text): 74 | """Pipeline for non-English text that transliterates to ASCII.""" 75 | text = convert_to_ascii(text) 76 | text = lowercase(text) 77 | text = collapse_whitespace(text) 78 | return text 79 | 80 | 81 | def english_cleaners(text): 82 | """Pipeline for English text, including number and abbreviation expansion.""" 83 | text = convert_to_ascii(text) 84 | text = lowercase(text) 85 | text = expand_numbers(text) 86 | text = expand_abbreviations(text) 87 | text = collapse_whitespace(text) 88 | return text 89 | -------------------------------------------------------------------------------- /synthesizer/utils/numbers.py: -------------------------------------------------------------------------------- 1 | import re 2 | import inflect 3 | 4 | _inflect = inflect.engine() 5 | _comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])") 6 | _decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)") 7 | _pounds_re = re.compile(r"£([0-9\,]*[0-9]+)") 8 | _dollars_re = re.compile(r"\$([0-9\.\,]*[0-9]+)") 9 | _ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)") 10 | _number_re = re.compile(r"[0-9]+") 11 | 12 | 13 | def _remove_commas(m): 14 | return m.group(1).replace(",", "") 15 | 16 | 17 | def _expand_decimal_point(m): 18 | return m.group(1).replace(".", " point ") 19 | 20 | 21 | def _expand_dollars(m): 22 | match = m.group(1) 23 | parts = match.split(".") 24 | if len(parts) > 2: 25 | return match + " dollars" # Unexpected format 26 | dollars = int(parts[0]) if parts[0] else 0 27 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 28 | if dollars and cents: 29 | dollar_unit = "dollar" if dollars == 1 else "dollars" 30 | cent_unit = "cent" if cents == 1 else "cents" 31 | return "%s %s, %s %s" % (dollars, dollar_unit, cents, cent_unit) 32 | elif dollars: 33 | dollar_unit = "dollar" if dollars == 1 else "dollars" 34 | return "%s %s" % (dollars, dollar_unit) 35 | elif cents: 36 | cent_unit = "cent" if cents == 1 else "cents" 37 | return "%s %s" % (cents, cent_unit) 38 | else: 39 | return "zero dollars" 40 | 41 | 42 | def _expand_ordinal(m): 43 | return _inflect.number_to_words(m.group(0)) 44 | 45 | 46 | def _expand_number(m): 47 | num = int(m.group(0)) 48 | if num > 1000 and num < 3000: 49 | if num == 2000: 50 | return "two thousand" 51 | elif num > 2000 and num < 2010: 52 | return "two 
thousand " + _inflect.number_to_words(num % 100) 53 | elif num % 100 == 0: 54 | return _inflect.number_to_words(num // 100) + " hundred" 55 | else: 56 | return _inflect.number_to_words(num, andword="", zero="oh", group=2).replace(", ", " ") 57 | else: 58 | return _inflect.number_to_words(num, andword="") 59 | 60 | 61 | def normalize_numbers(text): 62 | text = re.sub(_comma_number_re, _remove_commas, text) 63 | text = re.sub(_pounds_re, r"\1 pounds", text) 64 | text = re.sub(_dollars_re, _expand_dollars, text) 65 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 66 | text = re.sub(_ordinal_re, _expand_ordinal, text) 67 | text = re.sub(_number_re, _expand_number, text) 68 | return text 69 | -------------------------------------------------------------------------------- /synthesizer/utils/plot.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | matplotlib.use("Agg") 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | 6 | 7 | def split_title_line(title_text, max_words=5): 8 | """ 9 | A function that splits any string based on specific character 10 | (returning it with the string), with maximum number of words on it 11 | """ 12 | seq = title_text.split() 13 | return "\n".join([" ".join(seq[i:i + max_words]) for i in range(0, len(seq), max_words)]) 14 | 15 | def plot_alignment(alignment, path, title=None, split_title=False, max_len=None): 16 | if max_len is not None: 17 | alignment = alignment[:, :max_len] 18 | 19 | fig = plt.figure(figsize=(8, 6)) 20 | ax = fig.add_subplot(111) 21 | 22 | im = ax.imshow( 23 | alignment, 24 | aspect="auto", 25 | origin="lower", 26 | interpolation="none") 27 | fig.colorbar(im, ax=ax) 28 | xlabel = "Decoder timestep" 29 | 30 | if split_title: 31 | title = split_title_line(title) 32 | 33 | plt.xlabel(xlabel) 34 | plt.title(title) 35 | plt.ylabel("Encoder timestep") 36 | plt.tight_layout() 37 | plt.savefig(path, format="png") 38 | plt.close() 39 | 40 | 41 | def plot_spectrogram(pred_spectrogram, path, title=None, split_title=False, target_spectrogram=None, max_len=None, auto_aspect=False): 42 | if max_len is not None: 43 | target_spectrogram = target_spectrogram[:max_len] 44 | pred_spectrogram = pred_spectrogram[:max_len] 45 | 46 | if split_title: 47 | title = split_title_line(title) 48 | 49 | fig = plt.figure(figsize=(10, 8)) 50 | # Set common labels 51 | fig.text(0.5, 0.18, title, horizontalalignment="center", fontsize=16) 52 | 53 | #target spectrogram subplot 54 | if target_spectrogram is not None: 55 | ax1 = fig.add_subplot(311) 56 | ax2 = fig.add_subplot(312) 57 | 58 | if auto_aspect: 59 | im = ax1.imshow(np.rot90(target_spectrogram), aspect="auto", interpolation="none") 60 | else: 61 | im = ax1.imshow(np.rot90(target_spectrogram), interpolation="none") 62 | ax1.set_title("Target Mel-Spectrogram") 63 | fig.colorbar(mappable=im, shrink=0.65, orientation="horizontal", ax=ax1) 64 | ax2.set_title("Predicted Mel-Spectrogram") 65 | else: 66 | ax2 = fig.add_subplot(211) 67 | 68 | if auto_aspect: 69 | im = ax2.imshow(np.rot90(pred_spectrogram), aspect="auto", interpolation="none") 70 | else: 71 | im = ax2.imshow(np.rot90(pred_spectrogram), interpolation="none") 72 | fig.colorbar(mappable=im, shrink=0.65, orientation="horizontal", ax=ax2) 73 | 74 | plt.tight_layout() 75 | plt.savefig(path, format="png") 76 | plt.close() 77 | -------------------------------------------------------------------------------- /synthesizer/utils/symbols.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | Defines the set of symbols used in text input to the model. 3 | 4 | The default is a set of ASCII characters that works well for English or text that has been run 5 | through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. 6 | """ 7 | # from . import cmudict 8 | 9 | _pad = "_" 10 | _eos = "~" 11 | _characters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'\"(),-.:;? " 12 | 13 | # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): 14 | #_arpabet = ["@' + s for s in cmudict.valid_symbols] 15 | 16 | # Export all symbols: 17 | symbols = [_pad, _eos] + list(_characters) #+ _arpabet 18 | -------------------------------------------------------------------------------- /synthesizer/utils/text.py: -------------------------------------------------------------------------------- 1 | from .symbols import symbols 2 | from . import cleaners 3 | import re 4 | 5 | # Mappings from symbol to numeric ID and vice versa: 6 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 7 | _id_to_symbol = {i: s for i, s in enumerate(symbols)} 8 | 9 | # Regular expression matching text enclosed in curly braces: 10 | _curly_re = re.compile(r"(.*?)\{(.+?)\}(.*)") 11 | 12 | 13 | def text_to_sequence(text, cleaner_names): 14 | """Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 15 | 16 | The text can optionally have ARPAbet sequences enclosed in curly braces embedded 17 | in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street." 18 | 19 | Args: 20 | text: string to convert to a sequence 21 | cleaner_names: names of the cleaner functions to run the text through 22 | 23 | Returns: 24 | List of integers corresponding to the symbols in the text 25 | """ 26 | sequence = [] 27 | 28 | # Check for curly braces and treat their contents as ARPAbet: 29 | while len(text): 30 | m = _curly_re.match(text) 31 | if not m: 32 | sequence += _symbols_to_sequence(_clean_text(text, cleaner_names)) 33 | break 34 | sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names)) 35 | sequence += _arpabet_to_sequence(m.group(2)) 36 | text = m.group(3) 37 | 38 | # Append EOS token 39 | sequence.append(_symbol_to_id["~"]) 40 | return sequence 41 | 42 | 43 | def sequence_to_text(sequence): 44 | """Converts a sequence of IDs back to a string""" 45 | result = "" 46 | for symbol_id in sequence: 47 | if symbol_id in _id_to_symbol: 48 | s = _id_to_symbol[symbol_id] 49 | # Enclose ARPAbet back in curly braces: 50 | if len(s) > 1 and s[0] == "@": 51 | s = "{%s}" % s[1:] 52 | result += s 53 | return result.replace("}{", " ") 54 | 55 | 56 | def _clean_text(text, cleaner_names): 57 | for name in cleaner_names: 58 | cleaner = getattr(cleaners, name) 59 | if not cleaner: 60 | raise Exception("Unknown cleaner: %s" % name) 61 | text = cleaner(text) 62 | return text 63 | 64 | 65 | def _symbols_to_sequence(symbols): 66 | return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)] 67 | 68 | 69 | def _arpabet_to_sequence(text): 70 | return _symbols_to_sequence(["@" + s for s in text.split()]) 71 | 72 | 73 | def _should_keep_symbol(s): 74 | return s in _symbol_to_id and s not in ("_", "~") 75 | -------------------------------------------------------------------------------- /synthesizer_preprocess_audio.py: -------------------------------------------------------------------------------- 1 | from synthesizer.preprocess 
import preprocess_librispeech 2 | from synthesizer.hparams import hparams 3 | from utils.argutils import print_args 4 | from pathlib import Path 5 | import argparse 6 | 7 | 8 | if __name__ == "__main__": 9 | parser = argparse.ArgumentParser( 10 | description="Preprocesses audio files from datasets, encodes them as mel spectrograms " 11 | "and writes them to the disk. Audio files are also saved, to be used by the " 12 | "vocoder for training.", 13 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 14 | ) 15 | parser.add_argument("datasets_root", type=Path, help=\ 16 | "Path to the directory containing your LibriSpeech/TTS datasets.") 17 | parser.add_argument("-o", "--out_dir", type=Path, default=argparse.SUPPRESS, help=\ 18 | "Path to the output directory that will contain the mel spectrograms, the audios and the " 19 | "embeds. Defaults to /SV2TTS/synthesizer/") 20 | parser.add_argument("-n", "--n_processes", type=int, default=None, help=\ 21 | "Number of processes in parallel.") 22 | parser.add_argument("-s", "--skip_existing", action="store_true", help=\ 23 | "Whether to overwrite existing files with the same name. Useful if the preprocessing was " 24 | "interrupted.") 25 | parser.add_argument("--hparams", type=str, default="", help=\ 26 | "Hyperparameter overrides as a comma-separated list of name-value pairs") 27 | args = parser.parse_args() 28 | 29 | # Process the arguments 30 | if not hasattr(args, "out_dir"): 31 | args.out_dir = args.datasets_root.joinpath("SV2TTS", "synthesizer") 32 | 33 | # Create directories 34 | assert args.datasets_root.exists() 35 | args.out_dir.mkdir(exist_ok=True, parents=True) 36 | 37 | # Preprocess the dataset 38 | print_args(args, parser) 39 | args.hparams = hparams.parse(args.hparams) 40 | preprocess_librispeech(**vars(args)) 41 | -------------------------------------------------------------------------------- /synthesizer_preprocess_embeds.py: -------------------------------------------------------------------------------- 1 | from synthesizer.preprocess import create_embeddings 2 | from utils.argutils import print_args 3 | from pathlib import Path 4 | import argparse 5 | 6 | 7 | if __name__ == "__main__": 8 | parser = argparse.ArgumentParser( 9 | description="Creates embeddings for the synthesizer from the LibriSpeech utterances.", 10 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 11 | ) 12 | parser.add_argument("synthesizer_root", type=Path, help=\ 13 | "Path to the synthesizer training data that contains the audios and the train.txt file. " 14 | "If you let everything as default, it should be /SV2TTS/synthesizer/.") 15 | parser.add_argument("-e", "--encoder_model_fpath", type=Path, 16 | default="encoder/saved_models/pretrained.pt", help=\ 17 | "Path your trained encoder model.") 18 | parser.add_argument("-n", "--n_processes", type=int, default=4, help= \ 19 | "Number of parallel processes. An encoder is created for each, so you may need to lower " 20 | "this value on GPUs with low memory. 
Set it to 1 if CUDA is unhappy.") 21 | args = parser.parse_args() 22 | 23 | # Preprocess the dataset 24 | print_args(args, parser) 25 | create_embeddings(**vars(args)) 26 | -------------------------------------------------------------------------------- /synthesizer_train.py: -------------------------------------------------------------------------------- 1 | from synthesizer.hparams import hparams 2 | from synthesizer.train import tacotron_train 3 | from utils.argutils import print_args 4 | from synthesizer import infolog 5 | import argparse 6 | import os 7 | 8 | 9 | def prepare_run(args): 10 | modified_hp = hparams.parse(args.hparams) 11 | os.environ["TF_CPP_MIN_LOG_LEVEL"] = str(args.tf_log_level) 12 | run_name = args.name 13 | log_dir = os.path.join(args.models_dir, "logs-{}".format(run_name)) 14 | os.makedirs(log_dir, exist_ok=True) 15 | infolog.init(os.path.join(log_dir, "Terminal_train_log"), run_name, args.slack_url) 16 | return log_dir, modified_hp 17 | 18 | 19 | if __name__ == "__main__": 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument("name", help="Name of the run and of the logging directory.") 22 | parser.add_argument("synthesizer_root", type=str, help=\ 23 | "Path to the synthesizer training data that contains the audios and the train.txt file. " 24 | "If you let everything as default, it should be /SV2TTS/synthesizer/.") 25 | parser.add_argument("-m", "--models_dir", type=str, default="synthesizer/saved_models/", help=\ 26 | "Path to the output directory that will contain the saved model weights and the logs.") 27 | parser.add_argument("--mode", default="synthesis", 28 | help="mode for synthesis of tacotron after training") 29 | parser.add_argument("--GTA", default="True", 30 | help="Ground truth aligned synthesis, defaults to True, only considered " 31 | "in Tacotron synthesis mode") 32 | parser.add_argument("--restore", type=bool, default=True, 33 | help="Set this to False to do a fresh training") 34 | parser.add_argument("--summary_interval", type=int, default=2500, 35 | help="Steps between running summary ops") 36 | parser.add_argument("--embedding_interval", type=int, default=10000, 37 | help="Steps between updating embeddings projection visualization") 38 | parser.add_argument("--checkpoint_interval", type=int, default=2000, # Was 5000 39 | help="Steps between writing checkpoints") 40 | parser.add_argument("--eval_interval", type=int, default=100000, # Was 10000 41 | help="Steps between eval on test data") 42 | parser.add_argument("--tacotron_train_steps", type=int, default=2000000, # Was 100000 43 | help="total number of tacotron training steps") 44 | parser.add_argument("--tf_log_level", type=int, default=1, help="Tensorflow C++ log level.") 45 | parser.add_argument("--slack_url", default=None, 46 | help="slack webhook notification destination link") 47 | parser.add_argument("--hparams", default="", 48 | help="Hyperparameter overrides as a comma-separated list of name=value " 49 | "pairs") 50 | args = parser.parse_args() 51 | print_args(args, parser) 52 | 53 | log_dir, hparams = prepare_run(args) 54 | 55 | tacotron_train(args, log_dir, hparams) 56 | -------------------------------------------------------------------------------- /toolbox/__init__.py: -------------------------------------------------------------------------------- 1 | from toolbox.ui import UI 2 | from encoder import inference as encoder 3 | from synthesizer.inference import Synthesizer 4 | from vocoder import inference as vocoder 5 | from pathlib import Path 6 | from time import 
perf_counter as timer 7 | from toolbox.utterance import Utterance 8 | import numpy as np 9 | import traceback 10 | import sys 11 | 12 | 13 | # Use this directory structure for your datasets, or modify it to fit your needs 14 | recognized_datasets = [ 15 | "LibriSpeech/dev-clean", 16 | "LibriSpeech/dev-other", 17 | "LibriSpeech/test-clean", 18 | "LibriSpeech/test-other", 19 | "LibriSpeech/train-clean-100", 20 | "LibriSpeech/train-clean-360", 21 | "LibriSpeech/train-other-500", 22 | "LibriTTS/dev-clean", 23 | "LibriTTS/dev-other", 24 | "LibriTTS/test-clean", 25 | "LibriTTS/test-other", 26 | "LibriTTS/train-clean-100", 27 | "LibriTTS/train-clean-360", 28 | "LibriTTS/train-other-500", 29 | "LJSpeech-1.1", 30 | "VoxCeleb1/wav", 31 | "VoxCeleb1/test_wav", 32 | "VoxCeleb2/dev/aac", 33 | "VoxCeleb2/test/aac", 34 | "VCTK-Corpus/wav48", 35 | ] 36 | 37 | class Toolbox: 38 | def __init__(self, datasets_root, enc_models_dir, syn_models_dir, voc_models_dir, low_mem): 39 | sys.excepthook = self.excepthook 40 | self.datasets_root = datasets_root 41 | self.low_mem = low_mem 42 | self.utterances = set() 43 | self.current_generated = (None, None, None, None) # speaker_name, spec, breaks, wav 44 | 45 | self.synthesizer = None # type: Synthesizer 46 | 47 | # Initialize the events and the interface 48 | self.ui = UI() 49 | self.reset_ui(enc_models_dir, syn_models_dir, voc_models_dir) 50 | self.setup_events() 51 | self.ui.start() 52 | 53 | def excepthook(self, exc_type, exc_value, exc_tb): 54 | traceback.print_exception(exc_type, exc_value, exc_tb) 55 | self.ui.log("Exception: %s" % exc_value) 56 | 57 | def setup_events(self): 58 | # Dataset, speaker and utterance selection 59 | self.ui.browser_load_button.clicked.connect(lambda: self.load_from_browser()) 60 | random_func = lambda level: lambda: self.ui.populate_browser(self.datasets_root, 61 | recognized_datasets, 62 | level) 63 | self.ui.random_dataset_button.clicked.connect(random_func(0)) 64 | self.ui.random_speaker_button.clicked.connect(random_func(1)) 65 | self.ui.random_utterance_button.clicked.connect(random_func(2)) 66 | self.ui.dataset_box.currentIndexChanged.connect(random_func(1)) 67 | self.ui.speaker_box.currentIndexChanged.connect(random_func(2)) 68 | 69 | # Model selection 70 | self.ui.encoder_box.currentIndexChanged.connect(self.init_encoder) 71 | def func(): 72 | self.synthesizer = None 73 | self.ui.synthesizer_box.currentIndexChanged.connect(func) 74 | self.ui.vocoder_box.currentIndexChanged.connect(self.init_vocoder) 75 | 76 | # Utterance selection 77 | func = lambda: self.load_from_browser(self.ui.browse_file()) 78 | self.ui.browser_browse_button.clicked.connect(func) 79 | func = lambda: self.ui.draw_utterance(self.ui.selected_utterance, "current") 80 | self.ui.utterance_history.currentIndexChanged.connect(func) 81 | func = lambda: self.ui.play(self.ui.selected_utterance.wav, Synthesizer.sample_rate) 82 | self.ui.play_button.clicked.connect(func) 83 | self.ui.stop_button.clicked.connect(self.ui.stop) 84 | self.ui.record_button.clicked.connect(self.record) 85 | 86 | # Generation 87 | func = lambda: self.synthesize() or self.vocode() 88 | self.ui.generate_button.clicked.connect(func) 89 | self.ui.synthesize_button.clicked.connect(self.synthesize) 90 | self.ui.vocode_button.clicked.connect(self.vocode) 91 | 92 | # UMAP legend 93 | self.ui.clear_button.clicked.connect(self.clear_utterances) 94 | 95 | def reset_ui(self, encoder_models_dir, synthesizer_models_dir, vocoder_models_dir): 96 | self.ui.populate_browser(self.datasets_root, 
recognized_datasets, 0, True) 97 | self.ui.populate_models(encoder_models_dir, synthesizer_models_dir, vocoder_models_dir) 98 | 99 | def load_from_browser(self, fpath=None): 100 | if fpath is None: 101 | fpath = Path(self.datasets_root, 102 | self.ui.current_dataset_name, 103 | self.ui.current_speaker_name, 104 | self.ui.current_utterance_name) 105 | name = str(fpath.relative_to(self.datasets_root)) 106 | speaker_name = self.ui.current_dataset_name + '_' + self.ui.current_speaker_name 107 | 108 | # Select the next utterance 109 | if self.ui.auto_next_checkbox.isChecked(): 110 | self.ui.browser_select_next() 111 | elif fpath == "": 112 | return 113 | else: 114 | name = fpath.name 115 | speaker_name = fpath.parent.name 116 | 117 | # Get the wav from the disk. We take the wav with the vocoder/synthesizer format for 118 | # playback, so as to have a fair comparison with the generated audio 119 | wav = Synthesizer.load_preprocess_wav(fpath) 120 | self.ui.log("Loaded %s" % name) 121 | 122 | self.add_real_utterance(wav, name, speaker_name) 123 | 124 | def record(self): 125 | wav = self.ui.record_one(encoder.sampling_rate, 5) 126 | if wav is None: 127 | return 128 | self.ui.play(wav, encoder.sampling_rate) 129 | 130 | speaker_name = "user01" 131 | name = speaker_name + "_rec_%05d" % np.random.randint(100000) 132 | self.add_real_utterance(wav, name, speaker_name) 133 | 134 | def add_real_utterance(self, wav, name, speaker_name): 135 | # Compute the mel spectrogram 136 | spec = Synthesizer.make_spectrogram(wav) 137 | self.ui.draw_spec(spec, "current") 138 | 139 | # Compute the embedding 140 | if not encoder.is_loaded(): 141 | self.init_encoder() 142 | encoder_wav = encoder.preprocess_wav(wav) 143 | embed, partial_embeds, _ = encoder.embed_utterance(encoder_wav, return_partials=True) 144 | 145 | # Add the utterance 146 | utterance = Utterance(name, speaker_name, wav, spec, embed, partial_embeds, False) 147 | self.utterances.add(utterance) 148 | self.ui.register_utterance(utterance) 149 | 150 | # Plot it 151 | self.ui.draw_embed(embed, name, "current") 152 | self.ui.draw_umap_projections(self.utterances) 153 | 154 | def clear_utterances(self): 155 | self.utterances.clear() 156 | self.ui.draw_umap_projections(self.utterances) 157 | 158 | def synthesize(self): 159 | self.ui.log("Generating the mel spectrogram...") 160 | self.ui.set_loading(1) 161 | 162 | # Synthesize the spectrogram 163 | if self.synthesizer is None: 164 | model_dir = self.ui.current_synthesizer_model_dir 165 | checkpoints_dir = model_dir.joinpath("taco_pretrained") 166 | self.synthesizer = Synthesizer(checkpoints_dir, low_mem=self.low_mem) 167 | if not self.synthesizer.is_loaded(): 168 | self.ui.log("Loading the synthesizer %s" % self.synthesizer.checkpoint_fpath) 169 | 170 | texts = self.ui.text_prompt.toPlainText().split("\n") 171 | embed = self.ui.selected_utterance.embed 172 | embeds = np.stack([embed] * len(texts)) 173 | specs = self.synthesizer.synthesize_spectrograms(texts, embeds) 174 | breaks = [spec.shape[1] for spec in specs] 175 | spec = np.concatenate(specs, axis=1) 176 | 177 | self.ui.draw_spec(spec, "generated") 178 | self.current_generated = (self.ui.selected_utterance.speaker_name, spec, breaks, None) 179 | self.ui.set_loading(0) 180 | 181 | def vocode(self): 182 | speaker_name, spec, breaks, _ = self.current_generated 183 | assert spec is not None 184 | 185 | # Synthesize the waveform 186 | if not vocoder.is_loaded(): 187 | self.init_vocoder() 188 | def vocoder_progress(i, seq_len, b_size, gen_rate): 189 | 
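# Note: judging from the log format string below, `gen_rate` appears to be the vocoder's
# generation rate expressed in kHz, which is why it is multiplied by 1000 and divided by the
# synthesizer sample rate to obtain a real-time factor for the progress line.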
real_time_factor = (gen_rate / Synthesizer.sample_rate) * 1000 190 | line = "Waveform generation: %d/%d (batch size: %d, rate: %.1fkHz - %.2fx real time)" \ 191 | % (i * b_size, seq_len * b_size, b_size, gen_rate, real_time_factor) 192 | self.ui.log(line, "overwrite") 193 | self.ui.set_loading(i, seq_len) 194 | if self.ui.current_vocoder_fpath is not None: 195 | self.ui.log("") 196 | wav = vocoder.infer_waveform(spec, progress_callback=vocoder_progress) 197 | else: 198 | self.ui.log("Waveform generation with Griffin-Lim... ") 199 | wav = Synthesizer.griffin_lim(spec) 200 | self.ui.set_loading(0) 201 | self.ui.log(" Done!", "append") 202 | 203 | # Add breaks 204 | b_ends = np.cumsum(np.array(breaks) * Synthesizer.hparams.hop_size) 205 | b_starts = np.concatenate(([0], b_ends[:-1])) 206 | wavs = [wav[start:end] for start, end, in zip(b_starts, b_ends)] 207 | breaks = [np.zeros(int(0.15 * Synthesizer.sample_rate))] * len(breaks) 208 | wav = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)]) 209 | 210 | # Play it 211 | wav = wav / np.abs(wav).max() * 0.97 212 | self.ui.play(wav, Synthesizer.sample_rate) 213 | 214 | # Compute the embedding 215 | # TODO: this is problematic with different sampling rates, gotta fix it 216 | if not encoder.is_loaded(): 217 | self.init_encoder() 218 | encoder_wav = encoder.preprocess_wav(wav) 219 | embed, partial_embeds, _ = encoder.embed_utterance(encoder_wav, return_partials=True) 220 | 221 | # Add the utterance 222 | name = speaker_name + "_gen_%05d" % np.random.randint(100000) 223 | utterance = Utterance(name, speaker_name, wav, spec, embed, partial_embeds, True) 224 | self.utterances.add(utterance) 225 | 226 | # Plot it 227 | self.ui.draw_embed(embed, name, "generated") 228 | self.ui.draw_umap_projections(self.utterances) 229 | 230 | def init_encoder(self): 231 | model_fpath = self.ui.current_encoder_fpath 232 | 233 | self.ui.log("Loading the encoder %s... " % model_fpath) 234 | self.ui.set_loading(1) 235 | start = timer() 236 | encoder.load_model(model_fpath) 237 | self.ui.log("Done (%dms)." % int(1000 * (timer() - start)), "append") 238 | self.ui.set_loading(0) 239 | 240 | def init_vocoder(self): 241 | model_fpath = self.ui.current_vocoder_fpath 242 | # Case of Griffin-lim 243 | if model_fpath is None: 244 | return 245 | 246 | self.ui.log("Loading the vocoder %s... " % model_fpath) 247 | self.ui.set_loading(1) 248 | start = timer() 249 | vocoder.load_model(model_fpath) 250 | self.ui.log("Done (%dms)." 
% int(1000 * (timer() - start)), "append") 251 | self.ui.set_loading(0) 252 | -------------------------------------------------------------------------------- /toolbox/utterance.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | 3 | Utterance = namedtuple("Utterance", "name speaker_name wav spec embed partial_embeds synth") 4 | Utterance.__eq__ = lambda x, y: x.name == y.name 5 | Utterance.__hash__ = lambda x: hash(x.name) 6 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iwater/Real-Time-Voice-Cloning-Chinese/06882b9a83247beda1d4d84baca0400457096d1b/utils/__init__.py -------------------------------------------------------------------------------- /utils/argutils.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import numpy as np 3 | import argparse 4 | 5 | _type_priorities = [ # In decreasing order 6 | Path, 7 | str, 8 | int, 9 | float, 10 | bool, 11 | ] 12 | 13 | def _priority(o): 14 | p = next((i for i, t in enumerate(_type_priorities) if type(o) is t), None) 15 | if p is not None: 16 | return p 17 | p = next((i for i, t in enumerate(_type_priorities) if isinstance(o, t)), None) 18 | if p is not None: 19 | return p 20 | return len(_type_priorities) 21 | 22 | def print_args(args: argparse.Namespace, parser=None): 23 | args = vars(args) 24 | if parser is None: 25 | priorities = list(map(_priority, args.values())) 26 | else: 27 | all_params = [a.dest for g in parser._action_groups for a in g._group_actions ] 28 | priority = lambda p: all_params.index(p) if p in all_params else len(all_params) 29 | priorities = list(map(priority, args.keys())) 30 | 31 | pad = max(map(len, args.keys())) + 3 32 | indices = np.lexsort((list(args.keys()), priorities)) 33 | items = list(args.items()) 34 | 35 | print("Arguments:") 36 | for i in indices: 37 | param, value = items[i] 38 | print(" {0}:{1}{2}".format(param, ' ' * (pad - len(param)), value)) 39 | print("") 40 | -------------------------------------------------------------------------------- /utils/logmmse.py: -------------------------------------------------------------------------------- 1 | # The MIT License (MIT) 2 | # 3 | # Copyright (c) 2015 braindead 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | # 23 | # 24 | # This code was extracted from the logmmse package (https://pypi.org/project/logmmse/) and I 25 | # simply modified the interface to meet my needs. 26 | 27 | 28 | import numpy as np 29 | import math 30 | from scipy.special import expn 31 | from collections import namedtuple 32 | 33 | NoiseProfile = namedtuple("NoiseProfile", "sampling_rate window_size len1 len2 win n_fft noise_mu2") 34 | 35 | 36 | def profile_noise(noise, sampling_rate, window_size=0): 37 | """ 38 | Creates a profile of the noise in a given waveform. 39 | 40 | :param noise: a waveform containing noise ONLY, as a numpy array of floats or ints. 41 | :param sampling_rate: the sampling rate of the audio 42 | :param window_size: the size of the window the logmmse algorithm operates on. A default value 43 | will be picked if left as 0. 44 | :return: a NoiseProfile object 45 | """ 46 | noise, dtype = to_float(noise) 47 | noise += np.finfo(np.float64).eps 48 | 49 | if window_size == 0: 50 | window_size = int(math.floor(0.02 * sampling_rate)) 51 | 52 | if window_size % 2 == 1: 53 | window_size = window_size + 1 54 | 55 | perc = 50 56 | len1 = int(math.floor(window_size * perc / 100)) 57 | len2 = int(window_size - len1) 58 | 59 | win = np.hanning(window_size) 60 | win = win * len2 / np.sum(win) 61 | n_fft = 2 * window_size 62 | 63 | noise_mean = np.zeros(n_fft) 64 | n_frames = len(noise) // window_size 65 | for j in range(0, window_size * n_frames, window_size): 66 | noise_mean += np.absolute(np.fft.fft(win * noise[j:j + window_size], n_fft, axis=0)) 67 | noise_mu2 = (noise_mean / n_frames) ** 2 68 | 69 | return NoiseProfile(sampling_rate, window_size, len1, len2, win, n_fft, noise_mu2) 70 | 71 | 72 | def denoise(wav, noise_profile: NoiseProfile, eta=0.15): 73 | """ 74 | Cleans the noise from a speech waveform given a noise profile. The waveform must have the 75 | same sampling rate as the one used to create the noise profile. 76 | 77 | :param wav: a speech waveform as a numpy array of floats or ints. 78 | :param noise_profile: a NoiseProfile object that was created from a similar (or a segment of 79 | the same) waveform. 80 | :param eta: voice threshold for noise update. While the voice activation detection value is 81 | below this threshold, the noise profile will be continuously updated throughout the audio. 82 | Set to 0 to disable updating the noise profile. 83 | :return: the clean wav as a numpy array of floats or ints of the same length. 
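    Illustrative usage (an assumption, mirroring how split_on_silences() in
    synthesizer/preprocess.py uses this module; noise_only_segment stands for any waveform
    containing noise only):
        profile = profile_noise(noise_only_segment, sampling_rate)
        clean_wav = denoise(wav, profile, eta=0.15)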
84 | """ 85 | wav, dtype = to_float(wav) 86 | wav += np.finfo(np.float64).eps 87 | p = noise_profile 88 | 89 | nframes = int(math.floor(len(wav) / p.len2) - math.floor(p.window_size / p.len2)) 90 | x_final = np.zeros(nframes * p.len2) 91 | 92 | aa = 0.98 93 | mu = 0.98 94 | ksi_min = 10 ** (-25 / 10) 95 | 96 | x_old = np.zeros(p.len1) 97 | xk_prev = np.zeros(p.len1) 98 | noise_mu2 = p.noise_mu2 99 | for k in range(0, nframes * p.len2, p.len2): 100 | insign = p.win * wav[k:k + p.window_size] 101 | 102 | spec = np.fft.fft(insign, p.n_fft, axis=0) 103 | sig = np.absolute(spec) 104 | sig2 = sig ** 2 105 | 106 | gammak = np.minimum(sig2 / noise_mu2, 40) 107 | 108 | if xk_prev.all() == 0: 109 | ksi = aa + (1 - aa) * np.maximum(gammak - 1, 0) 110 | else: 111 | ksi = aa * xk_prev / noise_mu2 + (1 - aa) * np.maximum(gammak - 1, 0) 112 | ksi = np.maximum(ksi_min, ksi) 113 | 114 | log_sigma_k = gammak * ksi/(1 + ksi) - np.log(1 + ksi) 115 | vad_decision = np.sum(log_sigma_k) / p.window_size 116 | if vad_decision < eta: 117 | noise_mu2 = mu * noise_mu2 + (1 - mu) * sig2 118 | 119 | a = ksi / (1 + ksi) 120 | vk = a * gammak 121 | ei_vk = 0.5 * expn(1, np.maximum(vk, 1e-8)) 122 | hw = a * np.exp(ei_vk) 123 | sig = sig * hw 124 | xk_prev = sig ** 2 125 | xi_w = np.fft.ifft(hw * spec, p.n_fft, axis=0) 126 | xi_w = np.real(xi_w) 127 | 128 | x_final[k:k + p.len2] = x_old + xi_w[0:p.len1] 129 | x_old = xi_w[p.len1:p.window_size] 130 | 131 | output = from_float(x_final, dtype) 132 | output = np.pad(output, (0, len(wav) - len(output)), mode="constant") 133 | return output 134 | 135 | 136 | ## Alternative VAD algorithm to webrctvad. It has the advantage of not requiring to install that 137 | ## darn package and it also works for any sampling rate. Maybe I'll eventually use it instead of 138 | ## webrctvad 139 | # def vad(wav, sampling_rate, eta=0.15, window_size=0): 140 | # """ 141 | # TODO: fix doc 142 | # Creates a profile of the noise in a given waveform. 143 | # 144 | # :param wav: a waveform containing noise ONLY, as a numpy array of floats or ints. 145 | # :param sampling_rate: the sampling rate of the audio 146 | # :param window_size: the size of the window the logmmse algorithm operates on. A default value 147 | # will be picked if left as 0. 148 | # :param eta: voice threshold for noise update. While the voice activation detection value is 149 | # below this threshold, the noise profile will be continuously updated throughout the audio. 150 | # Set to 0 to disable updating the noise profile. 
151 | # """ 152 | # wav, dtype = to_float(wav) 153 | # wav += np.finfo(np.float64).eps 154 | # 155 | # if window_size == 0: 156 | # window_size = int(math.floor(0.02 * sampling_rate)) 157 | # 158 | # if window_size % 2 == 1: 159 | # window_size = window_size + 1 160 | # 161 | # perc = 50 162 | # len1 = int(math.floor(window_size * perc / 100)) 163 | # len2 = int(window_size - len1) 164 | # 165 | # win = np.hanning(window_size) 166 | # win = win * len2 / np.sum(win) 167 | # n_fft = 2 * window_size 168 | # 169 | # wav_mean = np.zeros(n_fft) 170 | # n_frames = len(wav) // window_size 171 | # for j in range(0, window_size * n_frames, window_size): 172 | # wav_mean += np.absolute(np.fft.fft(win * wav[j:j + window_size], n_fft, axis=0)) 173 | # noise_mu2 = (wav_mean / n_frames) ** 2 174 | # 175 | # wav, dtype = to_float(wav) 176 | # wav += np.finfo(np.float64).eps 177 | # 178 | # nframes = int(math.floor(len(wav) / len2) - math.floor(window_size / len2)) 179 | # vad = np.zeros(nframes * len2, dtype=np.bool) 180 | # 181 | # aa = 0.98 182 | # mu = 0.98 183 | # ksi_min = 10 ** (-25 / 10) 184 | # 185 | # xk_prev = np.zeros(len1) 186 | # noise_mu2 = noise_mu2 187 | # for k in range(0, nframes * len2, len2): 188 | # insign = win * wav[k:k + window_size] 189 | # 190 | # spec = np.fft.fft(insign, n_fft, axis=0) 191 | # sig = np.absolute(spec) 192 | # sig2 = sig ** 2 193 | # 194 | # gammak = np.minimum(sig2 / noise_mu2, 40) 195 | # 196 | # if xk_prev.all() == 0: 197 | # ksi = aa + (1 - aa) * np.maximum(gammak - 1, 0) 198 | # else: 199 | # ksi = aa * xk_prev / noise_mu2 + (1 - aa) * np.maximum(gammak - 1, 0) 200 | # ksi = np.maximum(ksi_min, ksi) 201 | # 202 | # log_sigma_k = gammak * ksi / (1 + ksi) - np.log(1 + ksi) 203 | # vad_decision = np.sum(log_sigma_k) / window_size 204 | # if vad_decision < eta: 205 | # noise_mu2 = mu * noise_mu2 + (1 - mu) * sig2 206 | # print(vad_decision) 207 | # 208 | # a = ksi / (1 + ksi) 209 | # vk = a * gammak 210 | # ei_vk = 0.5 * expn(1, np.maximum(vk, 1e-8)) 211 | # hw = a * np.exp(ei_vk) 212 | # sig = sig * hw 213 | # xk_prev = sig ** 2 214 | # 215 | # vad[k:k + len2] = vad_decision >= eta 216 | # 217 | # vad = np.pad(vad, (0, len(wav) - len(vad)), mode="constant") 218 | # return vad 219 | 220 | 221 | def to_float(_input): 222 | if _input.dtype == np.float64: 223 | return _input, _input.dtype 224 | elif _input.dtype == np.float32: 225 | return _input.astype(np.float64), _input.dtype 226 | elif _input.dtype == np.uint8: 227 | return (_input - 128) / 128., _input.dtype 228 | elif _input.dtype == np.int16: 229 | return _input / 32768., _input.dtype 230 | elif _input.dtype == np.int32: 231 | return _input / 2147483648., _input.dtype 232 | raise ValueError('Unsupported wave file format') 233 | 234 | 235 | def from_float(_input, dtype): 236 | if dtype == np.float64: 237 | return _input, np.float64 238 | elif dtype == np.float32: 239 | return _input.astype(np.float32) 240 | elif dtype == np.uint8: 241 | return ((_input * 128) + 128).astype(np.uint8) 242 | elif dtype == np.int16: 243 | return (_input * 32768).astype(np.int16) 244 | elif dtype == np.int32: 245 | print(_input) 246 | return (_input * 2147483648).astype(np.int32) 247 | raise ValueError('Unsupported wave file format') 248 | -------------------------------------------------------------------------------- /utils/profiler.py: -------------------------------------------------------------------------------- 1 | from time import perf_counter as timer 2 | from collections import OrderedDict 3 | import numpy as np 4 | 5 | 
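# Minimal usage sketch (an assumption about intended use, not part of the original module;
# model, loss and data_loader are placeholder names):
#   profiler = Profiler(summarize_every=5)
#   for batch in data_loader:
#       out = model(batch);   profiler.tick("forward")
#       loss.backward();      profiler.tick("backward")
# Once any timer has accumulated `summarize_every` measurements, summarize() prints the mean
# and standard deviation of the recorded durations and the logs are purged.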
6 | class Profiler: 7 | def __init__(self, summarize_every=5, disabled=False): 8 | self.last_tick = timer() 9 | self.logs = OrderedDict() 10 | self.summarize_every = summarize_every 11 | self.disabled = disabled 12 | 13 | def tick(self, name): 14 | if self.disabled: 15 | return 16 | 17 | # Log the time needed to execute that function 18 | if not name in self.logs: 19 | self.logs[name] = [] 20 | if len(self.logs[name]) >= self.summarize_every: 21 | self.summarize() 22 | self.purge_logs() 23 | self.logs[name].append(timer() - self.last_tick) 24 | 25 | self.reset_timer() 26 | 27 | def purge_logs(self): 28 | for name in self.logs: 29 | self.logs[name].clear() 30 | 31 | def reset_timer(self): 32 | self.last_tick = timer() 33 | 34 | def summarize(self): 35 | n = max(map(len, self.logs.values())) 36 | assert n == self.summarize_every 37 | print("\nAverage execution time over %d steps:" % n) 38 | 39 | name_msgs = ["%s (%d/%d):" % (name, len(deltas), n) for name, deltas in self.logs.items()] 40 | pad = max(map(len, name_msgs)) 41 | for name_msg, deltas in zip(name_msgs, self.logs.values()): 42 | print(" %s mean: %4.0fms std: %4.0fms" % 43 | (name_msg.ljust(pad), np.mean(deltas) * 1000, np.std(deltas) * 1000)) 44 | print("", flush=True) 45 | -------------------------------------------------------------------------------- /vocoder/LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Original work Copyright (c) 2019 fatchord (https://github.com/fatchord) 4 | Modified work Copyright (c) 2019 Corentin Jemine (https://github.com/CorentinJ) 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | -------------------------------------------------------------------------------- /vocoder/audio.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import librosa 4 | import vocoder.hparams as hp 5 | from scipy.signal import lfilter 6 | 7 | 8 | def label_2_float(x, bits) : 9 | return 2 * x / (2**bits - 1.) - 1. 10 | 11 | 12 | def float_2_label(x, bits) : 13 | assert abs(x).max() <= 1.0 14 | x = (x + 1.) 
* (2**bits - 1) / 2 15 | return x.clip(0, 2**bits - 1) 16 | 17 | 18 | def load_wav(path) : 19 | return librosa.load(path, sr=hp.sample_rate)[0] 20 | 21 | 22 | def save_wav(x, path) : 23 | librosa.output.write_wav(path, x.astype(np.float32), sr=hp.sample_rate) 24 | 25 | 26 | def split_signal(x) : 27 | unsigned = x + 2**15 28 | coarse = unsigned // 256 29 | fine = unsigned % 256 30 | return coarse, fine 31 | 32 | 33 | def combine_signal(coarse, fine) : 34 | return coarse * 256 + fine - 2**15 35 | 36 | 37 | def encode_16bits(x) : 38 | return np.clip(x * 2**15, -2**15, 2**15 - 1).astype(np.int16) 39 | 40 | 41 | mel_basis = None 42 | 43 | 44 | def linear_to_mel(spectrogram): 45 | global mel_basis 46 | if mel_basis is None: 47 | mel_basis = build_mel_basis() 48 | return np.dot(mel_basis, spectrogram) 49 | 50 | 51 | def build_mel_basis(): 52 | return librosa.filters.mel(hp.sample_rate, hp.n_fft, n_mels=hp.num_mels, fmin=hp.fmin) 53 | 54 | 55 | def normalize(S): 56 | return np.clip((S - hp.min_level_db) / -hp.min_level_db, 0, 1) 57 | 58 | 59 | def denormalize(S): 60 | return (np.clip(S, 0, 1) * -hp.min_level_db) + hp.min_level_db 61 | 62 | 63 | def amp_to_db(x): 64 | return 20 * np.log10(np.maximum(1e-5, x)) 65 | 66 | 67 | def db_to_amp(x): 68 | return np.power(10.0, x * 0.05) 69 | 70 | 71 | def spectrogram(y): 72 | D = stft(y) 73 | S = amp_to_db(np.abs(D)) - hp.ref_level_db 74 | return normalize(S) 75 | 76 | 77 | def melspectrogram(y): 78 | D = stft(y) 79 | S = amp_to_db(linear_to_mel(np.abs(D))) 80 | return normalize(S) 81 | 82 | 83 | def stft(y): 84 | return librosa.stft(y=y, n_fft=hp.n_fft, hop_length=hp.hop_length, win_length=hp.win_length) 85 | 86 | 87 | def pre_emphasis(x): 88 | return lfilter([1, -hp.preemphasis], [1], x) 89 | 90 | 91 | def de_emphasis(x): 92 | return lfilter([1], [1, -hp.preemphasis], x) 93 | 94 | 95 | def encode_mu_law(x, mu) : 96 | mu = mu - 1 97 | fx = np.sign(x) * np.log(1 + mu * np.abs(x)) / np.log(1 + mu) 98 | return np.floor((fx + 1) / 2 * mu + 0.5) 99 | 100 | 101 | def decode_mu_law(y, mu, from_labels=True) : 102 | if from_labels: 103 | y = label_2_float(y, math.log2(mu)) 104 | mu = mu - 1 105 | x = np.sign(y) / mu * ((1 + mu) ** np.abs(y) - 1) 106 | return x 107 | 108 | -------------------------------------------------------------------------------- /vocoder/display.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import time 3 | import numpy as np 4 | import sys 5 | 6 | 7 | def progbar(i, n, size=16): 8 | done = (i * size) // n 9 | bar = '' 10 | for i in range(size): 11 | bar += '█' if i <= done else '░' 12 | return bar 13 | 14 | 15 | def stream(message) : 16 | sys.stdout.write("\r{%s}" % message) 17 | 18 | 19 | def simple_table(item_tuples) : 20 | 21 | border_pattern = '+---------------------------------------' 22 | whitespace = ' ' 23 | 24 | headings, cells, = [], [] 25 | 26 | for item in item_tuples : 27 | 28 | heading, cell = str(item[0]), str(item[1]) 29 | 30 | pad_head = True if len(heading) < len(cell) else False 31 | 32 | pad = abs(len(heading) - len(cell)) 33 | pad = whitespace[:pad] 34 | 35 | pad_left = pad[:len(pad)//2] 36 | pad_right = pad[len(pad)//2:] 37 | 38 | if pad_head : 39 | heading = pad_left + heading + pad_right 40 | else : 41 | cell = pad_left + cell + pad_right 42 | 43 | headings += [heading] 44 | cells += [cell] 45 | 46 | border, head, body = '', '', '' 47 | 48 | for i in range(len(item_tuples)) : 49 | 50 | temp_head = f'| {headings[i]} ' 51 | temp_body = f'| 
{cells[i]} ' 52 | 53 | border += border_pattern[:len(temp_head)] 54 | head += temp_head 55 | body += temp_body 56 | 57 | if i == len(item_tuples) - 1 : 58 | head += '|' 59 | body += '|' 60 | border += '+' 61 | 62 | print(border) 63 | print(head) 64 | print(border) 65 | print(body) 66 | print(border) 67 | print(' ') 68 | 69 | 70 | def time_since(started) : 71 | elapsed = time.time() - started 72 | m = int(elapsed // 60) 73 | s = int(elapsed % 60) 74 | if m >= 60 : 75 | h = int(m // 60) 76 | m = m % 60 77 | return f'{h}h {m}m {s}s' 78 | else : 79 | return f'{m}m {s}s' 80 | 81 | 82 | def save_attention(attn, path) : 83 | fig = plt.figure(figsize=(12, 6)) 84 | plt.imshow(attn.T, interpolation='nearest', aspect='auto') 85 | fig.savefig(f'{path}.png', bbox_inches='tight') 86 | plt.close(fig) 87 | 88 | 89 | def save_spectrogram(M, path, length=None) : 90 | M = np.flip(M, axis=0) 91 | if length : M = M[:, :length] 92 | fig = plt.figure(figsize=(12, 6)) 93 | plt.imshow(M, interpolation='nearest', aspect='auto') 94 | fig.savefig(f'{path}.png', bbox_inches='tight') 95 | plt.close(fig) 96 | 97 | 98 | def plot(array) : 99 | fig = plt.figure(figsize=(30, 5)) 100 | ax = fig.add_subplot(111) 101 | ax.xaxis.label.set_color('grey') 102 | ax.yaxis.label.set_color('grey') 103 | ax.xaxis.label.set_fontsize(23) 104 | ax.yaxis.label.set_fontsize(23) 105 | ax.tick_params(axis='x', colors='grey', labelsize=23) 106 | ax.tick_params(axis='y', colors='grey', labelsize=23) 107 | plt.plot(array) 108 | 109 | 110 | def plot_spec(M) : 111 | M = np.flip(M, axis=0) 112 | plt.figure(figsize=(18,4)) 113 | plt.imshow(M, interpolation='nearest', aspect='auto') 114 | plt.show() 115 | 116 | -------------------------------------------------------------------------------- /vocoder/distribution.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn.functional as F 4 | 5 | 6 | def log_sum_exp(x): 7 | """ numerically stable log_sum_exp implementation that prevents overflow """ 8 | # TF ordering 9 | axis = len(x.size()) - 1 10 | m, _ = torch.max(x, dim=axis) 11 | m2, _ = torch.max(x, dim=axis, keepdim=True) 12 | return m + torch.log(torch.sum(torch.exp(x - m2), dim=axis)) 13 | 14 | 15 | # It is adapted from https://github.com/r9y9/wavenet_vocoder/blob/master/wavenet_vocoder/mixture.py 16 | def discretized_mix_logistic_loss(y_hat, y, num_classes=65536, 17 | log_scale_min=None, reduce=True): 18 | if log_scale_min is None: 19 | log_scale_min = float(np.log(1e-14)) 20 | y_hat = y_hat.permute(0,2,1) 21 | assert y_hat.dim() == 3 22 | assert y_hat.size(1) % 3 == 0 23 | nr_mix = y_hat.size(1) // 3 24 | 25 | # (B x T x C) 26 | y_hat = y_hat.transpose(1, 2) 27 | 28 | # unpack parameters. (B, T, num_mixtures) x 3 29 | logit_probs = y_hat[:, :, :nr_mix] 30 | means = y_hat[:, :, nr_mix:2 * nr_mix] 31 | log_scales = torch.clamp(y_hat[:, :, 2 * nr_mix:3 * nr_mix], min=log_scale_min) 32 | 33 | # B x T x 1 -> B x T x num_mixtures 34 | y = y.expand_as(means) 35 | 36 | centered_y = y - means 37 | inv_stdv = torch.exp(-log_scales) 38 | plus_in = inv_stdv * (centered_y + 1. / (num_classes - 1)) 39 | cdf_plus = torch.sigmoid(plus_in) 40 | min_in = inv_stdv * (centered_y - 1. 
/ (num_classes - 1)) 41 | cdf_min = torch.sigmoid(min_in) 42 | 43 | # log probability for edge case of 0 (before scaling) 44 | # equivalent: torch.log(F.sigmoid(plus_in)) 45 | log_cdf_plus = plus_in - F.softplus(plus_in) 46 | 47 | # log probability for edge case of 255 (before scaling) 48 | # equivalent: (1 - F.sigmoid(min_in)).log() 49 | log_one_minus_cdf_min = -F.softplus(min_in) 50 | 51 | # probability for all other cases 52 | cdf_delta = cdf_plus - cdf_min 53 | 54 | mid_in = inv_stdv * centered_y 55 | # log probability in the center of the bin, to be used in extreme cases 56 | # (not actually used in our code) 57 | log_pdf_mid = mid_in - log_scales - 2. * F.softplus(mid_in) 58 | 59 | # tf equivalent 60 | """ 61 | log_probs = tf.where(x < -0.999, log_cdf_plus, 62 | tf.where(x > 0.999, log_one_minus_cdf_min, 63 | tf.where(cdf_delta > 1e-5, 64 | tf.log(tf.maximum(cdf_delta, 1e-12)), 65 | log_pdf_mid - np.log(127.5)))) 66 | """ 67 | # TODO: cdf_delta <= 1e-5 actually can happen. How can we choose the value 68 | # for num_classes=65536 case? 1e-7? not sure.. 69 | inner_inner_cond = (cdf_delta > 1e-5).float() 70 | 71 | inner_inner_out = inner_inner_cond * \ 72 | torch.log(torch.clamp(cdf_delta, min=1e-12)) + \ 73 | (1. - inner_inner_cond) * (log_pdf_mid - np.log((num_classes - 1) / 2)) 74 | inner_cond = (y > 0.999).float() 75 | inner_out = inner_cond * log_one_minus_cdf_min + (1. - inner_cond) * inner_inner_out 76 | cond = (y < -0.999).float() 77 | log_probs = cond * log_cdf_plus + (1. - cond) * inner_out 78 | 79 | log_probs = log_probs + F.log_softmax(logit_probs, -1) 80 | 81 | if reduce: 82 | return -torch.mean(log_sum_exp(log_probs)) 83 | else: 84 | return -log_sum_exp(log_probs).unsqueeze(-1) 85 | 86 | 87 | def sample_from_discretized_mix_logistic(y, log_scale_min=None): 88 | """ 89 | Sample from discretized mixture of logistic distributions 90 | Args: 91 | y (Tensor): B x C x T 92 | log_scale_min (float): Log scale minimum value 93 | Returns: 94 | Tensor: sample in range of [-1, 1]. 95 | """ 96 | if log_scale_min is None: 97 | log_scale_min = float(np.log(1e-14)) 98 | assert y.size(1) % 3 == 0 99 | nr_mix = y.size(1) // 3 100 | 101 | # B x T x C 102 | y = y.transpose(1, 2) 103 | logit_probs = y[:, :, :nr_mix] 104 | 105 | # sample mixture indicator from softmax 106 | temp = logit_probs.data.new(logit_probs.size()).uniform_(1e-5, 1.0 - 1e-5) 107 | temp = logit_probs.data - torch.log(- torch.log(temp)) 108 | _, argmax = temp.max(dim=-1) 109 | 110 | # (B, T) -> (B, T, nr_mix) 111 | one_hot = to_one_hot(argmax, nr_mix) 112 | # select logistic parameters 113 | means = torch.sum(y[:, :, nr_mix:2 * nr_mix] * one_hot, dim=-1) 114 | log_scales = torch.clamp(torch.sum( 115 | y[:, :, 2 * nr_mix:3 * nr_mix] * one_hot, dim=-1), min=log_scale_min) 116 | # sample from logistic & clip to interval 117 | # we don't actually round to the nearest 8bit value when sampling 118 | u = means.data.new(means.size()).uniform_(1e-5, 1.0 - 1e-5) 119 | x = means + torch.exp(log_scales) * (torch.log(u) - torch.log(1. - u)) 120 | 121 | x = torch.clamp(torch.clamp(x, min=-1.), max=1.) 
122 | 
123 |     return x
124 | 
125 | 
126 | def to_one_hot(tensor, n, fill_with=1.):
127 |     # we perform one hot encoding with respect to the last axis
128 |     one_hot = torch.FloatTensor(tensor.size() + (n,)).zero_()
129 |     if tensor.is_cuda:
130 |         one_hot = one_hot.cuda()
131 |     one_hot.scatter_(len(tensor.size()), tensor.unsqueeze(-1), fill_with)
132 |     return one_hot
133 | 
--------------------------------------------------------------------------------
/vocoder/gen_wavernn.py:
--------------------------------------------------------------------------------
1 | from vocoder.models.fatchord_version import WaveRNN
2 | from vocoder.audio import *
3 | 
4 | 
5 | def gen_testset(model: WaveRNN, test_set, samples, batched, target, overlap, save_path):
6 |     k = model.get_step() // 1000
7 | 
8 |     for i, (m, x) in enumerate(test_set, 1):
9 |         if i > samples:
10 |             break
11 | 
12 |         print('\n| Generating: %i/%i' % (i, samples))
13 | 
14 |         x = x[0].numpy()
15 | 
16 |         bits = 16 if hp.voc_mode == 'MOL' else hp.bits
17 | 
18 |         if hp.mu_law and hp.voc_mode != 'MOL' :
19 |             x = decode_mu_law(x, 2**bits, from_labels=True)
20 |         else :
21 |             x = label_2_float(x, bits)
22 | 
23 |         save_wav(x, save_path.joinpath("%dk_steps_%d_target.wav" % (k, i)))
24 | 
25 |         batch_str = "gen_batched_target%d_overlap%d" % (target, overlap) if batched else \
26 |             "gen_not_batched"
27 |         save_str = save_path.joinpath("%dk_steps_%d_%s.wav" % (k, i, batch_str))
28 | 
29 |         wav = model.generate(m, batched, target, overlap, hp.mu_law)
30 |         save_wav(wav, save_str)
31 | 
32 | 
--------------------------------------------------------------------------------
/vocoder/hparams.py:
--------------------------------------------------------------------------------
1 | from synthesizer.hparams import hparams as _syn_hp
2 | 
3 | 
4 | # Audio settings------------------------------------------------------------------------
5 | # Match the values of the synthesizer
6 | sample_rate = _syn_hp.sample_rate
7 | n_fft = _syn_hp.n_fft
8 | num_mels = _syn_hp.num_mels
9 | hop_length = _syn_hp.hop_size
10 | win_length = _syn_hp.win_size
11 | fmin = _syn_hp.fmin
12 | min_level_db = _syn_hp.min_level_db
13 | ref_level_db = _syn_hp.ref_level_db
14 | mel_max_abs_value = _syn_hp.max_abs_value
15 | preemphasis = _syn_hp.preemphasis
16 | apply_preemphasis = _syn_hp.preemphasize
17 | 
18 | bits = 9                            # bit depth of signal
19 | mu_law = True                       # Recommended to suppress noise if using raw bits in hp.voc_mode
20 |                                     # below
21 | 
22 | 
23 | # WAVERNN / VOCODER --------------------------------------------------------------------------------
24 | voc_mode = 'RAW'                    # either 'RAW' (softmax on raw bits) or 'MOL' (sample from
25 |                                     # mixture of logistics)
26 | voc_upsample_factors = (5, 5, 8)    # NB - this needs to correctly factorise hop_length
27 | voc_rnn_dims = 512
28 | voc_fc_dims = 512
29 | voc_compute_dims = 128
30 | voc_res_out_dims = 128
31 | voc_res_blocks = 10
32 | 
33 | # Training
34 | voc_batch_size = 100
35 | voc_lr = 1e-4
36 | voc_gen_at_checkpoint = 5           # number of samples to generate at each checkpoint
37 | voc_pad = 2                         # this will pad the input so that the resnet can 'see' wider
38 |                                     # than input length
39 | voc_seq_len = hop_length * 5        # must be a multiple of hop_length
40 | 
41 | # Generating / Synthesizing
42 | voc_gen_batched = True              # very fast (realtime+) single utterance batched generation
43 | voc_target = 8000                   # target number of samples to be generated in each batch entry
44 | voc_overlap = 400                   # number of samples for crossfading between batches
45 | 
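A minimal sanity check for these settings (a sketch, not part of the original file): vocoder/train.py later asserts that the upsample factors multiply out to the hop length inherited from the synthesizer, and the same check can be run on its own before launching a long training job.

import numpy as np
import vocoder.hparams as hp

# The conditioning mel frames are upsampled by the product of these factors,
# so (5, 5, 8) must factorise the synthesizer's hop length exactly.
assert np.prod(hp.voc_upsample_factors) == hp.hop_length, \
    "voc_upsample_factors must multiply to hop_length"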
--------------------------------------------------------------------------------
/vocoder/inference.py:
--------------------------------------------------------------------------------
1 | from vocoder.models.fatchord_version import WaveRNN
2 | from vocoder import hparams as hp
3 | import torch
4 | 
5 | 
6 | _model = None   # type: WaveRNN
7 | 
8 | def load_model(weights_fpath, verbose=True):
9 |     global _model
10 | 
11 |     if verbose:
12 |         print("Building Wave-RNN")
13 |     _model = WaveRNN(
14 |         rnn_dims=hp.voc_rnn_dims,
15 |         fc_dims=hp.voc_fc_dims,
16 |         bits=hp.bits,
17 |         pad=hp.voc_pad,
18 |         upsample_factors=hp.voc_upsample_factors,
19 |         feat_dims=hp.num_mels,
20 |         compute_dims=hp.voc_compute_dims,
21 |         res_out_dims=hp.voc_res_out_dims,
22 |         res_blocks=hp.voc_res_blocks,
23 |         hop_length=hp.hop_length,
24 |         sample_rate=hp.sample_rate,
25 |         mode=hp.voc_mode
26 |     ).cuda()
27 | 
28 |     if verbose:
29 |         print("Loading model weights at %s" % weights_fpath)
30 |     checkpoint = torch.load(weights_fpath)
31 |     _model.load_state_dict(checkpoint['model_state'])
32 |     _model.eval()
33 | 
34 | 
35 | def is_loaded():
36 |     return _model is not None
37 | 
38 | 
39 | def infer_waveform(mel, normalize=True, batched=True, target=8000, overlap=800, 
40 |                    progress_callback=None):
41 |     """
42 |     Infers the waveform of a mel spectrogram output by the synthesizer (the format must match 
43 |     that of the synthesizer!)
44 |     
45 |     :param normalize: whether to rescale the mel by 1 / hp.mel_max_abs_value before vocoding
46 |     :param batched: whether to use the faster batched generation of WaveRNN
47 |     :param target: number of samples to generate in each batch entry when batched
48 |     :param overlap: number of samples shared between consecutive batch entries, used for crossfading
49 |     :return: the generated waveform
50 |     """
51 |     if _model is None:
52 |         raise Exception("Please load Wave-RNN in memory before using it")
53 | 
54 |     if normalize:
55 |         mel = mel / hp.mel_max_abs_value
56 |     mel = torch.from_numpy(mel[None, ...])
57 |     wav = _model.generate(mel, batched, target, overlap, hp.mu_law, progress_callback)
58 |     return wav
59 | 
--------------------------------------------------------------------------------
/vocoder/models/deepmind_version.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | from utils.display import *
5 | from utils.dsp import *
6 | 
7 | 
8 | class WaveRNN(nn.Module) :
9 |     def __init__(self, hidden_size=896, quantisation=256) :
10 |         super(WaveRNN, self).__init__()
11 | 
12 |         self.hidden_size = hidden_size
13 |         self.split_size = hidden_size // 2
14 | 
15 |         # The main matmul
16 |         self.R = nn.Linear(self.hidden_size, 3 * self.hidden_size, bias=False)
17 | 
18 |         # Output fc layers
19 |         self.O1 = nn.Linear(self.split_size, self.split_size)
20 |         self.O2 = nn.Linear(self.split_size, quantisation)
21 |         self.O3 = nn.Linear(self.split_size, self.split_size)
22 |         self.O4 = nn.Linear(self.split_size, quantisation)
23 | 
24 |         # Input fc layers
25 |         self.I_coarse = nn.Linear(2, 3 * self.split_size, bias=False)
26 |         self.I_fine = nn.Linear(3, 3 * self.split_size, bias=False)
27 | 
28 |         # biases for the gates
29 |         self.bias_u = nn.Parameter(torch.zeros(self.hidden_size))
30 |         self.bias_r = nn.Parameter(torch.zeros(self.hidden_size))
31 |         self.bias_e = nn.Parameter(torch.zeros(self.hidden_size))
32 | 
33 |         # display num params
34 |         self.num_params()
35 | 
36 | 
37 |     def forward(self, prev_y, prev_hidden, current_coarse) :
38 | 
39 |         # Main matmul - the projection is split 3 ways
40 |         R_hidden = self.R(prev_hidden)
41 |         R_u, R_r, R_e, = torch.split(R_hidden, self.hidden_size, dim=1)
42 | 
43 |         # Project the prev input
44 |         coarse_input_proj = self.I_coarse(prev_y)
45 |         I_coarse_u, I_coarse_r, I_coarse_e = \
46 |             torch.split(coarse_input_proj, self.split_size, dim=1)
47 | 
48 |         # Project the prev input and current coarse sample
49 |         fine_input = torch.cat([prev_y, current_coarse], dim=1)
50 |         fine_input_proj = self.I_fine(fine_input)
51 |         I_fine_u, I_fine_r, I_fine_e = \
52 |             torch.split(fine_input_proj, self.split_size, dim=1)
53 | 
54 |         # concatenate for the gates
55 |         I_u = torch.cat([I_coarse_u, I_fine_u], dim=1)
56 |         I_r = torch.cat([I_coarse_r, I_fine_r], dim=1)
57 |         I_e = torch.cat([I_coarse_e, I_fine_e], dim=1)
58 | 
59 |         # Compute all gates for coarse and fine
60 |         u = F.sigmoid(R_u + I_u + self.bias_u)
61 |         r = F.sigmoid(R_r + I_r + self.bias_r)
62 |         e = F.tanh(r * R_e + I_e + self.bias_e)
63 |         hidden = u * prev_hidden + (1. - u) * e
64 | 
65 |         # Split the hidden state
66 |         hidden_coarse, hidden_fine = torch.split(hidden, self.split_size, dim=1)
67 | 
68 |         # Compute outputs
69 |         out_coarse = self.O2(F.relu(self.O1(hidden_coarse)))
70 |         out_fine = self.O4(F.relu(self.O3(hidden_fine)))
71 | 
72 |         return out_coarse, out_fine, hidden
73 | 
74 | 
75 |     def generate(self, seq_len):
76 |         with torch.no_grad():
77 |             # First split up the biases for the gates
78 |             b_coarse_u, b_fine_u = torch.split(self.bias_u, self.split_size)
79 |             b_coarse_r, b_fine_r = torch.split(self.bias_r, self.split_size)
80 |             b_coarse_e, b_fine_e = torch.split(self.bias_e, self.split_size)
81 | 
82 |             # Lists for the two output seqs
83 |             c_outputs, f_outputs = [], []
84 | 
85 |             # Some initial inputs
86 |             out_coarse = torch.LongTensor([0]).cuda()
87 |             out_fine = torch.LongTensor([0]).cuda()
88 | 
89 |             # We'll need a hidden state
90 |             hidden = self.init_hidden()
91 | 
92 |             # Need a clock for display
93 |             start = time.time()
94 | 
95 |             # Loop for generation
96 |             for i in range(seq_len) :
97 | 
98 |                 # Split into two hidden states
99 |                 hidden_coarse, hidden_fine = \
100 |                     torch.split(hidden, self.split_size, dim=1)
101 | 
102 |                 # Scale and concat previous predictions
103 |                 out_coarse = out_coarse.unsqueeze(0).float() / 127.5 - 1.
104 |                 out_fine = out_fine.unsqueeze(0).float() / 127.5 - 1.
105 |                 prev_outputs = torch.cat([out_coarse, out_fine], dim=1)
106 | 
107 |                 # Project input
108 |                 coarse_input_proj = self.I_coarse(prev_outputs)
109 |                 I_coarse_u, I_coarse_r, I_coarse_e = \
110 |                     torch.split(coarse_input_proj, self.split_size, dim=1)
111 | 
112 |                 # Project hidden state and split 6 ways
113 |                 R_hidden = self.R(hidden)
114 |                 R_coarse_u , R_fine_u, \
115 |                 R_coarse_r, R_fine_r, \
116 |                 R_coarse_e, R_fine_e = torch.split(R_hidden, self.split_size, dim=1)
117 | 
118 |                 # Compute the coarse gates
119 |                 u = F.sigmoid(R_coarse_u + I_coarse_u + b_coarse_u)
120 |                 r = F.sigmoid(R_coarse_r + I_coarse_r + b_coarse_r)
121 |                 e = F.tanh(r * R_coarse_e + I_coarse_e + b_coarse_e)
122 |                 hidden_coarse = u * hidden_coarse + (1. - u) * e
123 | 
124 |                 # Compute the coarse output
125 |                 out_coarse = self.O2(F.relu(self.O1(hidden_coarse)))
126 |                 posterior = F.softmax(out_coarse, dim=1)
127 |                 distrib = torch.distributions.Categorical(posterior)
128 |                 out_coarse = distrib.sample()
129 |                 c_outputs.append(out_coarse)
130 | 
131 |                 # Project the [prev outputs and predicted coarse sample]
132 |                 coarse_pred = out_coarse.float() / 127.5 - 1.
133 |                 fine_input = torch.cat([prev_outputs, coarse_pred.unsqueeze(0)], dim=1)
134 |                 fine_input_proj = self.I_fine(fine_input)
135 |                 I_fine_u, I_fine_r, I_fine_e = \
136 |                     torch.split(fine_input_proj, self.split_size, dim=1)
137 | 
138 |                 # Compute the fine gates
139 |                 u = F.sigmoid(R_fine_u + I_fine_u + b_fine_u)
140 |                 r = F.sigmoid(R_fine_r + I_fine_r + b_fine_r)
141 |                 e = F.tanh(r * R_fine_e + I_fine_e + b_fine_e)
142 |                 hidden_fine = u * hidden_fine + (1. - u) * e
143 | 
144 |                 # Compute the fine output
145 |                 out_fine = self.O4(F.relu(self.O3(hidden_fine)))
146 |                 posterior = F.softmax(out_fine, dim=1)
147 |                 distrib = torch.distributions.Categorical(posterior)
148 |                 out_fine = distrib.sample()
149 |                 f_outputs.append(out_fine)
150 | 
151 |                 # Put the hidden state back together
152 |                 hidden = torch.cat([hidden_coarse, hidden_fine], dim=1)
153 | 
154 |                 # Display progress
155 |                 speed = (i + 1) / (time.time() - start)
156 |                 stream('Gen: %i/%i -- Speed: %i' % (i + 1, seq_len, speed))
157 | 
158 |             coarse = torch.stack(c_outputs).squeeze(1).cpu().data.numpy()
159 |             fine = torch.stack(f_outputs).squeeze(1).cpu().data.numpy()
160 |             output = combine_signal(coarse, fine)
161 | 
162 |         return output, coarse, fine
163 | 
164 |     def init_hidden(self, batch_size=1) :
165 |         return torch.zeros(batch_size, self.hidden_size).cuda()
166 | 
167 |     def num_params(self) :
168 |         parameters = filter(lambda p: p.requires_grad, self.parameters())
169 |         parameters = sum([np.prod(p.size()) for p in parameters]) / 1_000_000
170 |         print('Trainable Parameters: %.3f million' % parameters)
--------------------------------------------------------------------------------
/vocoder/train.py:
--------------------------------------------------------------------------------
1 | from vocoder.models.fatchord_version import WaveRNN
2 | from vocoder.vocoder_dataset import VocoderDataset, collate_vocoder
3 | from vocoder.distribution import discretized_mix_logistic_loss
4 | from vocoder.display import stream, simple_table
5 | from vocoder.gen_wavernn import gen_testset
6 | from torch.utils.data import DataLoader
7 | from pathlib import Path
8 | from torch import optim
9 | import torch.nn.functional as F
10 | import vocoder.hparams as hp
11 | import numpy as np
12 | import time
13 | 
14 | 
15 | def train(run_id: str, syn_dir: Path, voc_dir: Path, models_dir: Path, ground_truth: bool,
16 |           save_every: int, backup_every: int, force_restart: bool):
17 |     # Check to make sure the hop length is correctly factorised
18 |     assert np.cumprod(hp.voc_upsample_factors)[-1] == hp.hop_length
19 | 
20 |     # Instantiate the model
21 |     print("Initializing the model...")
22 |     model = WaveRNN(
23 |         rnn_dims=hp.voc_rnn_dims,
24 |         fc_dims=hp.voc_fc_dims,
25 |         bits=hp.bits,
26 |         pad=hp.voc_pad,
27 |         upsample_factors=hp.voc_upsample_factors,
28 |         feat_dims=hp.num_mels,
29 |         compute_dims=hp.voc_compute_dims,
30 |         res_out_dims=hp.voc_res_out_dims,
31 |         res_blocks=hp.voc_res_blocks,
32 |         hop_length=hp.hop_length,
33 |         sample_rate=hp.sample_rate,
34 |         mode=hp.voc_mode
35 |     ).cuda()
36 | 
37 |     # Initialize the optimizer
38 |     optimizer = optim.Adam(model.parameters())
39 |     for p in optimizer.param_groups:
40 |         p["lr"] = hp.voc_lr
41 |     loss_func = F.cross_entropy if model.mode == "RAW" else discretized_mix_logistic_loss
42 | 
43 |     # Load the weights
44 |     model_dir = models_dir.joinpath(run_id)
45 |     model_dir.mkdir(exist_ok=True)
46 |     weights_fpath = model_dir.joinpath(run_id + ".pt")
47 |     if force_restart or not weights_fpath.exists():
48 |         print("\nStarting the training of WaveRNN from
scratch\n") 49 | model.save(weights_fpath, optimizer) 50 | else: 51 | print("\nLoading weights at %s" % weights_fpath) 52 | model.load(weights_fpath, optimizer) 53 | print("WaveRNN weights loaded from step %d" % model.step) 54 | 55 | # Initialize the dataset 56 | metadata_fpath = syn_dir.joinpath("train.txt") if ground_truth else \ 57 | voc_dir.joinpath("synthesized.txt") 58 | mel_dir = syn_dir.joinpath("mels") if ground_truth else voc_dir.joinpath("mels_gta") 59 | wav_dir = syn_dir.joinpath("audio") 60 | dataset = VocoderDataset(metadata_fpath, mel_dir, wav_dir) 61 | test_loader = DataLoader(dataset, 62 | batch_size=1, 63 | shuffle=True, 64 | pin_memory=True) 65 | 66 | # Begin the training 67 | simple_table([('Batch size', hp.voc_batch_size), 68 | ('LR', hp.voc_lr), 69 | ('Sequence Len', hp.voc_seq_len)]) 70 | 71 | for epoch in range(1, 350): 72 | data_loader = DataLoader(dataset, 73 | collate_fn=collate_vocoder, 74 | batch_size=hp.voc_batch_size, 75 | num_workers=2, 76 | shuffle=True, 77 | pin_memory=True) 78 | start = time.time() 79 | running_loss = 0. 80 | 81 | for i, (x, y, m) in enumerate(data_loader, 1): 82 | x, m, y = x.cuda(), m.cuda(), y.cuda() 83 | 84 | # Forward pass 85 | y_hat = model(x, m) 86 | if model.mode == 'RAW': 87 | y_hat = y_hat.transpose(1, 2).unsqueeze(-1) 88 | elif model.mode == 'MOL': 89 | y = y.float() 90 | y = y.unsqueeze(-1) 91 | 92 | # Backward pass 93 | loss = loss_func(y_hat, y) 94 | optimizer.zero_grad() 95 | loss.backward() 96 | optimizer.step() 97 | 98 | running_loss += loss.item() 99 | speed = i / (time.time() - start) 100 | avg_loss = running_loss / i 101 | 102 | step = model.get_step() 103 | k = step // 1000 104 | 105 | if backup_every != 0 and step % backup_every == 0 : 106 | model.checkpoint(model_dir, optimizer) 107 | 108 | if save_every != 0 and step % save_every == 0 : 109 | model.save(weights_fpath, optimizer) 110 | 111 | msg = f"| Epoch: {epoch} ({i}/{len(data_loader)}) | " \ 112 | f"Loss: {avg_loss:.4f} | {speed:.1f} " \ 113 | f"steps/s | Step: {k}k | " 114 | stream(msg) 115 | 116 | 117 | gen_testset(model, test_loader, hp.voc_gen_at_checkpoint, hp.voc_gen_batched, 118 | hp.voc_target, hp.voc_overlap, model_dir) 119 | print("") 120 | -------------------------------------------------------------------------------- /vocoder/vocoder_dataset.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import Dataset 2 | from pathlib import Path 3 | from vocoder import audio 4 | import vocoder.hparams as hp 5 | import numpy as np 6 | import torch 7 | 8 | 9 | class VocoderDataset(Dataset): 10 | def __init__(self, metadata_fpath: Path, mel_dir: Path, wav_dir: Path): 11 | print("Using inputs from:\n\t%s\n\t%s\n\t%s" % (metadata_fpath, mel_dir, wav_dir)) 12 | 13 | with metadata_fpath.open("r") as metadata_file: 14 | metadata = [line.split("|") for line in metadata_file] 15 | 16 | gta_fnames = [x[1] for x in metadata if int(x[4])] 17 | gta_fpaths = [mel_dir.joinpath(fname) for fname in gta_fnames] 18 | wav_fnames = [x[0] for x in metadata if int(x[4])] 19 | wav_fpaths = [wav_dir.joinpath(fname) for fname in wav_fnames] 20 | self.samples_fpaths = list(zip(gta_fpaths, wav_fpaths)) 21 | 22 | print("Found %d samples" % len(self.samples_fpaths)) 23 | 24 | def __getitem__(self, index): 25 | mel_path, wav_path = self.samples_fpaths[index] 26 | 27 | # Load the mel spectrogram and adjust its range to [-1, 1] 28 | mel = np.load(mel_path).T.astype(np.float32) / hp.mel_max_abs_value 29 | 30 | # Load the wav 31 | wav 
= np.load(wav_path) 32 | if hp.apply_preemphasis: 33 | wav = audio.pre_emphasis(wav) 34 | wav = np.clip(wav, -1, 1) 35 | 36 | # Fix for missing padding # TODO: settle on whether this is any useful 37 | r_pad = (len(wav) // hp.hop_length + 1) * hp.hop_length - len(wav) 38 | wav = np.pad(wav, (0, r_pad), mode='constant') 39 | assert len(wav) >= mel.shape[1] * hp.hop_length 40 | wav = wav[:mel.shape[1] * hp.hop_length] 41 | assert len(wav) % hp.hop_length == 0 42 | 43 | # Quantize the wav 44 | if hp.voc_mode == 'RAW': 45 | if hp.mu_law: 46 | quant = audio.encode_mu_law(wav, mu=2 ** hp.bits) 47 | else: 48 | quant = audio.float_2_label(wav, bits=hp.bits) 49 | elif hp.voc_mode == 'MOL': 50 | quant = audio.float_2_label(wav, bits=16) 51 | 52 | return mel.astype(np.float32), quant.astype(np.int64) 53 | 54 | def __len__(self): 55 | return len(self.samples_fpaths) 56 | 57 | 58 | def collate_vocoder(batch): 59 | mel_win = hp.voc_seq_len // hp.hop_length + 2 * hp.voc_pad 60 | max_offsets = [x[0].shape[-1] -2 - (mel_win + 2 * hp.voc_pad) for x in batch] 61 | mel_offsets = [np.random.randint(0, offset) for offset in max_offsets] 62 | sig_offsets = [(offset + hp.voc_pad) * hp.hop_length for offset in mel_offsets] 63 | 64 | mels = [x[0][:, mel_offsets[i]:mel_offsets[i] + mel_win] for i, x in enumerate(batch)] 65 | 66 | labels = [x[1][sig_offsets[i]:sig_offsets[i] + hp.voc_seq_len + 1] for i, x in enumerate(batch)] 67 | 68 | mels = np.stack(mels).astype(np.float32) 69 | labels = np.stack(labels).astype(np.int64) 70 | 71 | mels = torch.tensor(mels) 72 | labels = torch.tensor(labels).long() 73 | 74 | x = labels[:, :hp.voc_seq_len] 75 | y = labels[:, 1:] 76 | 77 | bits = 16 if hp.voc_mode == 'MOL' else hp.bits 78 | 79 | x = audio.label_2_float(x.float(), bits) 80 | 81 | if hp.voc_mode == 'MOL' : 82 | y = audio.label_2_float(y.float(), bits) 83 | 84 | return x, y, mels -------------------------------------------------------------------------------- /vocoder_preprocess.py: -------------------------------------------------------------------------------- 1 | from synthesizer.synthesize import run_synthesis 2 | from synthesizer.hparams import hparams 3 | from utils.argutils import print_args 4 | import argparse 5 | import os 6 | 7 | 8 | if __name__ == "__main__": 9 | class MyFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter): 10 | pass 11 | 12 | parser = argparse.ArgumentParser( 13 | description="Creates ground-truth aligned (GTA) spectrograms from the vocoder.", 14 | formatter_class=MyFormatter 15 | ) 16 | parser.add_argument("datasets_root", type=str, help=\ 17 | "Path to the directory containing your SV2TTS directory. If you specify both --in_dir and " 18 | "--out_dir, this argument won't be used.") 19 | parser.add_argument("--model_dir", type=str, 20 | default="synthesizer/saved_models/logs-pretrained/", help=\ 21 | "Path to the pretrained model directory.") 22 | parser.add_argument("-i", "--in_dir", type=str, default=argparse.SUPPRESS, help= \ 23 | "Path to the synthesizer directory that contains the mel spectrograms, the wavs and the " 24 | "embeds. Defaults to /SV2TTS/synthesizer/.") 25 | parser.add_argument("-o", "--out_dir", type=str, default=argparse.SUPPRESS, help= \ 26 | "Path to the output vocoder directory that will contain the ground truth aligned mel " 27 | "spectrograms. 
Defaults to /SV2TTS/vocoder/.") 28 | parser.add_argument("--hparams", default="", 29 | help="Hyperparameter overrides as a comma-separated list of name=value " 30 | "pairs") 31 | args = parser.parse_args() 32 | print_args(args, parser) 33 | modified_hp = hparams.parse(args.hparams) 34 | 35 | if not hasattr(args, "in_dir"): 36 | args.in_dir = os.path.join(args.datasets_root, "SV2TTS", "synthesizer") 37 | if not hasattr(args, "out_dir"): 38 | args.out_dir = os.path.join(args.datasets_root, "SV2TTS", "vocoder") 39 | 40 | run_synthesis(args.in_dir, args.out_dir, args.model_dir, modified_hp) 41 | -------------------------------------------------------------------------------- /vocoder_train.py: -------------------------------------------------------------------------------- 1 | from utils.argutils import print_args 2 | from vocoder.train import train 3 | from pathlib import Path 4 | import argparse 5 | 6 | 7 | if __name__ == "__main__": 8 | parser = argparse.ArgumentParser( 9 | description="Trains the vocoder from the synthesizer audios and the GTA synthesized mels, " 10 | "or ground truth mels.", 11 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 12 | ) 13 | 14 | parser.add_argument("run_id", type=str, help= \ 15 | "Name for this model instance. If a model state from the same run ID was previously " 16 | "saved, the training will restart from there. Pass -f to overwrite saved states and " 17 | "restart from scratch.") 18 | parser.add_argument("datasets_root", type=str, help= \ 19 | "Path to the directory containing your SV2TTS directory. Specifying --syn_dir or --voc_dir " 20 | "will take priority over this argument.") 21 | parser.add_argument("--syn_dir", type=str, default=argparse.SUPPRESS, help= \ 22 | "Path to the synthesizer directory that contains the ground truth mel spectrograms, " 23 | "the wavs and the embeds. Defaults to /SV2TTS/synthesizer/.") 24 | parser.add_argument("--voc_dir", type=str, default=argparse.SUPPRESS, help= \ 25 | "Path to the vocoder directory that contains the GTA synthesized mel spectrograms. " 26 | "Defaults to /SV2TTS/vocoder/. Unused if --ground_truth is passed.") 27 | parser.add_argument("-m", "--models_dir", type=str, default="vocoder/saved_models/", help=\ 28 | "Path to the directory that will contain the saved model weights, as well as backups " 29 | "of those weights and wavs generated during training.") 30 | parser.add_argument("-g", "--ground_truth", action="store_true", help= \ 31 | "Train on ground truth spectrograms (/SV2TTS/synthesizer/mels).") 32 | parser.add_argument("-s", "--save_every", type=int, default=1000, help= \ 33 | "Number of steps between updates of the model on the disk. Set to 0 to never save the " 34 | "model.") 35 | parser.add_argument("-b", "--backup_every", type=int, default=25000, help= \ 36 | "Number of steps between backups of the model. 
Set to 0 to never make backups of the " 37 | "model.") 38 | parser.add_argument("-f", "--force_restart", action="store_true", help= \ 39 | "Do not load any saved model and restart from scratch.") 40 | args = parser.parse_args() 41 | 42 | # Process the arguments 43 | if not hasattr(args, "syn_dir"): 44 | args.syn_dir = Path(args.datasets_root, "SV2TTS", "synthesizer") 45 | args.syn_dir = Path(args.syn_dir) 46 | if not hasattr(args, "voc_dir"): 47 | args.voc_dir = Path(args.datasets_root, "SV2TTS", "vocoder") 48 | args.voc_dir = Path(args.voc_dir) 49 | del args.datasets_root 50 | args.models_dir = Path(args.models_dir) 51 | args.models_dir.mkdir(exist_ok=True) 52 | 53 | # Run the training 54 | print_args(args, parser) 55 | train(**vars(args)) 56 | --------------------------------------------------------------------------------
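To tie the two entry points above together, here is a minimal sketch of the intended workflow; datasets_root and my_run are placeholder names, not paths taken from the repository. vocoder_preprocess.py first writes ground-truth aligned (GTA) mels under <datasets_root>/SV2TTS/vocoder, after which training can be started either through vocoder_train.py or by calling vocoder.train.train directly with the same defaults. A CUDA-capable GPU is required, since the model is built with .cuda().

from pathlib import Path
from vocoder.train import train

datasets_root = Path("datasets_root")        # placeholder: your SV2TTS datasets root
models_dir = Path("vocoder/saved_models/")   # same default as vocoder_train.py
models_dir.mkdir(exist_ok=True)

# Mirrors the defaults of vocoder_train.py: GTA mels from SV2TTS/vocoder,
# audio and metadata from SV2TTS/synthesizer.
train(run_id="my_run",
      syn_dir=datasets_root.joinpath("SV2TTS", "synthesizer"),
      voc_dir=datasets_root.joinpath("SV2TTS", "vocoder"),
      models_dir=models_dir,
      ground_truth=False,
      save_every=1000,
      backup_every=25000,
      force_restart=False)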