├── .gitattributes ├── .github └── ISSUE_TEMPLATE │ ├── any-issue.md │ └── bug-report.md ├── .gitignore ├── LICENSE.txt ├── README.md ├── demo_cli.py ├── demo_toolbox.py ├── demo_toolbox_collab.ipynb ├── encoder ├── __init__.py ├── audio.py ├── config.py ├── data_objects │ ├── __init__.py │ ├── random_cycler.py │ ├── speaker.py │ ├── speaker_batch.py │ ├── speaker_verification_dataset.py │ └── utterance.py ├── inference.py ├── model.py ├── params_data.py ├── params_model.py ├── preprocess.py ├── train.py └── visualizations.py ├── encoder_preprocess.py ├── encoder_train.py ├── requirements.txt ├── synthesizer ├── LICENSE.txt ├── __init__.py ├── audio.py ├── feeder.py ├── hparams.py ├── inference.py ├── infolog.py ├── models │ ├── __init__.py │ ├── architecture_wrappers.py │ ├── attention.py │ ├── custom_decoder.py │ ├── helpers.py │ ├── modules.py │ └── tacotron.py ├── preprocess.py ├── synthesize.py ├── tacotron2.py ├── train.py └── utils │ ├── __init__.py │ ├── _cmudict.py │ ├── cleaners.py │ ├── numbers.py │ ├── plot.py │ ├── symbols.py │ └── text.py ├── synthesizer_preprocess_audio.py ├── synthesizer_preprocess_embeds.py ├── synthesizer_train.py ├── toolbox ├── __init__.py ├── ui.py └── utterance.py ├── utils ├── __init__.py ├── argutils.py ├── logmmse.py └── profiler.py ├── vocoder ├── LICENSE.txt ├── audio.py ├── display.py ├── distribution.py ├── gen_wavernn.py ├── hparams.py ├── inference.py ├── models │ ├── deepmind_version.py │ └── fatchord_version.py ├── train.py └── vocoder_dataset.py ├── vocoder_preprocess.py └── vocoder_train.py /.gitattributes: -------------------------------------------------------------------------------- 1 | *.ipynb linguist-vendored 2 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/any-issue.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Any issue 3 | about: Any issue 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | I am not maintaining this repo anymore (I explain why in the readme). 11 | I keep issues open only because some old ones are useful. 12 | I will not assist you in any way. 13 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug-report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: '' 3 | about: Any issue 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | I am not maintaining this repo anymore (I explain why in the readme). 11 | I keep issues open only because some old ones are useful. 12 | I will not assist you in any way. 
13 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.aux 3 | *.log 4 | *.out 5 | *.synctex.gz 6 | *.suo 7 | *__pycache__ 8 | *.idea 9 | *.ipynb_checkpoints 10 | *.pickle 11 | *.npy 12 | *.blg 13 | *.bbl 14 | *.bcf 15 | *.toc 16 | *.wav 17 | *.sh 18 | encoder/saved_models/* 19 | synthesizer/saved_models/* 20 | vocoder/saved_models/* 21 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Modified & original work Copyright (c) 2019 Corentin Jemine (https://github.com/CorentinJ) 4 | Original work Copyright (c) 2018 Rayhane Mama (https://github.com/Rayhane-mamah) 5 | Original work Copyright (c) 2019 fatchord (https://github.com/fatchord) 6 | Original work Copyright (c) 2015 braindead (https://github.com/braindead) 7 | 8 | Permission is hereby granted, free of charge, to any person obtaining a copy 9 | of this software and associated documentation files (the "Software"), to deal 10 | in the Software without restriction, including without limitation the rights 11 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | copies of the Software, and to permit persons to whom the Software is 13 | furnished to do so, subject to the following conditions: 14 | 15 | The above copyright notice and this permission notice shall be included in all 16 | copies or substantial portions of the Software. 17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | SOFTWARE. 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Real-Time Voice Cloning 2 | This repository is an implementation of [Transfer Learning from Speaker Verification to 3 | Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf) (SV2TTS) with a vocoder that works in real-time. Feel free to check [my thesis](https://matheo.uliege.be/handle/2268.2/6801) if you're curious or if you're looking for info I haven't documented yet (don't hesitate to make an issue for that too). Mostly I would recommend giving a quick look to the figures beyond the introduction. 4 | 5 | SV2TTS is a three-stage deep learning framework that allows to create a numerical representation of a voice from a few seconds of audio, and to use it to condition a text-to-speech model trained to generalize to new voices. 
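The pieces fit together roughly as sketched below — a minimal, non-authoritative condensation of `demo_cli.py` (included further down in this dump), not a separate API. It assumes the pretrained models sit at that script's default paths; `reference.wav`, the example sentence and `cloned.wav` are placeholders.

```python
# Condensed sketch of the SV2TTS pipeline, following the calls made in demo_cli.py.
# Assumptions: pretrained models at demo_cli.py's default paths; "reference.wav" is a
# placeholder for any few-second recording of the target voice.
from pathlib import Path
import numpy as np
import librosa

from encoder import inference as encoder
from synthesizer.inference import Synthesizer
from vocoder import inference as vocoder

encoder.load_model(Path("encoder/saved_models/pretrained.pt"))
synthesizer = Synthesizer(Path("synthesizer/saved_models/logs-pretrained/taco_pretrained"),
                          low_mem=False)
vocoder.load_model(Path("vocoder/saved_models/pretrained/pretrained.pt"))

# Stage 1: speaker encoder - a few seconds of audio become a fixed-size speaker embedding
wav = encoder.preprocess_wav(Path("reference.wav"))
embed = encoder.embed_utterance(wav)

# Stage 2: synthesizer - text conditioned on the embedding becomes a mel spectrogram
specs = synthesizer.synthesize_spectrograms(["This is a test sentence."], [embed])

# Stage 3: vocoder - the mel spectrogram becomes a waveform in the cloned voice
generated_wav = vocoder.infer_waveform(specs[0])
librosa.output.write_wav("cloned.wav", generated_wav.astype(np.float32),
                         synthesizer.sample_rate)
```

`demo_cli.py` wraps these same calls with argument parsing, a CUDA check, small self-tests and audio playback.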
6 | 7 | **Video demonstration** (click the picture): 8 | 9 | [![Toolbox demo](https://i.imgur.com/8lFUlgz.png)](https://www.youtube.com/watch?v=-O_hYhToKoA) 10 | 11 | 12 | 13 | ### Papers implemented 14 | | URL | Designation | Title | Implementation source | 15 | | --- | ----------- | ----- | --------------------- | 16 | |[**1806.04558**](https://arxiv.org/pdf/1806.04558.pdf) | **SV2TTS** | **Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis** | This repo | 17 | |[1802.08435](https://arxiv.org/pdf/1802.08435.pdf) | WaveRNN (vocoder) | Efficient Neural Audio Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN) | 18 | |[1712.05884](https://arxiv.org/pdf/1712.05884.pdf) | Tacotron 2 (synthesizer) | Natural TTS Synthesis by Conditioning Wavenet on Mel Spectrogram Predictions | [Rayhane-mamah/Tacotron-2](https://github.com/Rayhane-mamah/Tacotron-2) 19 | |[1710.10467](https://arxiv.org/pdf/1710.10467.pdf) | GE2E (encoder)| Generalized End-To-End Loss for Speaker Verification | This repo | 20 | 21 | ## News 22 | **13/11/19**: I'm sorry that I can't maintain this repo as much as I wish I could. I'm working full time on improving voice cloning techniques and I don't have the time to share my improvements here. Plus this repo relies on a lot of old tensorflow code and it's hard to work with. If you're a researcher, then this repo might be of use to you. **If you just want to clone your voice**, do check our demo on [Resemble.AI](https://www.resemble.ai/) - it will give much better results than this repo and will not require a complex setup. 23 | 24 | **20/08/19:** I'm working on [resemblyzer](https://github.com/resemble-ai/Resemblyzer), an independent package for the voice encoder. You can use your trained encoder models from this repo with it. 25 | 26 | **06/07/19:** Need to run within a docker container on a remote server? See [here](https://sean.lane.sh/posts/2019/07/Running-the-Real-Time-Voice-Cloning-project-in-Docker/). 27 | 28 | **25/06/19:** Experimental support for low-memory GPUs (~2gb) added for the synthesizer. Pass `--low_mem` to `demo_cli.py` or `demo_toolbox.py` to enable it. It adds a big overhead, so it's not recommended if you have enough VRAM. 29 | 30 | 31 | ## Quick start 32 | ### Requirements 33 | You will need the following whether you plan to use the toolbox only or to retrain the models. 34 | 35 | **Python 3.7**. Python 3.6 might work too, but I wouldn't go lower because I make extensive use of pathlib. 36 | 37 | Run `pip install -r requirements.txt` to install the necessary packages. Additionally you will need [PyTorch](https://pytorch.org/get-started/locally/) (>=1.0.1). 38 | 39 | A GPU is mandatory, but you don't necessarily need a high tier GPU if you only want to use the toolbox. 40 | 41 | ### Pretrained models 42 | Download the latest [here](https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Pretrained-models). 43 | 44 | ### Preliminary 45 | Before you download any dataset, you can begin by testing your configuration with: 46 | 47 | `python demo_cli.py` 48 | 49 | If all tests pass, you're good to go. 50 | 51 | ### Datasets 52 | For playing with the toolbox alone, I only recommend downloading [`LibriSpeech/train-clean-100`](http://www.openslr.org/resources/12/train-clean-100.tar.gz). Extract the contents as `/LibriSpeech/train-clean-100` where `` is a directory of your choosing. Other datasets are supported in the toolbox, see [here](https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Training#datasets). 
You're free not to download any dataset, but then you will need your own data as audio files or you will have to record it with the toolbox. 53 | 54 | ### Toolbox 55 | You can then try the toolbox: 56 | 57 | `python demo_toolbox.py -d ` 58 | or 59 | `python demo_toolbox.py` 60 | 61 | depending on whether you downloaded any datasets. If you are running an X-server or if you have the error `Aborted (core dumped)`, see [this issue](https://github.com/CorentinJ/Real-Time-Voice-Cloning/issues/11#issuecomment-504733590). 62 | 63 | ## Contributions & Issues 64 | I'm working full-time as of June 2019. I don't have time to maintain this repo nor reply to issues. Sorry. 65 | -------------------------------------------------------------------------------- /demo_cli.py: -------------------------------------------------------------------------------- 1 | from encoder.params_model import model_embedding_size as speaker_embedding_size 2 | from utils.argutils import print_args 3 | from synthesizer.inference import Synthesizer 4 | from encoder import inference as encoder 5 | from vocoder import inference as vocoder 6 | from pathlib import Path 7 | import numpy as np 8 | import librosa 9 | import argparse 10 | import torch 11 | import sys 12 | 13 | 14 | if __name__ == '__main__': 15 | ## Info & args 16 | parser = argparse.ArgumentParser( 17 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 18 | ) 19 | parser.add_argument("-e", "--enc_model_fpath", type=Path, 20 | default="encoder/saved_models/pretrained.pt", 21 | help="Path to a saved encoder") 22 | parser.add_argument("-s", "--syn_model_dir", type=Path, 23 | default="synthesizer/saved_models/logs-pretrained/", 24 | help="Directory containing the synthesizer model") 25 | parser.add_argument("-v", "--voc_model_fpath", type=Path, 26 | default="vocoder/saved_models/pretrained/pretrained.pt", 27 | help="Path to a saved vocoder") 28 | parser.add_argument("--low_mem", action="store_true", help=\ 29 | "If True, the memory used by the synthesizer will be freed after each use. Adds large " 30 | "overhead but allows to save some GPU memory for lower-end GPUs.") 31 | parser.add_argument("--no_sound", action="store_true", help=\ 32 | "If True, audio won't be played.") 33 | args = parser.parse_args() 34 | print_args(args, parser) 35 | if not args.no_sound: 36 | import sounddevice as sd 37 | 38 | 39 | ## Print some environment information (for debugging purposes) 40 | print("Running a test of your configuration...\n") 41 | if not torch.cuda.is_available(): 42 | print("Your PyTorch installation is not configured to use CUDA. If you have a GPU ready " 43 | "for deep learning, ensure that the drivers are properly installed, and that your " 44 | "CUDA version matches your PyTorch installation. CPU-only inference is currently " 45 | "not supported.", file=sys.stderr) 46 | quit(-1) 47 | device_id = torch.cuda.current_device() 48 | gpu_properties = torch.cuda.get_device_properties(device_id) 49 | print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with " 50 | "%.1fGb total memory.\n" % 51 | (torch.cuda.device_count(), 52 | device_id, 53 | gpu_properties.name, 54 | gpu_properties.major, 55 | gpu_properties.minor, 56 | gpu_properties.total_memory / 1e9)) 57 | 58 | 59 | ## Load the models one by one. 
60 | print("Preparing the encoder, the synthesizer and the vocoder...") 61 | encoder.load_model(args.enc_model_fpath) 62 | synthesizer = Synthesizer(args.syn_model_dir.joinpath("taco_pretrained"), low_mem=args.low_mem) 63 | vocoder.load_model(args.voc_model_fpath) 64 | 65 | 66 | ## Run a test 67 | print("Testing your configuration with small inputs.") 68 | # Forward an audio waveform of zeroes that lasts 1 second. Notice how we can get the encoder's 69 | # sampling rate, which may differ. 70 | # If you're unfamiliar with digital audio, know that it is encoded as an array of floats 71 | # (or sometimes integers, but mostly floats in this projects) ranging from -1 to 1. 72 | # The sampling rate is the number of values (samples) recorded per second, it is set to 73 | # 16000 for the encoder. Creating an array of length will always correspond 74 | # to an audio of 1 second. 75 | print("\tTesting the encoder...") 76 | encoder.embed_utterance(np.zeros(encoder.sampling_rate)) 77 | 78 | # Create a dummy embedding. You would normally use the embedding that encoder.embed_utterance 79 | # returns, but here we're going to make one ourselves just for the sake of showing that it's 80 | # possible. 81 | embed = np.random.rand(speaker_embedding_size) 82 | # Embeddings are L2-normalized (this isn't important here, but if you want to make your own 83 | # embeddings it will be). 84 | embed /= np.linalg.norm(embed) 85 | # The synthesizer can handle multiple inputs with batching. Let's create another embedding to 86 | # illustrate that 87 | embeds = [embed, np.zeros(speaker_embedding_size)] 88 | texts = ["test 1", "test 2"] 89 | print("\tTesting the synthesizer... (loading the model will output a lot of text)") 90 | mels = synthesizer.synthesize_spectrograms(texts, embeds) 91 | 92 | # The vocoder synthesizes one waveform at a time, but it's more efficient for long ones. We 93 | # can concatenate the mel spectrograms to a single one. 94 | mel = np.concatenate(mels, axis=1) 95 | # The vocoder can take a callback function to display the generation. More on that later. For 96 | # now we'll simply hide it like this: 97 | no_action = lambda *args: None 98 | print("\tTesting the vocoder...") 99 | # For the sake of making this test short, we'll pass a short target length. The target length 100 | # is the length of the wav segments that are processed in parallel. E.g. for audio sampled 101 | # at 16000 Hertz, a target length of 8000 means that the target audio will be cut in chunks of 102 | # 0.5 seconds which will all be generated together. The parameters here are absurdly short, and 103 | # that has a detrimental effect on the quality of the audio. The default parameters are 104 | # recommended in general. 105 | vocoder.infer_waveform(mel, target=200, overlap=50, progress_callback=no_action) 106 | 107 | print("All test passed! You can now synthesize speech.\n\n") 108 | 109 | 110 | ## Interactive speech generation 111 | print("This is a GUI-less example of interface to SV2TTS. The purpose of this script is to " 112 | "show how you can interface this project easily with your own. 
See the source code for " 113 | "an explanation of what is happening.\n") 114 | 115 | print("Interactive generation loop") 116 | num_generated = 0 117 | while True: 118 | try: 119 | # Get the reference audio filepath 120 | message = "Reference voice: enter an audio filepath of a voice to be cloned (mp3, " \ 121 | "wav, m4a, flac, ...):\n" 122 | in_fpath = Path(input(message).replace("\"", "").replace("\'", "")) 123 | 124 | 125 | ## Computing the embedding 126 | # First, we load the wav using the function that the speaker encoder provides. This is 127 | # important: there is preprocessing that must be applied. 128 | 129 | # The following two methods are equivalent: 130 | # - Directly load from the filepath: 131 | preprocessed_wav = encoder.preprocess_wav(in_fpath) 132 | # - If the wav is already loaded: 133 | original_wav, sampling_rate = librosa.load(in_fpath) 134 | preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate) 135 | print("Loaded file succesfully") 136 | 137 | # Then we derive the embedding. There are many functions and parameters that the 138 | # speaker encoder interfaces. These are mostly for in-depth research. You will typically 139 | # only use this function (with its default parameters): 140 | embed = encoder.embed_utterance(preprocessed_wav) 141 | print("Created the embedding") 142 | 143 | 144 | ## Generating the spectrogram 145 | text = input("Write a sentence (+-20 words) to be synthesized:\n") 146 | 147 | # The synthesizer works in batch, so you need to put your data in a list or numpy array 148 | texts = [text] 149 | embeds = [embed] 150 | # If you know what the attention layer alignments are, you can retrieve them here by 151 | # passing return_alignments=True 152 | specs = synthesizer.synthesize_spectrograms(texts, embeds) 153 | spec = specs[0] 154 | print("Created the mel spectrogram") 155 | 156 | 157 | ## Generating the waveform 158 | print("Synthesizing the waveform:") 159 | # Synthesizing the waveform is fairly straightforward. Remember that the longer the 160 | # spectrogram, the more time-efficient the vocoder. 161 | generated_wav = vocoder.infer_waveform(spec) 162 | 163 | 164 | ## Post-generation 165 | # There's a bug with sounddevice that makes the audio cut one second earlier, so we 166 | # pad it. 167 | generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant") 168 | 169 | # Play the audio (non-blocking) 170 | if not args.no_sound: 171 | sd.stop() 172 | sd.play(generated_wav, synthesizer.sample_rate) 173 | 174 | # Save it on the disk 175 | fpath = "demo_output_%02d.wav" % num_generated 176 | print(generated_wav.dtype) 177 | librosa.output.write_wav(fpath, generated_wav.astype(np.float32), 178 | synthesizer.sample_rate) 179 | num_generated += 1 180 | print("\nSaved output as %s\n\n" % fpath) 181 | 182 | 183 | except Exception as e: 184 | print("Caught exception: %s" % repr(e)) 185 | print("Restarting\n") 186 | -------------------------------------------------------------------------------- /demo_toolbox.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from toolbox import Toolbox 3 | from utils.argutils import print_args 4 | import argparse 5 | 6 | 7 | if __name__ == '__main__': 8 | parser = argparse.ArgumentParser( 9 | description="Runs the toolbox", 10 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 11 | ) 12 | 13 | parser.add_argument("-d", "--datasets_root", type=Path, help= \ 14 | "Path to the directory containing your datasets. 
See toolbox/__init__.py for a list of " 15 | "supported datasets. You can add your own data by created a directory named UserAudio " 16 | "in your datasets root. Supported formats are mp3, flac, wav and m4a. Each speaker should " 17 | "be inside a directory, e.g. /UserAudio/speaker_01/audio_01.wav.", 18 | default=None) 19 | parser.add_argument("-e", "--enc_models_dir", type=Path, default="encoder/saved_models", 20 | help="Directory containing saved encoder models") 21 | parser.add_argument("-s", "--syn_models_dir", type=Path, default="synthesizer/saved_models", 22 | help="Directory containing saved synthesizer models") 23 | parser.add_argument("-v", "--voc_models_dir", type=Path, default="vocoder/saved_models", 24 | help="Directory containing saved vocoder models") 25 | parser.add_argument("--low_mem", action="store_true", help=\ 26 | "If True, the memory used by the synthesizer will be freed after each use. Adds large " 27 | "overhead but allows to save some GPU memory for lower-end GPUs.") 28 | args = parser.parse_args() 29 | 30 | # Launch the toolbox 31 | print_args(args, parser) 32 | Toolbox(**vars(args)) 33 | -------------------------------------------------------------------------------- /encoder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iwater/Real-Time-Voice-Cloning-Chinese/06882b9a83247beda1d4d84baca0400457096d1b/encoder/__init__.py -------------------------------------------------------------------------------- /encoder/audio.py: -------------------------------------------------------------------------------- 1 | from scipy.ndimage.morphology import binary_dilation 2 | from encoder.params_data import * 3 | from pathlib import Path 4 | from typing import Optional, Union 5 | import numpy as np 6 | import webrtcvad 7 | import librosa 8 | import struct 9 | 10 | int16_max = (2 ** 15) - 1 11 | 12 | 13 | def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray], 14 | source_sr: Optional[int] = None): 15 | """ 16 | Applies the preprocessing operations used in training the Speaker Encoder to a waveform 17 | either on disk or in memory. The waveform will be resampled to match the data hyperparameters. 18 | 19 | :param fpath_or_wav: either a filepath to an audio file (many extensions are supported, not 20 | just .wav), either the waveform as a numpy array of floats. 21 | :param source_sr: if passing an audio waveform, the sampling rate of the waveform before 22 | preprocessing. After preprocessing, the waveform's sampling rate will match the data 23 | hyperparameters. If passing a filepath, the sampling rate will be automatically detected and 24 | this argument will be ignored. 25 | """ 26 | # Load the wav from disk if needed 27 | if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path): 28 | wav, source_sr = librosa.load(fpath_or_wav, sr=None) 29 | else: 30 | wav = fpath_or_wav 31 | 32 | # Resample the wav if needed 33 | if source_sr is not None and source_sr != sampling_rate: 34 | wav = librosa.resample(wav, source_sr, sampling_rate) 35 | 36 | # Apply the preprocessing: normalize volume and shorten long silences 37 | wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True) 38 | wav = trim_long_silences(wav) 39 | 40 | return wav 41 | 42 | 43 | def wav_to_mel_spectrogram(wav): 44 | """ 45 | Derives a mel spectrogram ready to be used by the encoder from a preprocessed audio waveform. 46 | Note: this not a log-mel spectrogram. 
47 | """ 48 | frames = librosa.feature.melspectrogram( 49 | wav, 50 | sampling_rate, 51 | n_fft=int(sampling_rate * mel_window_length / 1000), 52 | hop_length=int(sampling_rate * mel_window_step / 1000), 53 | n_mels=mel_n_channels 54 | ) 55 | return frames.astype(np.float32).T 56 | 57 | 58 | def trim_long_silences(wav): 59 | """ 60 | Ensures that segments without voice in the waveform remain no longer than a 61 | threshold determined by the VAD parameters in params.py. 62 | 63 | :param wav: the raw waveform as a numpy array of floats 64 | :return: the same waveform with silences trimmed away (length <= original wav length) 65 | """ 66 | # Compute the voice detection window size 67 | samples_per_window = (vad_window_length * sampling_rate) // 1000 68 | 69 | # Trim the end of the audio to have a multiple of the window size 70 | wav = wav[:len(wav) - (len(wav) % samples_per_window)] 71 | 72 | # Convert the float waveform to 16-bit mono PCM 73 | pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16)) 74 | 75 | # Perform voice activation detection 76 | voice_flags = [] 77 | vad = webrtcvad.Vad(mode=3) 78 | for window_start in range(0, len(wav), samples_per_window): 79 | window_end = window_start + samples_per_window 80 | voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2], 81 | sample_rate=sampling_rate)) 82 | voice_flags = np.array(voice_flags) 83 | 84 | # Smooth the voice detection with a moving average 85 | def moving_average(array, width): 86 | array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2))) 87 | ret = np.cumsum(array_padded, dtype=float) 88 | ret[width:] = ret[width:] - ret[:-width] 89 | return ret[width - 1:] / width 90 | 91 | audio_mask = moving_average(voice_flags, vad_moving_average_width) 92 | audio_mask = np.round(audio_mask).astype(np.bool) 93 | 94 | # Dilate the voiced regions 95 | audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1)) 96 | audio_mask = np.repeat(audio_mask, samples_per_window) 97 | 98 | return wav[audio_mask == True] 99 | 100 | 101 | def normalize_volume(wav, target_dBFS, increase_only=False, decrease_only=False): 102 | if increase_only and decrease_only: 103 | raise ValueError("Both increase only and decrease only are set") 104 | dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav ** 2)) 105 | if (dBFS_change < 0 and increase_only) or (dBFS_change > 0 and decrease_only): 106 | return wav 107 | return wav * (10 ** (dBFS_change / 20)) 108 | -------------------------------------------------------------------------------- /encoder/config.py: -------------------------------------------------------------------------------- 1 | librispeech_datasets = { 2 | "train": { 3 | "clean": ["LibriSpeech/train-clean-100", "LibriSpeech/train-clean-360"], 4 | "other": ["LibriSpeech/train-other-500"] 5 | }, 6 | "test": { 7 | "clean": ["LibriSpeech/test-clean"], 8 | "other": ["LibriSpeech/test-other"] 9 | }, 10 | "dev": { 11 | "clean": ["LibriSpeech/dev-clean"], 12 | "other": ["LibriSpeech/dev-other"] 13 | }, 14 | } 15 | libritts_datasets = { 16 | "train": { 17 | "clean": ["LibriTTS/train-clean-100", "LibriTTS/train-clean-360"], 18 | "other": ["LibriTTS/train-other-500"] 19 | }, 20 | "test": { 21 | "clean": ["LibriTTS/test-clean"], 22 | "other": ["LibriTTS/test-other"] 23 | }, 24 | "dev": { 25 | "clean": ["LibriTTS/dev-clean"], 26 | "other": ["LibriTTS/dev-other"] 27 | }, 28 | } 29 | voxceleb_datasets = { 30 | "voxceleb1" : { 31 | "train": ["VoxCeleb1/wav"], 32 | 
"test": ["VoxCeleb1/test_wav"] 33 | }, 34 | "voxceleb2" : { 35 | "train": ["VoxCeleb2/dev/aac"], 36 | "test": ["VoxCeleb2/test_wav"] 37 | } 38 | } 39 | aishell1_datasets = { 40 | "train": ["data_aishell/wav/train"], 41 | "dev": ["data_aishell/wav/dev"], 42 | "test": ["data_aishell/wav/test"], 43 | } 44 | 45 | magicdata_datasets = { 46 | "train": ["MagicData/train"], 47 | "dev": ["MagicData/dev"], 48 | "test": ["MagicData/test"], 49 | } 50 | 51 | aidatatang_datasets = { 52 | "train": ["aidatatang_200zh/corpus/train"], 53 | "dev": ["aidatatang_200zh/corpus/dev"], 54 | "test": ["aidatatang_200zh/corpus/test"], 55 | } 56 | 57 | thchs30_datasets = { 58 | "train": ["data_thchs30/train"], 59 | "dev": ["data_thchs30/dev"], 60 | "test": ["data_thchs30/test"], 61 | } 62 | 63 | mozilla_datasets = { 64 | "train": "Mozilla/train.tsv", 65 | "dev": "Mozilla/test.tsv", 66 | "test": "Mozilla/train.tsv", 67 | "validated": "Mozilla/validated.tsv", 68 | } 69 | 70 | stcmds_datasets = "ST-CMDS-20170001_1-OS" 71 | 72 | primewords_datasets = "primewords_md_2018_set1" 73 | 74 | other_datasets = [ 75 | "LJSpeech-1.1", 76 | "VCTK-Corpus/wav48", 77 | ] 78 | 79 | anglophone_nationalites = ["australia", "canada", "ireland", "uk", "usa"] 80 | -------------------------------------------------------------------------------- /encoder/data_objects/__init__.py: -------------------------------------------------------------------------------- 1 | from encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset 2 | from encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataLoader 3 | -------------------------------------------------------------------------------- /encoder/data_objects/random_cycler.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | class RandomCycler: 4 | """ 5 | Creates an internal copy of a sequence and allows access to its items in a constrained random 6 | order. For a source sequence of n items and one or several consecutive queries of a total 7 | of m items, the following guarantees hold (one implies the other): 8 | - Each item will be returned between m // n and ((m - 1) // n) + 1 times. 9 | - Between two appearances of the same item, there may be at most 2 * (n - 1) other items. 
10 | """ 11 | 12 | def __init__(self, source): 13 | if len(source) == 0: 14 | raise Exception("Can't create RandomCycler from an empty collection") 15 | self.all_items = list(source) 16 | self.next_items = [] 17 | 18 | def sample(self, count: int): 19 | shuffle = lambda l: random.sample(l, len(l)) 20 | 21 | out = [] 22 | while count > 0: 23 | if count >= len(self.all_items): 24 | out.extend(shuffle(list(self.all_items))) 25 | count -= len(self.all_items) 26 | continue 27 | n = min(count, len(self.next_items)) 28 | out.extend(self.next_items[:n]) 29 | count -= n 30 | self.next_items = self.next_items[n:] 31 | if len(self.next_items) == 0: 32 | self.next_items = shuffle(list(self.all_items)) 33 | return out 34 | 35 | def __next__(self): 36 | return self.sample(1)[0] 37 | 38 | -------------------------------------------------------------------------------- /encoder/data_objects/speaker.py: -------------------------------------------------------------------------------- 1 | from encoder.data_objects.random_cycler import RandomCycler 2 | from encoder.data_objects.utterance import Utterance 3 | from pathlib import Path 4 | 5 | # Contains the set of utterances of a single speaker 6 | class Speaker: 7 | def __init__(self, root: Path): 8 | self.root = root 9 | self.name = root.name 10 | self.utterances = None 11 | self.utterance_cycler = None 12 | 13 | def _load_utterances(self): 14 | with self.root.joinpath("_sources.txt").open("r") as sources_file: 15 | sources = [l.split(",") for l in sources_file] 16 | sources = {frames_fname: wave_fpath for frames_fname, wave_fpath in sources} 17 | self.utterances = [Utterance(self.root.joinpath(f), w) for f, w in sources.items()] 18 | self.utterance_cycler = RandomCycler(self.utterances) 19 | 20 | def random_partial(self, count, n_frames): 21 | """ 22 | Samples a batch of unique partial utterances from the disk in a way that all 23 | utterances come up at least once every two cycles and in a random order every time. 24 | 25 | :param count: The number of partial utterances to sample from the set of utterances from 26 | that speaker. Utterances are guaranteed not to be repeated if is not larger than 27 | the number of utterances available. 28 | :param n_frames: The number of frames in the partial utterance. 29 | :return: A list of tuples (utterance, frames, range) where utterance is an Utterance, 30 | frames are the frames of the partial utterances and range is the range of the partial 31 | utterance with regard to the complete utterance. 32 | """ 33 | if self.utterances is None: 34 | self._load_utterances() 35 | 36 | utterances = self.utterance_cycler.sample(count) 37 | 38 | a = [(u,) + u.random_partial(n_frames) for u in utterances] 39 | 40 | return a 41 | -------------------------------------------------------------------------------- /encoder/data_objects/speaker_batch.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from typing import List 3 | from encoder.data_objects.speaker import Speaker 4 | 5 | class SpeakerBatch: 6 | def __init__(self, speakers: List[Speaker], utterances_per_speaker: int, n_frames: int): 7 | self.speakers = speakers 8 | self.partials = {s: s.random_partial(utterances_per_speaker, n_frames) for s in speakers} 9 | 10 | # Array of shape (n_speakers * n_utterances, n_frames, mel_n), e.g. 
for 3 speakers with 11 | # 4 utterances each of 160 frames of 40 mel coefficients: (12, 160, 40) 12 | self.data = np.array([frames for s in speakers for _, frames, _ in self.partials[s]]) 13 | -------------------------------------------------------------------------------- /encoder/data_objects/speaker_verification_dataset.py: -------------------------------------------------------------------------------- 1 | from encoder.data_objects.random_cycler import RandomCycler 2 | from encoder.data_objects.speaker_batch import SpeakerBatch 3 | from encoder.data_objects.speaker import Speaker 4 | from encoder.params_data import partials_n_frames 5 | from torch.utils.data import Dataset, DataLoader 6 | from pathlib import Path 7 | 8 | # TODO: improve with a pool of speakers for data efficiency 9 | 10 | class SpeakerVerificationDataset(Dataset): 11 | def __init__(self, datasets_root: Path): 12 | self.root = datasets_root 13 | speaker_dirs = [f for f in self.root.glob("*") if f.is_dir()] 14 | if len(speaker_dirs) == 0: 15 | raise Exception("No speakers found. Make sure you are pointing to the directory " 16 | "containing all preprocessed speaker directories.") 17 | self.speakers = [Speaker(speaker_dir) for speaker_dir in speaker_dirs] 18 | self.speaker_cycler = RandomCycler(self.speakers) 19 | 20 | def __len__(self): 21 | return int(1e10) 22 | 23 | def __getitem__(self, index): 24 | return next(self.speaker_cycler) 25 | 26 | def get_logs(self): 27 | log_string = "" 28 | for log_fpath in self.root.glob("*.txt"): 29 | with log_fpath.open("r") as log_file: 30 | log_string += "".join(log_file.readlines()) 31 | return log_string 32 | 33 | 34 | class SpeakerVerificationDataLoader(DataLoader): 35 | def __init__(self, dataset, speakers_per_batch, utterances_per_speaker, sampler=None, 36 | batch_sampler=None, num_workers=0, pin_memory=False, timeout=0, 37 | worker_init_fn=None): 38 | self.utterances_per_speaker = utterances_per_speaker 39 | 40 | super().__init__( 41 | dataset=dataset, 42 | batch_size=speakers_per_batch, 43 | shuffle=False, 44 | sampler=sampler, 45 | batch_sampler=batch_sampler, 46 | num_workers=num_workers, 47 | collate_fn=self.collate, 48 | pin_memory=pin_memory, 49 | drop_last=False, 50 | timeout=timeout, 51 | worker_init_fn=worker_init_fn 52 | ) 53 | 54 | def collate(self, speakers): 55 | return SpeakerBatch(speakers, self.utterances_per_speaker, partials_n_frames) 56 | -------------------------------------------------------------------------------- /encoder/data_objects/utterance.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class Utterance: 5 | def __init__(self, frames_fpath, wave_fpath): 6 | self.frames_fpath = frames_fpath 7 | self.wave_fpath = wave_fpath 8 | 9 | def get_frames(self): 10 | return np.load(self.frames_fpath) 11 | 12 | def random_partial(self, n_frames): 13 | """ 14 | Crops the frames into a partial utterance of n_frames 15 | 16 | :param n_frames: The number of frames of the partial utterance 17 | :return: the partial utterance frames and a tuple indicating the start and end of the 18 | partial utterance in the complete utterance. 
19 | """ 20 | frames = self.get_frames() 21 | if frames.shape[0] == n_frames: 22 | start = 0 23 | else: 24 | start = np.random.randint(0, frames.shape[0] - n_frames) 25 | end = start + n_frames 26 | return frames[start:end], (start, end) -------------------------------------------------------------------------------- /encoder/inference.py: -------------------------------------------------------------------------------- 1 | from encoder.params_data import * 2 | from encoder.model import SpeakerEncoder 3 | from encoder.audio import preprocess_wav # We want to expose this function from here 4 | from matplotlib import cm 5 | from encoder import audio 6 | from pathlib import Path 7 | import matplotlib.pyplot as plt 8 | import numpy as np 9 | import torch 10 | 11 | _model = None # type: SpeakerEncoder 12 | _device = None # type: torch.device 13 | 14 | 15 | def load_model(weights_fpath: Path, device=None): 16 | """ 17 | Loads the model in memory. If this function is not explicitely called, it will be run on the 18 | first call to embed_frames() with the default weights file. 19 | 20 | :param weights_fpath: the path to saved model weights. 21 | :param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda"). The 22 | model will be loaded and will run on this device. Outputs will however always be on the cpu. 23 | If None, will default to your GPU if it"s available, otherwise your CPU. 24 | """ 25 | # TODO: I think the slow loading of the encoder might have something to do with the device it 26 | # was saved on. Worth investigating. 27 | global _model, _device 28 | if device is None: 29 | _device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 30 | elif isinstance(device, str): 31 | _device = torch.device(device) 32 | _model = SpeakerEncoder(_device, torch.device("cpu")) 33 | checkpoint = torch.load(weights_fpath) 34 | _model.load_state_dict(checkpoint["model_state"]) 35 | _model.eval() 36 | print("Loaded encoder \"%s\" trained to step %d" % (weights_fpath.name, checkpoint["step"])) 37 | 38 | 39 | def is_loaded(): 40 | return _model is not None 41 | 42 | 43 | def embed_frames_batch(frames_batch): 44 | """ 45 | Computes embeddings for a batch of mel spectrogram. 46 | 47 | :param frames_batch: a batch mel of spectrogram as a numpy array of float32 of shape 48 | (batch_size, n_frames, n_channels) 49 | :return: the embeddings as a numpy array of float32 of shape (batch_size, model_embedding_size) 50 | """ 51 | if _model is None: 52 | raise Exception("Model was not loaded. Call load_model() before inference.") 53 | 54 | frames = torch.from_numpy(frames_batch).to(_device) 55 | embed = _model.forward(frames).detach().cpu().numpy() 56 | return embed 57 | 58 | 59 | def compute_partial_slices(n_samples, partial_utterance_n_frames=partials_n_frames, 60 | min_pad_coverage=0.75, overlap=0.5): 61 | """ 62 | Computes where to split an utterance waveform and its corresponding mel spectrogram to obtain 63 | partial utterances of each. Both the waveform and the mel 64 | spectrogram slices are returned, so as to make each partial utterance waveform correspond to 65 | its spectrogram. This function assumes that the mel spectrogram parameters used are those 66 | defined in params_data.py. 67 | 68 | The returned ranges may be indexing further than the length of the waveform. It is 69 | recommended that you pad the waveform with zeros up to wave_slices[-1].stop. 
70 | 71 | :param n_samples: the number of samples in the waveform 72 | :param partial_utterance_n_frames: the number of mel spectrogram frames in each partial 73 | utterance 74 | :param min_pad_coverage: when reaching the last partial utterance, it may or may not have 75 | enough frames. If at least of are present, 76 | then the last partial utterance will be considered, as if we padded the audio. Otherwise, 77 | it will be discarded, as if we trimmed the audio. If there aren't enough frames for 1 partial 78 | utterance, this parameter is ignored so that the function always returns at least 1 slice. 79 | :param overlap: by how much the partial utterance should overlap. If set to 0, the partial 80 | utterances are entirely disjoint. 81 | :return: the waveform slices and mel spectrogram slices as lists of array slices. Index 82 | respectively the waveform and the mel spectrogram with these slices to obtain the partial 83 | utterances. 84 | """ 85 | assert 0 <= overlap < 1 86 | assert 0 < min_pad_coverage <= 1 87 | 88 | samples_per_frame = int((sampling_rate * mel_window_step / 1000)) 89 | n_frames = int(np.ceil((n_samples + 1) / samples_per_frame)) 90 | frame_step = max(int(np.round(partial_utterance_n_frames * (1 - overlap))), 1) 91 | 92 | # Compute the slices 93 | wav_slices, mel_slices = [], [] 94 | steps = max(1, n_frames - partial_utterance_n_frames + frame_step + 1) 95 | for i in range(0, steps, frame_step): 96 | mel_range = np.array([i, i + partial_utterance_n_frames]) 97 | wav_range = mel_range * samples_per_frame 98 | mel_slices.append(slice(*mel_range)) 99 | wav_slices.append(slice(*wav_range)) 100 | 101 | # Evaluate whether extra padding is warranted or not 102 | last_wav_range = wav_slices[-1] 103 | coverage = (n_samples - last_wav_range.start) / (last_wav_range.stop - last_wav_range.start) 104 | if coverage < min_pad_coverage and len(mel_slices) > 1: 105 | mel_slices = mel_slices[:-1] 106 | wav_slices = wav_slices[:-1] 107 | 108 | return wav_slices, mel_slices 109 | 110 | 111 | def embed_utterance(wav, using_partials=True, return_partials=False, **kwargs): 112 | """ 113 | Computes an embedding for a single utterance. 114 | 115 | # TODO: handle multiple wavs to benefit from batching on GPU 116 | :param wav: a preprocessed (see audio.py) utterance waveform as a numpy array of float32 117 | :param using_partials: if True, then the utterance is split in partial utterances of 118 | frames and the utterance embedding is computed from their 119 | normalized average. If False, the utterance is instead computed from feeding the entire 120 | spectogram to the network. 121 | :param return_partials: if True, the partial embeddings will also be returned along with the 122 | wav slices that correspond to the partial embeddings. 123 | :param kwargs: additional arguments to compute_partial_splits() 124 | :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If 125 | is True, the partial utterances as a numpy array of float32 of shape 126 | (n_partials, model_embedding_size) and the wav partials as a list of slices will also be 127 | returned. If is simultaneously set to False, both these values will be None 128 | instead. 
129 | """ 130 | # Process the entire utterance if not using partials 131 | if not using_partials: 132 | frames = audio.wav_to_mel_spectrogram(wav) 133 | embed = embed_frames_batch(frames[None, ...])[0] 134 | if return_partials: 135 | return embed, None, None 136 | return embed 137 | 138 | # Compute where to split the utterance into partials and pad if necessary 139 | wave_slices, mel_slices = compute_partial_slices(len(wav), **kwargs) 140 | max_wave_length = wave_slices[-1].stop 141 | if max_wave_length >= len(wav): 142 | wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant") 143 | 144 | # Split the utterance into partials 145 | frames = audio.wav_to_mel_spectrogram(wav) 146 | frames_batch = np.array([frames[s] for s in mel_slices]) 147 | partial_embeds = embed_frames_batch(frames_batch) 148 | 149 | # Compute the utterance embedding from the partial embeddings 150 | raw_embed = np.mean(partial_embeds, axis=0) 151 | embed = raw_embed / np.linalg.norm(raw_embed, 2) 152 | 153 | if return_partials: 154 | return embed, partial_embeds, wave_slices 155 | return embed 156 | 157 | 158 | def embed_speaker(wavs, **kwargs): 159 | raise NotImplemented() 160 | 161 | 162 | def plot_embedding_as_heatmap(embed, ax=None, title="", shape=None, color_range=(0, 0.30)): 163 | if ax is None: 164 | ax = plt.gca() 165 | 166 | if shape is None: 167 | height = int(np.sqrt(len(embed))) 168 | shape = (height, -1) 169 | embed = embed.reshape(shape) 170 | 171 | cmap = cm.get_cmap() 172 | mappable = ax.imshow(embed, cmap=cmap) 173 | cbar = plt.colorbar(mappable, ax=ax, fraction=0.046, pad=0.04) 174 | cbar.set_clim(*color_range) 175 | 176 | ax.set_xticks([]), ax.set_yticks([]) 177 | ax.set_title(title) 178 | -------------------------------------------------------------------------------- /encoder/model.py: -------------------------------------------------------------------------------- 1 | from encoder.params_model import * 2 | from encoder.params_data import * 3 | from scipy.interpolate import interp1d 4 | from sklearn.metrics import roc_curve 5 | from torch.nn.utils import clip_grad_norm_ 6 | from scipy.optimize import brentq 7 | from torch import nn 8 | import numpy as np 9 | import torch 10 | 11 | 12 | class SpeakerEncoder(nn.Module): 13 | def __init__(self, device, loss_device): 14 | super().__init__() 15 | self.loss_device = loss_device 16 | 17 | # Network defition 18 | self.lstm = nn.LSTM(input_size=mel_n_channels, 19 | hidden_size=model_hidden_size, 20 | num_layers=model_num_layers, 21 | batch_first=True).to(device) 22 | self.linear = nn.Linear(in_features=model_hidden_size, 23 | out_features=model_embedding_size).to(device) 24 | self.relu = torch.nn.ReLU().to(device) 25 | 26 | # Cosine similarity scaling (with fixed initial parameter values) 27 | self.similarity_weight = nn.Parameter(torch.tensor([10.])).to(loss_device) 28 | self.similarity_bias = nn.Parameter(torch.tensor([-5.])).to(loss_device) 29 | 30 | # Loss 31 | self.loss_fn = nn.CrossEntropyLoss().to(loss_device) 32 | 33 | def do_gradient_ops(self): 34 | # Gradient scale 35 | self.similarity_weight.grad *= 0.01 36 | self.similarity_bias.grad *= 0.01 37 | 38 | # Gradient clipping 39 | clip_grad_norm_(self.parameters(), 3, norm_type=2) 40 | 41 | def forward(self, utterances, hidden_init=None): 42 | """ 43 | Computes the embeddings of a batch of utterance spectrograms. 
44 | 45 | :param utterances: batch of mel-scale filterbanks of same duration as a tensor of shape 46 | (batch_size, n_frames, n_channels) 47 | :param hidden_init: initial hidden state of the LSTM as a tensor of shape (num_layers, 48 | batch_size, hidden_size). Will default to a tensor of zeros if None. 49 | :return: the embeddings as a tensor of shape (batch_size, embedding_size) 50 | """ 51 | # Pass the input through the LSTM layers and retrieve all outputs, the final hidden state 52 | # and the final cell state. 53 | out, (hidden, cell) = self.lstm(utterances, hidden_init) 54 | 55 | # We take only the hidden state of the last layer 56 | embeds_raw = self.relu(self.linear(hidden[-1])) 57 | 58 | # L2-normalize it 59 | embeds = embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True) 60 | 61 | return embeds 62 | 63 | def similarity_matrix(self, embeds): 64 | """ 65 | Computes the similarity matrix according the section 2.1 of GE2E. 66 | 67 | :param embeds: the embeddings as a tensor of shape (speakers_per_batch, 68 | utterances_per_speaker, embedding_size) 69 | :return: the similarity matrix as a tensor of shape (speakers_per_batch, 70 | utterances_per_speaker, speakers_per_batch) 71 | """ 72 | speakers_per_batch, utterances_per_speaker = embeds.shape[:2] 73 | 74 | # Inclusive centroids (1 per speaker). Cloning is needed for reverse differentiation 75 | centroids_incl = torch.mean(embeds, dim=1, keepdim=True) 76 | centroids_incl = centroids_incl.clone() / torch.norm(centroids_incl, dim=2, keepdim=True) 77 | 78 | # Exclusive centroids (1 per utterance) 79 | centroids_excl = (torch.sum(embeds, dim=1, keepdim=True) - embeds) 80 | centroids_excl /= (utterances_per_speaker - 1) 81 | centroids_excl = centroids_excl.clone() / torch.norm(centroids_excl, dim=2, keepdim=True) 82 | 83 | # Similarity matrix. The cosine similarity of already 2-normed vectors is simply the dot 84 | # product of these vectors (which is just an element-wise multiplication reduced by a sum). 85 | # We vectorize the computation for efficiency. 86 | sim_matrix = torch.zeros(speakers_per_batch, utterances_per_speaker, 87 | speakers_per_batch).to(self.loss_device) 88 | mask_matrix = 1 - np.eye(speakers_per_batch, dtype=np.int) 89 | for j in range(speakers_per_batch): 90 | mask = np.where(mask_matrix[j])[0] 91 | sim_matrix[mask, :, j] = (embeds[mask] * centroids_incl[j]).sum(dim=2) 92 | sim_matrix[j, :, j] = (embeds[j] * centroids_excl[j]).sum(dim=1) 93 | 94 | ## Even more vectorized version (slower maybe because of transpose) 95 | # sim_matrix2 = torch.zeros(speakers_per_batch, speakers_per_batch, utterances_per_speaker 96 | # ).to(self.loss_device) 97 | # eye = np.eye(speakers_per_batch, dtype=np.int) 98 | # mask = np.where(1 - eye) 99 | # sim_matrix2[mask] = (embeds[mask[0]] * centroids_incl[mask[1]]).sum(dim=2) 100 | # mask = np.where(eye) 101 | # sim_matrix2[mask] = (embeds * centroids_excl).sum(dim=2) 102 | # sim_matrix2 = sim_matrix2.transpose(1, 2) 103 | 104 | sim_matrix = sim_matrix * self.similarity_weight + self.similarity_bias 105 | return sim_matrix 106 | 107 | def loss(self, embeds): 108 | """ 109 | Computes the softmax loss according the section 2.1 of GE2E. 110 | 111 | :param embeds: the embeddings as a tensor of shape (speakers_per_batch, 112 | utterances_per_speaker, embedding_size) 113 | :return: the loss and the EER for this batch of embeddings. 
114 | """ 115 | speakers_per_batch, utterances_per_speaker = embeds.shape[:2] 116 | 117 | # Loss 118 | sim_matrix = self.similarity_matrix(embeds) 119 | sim_matrix = sim_matrix.reshape((speakers_per_batch * utterances_per_speaker, 120 | speakers_per_batch)) 121 | ground_truth = np.repeat(np.arange(speakers_per_batch), utterances_per_speaker) 122 | target = torch.from_numpy(ground_truth).long().to(self.loss_device) 123 | loss = self.loss_fn(sim_matrix, target) 124 | 125 | # EER (not backpropagated) 126 | with torch.no_grad(): 127 | inv_argmax = lambda i: np.eye(1, speakers_per_batch, i, dtype=np.int)[0] 128 | labels = np.array([inv_argmax(i) for i in ground_truth]) 129 | preds = sim_matrix.detach().cpu().numpy() 130 | 131 | # Snippet from https://yangcha.github.io/EER-ROC/ 132 | fpr, tpr, thresholds = roc_curve(labels.flatten(), preds.flatten()) 133 | eer = brentq(lambda x: 1. - x - interp1d(fpr, tpr)(x), 0., 1.) 134 | 135 | return loss, eer -------------------------------------------------------------------------------- /encoder/params_data.py: -------------------------------------------------------------------------------- 1 | 2 | ## Mel-filterbank 3 | mel_window_length = 25 # In milliseconds 4 | mel_window_step = 10 # In milliseconds 5 | mel_n_channels = 40 6 | 7 | 8 | ## Audio 9 | sampling_rate = 16000 10 | # Number of spectrogram frames in a partial utterance 11 | partials_n_frames = 160 # 1600 ms 12 | # Number of spectrogram frames at inference 13 | inference_n_frames = 80 # 800 ms 14 | 15 | 16 | ## Voice Activation Detection 17 | # Window size of the VAD. Must be either 10, 20 or 30 milliseconds. 18 | # This sets the granularity of the VAD. Should not need to be changed. 19 | vad_window_length = 30 # In milliseconds 20 | # Number of frames to average together when performing the moving average smoothing. 21 | # The larger this value, the larger the VAD variations must be to not get smoothed out. 22 | vad_moving_average_width = 8 23 | # Maximum number of consecutive silent frames a segment can have. 
24 | vad_max_silence_length = 6 25 | 26 | 27 | ## Audio volume normalization 28 | audio_norm_target_dBFS = -30 29 | 30 | -------------------------------------------------------------------------------- /encoder/params_model.py: -------------------------------------------------------------------------------- 1 | 2 | ## Model parameters 3 | model_hidden_size = 256 4 | model_embedding_size = 256 5 | model_num_layers = 3 6 | 7 | 8 | ## Training parameters 9 | learning_rate_init = 1e-4 10 | speakers_per_batch = 64 11 | utterances_per_speaker = 10 12 | -------------------------------------------------------------------------------- /encoder/train.py: -------------------------------------------------------------------------------- 1 | from encoder.visualizations import Visualizations 2 | from encoder.data_objects import SpeakerVerificationDataLoader, SpeakerVerificationDataset 3 | from encoder.params_model import * 4 | from encoder.model import SpeakerEncoder 5 | from utils.profiler import Profiler 6 | from pathlib import Path 7 | import torch 8 | 9 | def sync(device: torch.device): 10 | # FIXME 11 | return 12 | # For correct profiling (cuda operations are async) 13 | if device.type == "cuda": 14 | torch.cuda.synchronize(device) 15 | 16 | def train(run_id: str, clean_data_root: Path, models_dir: Path, umap_every: int, save_every: int, 17 | backup_every: int, vis_every: int, force_restart: bool, visdom_server: str, 18 | no_visdom: bool): 19 | # Create a dataset and a dataloader 20 | dataset = SpeakerVerificationDataset(clean_data_root) 21 | loader = SpeakerVerificationDataLoader( 22 | dataset, 23 | speakers_per_batch, 24 | utterances_per_speaker, 25 | num_workers=8, 26 | ) 27 | 28 | # Setup the device on which to run the forward pass and the loss. These can be different, 29 | # because the forward pass is faster on the GPU whereas the loss is often (depending on your 30 | # hyperparameters) faster on the CPU. 31 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 32 | # FIXME: currently, the gradient is None if loss_device is cuda 33 | loss_device = torch.device("cpu") 34 | 35 | # Create the model and the optimizer 36 | model = SpeakerEncoder(device, loss_device) 37 | optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate_init) 38 | init_step = 1 39 | 40 | # Configure file path for the model 41 | state_fpath = models_dir.joinpath(run_id + ".pt") 42 | backup_dir = models_dir.joinpath(run_id + "_backups") 43 | 44 | # Load any existing model 45 | if not force_restart: 46 | if state_fpath.exists(): 47 | print("Found existing model \"%s\", loading it and resuming training." % run_id) 48 | checkpoint = torch.load(state_fpath) 49 | init_step = checkpoint["step"] 50 | model.load_state_dict(checkpoint["model_state"]) 51 | optimizer.load_state_dict(checkpoint["optimizer_state"]) 52 | optimizer.param_groups[0]["lr"] = learning_rate_init 53 | else: 54 | print("No model \"%s\" found, starting training from scratch." 
% run_id) 55 | else: 56 | print("Starting the training from scratch.") 57 | model.train() 58 | 59 | # Initialize the visualization environment 60 | vis = Visualizations(run_id, vis_every, server=visdom_server, disabled=no_visdom) 61 | vis.log_dataset(dataset) 62 | vis.log_params() 63 | device_name = str(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU") 64 | vis.log_implementation({"Device": device_name}) 65 | 66 | # Training loop 67 | profiler = Profiler(summarize_every=10, disabled=False) 68 | for step, speaker_batch in enumerate(loader, init_step): 69 | profiler.tick("Blocking, waiting for batch (threaded)") 70 | 71 | # Forward pass 72 | inputs = torch.from_numpy(speaker_batch.data).to(device) 73 | sync(device) 74 | profiler.tick("Data to %s" % device) 75 | embeds = model(inputs) 76 | sync(device) 77 | profiler.tick("Forward pass") 78 | embeds_loss = embeds.view((speakers_per_batch, utterances_per_speaker, -1)).to(loss_device) 79 | loss, eer = model.loss(embeds_loss) 80 | sync(loss_device) 81 | profiler.tick("Loss") 82 | 83 | # Backward pass 84 | model.zero_grad() 85 | loss.backward() 86 | profiler.tick("Backward pass") 87 | model.do_gradient_ops() 88 | optimizer.step() 89 | profiler.tick("Parameter update") 90 | 91 | # Update visualizations 92 | # learning_rate = optimizer.param_groups[0]["lr"] 93 | vis.update(loss.item(), eer, step) 94 | 95 | # Draw projections and save them to the backup folder 96 | if umap_every != 0 and step % umap_every == 0: 97 | print("Drawing and saving projections (step %d)" % step) 98 | backup_dir.mkdir(exist_ok=True) 99 | projection_fpath = backup_dir.joinpath("%s_umap_%06d.png" % (run_id, step)) 100 | embeds = embeds.detach().cpu().numpy() 101 | vis.draw_projections(embeds, utterances_per_speaker, step, projection_fpath) 102 | vis.save() 103 | 104 | # Overwrite the latest version of the model 105 | if save_every != 0 and step % save_every == 0: 106 | print("Saving the model (step %d)" % step) 107 | torch.save({ 108 | "step": step + 1, 109 | "model_state": model.state_dict(), 110 | "optimizer_state": optimizer.state_dict(), 111 | }, state_fpath) 112 | 113 | # Make a backup 114 | if backup_every != 0 and step % backup_every == 0: 115 | print("Making a backup (step %d)" % step) 116 | backup_dir.mkdir(exist_ok=True) 117 | backup_fpath = backup_dir.joinpath("%s_bak_%06d.pt" % (run_id, step)) 118 | torch.save({ 119 | "step": step + 1, 120 | "model_state": model.state_dict(), 121 | "optimizer_state": optimizer.state_dict(), 122 | }, backup_fpath) 123 | 124 | profiler.tick("Extras (visualizations, saving)") 125 | -------------------------------------------------------------------------------- /encoder/visualizations.py: -------------------------------------------------------------------------------- 1 | from encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset 2 | from datetime import datetime 3 | from time import perf_counter as timer 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | # import webbrowser 7 | import visdom 8 | import umap 9 | 10 | colormap = np.array([ 11 | [76, 255, 0], 12 | [0, 127, 70], 13 | [255, 0, 0], 14 | [255, 217, 38], 15 | [0, 135, 255], 16 | [165, 0, 165], 17 | [255, 167, 255], 18 | [0, 255, 255], 19 | [255, 96, 38], 20 | [142, 76, 0], 21 | [33, 0, 127], 22 | [0, 0, 0], 23 | [183, 183, 183], 24 | ], dtype=np.float) / 255 25 | 26 | 27 | class Visualizations: 28 | def __init__(self, env_name=None, update_every=10, server="http://localhost", disabled=False): 29 | # Tracking data 
30 | self.last_update_timestamp = timer() 31 | self.update_every = update_every 32 | self.step_times = [] 33 | self.losses = [] 34 | self.eers = [] 35 | print("Updating the visualizations every %d steps." % update_every) 36 | 37 | # If visdom is disabled TODO: use a better paradigm for that 38 | self.disabled = disabled 39 | if self.disabled: 40 | return 41 | 42 | # Set the environment name 43 | now = str(datetime.now().strftime("%d-%m %Hh%M")) 44 | if env_name is None: 45 | self.env_name = now 46 | else: 47 | self.env_name = "%s (%s)" % (env_name, now) 48 | 49 | # Connect to visdom and open the corresponding window in the browser 50 | try: 51 | self.vis = visdom.Visdom(server, env=self.env_name, raise_exceptions=True) 52 | except ConnectionError: 53 | raise Exception("No visdom server detected. Run the command \"visdom\" in your CLI to " 54 | "start it.") 55 | # webbrowser.open("http://localhost:8097/env/" + self.env_name) 56 | 57 | # Create the windows 58 | self.loss_win = None 59 | self.eer_win = None 60 | # self.lr_win = None 61 | self.implementation_win = None 62 | self.projection_win = None 63 | self.implementation_string = "" 64 | 65 | def log_params(self): 66 | if self.disabled: 67 | return 68 | from encoder import params_data 69 | from encoder import params_model 70 | param_string = "Model parameters:
" 71 | for param_name in (p for p in dir(params_model) if not p.startswith("__")): 72 | value = getattr(params_model, param_name) 73 | param_string += "\t%s: %s
" % (param_name, value) 74 | param_string += "Data parameters:
" 75 | for param_name in (p for p in dir(params_data) if not p.startswith("__")): 76 | value = getattr(params_data, param_name) 77 | param_string += "\t%s: %s
" % (param_name, value) 78 | self.vis.text(param_string, opts={"title": "Parameters"}) 79 | 80 | def log_dataset(self, dataset: SpeakerVerificationDataset): 81 | if self.disabled: 82 | return 83 | dataset_string = "" 84 | dataset_string += "Speakers: %s\n" % len(dataset.speakers) 85 | dataset_string += "\n" + dataset.get_logs() 86 | dataset_string = dataset_string.replace("\n", "
") 87 | self.vis.text(dataset_string, opts={"title": "Dataset"}) 88 | 89 | def log_implementation(self, params): 90 | if self.disabled: 91 | return 92 | implementation_string = "" 93 | for param, value in params.items(): 94 | implementation_string += "%s: %s\n" % (param, value) 95 | implementation_string = implementation_string.replace("\n", "
") 96 | self.implementation_string = implementation_string 97 | self.implementation_win = self.vis.text( 98 | implementation_string, 99 | opts={"title": "Training implementation"} 100 | ) 101 | 102 | def update(self, loss, eer, step): 103 | # Update the tracking data 104 | now = timer() 105 | self.step_times.append(1000 * (now - self.last_update_timestamp)) 106 | self.last_update_timestamp = now 107 | self.losses.append(loss) 108 | self.eers.append(eer) 109 | print(".", end="") 110 | 111 | # Update the plots every steps 112 | if step % self.update_every != 0: 113 | return 114 | time_string = "Step time: mean: %5dms std: %5dms" % \ 115 | (int(np.mean(self.step_times)), int(np.std(self.step_times))) 116 | print("\nStep %6d Loss: %.4f EER: %.4f %s" % 117 | (step, np.mean(self.losses), np.mean(self.eers), time_string)) 118 | if not self.disabled: 119 | self.loss_win = self.vis.line( 120 | [np.mean(self.losses)], 121 | [step], 122 | win=self.loss_win, 123 | update="append" if self.loss_win else None, 124 | opts=dict( 125 | legend=["Avg. loss"], 126 | xlabel="Step", 127 | ylabel="Loss", 128 | title="Loss", 129 | ) 130 | ) 131 | self.eer_win = self.vis.line( 132 | [np.mean(self.eers)], 133 | [step], 134 | win=self.eer_win, 135 | update="append" if self.eer_win else None, 136 | opts=dict( 137 | legend=["Avg. EER"], 138 | xlabel="Step", 139 | ylabel="EER", 140 | title="Equal error rate" 141 | ) 142 | ) 143 | if self.implementation_win is not None: 144 | self.vis.text( 145 | self.implementation_string + ("%s" % time_string), 146 | win=self.implementation_win, 147 | opts={"title": "Training implementation"}, 148 | ) 149 | 150 | # Reset the tracking 151 | self.losses.clear() 152 | self.eers.clear() 153 | self.step_times.clear() 154 | 155 | def draw_projections(self, embeds, utterances_per_speaker, step, out_fpath=None, 156 | max_speakers=10): 157 | max_speakers = min(max_speakers, len(colormap)) 158 | embeds = embeds[:max_speakers * utterances_per_speaker] 159 | 160 | n_speakers = len(embeds) // utterances_per_speaker 161 | ground_truth = np.repeat(np.arange(n_speakers), utterances_per_speaker) 162 | colors = [colormap[i] for i in ground_truth] 163 | 164 | reducer = umap.UMAP() 165 | projected = reducer.fit_transform(embeds) 166 | plt.scatter(projected[:, 0], projected[:, 1], c=colors) 167 | plt.gca().set_aspect("equal", "datalim") 168 | plt.title("UMAP projection (step %d)" % step) 169 | if not self.disabled: 170 | self.projection_win = self.vis.matplot(plt, win=self.projection_win) 171 | if out_fpath is not None: 172 | plt.savefig(out_fpath) 173 | plt.clf() 174 | 175 | def save(self): 176 | if not self.disabled: 177 | self.vis.save([self.env_name]) 178 | -------------------------------------------------------------------------------- /encoder_preprocess.py: -------------------------------------------------------------------------------- 1 | from encoder.preprocess import preprocess_librispeech, preprocess_voxceleb1, preprocess_voxceleb2, preprocess_aishell1, preprocess_magicdata, preprocess_aidatatang, preprocess_thchs30, preprocess_mozilla, preprocess_primewords, preprocess_stcmds 2 | from utils.argutils import print_args 3 | from pathlib import Path 4 | import argparse 5 | 6 | 7 | if __name__ == "__main__": 8 | class MyFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter): 9 | pass 10 | 11 | parser = argparse.ArgumentParser( 12 | description="Preprocesses audio files from datasets, encodes them as mel spectrograms and " 13 | "writes them to the disk. 
This will allow you to train the encoder. The " 14 | "datasets required are at least one of VoxCeleb1, VoxCeleb2 and LibriSpeech. " 15 | "Ideally, you should have all three. You should extract them as they are " 16 | "after having downloaded them and put them in a same directory, e.g.:\n" 17 | "-[datasets_root]\n" 18 | " -LibriSpeech\n" 19 | " -train-other-500\n" 20 | " -VoxCeleb1\n" 21 | " -wav\n" 22 | " -vox1_meta.csv\n" 23 | " -VoxCeleb2\n" 24 | " -dev", 25 | formatter_class=MyFormatter 26 | ) 27 | parser.add_argument("datasets_root", type=Path, help=\ 28 | "Path to the directory containing your LibriSpeech/TTS and VoxCeleb datasets.") 29 | parser.add_argument("-o", "--out_dir", type=Path, default=argparse.SUPPRESS, help=\ 30 | "Path to the output directory that will contain the mel spectrograms. If left out, " 31 | "defaults to /SV2TTS/encoder/") 32 | parser.add_argument("-d", "--datasets", type=str, 33 | default="aishell1,magicdata,aidatatang,thchs30,mozilla,primewords,stcmds", help=\ 34 | "Comma-separated list of the name of the datasets you want to preprocess. Only the train " 35 | "set of these datasets will be used. Possible names: librispeech_other, voxceleb1, " 36 | "voxceleb2.") 37 | parser.add_argument("-s", "--skip_existing", action="store_true", help=\ 38 | "Whether to skip existing output files with the same name. Useful if this script was " 39 | "interrupted.") 40 | args = parser.parse_args() 41 | 42 | # Process the arguments 43 | args.datasets = args.datasets.split(",") 44 | if not hasattr(args, "out_dir"): 45 | args.out_dir = args.datasets_root.joinpath("SV2TTS", "encoder") 46 | assert args.datasets_root.exists() 47 | args.out_dir.mkdir(exist_ok=True, parents=True) 48 | 49 | # Preprocess the datasets 50 | print_args(args, parser) 51 | preprocess_func = { 52 | "librispeech_other": preprocess_librispeech, 53 | "voxceleb1": preprocess_voxceleb1, 54 | "voxceleb2": preprocess_voxceleb2, 55 | "aishell1": preprocess_aishell1, 56 | "magicdata": preprocess_magicdata, 57 | "aidatatang": preprocess_aidatatang, 58 | "thchs30": preprocess_thchs30, 59 | "mozilla": preprocess_mozilla, 60 | "primewords": preprocess_primewords, 61 | "stcmds": preprocess_stcmds, 62 | } 63 | args = vars(args) 64 | for dataset in args.pop("datasets"): 65 | print("Preprocessing %s" % dataset) 66 | preprocess_func[dataset](**args) 67 | -------------------------------------------------------------------------------- /encoder_train.py: -------------------------------------------------------------------------------- 1 | from utils.argutils import print_args 2 | from encoder.train import train 3 | from pathlib import Path 4 | import argparse 5 | 6 | 7 | if __name__ == "__main__": 8 | parser = argparse.ArgumentParser( 9 | description="Trains the speaker encoder. You must have run encoder_preprocess.py first.", 10 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 11 | ) 12 | 13 | parser.add_argument("run_id", type=str, help= \ 14 | "Name for this model instance. If a model state from the same run ID was previously " 15 | "saved, the training will restart from there. Pass -f to overwrite saved states and " 16 | "restart from scratch.") 17 | parser.add_argument("clean_data_root", type=Path, help= \ 18 | "Path to the output directory of encoder_preprocess.py. 
If you left the default " 19 | "output directory when preprocessing, it should be /SV2TTS/encoder/.") 20 | parser.add_argument("-m", "--models_dir", type=Path, default="encoder/saved_models/", help=\ 21 | "Path to the output directory that will contain the saved model weights, as well as " 22 | "backups of those weights and plots generated during training.") 23 | parser.add_argument("-v", "--vis_every", type=int, default=10, help= \ 24 | "Number of steps between updates of the loss and the plots.") 25 | parser.add_argument("-u", "--umap_every", type=int, default=100, help= \ 26 | "Number of steps between updates of the umap projection. Set to 0 to never update the " 27 | "projections.") 28 | parser.add_argument("-s", "--save_every", type=int, default=500, help= \ 29 | "Number of steps between updates of the model on the disk. Set to 0 to never save the " 30 | "model.") 31 | parser.add_argument("-b", "--backup_every", type=int, default=7500, help= \ 32 | "Number of steps between backups of the model. Set to 0 to never make backups of the " 33 | "model.") 34 | parser.add_argument("-f", "--force_restart", action="store_true", help= \ 35 | "Do not load any saved model.") 36 | parser.add_argument("--visdom_server", type=str, default="http://localhost") 37 | parser.add_argument("--no_visdom", action="store_true", help= \ 38 | "Disable visdom.") 39 | args = parser.parse_args() 40 | 41 | # Process the arguments 42 | args.models_dir.mkdir(exist_ok=True) 43 | 44 | # Run the training 45 | print_args(args, parser) 46 | train(**vars(args)) 47 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow-gpu>=1.10.0,<=1.14.0 2 | umap-learn 3 | visdom 4 | webrtcvad 5 | librosa>=0.5.1 6 | matplotlib>=2.0.2 7 | numpy>=1.14.0 8 | scipy>=1.0.0 9 | tqdm 10 | sounddevice 11 | Unidecode 12 | inflect 13 | PyQt5 14 | multiprocess 15 | numba 16 | pandas -------------------------------------------------------------------------------- /synthesizer/LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Original work Copyright (c) 2018 Rayhane Mama (https://github.com/Rayhane-mamah) 4 | Modified work Copyright (c) 2019 Corentin Jemine (https://github.com/CorentinJ) 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 
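A minimal command-line sketch of the encoder pipeline defined by encoder_preprocess.py and
encoder_train.py above (the dataset path is an illustrative placeholder; the flags are the
ones declared in those argument parsers, shown here with their default values):

    python encoder_preprocess.py <datasets_root> --datasets librispeech_other,voxceleb1,voxceleb2
    python encoder_train.py my_run <datasets_root>/SV2TTS/encoder/ --vis_every 10 --save_every 500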
23 | -------------------------------------------------------------------------------- /synthesizer/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------- /synthesizer/audio.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import librosa.filters 3 | import numpy as np 4 | import tensorflow as tf 5 | from scipy import signal 6 | from scipy.io import wavfile 7 | 8 | 9 | def load_wav(path, sr): 10 | return librosa.core.load(path, sr=sr)[0] 11 | 12 | def save_wav(wav, path, sr): 13 | wav *= 32767 / max(0.01, np.max(np.abs(wav))) 14 | #proposed by @dsmiller 15 | wavfile.write(path, sr, wav.astype(np.int16)) 16 | 17 | def save_wavenet_wav(wav, path, sr): 18 | librosa.output.write_wav(path, wav, sr=sr) 19 | 20 | def preemphasis(wav, k, preemphasize=True): 21 | if preemphasize: 22 | return signal.lfilter([1, -k], [1], wav) 23 | return wav 24 | 25 | def inv_preemphasis(wav, k, inv_preemphasize=True): 26 | if inv_preemphasize: 27 | return signal.lfilter([1], [1, -k], wav) 28 | return wav 29 | 30 | #From https://github.com/r9y9/wavenet_vocoder/blob/master/audio.py 31 | def start_and_end_indices(quantized, silence_threshold=2): 32 | for start in range(quantized.size): 33 | if abs(quantized[start] - 127) > silence_threshold: 34 | break 35 | for end in range(quantized.size - 1, 1, -1): 36 | if abs(quantized[end] - 127) > silence_threshold: 37 | break 38 | 39 | assert abs(quantized[start] - 127) > silence_threshold 40 | assert abs(quantized[end] - 127) > silence_threshold 41 | 42 | return start, end 43 | 44 | def get_hop_size(hparams): 45 | hop_size = hparams.hop_size 46 | if hop_size is None: 47 | assert hparams.frame_shift_ms is not None 48 | hop_size = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate) 49 | return hop_size 50 | 51 | def linearspectrogram(wav, hparams): 52 | D = _stft(preemphasis(wav, hparams.preemphasis, hparams.preemphasize), hparams) 53 | S = _amp_to_db(np.abs(D), hparams) - hparams.ref_level_db 54 | 55 | if hparams.signal_normalization: 56 | return _normalize(S, hparams) 57 | return S 58 | 59 | def melspectrogram(wav, hparams): 60 | D = _stft(preemphasis(wav, hparams.preemphasis, hparams.preemphasize), hparams) 61 | S = _amp_to_db(_linear_to_mel(np.abs(D), hparams), hparams) - hparams.ref_level_db 62 | 63 | if hparams.signal_normalization: 64 | return _normalize(S, hparams) 65 | return S 66 | 67 | def inv_linear_spectrogram(linear_spectrogram, hparams): 68 | """Converts linear spectrogram to waveform using librosa""" 69 | if hparams.signal_normalization: 70 | D = _denormalize(linear_spectrogram, hparams) 71 | else: 72 | D = linear_spectrogram 73 | 74 | S = _db_to_amp(D + hparams.ref_level_db) #Convert back to linear 75 | 76 | if hparams.use_lws: 77 | processor = _lws_processor(hparams) 78 | D = processor.run_lws(S.astype(np.float64).T ** hparams.power) 79 | y = processor.istft(D).astype(np.float32) 80 | return inv_preemphasis(y, hparams.preemphasis, hparams.preemphasize) 81 | else: 82 | return inv_preemphasis(_griffin_lim(S ** hparams.power, hparams), hparams.preemphasis, hparams.preemphasize) 83 | 84 | def inv_mel_spectrogram(mel_spectrogram, hparams): 85 | """Converts mel spectrogram to waveform using librosa""" 86 | if hparams.signal_normalization: 87 | D = _denormalize(mel_spectrogram, hparams) 88 | else: 89 | D = mel_spectrogram 90 | 91 | S = _mel_to_linear(_db_to_amp(D + 
hparams.ref_level_db), hparams) # Convert back to linear 92 | 93 | if hparams.use_lws: 94 | processor = _lws_processor(hparams) 95 | D = processor.run_lws(S.astype(np.float64).T ** hparams.power) 96 | y = processor.istft(D).astype(np.float32) 97 | return inv_preemphasis(y, hparams.preemphasis, hparams.preemphasize) 98 | else: 99 | return inv_preemphasis(_griffin_lim(S ** hparams.power, hparams), hparams.preemphasis, hparams.preemphasize) 100 | 101 | def _lws_processor(hparams): 102 | import lws 103 | return lws.lws(hparams.n_fft, get_hop_size(hparams), fftsize=hparams.win_size, mode="speech") 104 | 105 | def _griffin_lim(S, hparams): 106 | """librosa implementation of Griffin-Lim 107 | Based on https://github.com/librosa/librosa/issues/434 108 | """ 109 | angles = np.exp(2j * np.pi * np.random.rand(*S.shape)) 110 | S_complex = np.abs(S).astype(np.complex) 111 | y = _istft(S_complex * angles, hparams) 112 | for i in range(hparams.griffin_lim_iters): 113 | angles = np.exp(1j * np.angle(_stft(y, hparams))) 114 | y = _istft(S_complex * angles, hparams) 115 | return y 116 | 117 | def _stft(y, hparams): 118 | if hparams.use_lws: 119 | return _lws_processor(hparams).stft(y).T 120 | else: 121 | return librosa.stft(y=y, n_fft=hparams.n_fft, hop_length=get_hop_size(hparams), win_length=hparams.win_size) 122 | 123 | def _istft(y, hparams): 124 | return librosa.istft(y, hop_length=get_hop_size(hparams), win_length=hparams.win_size) 125 | 126 | ########################################################## 127 | #Those are only correct when using lws!!! (This was messing with Wavenet quality for a long time!) 128 | def num_frames(length, fsize, fshift): 129 | """Compute number of time frames of spectrogram 130 | """ 131 | pad = (fsize - fshift) 132 | if length % fshift == 0: 133 | M = (length + pad * 2 - fsize) // fshift + 1 134 | else: 135 | M = (length + pad * 2 - fsize) // fshift + 2 136 | return M 137 | 138 | 139 | def pad_lr(x, fsize, fshift): 140 | """Compute left and right padding 141 | """ 142 | M = num_frames(len(x), fsize, fshift) 143 | pad = (fsize - fshift) 144 | T = len(x) + 2 * pad 145 | r = (M - 1) * fshift + fsize - T 146 | return pad, pad + r 147 | ########################################################## 148 | #Librosa correct padding 149 | def librosa_pad_lr(x, fsize, fshift): 150 | return 0, (x.shape[0] // fshift + 1) * fshift - x.shape[0] 151 | 152 | # Conversions 153 | _mel_basis = None 154 | _inv_mel_basis = None 155 | 156 | def _linear_to_mel(spectogram, hparams): 157 | global _mel_basis 158 | if _mel_basis is None: 159 | _mel_basis = _build_mel_basis(hparams) 160 | return np.dot(_mel_basis, spectogram) 161 | 162 | def _mel_to_linear(mel_spectrogram, hparams): 163 | global _inv_mel_basis 164 | if _inv_mel_basis is None: 165 | _inv_mel_basis = np.linalg.pinv(_build_mel_basis(hparams)) 166 | return np.maximum(1e-10, np.dot(_inv_mel_basis, mel_spectrogram)) 167 | 168 | def _build_mel_basis(hparams): 169 | assert hparams.fmax <= hparams.sample_rate // 2 170 | return librosa.filters.mel(hparams.sample_rate, hparams.n_fft, n_mels=hparams.num_mels, 171 | fmin=hparams.fmin, fmax=hparams.fmax) 172 | 173 | def _amp_to_db(x, hparams): 174 | min_level = np.exp(hparams.min_level_db / 20 * np.log(10)) 175 | return 20 * np.log10(np.maximum(min_level, x)) 176 | 177 | def _db_to_amp(x): 178 | return np.power(10.0, (x) * 0.05) 179 | 180 | def _normalize(S, hparams): 181 | if hparams.allow_clipping_in_normalization: 182 | if hparams.symmetric_mels: 183 | return np.clip((2 * hparams.max_abs_value) * 
((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value, 184 | -hparams.max_abs_value, hparams.max_abs_value) 185 | else: 186 | return np.clip(hparams.max_abs_value * ((S - hparams.min_level_db) / (-hparams.min_level_db)), 0, hparams.max_abs_value) 187 | 188 | assert S.max() <= 0 and S.min() - hparams.min_level_db >= 0 189 | if hparams.symmetric_mels: 190 | return (2 * hparams.max_abs_value) * ((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value 191 | else: 192 | return hparams.max_abs_value * ((S - hparams.min_level_db) / (-hparams.min_level_db)) 193 | 194 | def _denormalize(D, hparams): 195 | if hparams.allow_clipping_in_normalization: 196 | if hparams.symmetric_mels: 197 | return (((np.clip(D, -hparams.max_abs_value, 198 | hparams.max_abs_value) + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value)) 199 | + hparams.min_level_db) 200 | else: 201 | return ((np.clip(D, 0, hparams.max_abs_value) * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db) 202 | 203 | if hparams.symmetric_mels: 204 | return (((D + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value)) + hparams.min_level_db) 205 | else: 206 | return ((D * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db) 207 | -------------------------------------------------------------------------------- /synthesizer/inference.py: -------------------------------------------------------------------------------- 1 | from synthesizer.tacotron2 import Tacotron2 2 | from synthesizer.hparams import hparams 3 | from multiprocess.pool import Pool # You're free to use either one 4 | #from multiprocessing import Pool # 5 | from synthesizer import audio 6 | from pathlib import Path 7 | from typing import Union, List 8 | import tensorflow as tf 9 | import numpy as np 10 | import numba.cuda 11 | import librosa 12 | 13 | 14 | class Synthesizer: 15 | sample_rate = hparams.sample_rate 16 | hparams = hparams 17 | 18 | def __init__(self, checkpoints_dir: Path, verbose=True, low_mem=False): 19 | """ 20 | Creates a synthesizer ready for inference. The actual model isn't loaded in memory until 21 | needed or until load() is called. 22 | 23 | :param checkpoints_dir: path to the directory containing the checkpoint file as well as the 24 | weight files (.data, .index and .meta files) 25 | :param verbose: if False, only tensorflow's output will be printed TODO: suppress them too 26 | :param low_mem: if True, the model will be loaded in a separate process and its resources 27 | will be released after each usage. Adds a large overhead, only recommended if your GPU 28 | memory is low (<= 2gb) 29 | """ 30 | self.verbose = verbose 31 | self._low_mem = low_mem 32 | 33 | # Prepare the model 34 | self._model = None # type: Tacotron2 35 | checkpoint_state = tf.train.get_checkpoint_state(checkpoints_dir) 36 | if checkpoint_state is None: 37 | raise Exception("Could not find any synthesizer weights under %s" % checkpoints_dir) 38 | self.checkpoint_fpath = checkpoint_state.model_checkpoint_path 39 | if verbose: 40 | model_name = checkpoints_dir.parent.name.replace("logs-", "") 41 | step = int(self.checkpoint_fpath[self.checkpoint_fpath.rfind('-') + 1:]) 42 | print("Found synthesizer \"%s\" trained to step %d" % (model_name, step)) 43 | 44 | def is_loaded(self): 45 | """ 46 | Whether the model is loaded in GPU memory. 
47 | """ 48 | return self._model is not None 49 | 50 | def load(self): 51 | """ 52 | Effectively loads the model to GPU memory given the weights file that was passed in the 53 | constructor. 54 | """ 55 | if self._low_mem: 56 | raise Exception("Cannot load the synthesizer permanently in low mem mode") 57 | tf.reset_default_graph() 58 | self._model = Tacotron2(self.checkpoint_fpath, hparams) 59 | 60 | def synthesize_spectrograms(self, texts: List[str], 61 | embeddings: Union[np.ndarray, List[np.ndarray]], 62 | return_alignments=False): 63 | """ 64 | Synthesizes mel spectrograms from texts and speaker embeddings. 65 | 66 | :param texts: a list of N text prompts to be synthesized 67 | :param embeddings: a numpy array or list of speaker embeddings of shape (N, 256) 68 | :param return_alignments: if True, a matrix representing the alignments between the 69 | characters 70 | and each decoder output step will be returned for each spectrogram 71 | :return: a list of N melspectrograms as numpy arrays of shape (80, Mi), where Mi is the 72 | sequence length of spectrogram i, and possibly the alignments. 73 | """ 74 | if not self._low_mem: 75 | # Usual inference mode: load the model on the first request and keep it loaded. 76 | if not self.is_loaded(): 77 | self.load() 78 | specs, alignments = self._model.my_synthesize(embeddings, texts) 79 | else: 80 | # Low memory inference mode: load the model upon every request. The model has to be 81 | # loaded in a separate process to be able to release GPU memory (a simple workaround 82 | # to tensorflow's intricacies) 83 | specs, alignments = Pool(1).starmap(Synthesizer._one_shot_synthesize_spectrograms, 84 | [(self.checkpoint_fpath, embeddings, texts)])[0] 85 | 86 | return (specs, alignments) if return_alignments else specs 87 | 88 | @staticmethod 89 | def _one_shot_synthesize_spectrograms(checkpoint_fpath, embeddings, texts): 90 | # Load the model and forward the inputs 91 | tf.reset_default_graph() 92 | model = Tacotron2(checkpoint_fpath, hparams) 93 | specs, alignments = model.my_synthesize(embeddings, texts) 94 | 95 | # Detach the outputs (not doing so will cause the process to hang) 96 | specs, alignments = [spec.copy() for spec in specs], alignments.copy() 97 | 98 | # Close cuda for this process 99 | model.session.close() 100 | numba.cuda.select_device(0) 101 | numba.cuda.close() 102 | 103 | return specs, alignments 104 | 105 | @staticmethod 106 | def load_preprocess_wav(fpath): 107 | """ 108 | Loads and preprocesses an audio file under the same conditions the audio files were used to 109 | train the synthesizer. 110 | """ 111 | wav = librosa.load(fpath, hparams.sample_rate)[0] 112 | if hparams.rescale: 113 | wav = wav / np.abs(wav).max() * hparams.rescaling_max 114 | return wav 115 | 116 | @staticmethod 117 | def make_spectrogram(fpath_or_wav: Union[str, Path, np.ndarray]): 118 | """ 119 | Creates a mel spectrogram from an audio file in the same manner as the mel spectrograms that 120 | were fed to the synthesizer when training. 121 | """ 122 | if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path): 123 | wav = Synthesizer.load_preprocess_wav(fpath_or_wav) 124 | else: 125 | wav = fpath_or_wav 126 | 127 | mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32) 128 | return mel_spectrogram 129 | 130 | @staticmethod 131 | def griffin_lim(mel): 132 | """ 133 | Inverts a mel spectrogram using Griffin-Lim. The mel spectrogram is expected to have been built 134 | with the same parameters present in hparams.py. 
135 | """ 136 | return audio.inv_mel_spectrogram(mel, hparams) 137 | -------------------------------------------------------------------------------- /synthesizer/infolog.py: -------------------------------------------------------------------------------- 1 | import atexit 2 | import json 3 | from datetime import datetime 4 | from threading import Thread 5 | from urllib.request import Request, urlopen 6 | 7 | _format = "%Y-%m-%d %H:%M:%S.%f" 8 | _file = None 9 | _run_name = None 10 | _slack_url = None 11 | 12 | 13 | def init(filename, run_name, slack_url=None): 14 | global _file, _run_name, _slack_url 15 | _close_logfile() 16 | _file = open(filename, "a") 17 | _file = open(filename, "a") 18 | _file.write("\n-----------------------------------------------------------------\n") 19 | _file.write("Starting new {} training run\n".format(run_name)) 20 | _file.write("-----------------------------------------------------------------\n") 21 | _run_name = run_name 22 | _slack_url = slack_url 23 | 24 | 25 | def log(msg, end="\n", slack=False): 26 | print(msg, end=end) 27 | if _file is not None: 28 | _file.write("[%s] %s\n" % (datetime.now().strftime(_format)[:-3], msg)) 29 | if slack and _slack_url is not None: 30 | Thread(target=_send_slack, args=(msg,)).start() 31 | 32 | 33 | def _close_logfile(): 34 | global _file 35 | if _file is not None: 36 | _file.close() 37 | _file = None 38 | 39 | 40 | def _send_slack(msg): 41 | req = Request(_slack_url) 42 | req.add_header("Content-Type", "application/json") 43 | urlopen(req, json.dumps({ 44 | "username": "tacotron", 45 | "icon_emoji": ":taco:", 46 | "text": "*%s*: %s" % (_run_name, msg) 47 | }).encode()) 48 | 49 | 50 | atexit.register(_close_logfile) 51 | -------------------------------------------------------------------------------- /synthesizer/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .tacotron import Tacotron 2 | 3 | 4 | def create_model(name, hparams): 5 | if name == "Tacotron": 6 | return Tacotron(hparams) 7 | else: 8 | raise Exception("Unknown model: " + name) 9 | -------------------------------------------------------------------------------- /synthesizer/models/architecture_wrappers.py: -------------------------------------------------------------------------------- 1 | """A set of wrappers useful for tacotron 2 architecture 2 | All notations and variable names were used in concordance with originial tensorflow implementation 3 | """ 4 | import collections 5 | import tensorflow as tf 6 | from synthesizer.models.attention import _compute_attention 7 | from tensorflow.contrib.rnn import RNNCell 8 | from tensorflow.python.framework import ops, tensor_shape 9 | from tensorflow.python.ops import array_ops, check_ops, rnn_cell_impl, tensor_array_ops 10 | from tensorflow.python.util import nest 11 | 12 | _zero_state_tensors = rnn_cell_impl._zero_state_tensors 13 | 14 | 15 | 16 | class TacotronEncoderCell(RNNCell): 17 | """Tacotron 2 Encoder Cell 18 | Passes inputs through a stack of convolutional layers then through a bidirectional LSTM 19 | layer to predict the hidden representation vector (or memory) 20 | """ 21 | 22 | def __init__(self, convolutional_layers, lstm_layer): 23 | """Initialize encoder parameters 24 | 25 | Args: 26 | convolutional_layers: Encoder convolutional block class 27 | lstm_layer: encoder bidirectional lstm layer class 28 | """ 29 | super(TacotronEncoderCell, self).__init__() 30 | #Initialize encoder layers 31 | self._convolutions = convolutional_layers 32 | 
self._cell = lstm_layer 33 | 34 | def __call__(self, inputs, input_lengths=None): 35 | #Pass input sequence through a stack of convolutional layers 36 | conv_output = self._convolutions(inputs) 37 | 38 | #Extract hidden representation from encoder lstm cells 39 | hidden_representation = self._cell(conv_output, input_lengths) 40 | 41 | #For shape visualization 42 | self.conv_output_shape = conv_output.shape 43 | return hidden_representation 44 | 45 | 46 | class TacotronDecoderCellState( 47 | collections.namedtuple("TacotronDecoderCellState", 48 | ("cell_state", "attention", "time", "alignments", 49 | "alignment_history"))): 50 | """`namedtuple` storing the state of a `TacotronDecoderCell`. 51 | Contains: 52 | - `cell_state`: The state of the wrapped `RNNCell` at the previous time 53 | step. 54 | - `attention`: The attention emitted at the previous time step. 55 | - `time`: int32 scalar containing the current time step. 56 | - `alignments`: A single or tuple of `Tensor`(s) containing the alignments 57 | emitted at the previous time step for each attention mechanism. 58 | - `alignment_history`: a single or tuple of `TensorArray`(s) 59 | containing alignment matrices from all time steps for each attention 60 | mechanism. Call `stack()` on each to convert to a `Tensor`. 61 | """ 62 | def replace(self, **kwargs): 63 | """Clones the current state while overwriting components provided by kwargs. 64 | """ 65 | return super(TacotronDecoderCellState, self)._replace(**kwargs) 66 | 67 | class TacotronDecoderCell(RNNCell): 68 | """Tactron 2 Decoder Cell 69 | Decodes encoder output and previous mel frames into next r frames 70 | 71 | Decoder Step i: 72 | 1) Prenet to compress last output information 73 | 2) Concat compressed inputs with previous context vector (input feeding) * 74 | 3) Decoder RNN (actual decoding) to predict current state s_{i} * 75 | 4) Compute new context vector c_{i} based on s_{i} and a cumulative sum of previous alignments * 76 | 5) Predict new output y_{i} using s_{i} and c_{i} (concatenated) 77 | 6) Predict output ys_{i} using s_{i} and c_{i} (concatenated) 78 | 79 | * : This is typically taking a vanilla LSTM, wrapping it using tensorflow"s attention wrapper, 80 | and wrap that with the prenet before doing an input feeding, and with the prediction layer 81 | that uses RNN states to project on output space. Actions marked with (*) can be replaced with 82 | tensorflow"s attention wrapper call if it was using cumulative alignments instead of previous alignments only. 
83 | """ 84 | 85 | def __init__(self, prenet, attention_mechanism, rnn_cell, frame_projection, stop_projection): 86 | """Initialize decoder parameters 87 | 88 | Args: 89 | prenet: A tensorflow fully connected layer acting as the decoder pre-net 90 | attention_mechanism: A _BaseAttentionMechanism instance, usefull to 91 | learn encoder-decoder alignments 92 | rnn_cell: Instance of RNNCell, main body of the decoder 93 | frame_projection: tensorflow fully connected layer with r * num_mels output units 94 | stop_projection: tensorflow fully connected layer, expected to project to a scalar 95 | and through a sigmoid activation 96 | mask_finished: Boolean, Whether to mask decoder frames after the 97 | """ 98 | super(TacotronDecoderCell, self).__init__() 99 | #Initialize decoder layers 100 | self._prenet = prenet 101 | self._attention_mechanism = attention_mechanism 102 | self._cell = rnn_cell 103 | self._frame_projection = frame_projection 104 | self._stop_projection = stop_projection 105 | 106 | self._attention_layer_size = self._attention_mechanism.values.get_shape()[-1].value 107 | 108 | def _batch_size_checks(self, batch_size, error_message): 109 | return [check_ops.assert_equal(batch_size, 110 | self._attention_mechanism.batch_size, 111 | message=error_message)] 112 | 113 | @property 114 | def output_size(self): 115 | return self._frame_projection.shape 116 | 117 | @property 118 | def state_size(self): 119 | """The `state_size` property of `TacotronDecoderCell`. 120 | 121 | Returns: 122 | An `TacotronDecoderCell` tuple containing shapes used by this object. 123 | """ 124 | return TacotronDecoderCellState( 125 | cell_state=self._cell._cell.state_size, 126 | time=tensor_shape.TensorShape([]), 127 | attention=self._attention_layer_size, 128 | alignments=self._attention_mechanism.alignments_size, 129 | alignment_history=()) 130 | 131 | def zero_state(self, batch_size, dtype): 132 | """Return an initial (zero) state tuple for this `AttentionWrapper`. 133 | 134 | Args: 135 | batch_size: `0D` integer tensor: the batch size. 136 | dtype: The internal state data type. 137 | Returns: 138 | An `TacotronDecoderCellState` tuple containing zeroed out tensors and, 139 | possibly, empty `TensorArray` objects. 140 | Raises: 141 | ValueError: (or, possibly at runtime, InvalidArgument), if 142 | `batch_size` does not match the output size of the encoder passed 143 | to the wrapper object at initialization time. 
144 | """ 145 | with ops.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]): 146 | cell_state = self._cell._cell.zero_state(batch_size, dtype) 147 | error_message = ( 148 | "When calling zero_state of TacotronDecoderCell %s: " % self._base_name + 149 | "Non-matching batch sizes between the memory " 150 | "(encoder output) and the requested batch size.") 151 | with ops.control_dependencies( 152 | self._batch_size_checks(batch_size, error_message)): 153 | cell_state = nest.map_structure( 154 | lambda s: array_ops.identity(s, name="checked_cell_state"), 155 | cell_state) 156 | return TacotronDecoderCellState( 157 | cell_state=cell_state, 158 | time=array_ops.zeros([], dtype=tf.int32), 159 | attention=_zero_state_tensors(self._attention_layer_size, batch_size, 160 | dtype), 161 | alignments=self._attention_mechanism.initial_alignments(batch_size, dtype), 162 | alignment_history=tensor_array_ops.TensorArray(dtype=dtype, size=0, 163 | dynamic_size=True)) 164 | 165 | def __call__(self, inputs, state): 166 | #Information bottleneck (essential for learning attention) 167 | prenet_output = self._prenet(inputs) 168 | 169 | #Concat context vector and prenet output to form LSTM cells input (input feeding) 170 | LSTM_input = tf.concat([prenet_output, state.attention], axis=-1) 171 | 172 | #Unidirectional LSTM layers 173 | LSTM_output, next_cell_state = self._cell(LSTM_input, state.cell_state) 174 | 175 | 176 | #Compute the attention (context) vector and alignments using 177 | #the new decoder cell hidden state as query vector 178 | #and cumulative alignments to extract location features 179 | #The choice of the new cell hidden state (s_{i}) of the last 180 | #decoder RNN Cell is based on Luong et Al. (2015): 181 | #https://arxiv.org/pdf/1508.04025.pdf 182 | previous_alignments = state.alignments 183 | previous_alignment_history = state.alignment_history 184 | context_vector, alignments, cumulated_alignments = _compute_attention(self._attention_mechanism, 185 | LSTM_output, 186 | previous_alignments, 187 | attention_layer=None) 188 | 189 | #Concat LSTM outputs and context vector to form projections inputs 190 | projections_input = tf.concat([LSTM_output, context_vector], axis=-1) 191 | 192 | #Compute predicted frames and predicted 193 | cell_outputs = self._frame_projection(projections_input) 194 | stop_tokens = self._stop_projection(projections_input) 195 | 196 | #Save alignment history 197 | alignment_history = previous_alignment_history.write(state.time, alignments) 198 | 199 | #Prepare next decoder state 200 | next_state = TacotronDecoderCellState( 201 | time=state.time + 1, 202 | cell_state=next_cell_state, 203 | attention=context_vector, 204 | alignments=cumulated_alignments, 205 | alignment_history=alignment_history) 206 | 207 | return (cell_outputs, stop_tokens), next_state 208 | -------------------------------------------------------------------------------- /synthesizer/models/attention.py: -------------------------------------------------------------------------------- 1 | """Attention file for location based attention (compatible with tensorflow attention wrapper)""" 2 | 3 | import tensorflow as tf 4 | from tensorflow.contrib.seq2seq.python.ops.attention_wrapper import BahdanauAttention 5 | from tensorflow.python.layers import core as layers_core 6 | from tensorflow.python.ops import array_ops, math_ops, nn_ops, variable_scope 7 | 8 | 9 | #From https://github.com/tensorflow/tensorflow/blob/r1.7/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py 10 | def 
_compute_attention(attention_mechanism, cell_output, attention_state, 11 | attention_layer): 12 | """Computes the attention and alignments for a given attention_mechanism.""" 13 | alignments, next_attention_state = attention_mechanism( 14 | cell_output, state=attention_state) 15 | 16 | # Reshape from [batch_size, memory_time] to [batch_size, 1, memory_time] 17 | expanded_alignments = array_ops.expand_dims(alignments, 1) 18 | # Context is the inner product of alignments and values along the 19 | # memory time dimension. 20 | # alignments shape is 21 | # [batch_size, 1, memory_time] 22 | # attention_mechanism.values shape is 23 | # [batch_size, memory_time, memory_size] 24 | # the batched matmul is over memory_time, so the output shape is 25 | # [batch_size, 1, memory_size]. 26 | # we then squeeze out the singleton dim. 27 | context = math_ops.matmul(expanded_alignments, attention_mechanism.values) 28 | context = array_ops.squeeze(context, [1]) 29 | 30 | if attention_layer is not None: 31 | attention = attention_layer(array_ops.concat([cell_output, context], 1)) 32 | else: 33 | attention = context 34 | 35 | return attention, alignments, next_attention_state 36 | 37 | 38 | def _location_sensitive_score(W_query, W_fil, W_keys): 39 | """Impelements Bahdanau-style (cumulative) scoring function. 40 | This attention is described in: 41 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben- 42 | gio, “Attention-based models for speech recognition,” in Ad- 43 | vances in Neural Information Processing Systems, 2015, pp. 44 | 577–585. 45 | 46 | ############################################################################# 47 | hybrid attention (content-based + location-based) 48 | f = F * α_{i-1} 49 | energy = dot(v_a, tanh(W_keys(h_enc) + W_query(h_dec) + W_fil(f) + b_a)) 50 | ############################################################################# 51 | 52 | Args: 53 | W_query: Tensor, shape "[batch_size, 1, attention_dim]" to compare to location features. 54 | W_location: processed previous alignments into location features, shape "[batch_size, max_time, attention_dim]" 55 | W_keys: Tensor, shape "[batch_size, max_time, attention_dim]", typically the encoder outputs. 56 | Returns: 57 | A "[batch_size, max_time]" attention score (energy) 58 | """ 59 | # Get the number of hidden units from the trailing dimension of keys 60 | dtype = W_query.dtype 61 | num_units = W_keys.shape[-1].value or array_ops.shape(W_keys)[-1] 62 | 63 | v_a = tf.get_variable( 64 | "attention_variable_projection", shape=[num_units], dtype=dtype, 65 | initializer=tf.contrib.layers.xavier_initializer()) 66 | b_a = tf.get_variable( 67 | "attention_bias", shape=[num_units], dtype=dtype, 68 | initializer=tf.zeros_initializer()) 69 | 70 | return tf.reduce_sum(v_a * tf.tanh(W_keys + W_query + W_fil + b_a), [2]) 71 | 72 | def _smoothing_normalization(e): 73 | """Applies a smoothing normalization function instead of softmax 74 | Introduced in: 75 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben- 76 | gio, “Attention-based models for speech recognition,” in Ad- 77 | vances in Neural Information Processing Systems, 2015, pp. 78 | 577–585. 
79 | 80 | ############################################################################ 81 | Smoothing normalization function 82 | a_{i, j} = sigmoid(e_{i, j}) / sum_j(sigmoid(e_{i, j})) 83 | ############################################################################ 84 | 85 | Args: 86 | e: matrix [batch_size, max_time(memory_time)]: expected to be energy (score) 87 | values of an attention mechanism 88 | Returns: 89 | matrix [batch_size, max_time]: [0, 1] normalized alignments with possible 90 | attendance to multiple memory time steps. 91 | """ 92 | return tf.nn.sigmoid(e) / tf.reduce_sum(tf.nn.sigmoid(e), axis=-1, keepdims=True) 93 | 94 | 95 | class LocationSensitiveAttention(BahdanauAttention): 96 | """Impelements Bahdanau-style (cumulative) scoring function. 97 | Usually referred to as "hybrid" attention (content-based + location-based) 98 | Extends the additive attention described in: 99 | "D. Bahdanau, K. Cho, and Y. Bengio, “Neural machine transla- 100 | tion by jointly learning to align and translate,” in Proceedings 101 | of ICLR, 2015." 102 | to use previous alignments as additional location features. 103 | 104 | This attention is described in: 105 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben- 106 | gio, “Attention-based models for speech recognition,” in Ad- 107 | vances in Neural Information Processing Systems, 2015, pp. 108 | 577–585. 109 | """ 110 | 111 | def __init__(self, 112 | num_units, 113 | memory, 114 | hparams, 115 | mask_encoder=True, 116 | memory_sequence_length=None, 117 | smoothing=False, 118 | cumulate_weights=True, 119 | name="LocationSensitiveAttention"): 120 | """Construct the Attention mechanism. 121 | Args: 122 | num_units: The depth of the query mechanism. 123 | memory: The memory to query; usually the output of an RNN encoder. This 124 | tensor should be shaped `[batch_size, max_time, ...]`. 125 | mask_encoder (optional): Boolean, whether to mask encoder paddings. 126 | memory_sequence_length (optional): Sequence lengths for the batch entries 127 | in memory. If provided, the memory tensor rows are masked with zeros 128 | for values past the respective sequence lengths. Only relevant if mask_encoder = True. 129 | smoothing (optional): Boolean. Determines which normalization function to use. 130 | Default normalization function (probablity_fn) is softmax. If smoothing is 131 | enabled, we replace softmax with: 132 | a_{i, j} = sigmoid(e_{i, j}) / sum_j(sigmoid(e_{i, j})) 133 | Introduced in: 134 | J. K. Chorowski, D. Bahdanau, D. Serdyuk, K. Cho, and Y. Ben- 135 | gio, “Attention-based models for speech recognition,” in Ad- 136 | vances in Neural Information Processing Systems, 2015, pp. 137 | 577–585. 138 | This is mainly used if the model wants to attend to multiple input parts 139 | at the same decoding step. We probably won"t be using it since multiple sound 140 | frames may depend on the same character/phone, probably not the way around. 141 | Note: 142 | We still keep it implemented in case we want to test it. They used it in the 143 | paper in the context of speech recognition, where one phoneme may depend on 144 | multiple subsequent sound frames. 145 | name: Name to use when creating ops. 
146 | """ 147 | #Create normalization function 148 | #Setting it to None defaults in using softmax 149 | normalization_function = _smoothing_normalization if (smoothing == True) else None 150 | memory_length = memory_sequence_length if (mask_encoder==True) else None 151 | super(LocationSensitiveAttention, self).__init__( 152 | num_units=num_units, 153 | memory=memory, 154 | memory_sequence_length=memory_length, 155 | probability_fn=normalization_function, 156 | name=name) 157 | 158 | self.location_convolution = tf.layers.Conv1D(filters=hparams.attention_filters, 159 | kernel_size=hparams.attention_kernel, padding="same", use_bias=True, 160 | bias_initializer=tf.zeros_initializer(), name="location_features_convolution") 161 | self.location_layer = tf.layers.Dense(units=num_units, use_bias=False, 162 | dtype=tf.float32, name="location_features_layer") 163 | self._cumulate = cumulate_weights 164 | 165 | def __call__(self, query, state): 166 | """Score the query based on the keys and values. 167 | Args: 168 | query: Tensor of dtype matching `self.values` and shape 169 | `[batch_size, query_depth]`. 170 | state (previous alignments): Tensor of dtype matching `self.values` and shape 171 | `[batch_size, alignments_size]` 172 | (`alignments_size` is memory"s `max_time`). 173 | Returns: 174 | alignments: Tensor of dtype matching `self.values` and shape 175 | `[batch_size, alignments_size]` (`alignments_size` is memory's 176 | `max_time`). 177 | """ 178 | previous_alignments = state 179 | with variable_scope.variable_scope(None, "Location_Sensitive_Attention", [query]): 180 | 181 | # processed_query shape [batch_size, query_depth] -> [batch_size, attention_dim] 182 | processed_query = self.query_layer(query) if self.query_layer else query 183 | # -> [batch_size, 1, attention_dim] 184 | processed_query = tf.expand_dims(processed_query, 1) 185 | 186 | # processed_location_features shape [batch_size, max_time, attention dimension] 187 | # [batch_size, max_time] -> [batch_size, max_time, 1] 188 | expanded_alignments = tf.expand_dims(previous_alignments, axis=2) 189 | # location features [batch_size, max_time, filters] 190 | f = self.location_convolution(expanded_alignments) 191 | # Projected location features [batch_size, max_time, attention_dim] 192 | processed_location_features = self.location_layer(f) 193 | 194 | # energy shape [batch_size, max_time] 195 | energy = _location_sensitive_score(processed_query, processed_location_features, self.keys) 196 | 197 | 198 | # alignments shape = energy shape = [batch_size, max_time] 199 | alignments = self._probability_fn(energy, previous_alignments) 200 | 201 | # Cumulate alignments 202 | if self._cumulate: 203 | next_state = alignments + previous_alignments 204 | else: 205 | next_state = alignments 206 | 207 | return alignments, next_state 208 | -------------------------------------------------------------------------------- /synthesizer/models/custom_decoder.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | import collections 3 | import tensorflow as tf 4 | from synthesizer.models.helpers import TacoTestHelper, TacoTrainingHelper 5 | from tensorflow.contrib.seq2seq.python.ops import decoder 6 | from tensorflow.contrib.seq2seq.python.ops import helper as helper_py 7 | from tensorflow.python.framework import ops, tensor_shape 8 | from tensorflow.python.layers import base as layers_base 9 | from tensorflow.python.ops import rnn_cell_impl 10 | from 
tensorflow.python.util import nest 11 | 12 | 13 | class CustomDecoderOutput( 14 | collections.namedtuple("CustomDecoderOutput", ("rnn_output", "token_output", "sample_id"))): 15 | pass 16 | 17 | 18 | class CustomDecoder(decoder.Decoder): 19 | """Custom sampling decoder. 20 | 21 | Allows for stop token prediction at inference time 22 | and returns equivalent loss in training time. 23 | 24 | Note: 25 | Only use this decoder with Tacotron 2 as it only accepts tacotron custom helpers 26 | """ 27 | 28 | def __init__(self, cell, helper, initial_state, output_layer=None): 29 | """Initialize CustomDecoder. 30 | Args: 31 | cell: An `RNNCell` instance. 32 | helper: A `Helper` instance. 33 | initial_state: A (possibly nested tuple of...) tensors and TensorArrays. 34 | The initial state of the RNNCell. 35 | output_layer: (Optional) An instance of `tf.layers.Layer`, i.e., 36 | `tf.layers.Dense`. Optional layer to apply to the RNN output prior 37 | to storing the result or sampling. 38 | Raises: 39 | TypeError: if `cell`, `helper` or `output_layer` have an incorrect type. 40 | """ 41 | rnn_cell_impl.assert_like_rnncell(type(cell), cell) 42 | if not isinstance(helper, helper_py.Helper): 43 | raise TypeError("helper must be a Helper, received: %s" % type(helper)) 44 | if (output_layer is not None 45 | and not isinstance(output_layer, layers_base.Layer)): 46 | raise TypeError( 47 | "output_layer must be a Layer, received: %s" % type(output_layer)) 48 | self._cell = cell 49 | self._helper = helper 50 | self._initial_state = initial_state 51 | self._output_layer = output_layer 52 | 53 | @property 54 | def batch_size(self): 55 | return self._helper.batch_size 56 | 57 | def _rnn_output_size(self): 58 | size = self._cell.output_size 59 | if self._output_layer is None: 60 | return size 61 | else: 62 | # To use layer"s compute_output_shape, we need to convert the 63 | # RNNCell"s output_size entries into shapes with an unknown 64 | # batch size. We then pass this through the layer"s 65 | # compute_output_shape and read off all but the first (batch) 66 | # dimensions to get the output size of the rnn with the layer 67 | # applied to the top. 68 | output_shape_with_unknown_batch = nest.map_structure( 69 | lambda s: tensor_shape.TensorShape([None]).concatenate(s), 70 | size) 71 | layer_output_shape = self._output_layer._compute_output_shape( # pylint: disable=protected-access 72 | output_shape_with_unknown_batch) 73 | return nest.map_structure(lambda s: s[1:], layer_output_shape) 74 | 75 | @property 76 | def output_size(self): 77 | # Return the cell output and the id 78 | return CustomDecoderOutput( 79 | rnn_output=self._rnn_output_size(), 80 | token_output=self._helper.token_output_size, 81 | sample_id=self._helper.sample_ids_shape) 82 | 83 | @property 84 | def output_dtype(self): 85 | # Assume the dtype of the cell is the output_size structure 86 | # containing the input_state"s first component's dtype. 87 | # Return that structure and the sample_ids_dtype from the helper. 88 | dtype = nest.flatten(self._initial_state)[0].dtype 89 | return CustomDecoderOutput( 90 | nest.map_structure(lambda _: dtype, self._rnn_output_size()), 91 | tf.float32, 92 | self._helper.sample_ids_dtype) 93 | 94 | def initialize(self, name=None): 95 | """Initialize the decoder. 96 | Args: 97 | name: Name scope for any created operations. 98 | Returns: 99 | `(finished, first_inputs, initial_state)`. 
100 | """ 101 | return self._helper.initialize() + (self._initial_state,) 102 | 103 | def step(self, time, inputs, state, name=None): 104 | """Perform a custom decoding step. 105 | Enables for dyanmic prediction 106 | Args: 107 | time: scalar `int32` tensor. 108 | inputs: A (structure of) input tensors. 109 | state: A (structure of) state tensors and TensorArrays. 110 | name: Name scope for any created operations. 111 | Returns: 112 | `(outputs, next_state, next_inputs, finished)`. 113 | """ 114 | with ops.name_scope(name, "CustomDecoderStep", (time, inputs, state)): 115 | #Call outputprojection wrapper cell 116 | (cell_outputs, stop_token), cell_state = self._cell(inputs, state) 117 | 118 | #apply output_layer (if existant) 119 | if self._output_layer is not None: 120 | cell_outputs = self._output_layer(cell_outputs) 121 | sample_ids = self._helper.sample( 122 | time=time, outputs=cell_outputs, state=cell_state) 123 | 124 | (finished, next_inputs, next_state) = self._helper.next_inputs( 125 | time=time, 126 | outputs=cell_outputs, 127 | state=cell_state, 128 | sample_ids=sample_ids, 129 | stop_token_prediction=stop_token) 130 | 131 | outputs = CustomDecoderOutput(cell_outputs, stop_token, sample_ids) 132 | return (outputs, next_state, next_inputs, finished) 133 | -------------------------------------------------------------------------------- /synthesizer/models/helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from tensorflow.contrib.seq2seq import Helper 4 | 5 | 6 | class TacoTestHelper(Helper): 7 | def __init__(self, batch_size, hparams): 8 | with tf.name_scope("TacoTestHelper"): 9 | self._batch_size = batch_size 10 | self._output_dim = hparams.num_mels 11 | self._reduction_factor = hparams.outputs_per_step 12 | self.stop_at_any = hparams.stop_at_any 13 | 14 | @property 15 | def batch_size(self): 16 | return self._batch_size 17 | 18 | @property 19 | def token_output_size(self): 20 | return self._reduction_factor 21 | 22 | @property 23 | def sample_ids_shape(self): 24 | return tf.TensorShape([]) 25 | 26 | @property 27 | def sample_ids_dtype(self): 28 | return np.int32 29 | 30 | def initialize(self, name=None): 31 | return (tf.tile([False], [self._batch_size]), _go_frames(self._batch_size, self._output_dim)) 32 | 33 | def sample(self, time, outputs, state, name=None): 34 | return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them 35 | 36 | def next_inputs(self, time, outputs, state, sample_ids, stop_token_prediction, name=None): 37 | """Stop on EOS. Otherwise, pass the last output as the next input and pass through state.""" 38 | with tf.name_scope("TacoTestHelper"): 39 | #A sequence is finished when the output probability is > 0.5 40 | finished = tf.cast(tf.round(stop_token_prediction), tf.bool) 41 | 42 | #Since we are predicting r frames at each step, two modes are 43 | #then possible: 44 | # Stop when the model outputs a p > 0.5 for any frame between r frames (Recommended) 45 | # Stop when the model outputs a p > 0.5 for all r frames (Safer) 46 | #Note: 47 | # With enough training steps, the model should be able to predict when to stop correctly 48 | # and the use of stop_at_any = True would be recommended. 
If however the model didn"t 49 | # learn to stop correctly yet, (stops too soon) one could choose to use the safer option 50 | # to get a correct synthesis 51 | if self.stop_at_any: 52 | finished = tf.reduce_any(tf.reduce_all(finished, axis=0)) #Recommended 53 | else: 54 | finished = tf.reduce_all(tf.reduce_all(finished, axis=0)) #Safer option 55 | 56 | # Feed last output frame as next input. outputs is [N, output_dim * r] 57 | next_inputs = outputs[:, -self._output_dim:] 58 | next_state = state 59 | return (finished, next_inputs, next_state) 60 | 61 | 62 | class TacoTrainingHelper(Helper): 63 | def __init__(self, batch_size, targets, hparams, gta, evaluating, global_step): 64 | # inputs is [N, T_in], targets is [N, T_out, D] 65 | with tf.name_scope("TacoTrainingHelper"): 66 | self._batch_size = batch_size 67 | self._output_dim = hparams.num_mels 68 | self._reduction_factor = hparams.outputs_per_step 69 | self._ratio = tf.convert_to_tensor(hparams.tacotron_teacher_forcing_ratio) 70 | self.gta = gta 71 | self.eval = evaluating 72 | self._hparams = hparams 73 | self.global_step = global_step 74 | 75 | r = self._reduction_factor 76 | # Feed every r-th target frame as input 77 | self._targets = targets[:, r-1::r, :] 78 | 79 | #Maximal sequence length 80 | self._lengths = tf.tile([tf.shape(self._targets)[1]], [self._batch_size]) 81 | 82 | @property 83 | def batch_size(self): 84 | return self._batch_size 85 | 86 | @property 87 | def token_output_size(self): 88 | return self._reduction_factor 89 | 90 | @property 91 | def sample_ids_shape(self): 92 | return tf.TensorShape([]) 93 | 94 | @property 95 | def sample_ids_dtype(self): 96 | return np.int32 97 | 98 | def initialize(self, name=None): 99 | #Compute teacher forcing ratio for this global step. 100 | #In GTA mode, override teacher forcing scheme to work with full teacher forcing 101 | if self.gta: 102 | self._ratio = tf.convert_to_tensor(1.) #Force GTA model to always feed ground-truth 103 | elif self.eval and self._hparams.natural_eval: 104 | self._ratio = tf.convert_to_tensor(0.) 
#Force eval model to always feed predictions 105 | else: 106 | if self._hparams.tacotron_teacher_forcing_mode == "scheduled": 107 | self._ratio = _teacher_forcing_ratio_decay(self._hparams.tacotron_teacher_forcing_init_ratio, 108 | self.global_step, self._hparams) 109 | 110 | return (tf.tile([False], [self._batch_size]), _go_frames(self._batch_size, self._output_dim)) 111 | 112 | def sample(self, time, outputs, state, name=None): 113 | return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them 114 | 115 | def next_inputs(self, time, outputs, state, sample_ids, stop_token_prediction, name=None): 116 | with tf.name_scope(name or "TacoTrainingHelper"): 117 | #synthesis stop (we let the model see paddings as we mask them when computing loss functions) 118 | finished = (time + 1 >= self._lengths) 119 | 120 | #Pick previous outputs randomly with respect to teacher forcing ratio 121 | next_inputs = tf.cond( 122 | tf.less(tf.random_uniform([], minval=0, maxval=1, dtype=tf.float32), self._ratio), 123 | lambda: self._targets[:, time, :], #Teacher-forcing: return true frame 124 | lambda: outputs[:,-self._output_dim:]) 125 | 126 | #Pass on state 127 | next_state = state 128 | return (finished, next_inputs, next_state) 129 | 130 | 131 | def _go_frames(batch_size, output_dim): 132 | """Returns all-zero frames for a given batch size and output dimension""" 133 | return tf.tile([[0.0]], [batch_size, output_dim]) 134 | 135 | def _teacher_forcing_ratio_decay(init_tfr, global_step, hparams): 136 | ################################################################# 137 | # Narrow Cosine Decay: 138 | 139 | # Phase 1: tfr = 1 140 | # We only start learning rate decay after 10k steps 141 | 142 | # Phase 2: tfr in ]0, 1[ 143 | # decay reach minimal value at step ~280k 144 | 145 | # Phase 3: tfr = 0 146 | # clip by minimal teacher forcing ratio value (step >~ 280k) 147 | ################################################################# 148 | #Compute natural cosine decay 149 | tfr = tf.train.cosine_decay(init_tfr, 150 | global_step=global_step - hparams.tacotron_teacher_forcing_start_decay, #tfr = 1 at step 10k 151 | decay_steps=hparams.tacotron_teacher_forcing_decay_steps, #tfr = 0 at step ~280k 152 | alpha=hparams.tacotron_teacher_forcing_decay_alpha, #tfr = 0% of init_tfr as final value 153 | name="tfr_cosine_decay") 154 | 155 | #force teacher forcing ratio to take initial value when global step < start decay step. 
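# A rough worked example of the schedule above (illustrative only, assuming the default
# hyperparameters tacotron_teacher_forcing_start_decay=10_000,
# tacotron_teacher_forcing_decay_steps=280_000 and tacotron_teacher_forcing_decay_alpha=0.):
#   global_step <= 10_000   -> tfr = init_tfr         (forced by the tf.cond below)
#   global_step = 150_000   -> tfr ~ 0.5 * init_tfr   (midpoint of the cosine)
#   global_step >= 290_000  -> tfr ~ 0.                (floor given by alpha)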
156 | narrow_tfr = tf.cond( 157 | tf.less(global_step, tf.convert_to_tensor(hparams.tacotron_teacher_forcing_start_decay)), 158 | lambda: tf.convert_to_tensor(init_tfr), 159 | lambda: tfr) 160 | 161 | return narrow_tfr -------------------------------------------------------------------------------- /synthesizer/preprocess.py: -------------------------------------------------------------------------------- 1 | from multiprocessing.pool import Pool 2 | from synthesizer import audio 3 | from functools import partial 4 | from itertools import chain 5 | from encoder import inference as encoder 6 | from pathlib import Path 7 | from utils import logmmse 8 | from tqdm import tqdm 9 | import numpy as np 10 | import librosa 11 | 12 | 13 | def preprocess_librispeech(datasets_root: Path, out_dir: Path, n_processes: int, 14 | skip_existing: bool, hparams): 15 | # Gather the input directories 16 | dataset_root = datasets_root.joinpath("LibriSpeech") 17 | input_dirs = [dataset_root.joinpath("train-clean-100"), 18 | dataset_root.joinpath("train-clean-360")] 19 | print("\n ".join(map(str, ["Using data from:"] + input_dirs))) 20 | assert all(input_dir.exists() for input_dir in input_dirs) 21 | 22 | # Create the output directories for each output file type 23 | out_dir.joinpath("mels").mkdir(exist_ok=True) 24 | out_dir.joinpath("audio").mkdir(exist_ok=True) 25 | 26 | # Create a metadata file 27 | metadata_fpath = out_dir.joinpath("train.txt") 28 | metadata_file = metadata_fpath.open("a" if skip_existing else "w", encoding="utf-8") 29 | 30 | # Preprocess the dataset 31 | speaker_dirs = list(chain.from_iterable(input_dir.glob("*") for input_dir in input_dirs)) 32 | func = partial(preprocess_speaker, out_dir=out_dir, skip_existing=skip_existing, 33 | hparams=hparams) 34 | job = Pool(n_processes).imap(func, speaker_dirs) 35 | for speaker_metadata in tqdm(job, "LibriSpeech", len(speaker_dirs), unit="speakers"): 36 | for metadatum in speaker_metadata: 37 | metadata_file.write("|".join(str(x) for x in metadatum) + "\n") 38 | metadata_file.close() 39 | 40 | # Verify the contents of the metadata file 41 | with metadata_fpath.open("r", encoding="utf-8") as metadata_file: 42 | metadata = [line.split("|") for line in metadata_file] 43 | mel_frames = sum([int(m[4]) for m in metadata]) 44 | timesteps = sum([int(m[3]) for m in metadata]) 45 | sample_rate = hparams.sample_rate 46 | hours = (timesteps / sample_rate) / 3600 47 | print("The dataset consists of %d utterances, %d mel frames, %d audio timesteps (%.2f hours)." 
% 48 | (len(metadata), mel_frames, timesteps, hours)) 49 | print("Max input length (text chars): %d" % max(len(m[5]) for m in metadata)) 50 | print("Max mel frames length: %d" % max(int(m[4]) for m in metadata)) 51 | print("Max audio timesteps length: %d" % max(int(m[3]) for m in metadata)) 52 | 53 | 54 | def preprocess_speaker(speaker_dir, out_dir: Path, skip_existing: bool, hparams): 55 | metadata = [] 56 | for book_dir in speaker_dir.glob("*"): 57 | # Gather the utterance audios and texts 58 | try: 59 | alignments_fpath = next(book_dir.glob("*.alignment.txt")) 60 | with alignments_fpath.open("r") as alignments_file: 61 | alignments = [line.rstrip().split(" ") for line in alignments_file] 62 | except StopIteration: 63 | # A few alignment files will be missing 64 | continue 65 | 66 | # Iterate over each entry in the alignments file 67 | for wav_fname, words, end_times in alignments: 68 | wav_fpath = book_dir.joinpath(wav_fname + ".flac") 69 | assert wav_fpath.exists() 70 | words = words.replace("\"", "").split(",") 71 | end_times = list(map(float, end_times.replace("\"", "").split(","))) 72 | 73 | # Process each sub-utterance 74 | wavs, texts = split_on_silences(wav_fpath, words, end_times, hparams) 75 | for i, (wav, text) in enumerate(zip(wavs, texts)): 76 | sub_basename = "%s_%02d" % (wav_fname, i) 77 | metadata.append(process_utterance(wav, text, out_dir, sub_basename, 78 | skip_existing, hparams)) 79 | 80 | return [m for m in metadata if m is not None] 81 | 82 | 83 | def split_on_silences(wav_fpath, words, end_times, hparams): 84 | # Load the audio waveform 85 | wav, _ = librosa.load(wav_fpath, hparams.sample_rate) 86 | if hparams.rescale: 87 | wav = wav / np.abs(wav).max() * hparams.rescaling_max 88 | 89 | words = np.array(words) 90 | start_times = np.array([0.0] + end_times[:-1]) 91 | end_times = np.array(end_times) 92 | assert len(words) == len(end_times) == len(start_times) 93 | assert words[0] == "" and words[-1] == "" 94 | 95 | # Find pauses that are too long 96 | mask = (words == "") & (end_times - start_times >= hparams.silence_min_duration_split) 97 | mask[0] = mask[-1] = True 98 | breaks = np.where(mask)[0] 99 | 100 | # Profile the noise from the silences and perform noise reduction on the waveform 101 | silence_times = [[start_times[i], end_times[i]] for i in breaks] 102 | silence_times = (np.array(silence_times) * hparams.sample_rate).astype(np.int) 103 | noisy_wav = np.concatenate([wav[stime[0]:stime[1]] for stime in silence_times]) 104 | if len(noisy_wav) > hparams.sample_rate * 0.02: 105 | profile = logmmse.profile_noise(noisy_wav, hparams.sample_rate) 106 | wav = logmmse.denoise(wav, profile, eta=0) 107 | 108 | # Re-attach segments that are too short 109 | segments = list(zip(breaks[:-1], breaks[1:])) 110 | segment_durations = [start_times[end] - end_times[start] for start, end in segments] 111 | i = 0 112 | while i < len(segments) and len(segments) > 1: 113 | if segment_durations[i] < hparams.utterance_min_duration: 114 | # See if the segment can be re-attached with the right or the left segment 115 | left_duration = float("inf") if i == 0 else segment_durations[i - 1] 116 | right_duration = float("inf") if i == len(segments) - 1 else segment_durations[i + 1] 117 | joined_duration = segment_durations[i] + min(left_duration, right_duration) 118 | 119 | # Do not re-attach if it causes the joined utterance to be too long 120 | if joined_duration > hparams.hop_size * hparams.max_mel_frames / hparams.sample_rate: 121 | i += 1 122 | continue 123 | 124 | # Re-attach the 
segment with the neighbour of shortest duration 125 | j = i - 1 if left_duration <= right_duration else i 126 | segments[j] = (segments[j][0], segments[j + 1][1]) 127 | segment_durations[j] = joined_duration 128 | del segments[j + 1], segment_durations[j + 1] 129 | else: 130 | i += 1 131 | 132 | # Split the utterance 133 | segment_times = [[end_times[start], start_times[end]] for start, end in segments] 134 | segment_times = (np.array(segment_times) * hparams.sample_rate).astype(np.int) 135 | wavs = [wav[segment_time[0]:segment_time[1]] for segment_time in segment_times] 136 | texts = [" ".join(words[start + 1:end]).replace(" ", " ") for start, end in segments] 137 | 138 | # # DEBUG: play the audio segments (run with -n=1) 139 | # import sounddevice as sd 140 | # if len(wavs) > 1: 141 | # print("This sentence was split in %d segments:" % len(wavs)) 142 | # else: 143 | # print("There are no silences long enough for this sentence to be split:") 144 | # for wav, text in zip(wavs, texts): 145 | # # Pad the waveform with 1 second of silence because sounddevice tends to cut them early 146 | # # when playing them. You shouldn't need to do that in your parsers. 147 | # wav = np.concatenate((wav, [0] * 16000)) 148 | # print("\t%s" % text) 149 | # sd.play(wav, 16000, blocking=True) 150 | # print("") 151 | 152 | return wavs, texts 153 | 154 | 155 | def process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str, 156 | skip_existing: bool, hparams): 157 | ## FOR REFERENCE: 158 | # For you not to lose your head if you ever wish to change things here or implement your own 159 | # synthesizer. 160 | # - Both the audios and the mel spectrograms are saved as numpy arrays 161 | # - There is no processing done to the audios that will be saved to disk beyond volume 162 | # normalization (in split_on_silences) 163 | # - However, pre-emphasis is applied to the audios before computing the mel spectrogram. This 164 | # is why we re-apply it on the audio on the side of the vocoder. 165 | # - Librosa pads the waveform before computing the mel spectrogram. Here, the waveform is saved 166 | # without extra padding. This means that you won't have an exact relation between the length 167 | # of the wav and of the mel spectrogram. See the vocoder data loader. 
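# - As a quick sketch of the resulting metadata layout (derived from the tuple returned at the
#   end of this function and from how preprocess_librispeech() reads train.txt back), each line
#   of train.txt is pipe-separated as:
#     audio filename | mel filename | embed filename | audio timesteps | mel frames | text
#   i.e. m[3] is len(wav), m[4] is the number of mel frames and m[5] is the utterance text.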
168 | 169 | 170 | # Skip existing utterances if needed 171 | mel_fpath = out_dir.joinpath("mels", "mel-%s.npy" % basename) 172 | wav_fpath = out_dir.joinpath("audio", "audio-%s.npy" % basename) 173 | if skip_existing and mel_fpath.exists() and wav_fpath.exists(): 174 | return None 175 | 176 | # Skip utterances that are too short 177 | if len(wav) < hparams.utterance_min_duration * hparams.sample_rate: 178 | return None 179 | 180 | # Compute the mel spectrogram 181 | mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32) 182 | mel_frames = mel_spectrogram.shape[1] 183 | 184 | # Skip utterances that are too long 185 | if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length: 186 | return None 187 | 188 | # Write the spectrogram, embed and audio to disk 189 | np.save(mel_fpath, mel_spectrogram.T, allow_pickle=False) 190 | np.save(wav_fpath, wav, allow_pickle=False) 191 | 192 | # Return a tuple describing this training example 193 | return wav_fpath.name, mel_fpath.name, "embed-%s.npy" % basename, len(wav), mel_frames, text 194 | 195 | 196 | def embed_utterance(fpaths, encoder_model_fpath): 197 | if not encoder.is_loaded(): 198 | encoder.load_model(encoder_model_fpath) 199 | 200 | # Compute the speaker embedding of the utterance 201 | wav_fpath, embed_fpath = fpaths 202 | wav = np.load(wav_fpath) 203 | wav = encoder.preprocess_wav(wav) 204 | embed = encoder.embed_utterance(wav) 205 | np.save(embed_fpath, embed, allow_pickle=False) 206 | 207 | 208 | def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_processes: int): 209 | wav_dir = synthesizer_root.joinpath("audio") 210 | metadata_fpath = synthesizer_root.joinpath("train.txt") 211 | assert wav_dir.exists() and metadata_fpath.exists() 212 | embed_dir = synthesizer_root.joinpath("embeds") 213 | embed_dir.mkdir(exist_ok=True) 214 | 215 | # Gather the input wave filepath and the target output embed filepath 216 | with metadata_fpath.open("r") as metadata_file: 217 | metadata = [line.split("|") for line in metadata_file] 218 | fpaths = [(wav_dir.joinpath(m[0]), embed_dir.joinpath(m[2])) for m in metadata] 219 | 220 | # TODO: improve on the multiprocessing, it's terrible. Disk I/O is the bottleneck here. 
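# One possible (untested) mitigation, sketched here only as a comment: pass a chunksize to the
# pool so each worker receives batches of file pairs instead of one pair at a time, e.g.
#   job = Pool(n_processes).imap_unordered(func, fpaths, chunksize=8)
# imap_unordered and chunksize are standard multiprocessing.pool.Pool arguments, and result
# order does not matter here since each call writes its embedding straight to disk; given that
# disk I/O is the bottleneck noted above, the gain may still be limited.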
221 | # Embed the utterances in separate threads 222 | func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath) 223 | job = Pool(n_processes).imap(func, fpaths) 224 | list(tqdm(job, "Embedding", len(fpaths), unit="utterances")) 225 | -------------------------------------------------------------------------------- /synthesizer/synthesize.py: -------------------------------------------------------------------------------- 1 | from synthesizer.tacotron2 import Tacotron2 2 | from synthesizer.hparams import hparams_debug_string 3 | from synthesizer.infolog import log 4 | import tensorflow as tf 5 | from tqdm import tqdm 6 | import time 7 | import os 8 | 9 | 10 | def run_eval(args, checkpoint_path, output_dir, hparams, sentences): 11 | eval_dir = os.path.join(output_dir, "eval") 12 | log_dir = os.path.join(output_dir, "logs-eval") 13 | 14 | #Create output path if it doesn"t exist 15 | os.makedirs(eval_dir, exist_ok=True) 16 | os.makedirs(log_dir, exist_ok=True) 17 | os.makedirs(os.path.join(log_dir, "wavs"), exist_ok=True) 18 | os.makedirs(os.path.join(log_dir, "plots"), exist_ok=True) 19 | 20 | log(hparams_debug_string()) 21 | synth = Tacotron2(checkpoint_path, hparams) 22 | 23 | #Set inputs batch wise 24 | sentences = [sentences[i: i+hparams.tacotron_synthesis_batch_size] for i 25 | in range(0, len(sentences), hparams.tacotron_synthesis_batch_size)] 26 | 27 | log("Starting Synthesis") 28 | with open(os.path.join(eval_dir, "map.txt"), "w") as file: 29 | for i, texts in enumerate(tqdm(sentences)): 30 | start = time.time() 31 | basenames = ["batch_{}_sentence_{}".format(i, j) for j in range(len(texts))] 32 | mel_filenames, speaker_ids = synth.synthesize(texts, basenames, eval_dir, log_dir, None) 33 | 34 | for elems in zip(texts, mel_filenames, speaker_ids): 35 | file.write("|".join([str(x) for x in elems]) + "\n") 36 | log("synthesized mel spectrograms at {}".format(eval_dir)) 37 | return eval_dir 38 | 39 | def run_synthesis(in_dir, out_dir, model_dir, hparams): 40 | synth_dir = os.path.join(out_dir, "mels_gta") 41 | os.makedirs(synth_dir, exist_ok=True) 42 | metadata_filename = os.path.join(in_dir, "train.txt") 43 | print(hparams_debug_string()) 44 | 45 | # Load the model in memory 46 | weights_dir = os.path.join(model_dir, "taco_pretrained") 47 | checkpoint_fpath = tf.train.get_checkpoint_state(weights_dir).model_checkpoint_path 48 | synth = Tacotron2(checkpoint_fpath, hparams, gta=True) 49 | 50 | # Load the metadata 51 | with open(metadata_filename, encoding="utf-8") as f: 52 | metadata = [line.strip().split("|") for line in f] 53 | frame_shift_ms = hparams.hop_size / hparams.sample_rate 54 | hours = sum([int(x[4]) for x in metadata]) * frame_shift_ms / 3600 55 | print("Loaded metadata for {} examples ({:.2f} hours)".format(len(metadata), hours)) 56 | 57 | #Set inputs batch wise 58 | metadata = [metadata[i: i + hparams.tacotron_synthesis_batch_size] for i in 59 | range(0, len(metadata), hparams.tacotron_synthesis_batch_size)] 60 | # TODO: come on big boy, fix this 61 | # Quick and dirty fix to make sure that all batches have the same size 62 | metadata = metadata[:-1] 63 | 64 | print("Starting Synthesis") 65 | mel_dir = os.path.join(in_dir, "mels") 66 | embed_dir = os.path.join(in_dir, "embeds") 67 | meta_out_fpath = os.path.join(out_dir, "synthesized.txt") 68 | with open(meta_out_fpath, "w") as file: 69 | for i, meta in enumerate(tqdm(metadata)): 70 | texts = [m[5] for m in meta] 71 | mel_filenames = [os.path.join(mel_dir, m[1]) for m in meta] 72 | embed_filenames = 
[os.path.join(embed_dir, m[2]) for m in meta] 73 | basenames = [os.path.basename(m).replace(".npy", "").replace("mel-", "") 74 | for m in mel_filenames] 75 | synth.synthesize(texts, basenames, synth_dir, None, mel_filenames, embed_filenames) 76 | 77 | for elems in meta: 78 | file.write("|".join([str(x) for x in elems]) + "\n") 79 | 80 | print("Synthesized mel spectrograms at {}".format(synth_dir)) 81 | return meta_out_fpath 82 | 83 | -------------------------------------------------------------------------------- /synthesizer/utils/__init__.py: -------------------------------------------------------------------------------- 1 | class ValueWindow(): 2 | def __init__(self, window_size=100): 3 | self._window_size = window_size 4 | self._values = [] 5 | 6 | def append(self, x): 7 | self._values = self._values[-(self._window_size - 1):] + [x] 8 | 9 | @property 10 | def sum(self): 11 | return sum(self._values) 12 | 13 | @property 14 | def count(self): 15 | return len(self._values) 16 | 17 | @property 18 | def average(self): 19 | return self.sum / max(1, self.count) 20 | 21 | def reset(self): 22 | self._values = [] 23 | -------------------------------------------------------------------------------- /synthesizer/utils/_cmudict.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | valid_symbols = [ 4 | "AA", "AA0", "AA1", "AA2", "AE", "AE0", "AE1", "AE2", "AH", "AH0", "AH1", "AH2", 5 | "AO", "AO0", "AO1", "AO2", "AW", "AW0", "AW1", "AW2", "AY", "AY0", "AY1", "AY2", 6 | "B", "CH", "D", "DH", "EH", "EH0", "EH1", "EH2", "ER", "ER0", "ER1", "ER2", "EY", 7 | "EY0", "EY1", "EY2", "F", "G", "HH", "IH", "IH0", "IH1", "IH2", "IY", "IY0", "IY1", 8 | "IY2", "JH", "K", "L", "M", "N", "NG", "OW", "OW0", "OW1", "OW2", "OY", "OY0", 9 | "OY1", "OY2", "P", "R", "S", "SH", "T", "TH", "UH", "UH0", "UH1", "UH2", "UW", 10 | "UW0", "UW1", "UW2", "V", "W", "Y", "Z", "ZH" 11 | ] 12 | 13 | _valid_symbol_set = set(valid_symbols) 14 | 15 | 16 | class CMUDict: 17 | """Thin wrapper around CMUDict data. 
http://www.speech.cs.cmu.edu/cgi-bin/cmudict""" 18 | def __init__(self, file_or_path, keep_ambiguous=True): 19 | if isinstance(file_or_path, str): 20 | with open(file_or_path, encoding="latin-1") as f: 21 | entries = _parse_cmudict(f) 22 | else: 23 | entries = _parse_cmudict(file_or_path) 24 | if not keep_ambiguous: 25 | entries = {word: pron for word, pron in entries.items() if len(pron) == 1} 26 | self._entries = entries 27 | 28 | 29 | def __len__(self): 30 | return len(self._entries) 31 | 32 | 33 | def lookup(self, word): 34 | """Returns list of ARPAbet pronunciations of the given word.""" 35 | return self._entries.get(word.upper()) 36 | 37 | 38 | 39 | _alt_re = re.compile(r"\([0-9]+\)") 40 | 41 | 42 | def _parse_cmudict(file): 43 | cmudict = {} 44 | for line in file: 45 | if len(line) and (line[0] >= "A" and line[0] <= "Z" or line[0] == "'"): 46 | parts = line.split(" ") 47 | word = re.sub(_alt_re, "", parts[0]) 48 | pronunciation = _get_pronunciation(parts[1]) 49 | if pronunciation: 50 | if word in cmudict: 51 | cmudict[word].append(pronunciation) 52 | else: 53 | cmudict[word] = [pronunciation] 54 | return cmudict 55 | 56 | 57 | def _get_pronunciation(s): 58 | parts = s.strip().split(" ") 59 | for part in parts: 60 | if part not in _valid_symbol_set: 61 | return None 62 | return " ".join(parts) 63 | -------------------------------------------------------------------------------- /synthesizer/utils/cleaners.py: -------------------------------------------------------------------------------- 1 | """ 2 | Cleaners are transformations that run over the input text at both training and eval time. 3 | 4 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" 5 | hyperparameter. Some cleaners are English-specific. You"ll typically want to use: 6 | 1. "english_cleaners" for English text 7 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using 8 | the Unidecode library (https://pypi.python.org/pypi/Unidecode) 9 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update 10 | the symbols in symbols.py to match your data). 11 | """ 12 | 13 | import re 14 | from unidecode import unidecode 15 | from .numbers import normalize_numbers 16 | 17 | # Regular expression matching whitespace: 18 | _whitespace_re = re.compile(r"\s+") 19 | 20 | # List of (regular expression, replacement) pairs for abbreviations: 21 | _abbreviations = [(re.compile("\\b%s\\." 
% x[0], re.IGNORECASE), x[1]) for x in [ 22 | ("mrs", "misess"), 23 | ("mr", "mister"), 24 | ("dr", "doctor"), 25 | ("st", "saint"), 26 | ("co", "company"), 27 | ("jr", "junior"), 28 | ("maj", "major"), 29 | ("gen", "general"), 30 | ("drs", "doctors"), 31 | ("rev", "reverend"), 32 | ("lt", "lieutenant"), 33 | ("hon", "honorable"), 34 | ("sgt", "sergeant"), 35 | ("capt", "captain"), 36 | ("esq", "esquire"), 37 | ("ltd", "limited"), 38 | ("col", "colonel"), 39 | ("ft", "fort"), 40 | ]] 41 | 42 | 43 | def expand_abbreviations(text): 44 | for regex, replacement in _abbreviations: 45 | text = re.sub(regex, replacement, text) 46 | return text 47 | 48 | 49 | def expand_numbers(text): 50 | return normalize_numbers(text) 51 | 52 | 53 | def lowercase(text): 54 | """lowercase input tokens.""" 55 | return text.lower() 56 | 57 | 58 | def collapse_whitespace(text): 59 | return re.sub(_whitespace_re, " ", text) 60 | 61 | 62 | def convert_to_ascii(text): 63 | return unidecode(text) 64 | 65 | 66 | def basic_cleaners(text): 67 | """Basic pipeline that lowercases and collapses whitespace without transliteration.""" 68 | text = lowercase(text) 69 | text = collapse_whitespace(text) 70 | return text 71 | 72 | 73 | def transliteration_cleaners(text): 74 | """Pipeline for non-English text that transliterates to ASCII.""" 75 | text = convert_to_ascii(text) 76 | text = lowercase(text) 77 | text = collapse_whitespace(text) 78 | return text 79 | 80 | 81 | def english_cleaners(text): 82 | """Pipeline for English text, including number and abbreviation expansion.""" 83 | text = convert_to_ascii(text) 84 | text = lowercase(text) 85 | text = expand_numbers(text) 86 | text = expand_abbreviations(text) 87 | text = collapse_whitespace(text) 88 | return text 89 | -------------------------------------------------------------------------------- /synthesizer/utils/numbers.py: -------------------------------------------------------------------------------- 1 | import re 2 | import inflect 3 | 4 | _inflect = inflect.engine() 5 | _comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])") 6 | _decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)") 7 | _pounds_re = re.compile(r"£([0-9\,]*[0-9]+)") 8 | _dollars_re = re.compile(r"\$([0-9\.\,]*[0-9]+)") 9 | _ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)") 10 | _number_re = re.compile(r"[0-9]+") 11 | 12 | 13 | def _remove_commas(m): 14 | return m.group(1).replace(",", "") 15 | 16 | 17 | def _expand_decimal_point(m): 18 | return m.group(1).replace(".", " point ") 19 | 20 | 21 | def _expand_dollars(m): 22 | match = m.group(1) 23 | parts = match.split(".") 24 | if len(parts) > 2: 25 | return match + " dollars" # Unexpected format 26 | dollars = int(parts[0]) if parts[0] else 0 27 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 28 | if dollars and cents: 29 | dollar_unit = "dollar" if dollars == 1 else "dollars" 30 | cent_unit = "cent" if cents == 1 else "cents" 31 | return "%s %s, %s %s" % (dollars, dollar_unit, cents, cent_unit) 32 | elif dollars: 33 | dollar_unit = "dollar" if dollars == 1 else "dollars" 34 | return "%s %s" % (dollars, dollar_unit) 35 | elif cents: 36 | cent_unit = "cent" if cents == 1 else "cents" 37 | return "%s %s" % (cents, cent_unit) 38 | else: 39 | return "zero dollars" 40 | 41 | 42 | def _expand_ordinal(m): 43 | return _inflect.number_to_words(m.group(0)) 44 | 45 | 46 | def _expand_number(m): 47 | num = int(m.group(0)) 48 | if num > 1000 and num < 3000: 49 | if num == 2000: 50 | return "two thousand" 51 | elif num > 2000 and num < 2010: 52 | return "two 
thousand " + _inflect.number_to_words(num % 100) 53 | elif num % 100 == 0: 54 | return _inflect.number_to_words(num // 100) + " hundred" 55 | else: 56 | return _inflect.number_to_words(num, andword="", zero="oh", group=2).replace(", ", " ") 57 | else: 58 | return _inflect.number_to_words(num, andword="") 59 | 60 | 61 | def normalize_numbers(text): 62 | text = re.sub(_comma_number_re, _remove_commas, text) 63 | text = re.sub(_pounds_re, r"\1 pounds", text) 64 | text = re.sub(_dollars_re, _expand_dollars, text) 65 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 66 | text = re.sub(_ordinal_re, _expand_ordinal, text) 67 | text = re.sub(_number_re, _expand_number, text) 68 | return text 69 | -------------------------------------------------------------------------------- /synthesizer/utils/plot.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | matplotlib.use("Agg") 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | 6 | 7 | def split_title_line(title_text, max_words=5): 8 | """ 9 | A function that splits any string based on specific character 10 | (returning it with the string), with maximum number of words on it 11 | """ 12 | seq = title_text.split() 13 | return "\n".join([" ".join(seq[i:i + max_words]) for i in range(0, len(seq), max_words)]) 14 | 15 | def plot_alignment(alignment, path, title=None, split_title=False, max_len=None): 16 | if max_len is not None: 17 | alignment = alignment[:, :max_len] 18 | 19 | fig = plt.figure(figsize=(8, 6)) 20 | ax = fig.add_subplot(111) 21 | 22 | im = ax.imshow( 23 | alignment, 24 | aspect="auto", 25 | origin="lower", 26 | interpolation="none") 27 | fig.colorbar(im, ax=ax) 28 | xlabel = "Decoder timestep" 29 | 30 | if split_title: 31 | title = split_title_line(title) 32 | 33 | plt.xlabel(xlabel) 34 | plt.title(title) 35 | plt.ylabel("Encoder timestep") 36 | plt.tight_layout() 37 | plt.savefig(path, format="png") 38 | plt.close() 39 | 40 | 41 | def plot_spectrogram(pred_spectrogram, path, title=None, split_title=False, target_spectrogram=None, max_len=None, auto_aspect=False): 42 | if max_len is not None: 43 | target_spectrogram = target_spectrogram[:max_len] 44 | pred_spectrogram = pred_spectrogram[:max_len] 45 | 46 | if split_title: 47 | title = split_title_line(title) 48 | 49 | fig = plt.figure(figsize=(10, 8)) 50 | # Set common labels 51 | fig.text(0.5, 0.18, title, horizontalalignment="center", fontsize=16) 52 | 53 | #target spectrogram subplot 54 | if target_spectrogram is not None: 55 | ax1 = fig.add_subplot(311) 56 | ax2 = fig.add_subplot(312) 57 | 58 | if auto_aspect: 59 | im = ax1.imshow(np.rot90(target_spectrogram), aspect="auto", interpolation="none") 60 | else: 61 | im = ax1.imshow(np.rot90(target_spectrogram), interpolation="none") 62 | ax1.set_title("Target Mel-Spectrogram") 63 | fig.colorbar(mappable=im, shrink=0.65, orientation="horizontal", ax=ax1) 64 | ax2.set_title("Predicted Mel-Spectrogram") 65 | else: 66 | ax2 = fig.add_subplot(211) 67 | 68 | if auto_aspect: 69 | im = ax2.imshow(np.rot90(pred_spectrogram), aspect="auto", interpolation="none") 70 | else: 71 | im = ax2.imshow(np.rot90(pred_spectrogram), interpolation="none") 72 | fig.colorbar(mappable=im, shrink=0.65, orientation="horizontal", ax=ax2) 73 | 74 | plt.tight_layout() 75 | plt.savefig(path, format="png") 76 | plt.close() 77 | -------------------------------------------------------------------------------- /synthesizer/utils/symbols.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | Defines the set of symbols used in text input to the model. 3 | 4 | The default is a set of ASCII characters that works well for English or text that has been run 5 | through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. 6 | """ 7 | # from . import cmudict 8 | 9 | _pad = "_" 10 | _eos = "~" 11 | _characters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'\"(),-.:;? " 12 | 13 | # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): 14 | #_arpabet = ["@' + s for s in cmudict.valid_symbols] 15 | 16 | # Export all symbols: 17 | symbols = [_pad, _eos] + list(_characters) #+ _arpabet 18 | -------------------------------------------------------------------------------- /synthesizer/utils/text.py: -------------------------------------------------------------------------------- 1 | from .symbols import symbols 2 | from . import cleaners 3 | import re 4 | 5 | # Mappings from symbol to numeric ID and vice versa: 6 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 7 | _id_to_symbol = {i: s for i, s in enumerate(symbols)} 8 | 9 | # Regular expression matching text enclosed in curly braces: 10 | _curly_re = re.compile(r"(.*?)\{(.+?)\}(.*)") 11 | 12 | 13 | def text_to_sequence(text, cleaner_names): 14 | """Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 15 | 16 | The text can optionally have ARPAbet sequences enclosed in curly braces embedded 17 | in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street." 18 | 19 | Args: 20 | text: string to convert to a sequence 21 | cleaner_names: names of the cleaner functions to run the text through 22 | 23 | Returns: 24 | List of integers corresponding to the symbols in the text 25 | """ 26 | sequence = [] 27 | 28 | # Check for curly braces and treat their contents as ARPAbet: 29 | while len(text): 30 | m = _curly_re.match(text) 31 | if not m: 32 | sequence += _symbols_to_sequence(_clean_text(text, cleaner_names)) 33 | break 34 | sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names)) 35 | sequence += _arpabet_to_sequence(m.group(2)) 36 | text = m.group(3) 37 | 38 | # Append EOS token 39 | sequence.append(_symbol_to_id["~"]) 40 | return sequence 41 | 42 | 43 | def sequence_to_text(sequence): 44 | """Converts a sequence of IDs back to a string""" 45 | result = "" 46 | for symbol_id in sequence: 47 | if symbol_id in _id_to_symbol: 48 | s = _id_to_symbol[symbol_id] 49 | # Enclose ARPAbet back in curly braces: 50 | if len(s) > 1 and s[0] == "@": 51 | s = "{%s}" % s[1:] 52 | result += s 53 | return result.replace("}{", " ") 54 | 55 | 56 | def _clean_text(text, cleaner_names): 57 | for name in cleaner_names: 58 | cleaner = getattr(cleaners, name) 59 | if not cleaner: 60 | raise Exception("Unknown cleaner: %s" % name) 61 | text = cleaner(text) 62 | return text 63 | 64 | 65 | def _symbols_to_sequence(symbols): 66 | return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)] 67 | 68 | 69 | def _arpabet_to_sequence(text): 70 | return _symbols_to_sequence(["@" + s for s in text.split()]) 71 | 72 | 73 | def _should_keep_symbol(s): 74 | return s in _symbol_to_id and s not in ("_", "~") 75 | -------------------------------------------------------------------------------- /synthesizer_preprocess_audio.py: -------------------------------------------------------------------------------- 1 | from synthesizer.preprocess 
import preprocess_librispeech 2 | from synthesizer.hparams import hparams 3 | from utils.argutils import print_args 4 | from pathlib import Path 5 | import argparse 6 | 7 | 8 | if __name__ == "__main__": 9 | parser = argparse.ArgumentParser( 10 | description="Preprocesses audio files from datasets, encodes them as mel spectrograms " 11 | "and writes them to the disk. Audio files are also saved, to be used by the " 12 | "vocoder for training.", 13 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 14 | ) 15 | parser.add_argument("datasets_root", type=Path, help=\ 16 | "Path to the directory containing your LibriSpeech/TTS datasets.") 17 | parser.add_argument("-o", "--out_dir", type=Path, default=argparse.SUPPRESS, help=\ 18 | "Path to the output directory that will contain the mel spectrograms, the audios and the " 19 | "embeds. Defaults to /SV2TTS/synthesizer/") 20 | parser.add_argument("-n", "--n_processes", type=int, default=None, help=\ 21 | "Number of processes in parallel.") 22 | parser.add_argument("-s", "--skip_existing", action="store_true", help=\ 23 | "Whether to overwrite existing files with the same name. Useful if the preprocessing was " 24 | "interrupted.") 25 | parser.add_argument("--hparams", type=str, default="", help=\ 26 | "Hyperparameter overrides as a comma-separated list of name-value pairs") 27 | args = parser.parse_args() 28 | 29 | # Process the arguments 30 | if not hasattr(args, "out_dir"): 31 | args.out_dir = args.datasets_root.joinpath("SV2TTS", "synthesizer") 32 | 33 | # Create directories 34 | assert args.datasets_root.exists() 35 | args.out_dir.mkdir(exist_ok=True, parents=True) 36 | 37 | # Preprocess the dataset 38 | print_args(args, parser) 39 | args.hparams = hparams.parse(args.hparams) 40 | preprocess_librispeech(**vars(args)) 41 | -------------------------------------------------------------------------------- /synthesizer_preprocess_embeds.py: -------------------------------------------------------------------------------- 1 | from synthesizer.preprocess import create_embeddings 2 | from utils.argutils import print_args 3 | from pathlib import Path 4 | import argparse 5 | 6 | 7 | if __name__ == "__main__": 8 | parser = argparse.ArgumentParser( 9 | description="Creates embeddings for the synthesizer from the LibriSpeech utterances.", 10 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 11 | ) 12 | parser.add_argument("synthesizer_root", type=Path, help=\ 13 | "Path to the synthesizer training data that contains the audios and the train.txt file. " 14 | "If you let everything as default, it should be /SV2TTS/synthesizer/.") 15 | parser.add_argument("-e", "--encoder_model_fpath", type=Path, 16 | default="encoder/saved_models/pretrained.pt", help=\ 17 | "Path your trained encoder model.") 18 | parser.add_argument("-n", "--n_processes", type=int, default=4, help= \ 19 | "Number of parallel processes. An encoder is created for each, so you may need to lower " 20 | "this value on GPUs with low memory. 
Set it to 1 if CUDA is unhappy.") 21 | args = parser.parse_args() 22 | 23 | # Preprocess the dataset 24 | print_args(args, parser) 25 | create_embeddings(**vars(args)) 26 | -------------------------------------------------------------------------------- /synthesizer_train.py: -------------------------------------------------------------------------------- 1 | from synthesizer.hparams import hparams 2 | from synthesizer.train import tacotron_train 3 | from utils.argutils import print_args 4 | from synthesizer import infolog 5 | import argparse 6 | import os 7 | 8 | 9 | def prepare_run(args): 10 | modified_hp = hparams.parse(args.hparams) 11 | os.environ["TF_CPP_MIN_LOG_LEVEL"] = str(args.tf_log_level) 12 | run_name = args.name 13 | log_dir = os.path.join(args.models_dir, "logs-{}".format(run_name)) 14 | os.makedirs(log_dir, exist_ok=True) 15 | infolog.init(os.path.join(log_dir, "Terminal_train_log"), run_name, args.slack_url) 16 | return log_dir, modified_hp 17 | 18 | 19 | if __name__ == "__main__": 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument("name", help="Name of the run and of the logging directory.") 22 | parser.add_argument("synthesizer_root", type=str, help=\ 23 | "Path to the synthesizer training data that contains the audios and the train.txt file. " 24 | "If you let everything as default, it should be /SV2TTS/synthesizer/.") 25 | parser.add_argument("-m", "--models_dir", type=str, default="synthesizer/saved_models/", help=\ 26 | "Path to the output directory that will contain the saved model weights and the logs.") 27 | parser.add_argument("--mode", default="synthesis", 28 | help="mode for synthesis of tacotron after training") 29 | parser.add_argument("--GTA", default="True", 30 | help="Ground truth aligned synthesis, defaults to True, only considered " 31 | "in Tacotron synthesis mode") 32 | parser.add_argument("--restore", type=bool, default=True, 33 | help="Set this to False to do a fresh training") 34 | parser.add_argument("--summary_interval", type=int, default=2500, 35 | help="Steps between running summary ops") 36 | parser.add_argument("--embedding_interval", type=int, default=10000, 37 | help="Steps between updating embeddings projection visualization") 38 | parser.add_argument("--checkpoint_interval", type=int, default=2000, # Was 5000 39 | help="Steps between writing checkpoints") 40 | parser.add_argument("--eval_interval", type=int, default=100000, # Was 10000 41 | help="Steps between eval on test data") 42 | parser.add_argument("--tacotron_train_steps", type=int, default=2000000, # Was 100000 43 | help="total number of tacotron training steps") 44 | parser.add_argument("--tf_log_level", type=int, default=1, help="Tensorflow C++ log level.") 45 | parser.add_argument("--slack_url", default=None, 46 | help="slack webhook notification destination link") 47 | parser.add_argument("--hparams", default="", 48 | help="Hyperparameter overrides as a comma-separated list of name=value " 49 | "pairs") 50 | args = parser.parse_args() 51 | print_args(args, parser) 52 | 53 | log_dir, hparams = prepare_run(args) 54 | 55 | tacotron_train(args, log_dir, hparams) 56 | -------------------------------------------------------------------------------- /toolbox/__init__.py: -------------------------------------------------------------------------------- 1 | from toolbox.ui import UI 2 | from encoder import inference as encoder 3 | from synthesizer.inference import Synthesizer 4 | from vocoder import inference as vocoder 5 | from pathlib import Path 6 | from time import 
perf_counter as timer 7 | from toolbox.utterance import Utterance 8 | import numpy as np 9 | import traceback 10 | import sys 11 | 12 | 13 | # Use this directory structure for your datasets, or modify it to fit your needs 14 | recognized_datasets = [ 15 | "LibriSpeech/dev-clean", 16 | "LibriSpeech/dev-other", 17 | "LibriSpeech/test-clean", 18 | "LibriSpeech/test-other", 19 | "LibriSpeech/train-clean-100", 20 | "LibriSpeech/train-clean-360", 21 | "LibriSpeech/train-other-500", 22 | "LibriTTS/dev-clean", 23 | "LibriTTS/dev-other", 24 | "LibriTTS/test-clean", 25 | "LibriTTS/test-other", 26 | "LibriTTS/train-clean-100", 27 | "LibriTTS/train-clean-360", 28 | "LibriTTS/train-other-500", 29 | "LJSpeech-1.1", 30 | "VoxCeleb1/wav", 31 | "VoxCeleb1/test_wav", 32 | "VoxCeleb2/dev/aac", 33 | "VoxCeleb2/test/aac", 34 | "VCTK-Corpus/wav48", 35 | ] 36 | 37 | class Toolbox: 38 | def __init__(self, datasets_root, enc_models_dir, syn_models_dir, voc_models_dir, low_mem): 39 | sys.excepthook = self.excepthook 40 | self.datasets_root = datasets_root 41 | self.low_mem = low_mem 42 | self.utterances = set() 43 | self.current_generated = (None, None, None, None) # speaker_name, spec, breaks, wav 44 | 45 | self.synthesizer = None # type: Synthesizer 46 | 47 | # Initialize the events and the interface 48 | self.ui = UI() 49 | self.reset_ui(enc_models_dir, syn_models_dir, voc_models_dir) 50 | self.setup_events() 51 | self.ui.start() 52 | 53 | def excepthook(self, exc_type, exc_value, exc_tb): 54 | traceback.print_exception(exc_type, exc_value, exc_tb) 55 | self.ui.log("Exception: %s" % exc_value) 56 | 57 | def setup_events(self): 58 | # Dataset, speaker and utterance selection 59 | self.ui.browser_load_button.clicked.connect(lambda: self.load_from_browser()) 60 | random_func = lambda level: lambda: self.ui.populate_browser(self.datasets_root, 61 | recognized_datasets, 62 | level) 63 | self.ui.random_dataset_button.clicked.connect(random_func(0)) 64 | self.ui.random_speaker_button.clicked.connect(random_func(1)) 65 | self.ui.random_utterance_button.clicked.connect(random_func(2)) 66 | self.ui.dataset_box.currentIndexChanged.connect(random_func(1)) 67 | self.ui.speaker_box.currentIndexChanged.connect(random_func(2)) 68 | 69 | # Model selection 70 | self.ui.encoder_box.currentIndexChanged.connect(self.init_encoder) 71 | def func(): 72 | self.synthesizer = None 73 | self.ui.synthesizer_box.currentIndexChanged.connect(func) 74 | self.ui.vocoder_box.currentIndexChanged.connect(self.init_vocoder) 75 | 76 | # Utterance selection 77 | func = lambda: self.load_from_browser(self.ui.browse_file()) 78 | self.ui.browser_browse_button.clicked.connect(func) 79 | func = lambda: self.ui.draw_utterance(self.ui.selected_utterance, "current") 80 | self.ui.utterance_history.currentIndexChanged.connect(func) 81 | func = lambda: self.ui.play(self.ui.selected_utterance.wav, Synthesizer.sample_rate) 82 | self.ui.play_button.clicked.connect(func) 83 | self.ui.stop_button.clicked.connect(self.ui.stop) 84 | self.ui.record_button.clicked.connect(self.record) 85 | 86 | # Generation 87 | func = lambda: self.synthesize() or self.vocode() 88 | self.ui.generate_button.clicked.connect(func) 89 | self.ui.synthesize_button.clicked.connect(self.synthesize) 90 | self.ui.vocode_button.clicked.connect(self.vocode) 91 | 92 | # UMAP legend 93 | self.ui.clear_button.clicked.connect(self.clear_utterances) 94 | 95 | def reset_ui(self, encoder_models_dir, synthesizer_models_dir, vocoder_models_dir): 96 | self.ui.populate_browser(self.datasets_root, 
recognized_datasets, 0, True) 97 | self.ui.populate_models(encoder_models_dir, synthesizer_models_dir, vocoder_models_dir) 98 | 99 | def load_from_browser(self, fpath=None): 100 | if fpath is None: 101 | fpath = Path(self.datasets_root, 102 | self.ui.current_dataset_name, 103 | self.ui.current_speaker_name, 104 | self.ui.current_utterance_name) 105 | name = str(fpath.relative_to(self.datasets_root)) 106 | speaker_name = self.ui.current_dataset_name + '_' + self.ui.current_speaker_name 107 | 108 | # Select the next utterance 109 | if self.ui.auto_next_checkbox.isChecked(): 110 | self.ui.browser_select_next() 111 | elif fpath == "": 112 | return 113 | else: 114 | name = fpath.name 115 | speaker_name = fpath.parent.name 116 | 117 | # Get the wav from the disk. We take the wav with the vocoder/synthesizer format for 118 | # playback, so as to have a fair comparison with the generated audio 119 | wav = Synthesizer.load_preprocess_wav(fpath) 120 | self.ui.log("Loaded %s" % name) 121 | 122 | self.add_real_utterance(wav, name, speaker_name) 123 | 124 | def record(self): 125 | wav = self.ui.record_one(encoder.sampling_rate, 5) 126 | if wav is None: 127 | return 128 | self.ui.play(wav, encoder.sampling_rate) 129 | 130 | speaker_name = "user01" 131 | name = speaker_name + "_rec_%05d" % np.random.randint(100000) 132 | self.add_real_utterance(wav, name, speaker_name) 133 | 134 | def add_real_utterance(self, wav, name, speaker_name): 135 | # Compute the mel spectrogram 136 | spec = Synthesizer.make_spectrogram(wav) 137 | self.ui.draw_spec(spec, "current") 138 | 139 | # Compute the embedding 140 | if not encoder.is_loaded(): 141 | self.init_encoder() 142 | encoder_wav = encoder.preprocess_wav(wav) 143 | embed, partial_embeds, _ = encoder.embed_utterance(encoder_wav, return_partials=True) 144 | 145 | # Add the utterance 146 | utterance = Utterance(name, speaker_name, wav, spec, embed, partial_embeds, False) 147 | self.utterances.add(utterance) 148 | self.ui.register_utterance(utterance) 149 | 150 | # Plot it 151 | self.ui.draw_embed(embed, name, "current") 152 | self.ui.draw_umap_projections(self.utterances) 153 | 154 | def clear_utterances(self): 155 | self.utterances.clear() 156 | self.ui.draw_umap_projections(self.utterances) 157 | 158 | def synthesize(self): 159 | self.ui.log("Generating the mel spectrogram...") 160 | self.ui.set_loading(1) 161 | 162 | # Synthesize the spectrogram 163 | if self.synthesizer is None: 164 | model_dir = self.ui.current_synthesizer_model_dir 165 | checkpoints_dir = model_dir.joinpath("taco_pretrained") 166 | self.synthesizer = Synthesizer(checkpoints_dir, low_mem=self.low_mem) 167 | if not self.synthesizer.is_loaded(): 168 | self.ui.log("Loading the synthesizer %s" % self.synthesizer.checkpoint_fpath) 169 | 170 | texts = self.ui.text_prompt.toPlainText().split("\n") 171 | embed = self.ui.selected_utterance.embed 172 | embeds = np.stack([embed] * len(texts)) 173 | specs = self.synthesizer.synthesize_spectrograms(texts, embeds) 174 | breaks = [spec.shape[1] for spec in specs] 175 | spec = np.concatenate(specs, axis=1) 176 | 177 | self.ui.draw_spec(spec, "generated") 178 | self.current_generated = (self.ui.selected_utterance.speaker_name, spec, breaks, None) 179 | self.ui.set_loading(0) 180 | 181 | def vocode(self): 182 | speaker_name, spec, breaks, _ = self.current_generated 183 | assert spec is not None 184 | 185 | # Synthesize the waveform 186 | if not vocoder.is_loaded(): 187 | self.init_vocoder() 188 | def vocoder_progress(i, seq_len, b_size, gen_rate): 189 | 
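# Note: judging from the log format string below, `gen_rate` appears to be the vocoder's
# generation rate expressed in kHz, which is why it is multiplied by 1000 and divided by the
# synthesizer sample rate to obtain a real-time factor for the progress line.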
real_time_factor = (gen_rate / Synthesizer.sample_rate) * 1000 190 | line = "Waveform generation: %d/%d (batch size: %d, rate: %.1fkHz - %.2fx real time)" \ 191 | % (i * b_size, seq_len * b_size, b_size, gen_rate, real_time_factor) 192 | self.ui.log(line, "overwrite") 193 | self.ui.set_loading(i, seq_len) 194 | if self.ui.current_vocoder_fpath is not None: 195 | self.ui.log("") 196 | wav = vocoder.infer_waveform(spec, progress_callback=vocoder_progress) 197 | else: 198 | self.ui.log("Waveform generation with Griffin-Lim... ") 199 | wav = Synthesizer.griffin_lim(spec) 200 | self.ui.set_loading(0) 201 | self.ui.log(" Done!", "append") 202 | 203 | # Add breaks 204 | b_ends = np.cumsum(np.array(breaks) * Synthesizer.hparams.hop_size) 205 | b_starts = np.concatenate(([0], b_ends[:-1])) 206 | wavs = [wav[start:end] for start, end, in zip(b_starts, b_ends)] 207 | breaks = [np.zeros(int(0.15 * Synthesizer.sample_rate))] * len(breaks) 208 | wav = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)]) 209 | 210 | # Play it 211 | wav = wav / np.abs(wav).max() * 0.97 212 | self.ui.play(wav, Synthesizer.sample_rate) 213 | 214 | # Compute the embedding 215 | # TODO: this is problematic with different sampling rates, gotta fix it 216 | if not encoder.is_loaded(): 217 | self.init_encoder() 218 | encoder_wav = encoder.preprocess_wav(wav) 219 | embed, partial_embeds, _ = encoder.embed_utterance(encoder_wav, return_partials=True) 220 | 221 | # Add the utterance 222 | name = speaker_name + "_gen_%05d" % np.random.randint(100000) 223 | utterance = Utterance(name, speaker_name, wav, spec, embed, partial_embeds, True) 224 | self.utterances.add(utterance) 225 | 226 | # Plot it 227 | self.ui.draw_embed(embed, name, "generated") 228 | self.ui.draw_umap_projections(self.utterances) 229 | 230 | def init_encoder(self): 231 | model_fpath = self.ui.current_encoder_fpath 232 | 233 | self.ui.log("Loading the encoder %s... " % model_fpath) 234 | self.ui.set_loading(1) 235 | start = timer() 236 | encoder.load_model(model_fpath) 237 | self.ui.log("Done (%dms)." % int(1000 * (timer() - start)), "append") 238 | self.ui.set_loading(0) 239 | 240 | def init_vocoder(self): 241 | model_fpath = self.ui.current_vocoder_fpath 242 | # Case of Griffin-lim 243 | if model_fpath is None: 244 | return 245 | 246 | self.ui.log("Loading the vocoder %s... " % model_fpath) 247 | self.ui.set_loading(1) 248 | start = timer() 249 | vocoder.load_model(model_fpath) 250 | self.ui.log("Done (%dms)." 
% int(1000 * (timer() - start)), "append") 251 | self.ui.set_loading(0) 252 | -------------------------------------------------------------------------------- /toolbox/utterance.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | 3 | Utterance = namedtuple("Utterance", "name speaker_name wav spec embed partial_embeds synth") 4 | Utterance.__eq__ = lambda x, y: x.name == y.name 5 | Utterance.__hash__ = lambda x: hash(x.name) 6 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iwater/Real-Time-Voice-Cloning-Chinese/06882b9a83247beda1d4d84baca0400457096d1b/utils/__init__.py -------------------------------------------------------------------------------- /utils/argutils.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import numpy as np 3 | import argparse 4 | 5 | _type_priorities = [ # In decreasing order 6 | Path, 7 | str, 8 | int, 9 | float, 10 | bool, 11 | ] 12 | 13 | def _priority(o): 14 | p = next((i for i, t in enumerate(_type_priorities) if type(o) is t), None) 15 | if p is not None: 16 | return p 17 | p = next((i for i, t in enumerate(_type_priorities) if isinstance(o, t)), None) 18 | if p is not None: 19 | return p 20 | return len(_type_priorities) 21 | 22 | def print_args(args: argparse.Namespace, parser=None): 23 | args = vars(args) 24 | if parser is None: 25 | priorities = list(map(_priority, args.values())) 26 | else: 27 | all_params = [a.dest for g in parser._action_groups for a in g._group_actions ] 28 | priority = lambda p: all_params.index(p) if p in all_params else len(all_params) 29 | priorities = list(map(priority, args.keys())) 30 | 31 | pad = max(map(len, args.keys())) + 3 32 | indices = np.lexsort((list(args.keys()), priorities)) 33 | items = list(args.items()) 34 | 35 | print("Arguments:") 36 | for i in indices: 37 | param, value = items[i] 38 | print(" {0}:{1}{2}".format(param, ' ' * (pad - len(param)), value)) 39 | print("") 40 | -------------------------------------------------------------------------------- /utils/logmmse.py: -------------------------------------------------------------------------------- 1 | # The MIT License (MIT) 2 | # 3 | # Copyright (c) 2015 braindead 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | # 23 | # 24 | # This code was extracted from the logmmse package (https://pypi.org/project/logmmse/) and I 25 | # simply modified the interface to meet my needs. 26 | 27 | 28 | import numpy as np 29 | import math 30 | from scipy.special import expn 31 | from collections import namedtuple 32 | 33 | NoiseProfile = namedtuple("NoiseProfile", "sampling_rate window_size len1 len2 win n_fft noise_mu2") 34 | 35 | 36 | def profile_noise(noise, sampling_rate, window_size=0): 37 | """ 38 | Creates a profile of the noise in a given waveform. 39 | 40 | :param noise: a waveform containing noise ONLY, as a numpy array of floats or ints. 41 | :param sampling_rate: the sampling rate of the audio 42 | :param window_size: the size of the window the logmmse algorithm operates on. A default value 43 | will be picked if left as 0. 44 | :return: a NoiseProfile object 45 | """ 46 | noise, dtype = to_float(noise) 47 | noise += np.finfo(np.float64).eps 48 | 49 | if window_size == 0: 50 | window_size = int(math.floor(0.02 * sampling_rate)) 51 | 52 | if window_size % 2 == 1: 53 | window_size = window_size + 1 54 | 55 | perc = 50 56 | len1 = int(math.floor(window_size * perc / 100)) 57 | len2 = int(window_size - len1) 58 | 59 | win = np.hanning(window_size) 60 | win = win * len2 / np.sum(win) 61 | n_fft = 2 * window_size 62 | 63 | noise_mean = np.zeros(n_fft) 64 | n_frames = len(noise) // window_size 65 | for j in range(0, window_size * n_frames, window_size): 66 | noise_mean += np.absolute(np.fft.fft(win * noise[j:j + window_size], n_fft, axis=0)) 67 | noise_mu2 = (noise_mean / n_frames) ** 2 68 | 69 | return NoiseProfile(sampling_rate, window_size, len1, len2, win, n_fft, noise_mu2) 70 | 71 | 72 | def denoise(wav, noise_profile: NoiseProfile, eta=0.15): 73 | """ 74 | Cleans the noise from a speech waveform given a noise profile. The waveform must have the 75 | same sampling rate as the one used to create the noise profile. 76 | 77 | :param wav: a speech waveform as a numpy array of floats or ints. 78 | :param noise_profile: a NoiseProfile object that was created from a similar (or a segment of 79 | the same) waveform. 80 | :param eta: voice threshold for noise update. While the voice activation detection value is 81 | below this threshold, the noise profile will be continuously updated throughout the audio. 82 | Set to 0 to disable updating the noise profile. 83 | :return: the clean wav as a numpy array of floats or ints of the same length. 
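    Illustrative usage (an assumption, mirroring how split_on_silences() in
    synthesizer/preprocess.py uses this module; noise_only_segment stands for any waveform
    containing noise only):
        profile = profile_noise(noise_only_segment, sampling_rate)
        clean_wav = denoise(wav, profile, eta=0.15)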
84 | """ 85 | wav, dtype = to_float(wav) 86 | wav += np.finfo(np.float64).eps 87 | p = noise_profile 88 | 89 | nframes = int(math.floor(len(wav) / p.len2) - math.floor(p.window_size / p.len2)) 90 | x_final = np.zeros(nframes * p.len2) 91 | 92 | aa = 0.98 93 | mu = 0.98 94 | ksi_min = 10 ** (-25 / 10) 95 | 96 | x_old = np.zeros(p.len1) 97 | xk_prev = np.zeros(p.len1) 98 | noise_mu2 = p.noise_mu2 99 | for k in range(0, nframes * p.len2, p.len2): 100 | insign = p.win * wav[k:k + p.window_size] 101 | 102 | spec = np.fft.fft(insign, p.n_fft, axis=0) 103 | sig = np.absolute(spec) 104 | sig2 = sig ** 2 105 | 106 | gammak = np.minimum(sig2 / noise_mu2, 40) 107 | 108 | if xk_prev.all() == 0: 109 | ksi = aa + (1 - aa) * np.maximum(gammak - 1, 0) 110 | else: 111 | ksi = aa * xk_prev / noise_mu2 + (1 - aa) * np.maximum(gammak - 1, 0) 112 | ksi = np.maximum(ksi_min, ksi) 113 | 114 | log_sigma_k = gammak * ksi/(1 + ksi) - np.log(1 + ksi) 115 | vad_decision = np.sum(log_sigma_k) / p.window_size 116 | if vad_decision < eta: 117 | noise_mu2 = mu * noise_mu2 + (1 - mu) * sig2 118 | 119 | a = ksi / (1 + ksi) 120 | vk = a * gammak 121 | ei_vk = 0.5 * expn(1, np.maximum(vk, 1e-8)) 122 | hw = a * np.exp(ei_vk) 123 | sig = sig * hw 124 | xk_prev = sig ** 2 125 | xi_w = np.fft.ifft(hw * spec, p.n_fft, axis=0) 126 | xi_w = np.real(xi_w) 127 | 128 | x_final[k:k + p.len2] = x_old + xi_w[0:p.len1] 129 | x_old = xi_w[p.len1:p.window_size] 130 | 131 | output = from_float(x_final, dtype) 132 | output = np.pad(output, (0, len(wav) - len(output)), mode="constant") 133 | return output 134 | 135 | 136 | ## Alternative VAD algorithm to webrctvad. It has the advantage of not requiring to install that 137 | ## darn package and it also works for any sampling rate. Maybe I'll eventually use it instead of 138 | ## webrctvad 139 | # def vad(wav, sampling_rate, eta=0.15, window_size=0): 140 | # """ 141 | # TODO: fix doc 142 | # Creates a profile of the noise in a given waveform. 143 | # 144 | # :param wav: a waveform containing noise ONLY, as a numpy array of floats or ints. 145 | # :param sampling_rate: the sampling rate of the audio 146 | # :param window_size: the size of the window the logmmse algorithm operates on. A default value 147 | # will be picked if left as 0. 148 | # :param eta: voice threshold for noise update. While the voice activation detection value is 149 | # below this threshold, the noise profile will be continuously updated throughout the audio. 150 | # Set to 0 to disable updating the noise profile. 
151 | # """ 152 | # wav, dtype = to_float(wav) 153 | # wav += np.finfo(np.float64).eps 154 | # 155 | # if window_size == 0: 156 | # window_size = int(math.floor(0.02 * sampling_rate)) 157 | # 158 | # if window_size % 2 == 1: 159 | # window_size = window_size + 1 160 | # 161 | # perc = 50 162 | # len1 = int(math.floor(window_size * perc / 100)) 163 | # len2 = int(window_size - len1) 164 | # 165 | # win = np.hanning(window_size) 166 | # win = win * len2 / np.sum(win) 167 | # n_fft = 2 * window_size 168 | # 169 | # wav_mean = np.zeros(n_fft) 170 | # n_frames = len(wav) // window_size 171 | # for j in range(0, window_size * n_frames, window_size): 172 | # wav_mean += np.absolute(np.fft.fft(win * wav[j:j + window_size], n_fft, axis=0)) 173 | # noise_mu2 = (wav_mean / n_frames) ** 2 174 | # 175 | # wav, dtype = to_float(wav) 176 | # wav += np.finfo(np.float64).eps 177 | # 178 | # nframes = int(math.floor(len(wav) / len2) - math.floor(window_size / len2)) 179 | # vad = np.zeros(nframes * len2, dtype=np.bool) 180 | # 181 | # aa = 0.98 182 | # mu = 0.98 183 | # ksi_min = 10 ** (-25 / 10) 184 | # 185 | # xk_prev = np.zeros(len1) 186 | # noise_mu2 = noise_mu2 187 | # for k in range(0, nframes * len2, len2): 188 | # insign = win * wav[k:k + window_size] 189 | # 190 | # spec = np.fft.fft(insign, n_fft, axis=0) 191 | # sig = np.absolute(spec) 192 | # sig2 = sig ** 2 193 | # 194 | # gammak = np.minimum(sig2 / noise_mu2, 40) 195 | # 196 | # if xk_prev.all() == 0: 197 | # ksi = aa + (1 - aa) * np.maximum(gammak - 1, 0) 198 | # else: 199 | # ksi = aa * xk_prev / noise_mu2 + (1 - aa) * np.maximum(gammak - 1, 0) 200 | # ksi = np.maximum(ksi_min, ksi) 201 | # 202 | # log_sigma_k = gammak * ksi / (1 + ksi) - np.log(1 + ksi) 203 | # vad_decision = np.sum(log_sigma_k) / window_size 204 | # if vad_decision < eta: 205 | # noise_mu2 = mu * noise_mu2 + (1 - mu) * sig2 206 | # print(vad_decision) 207 | # 208 | # a = ksi / (1 + ksi) 209 | # vk = a * gammak 210 | # ei_vk = 0.5 * expn(1, np.maximum(vk, 1e-8)) 211 | # hw = a * np.exp(ei_vk) 212 | # sig = sig * hw 213 | # xk_prev = sig ** 2 214 | # 215 | # vad[k:k + len2] = vad_decision >= eta 216 | # 217 | # vad = np.pad(vad, (0, len(wav) - len(vad)), mode="constant") 218 | # return vad 219 | 220 | 221 | def to_float(_input): 222 | if _input.dtype == np.float64: 223 | return _input, _input.dtype 224 | elif _input.dtype == np.float32: 225 | return _input.astype(np.float64), _input.dtype 226 | elif _input.dtype == np.uint8: 227 | return (_input - 128) / 128., _input.dtype 228 | elif _input.dtype == np.int16: 229 | return _input / 32768., _input.dtype 230 | elif _input.dtype == np.int32: 231 | return _input / 2147483648., _input.dtype 232 | raise ValueError('Unsupported wave file format') 233 | 234 | 235 | def from_float(_input, dtype): 236 | if dtype == np.float64: 237 | return _input, np.float64 238 | elif dtype == np.float32: 239 | return _input.astype(np.float32) 240 | elif dtype == np.uint8: 241 | return ((_input * 128) + 128).astype(np.uint8) 242 | elif dtype == np.int16: 243 | return (_input * 32768).astype(np.int16) 244 | elif dtype == np.int32: 245 | print(_input) 246 | return (_input * 2147483648).astype(np.int32) 247 | raise ValueError('Unsupported wave file format') 248 | -------------------------------------------------------------------------------- /utils/profiler.py: -------------------------------------------------------------------------------- 1 | from time import perf_counter as timer 2 | from collections import OrderedDict 3 | import numpy as np 4 | 5 | 
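# Minimal usage sketch (an assumption about intended use, not part of the original module;
# model, loss and data_loader are placeholder names):
#   profiler = Profiler(summarize_every=5)
#   for batch in data_loader:
#       out = model(batch);   profiler.tick("forward")
#       loss.backward();      profiler.tick("backward")
# Once any timer has accumulated `summarize_every` measurements, summarize() prints the mean
# and standard deviation of the recorded durations and the logs are purged.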
6 | class Profiler: 7 | def __init__(self, summarize_every=5, disabled=False): 8 | self.last_tick = timer() 9 | self.logs = OrderedDict() 10 | self.summarize_every = summarize_every 11 | self.disabled = disabled 12 | 13 | def tick(self, name): 14 | if self.disabled: 15 | return 16 | 17 | # Log the time needed to execute that function 18 | if not name in self.logs: 19 | self.logs[name] = [] 20 | if len(self.logs[name]) >= self.summarize_every: 21 | self.summarize() 22 | self.purge_logs() 23 | self.logs[name].append(timer() - self.last_tick) 24 | 25 | self.reset_timer() 26 | 27 | def purge_logs(self): 28 | for name in self.logs: 29 | self.logs[name].clear() 30 | 31 | def reset_timer(self): 32 | self.last_tick = timer() 33 | 34 | def summarize(self): 35 | n = max(map(len, self.logs.values())) 36 | assert n == self.summarize_every 37 | print("\nAverage execution time over %d steps:" % n) 38 | 39 | name_msgs = ["%s (%d/%d):" % (name, len(deltas), n) for name, deltas in self.logs.items()] 40 | pad = max(map(len, name_msgs)) 41 | for name_msg, deltas in zip(name_msgs, self.logs.values()): 42 | print(" %s mean: %4.0fms std: %4.0fms" % 43 | (name_msg.ljust(pad), np.mean(deltas) * 1000, np.std(deltas) * 1000)) 44 | print("", flush=True) 45 | -------------------------------------------------------------------------------- /vocoder/LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Original work Copyright (c) 2019 fatchord (https://github.com/fatchord) 4 | Modified work Copyright (c) 2019 Corentin Jemine (https://github.com/CorentinJ) 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | -------------------------------------------------------------------------------- /vocoder/audio.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import librosa 4 | import vocoder.hparams as hp 5 | from scipy.signal import lfilter 6 | 7 | 8 | def label_2_float(x, bits) : 9 | return 2 * x / (2**bits - 1.) - 1. 10 | 11 | 12 | def float_2_label(x, bits) : 13 | assert abs(x).max() <= 1.0 14 | x = (x + 1.) 
* (2**bits - 1) / 2 15 | return x.clip(0, 2**bits - 1) 16 | 17 | 18 | def load_wav(path) : 19 | return librosa.load(path, sr=hp.sample_rate)[0] 20 | 21 | 22 | def save_wav(x, path) : 23 | librosa.output.write_wav(path, x.astype(np.float32), sr=hp.sample_rate) 24 | 25 | 26 | def split_signal(x) : 27 | unsigned = x + 2**15 28 | coarse = unsigned // 256 29 | fine = unsigned % 256 30 | return coarse, fine 31 | 32 | 33 | def combine_signal(coarse, fine) : 34 | return coarse * 256 + fine - 2**15 35 | 36 | 37 | def encode_16bits(x) : 38 | return np.clip(x * 2**15, -2**15, 2**15 - 1).astype(np.int16) 39 | 40 | 41 | mel_basis = None 42 | 43 | 44 | def linear_to_mel(spectrogram): 45 | global mel_basis 46 | if mel_basis is None: 47 | mel_basis = build_mel_basis() 48 | return np.dot(mel_basis, spectrogram) 49 | 50 | 51 | def build_mel_basis(): 52 | return librosa.filters.mel(hp.sample_rate, hp.n_fft, n_mels=hp.num_mels, fmin=hp.fmin) 53 | 54 | 55 | def normalize(S): 56 | return np.clip((S - hp.min_level_db) / -hp.min_level_db, 0, 1) 57 | 58 | 59 | def denormalize(S): 60 | return (np.clip(S, 0, 1) * -hp.min_level_db) + hp.min_level_db 61 | 62 | 63 | def amp_to_db(x): 64 | return 20 * np.log10(np.maximum(1e-5, x)) 65 | 66 | 67 | def db_to_amp(x): 68 | return np.power(10.0, x * 0.05) 69 | 70 | 71 | def spectrogram(y): 72 | D = stft(y) 73 | S = amp_to_db(np.abs(D)) - hp.ref_level_db 74 | return normalize(S) 75 | 76 | 77 | def melspectrogram(y): 78 | D = stft(y) 79 | S = amp_to_db(linear_to_mel(np.abs(D))) 80 | return normalize(S) 81 | 82 | 83 | def stft(y): 84 | return librosa.stft(y=y, n_fft=hp.n_fft, hop_length=hp.hop_length, win_length=hp.win_length) 85 | 86 | 87 | def pre_emphasis(x): 88 | return lfilter([1, -hp.preemphasis], [1], x) 89 | 90 | 91 | def de_emphasis(x): 92 | return lfilter([1], [1, -hp.preemphasis], x) 93 | 94 | 95 | def encode_mu_law(x, mu) : 96 | mu = mu - 1 97 | fx = np.sign(x) * np.log(1 + mu * np.abs(x)) / np.log(1 + mu) 98 | return np.floor((fx + 1) / 2 * mu + 0.5) 99 | 100 | 101 | def decode_mu_law(y, mu, from_labels=True) : 102 | if from_labels: 103 | y = label_2_float(y, math.log2(mu)) 104 | mu = mu - 1 105 | x = np.sign(y) / mu * ((1 + mu) ** np.abs(y) - 1) 106 | return x 107 | 108 | -------------------------------------------------------------------------------- /vocoder/display.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import time 3 | import numpy as np 4 | import sys 5 | 6 | 7 | def progbar(i, n, size=16): 8 | done = (i * size) // n 9 | bar = '' 10 | for i in range(size): 11 | bar += '█' if i <= done else '░' 12 | return bar 13 | 14 | 15 | def stream(message) : 16 | sys.stdout.write("\r{%s}" % message) 17 | 18 | 19 | def simple_table(item_tuples) : 20 | 21 | border_pattern = '+---------------------------------------' 22 | whitespace = ' ' 23 | 24 | headings, cells, = [], [] 25 | 26 | for item in item_tuples : 27 | 28 | heading, cell = str(item[0]), str(item[1]) 29 | 30 | pad_head = True if len(heading) < len(cell) else False 31 | 32 | pad = abs(len(heading) - len(cell)) 33 | pad = whitespace[:pad] 34 | 35 | pad_left = pad[:len(pad)//2] 36 | pad_right = pad[len(pad)//2:] 37 | 38 | if pad_head : 39 | heading = pad_left + heading + pad_right 40 | else : 41 | cell = pad_left + cell + pad_right 42 | 43 | headings += [heading] 44 | cells += [cell] 45 | 46 | border, head, body = '', '', '' 47 | 48 | for i in range(len(item_tuples)) : 49 | 50 | temp_head = f'| {headings[i]} ' 51 | temp_body = f'| 
{cells[i]} ' 52 | 53 | border += border_pattern[:len(temp_head)] 54 | head += temp_head 55 | body += temp_body 56 | 57 | if i == len(item_tuples) - 1 : 58 | head += '|' 59 | body += '|' 60 | border += '+' 61 | 62 | print(border) 63 | print(head) 64 | print(border) 65 | print(body) 66 | print(border) 67 | print(' ') 68 | 69 | 70 | def time_since(started) : 71 | elapsed = time.time() - started 72 | m = int(elapsed // 60) 73 | s = int(elapsed % 60) 74 | if m >= 60 : 75 | h = int(m // 60) 76 | m = m % 60 77 | return f'{h}h {m}m {s}s' 78 | else : 79 | return f'{m}m {s}s' 80 | 81 | 82 | def save_attention(attn, path) : 83 | fig = plt.figure(figsize=(12, 6)) 84 | plt.imshow(attn.T, interpolation='nearest', aspect='auto') 85 | fig.savefig(f'{path}.png', bbox_inches='tight') 86 | plt.close(fig) 87 | 88 | 89 | def save_spectrogram(M, path, length=None) : 90 | M = np.flip(M, axis=0) 91 | if length : M = M[:, :length] 92 | fig = plt.figure(figsize=(12, 6)) 93 | plt.imshow(M, interpolation='nearest', aspect='auto') 94 | fig.savefig(f'{path}.png', bbox_inches='tight') 95 | plt.close(fig) 96 | 97 | 98 | def plot(array) : 99 | fig = plt.figure(figsize=(30, 5)) 100 | ax = fig.add_subplot(111) 101 | ax.xaxis.label.set_color('grey') 102 | ax.yaxis.label.set_color('grey') 103 | ax.xaxis.label.set_fontsize(23) 104 | ax.yaxis.label.set_fontsize(23) 105 | ax.tick_params(axis='x', colors='grey', labelsize=23) 106 | ax.tick_params(axis='y', colors='grey', labelsize=23) 107 | plt.plot(array) 108 | 109 | 110 | def plot_spec(M) : 111 | M = np.flip(M, axis=0) 112 | plt.figure(figsize=(18,4)) 113 | plt.imshow(M, interpolation='nearest', aspect='auto') 114 | plt.show() 115 | 116 | -------------------------------------------------------------------------------- /vocoder/distribution.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn.functional as F 4 | 5 | 6 | def log_sum_exp(x): 7 | """ numerically stable log_sum_exp implementation that prevents overflow """ 8 | # TF ordering 9 | axis = len(x.size()) - 1 10 | m, _ = torch.max(x, dim=axis) 11 | m2, _ = torch.max(x, dim=axis, keepdim=True) 12 | return m + torch.log(torch.sum(torch.exp(x - m2), dim=axis)) 13 | 14 | 15 | # It is adapted from https://github.com/r9y9/wavenet_vocoder/blob/master/wavenet_vocoder/mixture.py 16 | def discretized_mix_logistic_loss(y_hat, y, num_classes=65536, 17 | log_scale_min=None, reduce=True): 18 | if log_scale_min is None: 19 | log_scale_min = float(np.log(1e-14)) 20 | y_hat = y_hat.permute(0,2,1) 21 | assert y_hat.dim() == 3 22 | assert y_hat.size(1) % 3 == 0 23 | nr_mix = y_hat.size(1) // 3 24 | 25 | # (B x T x C) 26 | y_hat = y_hat.transpose(1, 2) 27 | 28 | # unpack parameters. (B, T, num_mixtures) x 3 29 | logit_probs = y_hat[:, :, :nr_mix] 30 | means = y_hat[:, :, nr_mix:2 * nr_mix] 31 | log_scales = torch.clamp(y_hat[:, :, 2 * nr_mix:3 * nr_mix], min=log_scale_min) 32 | 33 | # B x T x 1 -> B x T x num_mixtures 34 | y = y.expand_as(means) 35 | 36 | centered_y = y - means 37 | inv_stdv = torch.exp(-log_scales) 38 | plus_in = inv_stdv * (centered_y + 1. / (num_classes - 1)) 39 | cdf_plus = torch.sigmoid(plus_in) 40 | min_in = inv_stdv * (centered_y - 1. 
/ (num_classes - 1)) 41 | cdf_min = torch.sigmoid(min_in) 42 | 43 | # log probability for edge case of 0 (before scaling) 44 | # equivalent: torch.log(F.sigmoid(plus_in)) 45 | log_cdf_plus = plus_in - F.softplus(plus_in) 46 | 47 | # log probability for edge case of 255 (before scaling) 48 | # equivalent: (1 - F.sigmoid(min_in)).log() 49 | log_one_minus_cdf_min = -F.softplus(min_in) 50 | 51 | # probability for all other cases 52 | cdf_delta = cdf_plus - cdf_min 53 | 54 | mid_in = inv_stdv * centered_y 55 | # log probability in the center of the bin, to be used in extreme cases 56 | # (not actually used in our code) 57 | log_pdf_mid = mid_in - log_scales - 2. * F.softplus(mid_in) 58 | 59 | # tf equivalent 60 | """ 61 | log_probs = tf.where(x < -0.999, log_cdf_plus, 62 | tf.where(x > 0.999, log_one_minus_cdf_min, 63 | tf.where(cdf_delta > 1e-5, 64 | tf.log(tf.maximum(cdf_delta, 1e-12)), 65 | log_pdf_mid - np.log(127.5)))) 66 | """ 67 | # TODO: cdf_delta <= 1e-5 actually can happen. How can we choose the value 68 | # for num_classes=65536 case? 1e-7? not sure.. 69 | inner_inner_cond = (cdf_delta > 1e-5).float() 70 | 71 | inner_inner_out = inner_inner_cond * \ 72 | torch.log(torch.clamp(cdf_delta, min=1e-12)) + \ 73 | (1. - inner_inner_cond) * (log_pdf_mid - np.log((num_classes - 1) / 2)) 74 | inner_cond = (y > 0.999).float() 75 | inner_out = inner_cond * log_one_minus_cdf_min + (1. - inner_cond) * inner_inner_out 76 | cond = (y < -0.999).float() 77 | log_probs = cond * log_cdf_plus + (1. - cond) * inner_out 78 | 79 | log_probs = log_probs + F.log_softmax(logit_probs, -1) 80 | 81 | if reduce: 82 | return -torch.mean(log_sum_exp(log_probs)) 83 | else: 84 | return -log_sum_exp(log_probs).unsqueeze(-1) 85 | 86 | 87 | def sample_from_discretized_mix_logistic(y, log_scale_min=None): 88 | """ 89 | Sample from discretized mixture of logistic distributions 90 | Args: 91 | y (Tensor): B x C x T 92 | log_scale_min (float): Log scale minimum value 93 | Returns: 94 | Tensor: sample in range of [-1, 1]. 95 | """ 96 | if log_scale_min is None: 97 | log_scale_min = float(np.log(1e-14)) 98 | assert y.size(1) % 3 == 0 99 | nr_mix = y.size(1) // 3 100 | 101 | # B x T x C 102 | y = y.transpose(1, 2) 103 | logit_probs = y[:, :, :nr_mix] 104 | 105 | # sample mixture indicator from softmax 106 | temp = logit_probs.data.new(logit_probs.size()).uniform_(1e-5, 1.0 - 1e-5) 107 | temp = logit_probs.data - torch.log(- torch.log(temp)) 108 | _, argmax = temp.max(dim=-1) 109 | 110 | # (B, T) -> (B, T, nr_mix) 111 | one_hot = to_one_hot(argmax, nr_mix) 112 | # select logistic parameters 113 | means = torch.sum(y[:, :, nr_mix:2 * nr_mix] * one_hot, dim=-1) 114 | log_scales = torch.clamp(torch.sum( 115 | y[:, :, 2 * nr_mix:3 * nr_mix] * one_hot, dim=-1), min=log_scale_min) 116 | # sample from logistic & clip to interval 117 | # we don't actually round to the nearest 8bit value when sampling 118 | u = means.data.new(means.size()).uniform_(1e-5, 1.0 - 1e-5) 119 | x = means + torch.exp(log_scales) * (torch.log(u) - torch.log(1. - u)) 120 | 121 | x = torch.clamp(torch.clamp(x, min=-1.), max=1.) 
122 | 
123 |     return x
124 | 
125 | 
126 | def to_one_hot(tensor, n, fill_with=1.):
127 |     # we perform one hot encoding with respect to the last axis
128 |     one_hot = torch.FloatTensor(tensor.size() + (n,)).zero_()
129 |     if tensor.is_cuda:
130 |         one_hot = one_hot.cuda()
131 |     one_hot.scatter_(len(tensor.size()), tensor.unsqueeze(-1), fill_with)
132 |     return one_hot
133 | 
--------------------------------------------------------------------------------
/vocoder/gen_wavernn.py:
--------------------------------------------------------------------------------
1 | from vocoder.models.fatchord_version import WaveRNN
2 | from vocoder.audio import *
3 | 
4 | 
5 | def gen_testset(model: WaveRNN, test_set, samples, batched, target, overlap, save_path):
6 |     k = model.get_step() // 1000
7 | 
8 |     for i, (m, x) in enumerate(test_set, 1):
9 |         if i > samples:
10 |             break
11 | 
12 |         print('\n| Generating: %i/%i' % (i, samples))
13 | 
14 |         x = x[0].numpy()
15 | 
16 |         bits = 16 if hp.voc_mode == 'MOL' else hp.bits
17 | 
18 |         if hp.mu_law and hp.voc_mode != 'MOL' :
19 |             x = decode_mu_law(x, 2**bits, from_labels=True)
20 |         else :
21 |             x = label_2_float(x, bits)
22 | 
23 |         save_wav(x, save_path.joinpath("%dk_steps_%d_target.wav" % (k, i)))
24 | 
25 |         batch_str = "gen_batched_target%d_overlap%d" % (target, overlap) if batched else \
26 |             "gen_not_batched"
27 |         save_str = save_path.joinpath("%dk_steps_%d_%s.wav" % (k, i, batch_str))
28 | 
29 |         wav = model.generate(m, batched, target, overlap, hp.mu_law)
30 |         save_wav(wav, save_str)
31 | 
32 | 
--------------------------------------------------------------------------------
/vocoder/hparams.py:
--------------------------------------------------------------------------------
1 | from synthesizer.hparams import hparams as _syn_hp
2 | 
3 | 
4 | # Audio settings------------------------------------------------------------------------
5 | # Match the values of the synthesizer
6 | sample_rate = _syn_hp.sample_rate
7 | n_fft = _syn_hp.n_fft
8 | num_mels = _syn_hp.num_mels
9 | hop_length = _syn_hp.hop_size
10 | win_length = _syn_hp.win_size
11 | fmin = _syn_hp.fmin
12 | min_level_db = _syn_hp.min_level_db
13 | ref_level_db = _syn_hp.ref_level_db
14 | mel_max_abs_value = _syn_hp.max_abs_value
15 | preemphasis = _syn_hp.preemphasis
16 | apply_preemphasis = _syn_hp.preemphasize
17 | 
18 | bits = 9                            # bit depth of signal
19 | mu_law = True                       # Recommended to suppress noise if using raw bits in hp.voc_mode
20 |                                     # below
21 | 
22 | 
23 | # WAVERNN / VOCODER --------------------------------------------------------------------------------
24 | voc_mode = 'RAW'                    # either 'RAW' (softmax on raw bits) or 'MOL' (sample from
25 |                                     # mixture of logistics)
26 | voc_upsample_factors = (5, 5, 8)    # NB - this needs to correctly factorise hop_length
27 | voc_rnn_dims = 512
28 | voc_fc_dims = 512
29 | voc_compute_dims = 128
30 | voc_res_out_dims = 128
31 | voc_res_blocks = 10
32 | 
33 | # Training
34 | voc_batch_size = 100
35 | voc_lr = 1e-4
36 | voc_gen_at_checkpoint = 5           # number of samples to generate at each checkpoint
37 | voc_pad = 2                         # this will pad the input so that the resnet can 'see' wider
38 |                                     # than input length
39 | voc_seq_len = hop_length * 5        # must be a multiple of hop_length
40 | 
41 | # Generating / Synthesizing
42 | voc_gen_batched = True              # very fast (realtime+) single utterance batched generation
43 | voc_target = 8000                   # target number of samples to be generated in each batch entry
44 | voc_overlap = 400                   # number of samples for crossfading between batches
45 | 
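A minimal sanity check for these settings (a sketch, not part of the original file): vocoder/train.py later asserts that the upsample factors multiply out to the hop length inherited from the synthesizer, and the same check can be run on its own before launching a long training job.

import numpy as np
import vocoder.hparams as hp

# The conditioning mel frames are upsampled by the product of these factors,
# so (5, 5, 8) must factorise the synthesizer's hop length exactly.
assert np.prod(hp.voc_upsample_factors) == hp.hop_length, \
    "voc_upsample_factors must multiply to hop_length"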
--------------------------------------------------------------------------------
/vocoder/inference.py:
--------------------------------------------------------------------------------
1 | from vocoder.models.fatchord_version import WaveRNN
2 | from vocoder import hparams as hp
3 | import torch
4 | 
5 | 
6 | _model = None   # type: WaveRNN
7 | 
8 | def load_model(weights_fpath, verbose=True):
9 |     global _model
10 | 
11 |     if verbose:
12 |         print("Building Wave-RNN")
13 |     _model = WaveRNN(
14 |         rnn_dims=hp.voc_rnn_dims,
15 |         fc_dims=hp.voc_fc_dims,
16 |         bits=hp.bits,
17 |         pad=hp.voc_pad,
18 |         upsample_factors=hp.voc_upsample_factors,
19 |         feat_dims=hp.num_mels,
20 |         compute_dims=hp.voc_compute_dims,
21 |         res_out_dims=hp.voc_res_out_dims,
22 |         res_blocks=hp.voc_res_blocks,
23 |         hop_length=hp.hop_length,
24 |         sample_rate=hp.sample_rate,
25 |         mode=hp.voc_mode
26 |     ).cuda()
27 | 
28 |     if verbose:
29 |         print("Loading model weights at %s" % weights_fpath)
30 |     checkpoint = torch.load(weights_fpath)
31 |     _model.load_state_dict(checkpoint['model_state'])
32 |     _model.eval()
33 | 
34 | 
35 | def is_loaded():
36 |     return _model is not None
37 | 
38 | 
39 | def infer_waveform(mel, normalize=True, batched=True, target=8000, overlap=800, 
40 |                    progress_callback=None):
41 |     """
42 |     Infers the waveform of a mel spectrogram output by the synthesizer (the format must match 
43 |     that of the synthesizer!)
44 |     
45 |     :param normalize: whether to rescale the mel by 1 / hp.mel_max_abs_value before vocoding
46 |     :param batched: whether to use the faster batched generation of WaveRNN
47 |     :param target: number of samples to generate in each batch entry when batched
48 |     :param overlap: number of samples shared between consecutive batch entries, used for crossfading
49 |     :return: the generated waveform
50 |     """
51 |     if _model is None:
52 |         raise Exception("Please load Wave-RNN in memory before using it")
53 | 
54 |     if normalize:
55 |         mel = mel / hp.mel_max_abs_value
56 |     mel = torch.from_numpy(mel[None, ...])
57 |     wav = _model.generate(mel, batched, target, overlap, hp.mu_law, progress_callback)
58 |     return wav
59 | 
--------------------------------------------------------------------------------
/vocoder/models/deepmind_version.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | from utils.display import *
5 | from utils.dsp import *
6 | 
7 | 
8 | class WaveRNN(nn.Module) :
9 |     def __init__(self, hidden_size=896, quantisation=256) :
10 |         super(WaveRNN, self).__init__()
11 | 
12 |         self.hidden_size = hidden_size
13 |         self.split_size = hidden_size // 2
14 | 
15 |         # The main matmul
16 |         self.R = nn.Linear(self.hidden_size, 3 * self.hidden_size, bias=False)
17 | 
18 |         # Output fc layers
19 |         self.O1 = nn.Linear(self.split_size, self.split_size)
20 |         self.O2 = nn.Linear(self.split_size, quantisation)
21 |         self.O3 = nn.Linear(self.split_size, self.split_size)
22 |         self.O4 = nn.Linear(self.split_size, quantisation)
23 | 
24 |         # Input fc layers
25 |         self.I_coarse = nn.Linear(2, 3 * self.split_size, bias=False)
26 |         self.I_fine = nn.Linear(3, 3 * self.split_size, bias=False)
27 | 
28 |         # biases for the gates
29 |         self.bias_u = nn.Parameter(torch.zeros(self.hidden_size))
30 |         self.bias_r = nn.Parameter(torch.zeros(self.hidden_size))
31 |         self.bias_e = nn.Parameter(torch.zeros(self.hidden_size))
32 | 
33 |         # display num params
34 |         self.num_params()
35 | 
36 | 
37 |     def forward(self, prev_y, prev_hidden, current_coarse) :
38 | 
39 |         # Main matmul - the projection is split 3 ways
40 |         R_hidden = self.R(prev_hidden)
41 |         R_u, R_r, R_e, = torch.split(R_hidden, self.hidden_size, dim=1)
42 | 
43 |         # Project the prev input
44 |         coarse_input_proj = self.I_coarse(prev_y)
45 |         I_coarse_u, I_coarse_r, I_coarse_e = \
46 |             torch.split(coarse_input_proj, self.split_size, dim=1)
47 | 
48 |         # Project the prev input and current coarse sample
49 |         fine_input = torch.cat([prev_y, current_coarse], dim=1)
50 |         fine_input_proj = self.I_fine(fine_input)
51 |         I_fine_u, I_fine_r, I_fine_e = \
52 |             torch.split(fine_input_proj, self.split_size, dim=1)
53 | 
54 |         # concatenate for the gates
55 |         I_u = torch.cat([I_coarse_u, I_fine_u], dim=1)
56 |         I_r = torch.cat([I_coarse_r, I_fine_r], dim=1)
57 |         I_e = torch.cat([I_coarse_e, I_fine_e], dim=1)
58 | 
59 |         # Compute all gates for coarse and fine
60 |         u = F.sigmoid(R_u + I_u + self.bias_u)
61 |         r = F.sigmoid(R_r + I_r + self.bias_r)
62 |         e = F.tanh(r * R_e + I_e + self.bias_e)
63 |         hidden = u * prev_hidden + (1. - u) * e
64 | 
65 |         # Split the hidden state
66 |         hidden_coarse, hidden_fine = torch.split(hidden, self.split_size, dim=1)
67 | 
68 |         # Compute outputs
69 |         out_coarse = self.O2(F.relu(self.O1(hidden_coarse)))
70 |         out_fine = self.O4(F.relu(self.O3(hidden_fine)))
71 | 
72 |         return out_coarse, out_fine, hidden
73 | 
74 | 
75 |     def generate(self, seq_len):
76 |         with torch.no_grad():
77 |             # First split up the biases for the gates
78 |             b_coarse_u, b_fine_u = torch.split(self.bias_u, self.split_size)
79 |             b_coarse_r, b_fine_r = torch.split(self.bias_r, self.split_size)
80 |             b_coarse_e, b_fine_e = torch.split(self.bias_e, self.split_size)
81 | 
82 |             # Lists for the two output seqs
83 |             c_outputs, f_outputs = [], []
84 | 
85 |             # Some initial inputs
86 |             out_coarse = torch.LongTensor([0]).cuda()
87 |             out_fine = torch.LongTensor([0]).cuda()
88 | 
89 |             # We'll need a hidden state
90 |             hidden = self.init_hidden()
91 | 
92 |             # Need a clock for display
93 |             start = time.time()
94 | 
95 |             # Loop for generation
96 |             for i in range(seq_len) :
97 | 
98 |                 # Split into two hidden states
99 |                 hidden_coarse, hidden_fine = \
100 |                     torch.split(hidden, self.split_size, dim=1)
101 | 
102 |                 # Scale and concat previous predictions
103 |                 out_coarse = out_coarse.unsqueeze(0).float() / 127.5 - 1.
104 |                 out_fine = out_fine.unsqueeze(0).float() / 127.5 - 1.
105 |                 prev_outputs = torch.cat([out_coarse, out_fine], dim=1)
106 | 
107 |                 # Project input
108 |                 coarse_input_proj = self.I_coarse(prev_outputs)
109 |                 I_coarse_u, I_coarse_r, I_coarse_e = \
110 |                     torch.split(coarse_input_proj, self.split_size, dim=1)
111 | 
112 |                 # Project hidden state and split 6 ways
113 |                 R_hidden = self.R(hidden)
114 |                 R_coarse_u , R_fine_u, \
115 |                 R_coarse_r, R_fine_r, \
116 |                 R_coarse_e, R_fine_e = torch.split(R_hidden, self.split_size, dim=1)
117 | 
118 |                 # Compute the coarse gates
119 |                 u = F.sigmoid(R_coarse_u + I_coarse_u + b_coarse_u)
120 |                 r = F.sigmoid(R_coarse_r + I_coarse_r + b_coarse_r)
121 |                 e = F.tanh(r * R_coarse_e + I_coarse_e + b_coarse_e)
122 |                 hidden_coarse = u * hidden_coarse + (1. - u) * e
123 | 
124 |                 # Compute the coarse output
125 |                 out_coarse = self.O2(F.relu(self.O1(hidden_coarse)))
126 |                 posterior = F.softmax(out_coarse, dim=1)
127 |                 distrib = torch.distributions.Categorical(posterior)
128 |                 out_coarse = distrib.sample()
129 |                 c_outputs.append(out_coarse)
130 | 
131 |                 # Project the [prev outputs and predicted coarse sample]
132 |                 coarse_pred = out_coarse.float() / 127.5 - 1.
133 |                 fine_input = torch.cat([prev_outputs, coarse_pred.unsqueeze(0)], dim=1)
134 |                 fine_input_proj = self.I_fine(fine_input)
135 |                 I_fine_u, I_fine_r, I_fine_e = \
136 |                     torch.split(fine_input_proj, self.split_size, dim=1)
137 | 
138 |                 # Compute the fine gates
139 |                 u = F.sigmoid(R_fine_u + I_fine_u + b_fine_u)
140 |                 r = F.sigmoid(R_fine_r + I_fine_r + b_fine_r)
141 |                 e = F.tanh(r * R_fine_e + I_fine_e + b_fine_e)
142 |                 hidden_fine = u * hidden_fine + (1. - u) * e
143 | 
144 |                 # Compute the fine output
145 |                 out_fine = self.O4(F.relu(self.O3(hidden_fine)))
146 |                 posterior = F.softmax(out_fine, dim=1)
147 |                 distrib = torch.distributions.Categorical(posterior)
148 |                 out_fine = distrib.sample()
149 |                 f_outputs.append(out_fine)
150 | 
151 |                 # Put the hidden state back together
152 |                 hidden = torch.cat([hidden_coarse, hidden_fine], dim=1)
153 | 
154 |                 # Display progress
155 |                 speed = (i + 1) / (time.time() - start)
156 |                 stream('Gen: %i/%i -- Speed: %i' % (i + 1, seq_len, speed))
157 | 
158 |             coarse = torch.stack(c_outputs).squeeze(1).cpu().data.numpy()
159 |             fine = torch.stack(f_outputs).squeeze(1).cpu().data.numpy()
160 |             output = combine_signal(coarse, fine)
161 | 
162 |         return output, coarse, fine
163 | 
164 |     def init_hidden(self, batch_size=1) :
165 |         return torch.zeros(batch_size, self.hidden_size).cuda()
166 | 
167 |     def num_params(self) :
168 |         parameters = filter(lambda p: p.requires_grad, self.parameters())
169 |         parameters = sum([np.prod(p.size()) for p in parameters]) / 1_000_000
170 |         print('Trainable Parameters: %.3f million' % parameters)
--------------------------------------------------------------------------------
/vocoder/train.py:
--------------------------------------------------------------------------------
1 | from vocoder.models.fatchord_version import WaveRNN
2 | from vocoder.vocoder_dataset import VocoderDataset, collate_vocoder
3 | from vocoder.distribution import discretized_mix_logistic_loss
4 | from vocoder.display import stream, simple_table
5 | from vocoder.gen_wavernn import gen_testset
6 | from torch.utils.data import DataLoader
7 | from pathlib import Path
8 | from torch import optim
9 | import torch.nn.functional as F
10 | import vocoder.hparams as hp
11 | import numpy as np
12 | import time
13 | 
14 | 
15 | def train(run_id: str, syn_dir: Path, voc_dir: Path, models_dir: Path, ground_truth: bool,
16 |           save_every: int, backup_every: int, force_restart: bool):
17 |     # Check to make sure the hop length is correctly factorised
18 |     assert np.cumprod(hp.voc_upsample_factors)[-1] == hp.hop_length
19 | 
20 |     # Instantiate the model
21 |     print("Initializing the model...")
22 |     model = WaveRNN(
23 |         rnn_dims=hp.voc_rnn_dims,
24 |         fc_dims=hp.voc_fc_dims,
25 |         bits=hp.bits,
26 |         pad=hp.voc_pad,
27 |         upsample_factors=hp.voc_upsample_factors,
28 |         feat_dims=hp.num_mels,
29 |         compute_dims=hp.voc_compute_dims,
30 |         res_out_dims=hp.voc_res_out_dims,
31 |         res_blocks=hp.voc_res_blocks,
32 |         hop_length=hp.hop_length,
33 |         sample_rate=hp.sample_rate,
34 |         mode=hp.voc_mode
35 |     ).cuda()
36 | 
37 |     # Initialize the optimizer
38 |     optimizer = optim.Adam(model.parameters())
39 |     for p in optimizer.param_groups:
40 |         p["lr"] = hp.voc_lr
41 |     loss_func = F.cross_entropy if model.mode == "RAW" else discretized_mix_logistic_loss
42 | 
43 |     # Load the weights
44 |     model_dir = models_dir.joinpath(run_id)
45 |     model_dir.mkdir(exist_ok=True)
46 |     weights_fpath = model_dir.joinpath(run_id + ".pt")
47 |     if force_restart or not weights_fpath.exists():
48 |         print("\nStarting the training of WaveRNN from
scratch\n") 49 | model.save(weights_fpath, optimizer) 50 | else: 51 | print("\nLoading weights at %s" % weights_fpath) 52 | model.load(weights_fpath, optimizer) 53 | print("WaveRNN weights loaded from step %d" % model.step) 54 | 55 | # Initialize the dataset 56 | metadata_fpath = syn_dir.joinpath("train.txt") if ground_truth else \ 57 | voc_dir.joinpath("synthesized.txt") 58 | mel_dir = syn_dir.joinpath("mels") if ground_truth else voc_dir.joinpath("mels_gta") 59 | wav_dir = syn_dir.joinpath("audio") 60 | dataset = VocoderDataset(metadata_fpath, mel_dir, wav_dir) 61 | test_loader = DataLoader(dataset, 62 | batch_size=1, 63 | shuffle=True, 64 | pin_memory=True) 65 | 66 | # Begin the training 67 | simple_table([('Batch size', hp.voc_batch_size), 68 | ('LR', hp.voc_lr), 69 | ('Sequence Len', hp.voc_seq_len)]) 70 | 71 | for epoch in range(1, 350): 72 | data_loader = DataLoader(dataset, 73 | collate_fn=collate_vocoder, 74 | batch_size=hp.voc_batch_size, 75 | num_workers=2, 76 | shuffle=True, 77 | pin_memory=True) 78 | start = time.time() 79 | running_loss = 0. 80 | 81 | for i, (x, y, m) in enumerate(data_loader, 1): 82 | x, m, y = x.cuda(), m.cuda(), y.cuda() 83 | 84 | # Forward pass 85 | y_hat = model(x, m) 86 | if model.mode == 'RAW': 87 | y_hat = y_hat.transpose(1, 2).unsqueeze(-1) 88 | elif model.mode == 'MOL': 89 | y = y.float() 90 | y = y.unsqueeze(-1) 91 | 92 | # Backward pass 93 | loss = loss_func(y_hat, y) 94 | optimizer.zero_grad() 95 | loss.backward() 96 | optimizer.step() 97 | 98 | running_loss += loss.item() 99 | speed = i / (time.time() - start) 100 | avg_loss = running_loss / i 101 | 102 | step = model.get_step() 103 | k = step // 1000 104 | 105 | if backup_every != 0 and step % backup_every == 0 : 106 | model.checkpoint(model_dir, optimizer) 107 | 108 | if save_every != 0 and step % save_every == 0 : 109 | model.save(weights_fpath, optimizer) 110 | 111 | msg = f"| Epoch: {epoch} ({i}/{len(data_loader)}) | " \ 112 | f"Loss: {avg_loss:.4f} | {speed:.1f} " \ 113 | f"steps/s | Step: {k}k | " 114 | stream(msg) 115 | 116 | 117 | gen_testset(model, test_loader, hp.voc_gen_at_checkpoint, hp.voc_gen_batched, 118 | hp.voc_target, hp.voc_overlap, model_dir) 119 | print("") 120 | -------------------------------------------------------------------------------- /vocoder/vocoder_dataset.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import Dataset 2 | from pathlib import Path 3 | from vocoder import audio 4 | import vocoder.hparams as hp 5 | import numpy as np 6 | import torch 7 | 8 | 9 | class VocoderDataset(Dataset): 10 | def __init__(self, metadata_fpath: Path, mel_dir: Path, wav_dir: Path): 11 | print("Using inputs from:\n\t%s\n\t%s\n\t%s" % (metadata_fpath, mel_dir, wav_dir)) 12 | 13 | with metadata_fpath.open("r") as metadata_file: 14 | metadata = [line.split("|") for line in metadata_file] 15 | 16 | gta_fnames = [x[1] for x in metadata if int(x[4])] 17 | gta_fpaths = [mel_dir.joinpath(fname) for fname in gta_fnames] 18 | wav_fnames = [x[0] for x in metadata if int(x[4])] 19 | wav_fpaths = [wav_dir.joinpath(fname) for fname in wav_fnames] 20 | self.samples_fpaths = list(zip(gta_fpaths, wav_fpaths)) 21 | 22 | print("Found %d samples" % len(self.samples_fpaths)) 23 | 24 | def __getitem__(self, index): 25 | mel_path, wav_path = self.samples_fpaths[index] 26 | 27 | # Load the mel spectrogram and adjust its range to [-1, 1] 28 | mel = np.load(mel_path).T.astype(np.float32) / hp.mel_max_abs_value 29 | 30 | # Load the wav 31 | wav 
= np.load(wav_path) 32 | if hp.apply_preemphasis: 33 | wav = audio.pre_emphasis(wav) 34 | wav = np.clip(wav, -1, 1) 35 | 36 | # Fix for missing padding # TODO: settle on whether this is any useful 37 | r_pad = (len(wav) // hp.hop_length + 1) * hp.hop_length - len(wav) 38 | wav = np.pad(wav, (0, r_pad), mode='constant') 39 | assert len(wav) >= mel.shape[1] * hp.hop_length 40 | wav = wav[:mel.shape[1] * hp.hop_length] 41 | assert len(wav) % hp.hop_length == 0 42 | 43 | # Quantize the wav 44 | if hp.voc_mode == 'RAW': 45 | if hp.mu_law: 46 | quant = audio.encode_mu_law(wav, mu=2 ** hp.bits) 47 | else: 48 | quant = audio.float_2_label(wav, bits=hp.bits) 49 | elif hp.voc_mode == 'MOL': 50 | quant = audio.float_2_label(wav, bits=16) 51 | 52 | return mel.astype(np.float32), quant.astype(np.int64) 53 | 54 | def __len__(self): 55 | return len(self.samples_fpaths) 56 | 57 | 58 | def collate_vocoder(batch): 59 | mel_win = hp.voc_seq_len // hp.hop_length + 2 * hp.voc_pad 60 | max_offsets = [x[0].shape[-1] -2 - (mel_win + 2 * hp.voc_pad) for x in batch] 61 | mel_offsets = [np.random.randint(0, offset) for offset in max_offsets] 62 | sig_offsets = [(offset + hp.voc_pad) * hp.hop_length for offset in mel_offsets] 63 | 64 | mels = [x[0][:, mel_offsets[i]:mel_offsets[i] + mel_win] for i, x in enumerate(batch)] 65 | 66 | labels = [x[1][sig_offsets[i]:sig_offsets[i] + hp.voc_seq_len + 1] for i, x in enumerate(batch)] 67 | 68 | mels = np.stack(mels).astype(np.float32) 69 | labels = np.stack(labels).astype(np.int64) 70 | 71 | mels = torch.tensor(mels) 72 | labels = torch.tensor(labels).long() 73 | 74 | x = labels[:, :hp.voc_seq_len] 75 | y = labels[:, 1:] 76 | 77 | bits = 16 if hp.voc_mode == 'MOL' else hp.bits 78 | 79 | x = audio.label_2_float(x.float(), bits) 80 | 81 | if hp.voc_mode == 'MOL' : 82 | y = audio.label_2_float(y.float(), bits) 83 | 84 | return x, y, mels -------------------------------------------------------------------------------- /vocoder_preprocess.py: -------------------------------------------------------------------------------- 1 | from synthesizer.synthesize import run_synthesis 2 | from synthesizer.hparams import hparams 3 | from utils.argutils import print_args 4 | import argparse 5 | import os 6 | 7 | 8 | if __name__ == "__main__": 9 | class MyFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter): 10 | pass 11 | 12 | parser = argparse.ArgumentParser( 13 | description="Creates ground-truth aligned (GTA) spectrograms from the vocoder.", 14 | formatter_class=MyFormatter 15 | ) 16 | parser.add_argument("datasets_root", type=str, help=\ 17 | "Path to the directory containing your SV2TTS directory. If you specify both --in_dir and " 18 | "--out_dir, this argument won't be used.") 19 | parser.add_argument("--model_dir", type=str, 20 | default="synthesizer/saved_models/logs-pretrained/", help=\ 21 | "Path to the pretrained model directory.") 22 | parser.add_argument("-i", "--in_dir", type=str, default=argparse.SUPPRESS, help= \ 23 | "Path to the synthesizer directory that contains the mel spectrograms, the wavs and the " 24 | "embeds. Defaults to /SV2TTS/synthesizer/.") 25 | parser.add_argument("-o", "--out_dir", type=str, default=argparse.SUPPRESS, help= \ 26 | "Path to the output vocoder directory that will contain the ground truth aligned mel " 27 | "spectrograms. 
Defaults to /SV2TTS/vocoder/.") 28 | parser.add_argument("--hparams", default="", 29 | help="Hyperparameter overrides as a comma-separated list of name=value " 30 | "pairs") 31 | args = parser.parse_args() 32 | print_args(args, parser) 33 | modified_hp = hparams.parse(args.hparams) 34 | 35 | if not hasattr(args, "in_dir"): 36 | args.in_dir = os.path.join(args.datasets_root, "SV2TTS", "synthesizer") 37 | if not hasattr(args, "out_dir"): 38 | args.out_dir = os.path.join(args.datasets_root, "SV2TTS", "vocoder") 39 | 40 | run_synthesis(args.in_dir, args.out_dir, args.model_dir, modified_hp) 41 | -------------------------------------------------------------------------------- /vocoder_train.py: -------------------------------------------------------------------------------- 1 | from utils.argutils import print_args 2 | from vocoder.train import train 3 | from pathlib import Path 4 | import argparse 5 | 6 | 7 | if __name__ == "__main__": 8 | parser = argparse.ArgumentParser( 9 | description="Trains the vocoder from the synthesizer audios and the GTA synthesized mels, " 10 | "or ground truth mels.", 11 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 12 | ) 13 | 14 | parser.add_argument("run_id", type=str, help= \ 15 | "Name for this model instance. If a model state from the same run ID was previously " 16 | "saved, the training will restart from there. Pass -f to overwrite saved states and " 17 | "restart from scratch.") 18 | parser.add_argument("datasets_root", type=str, help= \ 19 | "Path to the directory containing your SV2TTS directory. Specifying --syn_dir or --voc_dir " 20 | "will take priority over this argument.") 21 | parser.add_argument("--syn_dir", type=str, default=argparse.SUPPRESS, help= \ 22 | "Path to the synthesizer directory that contains the ground truth mel spectrograms, " 23 | "the wavs and the embeds. Defaults to /SV2TTS/synthesizer/.") 24 | parser.add_argument("--voc_dir", type=str, default=argparse.SUPPRESS, help= \ 25 | "Path to the vocoder directory that contains the GTA synthesized mel spectrograms. " 26 | "Defaults to /SV2TTS/vocoder/. Unused if --ground_truth is passed.") 27 | parser.add_argument("-m", "--models_dir", type=str, default="vocoder/saved_models/", help=\ 28 | "Path to the directory that will contain the saved model weights, as well as backups " 29 | "of those weights and wavs generated during training.") 30 | parser.add_argument("-g", "--ground_truth", action="store_true", help= \ 31 | "Train on ground truth spectrograms (/SV2TTS/synthesizer/mels).") 32 | parser.add_argument("-s", "--save_every", type=int, default=1000, help= \ 33 | "Number of steps between updates of the model on the disk. Set to 0 to never save the " 34 | "model.") 35 | parser.add_argument("-b", "--backup_every", type=int, default=25000, help= \ 36 | "Number of steps between backups of the model. 
Set to 0 to never make backups of the " 37 | "model.") 38 | parser.add_argument("-f", "--force_restart", action="store_true", help= \ 39 | "Do not load any saved model and restart from scratch.") 40 | args = parser.parse_args() 41 | 42 | # Process the arguments 43 | if not hasattr(args, "syn_dir"): 44 | args.syn_dir = Path(args.datasets_root, "SV2TTS", "synthesizer") 45 | args.syn_dir = Path(args.syn_dir) 46 | if not hasattr(args, "voc_dir"): 47 | args.voc_dir = Path(args.datasets_root, "SV2TTS", "vocoder") 48 | args.voc_dir = Path(args.voc_dir) 49 | del args.datasets_root 50 | args.models_dir = Path(args.models_dir) 51 | args.models_dir.mkdir(exist_ok=True) 52 | 53 | # Run the training 54 | print_args(args, parser) 55 | train(**vars(args)) 56 | --------------------------------------------------------------------------------
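To tie the two entry points above together, here is a minimal sketch of the intended workflow; datasets_root and my_run are placeholder names, not paths taken from the repository. vocoder_preprocess.py first writes ground-truth aligned (GTA) mels under <datasets_root>/SV2TTS/vocoder, after which training can be started either through vocoder_train.py or by calling vocoder.train.train directly with the same defaults. A CUDA-capable GPU is required, since the model is built with .cuda().

from pathlib import Path
from vocoder.train import train

datasets_root = Path("datasets_root")        # placeholder: your SV2TTS datasets root
models_dir = Path("vocoder/saved_models/")   # same default as vocoder_train.py
models_dir.mkdir(exist_ok=True)

# Mirrors the defaults of vocoder_train.py: GTA mels from SV2TTS/vocoder,
# audio and metadata from SV2TTS/synthesizer.
train(run_id="my_run",
      syn_dir=datasets_root.joinpath("SV2TTS", "synthesizer"),
      voc_dir=datasets_root.joinpath("SV2TTS", "vocoder"),
      models_dir=models_dir,
      ground_truth=False,
      save_every=1000,
      backup_every=25000,
      force_restart=False)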