├── Modules ├── __init__.py ├── Aligner │ ├── __init__.py │ ├── README.md │ └── Reconstructor.py ├── Vocoder │ ├── __init__.py │ ├── README.md │ ├── SAN_LICENSE │ ├── HiFiGAN_LICENSE │ ├── Avocodo_LICENSE │ ├── FeatureMatchingLoss.py │ ├── Snake.py │ ├── AMP.py │ ├── BigVGAN.py │ ├── AdversarialLoss.py │ └── MelSpecLoss.py ├── GeneralLayers │ ├── __init__.py │ ├── README.md │ ├── Swish.py │ ├── MultiSequential.py │ ├── PositionwiseFeedForward.py │ ├── LayerNorm.py │ ├── Convolution.py │ ├── LengthRegulator.py │ ├── ResidualStack.py │ ├── MultiLayeredConv1d.py │ ├── VariancePredictor.py │ ├── ResidualBlock.py │ ├── STFT.py │ ├── ConditionalLayerNorm.py │ └── EncoderLayer.py ├── ToucanTTS │ ├── __init__.py │ ├── README.md │ ├── DurationCalculator.py │ ├── StochasticToucanTTSLoss.py │ ├── glow_utils.py │ ├── ToucanTTSLoss.py │ ├── LanguageEmbeddingSpaceStructureLoss.py │ ├── CodecDiscriminator.py │ ├── EnergyCalculator.py │ ├── toucantts_train_loop_arbiter.py │ ├── PitchCalculator.py │ ├── wavenet.py │ └── flow_matching.py ├── ControllabilityGAN │ ├── __init__.py │ ├── wgan │ │ ├── __init__.py │ │ ├── resnet_init.py │ │ ├── init_weights.py │ │ └── init_wgan.py │ ├── dataset │ │ ├── __init__.py │ │ └── speaker_embeddings_dataset.py │ └── GAN.py ├── EmbeddingModel │ ├── __init__.py │ ├── README.md │ ├── StyleEmbedding.py │ └── StyleTTSEncoder.py └── README.md ├── Recipes ├── __init__.py ├── README.md ├── ToucanTTS_Nancy.py ├── ToucanTTS_Massive_English_stage2.py ├── ToucanTTS_IntegrationTest.py ├── BigVGAN_e2e.py ├── HiFiGAN_e2e.py ├── finetuning_example_simple.py ├── ToucanTTS_Massive_German.py └── finetuning_example_multilingual.py ├── Utility ├── __init__.py ├── storage_config.py ├── toucan.png ├── README.md ├── weight_averaging.py ├── WarmupScheduler.py ├── corpus_preparation.py ├── silence_removal.py └── diverse_losses.py ├── Preprocessing ├── __init__.py ├── Codec │ ├── __init__.py │ ├── README.md │ └── encodec.py ├── multilinguality │ ├── __init__.py │ ├── README.md │ ├── generate_zero_shot_lang_embs.py │ └── visualize_distances.py └── README.md ├── InferenceInterfaces ├── __init__.py └── README.md ├── .gitignore ├── run_scorer.py ├── requirements.txt ├── run_prosody_override.py ├── .github └── FUNDING.yml ├── run_zero_shot_lang_emb_injection.py ├── run_simple_GUI_demo.py ├── run_training_pipeline.py └── run_text_to_file_reader.py /Modules/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Recipes/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Utility/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Modules/Aligner/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Modules/Vocoder/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Preprocessing/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
/InferenceInterfaces/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Modules/GeneralLayers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Modules/ToucanTTS/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Preprocessing/Codec/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Modules/ControllabilityGAN/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Modules/EmbeddingModel/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Modules/ControllabilityGAN/wgan/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Preprocessing/multilinguality/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Modules/ControllabilityGAN/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Utility/storage_config.py: -------------------------------------------------------------------------------- 1 | MODEL_DIR = "Models/" 2 | PREPROCESSING_DIR = "Corpora/" 3 | -------------------------------------------------------------------------------- /Utility/toucan.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0xSojalSec/IMS-Toucan/HEAD/Utility/toucan.png -------------------------------------------------------------------------------- /Modules/Vocoder/README.md: -------------------------------------------------------------------------------- 1 | This directory contains the code needed to train a HiFiGAN vocoder on the spectrogram representation that we use. -------------------------------------------------------------------------------- /Preprocessing/README.md: -------------------------------------------------------------------------------- 1 | This directory contains scripts that wrap around text processing and audio processing to allow for high-level 2 | interactions with the feature extraction. -------------------------------------------------------------------------------- /Modules/Aligner/README.md: -------------------------------------------------------------------------------- 1 | Everything that is concerned with training and using the aligner model is contained in this directory. It is recommended 2 | to use the universal aligner model that we supply in the GitHub releases. 
-------------------------------------------------------------------------------- /Modules/README.md: -------------------------------------------------------------------------------- 1 | This directory contains all the models that are used in this toolkit for various tasks. The models' directories contain 2 | their feature extractors, their datasets, their architectures, and their train loops. -------------------------------------------------------------------------------- /Modules/GeneralLayers/README.md: -------------------------------------------------------------------------------- 1 | This directory contains a collection of layers that are used both during training time and during inference time. Large 2 | portions of these layers are either taken directly from ESPnet or adapted from it. -------------------------------------------------------------------------------- /Utility/README.md: -------------------------------------------------------------------------------- 1 | This directory contains general utility scripts, including additional losses for certain tasks, the scheduler, the 2 | interfaces to the files on disk, and the scorer, which can help find samples that might be causing 3 | problems for the TTS. -------------------------------------------------------------------------------- /Preprocessing/Codec/README.md: -------------------------------------------------------------------------------- 1 | This code is taken from https://github.com/yangdongchao/AcademiCodec/tree/master 2 | 3 | It is their version of encodec that is sampled at 16kHz, which the original encodec repository does not offer. The 4 | download of the necessary files should happen automatically. -------------------------------------------------------------------------------- /InferenceInterfaces/README.md: -------------------------------------------------------------------------------- 1 | This directory contains interfaces that enable high-level interactions with trained TTS models, which are just loaded 2 | for different inference tasks, like cloning the exact prosody of a reference utterance, or simply reading a text out 3 | loud or writing it to an audio file. -------------------------------------------------------------------------------- /Modules/EmbeddingModel/README.md: -------------------------------------------------------------------------------- 1 | Everything that is concerned with the embedding model is contained in this directory. The embedding function does not 2 | have its own train loop, because it is always trained jointly with the TTS. Most of the time, however, it is used in a 3 | frozen state. We recommend using the embedding function that we publish in the GitHub releases. -------------------------------------------------------------------------------- /Modules/ToucanTTS/README.md: -------------------------------------------------------------------------------- 1 | This directory contains everything needed to extract features for our TTS model and train it. It contains a lot of 2 | designs from different origins, so bringing it all together, we call it ToucanTTS. 3 | 4 | In German, when somebody talks a lot, you can say that they have a big beak. And this system sure talks a lot, so it has 5 | to be the bird with the most prominent beak! 
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | tensorboard_logs/ 3 | Corpora/ 4 | Models/ 5 | audios/ 6 | Preprocessing/glottolog/ 7 | Preprocessing/multilinguality/datasets/ 8 | apex/ 9 | pretrained_models/ 10 | .tmp/ 11 | .vscode/ 12 | split/ 13 | singing/ 14 | toucan_conda_venv/ 15 | venv/ 16 | vis/ 17 | Utility/storage_config.py 18 | Preprocessing/multilinguality/distance_datasets 19 | 20 | 21 | *_graph 22 | app.py 23 | gradio* 24 | *playground* 25 | run_phonemizer.py 26 | 27 | *.pt 28 | *.out 29 | *.wav 30 | *.flac 31 | *.json 32 | *.pyc 33 | *.png 34 | *.pdf 35 | *.pkl 36 | *.gif -------------------------------------------------------------------------------- /Modules/GeneralLayers/Swish.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Johns Hopkins University (Shinji Watanabe) 2 | # Northwestern Polytechnical University (Pengcheng Guo) 3 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 4 | # Adapted by Florian Lux 2021 5 | 6 | import torch 7 | 8 | 9 | class Swish(torch.nn.Module): 10 | """ 11 | Construct a Swish activation function for Conformer. 12 | """ 13 | 14 | def forward(self, x): 15 | """ 16 | Return Swish activation function. 17 | """ 18 | return x * torch.sigmoid(x) 19 | -------------------------------------------------------------------------------- /Recipes/README.md: -------------------------------------------------------------------------------- 1 | This directory contains all the pipelines, which are called in the run_training_pipeline.py script. A pipeline is a 2 | wrapper around the train loop, that loads a dataset and sets hyperparameters and settings, which it then all forwards 3 | into the actual train loop of the corresponding task. Since the TTS train loops have an arbiter that 4 | decides whether the mono-lingual or the multi-lingual train loop will be run, this does not need to be decided in the 5 | pipeline. All datasets that belong to the same language should be combined into a concat dataset before being passed to 6 | the train loop function for the arbiter to work correctly. -------------------------------------------------------------------------------- /run_scorer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example use of the scorer utility to inspect data. 3 | (pre-trained models and cache files with already extracted features are required.) 
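The scorer computes the TTS loss for every sample in a prepared dataset cache, so that the samples with the highest loss can be inspected and, if necessary, removed from the cache.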
4 | """ 5 | 6 | from Utility.Scorer import TTSScorer 7 | from Utility.path_to_transcript_dicts import * 8 | from Utility.storage_config import PREPROCESSING_DIR 9 | 10 | exec_device = "cuda:8" # ADAPT THIS 11 | 12 | lang_id = "eng" 13 | tts_scorer = TTSScorer(path_to_model=None, device=exec_device) 14 | tts_scorer.score(path_to_toucantts_dataset=os.path.join(PREPROCESSING_DIR, "IntegrationTest"), lang_id=lang_id) 15 | tts_scorer.show_samples_with_highest_loss(20) 16 | tts_scorer.remove_samples_with_highest_loss(5) 17 | -------------------------------------------------------------------------------- /Modules/ControllabilityGAN/wgan/resnet_init.py: -------------------------------------------------------------------------------- 1 | from Modules.ControllabilityGAN.wgan.init_weights import weights_init_D 2 | from Modules.ControllabilityGAN.wgan.init_weights import weights_init_G 3 | from Modules.ControllabilityGAN.wgan.resnet_1 import ResNet_D 4 | from Modules.ControllabilityGAN.wgan.resnet_1 import ResNet_G 5 | 6 | 7 | def init_resnet(parameters): 8 | critic = ResNet_D(parameters['data_dim'][-1], parameters['size'], nfilter=parameters['nfilter'], nfilter_max=parameters['nfilter_max']) 9 | generator = ResNet_G(parameters['data_dim'][-1], parameters['z_dim'], parameters['size'], nfilter=parameters['nfilter'], 10 | nfilter_max=parameters['nfilter_max']) 11 | 12 | generator.apply(weights_init_G) 13 | critic.apply(weights_init_D) 14 | 15 | return generator, critic 16 | -------------------------------------------------------------------------------- /Modules/ControllabilityGAN/wgan/init_weights.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | 4 | def weights_init_D(m): 5 | classname = m.__class__.__name__ 6 | if classname.find('Conv') != -1: 7 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='leaky_relu') 8 | # nn.init.constant_(m.bias, 0) 9 | elif classname.find('BatchNorm') != -1: 10 | nn.init.constant_(m.weight, 1) 11 | nn.init.constant_(m.bias, 0) 12 | 13 | 14 | def weights_init_G(m): 15 | classname = m.__class__.__name__ 16 | if classname.find('Conv') != -1: 17 | nn.init.kaiming_normal_(m.weight, mode='fan_in', nonlinearity='leaky_relu') 18 | # nn.init.constant_(m.bias, 0) 19 | elif classname.find('BatchNorm') != -1: 20 | nn.init.constant_(m.weight, 1) 21 | nn.init.constant_(m.bias, 0) 22 | -------------------------------------------------------------------------------- /Modules/GeneralLayers/MultiSequential.py: -------------------------------------------------------------------------------- 1 | # Written by Shigeki Karita, 2019 2 | # Published under Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 3 | # Adapted by Florian Lux, 2021 4 | 5 | import torch 6 | 7 | 8 | class MultiSequential(torch.nn.Sequential): 9 | """ 10 | Multi-input multi-output torch.nn.Sequential. 11 | """ 12 | 13 | def forward(self, *args): 14 | """ 15 | Repeat. 16 | """ 17 | for m in self: 18 | args = m(*args) 19 | return args 20 | 21 | 22 | def repeat(N, fn): 23 | """ 24 | Repeat module N times. 25 | 26 | Args: 27 | N (int): Number of repeat time. 28 | fn (Callable): Function to generate module. 29 | 30 | Returns: 31 | MultiSequential: Repeated model instance. 
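        Note: every module produced by fn must accept and return the same tuple of arguments, because the outputs of one module are unpacked into the next (args = m(*args)).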
32 | """ 33 | return MultiSequential(*[fn(n) for n in range(N)]) 34 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch~=2.4.0 2 | torchaudio~=2.4.0 3 | torch_complex~=0.4.3 4 | epitran==1.24 5 | tqdm~=4.64.1 6 | scipy~=1.9.3 7 | librosa~=0.9.2 8 | praat-parselmouth~=0.4.2 9 | numpy~=1.23.4 10 | soundfile~=0.12.0 11 | pypinyin~=0.47.1 12 | pyloudnorm~=0.1.0 13 | cvxopt~=1.3.0 14 | sounddevice~=0.4.5 15 | matplotlib~=3.9.2 16 | phonemizer~=3.2.1 17 | gradio~=5.23.2 18 | pyqt5~=5.15.11 19 | pyqtgraph~=0.13.7 20 | wandb~=0.13.5 21 | speechbrain==0.5.13 22 | dragonmapper~=0.2.6 23 | alias_free_torch~=0.0.6 24 | dotwiz==0.4.0 25 | transphone==1.5.3 26 | phonepiece==1.4.2 27 | geopy==2.4.1 28 | einops==0.7.0 29 | datasets~=2.10.1 30 | pandas~=1.5.0 31 | rich~=13.4.2 32 | PyYAML~=6.0 33 | imageio~=2.34.0 34 | pykakasi~=2.2.1 35 | jamo~=0.4.1 36 | g2pk~=0.9.4 37 | pykan~=0.2.6 38 | huggingface-hub~=0.25.2 39 | pynput~=1.7.7 40 | PyAutoGUI~=0.9.54 41 | networkx~=3.3 42 | scikit-learn~=1.5.0 -------------------------------------------------------------------------------- /run_prosody_override.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from InferenceInterfaces.UtteranceCloner import UtteranceCloner 4 | 5 | if __name__ == '__main__': 6 | uc = UtteranceCloner(model_id=None, device="cuda" if torch.cuda.is_available() else "cpu") 7 | 8 | # What is said in path_to_reference_audio_for_intonation has to match the text in transcription_of_intonation_reference exactly! 9 | uc.clone_utterance(path_to_reference_audio_for_intonation="audios/speaker_references_for_testing/sad.wav", 10 | path_to_reference_audio_for_voice="audios/speaker_references_for_testing/female_mid_voice.wav", # the two reference audios can be the same, but don't have to be 11 | transcription_of_intonation_reference="This report is due tomorrow.", 12 | filename_of_result="audios/test_cloned.wav", 13 | lang="eng") 14 | -------------------------------------------------------------------------------- /Modules/GeneralLayers/PositionwiseFeedForward.py: -------------------------------------------------------------------------------- 1 | # Written by Shigeki Karita, 2019 2 | # Published under Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 3 | # Adapted by Florian Lux, 2021 4 | 5 | 6 | import torch 7 | 8 | 9 | class PositionwiseFeedForward(torch.nn.Module): 10 | """ 11 | Args: 12 | idim (int): Input dimension. 13 | hidden_units (int): The number of hidden units. 14 | dropout_rate (float): Dropout rate. 
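        The module computes w_2(dropout(activation(w_1(x)))), i.e. two linear projections with an activation and dropout in between, applied to every position independently.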
15 | 16 | """ 17 | 18 | def __init__(self, idim, hidden_units, dropout_rate, activation=torch.nn.ReLU()): 19 | super(PositionwiseFeedForward, self).__init__() 20 | self.w_1 = torch.nn.Linear(idim, hidden_units) 21 | self.w_2 = torch.nn.Linear(hidden_units, idim) 22 | self.dropout = torch.nn.Dropout(dropout_rate) 23 | self.activation = activation 24 | 25 | def forward(self, x): 26 | return self.w_2(self.dropout(self.activation(self.w_1(x)))) 27 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: [ Flux9665 ] # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] 4 | patreon: # Replace with a single Patreon username 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: # Replace with a single Ko-fi username 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry 12 | polar: # Replace with a single Polar username 13 | buy_me_a_coffee: # Replace with a single Buy Me a Coffee username 14 | thanks_dev: # Replace with a single thanks.dev username 15 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 16 | -------------------------------------------------------------------------------- /Modules/GeneralLayers/LayerNorm.py: -------------------------------------------------------------------------------- 1 | # Written by Shigeki Karita, 2019 2 | # Published under Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 3 | # Adapted by Florian Lux, 2021 4 | 5 | import torch 6 | 7 | 8 | class LayerNorm(torch.nn.LayerNorm): 9 | """ 10 | Layer normalization module. 11 | 12 | Args: 13 | nout (int): Output dim size. 14 | dim (int): Dimension to be normalized. 15 | """ 16 | 17 | def __init__(self, nout, dim=-1, eps=1e-12): 18 | """ 19 | Construct an LayerNorm object. 20 | """ 21 | super(LayerNorm, self).__init__(nout, eps=eps) 22 | self.dim = dim 23 | 24 | def forward(self, x): 25 | """ 26 | Apply layer normalization. 27 | 28 | Args: 29 | x (torch.Tensor): Input tensor. 30 | 31 | Returns: 32 | torch.Tensor: Normalized tensor. 33 | """ 34 | if self.dim == -1: 35 | return super(LayerNorm, self).forward(x) 36 | return super(LayerNorm, self).forward(x.transpose(1, -1)).transpose(1, -1) 37 | -------------------------------------------------------------------------------- /Modules/ToucanTTS/DurationCalculator.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Nagoya University (Tomoki Hayashi) 2 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 3 | # Adapted by Florian Lux 2021 4 | 5 | import matplotlib.pyplot as plt 6 | 7 | import torch 8 | 9 | 10 | class DurationCalculator(torch.nn.Module): 11 | 12 | def __init__(self, reduction_factor=1.0): 13 | super().__init__() 14 | 15 | @torch.no_grad() 16 | def forward(self, att_ws, vis=None): 17 | """ 18 | Convert alignment matrix to durations. 
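        The duration assigned to input token i is the number of output frames whose attention argmax falls on i; if a path is passed via vis, the alignment matrix is additionally saved there as a heatmap.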
19 | """ 20 | if vis is not None: 21 | plt.figure(figsize=(8, 4)) 22 | plt.imshow(att_ws.cpu().numpy(), interpolation='nearest', aspect='auto', origin="lower") 23 | plt.xlabel("Inputs") 24 | plt.ylabel("Outputs") 25 | plt.tight_layout() 26 | plt.savefig(vis) 27 | plt.close() 28 | # calculate duration from 2d alignment matrix 29 | durations = torch.stack([att_ws.argmax(-1).eq(i).sum() for i in range(att_ws.shape[1])]) 30 | return durations.view(-1) 31 | -------------------------------------------------------------------------------- /Modules/Vocoder/SAN_LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Sony Research Inc. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /Modules/Vocoder/HiFiGAN_LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2022 Rishikesh (ऋषिकेश) 4 | Modified 2022 by Florian Lux 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in 14 | all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 | THE SOFTWARE. 
-------------------------------------------------------------------------------- /Modules/Vocoder/Avocodo_LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2020 Tomoki Hayashi 4 | Modified 2021 by Florian Lux 5 | Further code integrated from 2022 Rishikesh (ऋषिकेश), same license applies 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in 15 | all copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 23 | THE SOFTWARE. -------------------------------------------------------------------------------- /Modules/Aligner/Reconstructor.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.multiprocessing 3 | 4 | from Utility.utils import make_non_pad_mask 5 | 6 | 7 | class Reconstructor(torch.nn.Module): 8 | 9 | def __init__(self, 10 | n_features=128, 11 | num_symbols=145, 12 | speaker_embedding_dim=192, 13 | hidden_dim=256): 14 | super().__init__() 15 | self.in_proj = torch.nn.Linear(num_symbols + speaker_embedding_dim, hidden_dim) 16 | self.hidden_proj = torch.nn.Linear(hidden_dim, hidden_dim) 17 | self.out_proj = torch.nn.Linear(hidden_dim, n_features) 18 | self.l1_criterion = torch.nn.L1Loss(reduction="none") 19 | 20 | def forward(self, x, lens, ys): 21 | x = self.in_proj(x) 22 | x = torch.nn.functional.leaky_relu(x) 23 | x = self.hidden_proj(x) 24 | x = torch.nn.functional.leaky_relu(x) 25 | x = self.out_proj(x) 26 | out_masks = make_non_pad_mask(lens).unsqueeze(-1).to(ys.device) 27 | out_weights = out_masks.float() / out_masks.sum(dim=1, keepdim=True).float() 28 | out_weights /= ys.size(0) * ys.size(2) 29 | return self.l1_criterion(x, ys).mul(out_weights).masked_select(out_masks).sum() 30 | 31 | 32 | if __name__ == '__main__': 33 | print(sum(p.numel() for p in Reconstructor().parameters() if p.requires_grad)) 34 | -------------------------------------------------------------------------------- /Modules/ToucanTTS/StochasticToucanTTSLoss.py: -------------------------------------------------------------------------------- 1 | """ 2 | Taken from ESPNet 3 | Adapted by Flux 4 | """ 5 | 6 | import torch 7 | 8 | from Utility.utils import make_non_pad_mask 9 | 10 | 11 | class StochasticToucanTTSLoss(torch.nn.Module): 12 | 13 | def __init__(self): 14 | super().__init__() 15 | self.l1_criterion = torch.nn.L1Loss(reduction="none") 16 | 17 | def forward(self, predicted_features, gold_features, features_lengths): 18 | """ 19 | Args: 20 | predicted_features (Tensor): Batch of outputs (B, Lmax, odim). 
21 | gold_features (Tensor): Batch of target features (B, Lmax, odim). 22 | features_lengths (LongTensor): Batch of the lengths of each target (B,). 23 | 24 | Returns: 25 | Tensor: L1 loss value. 26 | """ 27 | 28 | # calculate loss 29 | l1_loss = self.l1_criterion(predicted_features, gold_features) 30 | 31 | # make weighted mask and apply it 32 | out_masks = make_non_pad_mask(features_lengths).unsqueeze(-1).to(gold_features.device) 33 | out_masks = torch.nn.functional.pad(out_masks.transpose(1, 2), [0, gold_features.size(1) - out_masks.size(1), 0, 0, 0, 0], value=False).transpose(1, 2) 34 | out_weights = out_masks.float() / out_masks.sum(dim=1, keepdim=True).float() 35 | out_weights /= gold_features.size(0) * gold_features.size(2) 36 | 37 | # apply weight 38 | l1_loss = l1_loss.mul(out_weights).masked_select(out_masks).sum() 39 | 40 | return l1_loss 41 | -------------------------------------------------------------------------------- /Modules/ControllabilityGAN/wgan/init_wgan.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from Modules.ControllabilityGAN.wgan.resnet_init import init_resnet 4 | from Modules.ControllabilityGAN.wgan.wgan_qc import WassersteinGanQuadraticCost 5 | 6 | 7 | def create_wgan(parameters, device, optimizer='adam'): 8 | if parameters['model'] == "resnet": 9 | generator, discriminator = init_resnet(parameters) 10 | else: 11 | raise NotImplementedError 12 | 13 | if optimizer == 'adam': 14 | optimizer_g = torch.optim.Adam(generator.parameters(), lr=parameters['learning_rate'], betas=parameters['betas']) 15 | optimizer_d = torch.optim.Adam(discriminator.parameters(), lr=parameters['learning_rate'], betas=parameters['betas']) 16 | elif optimizer == 'rmsprop': 17 | optimizer_g = torch.optim.RMSprop(generator.parameters(), lr=parameters['learning_rate']) 18 | optimizer_d = torch.optim.RMSprop(discriminator.parameters(), lr=parameters['learning_rate']) 19 | 20 | criterion = torch.nn.MSELoss() 21 | 22 | gan = WassersteinGanQuadraticCost(generator, 23 | discriminator, 24 | optimizer_g, 25 | optimizer_d, 26 | criterion=criterion, 27 | data_dimensions=parameters['data_dim'], 28 | epochs=parameters['epochs'], 29 | batch_size=parameters['batch_size'], 30 | device=device, 31 | n_max_iterations=parameters['n_max_iterations'], 32 | gamma=parameters['gamma']) 33 | 34 | return gan 35 | -------------------------------------------------------------------------------- /Preprocessing/multilinguality/README.md: -------------------------------------------------------------------------------- 1 | ## Zero-Shot Approximation of Language Embeddings 2 | 3 | This directory contains all scripts needed to reproduce the zero-shot meta-learning part of our system. 4 | These scripts allow you to predict representations of languages purely based on distances between them, as measured by a 5 | variety of linguistically informed metrics, or even better, a learned combination thereof. 6 | 7 | ### Applying zero-shot approximation to a trained model 8 | 9 | Use `run_zero_shot_lang_emb_injection.py` to update the language embeddings of a trained model for all languages that 10 | were *not* seen during training (by default, `supervised_languages.json` is used to determine which languages *were* 11 | seen). 12 | See the script for arguments that can be passed (e.g. to use a custom model path). 
Here is an example: 13 | 14 | ``` 15 | cd IMS-Toucan/ 16 | python run_zero_shot_lang_emb_injection.py -m <model_path> -d <distance_type> -k <n_closest> 17 | ``` 18 | 19 | By default, the updated model is saved with a modified filename in the same directory. 20 | 21 | ### Cached distance lookups 22 | 23 | In order to apply any zero-shot approximation, cache files for distance lookups are required. 24 | 25 | The ASP lookup file (`asp_dict.pkl`) needs to be downloaded from the release page. All other cache files are 26 | automatically generated as required when running `run_zero_shot_lang_emb_injection.py`. 27 | 28 | **Note:** While the map, tree, and inverse ASP distances are model independent, **the learned distance lookup is only 29 | applicable for the model it was trained on**, i.e., different Toucan models require different learned-distance lookups. 30 | If you want to apply zero-shot approximation to a new model, make sure that you are not using an outdated, pre-existing 31 | learned distance lookup, but instead train a new learned distance metric. 32 | -------------------------------------------------------------------------------- /run_zero_shot_lang_emb_injection.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from huggingface_hub import hf_hub_download 4 | 5 | from Preprocessing.multilinguality.create_distance_lookups import CacheCreator 6 | from Preprocessing.multilinguality.create_lang_dist_dataset import LangDistDatasetCreator 7 | from Preprocessing.multilinguality.generate_zero_shot_lang_embs import approximate_and_inject_language_embeddings 8 | from Utility.storage_config import MODEL_DIR 9 | 10 | if __name__ == "__main__": 11 | default_model_path = hf_hub_download(cache_dir=MODEL_DIR, repo_id="Flux9665/ToucanTTS", filename="ToucanTTS.pt") 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument("--model_path", "-m", type=str, default=default_model_path, help="model path from which to obtain pretrained language embeddings") 14 | parser.add_argument("--distance_type", "-d", type=str, choices=["map", "tree", "asp", "learned", "combined"], default="learned", 15 | help="which type of distance to use for finding nearest languages") 16 | parser.add_argument("--n_closest", "-k", type=int, default=50, help="how many nearest languages to select for language embedding approximation") 17 | 18 | args = parser.parse_args() 19 | 20 | # make sure that cache files exist 21 | cc = CacheCreator(cache_root="Preprocessing/multilinguality") 22 | cc.create_required_files(model_path=hf_hub_download(cache_dir=MODEL_DIR, repo_id="Flux9665/ToucanTTS", filename="ToucanTTS.pt")) 23 | 24 | # create distance dataset 25 | dc = LangDistDatasetCreator(args.model_path, cache_root="Preprocessing/multilinguality") 26 | distance_dataset = dc.create_dataset(args.distance_type, n_closest=args.n_closest, zero_shot=True) 27 | 28 | # generate zero-shot lang embs and inject into pretrained model, then save to modified model path 29 | approximate_and_inject_language_embeddings(model_path=args.model_path, 30 | df=distance_dataset, 31 | iso_lookup=dc.iso_lookup) 32 | -------------------------------------------------------------------------------- /Modules/Vocoder/FeatureMatchingLoss.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Tomoki Hayashi 2 | # MIT License (https://opensource.org/licenses/MIT) 3 | # Adapted by Florian Lux 2021 4 | 5 | 6 | import torch 7 | import torch.nn.functional as F 8 | 9 | 10 | def feature_loss(fmap_r, 
fmap_g): 11 | loss = 0 12 | for dr, dg in zip(fmap_r, fmap_g): 13 | loss += torch.mean(torch.abs(dr - dg)) 14 | 15 | return loss / len(fmap_g) 16 | 17 | 18 | class FeatureMatchLoss(torch.nn.Module): 19 | 20 | def __init__(self, 21 | average_by_layers=True, 22 | average_by_discriminators=False, 23 | include_final_outputs=False, ): 24 | super().__init__() 25 | self.average_by_layers = average_by_layers 26 | self.average_by_discriminators = average_by_discriminators 27 | self.include_final_outputs = include_final_outputs 28 | 29 | def forward(self, feats_hat, feats): 30 | """ 31 | Calculate feature matching loss. 32 | 33 | Args: 34 | feats_hat (list): List of lists of discriminator outputs 35 | calculated from generator outputs. 36 | feats (list): List of lists of discriminator outputs 37 | calculated from ground-truth. 38 | 39 | Returns: 40 | Tensor: Feature matching loss value. 41 | """ 42 | feat_match_loss = 0.0 43 | for i, (feats_hat_, feats_) in enumerate(zip(feats_hat, feats)): 44 | feat_match_loss_ = 0.0 45 | if not self.include_final_outputs: 46 | feats_hat_ = feats_hat_[:-1] 47 | feats_ = feats_[:-1] 48 | for j, (feat_hat_, feat_) in enumerate(zip(feats_hat_, feats_)): 49 | feat_match_loss_ += F.l1_loss(feat_hat_, feat_.detach()) 50 | if self.average_by_layers: 51 | feat_match_loss_ /= j + 1 52 | feat_match_loss += feat_match_loss_ 53 | if self.average_by_discriminators: 54 | feat_match_loss /= i + 1 55 | 56 | return feat_match_loss 57 | -------------------------------------------------------------------------------- /Modules/GeneralLayers/Convolution.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Johns Hopkins University (Shinji Watanabe) 2 | # Northwestern Polytechnical University (Pengcheng Guo) 3 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 4 | # Adapted by Florian Lux 2021 5 | 6 | 7 | from torch import nn 8 | 9 | 10 | class ConvolutionModule(nn.Module): 11 | """ 12 | ConvolutionModule in Conformer model. 13 | 14 | Args: 15 | channels (int): The number of channels of conv layers. 16 | kernel_size (int): Kernel size of conv layers. 17 | 18 | """ 19 | 20 | def __init__(self, channels, kernel_size, activation=nn.ReLU(), bias=True): 21 | super(ConvolutionModule, self).__init__() 22 | # kernel_size should be an odd number for 'SAME' padding 23 | assert (kernel_size - 1) % 2 == 0 24 | 25 | self.pointwise_conv1 = nn.Conv1d(channels, 2 * channels, kernel_size=1, stride=1, padding=0, bias=bias, ) 26 | self.depthwise_conv = nn.Conv1d(channels, channels, kernel_size, stride=1, padding=(kernel_size - 1) // 2, groups=channels, bias=bias, ) 27 | self.norm = nn.SyncBatchNorm.convert_sync_batchnorm(nn.BatchNorm1d(channels)) 28 | self.pointwise_conv2 = nn.Conv1d(channels, channels, kernel_size=1, stride=1, padding=0, bias=bias, ) 29 | self.activation = activation 30 | 31 | def forward(self, x): 32 | """ 33 | Compute convolution module. 34 | 35 | Args: 36 | x (torch.Tensor): Input tensor (#batch, time, channels). 37 | 38 | Returns: 39 | torch.Tensor: Output tensor (#batch, time, channels). 
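        The sequence of operations is: pointwise convolution with GLU gating, depthwise convolution, batch normalization followed by the activation, and a final pointwise convolution, with the time and feature dimensions transposed before and after.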
40 | 41 | """ 42 | # exchange the temporal dimension and the feature dimension 43 | x = x.transpose(1, 2) 44 | 45 | # GLU mechanism 46 | x = self.pointwise_conv1(x) # (batch, 2*channel, dim) 47 | x = nn.functional.glu(x, dim=1) # (batch, channel, dim) 48 | 49 | # 1D Depthwise Conv 50 | x = self.depthwise_conv(x) 51 | x = self.activation(self.norm(x)) 52 | 53 | x = self.pointwise_conv2(x) 54 | 55 | return x.transpose(1, 2) 56 | -------------------------------------------------------------------------------- /Modules/GeneralLayers/LengthRegulator.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Tomoki Hayashi 2 | # MIT License (https://opensource.org/licenses/MIT) 3 | # Adapted by Florian Lux 2021 4 | 5 | from abc import ABC 6 | 7 | import torch 8 | 9 | from Utility.utils import pad_list 10 | 11 | 12 | class LengthRegulator(torch.nn.Module, ABC): 13 | """ 14 | Length regulator module for feed-forward Transformer. 15 | 16 | This is a module of length regulator described in 17 | `FastSpeech: Fast, Robust and Controllable Text to Speech`_. 18 | The length regulator expands char or 19 | phoneme-level embedding features to frame-level by repeating each 20 | feature based on the corresponding predicted durations. 21 | 22 | .. _`FastSpeech: Fast, Robust and Controllable Text to Speech`: 23 | https://arxiv.org/pdf/1905.09263.pdf 24 | 25 | """ 26 | 27 | def __init__(self, pad_value=0.0): 28 | """ 29 | Initialize length regulator module. 30 | 31 | Args: 32 | pad_value (float, optional): Value used for padding. 33 | """ 34 | super(LengthRegulator, self).__init__() 35 | self.pad_value = pad_value 36 | 37 | def forward(self, xs, ds, alpha=1.0): 38 | """ 39 | Calculate forward propagation. 40 | Args: 41 | xs (Tensor): Batch of sequences of char or phoneme embeddings (B, Tmax, D). 42 | ds (LongTensor): Batch of durations of each frame (B, T). 43 | alpha (float, optional): Alpha value to control speed of speech. 44 | Returns: 45 | Tensor: replicated input tensor based on durations (B, T*, D). 46 | """ 47 | 48 | if alpha != 1.0: 49 | assert alpha > 0 50 | ds = torch.round(ds.float() * alpha).long() 51 | 52 | if ds.sum() == 0: 53 | ds[ds.sum(dim=1).eq(0)] = 1 54 | 55 | return pad_list([self._repeat_one_sequence(x, d) for x, d in zip(xs, ds)], self.pad_value) 56 | 57 | def _repeat_one_sequence(self, x, d): 58 | """ 59 | Repeat each frame according to duration 60 | """ 61 | d = torch.clamp(d, min=0) 62 | return torch.repeat_interleave(x, d, dim=0) 63 | -------------------------------------------------------------------------------- /Modules/ToucanTTS/glow_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | MIT License 3 | 4 | Copyright (c) 2022 Yi Ren 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 
15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | """ 24 | 25 | import torch 26 | 27 | 28 | def squeeze(x, nonpadding=None, n_sqz=2): 29 | b, c, t = x.size() 30 | 31 | t = (t // n_sqz) * n_sqz 32 | x = x[:, :, :t] 33 | x_sqz = x.view(b, c, t // n_sqz, n_sqz) 34 | x_sqz = x_sqz.permute(0, 3, 1, 2).contiguous().view(b, c * n_sqz, t // n_sqz) 35 | 36 | if nonpadding is not None: 37 | nonpadding = nonpadding[:, :, n_sqz - 1::n_sqz] 38 | else: 39 | nonpadding = torch.ones(b, 1, t // n_sqz).to(device=x.device, dtype=x.dtype) 40 | return x_sqz * nonpadding, nonpadding 41 | 42 | 43 | def unsqueeze(x, nonpadding=None, n_sqz=2): 44 | b, c, t = x.size() 45 | 46 | x_unsqz = x.view(b, n_sqz, c // n_sqz, t) 47 | x_unsqz = x_unsqz.permute(0, 2, 3, 1).contiguous().view(b, c // n_sqz, t * n_sqz) 48 | 49 | if nonpadding is not None: 50 | nonpadding = nonpadding.unsqueeze(-1).repeat(1, 1, 1, n_sqz).view(b, 1, t * n_sqz) 51 | else: 52 | nonpadding = torch.ones(b, 1, t * n_sqz).to(device=x.device, dtype=x.dtype) 53 | return x_unsqz * nonpadding, nonpadding 54 | -------------------------------------------------------------------------------- /Preprocessing/Codec/encodec.py: -------------------------------------------------------------------------------- 1 | import math 2 | import random 3 | 4 | import numpy as np 5 | import torch.nn as nn 6 | 7 | from Preprocessing.Codec.seanet import SEANetDecoder 8 | from Preprocessing.Codec.seanet import SEANetEncoder 9 | from Preprocessing.Codec.vq import ResidualVectorQuantizer 10 | 11 | 12 | # Generator 13 | class EnCodec(nn.Module): 14 | def __init__(self, 15 | n_filters, 16 | D, 17 | target_bandwidths=[1, 1.5, 2, 4, 6, 12], 18 | ratios=[8, 5, 4, 2], 19 | sample_rate=16000, 20 | bins=1024, 21 | normalize=False): 22 | super().__init__() 23 | self.hop_length = np.prod(ratios) # product of the strides, i.e. the total downsampling factor 24 | self.encoder = SEANetEncoder(n_filters=n_filters, dimension=D, ratios=ratios) 25 | n_q = int(1000 * target_bandwidths[-1] // (math.ceil(sample_rate / self.hop_length) * 10)) 26 | self.frame_rate = math.ceil(sample_rate / np.prod(ratios)) # 50 Hz for 16 kHz audio with these ratios 27 | self.bits_per_codebook = int(math.log2(bins)) 28 | self.target_bandwidths = target_bandwidths 29 | self.quantizer = ResidualVectorQuantizer(dimension=D, n_q=n_q, bins=bins) 30 | self.decoder = SEANetDecoder(n_filters=n_filters, dimension=D, ratios=ratios) 31 | 32 | def get_last_layer(self): 33 | return self.decoder.layers[-1].weight 34 | 35 | def forward(self, x): 36 | e = self.encoder(x) 37 | max_idx = len(self.target_bandwidths) - 1 38 | bw = self.target_bandwidths[random.randint(0, max_idx)] 39 | quantized, codes, bandwidth, commit_loss = self.quantizer(e, self.frame_rate, bw) 40 | o = self.decoder(quantized) 41 | return o, commit_loss, None 42 | 43 | def encode(self, x, target_bw=None, st=None): 44 | e = self.encoder(x) 45 | if target_bw is None: 46 | bw = self.target_bandwidths[-1] 47 | else: 48 | bw = target_bw 49 | if st is None: 50 | st = 0 51 | codes = self.quantizer.encode(e, self.frame_rate, bw, st) 52 | return codes 53 | 54 | def decode(self, codes): 55 | quantized = 
self.quantizer.decode(codes) 56 | o = self.decoder(quantized) 57 | return o 58 | -------------------------------------------------------------------------------- /Modules/GeneralLayers/ResidualStack.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Tomoki Hayashi 2 | # MIT License (https://opensource.org/licenses/MIT) 3 | # Adapted by Florian Lux 2021 4 | 5 | 6 | import torch 7 | 8 | 9 | class ResidualStack(torch.nn.Module): 10 | 11 | def __init__(self, kernel_size=3, channels=32, dilation=1, bias=True, nonlinear_activation="LeakyReLU", nonlinear_activation_params={"negative_slope": 0.2}, 12 | pad="ReflectionPad1d", pad_params={}, ): 13 | """ 14 | Initialize ResidualStack module. 15 | 16 | Args: 17 | kernel_size (int): Kernel size of dilation convolution layer. 18 | channels (int): Number of channels of convolution layers. 19 | dilation (int): Dilation factor. 20 | bias (bool): Whether to add bias parameter in convolution layers. 21 | nonlinear_activation (str): Activation function module name. 22 | nonlinear_activation_params (dict): Hyperparameters for activation function. 23 | pad (str): Padding function module name before dilated convolution layer. 24 | pad_params (dict): Hyperparameters for padding function. 25 | 26 | """ 27 | super(ResidualStack, self).__init__() 28 | 29 | # define residual stack part 30 | assert (kernel_size - 1) % 2 == 0, "Even number kernel size is not supported." 31 | self.stack = torch.nn.Sequential(getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params), 32 | getattr(torch.nn, pad)((kernel_size - 1) // 2 * dilation, **pad_params), 33 | torch.nn.Conv1d(channels, channels, kernel_size, dilation=dilation, bias=bias), 34 | getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params), 35 | torch.nn.Conv1d(channels, channels, 1, bias=bias), ) 36 | 37 | # define extra layer for skip connection 38 | self.skip_layer = torch.nn.Conv1d(channels, channels, 1, bias=bias) 39 | 40 | def forward(self, c): 41 | """ 42 | Calculate forward propagation. 43 | 44 | Args: 45 | c (Tensor): Input tensor (B, channels, T). 46 | 47 | Returns: 48 | Tensor: Output tensor (B, channels, T). 49 | 50 | """ 51 | return self.stack(c) + self.skip_layer(c) 52 | -------------------------------------------------------------------------------- /Modules/Vocoder/Snake.py: -------------------------------------------------------------------------------- 1 | # Implementation adapted from https://github.com/EdwardDixon/snake under the MIT license. 2 | 3 | import torch 4 | from torch import nn 5 | from torch import pow 6 | from torch import sin 7 | from torch.nn import Parameter 8 | 9 | 10 | class SnakeBeta(nn.Module): 11 | """ 12 | A modified Snake function which uses separate parameters for the magnitude of the periodic components 13 | Shape: 14 | - Input: (B, C, T) 15 | - Output: (B, C, T), same shape as the input 16 | Parameters: 17 | - alpha - trainable parameter that controls frequency 18 | - beta - trainable parameter that controls magnitude 19 | References: 20 | - This activation function is a modified version based on this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda: 21 | https://arxiv.org/abs/2006.08195 22 | """ 23 | 24 | def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False): 25 | """ 26 | Initialization. 
27 | INPUT: 28 | - in_features: shape of the input 29 | - alpha - trainable parameter that controls frequency 30 | - beta - trainable parameter that controls magnitude 31 | alpha is initialized to 1 by default, higher values = higher-frequency. 32 | beta is initialized to 1 by default, higher values = higher-magnitude. 33 | alpha will be trained along with the rest of your model. 34 | """ 35 | super(SnakeBeta, self).__init__() 36 | self.in_features = in_features 37 | 38 | # initialize alpha 39 | self.alpha_logscale = alpha_logscale 40 | if self.alpha_logscale: # log scale alphas initialized to zeros 41 | self.alpha = Parameter(torch.zeros(in_features) * alpha) 42 | self.beta = Parameter(torch.zeros(in_features) * alpha) 43 | else: # linear scale alphas initialized to ones 44 | self.alpha = Parameter(torch.ones(in_features) * alpha) 45 | self.beta = Parameter(torch.ones(in_features) * alpha) 46 | 47 | self.alpha.requires_grad = alpha_trainable 48 | self.beta.requires_grad = alpha_trainable 49 | 50 | self.no_div_by_zero = 0.000000001 51 | 52 | def forward(self, x): 53 | """ 54 | Forward pass of the function. 55 | Applies the function to the input elementwise. 56 | SnakeBeta ∶= x + 1/b * sin^2 (xa) 57 | """ 58 | alpha = self.alpha.unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T] 59 | beta = self.beta.unsqueeze(0).unsqueeze(-1) 60 | if self.alpha_logscale: 61 | alpha = torch.exp(alpha) 62 | beta = torch.exp(beta) 63 | x = x + (1.0 / (beta + self.no_div_by_zero)) * pow(sin(x * alpha), 2) 64 | 65 | return x 66 | -------------------------------------------------------------------------------- /Recipes/ToucanTTS_Nancy.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import torch 4 | import wandb 5 | 6 | from Utility.path_to_transcript_dicts import * 7 | 8 | 9 | def run(gpu_id, resume_checkpoint, finetune, model_dir, resume, use_wandb, wandb_resume_id, gpu_count): 10 | from torch.utils.data import ConcatDataset 11 | 12 | from Modules.ToucanTTS.ToucanTTS import ToucanTTS 13 | from Modules.ToucanTTS.toucantts_train_loop_arbiter import train_loop 14 | from Utility.corpus_preparation import prepare_tts_corpus 15 | from Utility.storage_config import MODEL_DIR 16 | from Utility.storage_config import PREPROCESSING_DIR 17 | 18 | if gpu_id == "cpu": 19 | device = torch.device("cpu") 20 | else: 21 | device = torch.device("cuda") 22 | 23 | print("Preparing") 24 | 25 | if model_dir is not None: 26 | save_dir = model_dir 27 | else: 28 | save_dir = os.path.join(MODEL_DIR, "ToucanTTS_Nancy") 29 | os.makedirs(save_dir, exist_ok=True) 30 | 31 | if gpu_count > 1: 32 | rank = int(os.environ["LOCAL_RANK"]) 33 | torch.cuda.set_device(rank) 34 | torch.distributed.init_process_group(backend="nccl") 35 | else: 36 | rank = 0 37 | 38 | train_set = prepare_tts_corpus(transcript_dict=build_path_to_transcript_nancy(), 39 | corpus_dir=os.path.join(PREPROCESSING_DIR, "Nancy"), 40 | lang="eng", 41 | save_imgs=False, 42 | gpu_count=gpu_count, 43 | rank=rank) 44 | 45 | model = ToucanTTS() 46 | 47 | if gpu_count > 1: 48 | model.to(rank) 49 | model = torch.nn.parallel.DistributedDataParallel( 50 | model, 51 | device_ids=[rank], 52 | output_device=rank, 53 | find_unused_parameters=True, 54 | ) 55 | torch.distributed.barrier() 56 | train_sampler = torch.utils.data.RandomSampler(train_set) 57 | 58 | if use_wandb: 59 | if rank == 0: 60 | wandb.init( 61 | name=f"{__name__.split('.')[-1]}_{time.strftime('%Y%m%d-%H%M%S')}" if wandb_resume_id is None else None, 62 | 
id=wandb_resume_id, # this is None if not specified in the command line arguments. 63 | resume="must" if wandb_resume_id is not None else None) 64 | print("Training model") 65 | train_loop(net=model, 66 | datasets=[train_set], 67 | device=device, 68 | warmup_steps=4000, 69 | steps=200000, 70 | batch_size=16, 71 | save_directory=save_dir, 72 | eval_lang="eng", 73 | path_to_checkpoint=resume_checkpoint, 74 | fine_tune=finetune, 75 | resume=resume, 76 | use_wandb=use_wandb, 77 | train_samplers=[train_sampler], 78 | gpu_count=gpu_count) 79 | if use_wandb: 80 | wandb.finish() 81 | -------------------------------------------------------------------------------- /Recipes/ToucanTTS_Massive_English_stage2.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import torch 4 | import wandb 5 | 6 | from Utility.path_to_transcript_dicts import * 7 | 8 | 9 | def run(gpu_id, resume_checkpoint, finetune, model_dir, resume, use_wandb, wandb_resume_id, gpu_count): 10 | from torch.utils.data import ConcatDataset 11 | 12 | from Modules.ToucanTTS.ToucanTTS import ToucanTTS 13 | from Modules.ToucanTTS.toucantts_train_loop_arbiter import train_loop 14 | from Utility.corpus_preparation import prepare_tts_corpus 15 | from Utility.storage_config import MODEL_DIR 16 | from Utility.storage_config import PREPROCESSING_DIR 17 | 18 | if gpu_id == "cpu": 19 | device = torch.device("cpu") 20 | else: 21 | device = torch.device("cuda") 22 | 23 | print("Preparing") 24 | 25 | if model_dir is not None: 26 | save_dir = model_dir 27 | else: 28 | save_dir = os.path.join(MODEL_DIR, "ToucanTTS_English_v4") 29 | os.makedirs(save_dir, exist_ok=True) 30 | 31 | if gpu_count > 1: 32 | rank = int(os.environ["LOCAL_RANK"]) 33 | torch.cuda.set_device(rank) 34 | torch.distributed.init_process_group(backend="nccl") 35 | else: 36 | rank = 0 37 | 38 | datasets = list() 39 | 40 | datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_libritts_all_clean, 41 | corpus_dir=os.path.join(PREPROCESSING_DIR, "libri_all_clean"), 42 | lang="eng", 43 | gpu_count=gpu_count, 44 | rank=rank)) 45 | 46 | train_set = ConcatDataset(datasets) 47 | 48 | model = ToucanTTS() 49 | 50 | if gpu_count > 1: 51 | model.to(rank) 52 | model = torch.nn.parallel.DistributedDataParallel( 53 | model, 54 | device_ids=[rank], 55 | output_device=rank, 56 | find_unused_parameters=True, 57 | ) 58 | torch.distributed.barrier() 59 | train_sampler = torch.utils.data.RandomSampler(train_set) 60 | 61 | if use_wandb: 62 | if rank == 0: 63 | wandb.init( 64 | name=f"{__name__.split('.')[-1]}_{time.strftime('%Y%m%d-%H%M%S')}" if wandb_resume_id is None else None, 65 | id=wandb_resume_id, # this is None if not specified in the command line arguments. 
66 | resume="must" if wandb_resume_id is not None else None) 67 | print("Training model") 68 | train_loop(net=model, 69 | datasets=[train_set], 70 | device=device, 71 | batch_size=12, 72 | steps_per_checkpoint=1000, 73 | save_directory=save_dir, 74 | eval_lang="eng", 75 | path_to_checkpoint=resume_checkpoint, 76 | fine_tune=finetune, 77 | resume=resume, 78 | use_wandb=use_wandb, 79 | train_samplers=[train_sampler], 80 | gpu_count=gpu_count) 81 | if use_wandb: 82 | wandb.finish() 83 | -------------------------------------------------------------------------------- /Recipes/ToucanTTS_IntegrationTest.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is basically an integration test 3 | """ 4 | 5 | import time 6 | 7 | import torch 8 | import wandb 9 | 10 | from Utility.path_to_transcript_dicts import * 11 | 12 | 13 | def run(gpu_id, resume_checkpoint, finetune, model_dir, resume, use_wandb, wandb_resume_id, gpu_count): 14 | from torch.utils.data import ConcatDataset 15 | 16 | from Modules.ToucanTTS.ToucanTTS import ToucanTTS 17 | from Modules.ToucanTTS.toucantts_train_loop_arbiter import train_loop 18 | from Utility.corpus_preparation import prepare_tts_corpus 19 | from Utility.storage_config import MODEL_DIR 20 | from Utility.storage_config import PREPROCESSING_DIR 21 | 22 | if gpu_id == "cpu": 23 | device = torch.device("cpu") 24 | else: 25 | device = torch.device("cuda") 26 | 27 | print("Preparing") 28 | 29 | if model_dir is not None: 30 | save_dir = model_dir 31 | else: 32 | save_dir = os.path.join(MODEL_DIR, "ToucanTTS_IntegrationTest") 33 | os.makedirs(save_dir, exist_ok=True) 34 | 35 | if gpu_count > 1: 36 | rank = int(os.environ["LOCAL_RANK"]) 37 | torch.cuda.set_device(rank) 38 | torch.distributed.init_process_group(backend="nccl") 39 | else: 40 | rank = 0 41 | 42 | train_set = prepare_tts_corpus(transcript_dict=build_path_to_transcript_integration_test(), 43 | corpus_dir=os.path.join(PREPROCESSING_DIR, "IntegrationTest"), 44 | lang="eng", 45 | save_imgs=True, 46 | gpu_count=gpu_count, 47 | rank=rank) 48 | 49 | model = ToucanTTS() 50 | 51 | if gpu_count > 1: 52 | model.to(rank) 53 | model = torch.nn.parallel.DistributedDataParallel( 54 | model, 55 | device_ids=[rank], 56 | output_device=rank, 57 | find_unused_parameters=True, 58 | ) 59 | torch.distributed.barrier() 60 | train_sampler = torch.utils.data.RandomSampler(train_set) 61 | 62 | if use_wandb: 63 | if rank == 0: 64 | wandb.init( 65 | name=f"{__name__.split('.')[-1]}_{time.strftime('%Y%m%d-%H%M%S')}" if wandb_resume_id is None else None, 66 | id=wandb_resume_id, # this is None if not specified in the command line arguments. 
67 | resume="must" if wandb_resume_id is not None else None) 68 | print("Training model") 69 | train_loop(net=model, 70 | datasets=[train_set], 71 | device=device, 72 | save_directory=save_dir, 73 | batch_size=8, 74 | eval_lang="eng", 75 | warmup_steps=500, 76 | path_to_checkpoint=resume_checkpoint, 77 | fine_tune=finetune, 78 | resume=resume, 79 | steps=5000, 80 | use_wandb=use_wandb, 81 | train_samplers=[train_sampler], 82 | gpu_count=gpu_count) 83 | if use_wandb: 84 | wandb.finish() 85 | -------------------------------------------------------------------------------- /Recipes/BigVGAN_e2e.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import torch 4 | import wandb 5 | 6 | from Utility.path_to_transcript_dicts import * 7 | 8 | 9 | def run(gpu_id, resume_checkpoint, finetune, resume, model_dir, use_wandb, wandb_resume_id, gpu_count): 10 | from Modules.Vocoder.BigVGAN import BigVGAN 11 | from Modules.Vocoder.HiFiGAN_Dataset import HiFiGANDataset 12 | from Modules.Vocoder.HiFiGAN_Discriminators import AvocodoHiFiGANJointDiscriminator 13 | from Modules.Vocoder.HiFiGAN_train_loop import train_loop 14 | from Utility.storage_config import MODEL_DIR 15 | 16 | if gpu_id == "cpu": 17 | device = torch.device("cpu") 18 | else: 19 | device = torch.device("cuda") 20 | 21 | if gpu_count > 1: 22 | print("Multi GPU training not supported for BigVGAN!") 23 | import sys 24 | sys.exit() 25 | 26 | print("Preparing") 27 | if model_dir is not None: 28 | model_save_dir = model_dir 29 | else: 30 | model_save_dir = os.path.join(MODEL_DIR, "BigVGAN_e2e") 31 | os.makedirs(model_save_dir, exist_ok=True) 32 | 33 | # To prepare the data, have a look at Modules/Vocoder/run_end-to-end_data_creation 34 | 35 | print("Collecting new data...") 36 | 37 | file_lists_for_this_run_combined = list() 38 | file_lists_for_this_run_combined_synthetic = list() 39 | 40 | fl = list(build_path_to_transcript_libritts_all_clean().keys()) 41 | fisher_yates_shuffle(fl) 42 | fisher_yates_shuffle(fl) 43 | for i, f in enumerate(fl): 44 | if os.path.exists(f.replace(".wav", "_synthetic_spec.pt")): 45 | file_lists_for_this_run_combined.append(f) 46 | file_lists_for_this_run_combined_synthetic.append(f.replace(".wav", "_synthetic_spec.pt")) 47 | print("filepaths collected") 48 | 49 | train_set = HiFiGANDataset(list_of_original_paths=file_lists_for_this_run_combined, 50 | list_of_synthetic_paths=file_lists_for_this_run_combined_synthetic) 51 | 52 | generator = BigVGAN() 53 | discriminator = AvocodoHiFiGANJointDiscriminator() 54 | 55 | print("Training model") 56 | if use_wandb: 57 | wandb.init( 58 | name=f"{__name__.split('.')[-1]}_{time.strftime('%Y%m%d-%H%M%S')}" if wandb_resume_id is None else None, 59 | id=wandb_resume_id, # this is None if not specified in the command line arguments. 
60 | resume="must" if wandb_resume_id is not None else None) 61 | train_loop(batch_size=16, 62 | epochs=5180000, 63 | generator=generator, 64 | discriminator=discriminator, 65 | train_dataset=train_set, 66 | device=device, 67 | epochs_per_save=1, 68 | model_save_dir=model_save_dir, 69 | path_to_checkpoint=resume_checkpoint, 70 | resume=resume, 71 | use_wandb=use_wandb, 72 | finetune=finetune) 73 | if use_wandb: 74 | wandb.finish() 75 | 76 | 77 | def fisher_yates_shuffle(lst): 78 | for i in range(len(lst) - 1, 0, -1): 79 | j = random.randint(0, i) 80 | lst[i], lst[j] = lst[j], lst[i] 81 | -------------------------------------------------------------------------------- /Modules/Vocoder/AMP.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 NVIDIA CORPORATION. 2 | # Licensed under the MIT license. 3 | 4 | # Adapted from https://github.com/jik876/hifi-gan under the MIT license. 5 | # LICENSE is in incl_licenses directory. 6 | 7 | 8 | from alias_free_torch import * 9 | from alias_free_torch import Activation1d 10 | from torch.nn import Conv1d 11 | from torch.nn.utils import remove_weight_norm 12 | from torch.nn.utils import weight_norm 13 | 14 | from Modules.Vocoder.Snake import SnakeBeta 15 | 16 | LRELU_SLOPE = 0.1 17 | 18 | 19 | class AMPBlock1(torch.nn.Module): 20 | def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): 21 | super(AMPBlock1, self).__init__() 22 | 23 | self.convs1 = nn.ModuleList([ 24 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], 25 | padding=get_padding(kernel_size, dilation[0]))), 26 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], 27 | padding=get_padding(kernel_size, dilation[1]))), 28 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], 29 | padding=get_padding(kernel_size, dilation[2]))) 30 | ]) 31 | self.convs1.apply(init_weights) 32 | 33 | self.convs2 = nn.ModuleList([ 34 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 35 | padding=get_padding(kernel_size, 1))), 36 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 37 | padding=get_padding(kernel_size, 1))), 38 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 39 | padding=get_padding(kernel_size, 1))) 40 | ]) 41 | self.convs2.apply(init_weights) 42 | 43 | self.num_layers = len(self.convs1) + len(self.convs2) # total number of conv layers 44 | 45 | self.activations = nn.ModuleList([ 46 | Activation1d( 47 | activation=SnakeBeta(channels, alpha_logscale=True)) 48 | for _ in range(self.num_layers) 49 | ]) 50 | 51 | def forward(self, x): 52 | acts1, acts2 = self.activations[::2], self.activations[1::2] 53 | for c1, c2, a1, a2 in zip(self.convs1, self.convs2, acts1, acts2): 54 | xt = a1(x) 55 | xt = c1(xt) 56 | xt = a2(xt) 57 | xt = c2(xt) 58 | x = xt + x 59 | 60 | return x 61 | 62 | def remove_weight_norm(self): 63 | for l in self.convs1: 64 | remove_weight_norm(l) 65 | for l in self.convs2: 66 | remove_weight_norm(l) 67 | 68 | 69 | def init_weights(m, mean=0.0, std=0.01): 70 | classname = m.__class__.__name__ 71 | if classname.find("Conv") != -1: 72 | m.weight.data.normal_(mean, std) 73 | 74 | 75 | def apply_weight_norm(m): 76 | classname = m.__class__.__name__ 77 | if classname.find("Conv") != -1: 78 | weight_norm(m) 79 | 80 | 81 | def get_padding(kernel_size, dilation=1): 82 | return int((kernel_size * dilation - dilation) / 2) 83 | 
-------------------------------------------------------------------------------- /Recipes/HiFiGAN_e2e.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import torch 4 | import wandb 5 | 6 | from Utility.path_to_transcript_dicts import * 7 | 8 | 9 | def run(gpu_id, resume_checkpoint, finetune, resume, model_dir, use_wandb, wandb_resume_id, gpu_count): 10 | from Modules.Vocoder.HiFiGAN_Discriminators import AvocodoHiFiGANJointDiscriminator 11 | from Modules.Vocoder.HiFiGAN_E2E_Dataset import HiFiGANDataset 12 | from Modules.Vocoder.HiFiGAN_Generator import HiFiGAN 13 | from Modules.Vocoder.HiFiGAN_train_loop import train_loop 14 | from Utility.storage_config import MODEL_DIR 15 | 16 | if gpu_id == "cpu": 17 | device = torch.device("cpu") 18 | else: 19 | device = torch.device("cuda") 20 | 21 | if gpu_count > 1: 22 | print("Multi GPU training not supported for HiFiGAN!") 23 | import sys 24 | sys.exit() 25 | 26 | print("Preparing") 27 | if model_dir is not None: 28 | model_save_dir = model_dir 29 | else: 30 | model_save_dir = os.path.join(MODEL_DIR, "HiFiGAN_e2e_scratch_direct_cont") 31 | os.makedirs(model_save_dir, exist_ok=True) 32 | 33 | # To prepare the data, have a look at Modules/Vocoder/run_end-to-end_data_creation 34 | 35 | print("Collecting new data...") 36 | 37 | file_lists_for_this_run_combined = list() 38 | file_lists_for_this_run_combined_synthetic = list() 39 | 40 | fl = list(build_path_to_transcript_libritts_all_clean().keys()) 41 | fisher_yates_shuffle(fl) 42 | fisher_yates_shuffle(fl) 43 | for i, f in enumerate(fl): 44 | if os.path.exists(f.replace(".wav", "_synthetic_spec.pt")): 45 | file_lists_for_this_run_combined.append(f) 46 | file_lists_for_this_run_combined_synthetic.append(f.replace(".wav", "_synthetic_spec.pt")) 47 | print("filepaths collected") 48 | 49 | train_set = HiFiGANDataset(list_of_original_paths=file_lists_for_this_run_combined, 50 | list_of_synthetic_paths=file_lists_for_this_run_combined_synthetic) 51 | 52 | generator = HiFiGAN() 53 | discriminator = AvocodoHiFiGANJointDiscriminator() 54 | 55 | print("Training model") 56 | if use_wandb: 57 | wandb.init( 58 | name=f"{__name__.split('.')[-1]}_{time.strftime('%Y%m%d-%H%M%S')}" if wandb_resume_id is None else None, 59 | id=wandb_resume_id, # this is None if not specified in the command line arguments. 
60 | resume="must" if wandb_resume_id is not None else None) 61 | train_loop(batch_size=24, 62 | epochs=5180000, 63 | generator=generator, 64 | discriminator=discriminator, 65 | train_dataset=train_set, 66 | device=device, 67 | epochs_per_save=1, 68 | model_save_dir=model_save_dir, 69 | path_to_checkpoint=resume_checkpoint, 70 | resume=resume, 71 | use_wandb=use_wandb, 72 | finetune=finetune) 73 | if use_wandb: 74 | wandb.finish() 75 | 76 | 77 | def fisher_yates_shuffle(lst): 78 | for i in range(len(lst) - 1, 0, -1): 79 | j = random.randint(0, i) 80 | lst[i], lst[j] = lst[j], lst[i] 81 | -------------------------------------------------------------------------------- /Utility/weight_averaging.py: -------------------------------------------------------------------------------- 1 | """ 2 | https://alexander-stasiuk.medium.com/pytorch-weights-averaging-e2c0fa611a0c 3 | """ 4 | 5 | import os 6 | 7 | import torch 8 | 9 | from Modules.ToucanTTS.InferenceToucanTTS import ToucanTTS 10 | from Modules.Vocoder.HiFiGAN_Generator import HiFiGAN 11 | 12 | 13 | def load_net_toucan(path): 14 | check_dict = torch.load(path, map_location=torch.device("cpu")) 15 | net = ToucanTTS(weights=check_dict["model"], config=check_dict["config"]) 16 | return net, check_dict["default_emb"] 17 | 18 | 19 | def load_net_bigvgan(path): 20 | check_dict = torch.load(path, map_location=torch.device("cpu")) 21 | net = HiFiGAN(weights=check_dict["generator"]) 22 | return net, None 23 | 24 | 25 | def get_n_recent_checkpoints_paths(checkpoint_dir, n=5): 26 | print("selecting checkpoints...") 27 | checkpoint_list = list() 28 | for el in os.listdir(checkpoint_dir): 29 | if el.endswith(".pt") and el.startswith("checkpoint_"): 30 | try: 31 | checkpoint_list.append(int(el.split(".")[0].split("_")[1])) 32 | except RuntimeError: 33 | pass 34 | if len(checkpoint_list) == 0: 35 | return None 36 | elif len(checkpoint_list) < n: 37 | n = len(checkpoint_list) 38 | checkpoint_list.sort(reverse=True) 39 | return [os.path.join(checkpoint_dir, "checkpoint_{}.pt".format(step)) for step in checkpoint_list[:n]] 40 | 41 | 42 | def average_checkpoints(list_of_checkpoint_paths, load_func): 43 | # COLLECT CHECKPOINTS 44 | if list_of_checkpoint_paths is None or len(list_of_checkpoint_paths) == 0: 45 | return None 46 | checkpoints_weights = {} 47 | model = None 48 | default_embed = None 49 | 50 | # LOAD CHECKPOINTS 51 | for path_to_checkpoint in list_of_checkpoint_paths: 52 | print("loading model {}".format(path_to_checkpoint)) 53 | model, default_embed = load_func(path=path_to_checkpoint) 54 | checkpoints_weights[path_to_checkpoint] = dict(model.named_parameters()) 55 | 56 | # AVERAGE CHECKPOINTS 57 | params = model.named_parameters() 58 | dict_params = dict(params) 59 | checkpoint_amount = len(checkpoints_weights) 60 | print("averaging...") 61 | for name in dict_params.keys(): 62 | custom_params = None 63 | for _, checkpoint_parameters in checkpoints_weights.items(): 64 | if custom_params is None: 65 | custom_params = checkpoint_parameters[name].data 66 | else: 67 | custom_params += checkpoint_parameters[name].data 68 | dict_params[name].data.copy_(custom_params / checkpoint_amount) 69 | model_dict = model.state_dict() 70 | model_dict.update(dict_params) 71 | model.load_state_dict(model_dict) 72 | model.eval() 73 | return model, default_embed 74 | 75 | 76 | def save_model_for_use(model, name="", default_embed=None, dict_name="model"): 77 | print("saving model...") 78 | torch.save({dict_name: model.state_dict(), "default_emb": default_embed, "config": 
model.config}, name) 79 | print("...done!") 80 | 81 | 82 | def count_parameters(net): 83 | return sum(p.numel() for p in net.parameters() if p.requires_grad) 84 | -------------------------------------------------------------------------------- /Modules/ToucanTTS/ToucanTTSLoss.py: -------------------------------------------------------------------------------- 1 | """ 2 | Taken from ESPNet 3 | Adapted by Flux 4 | """ 5 | 6 | import torch 7 | 8 | from Modules.GeneralLayers.DurationPredictor import DurationPredictorLoss 9 | from Utility.utils import make_non_pad_mask 10 | 11 | 12 | class ToucanTTSLoss(torch.nn.Module): 13 | 14 | def __init__(self): 15 | super().__init__() 16 | self.l1_criterion = torch.nn.L1Loss(reduction="none") 17 | self.l2_criterion = torch.nn.MSELoss(reduction="none") 18 | self.duration_criterion = DurationPredictorLoss(reduction="none") 19 | 20 | def forward(self, predicted_features, gold_features, features_lengths, text_lengths, gold_durations, predicted_durations, predicted_pitch, predicted_energy, gold_pitch, gold_energy): 21 | """ 22 | Args: 23 | predicted_features (Tensor): Batch of outputs before postnets (B, Lmax, odim). 24 | gold_features (Tensor): Batch of target features (B, Lmax, odim). 25 | features_lengths (LongTensor): Batch of the lengths of each target (B,). 26 | gold_durations (LongTensor): Batch of durations (B, Tmax). 27 | gold_pitch (LongTensor): Batch of pitch (B, Tmax). 28 | gold_energy (LongTensor): Batch of energy (B, Tmax). 29 | predicted_durations (LongTensor): Batch of outputs of duration predictor (B, Tmax). 30 | predicted_pitch (LongTensor): Batch of outputs of pitch predictor (B, Tmax). 31 | predicted_energy (LongTensor): Batch of outputs of energy predictor (B, Tmax). 32 | text_lengths (LongTensor): Batch of the lengths of each input (B,). 33 | 34 | Returns: 35 | Tensor: L1 loss value. 
36 | Tensor: Duration loss value 37 | """ 38 | 39 | # calculate losses 40 | distance_loss = self.l1_criterion(predicted_features, gold_features) 41 | duration_loss = self.duration_criterion(predicted_durations, gold_durations) 42 | pitch_loss = self.l2_criterion(predicted_pitch, gold_pitch) 43 | energy_loss = self.l2_criterion(predicted_energy, gold_energy) 44 | 45 | # make weighted masks to ensure that long samples and short samples are all equally important 46 | out_masks = make_non_pad_mask(features_lengths).unsqueeze(-1).to(gold_features.device) 47 | out_weights = out_masks.float() / out_masks.sum(dim=1, keepdim=True).float() 48 | out_weights /= gold_features.size(0) * gold_features.size(-1) 49 | duration_masks = make_non_pad_mask(text_lengths).to(gold_features.device) 50 | duration_weights = (duration_masks.float() / duration_masks.sum(dim=1, keepdim=True).float()) 51 | variance_masks = duration_masks.unsqueeze(-1) 52 | variance_weights = duration_weights.unsqueeze(-1) 53 | 54 | # apply weighted masks 55 | distance_loss = distance_loss.mul(out_weights).masked_select(out_masks).sum() 56 | duration_loss = (duration_loss.mul(duration_weights).masked_select(duration_masks).sum()) 57 | pitch_loss = pitch_loss.mul(variance_weights).masked_select(variance_masks).sum() 58 | energy_loss = (energy_loss.mul(variance_weights).masked_select(variance_masks).sum()) 59 | 60 | return distance_loss, duration_loss, pitch_loss, energy_loss 61 | -------------------------------------------------------------------------------- /Modules/GeneralLayers/MultiLayeredConv1d.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Tomoki Hayashi 2 | # MIT License (https://opensource.org/licenses/MIT) 3 | # Adapted by Florian Lux 2021 4 | 5 | """ 6 | Layer modules for FFT block in FastSpeech (Feed-forward Transformer). 7 | """ 8 | 9 | import torch 10 | 11 | 12 | class MultiLayeredConv1d(torch.nn.Module): 13 | """ 14 | Multi-layered conv1d for Transformer block. 15 | 16 | This is a module of multi-layered conv1d designed 17 | to replace positionwise feed-forward network 18 | in Transformer block, which is introduced in 19 | `FastSpeech: Fast, Robust and Controllable Text to Speech`_. 20 | 21 | .. _`FastSpeech: Fast, Robust and Controllable Text to Speech`: 22 | https://arxiv.org/pdf/1905.09263.pdf 23 | """ 24 | 25 | def __init__(self, in_chans, hidden_chans, kernel_size, dropout_rate): 26 | """ 27 | Initialize MultiLayeredConv1d module. 28 | 29 | Args: 30 | in_chans (int): Number of input channels. 31 | hidden_chans (int): Number of hidden channels. 32 | kernel_size (int): Kernel size of conv1d. 33 | dropout_rate (float): Dropout rate. 34 | """ 35 | super(MultiLayeredConv1d, self).__init__() 36 | self.w_1 = torch.nn.Conv1d(in_chans, hidden_chans, kernel_size, stride=1, padding=(kernel_size - 1) // 2, ) 37 | self.w_2 = torch.nn.Conv1d(hidden_chans, in_chans, kernel_size, stride=1, padding=(kernel_size - 1) // 2, ) 38 | self.dropout = torch.nn.Dropout(dropout_rate) 39 | 40 | def forward(self, x): 41 | """ 42 | Calculate forward propagation. 43 | 44 | Args: 45 | x (torch.Tensor): Batch of input tensors (B, T, in_chans). 46 | 47 | Returns: 48 | torch.Tensor: Batch of output tensors (B, T, hidden_chans). 49 | """ 50 | x = torch.relu(self.w_1(x.transpose(-1, 1))).transpose(-1, 1) 51 | return self.w_2(self.dropout(x).transpose(-1, 1)).transpose(-1, 1) 52 | 53 | 54 | class Conv1dLinear(torch.nn.Module): 55 | """ 56 | Conv1D + Linear for Transformer block. 
57 | 58 | A variant of MultiLayeredConv1d, which replaces second conv-layer to linear. 59 | """ 60 | 61 | def __init__(self, in_chans, hidden_chans, kernel_size, dropout_rate): 62 | """ 63 | Initialize Conv1dLinear module. 64 | 65 | Args: 66 | in_chans (int): Number of input channels. 67 | hidden_chans (int): Number of hidden channels. 68 | kernel_size (int): Kernel size of conv1d. 69 | dropout_rate (float): Dropout rate. 70 | """ 71 | super(Conv1dLinear, self).__init__() 72 | self.w_1 = torch.nn.Conv1d(in_chans, hidden_chans, kernel_size, stride=1, padding=(kernel_size - 1) // 2, ) 73 | self.w_2 = torch.nn.Linear(hidden_chans, in_chans) 74 | self.dropout = torch.nn.Dropout(dropout_rate) 75 | 76 | def forward(self, x): 77 | """ 78 | Calculate forward propagation. 79 | 80 | Args: 81 | x (torch.Tensor): Batch of input tensors (B, T, in_chans). 82 | 83 | Returns: 84 | torch.Tensor: Batch of output tensors (B, T, hidden_chans). 85 | """ 86 | x = torch.relu(self.w_1(x.transpose(-1, 1))).transpose(-1, 1) 87 | return self.w_2(self.dropout(x)) 88 | -------------------------------------------------------------------------------- /Recipes/finetuning_example_simple.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example script for fine-tuning the pretrained model to your own data. 3 | 4 | Comments in ALL CAPS are instructions 5 | """ 6 | 7 | import time 8 | 9 | import torch 10 | import wandb 11 | 12 | from Utility.path_to_transcript_dicts import * 13 | 14 | 15 | def run(gpu_id, resume_checkpoint, finetune, model_dir, resume, use_wandb, wandb_resume_id, gpu_count): 16 | from huggingface_hub import hf_hub_download 17 | from torch.utils.data import ConcatDataset 18 | 19 | from Modules.ToucanTTS.ToucanTTS import ToucanTTS 20 | from Modules.ToucanTTS.toucantts_train_loop_arbiter import train_loop 21 | from Utility.corpus_preparation import prepare_tts_corpus 22 | from Utility.storage_config import MODEL_DIR 23 | from Utility.storage_config import PREPROCESSING_DIR 24 | 25 | if gpu_id == "cpu": 26 | device = torch.device("cpu") 27 | else: 28 | device = torch.device("cuda") 29 | assert gpu_count == 1 # distributed finetuning is not supported 30 | 31 | # IF YOU'RE ADDING A NEW LANGUAGE, YOU MIGHT NEED TO ADD HANDLING FOR IT IN Preprocessing/TextFrontend.py 32 | 33 | print("Preparing") 34 | 35 | if model_dir is not None: 36 | save_dir = model_dir 37 | else: 38 | save_dir = os.path.join(MODEL_DIR, "ToucanTTS_FinetuningExample") # RENAME TO SOMETHING MEANINGFUL FOR YOUR DATA 39 | os.makedirs(save_dir, exist_ok=True) 40 | 41 | train_data = prepare_tts_corpus(transcript_dict=build_path_to_transcript_integration_test(), 42 | corpus_dir=os.path.join(PREPROCESSING_DIR, "integration_test"), 43 | lang="eng") # CHANGE THE TRANSCRIPT DICT, THE NAME OF THE CACHE DIRECTORY AND THE LANGUAGE TO YOUR NEEDS 44 | 45 | model = ToucanTTS() 46 | 47 | if use_wandb: 48 | wandb.init( 49 | name=f"{__name__.split('.')[-1]}_{time.strftime('%Y%m%d-%H%M%S')}" if wandb_resume_id is None else None, 50 | id=wandb_resume_id, # this is None if not specified in the command line arguments. 51 | resume="must" if wandb_resume_id is not None else None) 52 | 53 | print("Training model") 54 | train_loop(net=model, 55 | datasets=[train_data], 56 | device=device, 57 | save_directory=save_dir, 58 | batch_size=12, # YOU MIGHT GET OUT OF MEMORY ISSUES ON SMALL GPUs, IF SO, DECREASE THIS. 
59 | eval_lang="eng", # THE LANGUAGE YOUR PROGRESS PLOTS WILL BE MADE IN 60 | warmup_steps=500, 61 | lr=1e-5, # if you have enough data (over ~1000 datapoints) you can increase this up to 1e-4 and it will still be stable, but learn quicker. 62 | # DOWNLOAD THESE INITIALIZATION MODELS FROM THE RELEASE PAGE OF THE GITHUB OR RUN THE DOWNLOADER SCRIPT TO GET THEM AUTOMATICALLY 63 | path_to_checkpoint=hf_hub_download(cache_dir=MODEL_DIR, repo_id="Flux9665/ToucanTTS", filename="ToucanTTS.pt") if resume_checkpoint is None else resume_checkpoint, 64 | fine_tune=True if resume_checkpoint is None and not resume else finetune, 65 | resume=resume, 66 | steps=5000, 67 | use_wandb=use_wandb, 68 | train_samplers=[torch.utils.data.RandomSampler(train_data)], 69 | gpu_count=1) 70 | if use_wandb: 71 | wandb.finish() 72 | -------------------------------------------------------------------------------- /Modules/EmbeddingModel/StyleEmbedding.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from Modules.EmbeddingModel.GST import GSTStyleEncoder 4 | from Modules.EmbeddingModel.StyleTTSEncoder import StyleEncoder as StyleTTSEncoder 5 | 6 | 7 | class StyleEmbedding(torch.nn.Module): 8 | """ 9 | The style embedding should provide information of the speaker and their speaking style 10 | 11 | The feedback signal for the module will come from the TTS objective, so it doesn't have a dedicated train loop. 12 | The train loop does however supply supervision in the form of a barlow twins objective. 13 | 14 | See the git history for some other approaches for style embedding, like the SWIN transformer 15 | and a simple LSTM baseline. GST turned out to be the best. 16 | """ 17 | 18 | def __init__(self, embedding_dim=16, style_tts_encoder=False): 19 | super().__init__() 20 | self.embedding_dim = embedding_dim 21 | self.use_gst = not style_tts_encoder 22 | if style_tts_encoder: 23 | self.style_encoder = StyleTTSEncoder(style_dim=embedding_dim) 24 | else: 25 | self.style_encoder = GSTStyleEncoder(gst_token_dim=embedding_dim) 26 | 27 | def forward(self, 28 | batch_of_feature_sequences, 29 | batch_of_feature_sequence_lengths): 30 | """ 31 | Args: 32 | batch_of_feature_sequences: b is the batch axis, 128 features per timestep 33 | and l time-steps, which may include padding 34 | for most elements in the batch (b, l, 128) 35 | batch_of_feature_sequence_lengths: indicate for every element in the batch, 36 | what the true length is, since they are 37 | all padded to the length of the longest 38 | element in the batch (b, 1) 39 | Returns: 40 | batch of n dimensional embeddings (b,n) 41 | """ 42 | 43 | minimum_sequence_length = 512 44 | specs = list() 45 | for index, spec_length in enumerate(batch_of_feature_sequence_lengths): 46 | spec = batch_of_feature_sequences[index][:spec_length] 47 | # double the length at least once, then check 48 | spec = spec.repeat((2, 1)) 49 | current_spec_length = len(spec) 50 | while current_spec_length < minimum_sequence_length: 51 | # make it longer 52 | spec = spec.repeat((2, 1)) 53 | current_spec_length = len(spec) 54 | specs.append(spec[:minimum_sequence_length]) 55 | 56 | spec_batch = torch.stack(specs, dim=0) 57 | return self.style_encoder(speech=spec_batch) 58 | 59 | 60 | if __name__ == '__main__': 61 | style_emb = StyleEmbedding(style_tts_encoder=False) 62 | print(f"GST parameter count: {sum(p.numel() for p in style_emb.style_encoder.parameters() if p.requires_grad)}") 63 | 64 | seq_length = 398 65 | print(style_emb(torch.randn(5, 
seq_length, 512), 66 | torch.tensor([seq_length, seq_length, seq_length, seq_length, seq_length])).shape) 67 | 68 | style_emb = StyleEmbedding(style_tts_encoder=True) 69 | print(f"StyleTTS encoder parameter count: {sum(p.numel() for p in style_emb.style_encoder.parameters() if p.requires_grad)}") 70 | 71 | seq_length = 398 72 | print(style_emb(torch.randn(5, seq_length, 512), 73 | torch.tensor([seq_length, seq_length, seq_length, seq_length, seq_length])).shape) 74 | -------------------------------------------------------------------------------- /Utility/WarmupScheduler.py: -------------------------------------------------------------------------------- 1 | from torch.optim.lr_scheduler import _LRScheduler 2 | 3 | 4 | # This is rather suboptimal, because we need to import a protected class. Unfortunately, I don't see another way. 5 | 6 | 7 | class ToucanWarmupScheduler(_LRScheduler): 8 | """ 9 | A warmup scheduler that should be called after every batch. 10 | """ 11 | 12 | def __init__(self, optimizer, peak_lr=0.0002, warmup_steps=20000, max_steps=200000, last_epoch=-1): 13 | self.warmup_steps = warmup_steps 14 | self.peak_lr = peak_lr 15 | self.max_steps = max_steps 16 | self.plateau = self.warmup_steps * 4 17 | self.last_lr = 0.0 18 | # __init__() must be invoked before setting field 19 | # because step() is also invoked in __init__() 20 | super().__init__(optimizer, last_epoch) 21 | 22 | def __repr__(self): 23 | return f"{self.__class__.__name__}(warmup_steps={self.warmup_steps})" 24 | 25 | def get_lr(self): 26 | step_num = self.last_epoch + 1 27 | if step_num <= self.warmup_steps: 28 | lr = self.peak_lr * min(step_num / self.warmup_steps, 1.0) 29 | self.last_lr = lr 30 | return [lr for _ in self.base_lrs] 31 | elif step_num < self.warmup_steps + self.plateau: 32 | self.last_lr = self.peak_lr 33 | return [self.peak_lr for _ in self.base_lrs] 34 | else: 35 | scale = 1 - (((step_num - (self.warmup_steps + self.plateau)) / self.max_steps) / (self.max_steps / 10)) 36 | self.last_lr = max(self.last_lr * scale, 1e-7) 37 | return [self.last_lr for _ in self.base_lrs] 38 | 39 | 40 | class WarmupScheduler(_LRScheduler): 41 | """ 42 | The WarmupLR scheduler 43 | This scheduler is almost same as NoamLR Scheduler except for following difference: 44 | NoamLR: 45 | lr = optimizer.lr * model_size ** -0.5 46 | * min(step ** -0.5, step * warmup_step ** -1.5) 47 | WarmupLR: 48 | lr = optimizer.lr * warmup_step ** 0.5 49 | * min(step ** -0.5, step * warmup_step ** -1.5) 50 | Note that the maximum lr equals to optimizer.lr in this scheduler. 
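A worked example (the base lr is illustrative; warmup_steps=25000 is the default of this class):
with optimizer.lr = 1e-3, the lr ramps up linearly as 1e-3 * step / 25000 until step 25000,
where it reaches exactly 1e-3, and afterwards decays as 1e-3 * (25000 / step) ** 0.5.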
51 | 52 | Taken from ESPnet 53 | """ 54 | 55 | def __init__(self, optimizer, warmup_steps=25000, last_epoch=-1): 56 | self.warmup_steps = warmup_steps 57 | # __init__() must be invoked before setting field 58 | # because step() is also invoked in __init__() 59 | super().__init__(optimizer, last_epoch) 60 | 61 | def __repr__(self): 62 | return f"{self.__class__.__name__}(warmup_steps={self.warmup_steps})" 63 | 64 | def get_lr(self): 65 | step_num = self.last_epoch + 1 66 | return [lr * self.warmup_steps ** 0.5 * min(step_num ** -0.5, step_num * self.warmup_steps ** -1.5) for lr in 67 | self.base_lrs] 68 | 69 | 70 | if __name__ == '__main__': 71 | lrs = list() 72 | warmup_steps = 30000 73 | peak_lr = 0.0005 74 | max_steps = 800000 75 | plateau_size = warmup_steps * 5 76 | for step_num in range(max_steps): 77 | if step_num <= warmup_steps: 78 | lr = peak_lr * min(step_num / warmup_steps, 1.0) 79 | lrs.append(lr) 80 | elif step_num < warmup_steps + plateau_size: 81 | lrs.append(peak_lr) 82 | else: 83 | scale = 1 - (((step_num - (warmup_steps + plateau_size)) / max_steps) / (max_steps / 10)) 84 | lrs.append(max(lrs[-1] * scale, 1e-7)) 85 | import matplotlib.pyplot as plt 86 | 87 | plt.plot(lrs) 88 | plt.show() 89 | -------------------------------------------------------------------------------- /Modules/ControllabilityGAN/dataset/speaker_embeddings_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import torch 5 | 6 | 7 | class SpeakerEmbeddingsDataset(torch.utils.data.Dataset): 8 | 9 | def __init__(self, feature_path, device, mode='utterance'): 10 | super(SpeakerEmbeddingsDataset, self).__init__() 11 | 12 | modes = ['utterance', 'speaker'] 13 | assert mode in modes, f'mode: {mode} is not supported' 14 | if mode == 'utterance': 15 | self.mode = 'utt' 16 | elif mode == 'speaker': 17 | self.mode = 'spk' 18 | 19 | self.device = device 20 | 21 | self.x, self.speakers = self._load_features(feature_path) 22 | # unique_speakers = set(self.speakers) 23 | # spk2class = dict(zip(unique_speakers, range(len(unique_speakers)))) 24 | # #self.x = self._reformat_features(self.x) 25 | # self.y = torch.tensor([spk2class[spk] for spk in self.speakers]).to(self.device) 26 | # self.class2spk = dict(zip(spk2class.values(), spk2class.keys())) 27 | 28 | def __len__(self): 29 | return len(self.speakers) 30 | 31 | def __getitem__(self, index): 32 | embedding = self.normalize_embedding(self.x[index]) 33 | # speaker_id = self.y[index] 34 | return embedding, torch.zeros([0]) 35 | 36 | def normalize_embedding(self, vector): 37 | return torch.sub(vector, self.mean) / self.std 38 | 39 | def get_speaker(self, label): 40 | return self.class2spk[label] 41 | 42 | def get_embedding_dim(self): 43 | return self.x.shape[-1] 44 | 45 | def get_num_speaker(self): 46 | return len(torch.unique((self.y))) 47 | 48 | def set_labels(self, labels): 49 | self.y_old = self.y 50 | self.y = torch.full(size=(len(self),), fill_value=labels).to(self.device) 51 | # if isinstance(labels, int) or isinstance(labels, float): 52 | # self.y = torch.full(size=len(self), fill_value=labels) 53 | # elif len(labels) == len(self): 54 | # self.y = torch.tensor(labels) 55 | 56 | def _load_features(self, feature_path): 57 | if os.path.isfile(feature_path): 58 | vectors = torch.load(feature_path, map_location=self.device) 59 | if isinstance(vectors, list): 60 | vectors = torch.stack(vectors) 61 | 62 | self.mean = torch.mean(vectors) 63 | self.std = torch.std(vectors) 64 | return 
vectors, torch.zeros(vectors.size(0)) 65 | else: 66 | vectors = torch.load(feature_path, map_location=self.device) 67 | 68 | self.mean = torch.mean(vectors) 69 | self.std = torch.std(vectors) 70 | 71 | spk2idx = {} 72 | with open(feature_path / f'{self.mode}2idx', 'r') as f: 73 | for line in f: 74 | split_line = line.strip().split() 75 | if len(split_line) == 2: 76 | spk2idx[split_line[0].strip()] = int(split_line[1]) 77 | 78 | speakers, indices = zip(*spk2idx.items()) 79 | 80 | if (feature_path / 'utt2spk').exists(): # spk2idx contains utt_ids not speaker_ids 81 | utt2spk = {} 82 | with open(feature_path / 'utt2spk', 'r') as f: 83 | for line in f: 84 | split_line = line.strip().split() 85 | if len(split_line) == 2: 86 | utt2spk[split_line[0].strip()] = split_line[1].strip() 87 | 88 | speakers = [utt2spk[utt] for utt in speakers] 89 | 90 | return vectors[np.array(indices)], speakers 91 | 92 | def _reformat_features(self, features): 93 | if len(features.shape) == 2: 94 | return features.reshape(features.shape[0], 1, 1, features.shape[1]) 95 | -------------------------------------------------------------------------------- /Modules/ControllabilityGAN/GAN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from Modules.ControllabilityGAN.wgan.init_wgan import create_wgan 4 | 5 | 6 | class GanWrapper: 7 | 8 | def __init__(self, path_wgan, device, num_cached_voices=10): 9 | self.device = device 10 | self.path_wgan = path_wgan 11 | 12 | self.mean = None 13 | self.std = None 14 | self.wgan = None 15 | self.normalize = True 16 | 17 | torch.manual_seed(160923) 18 | 19 | self.load_model(path_wgan) 20 | 21 | self.U = self.compute_controllability() 22 | 23 | self.z_list = list() 24 | 25 | while len(self.z_list) < num_cached_voices + 2: 26 | z = self.wgan.G.sample_latent(1, self.wgan.G.z_dim, temperature=0.4) 27 | l1_distances = [100.0] 28 | for other_z in self.z_list: 29 | l1_distances.append(torch.nn.functional.l1_loss(z, other_z)) 30 | print("dist: ", min(l1_distances), len(self.z_list)) 31 | if min(l1_distances) > 0.5: 32 | self.z_list.append(z) 33 | self.z = self.z_list[0] 34 | 35 | def set_latent(self, seed): 36 | self.z = self.z_list[seed] 37 | 38 | def load_model(self, path): 39 | gan_checkpoint = torch.load(path, map_location="cpu") 40 | 41 | self.wgan = create_wgan(parameters=gan_checkpoint['model_parameters'], device=self.device) 42 | # Create a new state dict without 'module.' prefix 43 | new_state_dict_G = {} 44 | for key, value in gan_checkpoint['generator_state_dict'].items(): 45 | # Remove 'module.' prefix 46 | new_key = key.replace('module.', '') 47 | new_state_dict_G[new_key] = value 48 | 49 | new_state_dict_D = {} 50 | for key, value in gan_checkpoint['critic_state_dict'].items(): 51 | # Remove 'module.' 
prefix 52 | new_key = key.replace('module.', '') 53 | new_state_dict_D[new_key] = value 54 | 55 | self.wgan.G.load_state_dict(new_state_dict_G) 56 | self.wgan.D.load_state_dict(new_state_dict_D) 57 | 58 | self.mean = gan_checkpoint["dataset_mean"] 59 | self.std = gan_checkpoint["dataset_std"] 60 | 61 | def compute_controllability(self, n_samples=200000): 62 | _, intermediate, z = self.wgan.sample_generator(num_samples=n_samples, nograd=True, return_intermediate=True) 63 | intermediate = intermediate.cpu() 64 | z = z.cpu() 65 | U = self.controllable_speakers(intermediate, z) 66 | return U 67 | 68 | def controllable_speakers(self, intermediate, z): 69 | pca = torch.pca_lowrank(intermediate) 70 | mu = intermediate.mean() 71 | X = torch.matmul((intermediate - mu), pca[2]) 72 | U = torch.linalg.lstsq(X, z) 73 | return U 74 | 75 | def get_original_embed(self): 76 | self.wgan.G.eval() 77 | embed_original = self.wgan.G.module.forward(self.z.to(self.device)) 78 | 79 | if self.normalize: 80 | embed_original = inverse_normalize( 81 | embed_original.cpu(), 82 | self.mean.cpu().unsqueeze(0), 83 | self.std.cpu().unsqueeze(0) 84 | ) 85 | return embed_original 86 | 87 | def modify_embed(self, x): 88 | self.wgan.G.eval() 89 | z_new = self.z.squeeze() + torch.matmul(self.U.solution.t(), x) 90 | embed_modified = self.wgan.G.forward(z_new.unsqueeze(0).to(self.device)) 91 | if self.normalize: 92 | embed_modified = inverse_normalize( 93 | embed_modified.cpu(), 94 | self.mean.cpu().unsqueeze(0), 95 | self.std.cpu().unsqueeze(0) 96 | ) 97 | return embed_modified 98 | 99 | 100 | def inverse_normalize(tensor, mean, std): 101 | return tensor * std + mean 102 | -------------------------------------------------------------------------------- /Utility/corpus_preparation.py: -------------------------------------------------------------------------------- 1 | import torch.multiprocessing 2 | from huggingface_hub import hf_hub_download 3 | 4 | from Modules.Aligner.CodecAlignerDataset import CodecAlignerDataset 5 | from Modules.Aligner.autoaligner_train_loop import train_loop as train_aligner 6 | from Modules.ToucanTTS.TTSDataset import TTSDataset 7 | from Utility.path_to_transcript_dicts import * 8 | from Utility.storage_config import MODEL_DIR 9 | 10 | 11 | def prepare_aligner_corpus(transcript_dict, corpus_dir, lang, device, phone_input=False, 12 | gpu_count=1, 13 | rank=0): 14 | return CodecAlignerDataset(transcript_dict, 15 | cache_dir=corpus_dir, 16 | lang=lang, 17 | loading_processes=5, # this can be increased for massive clusters, but the overheads that are introduced are kind of not really worth it 18 | device=device, 19 | phone_input=phone_input, 20 | gpu_count=gpu_count, 21 | rank=rank) 22 | 23 | 24 | def prepare_tts_corpus(transcript_dict, 25 | corpus_dir, 26 | lang, 27 | # For small datasets it's best to turn this off and instead inspect the data with the scorer, if there are any issues. 28 | fine_tune_aligner=True, 29 | use_reconstruction=True, 30 | phone_input=False, 31 | save_imgs=False, 32 | gpu_count=1, 33 | rank=0): 34 | """ 35 | create an aligner dataset, 36 | fine-tune an aligner, 37 | create a TTS dataset, 38 | return it. 39 | 40 | Automatically skips parts that have been done before. 
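Minimal usage sketch (mirroring the recipes in this repository; the transcript dict, cache directory and language tag are examples to adapt to your own data):

    train_data = prepare_tts_corpus(transcript_dict=build_path_to_transcript_integration_test(),
                                    corpus_dir=os.path.join(PREPROCESSING_DIR, "integration_test"),
                                    lang="eng")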
41 | """ 42 | if not os.path.exists(os.path.join(corpus_dir, "tts_train_cache.pt")): 43 | if fine_tune_aligner: 44 | aligner_dir = os.path.join(corpus_dir, "Aligner") 45 | aligner_loc = os.path.join(corpus_dir, "Aligner", "aligner.pt") 46 | 47 | if not os.path.exists(os.path.join(corpus_dir, "aligner_train_cache.pt")): 48 | prepare_aligner_corpus(transcript_dict, corpus_dir=corpus_dir, lang=lang, phone_input=phone_input, device=torch.device("cuda")) 49 | 50 | if not os.path.exists(os.path.join(aligner_dir, "aligner.pt")): 51 | aligner_datapoints = prepare_aligner_corpus(transcript_dict, corpus_dir=corpus_dir, lang=lang, phone_input=phone_input, device=torch.device("cuda")) 52 | train_aligner(train_dataset=aligner_datapoints, 53 | device=torch.device("cuda"), 54 | save_directory=aligner_dir, 55 | steps=min(len(aligner_datapoints) // 2, 10000), # relatively good finetuning heuristic 56 | batch_size=16 if len(aligner_datapoints) > 16 else len(aligner_datapoints) // 2, 57 | path_to_checkpoint=hf_hub_download(cache_dir=MODEL_DIR, repo_id="Flux9665/ToucanTTS", filename="Aligner.pt"), 58 | fine_tune=True, 59 | debug_img_path=aligner_dir, 60 | resume=False, 61 | use_reconstruction=use_reconstruction) 62 | else: 63 | aligner_loc = hf_hub_download(cache_dir=MODEL_DIR, repo_id="Flux9665/ToucanTTS", filename="Aligner.pt") 64 | else: 65 | aligner_loc = None 66 | return TTSDataset(transcript_dict, 67 | acoustic_checkpoint_path=aligner_loc, 68 | cache_dir=corpus_dir, 69 | device=torch.device("cuda"), 70 | lang=lang, 71 | save_imgs=save_imgs, 72 | gpu_count=gpu_count, 73 | rank=rank) 74 | -------------------------------------------------------------------------------- /Modules/ToucanTTS/LanguageEmbeddingSpaceStructureLoss.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | 3 | import torch 4 | 5 | from Preprocessing.multilinguality.create_distance_lookups import CacheCreator 6 | from Utility.utils import load_json_from_path 7 | 8 | 9 | class LanguageEmbeddingSpaceStructureLoss(torch.nn.Module): 10 | 11 | def __init__(self): 12 | super().__init__() 13 | cc = CacheCreator(cache_root="Preprocessing/multilinguality") 14 | if not os.path.exists('Preprocessing/multilinguality/lang_1_to_lang_2_to_tree_dist.json'): 15 | cc.create_tree_cache(cache_root="Preprocessing/multilinguality") 16 | if not os.path.exists('Preprocessing/multilinguality/lang_1_to_lang_2_to_tree_dist.json'): 17 | cc.create_map_cache(cache_root="Preprocessing/multilinguality") 18 | 19 | self.tree_dist = load_json_from_path('Preprocessing/multilinguality/lang_1_to_lang_2_to_tree_dist.json') 20 | self.map_dist = load_json_from_path('Preprocessing/multilinguality/lang_1_to_lang_2_to_map_dist.json') 21 | # with open("Preprocessing/multilinguality/asp_dict.pkl", 'rb') as dictfile: 22 | # self.asp_sim = pickle.load(dictfile) 23 | # self.lang_list = list(self.asp_sim.keys()) # list of all languages, to get lang_b's index 24 | 25 | self.largest_value_map_dist = 0.0 26 | for _, values in self.map_dist.items(): 27 | for _, value in values.items(): 28 | self.largest_value_map_dist = max(self.largest_value_map_dist, value) 29 | 30 | self.iso_codes_to_ids = load_json_from_path("Preprocessing/multilinguality/iso_lookup.json")[-1] 31 | self.ids_to_iso_codes = {v: k for k, v in self.iso_codes_to_ids.items()} 32 | 33 | def forward(self, language_ids, language_embeddings): 34 | """ 35 | Args: 36 | language_ids (Tensor): IDs of languages in the same order as the embeddings to calculate the distances 
according to the metrics. 37 | language_embeddings (Tensor): Batch of language embeddings, of which the distances will be compared to the distances according to the metrics. 38 | 39 | Returns: 40 | Tensor: Language Embedding Structure Loss Value 41 | """ 42 | 43 | losses = list() 44 | for language_id_1, language_embedding_1 in zip(language_ids, language_embeddings): 45 | for language_id_2, language_embedding_2 in zip(language_ids, language_embeddings): 46 | if language_id_1 != language_id_2: 47 | embed_dist = torch.nn.functional.l1_loss(language_embedding_1, language_embedding_2) 48 | lang_1 = self.ids_to_iso_codes[language_id_1] 49 | lang_2 = self.ids_to_iso_codes[language_id_2] 50 | 51 | # Value Range Normalized Tree Dist 52 | try: 53 | tree_dist = self.tree_dist[lang_1][lang_2] 54 | except KeyError: 55 | tree_dist = self.tree_dist[lang_2][lang_1] 56 | 57 | # Value Range Normalized Map Dist 58 | try: 59 | map_dist = self.map_dist[lang_1][lang_2] / self.largest_value_map_dist 60 | except KeyError: 61 | map_dist = self.map_dist[lang_2][lang_1] / self.largest_value_map_dist 62 | 63 | # Value Range Normalized ASP Dist 64 | # lang_2_idx = self.lang_list.index(lang_2) 65 | # asp_dist = 1.0 - self.asp_sim[lang_1][lang_2_idx] # it's a similarity measure that goes from 0 to 1, so we subtract it from 1 to turn it into a distance 66 | 67 | # Average distance should be similar to embedding distance to bring some structure into the embedding-space 68 | # metric_distance = (torch.tensor(tree_dist) + torch.tensor(map_dist) + torch.tensor(asp_dist)) / 3 69 | metric_distance = (torch.tensor(tree_dist) + torch.tensor(map_dist)) / 2 70 | losses.append(torch.nn.functional.l1_loss(embed_dist, metric_distance)) 71 | 72 | return sum(losses) / len(losses) 73 | -------------------------------------------------------------------------------- /Modules/ToucanTTS/CodecDiscriminator.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | def weights_init_D(m): 6 | classname = m.__class__.__name__ 7 | if classname.find('Conv') != -1: 8 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='leaky_relu') 9 | elif classname.find('BatchNorm') != -1: 10 | nn.init.constant_(m.weight, 1) 11 | nn.init.constant_(m.bias, 0) 12 | 13 | 14 | class SpectrogramDiscriminator(torch.nn.Module): 15 | def __init__(self): 16 | super().__init__() 17 | self.D = DiscriminatorNet() 18 | self.D.apply(weights_init_D) 19 | 20 | def _generator_feedback(self, data_generated, data_real): 21 | for p in self.D.parameters(): 22 | p.requires_grad = False # freeze critic 23 | 24 | score_fake, fmap_fake = self.D(data_generated) 25 | _, fmap_real = self.D(data_real) 26 | 27 | feature_matching_loss = 0.0 28 | for feat_fake, feat_real in zip(fmap_fake, fmap_real): 29 | feature_matching_loss += nn.functional.l1_loss(feat_fake, feat_real.detach()) 30 | 31 | discr_loss = nn.functional.mse_loss(input=score_fake, target=torch.ones(score_fake.shape, device=score_fake.device), reduction="mean") 32 | 33 | return feature_matching_loss + discr_loss 34 | 35 | def _discriminator_feature_matching(self, data_generated, data_real): 36 | for p in self.D.parameters(): 37 | p.requires_grad = True # unfreeze critic 38 | self.D.train() 39 | 40 | score_fake, _ = self.D(data_generated) 41 | score_real, _ = self.D(data_real) 42 | 43 | discr_loss = 0.0 44 | discr_loss = discr_loss + nn.functional.mse_loss(input=score_fake, target=torch.zeros(score_fake.shape, device=score_fake.device), 
reduction="mean") 45 | discr_loss = discr_loss + nn.functional.mse_loss(input=score_real, target=torch.ones(score_real.shape, device=score_real.device), reduction="mean") 46 | 47 | return discr_loss 48 | 49 | def calc_discriminator_loss(self, data_generated, data_real): 50 | return self._discriminator_feature_matching(data_generated.detach(), data_real) 51 | 52 | def calc_generator_feedback(self, data_generated, data_real): 53 | return self._generator_feedback(data_generated, data_real) 54 | 55 | 56 | class DiscriminatorNet(nn.Module): 57 | def __init__(self): 58 | super().__init__() 59 | self.filters = nn.ModuleList([ 60 | nn.utils.weight_norm(nn.Conv2d(1, 32, kernel_size=(3, 9), padding=(1, 4))), 61 | nn.utils.weight_norm(nn.Conv2d(32, 32, kernel_size=(3, 9), stride=(1, 2), padding=(1, 4))), 62 | nn.utils.weight_norm(nn.Conv2d(32, 32, kernel_size=(3, 9), stride=(1, 2), padding=(1, 4))), 63 | nn.utils.weight_norm(nn.Conv2d(32, 32, kernel_size=(3, 9), stride=(1, 2), padding=(1, 4))), 64 | nn.utils.weight_norm(nn.Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))), 65 | ]) 66 | 67 | self.out = nn.utils.weight_norm(nn.Conv2d(32, 1, 3, 1, 1)) 68 | 69 | self.fc = nn.Linear(900, 1) # this needs to be changed everytime the window length is changes. It would be nice if this could be done dynamically. 70 | 71 | def forward(self, y): 72 | feature_maps = list() 73 | feature_maps.append(y) 74 | for d in self.filters: 75 | y = d(y) 76 | feature_maps.append(y) 77 | y = nn.functional.leaky_relu(y, 0.1) 78 | y = self.out(y) 79 | feature_maps.append(y) 80 | y = torch.flatten(y, 1, -1) 81 | y = self.fc(y) 82 | 83 | return y, feature_maps 84 | 85 | 86 | if __name__ == '__main__': 87 | d = SpectrogramDiscriminator() 88 | fake = torch.randn([2, 100, 72]) # [Batch, Sequence Length, Spectrogram Buckets] 89 | real = torch.randn([2, 100, 72]) # [Batch, Sequence Length, Spectrogram Buckets] 90 | 91 | critic_loss = d.calc_discriminator_loss((fake.unsqueeze(1)), real.unsqueeze(1)) 92 | generator_loss = d.calc_generator_feedback(fake.unsqueeze(1), real.unsqueeze(1)) 93 | print(critic_loss) 94 | print(generator_loss) 95 | -------------------------------------------------------------------------------- /Modules/GeneralLayers/VariancePredictor.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Tomoki Hayashi 2 | # MIT License (https://opensource.org/licenses/MIT) 3 | # Adapted by Florian Lux 2023 4 | 5 | from abc import ABC 6 | 7 | import torch 8 | 9 | from Modules.GeneralLayers.ConditionalLayerNorm import AdaIN1d 10 | from Modules.GeneralLayers.ConditionalLayerNorm import ConditionalLayerNorm 11 | from Modules.GeneralLayers.LayerNorm import LayerNorm 12 | from Utility.utils import integrate_with_utt_embed 13 | 14 | 15 | class VariancePredictor(torch.nn.Module, ABC): 16 | """ 17 | Variance predictor module. 18 | 19 | This is a module of variance predictor described in `FastSpeech 2: 20 | Fast and High-Quality End-to-End Text to Speech`_. 21 | 22 | .. _`FastSpeech 2: Fast and High-Quality End-to-End Text to Speech`: 23 | https://arxiv.org/abs/2006.04558 24 | 25 | """ 26 | 27 | def __init__(self, 28 | idim, 29 | n_layers=2, 30 | n_chans=384, 31 | kernel_size=3, 32 | bias=True, 33 | dropout_rate=0.5, 34 | utt_embed_dim=None, 35 | embedding_integration="AdaIN"): 36 | """ 37 | Initialize duration predictor module. 38 | 39 | Args: 40 | idim (int): Input dimension. 41 | n_layers (int, optional): Number of convolutional layers. 
42 | n_chans (int, optional): Number of channels of convolutional layers. 43 | kernel_size (int, optional): Kernel size of convolutional layers. 44 | dropout_rate (float, optional): Dropout rate. 45 | """ 46 | super().__init__() 47 | self.conv = torch.nn.ModuleList() 48 | self.dropouts = torch.nn.ModuleList() 49 | self.norms = torch.nn.ModuleList() 50 | self.embedding_projections = torch.nn.ModuleList() 51 | self.utt_embed_dim = utt_embed_dim 52 | self.use_conditional_layernorm_embedding_integration = embedding_integration in ["AdaIN", "ConditionalLayerNorm"] 53 | 54 | for idx in range(n_layers): 55 | if utt_embed_dim is not None: 56 | if embedding_integration == "AdaIN": 57 | self.embedding_projections += [AdaIN1d(style_dim=utt_embed_dim, num_features=idim)] 58 | elif embedding_integration == "ConditionalLayerNorm": 59 | self.embedding_projections += [ConditionalLayerNorm(speaker_embedding_dim=utt_embed_dim, hidden_dim=idim)] 60 | else: 61 | self.embedding_projections += [torch.nn.Linear(utt_embed_dim + idim, idim)] 62 | else: 63 | self.embedding_projections += [lambda x: x] 64 | in_chans = idim if idx == 0 else n_chans 65 | self.conv += [torch.nn.Sequential(torch.nn.Conv1d(in_chans, n_chans, kernel_size, stride=1, padding=(kernel_size - 1) // 2, bias=bias, ), 66 | torch.nn.ReLU())] 67 | self.norms += [LayerNorm(n_chans, dim=1)] 68 | self.dropouts += [torch.nn.Dropout(dropout_rate)] 69 | 70 | self.linear = torch.nn.Linear(n_chans, 1) 71 | 72 | def forward(self, xs, padding_mask=None, utt_embed=None): 73 | """ 74 | Calculate forward propagation. 75 | 76 | Args: 77 | xs (Tensor): Batch of input sequences (B, Tmax, idim). 78 | padding_mask (ByteTensor, optional): 79 | Batch of masks indicating padded part (B, Tmax). 80 | 81 | Returns: 82 | Tensor: Batch of predicted sequences (B, Tmax, 1). 83 | """ 84 | xs = xs.transpose(1, -1) # (B, idim, Tmax) 85 | 86 | for f, c, d, p in zip(self.conv, self.norms, self.dropouts, self.embedding_projections): 87 | xs = f(xs) # (B, C, Tmax) 88 | if self.utt_embed_dim is not None: 89 | xs = integrate_with_utt_embed(hs=xs.transpose(1, 2), utt_embeddings=utt_embed, projection=p, embedding_training=self.use_conditional_layernorm_embedding_integration).transpose(1, 2) 90 | xs = c(xs) 91 | xs = d(xs) 92 | 93 | xs = self.linear(xs.transpose(1, 2)) # (B, Tmax, 1) 94 | 95 | if padding_mask is not None: 96 | xs = xs.masked_fill(padding_mask, 0.0) 97 | 98 | return xs 99 | -------------------------------------------------------------------------------- /Modules/GeneralLayers/ResidualBlock.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | References: 5 | - https://github.com/jik876/hifi-gan 6 | - https://github.com/kan-bayashi/ParallelWaveGAN 7 | """ 8 | 9 | import torch 10 | 11 | 12 | class Conv1d(torch.nn.Conv1d): 13 | """ 14 | Conv1d module with customized initialization. 15 | """ 16 | 17 | def __init__(self, *args, **kwargs): 18 | super(Conv1d, self).__init__(*args, **kwargs) 19 | 20 | def reset_parameters(self): 21 | torch.nn.init.kaiming_normal_(self.weight, nonlinearity="relu") 22 | if self.bias is not None: 23 | torch.nn.init.constant_(self.bias, 0.0) 24 | 25 | 26 | class Conv1d1x1(Conv1d): 27 | """ 28 | 1x1 Conv1d with customized initialization. 
29 | """ 30 | 31 | def __init__(self, in_channels, out_channels, bias): 32 | super(Conv1d1x1, self).__init__(in_channels, out_channels, kernel_size=1, padding=0, dilation=1, bias=bias) 33 | 34 | 35 | class HiFiGANResidualBlock(torch.nn.Module): 36 | """Residual block module in HiFiGAN.""" 37 | 38 | def __init__(self, 39 | kernel_size=3, 40 | channels=512, 41 | dilations=(1, 3, 5), 42 | bias=True, 43 | use_additional_convs=True, 44 | nonlinear_activation="LeakyReLU", 45 | nonlinear_activation_params={"negative_slope": 0.1}, ): 46 | """ 47 | Initialize HiFiGANResidualBlock module. 48 | 49 | Args: 50 | kernel_size (int): Kernel size of dilation convolution layer. 51 | channels (int): Number of channels for convolution layer. 52 | dilations (List[int]): List of dilation factors. 53 | use_additional_convs (bool): Whether to use additional convolution layers. 54 | bias (bool): Whether to add bias parameter in convolution layers. 55 | nonlinear_activation (str): Activation function module name. 56 | nonlinear_activation_params (dict): Hyperparameters for activation function. 57 | """ 58 | super().__init__() 59 | self.use_additional_convs = use_additional_convs 60 | self.convs1 = torch.nn.ModuleList() 61 | if use_additional_convs: 62 | self.convs2 = torch.nn.ModuleList() 63 | assert kernel_size % 2 == 1, "Kernel size must be odd number." 64 | for dilation in dilations: 65 | self.convs1 += [torch.nn.Sequential(getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params), 66 | torch.nn.Conv1d(channels, 67 | channels, 68 | kernel_size, 69 | 1, 70 | dilation=dilation, 71 | bias=bias, 72 | padding=(kernel_size - 1) // 2 * dilation, ), )] 73 | if use_additional_convs: 74 | self.convs2 += [torch.nn.Sequential(getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params), 75 | torch.nn.Conv1d(channels, 76 | channels, 77 | kernel_size, 78 | 1, 79 | dilation=1, 80 | bias=bias, 81 | padding=(kernel_size - 1) // 2, ), )] 82 | 83 | def forward(self, x): 84 | """ 85 | Calculate forward propagation. 86 | 87 | Args: 88 | x (Tensor): Input tensor (B, channels, T). 89 | 90 | Returns: 91 | Tensor: Output tensor (B, channels, T). 
92 | """ 93 | for idx in range(len(self.convs1)): 94 | xt = self.convs1[idx](x) 95 | if self.use_additional_convs: 96 | xt = self.convs2[idx](xt) 97 | x = xt + x 98 | return x 99 | -------------------------------------------------------------------------------- /Modules/ToucanTTS/EnergyCalculator.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Nagoya University (Tomoki Hayashi) 2 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 3 | # Adapted by Florian Lux 2021 4 | 5 | import torch 6 | import torch.nn.functional as F 7 | 8 | from Modules.GeneralLayers.STFT import STFT 9 | from Utility.utils import pad_list 10 | 11 | 12 | class EnergyCalculator(torch.nn.Module): 13 | 14 | def __init__(self, fs=16000, n_fft=1024, win_length=None, hop_length=256, window="hann", center=True, 15 | normalized=False, onesided=True, use_token_averaged_energy=True, reduction_factor=1): 16 | super().__init__() 17 | 18 | self.fs = fs 19 | self.n_fft = n_fft 20 | self.hop_length = hop_length 21 | self.win_length = win_length 22 | self.window = window 23 | self.use_token_averaged_energy = use_token_averaged_energy 24 | if use_token_averaged_energy: 25 | assert reduction_factor >= 1 26 | self.reduction_factor = reduction_factor 27 | 28 | self.stft = STFT(n_fft=n_fft, win_length=win_length, hop_length=hop_length, window=window, center=center, normalized=normalized, onesided=onesided) 29 | 30 | def output_size(self): 31 | return 1 32 | 33 | def get_parameters(self): 34 | return dict(fs=self.fs, n_fft=self.n_fft, hop_length=self.hop_length, window=self.window, win_length=self.win_length, center=self.stft.center, 35 | normalized=self.stft.normalized, use_token_averaged_energy=self.use_token_averaged_energy, reduction_factor=self.reduction_factor) 36 | 37 | def forward(self, input_waves, input_waves_lengths=None, feats_lengths=None, durations=None, 38 | durations_lengths=None, norm_by_average=True, text=None): 39 | # If not provided, we assume that the inputs have the same length 40 | if input_waves_lengths is None: 41 | input_waves_lengths = (input_waves.new_ones(input_waves.shape[0], dtype=torch.long) * input_waves.shape[1]) 42 | 43 | # Domain-conversion: e.g. 
Stft: time -> time-freq 44 | input_stft, energy_lengths = self.stft(input_waves, input_waves_lengths) 45 | 46 | assert input_stft.dim() >= 4, input_stft.shape 47 | assert input_stft.shape[-1] == 2, input_stft.shape 48 | 49 | # input_stft: (..., F, 2) -> (..., F) 50 | input_power = input_stft[..., 0] ** 2 + input_stft[..., 1] ** 2 51 | # sum over frequency (B, N, F) -> (B, N) 52 | energy = torch.sqrt(torch.clamp(input_power.sum(dim=2), min=1.0e-10)) 53 | 54 | # (Optional): Adjust length to match with the features 55 | if feats_lengths is not None: 56 | energy = [self._adjust_num_frames(e[:el].view(-1), fl) for e, el, fl in zip(energy, energy_lengths, feats_lengths)] 57 | energy_lengths = feats_lengths 58 | 59 | # (Optional): Average by duration to calculate token-wise energy 60 | if self.use_token_averaged_energy: 61 | energy = [self._average_by_duration(e[:el].view(-1), d, text) for e, el, d in zip(energy, energy_lengths, durations)] 62 | energy_lengths = durations_lengths 63 | 64 | # Padding 65 | if isinstance(energy, list): 66 | energy = pad_list(energy, 0.0) 67 | 68 | if norm_by_average: 69 | average = energy[0][energy[0] != 0.0].mean() 70 | energy = energy / average 71 | 72 | # Return with the shape (B, T, 1) 73 | return energy.unsqueeze(-1), energy_lengths 74 | 75 | def _average_by_duration(self, x, d, text=None): 76 | d_cumsum = F.pad(d.cumsum(dim=0), (1, 0)) 77 | x_avg = [x[start:end].mean() if len(x[start:end]) != 0 else x.new_tensor(0.0) for start, end in zip(d_cumsum[:-1], d_cumsum[1:])] 78 | 79 | # find tokens that are not phoneme and set energy to 0 80 | # while this makes sense, it make sit harder to model, so we leave this out 81 | # if text is not None: 82 | # for i, vector in enumerate(text): 83 | # if vector[get_feature_to_index_lookup()["phoneme"]] == 0: 84 | # x_avg[i] = torch.tensor(0.0, device=x.device) 85 | 86 | return torch.stack(x_avg) 87 | 88 | @staticmethod 89 | def _adjust_num_frames(x, num_frames): 90 | if num_frames > len(x): 91 | x = F.pad(x, (0, num_frames - len(x))) 92 | elif num_frames < len(x): 93 | x = x[:num_frames] 94 | return x 95 | -------------------------------------------------------------------------------- /Utility/silence_removal.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import soundfile as sf 3 | import torch 4 | from tqdm import tqdm 5 | 6 | from Preprocessing.TextFrontend import get_feature_to_index_lookup 7 | from Utility.path_to_transcript_dicts import * 8 | 9 | 10 | def make_silence_cleaned_versions(train_sets): 11 | torch.hub._validate_not_a_forked_repo = lambda a, b, c: True # torch 1.9 has a bug in the hub loading, this is a workaround 12 | # careful: assumes 16kHz or 8kHz audio 13 | silero_model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad', 14 | model='silero_vad', 15 | force_reload=False, 16 | onnx=False, 17 | verbose=False) 18 | (get_speech_timestamps, 19 | save_audio, 20 | read_audio, 21 | VADIterator, 22 | collect_chunks) = utils 23 | torch.set_grad_enabled(True) # finding this issue was very infuriating: silero sets 24 | # this to false globally during model loading rather than using inference mode or no_grad 25 | device = "cuda" if torch.cuda.is_available() else "cpu" 26 | silero_model = silero_model.to(device) 27 | 28 | for train_set in train_sets: 29 | for index in tqdm(range(len(train_set))): 30 | filepath = train_set.datapoints[index][8] 31 | phonemes = train_set.datapoints[index][0] 32 | speech_length = train_set.datapoints[index][3] 33 | 
durations = train_set.datapoints[index][4] 34 | cumsum = 0 35 | legal_silences = list() 36 | for phoneme_index, phone in enumerate(phonemes): 37 | if phone[get_feature_to_index_lookup()["silence"]] == 1 or phone[get_feature_to_index_lookup()["end of sentence"]] == 1 or phone[get_feature_to_index_lookup()["questionmark"]] == 1 or phone[get_feature_to_index_lookup()["exclamationmark"]] == 1 or phone[get_feature_to_index_lookup()["fullstop"]] == 1: 38 | legal_silences.append([cumsum, cumsum + durations[phoneme_index]]) 39 | cumsum = cumsum + durations[phoneme_index] 40 | wave, sr = sf.read(filepath) 41 | resampled_wave = librosa.resample(wave, orig_sr=sr, target_sr=16000) 42 | with torch.inference_mode(): 43 | speech_timestamps = get_speech_timestamps(torch.Tensor(resampled_wave).to(device), silero_model, sampling_rate=16000) 44 | silences = list() 45 | prev_end = 0 46 | for speech_segment in speech_timestamps: 47 | if prev_end != 0: 48 | silences.append([prev_end, speech_segment["start"]]) 49 | prev_end = speech_segment["end"] 50 | # at this point we know all the silences and we know the legal silences. 51 | # We have to transform them both into ratios, so we can compare them. 52 | # If a silence overlaps with a legal silence, it can stay. 53 | illegal_silences = list() 54 | for silence in silences: 55 | illegal = True 56 | start = silence[0] / len(resampled_wave) 57 | end = silence[1] / len(resampled_wave) 58 | for legal_silence in legal_silences: 59 | legal_start = legal_silence[0] / speech_length 60 | legal_end = legal_silence[1] / speech_length 61 | if legal_start < start < legal_end or legal_start < end < legal_end: 62 | illegal = False 63 | break 64 | if illegal: 65 | # If it is an illegal silence, it is marked for removal in the original wave, using its ratio positions mapped back to the real sampling rate. 66 | illegal_silences.append([start, end]) 67 | 68 | # print(f"{len(illegal_silences)} illegal silences detected. ({len(silences) - len(illegal_silences)} legal silences left)") 69 | wave = list(wave) 70 | orig_wave_length = len(wave) 71 | for illegal_silence in reversed(illegal_silences): 72 | wave = wave[:int(illegal_silence[0] * orig_wave_length)] + wave[int(illegal_silence[1] * orig_wave_length):] 73 | # Audio with illegal silences removed will be saved into a new directory. 74 | new_filepath_list = filepath.split("/") 75 | new_filepath_list[-2] = new_filepath_list[-2] + "_silence_removed" 76 | os.makedirs("/".join(new_filepath_list[:-1]), exist_ok=True) 77 | sf.write("/".join(new_filepath_list), wave, sr) 78 | -------------------------------------------------------------------------------- /Modules/Vocoder/BigVGAN.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 NVIDIA CORPORATION. 2 | # Licensed under the MIT license. 3 | 4 | # Adapted from https://github.com/jik876/hifi-gan under the MIT license. 5 | 6 | import torch 7 | from alias_free_torch import Activation1d 8 | from torch.nn import Conv1d 9 | from torch.nn import ConvTranspose1d 10 | from torch.nn import ModuleList 11 | from torch.nn.utils import remove_weight_norm 12 | from torch.nn.utils import weight_norm 13 | 14 | from Modules.Vocoder.AMP import AMPBlock1 15 | from Modules.Vocoder.Snake import SnakeBeta 16 | 17 | 18 | class BigVGAN(torch.nn.Module): 19 | # this is the main BigVGAN model. Applies anti-aliased periodic activation for resblocks. 
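    # Reading aid (an editorial summary of the code below, not part of the original implementation):
    # a weight-normed pre-conv projects the spectrogram into the channel dimension, transposed
    # convolutions upsample it towards waveform rate, AMP residual blocks with anti-aliased Snake
    # activations refine each scale, and a final Snake activation plus post-conv with tanh yields the waveform.
    # A usage sketch mirroring the self-test at the bottom of this file:
    #   vocoder = BigVGAN()
    #   wave = vocoder(torch.randn([1, 128, 100]))  # (batch, num_mels, frames) -> (batch, 1, samples)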
20 | 21 | def __init__(self, 22 | num_mels=128, 23 | upsample_initial_channel=1024, 24 | upsample_rates=(8, 6, 2, 2, 2), # CAREFUL: Avocodo discriminator assumes that there are always 4 upsample scales, because it takes intermediate results. 25 | upsample_kernel_sizes=(16, 12, 4, 4, 4), 26 | resblock_kernel_sizes=(3, 7, 11), 27 | resblock_dilation_sizes=((1, 3, 5), (1, 3, 5), (1, 3, 5)), 28 | weights=None 29 | ): 30 | super(BigVGAN, self).__init__() 31 | 32 | self.num_kernels = len(resblock_kernel_sizes) 33 | self.num_upsamples = len(upsample_rates) 34 | 35 | # pre conv 36 | self.conv_pre = weight_norm(Conv1d(num_mels, upsample_initial_channel, 7, 1, padding=3)) 37 | 38 | # transposed conv-based upsamplers. does not apply anti-aliasing 39 | self.ups = ModuleList() 40 | for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): 41 | self.ups.append(ModuleList([ 42 | weight_norm(ConvTranspose1d(upsample_initial_channel // (2 ** i), 43 | upsample_initial_channel // (2 ** (i + 1)), 44 | k, u, padding=(k - u) // 2)) 45 | ])) 46 | 47 | # residual blocks using anti-aliased multi-periodicity composition modules (AMP) 48 | self.resblocks = ModuleList() 49 | for i in range(len(self.ups)): 50 | ch = upsample_initial_channel // (2 ** (i + 1)) 51 | for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)): 52 | self.resblocks.append(AMPBlock1(ch, k, d)) 53 | 54 | # post conv 55 | activation_post = SnakeBeta(ch, alpha_logscale=True) 56 | self.activation_post = Activation1d(activation=activation_post) 57 | 58 | self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3)) 59 | 60 | # weight initialization 61 | for i in range(len(self.ups)): 62 | self.ups[i].apply(init_weights) 63 | self.conv_post.apply(init_weights) 64 | 65 | if weights is not None: 66 | self.load_state_dict(weights) 67 | 68 | def forward(self, x): 69 | # pre conv 70 | x = self.conv_pre(x) 71 | 72 | for i in range(self.num_upsamples): 73 | # upsampling 74 | for i_up in range(len(self.ups[i])): 75 | x = self.ups[i][i_up](x) 76 | # AMP blocks 77 | xs = None 78 | for j in range(self.num_kernels): 79 | if xs is None: 80 | xs = self.resblocks[i * self.num_kernels + j](x) 81 | else: 82 | xs += self.resblocks[i * self.num_kernels + j](x) 83 | x = xs / self.num_kernels 84 | 85 | # post conv 86 | x = self.activation_post(x) 87 | x = self.conv_post(x) 88 | x = torch.tanh(x) 89 | 90 | return x 91 | 92 | def remove_weight_norm(self): 93 | print('Removing weight norm...') 94 | for l in self.ups: 95 | for l_i in l: 96 | remove_weight_norm(l_i) 97 | for l in self.resblocks: 98 | l.remove_weight_norm() 99 | remove_weight_norm(self.conv_pre) 100 | remove_weight_norm(self.conv_post) 101 | 102 | 103 | def init_weights(m, mean=0.0, std=0.01): 104 | classname = m.__class__.__name__ 105 | if classname.find("Conv") != -1: 106 | m.weight.data.normal_(mean, std) 107 | 108 | 109 | def apply_weight_norm(m): 110 | classname = m.__class__.__name__ 111 | if classname.find("Conv") != -1: 112 | weight_norm(m) 113 | 114 | 115 | def get_padding(kernel_size, dilation=1): 116 | return int((kernel_size * dilation - dilation) / 2) 117 | 118 | 119 | if __name__ == '__main__': 120 | vgan = BigVGAN() 121 | print(f"BigVGAN parameter count: {sum(p.numel() for p in vgan.parameters() if p.requires_grad)}") 122 | print(BigVGAN()(torch.randn([1, 128, 100])).shape) 123 | -------------------------------------------------------------------------------- /Modules/ToucanTTS/toucantts_train_loop_arbiter.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from Modules.ToucanTTS.toucantts_meta_train_loop import train_loop as multi_language_loop 4 | from Modules.ToucanTTS.toucantts_train_loop import train_loop as mono_language_loop 5 | 6 | 7 | def train_loop(net, # an already initialized ToucanTTS model that should be trained. 8 | datasets, 9 | # a list of datasets to train on. Every dataset within a language should already be a concat dataset of all the datasets 10 | # in that language. So every list entry here should be a (combined) dataset for each language. For the case of a monolingual model, pass a list 11 | # with only one dataset in it. This will trigger the arbiter to call the train loop for simple one language training runs rather than the complex 12 | # LAML based one. 13 | train_samplers, # the sampler(s) for the dataloader(s) (gpu_count or single GPU use different ones) 14 | gpu_count, # amount of GPUs to use 15 | device, # the device where this training should run on. 16 | save_directory, # directory where the models and visualizations should be saved. 17 | steps_per_checkpoint=None, # how many steps should be trained before a checkpoint is created. This is only relevant for the multilingual case, 18 | # the monolingual case will do this once per epoch, regardless of the steps. 19 | path_to_checkpoint=None, # path to a trained checkpoint to either continue training or fine-tune from. 20 | lr=0.0001, # learning rate of the model. 21 | resume=False, # whether to automatically load the most recent checkpoint and resume training from it. 22 | warmup_steps=4000, # how many steps until the learning rate reaches the specified value and starts decreasing again. 23 | use_wandb=False, # whether to use online experiment tracking with weights and biases. Requires prior CLI login. 24 | batch_size=32, # how many samples to put into one batch. Higher batch size is more stable, but requires more VRAM. 25 | eval_lang="eng", # in which language the evaluation sentence is to be plotted. 26 | fine_tune=False, # whether to use the provided checkpoint as basis for fine-tuning. 27 | steps=200000, # how many updates to run until training is completed 28 | use_less_loss=False, # whether to use the loss that enforces a structure in the language embedding space 29 | freeze_lang_embs=False, # whether to use the language embeddings from a checkpoint without modifying them, to maintain compatibility with the zero-shot method. This treats language embeddings from the given checkpoint as constants. 
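               # A minimal, hypothetical call for the single-language case (dataset, sampler and save path
               # here are placeholders, not names from this repository):
               #   train_loop(net=ToucanTTS(), datasets=[my_dataset],
               #              train_samplers=[torch.utils.data.RandomSampler(my_dataset)],
               #              gpu_count=1, device=torch.device("cuda"), save_directory="Models/my_run")
               # A single dataset in the list makes the arbiter dispatch to the monolingual loop below;
               # more than one entry triggers the multilingual (LAML-based) loop.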
30 | ): 31 | torch.multiprocessing.set_start_method('spawn', force=True) 32 | if type(datasets) != list: 33 | datasets = [datasets] 34 | if len(datasets) > 1: 35 | multi_language_loop(net=net, 36 | datasets=datasets, 37 | train_samplers=train_samplers, 38 | device=device, 39 | save_directory=save_directory, 40 | batch_size=batch_size, 41 | steps=steps, 42 | steps_per_checkpoint=steps_per_checkpoint, 43 | lr=lr, 44 | lang=eval_lang, 45 | path_to_checkpoint=path_to_checkpoint, 46 | resume=resume, 47 | fine_tune=fine_tune, 48 | warmup_steps=warmup_steps, 49 | use_wandb=use_wandb, 50 | gpu_count=gpu_count, 51 | use_less_loss=use_less_loss, 52 | freeze_lang_embs=freeze_lang_embs 53 | ) 54 | else: 55 | mono_language_loop(net=net, 56 | train_dataset=datasets[0], 57 | train_sampler=train_samplers[0], 58 | device=device, 59 | save_directory=save_directory, 60 | batch_size=batch_size, 61 | lang=eval_lang, 62 | lr=lr, 63 | warmup_steps=warmup_steps, 64 | path_to_checkpoint=path_to_checkpoint, 65 | fine_tune=fine_tune, 66 | resume=resume, 67 | steps=steps, 68 | use_wandb=use_wandb, 69 | gpu_count=gpu_count, 70 | steps_per_checkpoint=steps_per_checkpoint 71 | ) 72 | -------------------------------------------------------------------------------- /Recipes/ToucanTTS_Massive_German.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import torch 4 | import wandb 5 | 6 | from Utility.path_to_transcript_dicts import * 7 | 8 | 9 | def run(gpu_id, resume_checkpoint, finetune, model_dir, resume, use_wandb, wandb_resume_id, gpu_count): 10 | from torch.utils.data import ConcatDataset 11 | 12 | from Modules.ToucanTTS.ToucanTTS import ToucanTTS 13 | from Modules.ToucanTTS.toucantts_train_loop_arbiter import train_loop 14 | from Utility.corpus_preparation import prepare_tts_corpus 15 | from Utility.storage_config import MODEL_DIR 16 | from Utility.storage_config import PREPROCESSING_DIR 17 | 18 | if gpu_id == "cpu": 19 | device = torch.device("cpu") 20 | else: 21 | device = torch.device("cuda") 22 | 23 | print("Preparing") 24 | 25 | if model_dir is not None: 26 | save_dir = model_dir 27 | else: 28 | save_dir = os.path.join(MODEL_DIR, "ToucanTTS_German_refined") 29 | os.makedirs(save_dir, exist_ok=True) 30 | 31 | if gpu_count > 1: 32 | rank = int(os.environ["LOCAL_RANK"]) 33 | torch.cuda.set_device(rank) 34 | torch.distributed.init_process_group(backend="nccl") 35 | else: 36 | rank = 0 37 | 38 | datasets = list() 39 | 40 | datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_karlsson, 41 | corpus_dir=os.path.join(PREPROCESSING_DIR, "Karlsson"), 42 | lang="deu", 43 | gpu_count=gpu_count, 44 | rank=rank)) 45 | 46 | datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_eva, 47 | corpus_dir=os.path.join(PREPROCESSING_DIR, "Eva"), 48 | lang="deu", 49 | gpu_count=gpu_count, 50 | rank=rank)) 51 | 52 | datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_hokus, 53 | corpus_dir=os.path.join(PREPROCESSING_DIR, "Hokus"), 54 | lang="deu", 55 | gpu_count=gpu_count, 56 | rank=rank)) 57 | 58 | datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_bernd, 59 | corpus_dir=os.path.join(PREPROCESSING_DIR, "Bernd"), 60 | lang="deu", 61 | gpu_count=gpu_count, 62 | rank=rank)) 63 | 64 | datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_friedrich, 65 | corpus_dir=os.path.join(PREPROCESSING_DIR, "Friedrich"), 66 | lang="deu", 67 | gpu_count=gpu_count, 68 | rank=rank)) 69 
| 70 | datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_hui_others, 71 | corpus_dir=os.path.join(PREPROCESSING_DIR, "hui_others"), 72 | lang="deu", 73 | gpu_count=gpu_count, 74 | rank=rank)) 75 | 76 | train_set = ConcatDataset(datasets) 77 | 78 | model = ToucanTTS() 79 | 80 | if gpu_count > 1: 81 | model.to(rank) 82 | model = torch.nn.parallel.DistributedDataParallel( 83 | model, 84 | device_ids=[rank], 85 | output_device=rank, 86 | find_unused_parameters=True, 87 | ) 88 | torch.distributed.barrier() 89 | train_sampler = torch.utils.data.RandomSampler(train_set) 90 | 91 | if use_wandb: 92 | if rank == 0: 93 | wandb.init( 94 | name=f"{__name__.split('.')[-1]}_{time.strftime('%Y%m%d-%H%M%S')}" if wandb_resume_id is None else None, 95 | id=wandb_resume_id, # this is None if not specified in the command line arguments. 96 | resume="must" if wandb_resume_id is not None else None) 97 | print("Training model") 98 | train_loop(net=model, 99 | datasets=[train_set], 100 | batch_size=12, 101 | steps_per_checkpoint=1000, 102 | device=device, 103 | save_directory=save_dir, 104 | eval_lang="deu", 105 | path_to_checkpoint=resume_checkpoint, 106 | fine_tune=finetune, 107 | resume=resume, 108 | use_wandb=use_wandb, 109 | train_samplers=[train_sampler], 110 | gpu_count=gpu_count) 111 | if use_wandb: 112 | wandb.finish() 113 | -------------------------------------------------------------------------------- /Modules/GeneralLayers/STFT.py: -------------------------------------------------------------------------------- 1 | """ 2 | Taken from ESPNet 3 | """ 4 | 5 | import torch 6 | from torch.functional import stft as torch_stft 7 | from torch_complex.tensor import ComplexTensor 8 | 9 | from Utility.utils import make_pad_mask 10 | 11 | 12 | class STFT(torch.nn.Module): 13 | 14 | def __init__(self, n_fft=512, 15 | win_length=None, 16 | hop_length=128, 17 | window="hann", 18 | center=True, 19 | normalized=False, 20 | onesided=True): 21 | super().__init__() 22 | self.n_fft = n_fft 23 | if win_length is None: 24 | self.win_length = n_fft 25 | else: 26 | self.win_length = win_length 27 | self.hop_length = hop_length 28 | self.center = center 29 | self.normalized = normalized 30 | self.onesided = onesided 31 | self.window = window 32 | 33 | def extra_repr(self): 34 | return (f"n_fft={self.n_fft}, " 35 | f"win_length={self.win_length}, " 36 | f"hop_length={self.hop_length}, " 37 | f"center={self.center}, " 38 | f"normalized={self.normalized}, " 39 | f"onesided={self.onesided}") 40 | 41 | def forward(self, input_wave, ilens=None): 42 | """ 43 | STFT forward function. 
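        Uses torch.stft with the configured window and returns the real and imaginary parts stacked
        in the last dimension; if ilens is given, the number of valid frames is derived from it and
        padded frames are masked with zeros.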
44 | Args: 45 | input_wave: (Batch, Nsamples) or (Batch, Nsample, Channels) 46 | ilens: (Batch) 47 | Returns: 48 | output: (Batch, Frames, Freq, 2) or (Batch, Frames, Channels, Freq, 2) 49 | """ 50 | bs = input_wave.size(0) 51 | 52 | if input_wave.dim() == 3: 53 | multi_channel = True 54 | # input: (Batch, Nsample, Channels) -> (Batch * Channels, Nsample) 55 | input_wave = input_wave.transpose(1, 2).reshape(-1, input_wave.size(1)) 56 | else: 57 | multi_channel = False 58 | 59 | # output: (Batch, Freq, Frames, 2=real_imag) 60 | # or (Batch, Channel, Freq, Frames, 2=real_imag) 61 | if self.window is not None: 62 | window_func = getattr(torch, f"{self.window}_window") 63 | window = window_func(self.win_length, dtype=input_wave.dtype, device=input_wave.device) 64 | else: 65 | window = None 66 | 67 | complex_output = torch_stft(input=input_wave, 68 | n_fft=self.n_fft, 69 | win_length=self.win_length, 70 | hop_length=self.hop_length, 71 | center=self.center, 72 | window=window, 73 | normalized=self.normalized, 74 | onesided=self.onesided, 75 | return_complex=True) 76 | output = torch.view_as_real(complex_output) 77 | # output: (Batch, Freq, Frames, 2=real_imag) 78 | # -> (Batch, Frames, Freq, 2=real_imag) 79 | output = output.transpose(1, 2) 80 | if multi_channel: 81 | # output: (Batch * Channel, Frames, Freq, 2=real_imag) 82 | # -> (Batch, Frame, Channel, Freq, 2=real_imag) 83 | output = output.view(bs, -1, output.size(1), output.size(2), 2).transpose(1, 2) 84 | 85 | if ilens is not None: 86 | if self.center: 87 | pad = self.win_length // 2 88 | ilens = ilens + 2 * pad 89 | 90 | olens = torch.div((ilens - self.win_length), self.hop_length, rounding_mode='trunc') + 1 91 | output.masked_fill_(make_pad_mask(olens, output, 1), 0.0) 92 | else: 93 | olens = None 94 | 95 | return output, olens 96 | 97 | def inverse(self, input, ilens=None): 98 | """ 99 | Inverse STFT. 
100 | Args: 101 | input: Tensor(batch, T, F, 2) or ComplexTensor(batch, T, F) 102 | ilens: (batch,) 103 | Returns: 104 | wavs: (batch, samples) 105 | ilens: (batch,) 106 | """ 107 | istft = torch.functional.istft 108 | 109 | if self.window is not None: 110 | window_func = getattr(torch, f"{self.window}_window") 111 | window = window_func(self.win_length, dtype=input.dtype, device=input.device) 112 | else: 113 | window = None 114 | 115 | if isinstance(input, ComplexTensor): 116 | input = torch.stack([input.real, input.imag], dim=-1) 117 | assert input.shape[-1] == 2 118 | input = input.transpose(1, 2) 119 | 120 | wavs = istft(input, n_fft=self.n_fft, hop_length=self.hop_length, win_length=self.win_length, window=window, center=self.center, 121 | normalized=self.normalized, onesided=self.onesided, length=ilens.max() if ilens is not None else ilens) 122 | 123 | return wavs, ilens 124 | -------------------------------------------------------------------------------- /Modules/ToucanTTS/PitchCalculator.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Nagoya University (Tomoki Hayashi) 2 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 3 | # Adapted by Florian Lux 2021 4 | 5 | import math 6 | 7 | import numpy as np 8 | import parselmouth 9 | import torch 10 | import torch.nn.functional as F 11 | from scipy.interpolate import interp1d 12 | 13 | 14 | class Parselmouth(torch.nn.Module): 15 | """ 16 | F0 estimation with Parselmouth https://parselmouth.readthedocs.io/en/stable/index.html 17 | """ 18 | 19 | def __init__(self, fs=16000, n_fft=1024, hop_length=256, f0min=40, f0max=600, use_token_averaged_f0=True, 20 | use_continuous_f0=True, use_log_f0=False, reduction_factor=1): 21 | super().__init__() 22 | self.fs = fs 23 | self.n_fft = n_fft 24 | self.hop_length = hop_length 25 | self.frame_period = 1000 * hop_length / fs 26 | self.f0min = f0min 27 | self.f0max = f0max 28 | self.use_token_averaged_f0 = use_token_averaged_f0 29 | self.use_continuous_f0 = use_continuous_f0 30 | self.use_log_f0 = use_log_f0 31 | if use_token_averaged_f0: 32 | assert reduction_factor >= 1 33 | self.reduction_factor = reduction_factor 34 | 35 | def output_size(self): 36 | return 1 37 | 38 | def get_parameters(self): 39 | return dict(fs=self.fs, n_fft=self.n_fft, hop_length=self.hop_length, f0min=self.f0min, f0max=self.f0max, 40 | use_token_averaged_f0=self.use_token_averaged_f0, use_continuous_f0=self.use_continuous_f0, use_log_f0=self.use_log_f0, 41 | reduction_factor=self.reduction_factor) 42 | 43 | def forward(self, input_waves, input_waves_lengths=None, feats_lengths=None, durations=None, 44 | durations_lengths=None, norm_by_average=True, text=None): 45 | 46 | # F0 extraction 47 | pitch = self._calculate_f0(input_waves[0]) 48 | 49 | # Adjust length to match with the feature sequences 50 | pitch = self._adjust_num_frames(pitch, feats_lengths[0]).view(-1) 51 | 52 | pitch = self._average_by_duration(pitch, durations[0], text).view(-1) 53 | pitch_lengths = durations_lengths 54 | 55 | if norm_by_average: 56 | average = pitch[pitch != 0.0].mean() 57 | pitch = pitch / average 58 | 59 | # Return with the shape (B, T, 1) 60 | return pitch.unsqueeze(-1), pitch_lengths 61 | 62 | def _calculate_f0(self, input): 63 | x = input.cpu().numpy().astype(np.double) 64 | snd = parselmouth.Sound(values=x, sampling_frequency=self.fs) 65 | f0 = snd.to_pitch(time_step=self.hop_length / self.fs, pitch_floor=self.f0min, pitch_ceiling=self.f0max).selected_array['frequency'] 66 | 
if self.use_continuous_f0: 67 | f0 = self._convert_to_continuous_f0(f0) 68 | if self.use_log_f0: 69 | nonzero_idxs = np.where(f0 != 0)[0] 70 | f0[nonzero_idxs] = np.log(f0[nonzero_idxs]) 71 | return input.new_tensor(f0.reshape(-1), dtype=torch.float) 72 | 73 | @staticmethod 74 | def _adjust_num_frames(x, num_frames): 75 | if num_frames > len(x): 76 | # x = F.pad(x, (0, num_frames - len(x))) 77 | x = F.pad(x, (math.ceil((num_frames - len(x)) / 2), math.floor((num_frames - len(x)) / 2))) 78 | elif num_frames < len(x): 79 | x = x[:num_frames] 80 | return x 81 | 82 | @staticmethod 83 | def _convert_to_continuous_f0(f0: np.array): 84 | if (f0 == 0).all(): 85 | return f0 86 | 87 | # padding start and end of f0 sequence 88 | start_f0 = f0[f0 != 0][0] 89 | end_f0 = f0[f0 != 0][-1] 90 | start_idx = np.where(f0 == start_f0)[0][0] 91 | end_idx = np.where(f0 == end_f0)[0][-1] 92 | f0[:start_idx] = start_f0 93 | f0[end_idx:] = end_f0 94 | 95 | # get non-zero frame index 96 | nonzero_idxs = np.where(f0 != 0)[0] 97 | 98 | # perform linear interpolation 99 | interp_fn = interp1d(nonzero_idxs, f0[nonzero_idxs]) 100 | f0 = interp_fn(np.arange(0, f0.shape[0])) 101 | 102 | return f0 103 | 104 | def _average_by_duration(self, x, d, text=None): 105 | d_cumsum = F.pad(d.cumsum(dim=0), (1, 0)) 106 | x_avg = [ 107 | x[start:end].masked_select(x[start:end].gt(0.0)).mean(dim=0) if len(x[start:end].masked_select(x[start:end].gt(0.0))) != 0 else x.new_tensor(0.0) 108 | for start, end in zip(d_cumsum[:-1], d_cumsum[1:])] 109 | 110 | # find tokens that are not voiced and set pitch to 0 111 | # while this makes sense, it makes it harder for the model to learn, so we leave this out now. 112 | # if text is not None: 113 | # for i, vector in enumerate(text): 114 | # if vector[get_feature_to_index_lookup()["voiced"]] == 0: 115 | # x_avg[i] = torch.tensor(0.0, device=x.device) 116 | 117 | return torch.stack(x_avg) 118 | -------------------------------------------------------------------------------- /Recipes/finetuning_example_multilingual.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example script for fine-tuning the pretrained model to your own data. 3 | 4 | Comments in ALL CAPS are instructions 5 | """ 6 | 7 | import time 8 | 9 | import torch 10 | import wandb 11 | 12 | from Utility.path_to_transcript_dicts import * 13 | 14 | 15 | def run(gpu_id, resume_checkpoint, finetune, model_dir, resume, use_wandb, wandb_resume_id, gpu_count): 16 | from huggingface_hub import hf_hub_download 17 | from torch.utils.data import ConcatDataset 18 | 19 | from Modules.ToucanTTS.ToucanTTS import ToucanTTS 20 | from Modules.ToucanTTS.toucantts_train_loop_arbiter import train_loop 21 | from Utility.corpus_preparation import prepare_tts_corpus 22 | from Utility.storage_config import MODEL_DIR 23 | from Utility.storage_config import PREPROCESSING_DIR 24 | 25 | if gpu_id == "cpu": 26 | device = torch.device("cpu") 27 | else: 28 | device = torch.device("cuda") 29 | assert gpu_count == 1 # distributed finetuning is not supported 30 | 31 | # IF YOU'RE ADDING A NEW LANGUAGE, YOU MIGHT NEED TO ADD HANDLING FOR IT IN Preprocessing/TextFrontend.py 32 | 33 | print("Preparing") 34 | 35 | if model_dir is not None: 36 | save_dir = model_dir 37 | else: 38 | save_dir = os.path.join(MODEL_DIR, "ToucanTTS_German_and_English") # RENAME TO SOMETHING MEANINGFUL FOR YOUR DATA 39 | os.makedirs(save_dir, exist_ok=True) 40 | 41 | all_train_sets = list() # YOU CAN HAVE MULTIPLE LANGUAGES, OR JUST ONE. 
JUST MAKE ONE ConcatDataset PER LANGUAGE AND ADD IT TO THE LIST. 42 | train_samplers = list() 43 | 44 | # ======================= 45 | # = German Data = 46 | # ======================= 47 | german_datasets = list() 48 | german_datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_karlsson(), 49 | corpus_dir=os.path.join(PREPROCESSING_DIR, "Karlsson"), 50 | lang="deu")) # CHANGE THE TRANSCRIPT DICT, THE NAME OF THE CACHE DIRECTORY AND THE LANGUAGE TO YOUR NEEDS 51 | 52 | german_datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_eva(), 53 | corpus_dir=os.path.join(PREPROCESSING_DIR, "Eva"), 54 | lang="deu")) # YOU CAN SIMPLY ADD MORE CORPORA AND DO THE SAME, BUT YOU DON'T HAVE TO, ONE IS ENOUGH 55 | 56 | all_train_sets.append(ConcatDataset(german_datasets)) 57 | 58 | # ======================== 59 | # = English Data = 60 | # ======================== 61 | english_datasets = list() 62 | english_datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_nancy(), 63 | corpus_dir=os.path.join(PREPROCESSING_DIR, "Nancy"), 64 | lang="eng")) 65 | 66 | english_datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_ljspeech(), 67 | corpus_dir=os.path.join(PREPROCESSING_DIR, "LJSpeech"), 68 | lang="eng")) 69 | 70 | all_train_sets.append(ConcatDataset(english_datasets)) 71 | 72 | model = ToucanTTS() 73 | 74 | for train_set in all_train_sets: 75 | train_samplers.append(torch.utils.data.RandomSampler(train_set)) 76 | 77 | if use_wandb: 78 | wandb.init( 79 | name=f"{__name__.split('.')[-1]}_{time.strftime('%Y%m%d-%H%M%S')}" if wandb_resume_id is None else None, 80 | id=wandb_resume_id, # this is None if not specified in the command line arguments. 81 | resume="must" if wandb_resume_id is not None else None) 82 | 83 | print("Training model") 84 | train_loop(net=model, 85 | datasets=all_train_sets, 86 | device=device, 87 | save_directory=save_dir, 88 | batch_size=12, # YOU MIGHT GET OUT OF MEMORY ISSUES ON SMALL GPUs, IF SO, DECREASE THIS. 89 | eval_lang="deu", # THE LANGUAGE YOUR PROGRESS PLOTS WILL BE MADE IN 90 | warmup_steps=500, 91 | lr=1e-5, # if you have enough data (over ~1000 datapoints) you can increase this up to 1e-4 and it will still be stable, but learn quicker. 
92 | # DOWNLOAD THESE INITIALIZATION MODELS FROM THE RELEASE PAGE OF THE GITHUB OR RUN THE DOWNLOADER SCRIPT TO GET THEM AUTOMATICALLY 93 | path_to_checkpoint=hf_hub_download(cache_dir=MODEL_DIR, repo_id="Flux9665/ToucanTTS", filename="ToucanTTS.pt") if resume_checkpoint is None else resume_checkpoint, 94 | fine_tune=True if resume_checkpoint is None and not resume else finetune, 95 | resume=resume, 96 | steps=5000, 97 | use_wandb=use_wandb, 98 | train_samplers=train_samplers, 99 | gpu_count=1) 100 | if use_wandb: 101 | wandb.finish() 102 | -------------------------------------------------------------------------------- /Modules/GeneralLayers/ConditionalLayerNorm.py: -------------------------------------------------------------------------------- 1 | """ 2 | Code taken from https://github.com/tuanh123789/AdaSpeech/blob/main/model/adaspeech_modules.py 3 | By https://github.com/tuanh123789 4 | No license specified 5 | 6 | Implemented as outlined in AdaSpeech https://arxiv.org/pdf/2103.00993.pdf 7 | Used in this toolkit similar to how it is done in AdaSpeech 4 https://arxiv.org/pdf/2204.00436.pdf 8 | 9 | """ 10 | 11 | import torch 12 | from torch import nn 13 | 14 | 15 | class ConditionalLayerNorm(nn.Module): 16 | 17 | def __init__(self, 18 | hidden_dim, 19 | speaker_embedding_dim, 20 | dim=-1): 21 | super(ConditionalLayerNorm, self).__init__() 22 | self.dim = dim 23 | if isinstance(hidden_dim, int): 24 | self.normal_shape = hidden_dim 25 | self.speaker_embedding_dim = speaker_embedding_dim 26 | self.W_scale = nn.Sequential(nn.Linear(self.speaker_embedding_dim, self.normal_shape), 27 | nn.Tanh(), 28 | nn.Linear(self.normal_shape, self.normal_shape)) 29 | self.W_bias = nn.Sequential(nn.Linear(self.speaker_embedding_dim, self.normal_shape), 30 | nn.Tanh(), 31 | nn.Linear(self.normal_shape, self.normal_shape)) 32 | self.reset_parameters() 33 | 34 | def reset_parameters(self): 35 | torch.nn.init.constant_(self.W_scale[0].weight, 0.0) 36 | torch.nn.init.constant_(self.W_scale[2].weight, 0.0) 37 | torch.nn.init.constant_(self.W_scale[0].bias, 1.0) 38 | torch.nn.init.constant_(self.W_scale[2].bias, 1.0) 39 | torch.nn.init.constant_(self.W_bias[0].weight, 0.0) 40 | torch.nn.init.constant_(self.W_bias[2].weight, 0.0) 41 | torch.nn.init.constant_(self.W_bias[0].bias, 0.0) 42 | torch.nn.init.constant_(self.W_bias[2].bias, 0.0) 43 | 44 | def forward(self, x, speaker_embedding): 45 | 46 | if self.dim != -1: 47 | x = x.transpose(-1, self.dim) 48 | 49 | mean = x.mean(dim=-1, keepdim=True) 50 | var = ((x - mean) ** 2).mean(dim=-1, keepdim=True) 51 | scale = self.W_scale(speaker_embedding) 52 | bias = self.W_bias(speaker_embedding) 53 | 54 | y = scale.unsqueeze(1) * ((x - mean) / var) + bias.unsqueeze(1) 55 | 56 | if self.dim != -1: 57 | y = y.transpose(-1, self.dim) 58 | 59 | return y 60 | 61 | 62 | class SequentialWrappableConditionalLayerNorm(nn.Module): 63 | 64 | def __init__(self, 65 | hidden_dim, 66 | speaker_embedding_dim): 67 | super(SequentialWrappableConditionalLayerNorm, self).__init__() 68 | if isinstance(hidden_dim, int): 69 | self.normal_shape = hidden_dim 70 | self.speaker_embedding_dim = speaker_embedding_dim 71 | self.W_scale = nn.Sequential(nn.Linear(self.speaker_embedding_dim, self.normal_shape), 72 | nn.Tanh(), 73 | nn.Linear(self.normal_shape, self.normal_shape)) 74 | self.W_bias = nn.Sequential(nn.Linear(self.speaker_embedding_dim, self.normal_shape), 75 | nn.Tanh(), 76 | nn.Linear(self.normal_shape, self.normal_shape)) 77 | self.reset_parameters() 78 | 79 | def reset_parameters(self): 
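        # Initialization note: the scale projection starts with zero weights and biases of one, while the
        # bias projection starts entirely at zero, so directly after initialization the module applies a
        # speaker-independent normalization and only learns to condition on the speaker embedding during training.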
80 | torch.nn.init.constant_(self.W_scale[0].weight, 0.0) 81 | torch.nn.init.constant_(self.W_scale[2].weight, 0.0) 82 | torch.nn.init.constant_(self.W_scale[0].bias, 1.0) 83 | torch.nn.init.constant_(self.W_scale[2].bias, 1.0) 84 | torch.nn.init.constant_(self.W_bias[0].weight, 0.0) 85 | torch.nn.init.constant_(self.W_bias[2].weight, 0.0) 86 | torch.nn.init.constant_(self.W_bias[0].bias, 0.0) 87 | torch.nn.init.constant_(self.W_bias[2].bias, 0.0) 88 | 89 | def forward(self, packed_input): 90 | x, speaker_embedding = packed_input 91 | mean = x.mean(dim=-1, keepdim=True) 92 | var = ((x - mean) ** 2).mean(dim=-1, keepdim=True) 93 | scale = self.W_scale(speaker_embedding) 94 | bias = self.W_bias(speaker_embedding) 95 | 96 | y = scale.unsqueeze(1) * ((x - mean) / var) + bias.unsqueeze(1) 97 | 98 | return y 99 | 100 | 101 | class AdaIN1d(nn.Module): 102 | """ 103 | MIT Licensed 104 | 105 | Copyright (c) 2022 Aaron (Yinghao) Li 106 | https://github.com/yl4579/StyleTTS/blob/main/models.py 107 | """ 108 | 109 | def __init__(self, style_dim, num_features): 110 | super().__init__() 111 | self.norm = nn.InstanceNorm1d(num_features, affine=False) 112 | self.fc = nn.Linear(style_dim, num_features * 2) 113 | 114 | def forward(self, x, s): 115 | h = self.fc(s) 116 | h = h.view(h.size(0), h.size(1), 1) 117 | gamma, beta = torch.chunk(h, chunks=2, dim=1) 118 | return (1 + gamma.transpose(1, 2)) * self.norm(x.transpose(1, 2)).transpose(1, 2) + beta.transpose(1, 2) 119 | -------------------------------------------------------------------------------- /Modules/Vocoder/AdversarialLoss.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2021 Tomoki Hayashi 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | 7 | import torch 8 | import torch.nn.functional as F 9 | 10 | 11 | def discriminator_adv_loss(disc_real_outputs, disc_generated_outputs): 12 | loss = 0 13 | for dr, dg in zip(disc_real_outputs, disc_generated_outputs): 14 | dr_fun, dr_dir = dr 15 | dg_fun, dg_dir = dg 16 | r_loss_fun = torch.mean(F.softplus(1 - dr_fun) ** 2) 17 | g_loss_fun = torch.mean(F.softplus(dg_fun) ** 2) 18 | r_loss_dir = torch.mean(F.softplus(1 - dr_dir) ** 2) 19 | g_loss_dir = torch.mean(-F.softplus(1 - dg_dir) ** 2) 20 | r_loss = r_loss_fun + r_loss_dir 21 | g_loss = g_loss_fun + g_loss_dir 22 | loss += (r_loss + g_loss) 23 | 24 | return loss / len(disc_generated_outputs) 25 | 26 | 27 | def generator_adv_loss(disc_outputs): 28 | loss = 0 29 | for dg in disc_outputs: 30 | l = torch.mean(F.softplus(1 - dg) ** 2) 31 | loss += l 32 | 33 | return loss / len(disc_outputs) 34 | 35 | 36 | class GeneratorAdversarialLoss(torch.nn.Module): 37 | 38 | def __init__(self, 39 | average_by_discriminators=True, 40 | loss_type="mse", ): 41 | """Initialize GeneratorAdversarialLoss module.""" 42 | super().__init__() 43 | self.average_by_discriminators = average_by_discriminators 44 | assert loss_type in ["mse", "hinge"], f"{loss_type} is not supported." 45 | if loss_type == "mse": 46 | self.criterion = self._mse_loss 47 | else: 48 | self.criterion = self._hinge_loss 49 | 50 | def forward(self, outputs): 51 | """ 52 | Calculate generator adversarial loss. 53 | 54 | Args: 55 | outputs (Tensor or list): Discriminator outputs or list of 56 | discriminator outputs. 57 | 58 | Returns: 59 | Tensor: Generator adversarial loss value. 
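        Note:
            With the "mse" criterion this is the mean squared error between the discriminator
            outputs and an all-ones target; with "hinge" it is the negated mean of the outputs.
            For a list of discriminator outputs, the per-discriminator losses are summed and, if
            average_by_discriminators is set, divided by the number of discriminators.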
60 | """ 61 | if isinstance(outputs, (tuple, list)): 62 | adv_loss = 0.0 63 | for i, outputs_ in enumerate(outputs): 64 | if isinstance(outputs_, (tuple, list)): 65 | outputs_ = outputs_[-1] 66 | adv_loss = adv_loss + self.criterion(outputs_) 67 | if self.average_by_discriminators: 68 | adv_loss /= i + 1 69 | else: 70 | adv_loss = self.criterion(outputs) 71 | 72 | return adv_loss 73 | 74 | def _mse_loss(self, x): 75 | return F.mse_loss(x, x.new_ones(x.size())) 76 | 77 | def _hinge_loss(self, x): 78 | return -x.mean() 79 | 80 | 81 | class DiscriminatorAdversarialLoss(torch.nn.Module): 82 | 83 | def __init__(self, 84 | average_by_discriminators=True, 85 | loss_type="mse", ): 86 | super().__init__() 87 | self.average_by_discriminators = average_by_discriminators 88 | assert loss_type in ["mse", "hinge"], f"{loss_type} is not supported." 89 | if loss_type == "mse": 90 | self.fake_criterion = self._mse_fake_loss 91 | self.real_criterion = self._mse_real_loss 92 | else: 93 | self.fake_criterion = self._hinge_fake_loss 94 | self.real_criterion = self._hinge_real_loss 95 | 96 | def forward(self, outputs_hat, outputs): 97 | """ 98 | Calculate discriminator adversarial loss. 99 | 100 | Args: 101 | outputs_hat (Tensor or list): Discriminator outputs or list of 102 | discriminator outputs calculated from generator outputs. 103 | outputs (Tensor or list): Discriminator outputs or list of 104 | discriminator outputs calculated from groundtruth. 105 | 106 | Returns: 107 | Tensor: Sum of the discriminator real loss and 108 | the discriminator fake loss. 109 | """ 110 | if isinstance(outputs, (tuple, list)): 111 | real_loss = 0.0 112 | fake_loss = 0.0 113 | for i, (outputs_hat_, outputs_) in enumerate(zip(outputs_hat, outputs)): 114 | if isinstance(outputs_hat_, (tuple, list)): 115 | outputs_hat_ = outputs_hat_[-1] 116 | outputs_ = outputs_[-1] 117 | real_loss = real_loss + self.real_criterion(outputs_) 118 | fake_loss = fake_loss + self.fake_criterion(outputs_hat_) 119 | if self.average_by_discriminators: 120 | fake_loss /= i + 1 121 | real_loss /= i + 1 122 | else: 123 | real_loss = self.real_criterion(outputs) 124 | fake_loss = self.fake_criterion(outputs_hat) 125 | 126 | return real_loss + fake_loss 127 | 128 | def _mse_real_loss(self, x): 129 | return F.mse_loss(x, x.new_ones(x.size())) 130 | 131 | def _mse_fake_loss(self, x): 132 | return F.mse_loss(x, x.new_zeros(x.size())) 133 | 134 | def _hinge_real_loss(self, x): 135 | return -torch.mean(torch.min(x - 1, x.new_zeros(x.size()))) 136 | 137 | def _hinge_fake_loss(self, x): 138 | return -torch.mean(torch.min(-x - 1, x.new_zeros(x.size()))) 139 | -------------------------------------------------------------------------------- /Preprocessing/multilinguality/generate_zero_shot_lang_embs.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import torch 8 | from huggingface_hub import hf_hub_download 9 | from tqdm import tqdm 10 | 11 | from Utility.storage_config import MODEL_DIR 12 | 13 | 14 | def approximate_and_inject_language_embeddings(model_path, df, iso_lookup, min_n_langs=5, max_n_langs=25, threshold_percentile=50): 15 | # load pretrained language_embeddings 16 | model = torch.load(model_path, map_location="cpu") 17 | lang_embs = model["model"]["encoder.language_embedding.weight"] 18 | 19 | features_per_closest_lang = 2 20 | # for combined, df has up to 5 features (if containing individual distances) per 
closest lang + 1 target lang column 21 | if "combined_dist_0" in df.columns: 22 | if "map_dist_0" in df.columns: 23 | features_per_closest_lang += 1 24 | if "asp_dist_0" in df.columns: 25 | features_per_closest_lang += 1 26 | if "tree_dist_0" in df.columns: 27 | features_per_closest_lang += 1 28 | n_closest = len(df.columns) // features_per_closest_lang 29 | distance_type = "combined" 30 | # else, df has 2 features per closest lang + 1 target lang column 31 | else: 32 | n_closest = len(df.columns) // features_per_closest_lang 33 | if "map_dist_0" in df.columns: 34 | distance_type = "map" 35 | elif "tree_dist_0" in df.columns: 36 | distance_type = "tree" 37 | elif "asp_dist_0" in df.columns: 38 | distance_type = "asp" 39 | elif "learned_dist_0" in df.columns: 40 | distance_type = "learned" 41 | else: 42 | distance_type = "random" 43 | 44 | # get relevant columns 45 | closest_lang_columns = [f"closest_lang_{i}" for i in range(n_closest)] 46 | closest_dist_columns = [f"{distance_type}_dist_{i}" for i in range(n_closest)] 47 | closest_lang_columns = closest_lang_columns[:max_n_langs] 48 | closest_dist_columns = closest_dist_columns[:max_n_langs] 49 | assert df[closest_dist_columns[-1]].isna().sum().sum() == 0 50 | 51 | # get threshold based on distance of a certain percentile of the furthest language across all samples 52 | threshold = np.percentile(df[closest_dist_columns[-1]], threshold_percentile) 53 | print(f"threshold: {threshold:.4f}") 54 | for row in tqdm(df.itertuples(), total=df.shape[0], desc="Approximating language embeddings"): 55 | avg_emb = torch.zeros([32]) # If you change the size of the language embedding in the model, you need to change the size here as well. 56 | dists = [getattr(row, d) for i, d in enumerate(closest_dist_columns) if i < min_n_langs or getattr(row, d) < threshold] 57 | langs = [getattr(row, l) for l in closest_lang_columns[:len(dists)]] 58 | 59 | for lang in langs: 60 | lang_emb = lang_embs[iso_lookup[-1][str(lang)]] 61 | avg_emb += lang_emb 62 | avg_emb /= len(langs) # normalize 63 | lang_embs[iso_lookup[-1][str(row.target_lang)]] = avg_emb 64 | 65 | # inject language embeddings into Toucan model and save 66 | model["model"]["encoder.language_embedding.weight"] = lang_embs 67 | modified_model_path = model_path.split(".")[0] + "_zeroshot_lang_embs.pt" 68 | torch.save(model, modified_model_path) 69 | print(f"Replaced unsupervised language embeddings with zero-shot approximations.\nSaved modified model to {modified_model_path}") 70 | 71 | 72 | if __name__ == "__main__": 73 | default_model_path = os.path.join(MODEL_DIR, "ToucanTTS_Meta", "best.pt") # MODEL_DIR must be absolute path, the relative path will fail at this location 74 | default_csv_path = "distance_datasets/dataset_learned_top30.csv" 75 | parser = argparse.ArgumentParser() 76 | parser.add_argument("--model_path", type=str, default=default_model_path, help="path of the model for which the language embeddings should be modified") 77 | parser.add_argument("--dataset_path", type=str, default=default_csv_path, help="path to distance dataset CSV") 78 | parser.add_argument("--min_n_langs", type=int, default=5, help="minimum amount of languages used for averaging") 79 | parser.add_argument("--max_n_langs", type=int, default=25, help="maximum amount of languages used for averaging") 80 | parser.add_argument("--threshold_percentile", type=int, default=50, help="percentile of the furthest used languages \ 81 | used as cutoff threshold (no langs >= the threshold are used for averaging)") 82 | args = 
parser.parse_args() 83 | ISO_LOOKUP_PATH = hf_hub_download(cache_dir=MODEL_DIR, repo_id="Flux9665/ToucanTTS", filename="iso_lookup.json") 84 | with open(ISO_LOOKUP_PATH, "r") as f: 85 | iso_lookup = json.load(f) # iso_lookup[-1] = iso2id mapping 86 | # load language distance dataset 87 | distance_df = pd.read_csv(args.dataset_path, sep="|") 88 | approximate_and_inject_language_embeddings(model_path=args.model_path, 89 | df=distance_df, 90 | iso_lookup=iso_lookup, 91 | min_n_langs=args.min_n_langs, 92 | max_n_langs=args.max_n_langs, 93 | threshold_percentile=args.threshold_percentile) 94 | -------------------------------------------------------------------------------- /Utility/diverse_losses.py: -------------------------------------------------------------------------------- 1 | # adapted from https://github.com/facebookresearch/barlowtwins 2 | 3 | from math import exp 4 | 5 | import torch 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | 9 | 10 | class RedundancyReduction(torch.nn.Module): 11 | 12 | def __init__(self, lambd=1e-5, vector_dimensions=256): 13 | super().__init__() 14 | self.lambd = lambd 15 | self.bn = torch.nn.BatchNorm1d(vector_dimensions, affine=False) 16 | 17 | def forward(self, z1, z2): 18 | c = self.bn(z1).T @ self.bn(z2) 19 | c.div_(z1.size(0)) 20 | off_diag = off_diagonal(c).pow_(2).sum() 21 | return self.lambd * off_diag 22 | 23 | 24 | class BarlowTwinsLoss(torch.nn.Module): 25 | 26 | def __init__(self, lambd=1e-5, vector_dimensions=256): 27 | super().__init__() 28 | self.lambd = lambd 29 | self.bn = torch.nn.BatchNorm1d(vector_dimensions, affine=False) 30 | 31 | def forward(self, z1, z2): 32 | c = self.bn(z1).T @ self.bn(z2) 33 | c.div_(z1.size(0)) 34 | on_diag = torch.diagonal(c).add_(-1).pow_(2).sum() 35 | off_diag = off_diagonal(c).pow_(2).sum() 36 | loss = on_diag + self.lambd * off_diag 37 | return loss 38 | 39 | 40 | def off_diagonal(x): 41 | # return a flattened view of the off-diagonal elements of a square matrix 42 | n, m = x.shape 43 | assert n == m 44 | return x.flatten()[:-1].view(n - 1, n + 1)[:, 1:].flatten() 45 | 46 | 47 | class TripletLoss(torch.nn.Module): 48 | 49 | def __init__(self, margin): 50 | super().__init__() 51 | self.cosine_similarity = torch.nn.CosineSimilarity() 52 | self.margin = margin 53 | 54 | def forward(self, 55 | anchor_embeddings, 56 | positive_embeddings, 57 | negative_embeddings): 58 | positive_distance = 1 - self.cosine_similarity(anchor_embeddings, positive_embeddings) 59 | negative_distance = 1 - self.cosine_similarity(anchor_embeddings, negative_embeddings) 60 | 61 | losses = torch.max(positive_distance - negative_distance + self.margin, 62 | torch.full_like(positive_distance, 0)) 63 | return torch.mean(losses) 64 | 65 | 66 | # The following is taken from https://github.com/NATSpeech/NATSpeech/blob/aef3aa8899c82e40a28e4f59d559b46b18ba87e8/utils/metrics/ssim.py 67 | 68 | def gaussian(window_size, sigma): 69 | gauss = torch.Tensor([exp(-(x - window_size // 2) ** 2 / float(2 * sigma ** 2)) for x in range(window_size)]) 70 | return gauss / gauss.sum() 71 | 72 | 73 | def create_window(window_size, channel): 74 | _1D_window = gaussian(window_size, 1.5).unsqueeze(1) 75 | _2D_window = _1D_window.mm(_1D_window.t()).float().unsqueeze(0).unsqueeze(0) 76 | window = Variable(_2D_window.expand(channel, 1, window_size, window_size).contiguous()) 77 | return window 78 | 79 | 80 | def _ssim(img1, img2, window, window_size, channel, size_average=True): 81 | mu1 = F.conv2d(img1, window, padding=window_size // 2, 
groups=channel) 82 | mu2 = F.conv2d(img2, window, padding=window_size // 2, groups=channel) 83 | 84 | mu1_sq = mu1.pow(2) 85 | mu2_sq = mu2.pow(2) 86 | mu1_mu2 = mu1 * mu2 87 | 88 | sigma1_sq = F.conv2d(img1 * img1, window, padding=window_size // 2, groups=channel) - mu1_sq 89 | sigma2_sq = F.conv2d(img2 * img2, window, padding=window_size // 2, groups=channel) - mu2_sq 90 | sigma12 = F.conv2d(img1 * img2, window, padding=window_size // 2, groups=channel) - mu1_mu2 91 | 92 | C1 = 0.01 ** 2 93 | C2 = 0.03 ** 2 94 | 95 | ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / ((mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2)) 96 | 97 | if size_average: 98 | return ssim_map.mean() 99 | else: 100 | return ssim_map.mean(1) 101 | 102 | 103 | class SSIM(torch.nn.Module): 104 | """ 105 | Adapted from https://github.com/Po-Hsun-Su/pytorch-ssim 106 | """ 107 | 108 | def __init__(self, window_size=11, size_average=True): 109 | super(SSIM, self).__init__() 110 | self.window_size = window_size 111 | self.size_average = size_average 112 | self.channel = 1 113 | self.window = create_window(window_size, self.channel) 114 | 115 | def forward(self, img1, img2): 116 | (_, channel, _, _) = img1.size() 117 | 118 | if channel == self.channel and self.window.data.type() == img1.data.type(): 119 | window = self.window 120 | else: 121 | window = create_window(self.window_size, channel) 122 | 123 | if img1.is_cuda: 124 | window = window.cuda(img1.get_device()) 125 | window = window.type_as(img1) 126 | 127 | self.window = window 128 | self.channel = channel 129 | 130 | return _ssim(img1, img2, window, self.window_size, channel, self.size_average) 131 | 132 | 133 | window = None 134 | 135 | 136 | def ssim(img1, img2, window_size=11, size_average=True): 137 | (_, channel, _, _) = img1.size() 138 | global window 139 | if window is None: 140 | window = create_window(window_size, channel) 141 | if img1.is_cuda: 142 | window = window.cuda(img1.get_device()) 143 | window = window.type_as(img1) 144 | return _ssim(img1, img2, window, window_size, channel, size_average) 145 | -------------------------------------------------------------------------------- /Modules/ToucanTTS/wavenet.py: -------------------------------------------------------------------------------- 1 | """ 2 | MIT License 3 | 4 | Copyright (c) 2022 Yi Ren 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 
23 | """ 24 | 25 | import torch 26 | from torch import nn 27 | 28 | 29 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): 30 | n_channels_int = n_channels[0] 31 | in_act = input_a + input_b 32 | t_act = torch.tanh(in_act[:, :n_channels_int, :]) 33 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) 34 | acts = t_act * s_act 35 | return acts 36 | 37 | 38 | class WN(torch.nn.Module): 39 | 40 | def __init__(self, hidden_size, kernel_size, dilation_rate, n_layers, c_cond=0, 41 | p_dropout=0, share_cond_layers=False, is_BTC=False, use_weightnorm=True): 42 | super(WN, self).__init__() 43 | assert (kernel_size % 2 == 1) 44 | assert (hidden_size % 2 == 0) 45 | self.is_BTC = is_BTC 46 | self.hidden_size = hidden_size 47 | self.kernel_size = kernel_size 48 | self.dilation_rate = dilation_rate 49 | self.n_layers = n_layers 50 | self.gin_channels = c_cond 51 | self.p_dropout = p_dropout 52 | self.share_cond_layers = share_cond_layers 53 | 54 | self.in_layers = torch.nn.ModuleList() 55 | self.res_skip_layers = torch.nn.ModuleList() 56 | self.drop = nn.Dropout(p_dropout) 57 | 58 | if c_cond != 0 and not share_cond_layers: 59 | cond_layer = torch.nn.Conv1d(c_cond, 2 * hidden_size * n_layers, 1) 60 | if use_weightnorm: 61 | self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight') 62 | else: 63 | self.cond_layer = cond_layer 64 | 65 | for i in range(n_layers): 66 | dilation = dilation_rate ** i 67 | padding = int((kernel_size * dilation - dilation) / 2) 68 | in_layer = torch.nn.Conv1d(hidden_size, 2 * hidden_size, kernel_size, 69 | dilation=dilation, padding=padding) 70 | if use_weightnorm: 71 | in_layer = torch.nn.utils.weight_norm(in_layer, name='weight') 72 | self.in_layers.append(in_layer) 73 | 74 | # last one is not necessary 75 | if i < n_layers - 1: 76 | res_skip_channels = 2 * hidden_size 77 | else: 78 | res_skip_channels = hidden_size 79 | 80 | res_skip_layer = torch.nn.Conv1d(hidden_size, res_skip_channels, 1) 81 | if use_weightnorm: 82 | res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight') 83 | self.res_skip_layers.append(res_skip_layer) 84 | 85 | def forward(self, x, nonpadding=None, cond=None): 86 | if self.is_BTC: 87 | x = x.transpose(1, 2) 88 | cond = cond.transpose(1, 2) if cond is not None else None 89 | nonpadding = nonpadding.transpose(1, 2) if nonpadding is not None else None 90 | if nonpadding is None: 91 | nonpadding = 1 92 | output = torch.zeros_like(x) 93 | n_channels_tensor = torch.IntTensor([self.hidden_size]) 94 | 95 | if cond is not None and not self.share_cond_layers: 96 | cond = self.cond_layer(cond) 97 | 98 | for i in range(self.n_layers): 99 | x_in = self.in_layers[i](x) 100 | x_in = self.drop(x_in) 101 | if cond is not None: 102 | cond_offset = i * 2 * self.hidden_size 103 | cond_l = cond[:, cond_offset:cond_offset + 2 * self.hidden_size, :] 104 | else: 105 | cond_l = torch.zeros_like(x_in) 106 | 107 | acts = fused_add_tanh_sigmoid_multiply(x_in, cond_l, n_channels_tensor) 108 | 109 | res_skip_acts = self.res_skip_layers[i](acts) 110 | if i < self.n_layers - 1: 111 | x = (x + res_skip_acts[:, :self.hidden_size, :]) * nonpadding 112 | output = output + res_skip_acts[:, self.hidden_size:, :] 113 | else: 114 | output = output + res_skip_acts 115 | output = output * nonpadding 116 | if self.is_BTC: 117 | output = output.transpose(1, 2) 118 | return output 119 | 120 | def remove_weight_norm(self): 121 | def remove_weight_norm(m): 122 | try: 123 | nn.utils.remove_weight_norm(m) 124 | except ValueError: # this module didn't 
have weight norm 125 | return 126 | 127 | self.apply(remove_weight_norm) 128 | -------------------------------------------------------------------------------- /Modules/Vocoder/MelSpecLoss.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Tomoki Hayashi 2 | # MIT License (https://opensource.org/licenses/MIT) 3 | # Adapted by Florian Lux 2021 4 | 5 | 6 | import librosa 7 | import torch 8 | import torch.nn.functional as F 9 | 10 | 11 | class MelSpectrogram(torch.nn.Module): 12 | 13 | def __init__(self, 14 | fs=24000, 15 | fft_size=1536, 16 | hop_size=384, 17 | win_length=None, 18 | window="hann", 19 | num_mels=100, 20 | fmin=60, 21 | fmax=None, 22 | center=True, 23 | normalized=False, 24 | onesided=True, 25 | eps=1e-10, 26 | log_base=10.0, ): 27 | super().__init__() 28 | self.fft_size = fft_size 29 | if win_length is None: 30 | self.win_length = fft_size 31 | else: 32 | self.win_length = win_length 33 | self.hop_size = hop_size 34 | self.center = center 35 | self.normalized = normalized 36 | self.onesided = onesided 37 | if window is not None and not hasattr(torch, f"{window}_window"): 38 | raise ValueError(f"{window} window is not implemented") 39 | self.window = window 40 | self.eps = eps 41 | 42 | fmin = 0 if fmin is None else fmin 43 | fmax = fs / 2 if fmax is None else fmax 44 | melmat = librosa.filters.mel(sr=fs, 45 | n_fft=fft_size, 46 | n_mels=num_mels, 47 | fmin=fmin, 48 | fmax=fmax, ) 49 | self.register_buffer("melmat", torch.from_numpy(melmat.T).float()) 50 | self.stft_params = { 51 | "n_fft" : self.fft_size, 52 | "win_length": self.win_length, 53 | "hop_length": self.hop_size, 54 | "center" : self.center, 55 | "normalized": self.normalized, 56 | "onesided" : self.onesided, 57 | } 58 | self.stft_params["return_complex"] = False 59 | 60 | self.log_base = log_base 61 | if self.log_base is None: 62 | self.log = torch.log 63 | elif self.log_base == 2.0: 64 | self.log = torch.log2 65 | elif self.log_base == 10.0: 66 | self.log = torch.log10 67 | else: 68 | raise ValueError(f"log_base: {log_base} is not supported.") 69 | 70 | def forward(self, x): 71 | """ 72 | Calculate Mel-spectrogram. 73 | 74 | Args: 75 | x (Tensor): Input waveform tensor (B, T) or (B, 1, T). 76 | 77 | Returns: 78 | Tensor: Mel-spectrogram (B, #mels, #frames). 
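            Note:
                The amplitude spectrogram obtained via torch.stft is projected through the mel
                filterbank registered in __init__, clamped to eps, and returned in the configured
                log base (log10 by default).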
79 | """ 80 | if x.dim() == 3: 81 | # (B, C, T) -> (B*C, T) 82 | x = x.reshape(-1, x.size(2)) 83 | 84 | if self.window is not None: 85 | window_func = getattr(torch, f"{self.window}_window") 86 | window = window_func(self.win_length, dtype=x.dtype, device=x.device) 87 | else: 88 | window = None 89 | 90 | x_stft = torch.stft(x, window=window, **self.stft_params) 91 | # (B, #freqs, #frames, 2) -> (B, $frames, #freqs, 2) 92 | x_stft = x_stft.transpose(1, 2) 93 | x_power = x_stft[..., 0] ** 2 + x_stft[..., 1] ** 2 94 | x_amp = torch.sqrt(torch.clamp(x_power, min=self.eps)) 95 | 96 | x_mel = torch.matmul(x_amp, self.melmat) 97 | x_mel = torch.clamp(x_mel, min=self.eps) 98 | 99 | return self.log(x_mel).transpose(1, 2) 100 | 101 | 102 | class MelSpectrogramLoss(torch.nn.Module): 103 | 104 | def __init__(self, 105 | fs=24000, 106 | fft_size=1024, 107 | hop_size=256, 108 | win_length=None, 109 | window="hann", 110 | num_mels=128, 111 | fmin=20, 112 | fmax=None, 113 | center=True, 114 | normalized=False, 115 | onesided=True, 116 | eps=1e-10, 117 | log_base=10.0, ): 118 | super().__init__() 119 | self.mel_spectrogram = MelSpectrogram(fs=fs, 120 | fft_size=fft_size, 121 | hop_size=hop_size, 122 | win_length=win_length, 123 | window=window, 124 | num_mels=num_mels, 125 | fmin=fmin, 126 | fmax=fmax, 127 | center=center, 128 | normalized=normalized, 129 | onesided=onesided, 130 | eps=eps, 131 | log_base=log_base, ) 132 | 133 | def forward(self, y_hat, y): 134 | """ 135 | Calculate Mel-spectrogram loss. 136 | 137 | Args: 138 | y_hat (Tensor): Generated single tensor (B, 1, T). 139 | y (Tensor): Groundtruth single tensor (B, 1, T). 140 | 141 | Returns: 142 | Tensor: Mel-spectrogram loss value. 143 | """ 144 | mel_hat = self.mel_spectrogram(y_hat) 145 | mel = self.mel_spectrogram(y) 146 | mel_loss = F.l1_loss(mel_hat, mel) 147 | 148 | return mel_loss 149 | -------------------------------------------------------------------------------- /run_simple_GUI_demo.py: -------------------------------------------------------------------------------- 1 | import gradio as gr 2 | import torch.cuda 3 | from huggingface_hub import hf_hub_download 4 | 5 | from InferenceInterfaces.ControllableInterface import ControllableInterface 6 | from Utility.storage_config import MODEL_DIR 7 | from Utility.utils import float2pcm 8 | from Utility.utils import load_json_from_path 9 | 10 | 11 | class TTSWebUI: 12 | 13 | def __init__(self, 14 | gpu_id="cpu", 15 | title="Controllable Text-to-Speech for over 7000 Languages", 16 | article="", 17 | tts_model_path=None, 18 | vocoder_model_path=None, 19 | embedding_gan_path=None, 20 | available_artificial_voices=10 # be careful with this, if you want too many, it might lead to an endless loop 21 | ): 22 | path_to_iso_list = hf_hub_download(cache_dir=MODEL_DIR, repo_id="Flux9665/ToucanTTS", filename="iso_to_fullname.json") 23 | iso_to_name = load_json_from_path(path_to_iso_list) 24 | text_selection = [f"{iso_to_name[iso_code]} ({iso_code})" for iso_code in iso_to_name] 25 | # accent_selection = [f"{iso_to_name[iso_code]} Accent ({iso_code})" for iso_code in iso_to_name] 26 | 27 | self.controllable_ui = ControllableInterface(gpu_id=gpu_id, 28 | available_artificial_voices=available_artificial_voices, 29 | tts_model_path=tts_model_path, 30 | vocoder_model_path=vocoder_model_path, 31 | embedding_gan_path=embedding_gan_path) 32 | self.iface = gr.Interface(fn=self.read, 33 | inputs=[gr.Textbox(lines=2, 34 | placeholder="write what you want the synthesis to read here...", 35 | value="What I cannot 
create, I do not understand.", 36 | label="Text input"), 37 | gr.Dropdown(text_selection, 38 | type="value", 39 | value='English (eng)', 40 | label="Select the Language of the Text (type on your keyboard to find it quickly)"), 41 | gr.Slider(minimum=0.0, maximum=0.8, step=0.1, value=0.5, label="Prosody Creativity"), 42 | gr.Slider(minimum=0.7, maximum=1.3, step=0.1, value=1.0, label="Faster - Slower"), 43 | gr.Slider(minimum=0, maximum=available_artificial_voices, step=1, value=5, label="Random Seed for the artificial Voice"), 44 | gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, value=0.0, label="Gender of artificial Voice"), 45 | gr.Audio(type="filepath", show_label=True, container=True, label="[OPTIONAL] Voice to Clone (if left empty, will use an artificial voice instead)"), 46 | # gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Pitch Variance Scale"), 47 | # gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Energy Variance Scale"), 48 | # gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, value=0.0, label="Voice Depth") 49 | ], 50 | outputs=[gr.Audio(type="numpy", label="Speech"), 51 | gr.Image(label="Visualization")], 52 | title=title, 53 | allow_flagging="never", 54 | article=article, 55 | theme=gr.themes.Ocean(primary_hue="amber", secondary_hue="orange")) 56 | self.iface.launch() 57 | 58 | def read(self, 59 | prompt, 60 | language, 61 | prosody_creativity, 62 | duration_scaling_factor, 63 | voice_seed, 64 | emb1, 65 | reference_audio, 66 | # pitch_variance_scale, 67 | # energy_variance_scale, 68 | # emb2 69 | ): 70 | sr, wav, fig = self.controllable_ui.read(prompt, 71 | reference_audio, 72 | language.split(" ")[-1].split("(")[1].split(")")[0], 73 | language.split(" ")[-1].split("(")[1].split(")")[0], 74 | voice_seed, 75 | prosody_creativity, 76 | duration_scaling_factor, 77 | 1., 78 | 1.0, 79 | 1.0, 80 | emb1, 81 | 0., 82 | 0., 83 | 0., 84 | 0., 85 | 0., 86 | -24.) 
87 | return (sr, float2pcm(wav)), fig 88 | 89 | 90 | if __name__ == '__main__': 91 | TTSWebUI(gpu_id="cuda" if torch.cuda.is_available() else "cpu") 92 | -------------------------------------------------------------------------------- /Modules/ToucanTTS/flow_matching.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copied from https://github.com/KdaiP/StableTTS by https://github.com/KdaiP 3 | 4 | https://github.com/KdaiP/StableTTS/blob/eebb177ebf195fd1246dedabec4ef69d9351a4f8/models/flow_matching.py 5 | 6 | Code is under MIT License 7 | """ 8 | 9 | import imageio 10 | import torch 11 | import torch.nn.functional as F 12 | 13 | from Modules.ToucanTTS.dit_wrapper import Decoder 14 | from Utility.utils import plot_spec_tensor 15 | 16 | 17 | # copied from https://github.com/jaywalnut310/vits/blob/main/commons.py#L121 18 | def sequence_mask(length: torch.Tensor, max_length: int = None) -> torch.Tensor: 19 | if max_length is None: 20 | max_length = length.max() 21 | x = torch.arange(max_length, dtype=length.dtype, device=length.device) 22 | return x.unsqueeze(0) < length.unsqueeze(1) 23 | 24 | 25 | # modified from https://github.com/shivammehta25/Matcha-TTS/blob/main/matcha/models/components/flow_matching.py 26 | class CFMDecoder(torch.nn.Module): 27 | def __init__(self, hidden_channels, out_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, gin_channels): 28 | super().__init__() 29 | self.hidden_channels = hidden_channels 30 | self.out_channels = out_channels 31 | self.filter_channels = filter_channels 32 | self.gin_channels = gin_channels 33 | self.sigma_min = 1e-4 34 | 35 | self.estimator = Decoder(hidden_channels, out_channels, filter_channels, p_dropout, n_layers, n_heads, kernel_size, gin_channels) 36 | 37 | @torch.inference_mode() 38 | def forward(self, mu, mask, n_timesteps, temperature=1.0, c=None): 39 | """Forward diffusion 40 | 41 | Args: 42 | mu (torch.Tensor): output of encoder 43 | shape: (batch_size, n_feats, mel_timesteps) 44 | mask (torch.Tensor): output_mask 45 | shape: (batch_size, 1, mel_timesteps) 46 | n_timesteps (int): number of diffusion steps 47 | temperature (float, optional): temperature for scaling noise. Defaults to 1.0. 48 | c (torch.Tensor, optional): shape: (batch_size, gin_channels) 49 | 50 | Returns: 51 | sample: generated mel-spectrogram 52 | shape: (batch_size, n_feats, mel_timesteps) 53 | """ 54 | size = list(mu.size()) 55 | size[1] = self.out_channels 56 | z = torch.randn(size=size).to(mu.device) * temperature 57 | t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device) 58 | return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, c=c) 59 | 60 | def solve_euler(self, x, t_span, mu, mask, c, plot_solutions=False): 61 | """ 62 | Fixed euler solver for ODEs. 63 | Args: 64 | x (torch.Tensor): random noise 65 | t_span (torch.Tensor): n_timesteps interpolated 66 | shape: (n_timesteps + 1,) 67 | mu (torch.Tensor): output of encoder 68 | shape: (batch_size, n_feats, mel_timesteps) 69 | mask (torch.Tensor): output_mask 70 | shape: (batch_size, 1, mel_timesteps) 71 | c (torch.Tensor, optional): speaker condition. 
72 | shape: (batch_size, gin_channels) 73 | """ 74 | t, _, dt = t_span[0], t_span[-1], t_span[1] - t_span[0] 75 | 76 | sol = [] 77 | 78 | for step in range(1, len(t_span)): 79 | 80 | dphi_dt = self.estimator(x, mask, mu, t, c) 81 | 82 | x = x + dt * dphi_dt 83 | t = t + dt 84 | sol.append(x) 85 | if step < len(t_span) - 1: 86 | dt = t_span[step + 1] - t 87 | 88 | if plot_solutions: 89 | create_plot_of_all_solutions(sol) 90 | 91 | return sol[-1] 92 | 93 | def compute_loss(self, x1, mask, mu, c): 94 | """Computes diffusion loss 95 | 96 | Args: 97 | x1 (torch.Tensor): Target 98 | shape: (batch_size, n_feats, mel_timesteps) 99 | mask (torch.Tensor): target mask 100 | shape: (batch_size, 1, mel_timesteps) 101 | mu (torch.Tensor): output of encoder 102 | shape: (batch_size, n_feats, mel_timesteps) 103 | c (torch.Tensor, optional): speaker condition. 104 | 105 | Returns: 106 | loss: conditional flow matching loss 107 | y: conditional flow 108 | shape: (batch_size, n_feats, mel_timesteps) 109 | """ 110 | b, _, t = mu.shape 111 | 112 | # random timestep 113 | t = torch.rand([b, 1, 1], device=mu.device, dtype=mu.dtype) 114 | # sample noise p(x_0) 115 | z = torch.randn_like(x1) 116 | 117 | y = (1 - (1 - self.sigma_min) * t) * z + t * x1 118 | u = x1 - (1 - self.sigma_min) * z 119 | 120 | loss = F.mse_loss(self.estimator(y, mask, mu, t.squeeze(), c), 121 | u, 122 | reduction="sum") / (torch.sum(mask) * u.shape[1]) 123 | return loss, y 124 | 125 | 126 | def create_plot_of_all_solutions(sol, fps=8): 127 | gif_collector = list() 128 | for step_index, solution in enumerate(sol): 129 | unbatched_solution = solution[0] # remove the batch axis (if there are more than one element in the batch, we only take the first) 130 | plot_spec_tensor(unbatched_solution, "tmp", step_index, title=step_index + 1) 131 | gif_collector.append(imageio.v2.imread(f"tmp/{step_index}.png")) 132 | for _ in range(fps * 2): 133 | gif_collector.append(gif_collector[-1]) # freeze-frame on the final one for two seconds 134 | imageio.mimsave("tmp/animation.gif", gif_collector, fps=fps, loop=0) 135 | -------------------------------------------------------------------------------- /run_training_pipeline.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import random 4 | import sys 5 | 6 | import torch 7 | 8 | from Recipes.AlignerPipeline import run as aligner 9 | from Recipes.BigVGAN_e2e import run as be2e 10 | from Recipes.HiFiGAN_combined import run as HiFiGAN 11 | from Recipes.HiFiGAN_e2e import run as e2e 12 | from Recipes.ToucanTTS_IntegrationTest import run as tt_integration_test 13 | from Recipes.ToucanTTS_Massive_Asian import run as asian 14 | from Recipes.ToucanTTS_Massive_English_stage1 import run as eng1 15 | from Recipes.ToucanTTS_Massive_English_stage2 import run as eng2 16 | from Recipes.ToucanTTS_Massive_German import run as deu 17 | from Recipes.ToucanTTS_Massive_stage1 import run as stage1 18 | from Recipes.ToucanTTS_Massive_stage2 import run as stage2 19 | from Recipes.ToucanTTS_Massive_stage3 import run as stage3 20 | from Recipes.ToucanTTS_Nancy import run as nancy 21 | from Recipes.finetuning_example_multilingual import run as fine_tuning_example_multilingual 22 | from Recipes.finetuning_example_simple import run as fine_tuning_example_simple 23 | 24 | pipeline_dict = { 25 | # the finetuning examples 26 | "finetuning_example_simple" : fine_tuning_example_simple, 27 | "finetuning_example_multilingual": fine_tuning_example_multilingual, 28 | # 
integration test 29 | "tt_it" : tt_integration_test, 30 | # regular ToucanTTS pipelines 31 | "nancy" : nancy, 32 | "eng1" : eng1, 33 | "eng2" : eng2, 34 | "deu" : deu, 35 | "asian": asian, 36 | "stage1" : stage1, 37 | "stage2" : stage2, 38 | "stage3" : stage3, 39 | # training the aligner from scratch (not recommended, best to use provided checkpoint) 40 | "aligner" : aligner, 41 | # vocoder training (not recommended, best to use provided checkpoint) 42 | "hifigan" : HiFiGAN, 43 | "e2e" : e2e, 44 | "be2e" : be2e 45 | } 46 | 47 | if __name__ == '__main__': 48 | 49 | parser = argparse.ArgumentParser(description='Training with the IMS Toucan Speech Synthesis Toolkit') 50 | 51 | parser.add_argument('pipeline', 52 | choices=list(pipeline_dict.keys()), 53 | help="Select pipeline to train.") 54 | 55 | parser.add_argument('--gpu_id', 56 | type=str, 57 | help="Which GPU(s) to run on. If not specified runs on CPU, but other than for integration tests that doesn't make much sense.", 58 | default="cpu") 59 | 60 | parser.add_argument('--resume_checkpoint', 61 | type=str, 62 | help="Path to checkpoint to resume from.", 63 | default=None) 64 | 65 | parser.add_argument('--resume', 66 | action="store_true", 67 | help="Automatically load the highest checkpoint and continue from there.", 68 | default=False) 69 | 70 | parser.add_argument('--finetune', 71 | action="store_true", 72 | help="Whether to fine-tune from the specified checkpoint.", 73 | default=False) 74 | 75 | parser.add_argument('--model_save_dir', 76 | type=str, 77 | help="Directory where the checkpoints should be saved to.", 78 | default=None) 79 | 80 | parser.add_argument('--wandb', 81 | action="store_true", 82 | help="Whether to use weights and biases to track training runs. Requires you to run wandb login and place your auth key before.", 83 | default=False) 84 | 85 | parser.add_argument('--wandb_resume_id', 86 | type=str, 87 | help="ID of a stopped wandb run to continue tracking", 88 | default=None) 89 | 90 | args = parser.parse_args() 91 | 92 | if args.finetune and args.resume_checkpoint is None and not args.resume: 93 | print("Need to provide path to checkpoint to fine-tune from!") 94 | sys.exit() 95 | 96 | if args.gpu_id == "cpu": 97 | os.environ["CUDA_VISIBLE_DEVICES"] = "" 98 | device = torch.device("cpu") 99 | print(f"No GPU specified, using CPU. 
Training will likely not work without GPU.") 100 | gpu_count = 1 # for technical reasons this is set to one, indicating it's not multi-GPU training, even though there is no GPU in this case 101 | else: 102 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 103 | os.environ["CUDA_VISIBLE_DEVICES"] = f"{args.gpu_id}" 104 | device = torch.device("cuda") 105 | print(f"Making GPU {os.environ['CUDA_VISIBLE_DEVICES']} the only visible device(s).") 106 | gpu_count = len(args.gpu_id.replace(",", " ").split()) 107 | # example call for multi-GPU training: 108 | # torchrun --standalone --nproc_per_node=4 --nnodes=1 run_training_pipeline.py nancy --gpu_id "1,2,3" 109 | 110 | torch.manual_seed(9665) 111 | random.seed(9665) 112 | torch.random.manual_seed(9665) 113 | 114 | torch.multiprocessing.set_sharing_strategy('file_system') 115 | 116 | pipeline_dict[args.pipeline](gpu_id=args.gpu_id, 117 | resume_checkpoint=args.resume_checkpoint, 118 | resume=args.resume, 119 | finetune=args.finetune, 120 | model_dir=args.model_save_dir, 121 | use_wandb=args.wandb, 122 | wandb_resume_id=args.wandb_resume_id, 123 | gpu_count=gpu_count) 124 | -------------------------------------------------------------------------------- /Modules/EmbeddingModel/StyleTTSEncoder.py: -------------------------------------------------------------------------------- 1 | """ 2 | MIT Licensed Code 3 | 4 | Copyright (c) 2022 Aaron (Yinghao) Li 5 | 6 | https://github.com/yl4579/StyleTTS/blob/main/models.py 7 | """ 8 | 9 | import math 10 | 11 | import torch 12 | import torch.nn.functional as F 13 | from torch import nn 14 | from torch.nn.utils import spectral_norm 15 | 16 | 17 | class StyleEncoder(nn.Module): 18 | def __init__(self, dim_in=128, style_dim=64, max_conv_dim=384): 19 | super().__init__() 20 | blocks = [] 21 | blocks += [spectral_norm(nn.Conv2d(1, dim_in, 3, 1, 1))] 22 | 23 | repeat_num = 4 24 | for _ in range(repeat_num): 25 | dim_out = min(dim_in * 2, max_conv_dim) 26 | blocks += [ResBlk(dim_in, dim_out, downsample='half')] 27 | dim_in = dim_out 28 | 29 | blocks += [nn.LeakyReLU(0.2)] 30 | blocks += [spectral_norm(nn.Conv2d(dim_out, dim_out, 5, 1, 0))] 31 | blocks += [nn.AdaptiveAvgPool2d(1)] 32 | blocks += [nn.LeakyReLU(0.2)] 33 | self.shared = nn.Sequential(*blocks) 34 | 35 | self.unshared = nn.Linear(dim_out, style_dim) 36 | 37 | def forward(self, speech): 38 | h = self.shared(speech.unsqueeze(1)) 39 | h = h.view(h.size(0), -1) 40 | s = self.unshared(h) 41 | 42 | return s 43 | 44 | 45 | class ResBlk(nn.Module): 46 | def __init__(self, dim_in, dim_out, actv=nn.LeakyReLU(0.2), 47 | normalize=False, downsample='none'): 48 | super().__init__() 49 | self.actv = actv 50 | self.normalize = normalize 51 | self.downsample = DownSample(downsample) 52 | self.downsample_res = LearnedDownSample(downsample, dim_in) 53 | self.learned_sc = dim_in != dim_out 54 | self._build_weights(dim_in, dim_out) 55 | 56 | def _build_weights(self, dim_in, dim_out): 57 | self.conv1 = spectral_norm(nn.Conv2d(dim_in, dim_in, 3, 1, 1)) 58 | self.conv2 = spectral_norm(nn.Conv2d(dim_in, dim_out, 3, 1, 1)) 59 | if self.normalize: 60 | self.norm1 = nn.InstanceNorm2d(dim_in, affine=True) 61 | self.norm2 = nn.InstanceNorm2d(dim_in, affine=True) 62 | if self.learned_sc: 63 | self.conv1x1 = spectral_norm(nn.Conv2d(dim_in, dim_out, 1, 1, 0, bias=False)) 64 | 65 | def _shortcut(self, x): 66 | if self.learned_sc: 67 | x = self.conv1x1(x) 68 | if self.downsample: 69 | x = self.downsample(x) 70 | return x 71 | 72 | def _residual(self, x): 73 | if self.normalize: 74 | x =
self.norm1(x) 75 | x = self.actv(x) 76 | x = self.conv1(x) 77 | x = self.downsample_res(x) 78 | if self.normalize: 79 | x = self.norm2(x) 80 | x = self.actv(x) 81 | x = self.conv2(x) 82 | return x 83 | 84 | def forward(self, x): 85 | x = self._shortcut(x) + self._residual(x) 86 | return x / math.sqrt(2) # unit variance 87 | 88 | 89 | class LearnedDownSample(nn.Module): 90 | def __init__(self, layer_type, dim_in): 91 | super().__init__() 92 | self.layer_type = layer_type 93 | 94 | if self.layer_type == 'none': 95 | self.conv = nn.Identity() 96 | elif self.layer_type == 'timepreserve': 97 | self.conv = spectral_norm(nn.Conv2d(dim_in, dim_in, kernel_size=(3, 1), stride=(2, 1), groups=dim_in, padding=(1, 0))) 98 | elif self.layer_type == 'half': 99 | self.conv = spectral_norm(nn.Conv2d(dim_in, dim_in, kernel_size=(3, 3), stride=(2, 2), groups=dim_in, padding=1)) 100 | else: 101 | raise RuntimeError('Got unexpected downsample type %s, expected one of [none, timepreserve, half]' % self.layer_type) 102 | 103 | def forward(self, x): 104 | return self.conv(x) 105 | 106 | 107 | class LearnedUpSample(nn.Module): 108 | def __init__(self, layer_type, dim_in): 109 | super().__init__() 110 | self.layer_type = layer_type 111 | 112 | if self.layer_type == 'none': 113 | self.conv = nn.Identity() 114 | elif self.layer_type == 'timepreserve': 115 | self.conv = nn.ConvTranspose2d(dim_in, dim_in, kernel_size=(3, 1), stride=(2, 1), groups=dim_in, output_padding=(1, 0), padding=(1, 0)) 116 | elif self.layer_type == 'half': 117 | self.conv = nn.ConvTranspose2d(dim_in, dim_in, kernel_size=(3, 3), stride=(2, 2), groups=dim_in, output_padding=1, padding=1) 118 | else: 119 | raise RuntimeError('Got unexpected upsample type %s, expected one of [none, timepreserve, half]' % self.layer_type) 120 | 121 | def forward(self, x): 122 | return self.conv(x) 123 | 124 | 125 | class DownSample(nn.Module): 126 | def __init__(self, layer_type): 127 | super().__init__() 128 | self.layer_type = layer_type 129 | 130 | def forward(self, x): 131 | if self.layer_type == 'none': 132 | return x 133 | elif self.layer_type == 'timepreserve': 134 | return F.avg_pool2d(x, (2, 1)) 135 | elif self.layer_type == 'half': 136 | if x.shape[-1] % 2 != 0: 137 | x = torch.cat([x, x[..., -1].unsqueeze(-1)], dim=-1) 138 | return F.avg_pool2d(x, 2) 139 | else: 140 | raise RuntimeError('Got unexpected downsample type %s, expected one of [none, timepreserve, half]' % self.layer_type) 141 | 142 | 143 | class UpSample(nn.Module): 144 | def __init__(self, layer_type): 145 | super().__init__() 146 | self.layer_type = layer_type 147 | 148 | def forward(self, x): 149 | if self.layer_type == 'none': 150 | return x 151 | elif self.layer_type == 'timepreserve': 152 | return F.interpolate(x, scale_factor=(2, 1), mode='nearest') 153 | elif self.layer_type == 'half': 154 | return F.interpolate(x, scale_factor=2, mode='nearest') 155 | else: 156 | raise RuntimeError('Got unexpected upsample type %s, expected one of [none, timepreserve, half]' % self.layer_type) 157 | -------------------------------------------------------------------------------- /run_text_to_file_reader.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | 5 | from InferenceInterfaces.ToucanTTSInterface import ToucanTTSInterface 6 | 7 | 8 | def read_texts(sentence, filename, model_id=None, device="cpu", language="eng", speaker_reference=None, duration_scaling_factor=1.0): 9 | tts = ToucanTTSInterface(device=device, tts_model_path=model_id) 10 |
tts.set_language(language) 11 | if speaker_reference is not None: 12 | tts.set_utterance_embedding(speaker_reference) 13 | if type(sentence) == str: 14 | sentence = [sentence] 15 | tts.read_to_file(text_list=sentence, file_location=filename, duration_scaling_factor=duration_scaling_factor, prosody_creativity=0.0) 16 | del tts 17 | 18 | 19 | def english_test(version, model_id=None, exec_device="cpu", speaker_reference=None): 20 | os.makedirs("audios", exist_ok=True) 21 | 22 | read_texts(model_id=model_id, 23 | sentence=["""Once upon a midnight dreary, while I pondered, weak, and weary, 24 | Over many a quaint, and curious volume, of forgotten lore, 25 | While I nodded, nearly napping, suddenly, there came a tapping, 26 | As of someone gently rapping, rapping at my chamber door."""], 27 | filename=f"audios/{model_id}_english_test_{version}.wav", 28 | device=exec_device, 29 | language="eng", 30 | speaker_reference=speaker_reference) 31 | 32 | 33 | def japanese_test(version, model_id=None, exec_device="cpu", speaker_reference=None): 34 | os.makedirs("audios", exist_ok=True) 35 | 36 | read_texts(model_id=model_id, 37 | sentence=["医師会がなくても、近隣の病院なら紹介してくれると思います。"], 38 | filename=f"audios/{model_id}_japanese_test_{version}.wav", 39 | device=exec_device, 40 | language="jpn", 41 | speaker_reference=speaker_reference) 42 | 43 | 44 | def chinese_test(version, model_id=None, exec_device="cpu", speaker_reference=None): 45 | os.makedirs("audios", exist_ok=True) 46 | 47 | read_texts(model_id=model_id, 48 | sentence=["李绅 《悯农》 锄禾日当午, 汗滴禾下土。 谁知盘中餐, 粒粒皆辛苦。"], 49 | filename=f"audios/{model_id}_chinese_test_{version}.wav", 50 | device=exec_device, 51 | language="cmn", 52 | speaker_reference=speaker_reference) 53 | 54 | 55 | def german_test(version, model_id=None, exec_device="cpu", speaker_reference=None): 56 | os.makedirs("audios", exist_ok=True) 57 | 58 | read_texts(model_id=model_id, 59 | sentence=["""Fest gemauert in der Erden, 60 | Steht die Form, aus Lehm gebrannt. 61 | Heute muss die Glocke werden! 62 | Frisch, Gesellen, seid zur Hand!"""], 63 | filename=f"audios/{model_id}_german_test_{version}.wav", 64 | device=exec_device, 65 | language="deu", 66 | speaker_reference=speaker_reference) 67 | 68 | 69 | def vietnamese_test(version, model_id=None, exec_device="cpu", speaker_reference=None): 70 | os.makedirs("audios", exist_ok=True) 71 | 72 | read_texts(model_id=model_id, 73 | sentence=["""Thân phận, 74 | ở một nơi luôn phải nhắc mình, 75 | im miệng, 76 | thân phận, 77 | là khi nói về quá khứ, 78 | ngó trước nhìn sau, 79 | là phải biết nhắm mắt bịt tai làm lơ, 80 | thờ ơ, 81 | với tất cả những điều gai chướng, 82 | thân phận chúng tôi ở đó, 83 | những quyển sách chuyền tay nhau như ăn cắp, 84 | ngôn luận ư? 85 | không có đất cho nghĩa tự do."""], 86 | filename=f"audios/{model_id}_vietnamese_test_{version}.wav", 87 | device=exec_device, 88 | language="vie", 89 | speaker_reference=speaker_reference) 90 | 91 | 92 | def french_test(version, model_id=None, exec_device="cpu", speaker_reference=None): 93 | os.makedirs("audios", exist_ok=True) 94 | 95 | read_texts(model_id=model_id, 96 | sentence=["""Maître corbeau, sur un arbre perché, 97 | Tenait en son bec un fromage. 98 | Maître renard par l'odeur alléché , 99 | Lui tint à peu près ce langage : 100 | «Et bonjour Monsieur du Corbeau. 101 | Que vous êtes joli! 
que vous me semblez beau!"""], 102 | filename=f"audios/{model_id}_french_test_{version}.wav", 103 | device=exec_device, 104 | language="fra", 105 | speaker_reference=speaker_reference) 106 | 107 | 108 | def all_test(version, model_id=None, exec_device="cpu", speaker_reference=None): 109 | english_test(version, model_id, exec_device, speaker_reference) 110 | german_test(version, model_id, exec_device, speaker_reference) 111 | french_test(version, model_id, exec_device, speaker_reference) 112 | vietnamese_test(version, model_id, exec_device, speaker_reference) 113 | japanese_test(version, model_id, exec_device, speaker_reference) 114 | chinese_test(version, model_id, exec_device, speaker_reference) 115 | 116 | 117 | if __name__ == '__main__': 118 | exec_device = "cuda" if torch.cuda.is_available() else "cpu" 119 | print(f"running on {exec_device}") 120 | 121 | os.makedirs("audios/speaker_references/", exist_ok=True) 122 | merged_speaker_references = ["audios/speaker_references/" + ref for ref in os.listdir("audios/speaker_references/")] 123 | 124 | all_test(version="version_11", 125 | model_id=None, # will use the default 126 | exec_device=exec_device, 127 | speaker_reference=merged_speaker_references if merged_speaker_references != [] else None) 128 | -------------------------------------------------------------------------------- /Modules/GeneralLayers/EncoderLayer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Johns Hopkins University (Shinji Watanabe) 2 | # Northwestern Polytechnical University (Pengcheng Guo) 3 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 4 | # Adapted by Florian Lux 2021 5 | 6 | 7 | import torch 8 | from torch import nn 9 | 10 | from Modules.GeneralLayers.LayerNorm import LayerNorm 11 | 12 | 13 | class EncoderLayer(nn.Module): 14 | """ 15 | Encoder layer module. 16 | 17 | Args: 18 | size (int): Input dimension. 19 | self_attn (torch.nn.Module): Self-attention module instance. 20 | `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` instance 21 | can be used as the argument. 22 | feed_forward (torch.nn.Module): Feed-forward module instance. 23 | `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance 24 | can be used as the argument. 25 | feed_forward_macaron (torch.nn.Module): Additional feed-forward module instance. 26 | `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance 27 | can be used as the argument. 28 | conv_module (torch.nn.Module): Convolution module instance. 29 | `ConvolutionModule` instance can be used as the argument. 30 | dropout_rate (float): Dropout rate. 31 | normalize_before (bool): Whether to use layer_norm before the first block. 32 | concat_after (bool): Whether to concat attention layer's input and output. 33 | if True, additional linear will be applied. 34 | i.e. x -> x + linear(concat(x, att(x))) 35 | if False, no additional linear will be applied. i.e.
x -> x + att(x) 36 | 37 | """ 38 | 39 | def __init__(self, size, self_attn, feed_forward, feed_forward_macaron, conv_module, dropout_rate, normalize_before=True, concat_after=False, ): 40 | super(EncoderLayer, self).__init__() 41 | self.self_attn = self_attn 42 | self.feed_forward = feed_forward 43 | self.feed_forward_macaron = feed_forward_macaron 44 | self.conv_module = conv_module 45 | self.norm_ff = LayerNorm(size) # for the FNN module 46 | self.norm_mha = LayerNorm(size) # for the MHA module 47 | if feed_forward_macaron is not None: 48 | self.norm_ff_macaron = LayerNorm(size) 49 | self.ff_scale = 0.5 50 | else: 51 | self.ff_scale = 1.0 52 | if self.conv_module is not None: 53 | self.norm_conv = LayerNorm(size) # for the CNN module 54 | self.norm_final = LayerNorm(size) # for the final output of the block 55 | self.dropout = nn.Dropout(dropout_rate) 56 | self.size = size 57 | self.normalize_before = normalize_before 58 | self.concat_after = concat_after 59 | if self.concat_after: 60 | self.concat_linear = nn.Linear(size + size, size) 61 | 62 | def forward(self, x_input, mask, cache=None): 63 | """ 64 | Compute encoded features. 65 | 66 | Args: 67 | x_input (Union[Tuple, torch.Tensor]): Input tensor w/ or w/o pos emb. 68 | - w/ pos emb: Tuple of tensors [(#batch, time, size), (1, time, size)]. 69 | - w/o pos emb: Tensor (#batch, time, size). 70 | mask (torch.Tensor): Mask tensor for the input (#batch, time). 71 | cache (torch.Tensor): Cache tensor of the input (#batch, time - 1, size). 72 | 73 | Returns: 74 | torch.Tensor: Output tensor (#batch, time, size). 75 | torch.Tensor: Mask tensor (#batch, time). 76 | 77 | """ 78 | if isinstance(x_input, tuple): 79 | x, pos_emb = x_input[0], x_input[1] 80 | else: 81 | x, pos_emb = x_input, None 82 | 83 | # whether to use macaron style 84 | if self.feed_forward_macaron is not None: 85 | residual = x 86 | if self.normalize_before: 87 | x = self.norm_ff_macaron(x) 88 | x = residual + self.ff_scale * self.dropout(self.feed_forward_macaron(x)) 89 | if not self.normalize_before: 90 | x = self.norm_ff_macaron(x) 91 | 92 | # multi-headed self-attention module 93 | residual = x 94 | if self.normalize_before: 95 | x = self.norm_mha(x) 96 | 97 | if cache is None: 98 | x_q = x 99 | else: 100 | assert cache.shape == (x.shape[0], x.shape[1] - 1, self.size) 101 | x_q = x[:, -1:, :] 102 | residual = residual[:, -1:, :] 103 | mask = None if mask is None else mask[:, -1:, :] 104 | 105 | if pos_emb is not None: 106 | x_att = self.self_attn(x_q, x, x, pos_emb, mask) 107 | else: 108 | x_att = self.self_attn(x_q, x, x, mask) 109 | 110 | if self.concat_after: 111 | x_concat = torch.cat((x, x_att), dim=-1) 112 | x = residual + self.concat_linear(x_concat) 113 | else: 114 | x = residual + self.dropout(x_att) 115 | if not self.normalize_before: 116 | x = self.norm_mha(x) 117 | 118 | # convolution module 119 | if self.conv_module is not None: 120 | residual = x 121 | if self.normalize_before: 122 | x = self.norm_conv(x) 123 | x = residual + self.dropout(self.conv_module(x)) 124 | if not self.normalize_before: 125 | x = self.norm_conv(x) 126 | 127 | # feed forward module 128 | residual = x 129 | if self.normalize_before: 130 | x = self.norm_ff(x) 131 | x = residual + self.ff_scale * self.dropout(self.feed_forward(x)) 132 | if not self.normalize_before: 133 | x = self.norm_ff(x) 134 | 135 | if self.conv_module is not None: 136 | x = self.norm_final(x) 137 | 138 | if cache is not None: 139 | x = torch.cat([cache, x], dim=1) 140 | 141 | if pos_emb is not None: 142 | return 
(x, pos_emb), mask 143 | 144 | return x, mask 145 | -------------------------------------------------------------------------------- /Preprocessing/multilinguality/visualize_distances.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | import matplotlib.pyplot as plt 4 | import networkx as nx 5 | import torch 6 | from huggingface_hub import hf_hub_download 7 | from tqdm import tqdm 8 | 9 | from Modules.ToucanTTS.InferenceToucanTTS import ToucanTTS 10 | from Utility.storage_config import MODEL_DIR 11 | from Utility.utils import load_json_from_path 12 | 13 | distance_types = ["tree", "asp", "map", "learned", "l1"] 14 | distance_type = distance_types[2] # switch here 15 | edge_threshold = 0.1 16 | 17 | cache_root = "." 18 | supervised_iso_codes = load_json_from_path(hf_hub_download(cache_dir=MODEL_DIR, repo_id="Flux9665/ToucanTTS", filename="supervised_languages.json")) 19 | 20 | if distance_type == "l1": 21 | iso_codes_to_ids = load_json_from_path(hf_hub_download(cache_dir=MODEL_DIR, repo_id="Flux9665/ToucanTTS", filename="iso_lookup.json"))[-1] 22 | model_path = hf_hub_download(cache_dir=MODEL_DIR, repo_id="Flux9665/ToucanTTS", filename="ToucanTTS.pt") 23 | checkpoint = torch.load(model_path, map_location='cpu') 24 | embedding_provider = ToucanTTS(weights=checkpoint["model"], config=checkpoint["config"]).encoder.language_embedding 25 | embedding_provider.requires_grad_(False) 26 | l1_dist = dict() 27 | seen_langs = set() 28 | for lang_1 in supervised_iso_codes: 29 | if lang_1 not in seen_langs: 30 | seen_langs.add(lang_1) 31 | l1_dist[lang_1] = dict() 32 | for lang_2 in supervised_iso_codes: 33 | if lang_2 not in seen_langs: # it's symmetric 34 | l1_dist[lang_1][lang_2] = torch.nn.functional.mse_loss(embedding_provider(torch.LongTensor([iso_codes_to_ids[lang_1]])).squeeze(), embedding_provider(torch.LongTensor([iso_codes_to_ids[lang_2]])).squeeze()) 35 | largest_value_l1_dist = 0.0 36 | for _, values in l1_dist.items(): 37 | for _, value in values.items(): 38 | largest_value_l1_dist = max(largest_value_l1_dist, value) 39 | for key1 in l1_dist: 40 | for key2 in l1_dist[key1]: 41 | l1_dist[key1][key2] = l1_dist[key1][key2] / largest_value_l1_dist 42 | distance_measure = l1_dist 43 | 44 | if distance_type == "tree": 45 | tree_lookup_path = hf_hub_download(cache_dir=MODEL_DIR, repo_id="Flux9665/ToucanTTS", filename="lang_1_to_lang_2_to_tree_dist.json") 46 | tree_dist = load_json_from_path(tree_lookup_path) 47 | distance_measure = tree_dist 48 | 49 | if distance_type == "map": 50 | map_lookup_path = hf_hub_download(cache_dir=MODEL_DIR, repo_id="Flux9665/ToucanTTS", filename="lang_1_to_lang_2_to_map_dist.json") 51 | map_dist = load_json_from_path(map_lookup_path) 52 | largest_value_map_dist = 0.0 53 | for _, values in map_dist.items(): 54 | for _, value in values.items(): 55 | largest_value_map_dist = max(largest_value_map_dist, value) 56 | for key1 in map_dist: 57 | for key2 in map_dist[key1]: 58 | map_dist[key1][key2] = map_dist[key1][key2] / largest_value_map_dist 59 | distance_measure = map_dist 60 | 61 | if distance_type == "learned": 62 | learned_lookup_path = hf_hub_download(cache_dir=MODEL_DIR, repo_id="Flux9665/ToucanTTS", filename="lang_1_to_lang_2_to_learned_dist.json") 63 | learned_dist = load_json_from_path(learned_lookup_path) 64 | largest_value_learned_dist = 0.0 65 | for _, values in learned_dist.items(): 66 | for _, value in values.items(): 67 | largest_value_learned_dist = max(largest_value_learned_dist, value) 68 | for 
key1 in learned_dist: 69 | for key2 in learned_dist[key1]: 70 | learned_dist[key1][key2] = learned_dist[key1][key2] / largest_value_learned_dist 71 | distance_measure = learned_dist 72 | 73 | if distance_type == "asp": 74 | asp_dict_path = hf_hub_download(cache_dir=MODEL_DIR, repo_id="Flux9665/ToucanTTS", filename="asp_dict.pkl") 75 | with open(asp_dict_path, 'rb') as dictfile: 76 | asp_sim = pickle.load(dictfile) 77 | lang_list = list(asp_sim.keys()) 78 | asp_dist = dict() 79 | seen_langs = set() 80 | for lang_1 in lang_list: 81 | if lang_1 not in seen_langs: 82 | seen_langs.add(lang_1) 83 | asp_dist[lang_1] = dict() 84 | for index, lang_2 in enumerate(lang_list): 85 | if lang_2 not in seen_langs: # it's symmetric 86 | asp_dist[lang_1][lang_2] = 1 - asp_sim[lang_1][index] 87 | distance_measure = asp_dist 88 | 89 | iso_codes_to_names = load_json_from_path(hf_hub_download(cache_dir=MODEL_DIR, repo_id="Flux9665/ToucanTTS", filename="iso_to_fullname.json")) 90 | distances = list() 91 | 92 | for lang_1 in distance_measure: 93 | if lang_1 not in iso_codes_to_names: 94 | continue 95 | for lang_2 in distance_measure[lang_1]: 96 | distances.append((iso_codes_to_names[lang_1], iso_codes_to_names[lang_2], distance_measure[lang_1][lang_2])) 97 | 98 | # Create a graph 99 | G = nx.Graph() 100 | 101 | # Add edges along with distances as weights 102 | min_dist = min(d for _, _, d in distances) 103 | max_dist = max(d for _, _, d in distances) 104 | normalized_distances = [(entity1, entity2, (d - min_dist) / (max_dist - min_dist)) for entity1, entity2, d in distances] 105 | 106 | for entity1, entity2, d in tqdm(normalized_distances): 107 | if d <= edge_threshold and entity1 != entity2: 108 | spring_tension = edge_threshold - d 109 | G.add_edge(entity1, entity2, weight=spring_tension * 10) 110 | 111 | # Draw the graph 112 | pos = nx.spring_layout(G, weight="weight") # Positions for all nodes 113 | edges = G.edges(data=True) 114 | 115 | # Draw nodes 116 | nx.draw_networkx_nodes(G, pos, node_size=1, alpha=0.01) 117 | 118 | # Draw edges with labels 119 | nx.draw_networkx_edges(G, pos, alpha=0.01, edge_color="gray") 120 | # nx.draw_networkx_edge_labels(G, pos, edge_labels={(u, v): d['weight'] for u, v, d in edges}) 121 | 122 | # Draw node labels 123 | nx.draw_networkx_labels(G, pos, font_size=10, font_family='sans-serif') 124 | 125 | plt.title(f'Graph of {distance_type} Distances') 126 | 127 | plt.subplots_adjust(left=0, right=1, top=1, bottom=0) 128 | plt.tight_layout(pad=0) 129 | 130 | plt.show() 131 | --------------------------------------------------------------------------------