├── Modules ├── __init__.py ├── Aligner │ ├── __init__.py │ ├── README.md │ └── Reconstructor.py ├── Vocoder │ ├── __init__.py │ ├── README.md │ ├── SAN_LICENSE │ ├── HiFiGAN_LICENSE │ ├── Avocodo_LICENSE │ ├── FeatureMatchingLoss.py │ ├── Snake.py │ ├── AMP.py │ ├── BigVGAN.py │ ├── AdversarialLoss.py │ └── MelSpecLoss.py ├── GeneralLayers │ ├── __init__.py │ ├── README.md │ ├── Swish.py │ ├── MultiSequential.py │ ├── PositionwiseFeedForward.py │ ├── LayerNorm.py │ ├── Convolution.py │ ├── LengthRegulator.py │ ├── ResidualStack.py │ ├── MultiLayeredConv1d.py │ ├── VariancePredictor.py │ ├── ResidualBlock.py │ ├── STFT.py │ ├── ConditionalLayerNorm.py │ └── EncoderLayer.py ├── ToucanTTS │ ├── __init__.py │ ├── README.md │ ├── DurationCalculator.py │ ├── StochasticToucanTTSLoss.py │ ├── glow_utils.py │ ├── ToucanTTSLoss.py │ ├── LanguageEmbeddingSpaceStructureLoss.py │ ├── CodecDiscriminator.py │ ├── EnergyCalculator.py │ ├── toucantts_train_loop_arbiter.py │ ├── PitchCalculator.py │ ├── wavenet.py │ └── flow_matching.py ├── ControllabilityGAN │ ├── __init__.py │ ├── wgan │ │ ├── __init__.py │ │ ├── resnet_init.py │ │ ├── init_weights.py │ │ └── init_wgan.py │ ├── dataset │ │ ├── __init__.py │ │ └── speaker_embeddings_dataset.py │ └── GAN.py ├── EmbeddingModel │ ├── __init__.py │ ├── README.md │ ├── StyleEmbedding.py │ └── StyleTTSEncoder.py └── README.md ├── Recipes ├── __init__.py ├── README.md ├── ToucanTTS_Nancy.py ├── ToucanTTS_Massive_English_stage2.py ├── ToucanTTS_IntegrationTest.py ├── BigVGAN_e2e.py ├── HiFiGAN_e2e.py ├── finetuning_example_simple.py ├── ToucanTTS_Massive_German.py └── finetuning_example_multilingual.py ├── Utility ├── __init__.py ├── storage_config.py ├── toucan.png ├── README.md ├── weight_averaging.py ├── WarmupScheduler.py ├── corpus_preparation.py ├── silence_removal.py └── diverse_losses.py ├── Preprocessing ├── __init__.py ├── Codec │ ├── __init__.py │ ├── README.md │ └── encodec.py ├── multilinguality │ ├── __init__.py │ ├── README.md │ ├── generate_zero_shot_lang_embs.py │ └── visualize_distances.py └── README.md ├── InferenceInterfaces ├── __init__.py └── README.md ├── .gitignore ├── run_scorer.py ├── requirements.txt ├── run_prosody_override.py ├── .github └── FUNDING.yml ├── run_zero_shot_lang_emb_injection.py ├── run_simple_GUI_demo.py ├── run_training_pipeline.py └── run_text_to_file_reader.py /Modules/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Recipes/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Utility/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Modules/Aligner/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Modules/Vocoder/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Preprocessing/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
/InferenceInterfaces/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Modules/GeneralLayers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Modules/ToucanTTS/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Preprocessing/Codec/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Modules/ControllabilityGAN/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Modules/EmbeddingModel/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Modules/ControllabilityGAN/wgan/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Preprocessing/multilinguality/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Modules/ControllabilityGAN/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Utility/storage_config.py: -------------------------------------------------------------------------------- 1 | MODEL_DIR = "Models/" 2 | PREPROCESSING_DIR = "Corpora/" 3 | -------------------------------------------------------------------------------- /Utility/toucan.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0xSojalSec/IMS-Toucan/HEAD/Utility/toucan.png -------------------------------------------------------------------------------- /Modules/Vocoder/README.md: -------------------------------------------------------------------------------- 1 | This directory contains the code needed to train a HiFiGAN vocoder on the spectrogram representation that we use. -------------------------------------------------------------------------------- /Preprocessing/README.md: -------------------------------------------------------------------------------- 1 | This directory contains scripts that wrap around text processing and audio processing to allow for high-level 2 | interactions with the feature extraction. -------------------------------------------------------------------------------- /Modules/Aligner/README.md: -------------------------------------------------------------------------------- 1 | Everything that is concerned with training and using the aligner model is contained in this directory. It is recommended 2 | to use the universal aligner model that we supply in the GitHub releases. 
-------------------------------------------------------------------------------- /Modules/README.md: -------------------------------------------------------------------------------- 1 | This directory contains all the models that are used in this toolkit for various tasks. The models' directories contain 2 | their feature extractors, their datasets, their architectures, and their train loops. -------------------------------------------------------------------------------- /Modules/GeneralLayers/README.md: -------------------------------------------------------------------------------- 1 | This directory contains a collection of layers that are used both during training time and during inference time. Large 2 | portions of these layers are either taken directly from ESPnet or adapted from it. -------------------------------------------------------------------------------- /Utility/README.md: -------------------------------------------------------------------------------- 1 | This directory contains general utility scripts, including additional losses for certain tasks, the scheduler, the 2 | interfaces to the files on disk, and the scorer, which can help find samples that might be causing 3 | problems for the TTS. -------------------------------------------------------------------------------- /Preprocessing/Codec/README.md: -------------------------------------------------------------------------------- 1 | This code is taken from https://github.com/yangdongchao/AcademiCodec/tree/master 2 | 3 | It is their version of encodec that is sampled at 16kHz, which the original encodec repository does not offer. The 4 | download of the necessary files should happen automatically. -------------------------------------------------------------------------------- /InferenceInterfaces/README.md: -------------------------------------------------------------------------------- 1 | This directory contains interfaces that enable high-level interactions with trained TTS models, which are just loaded 2 | for different inference tasks, like cloning the exact prosody of a reference utterance, or simply reading a text out 3 | loud or writing it to an audio file. -------------------------------------------------------------------------------- /Modules/EmbeddingModel/README.md: -------------------------------------------------------------------------------- 1 | Everything that is concerned with the embedding model is contained in this directory. The embedding function does not 2 | have its own train loop, because it is always trained jointly with the TTS. Most of the time, however, it is used in a 3 | frozen state. We recommend using the embedding function that we publish in the GitHub releases. -------------------------------------------------------------------------------- /Modules/ToucanTTS/README.md: -------------------------------------------------------------------------------- 1 | This directory contains everything needed to extract features for our TTS model and train it. It contains a lot of 2 | designs from different origins, so bringing it all together, we call it ToucanTTS. 3 | 4 | In German, when somebody talks a lot, you can say that they have a big beak. And this system sure talks a lot, so it has 5 | to be the bird with the most prominent beak! 
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | tensorboard_logs/ 3 | Corpora/ 4 | Models/ 5 | audios/ 6 | Preprocessing/glottolog/ 7 | Preprocessing/multilinguality/datasets/ 8 | apex/ 9 | pretrained_models/ 10 | .tmp/ 11 | .vscode/ 12 | split/ 13 | singing/ 14 | toucan_conda_venv/ 15 | venv/ 16 | vis/ 17 | Utility/storage_config.py 18 | Preprocessing/multilinguality/distance_datasets 19 | 20 | 21 | *_graph 22 | app.py 23 | gradio* 24 | *playground* 25 | run_phonemizer.py 26 | 27 | *.pt 28 | *.out 29 | *.wav 30 | *.flac 31 | *.json 32 | *.pyc 33 | *.png 34 | *.pdf 35 | *.pkl 36 | *.gif -------------------------------------------------------------------------------- /Modules/GeneralLayers/Swish.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Johns Hopkins University (Shinji Watanabe) 2 | # Northwestern Polytechnical University (Pengcheng Guo) 3 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 4 | # Adapted by Florian Lux 2021 5 | 6 | import torch 7 | 8 | 9 | class Swish(torch.nn.Module): 10 | """ 11 | Construct a Swish activation function for Conformer. 12 | """ 13 | 14 | def forward(self, x): 15 | """ 16 | Return Swish activation function. 17 | """ 18 | return x * torch.sigmoid(x) 19 | -------------------------------------------------------------------------------- /Recipes/README.md: -------------------------------------------------------------------------------- 1 | This directory contains all the pipelines, which are called in the run_training_pipeline.py script. A pipeline is a 2 | wrapper around the train loop, that loads a dataset and sets hyperparameters and settings, which it then all forwards 3 | into the actual train loop of the corresponding task. Since the TTS train loops have an arbiter that 4 | decides whether the mono-lingual or the multi-lingual train loop will be run, this does not need to be decided in the 5 | pipeline. All datasets that belong to the same language should be combined into a concat dataset before being passed to 6 | the train loop function for the arbiter to work correctly. -------------------------------------------------------------------------------- /run_scorer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example use of the scorer utility to inspect data. 3 | (pre-trained models and cache files with already extracted features are required.) 
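The scorer computes the TTS loss for every sample in a prepared dataset cache, so that the samples with the highest loss can be inspected and, if necessary, removed from the cache.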
4 | """ 5 | 6 | from Utility.Scorer import TTSScorer 7 | from Utility.path_to_transcript_dicts import * 8 | from Utility.storage_config import PREPROCESSING_DIR 9 | 10 | exec_device = "cuda:8" # ADAPT THIS 11 | 12 | lang_id = "eng" 13 | tts_scorer = TTSScorer(path_to_model=None, device=exec_device) 14 | tts_scorer.score(path_to_toucantts_dataset=os.path.join(PREPROCESSING_DIR, "IntegrationTest"), lang_id=lang_id) 15 | tts_scorer.show_samples_with_highest_loss(20) 16 | tts_scorer.remove_samples_with_highest_loss(5) 17 | -------------------------------------------------------------------------------- /Modules/ControllabilityGAN/wgan/resnet_init.py: -------------------------------------------------------------------------------- 1 | from Modules.ControllabilityGAN.wgan.init_weights import weights_init_D 2 | from Modules.ControllabilityGAN.wgan.init_weights import weights_init_G 3 | from Modules.ControllabilityGAN.wgan.resnet_1 import ResNet_D 4 | from Modules.ControllabilityGAN.wgan.resnet_1 import ResNet_G 5 | 6 | 7 | def init_resnet(parameters): 8 | critic = ResNet_D(parameters['data_dim'][-1], parameters['size'], nfilter=parameters['nfilter'], nfilter_max=parameters['nfilter_max']) 9 | generator = ResNet_G(parameters['data_dim'][-1], parameters['z_dim'], parameters['size'], nfilter=parameters['nfilter'], 10 | nfilter_max=parameters['nfilter_max']) 11 | 12 | generator.apply(weights_init_G) 13 | critic.apply(weights_init_D) 14 | 15 | return generator, critic 16 | -------------------------------------------------------------------------------- /Modules/ControllabilityGAN/wgan/init_weights.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | 4 | def weights_init_D(m): 5 | classname = m.__class__.__name__ 6 | if classname.find('Conv') != -1: 7 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='leaky_relu') 8 | # nn.init.constant_(m.bias, 0) 9 | elif classname.find('BatchNorm') != -1: 10 | nn.init.constant_(m.weight, 1) 11 | nn.init.constant_(m.bias, 0) 12 | 13 | 14 | def weights_init_G(m): 15 | classname = m.__class__.__name__ 16 | if classname.find('Conv') != -1: 17 | nn.init.kaiming_normal_(m.weight, mode='fan_in', nonlinearity='leaky_relu') 18 | # nn.init.constant_(m.bias, 0) 19 | elif classname.find('BatchNorm') != -1: 20 | nn.init.constant_(m.weight, 1) 21 | nn.init.constant_(m.bias, 0) 22 | -------------------------------------------------------------------------------- /Modules/GeneralLayers/MultiSequential.py: -------------------------------------------------------------------------------- 1 | # Written by Shigeki Karita, 2019 2 | # Published under Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 3 | # Adapted by Florian Lux, 2021 4 | 5 | import torch 6 | 7 | 8 | class MultiSequential(torch.nn.Sequential): 9 | """ 10 | Multi-input multi-output torch.nn.Sequential. 11 | """ 12 | 13 | def forward(self, *args): 14 | """ 15 | Repeat. 16 | """ 17 | for m in self: 18 | args = m(*args) 19 | return args 20 | 21 | 22 | def repeat(N, fn): 23 | """ 24 | Repeat module N times. 25 | 26 | Args: 27 | N (int): Number of repeat time. 28 | fn (Callable): Function to generate module. 29 | 30 | Returns: 31 | MultiSequential: Repeated model instance. 
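        Note: every module produced by fn must accept and return the same tuple of arguments, because the outputs of one module are unpacked into the next (args = m(*args)).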
32 | """ 33 | return MultiSequential(*[fn(n) for n in range(N)]) 34 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch~=2.4.0 2 | torchaudio~=2.4.0 3 | torch_complex~=0.4.3 4 | epitran==1.24 5 | tqdm~=4.64.1 6 | scipy~=1.9.3 7 | librosa~=0.9.2 8 | praat-parselmouth~=0.4.2 9 | numpy~=1.23.4 10 | soundfile~=0.12.0 11 | pypinyin~=0.47.1 12 | pyloudnorm~=0.1.0 13 | cvxopt~=1.3.0 14 | sounddevice~=0.4.5 15 | matplotlib~=3.9.2 16 | phonemizer~=3.2.1 17 | gradio~=5.23.2 18 | pyqt5~=5.15.11 19 | pyqtgraph~=0.13.7 20 | wandb~=0.13.5 21 | speechbrain==0.5.13 22 | dragonmapper~=0.2.6 23 | alias_free_torch~=0.0.6 24 | dotwiz==0.4.0 25 | transphone==1.5.3 26 | phonepiece==1.4.2 27 | geopy==2.4.1 28 | einops==0.7.0 29 | datasets~=2.10.1 30 | pandas~=1.5.0 31 | rich~=13.4.2 32 | PyYAML~=6.0 33 | imageio~=2.34.0 34 | pykakasi~=2.2.1 35 | jamo~=0.4.1 36 | g2pk~=0.9.4 37 | pykan~=0.2.6 38 | huggingface-hub~=0.25.2 39 | pynput~=1.7.7 40 | PyAutoGUI~=0.9.54 41 | networkx~=3.3 42 | scikit-learn~=1.5.0 -------------------------------------------------------------------------------- /run_prosody_override.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from InferenceInterfaces.UtteranceCloner import UtteranceCloner 4 | 5 | if __name__ == '__main__': 6 | uc = UtteranceCloner(model_id=None, device="cuda" if torch.cuda.is_available() else "cpu") 7 | 8 | # What is said in path_to_reference_audio_for_intonation has to match the text in transcription_of_intonation_reference exactly! 9 | uc.clone_utterance(path_to_reference_audio_for_intonation="audios/speaker_references_for_testing/sad.wav", 10 | path_to_reference_audio_for_voice="audios/speaker_references_for_testing/female_mid_voice.wav", # the two reference audios can be the same, but don't have to be 11 | transcription_of_intonation_reference="This report is due tomorrow.", 12 | filename_of_result="audios/test_cloned.wav", 13 | lang="eng") 14 | -------------------------------------------------------------------------------- /Modules/GeneralLayers/PositionwiseFeedForward.py: -------------------------------------------------------------------------------- 1 | # Written by Shigeki Karita, 2019 2 | # Published under Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 3 | # Adapted by Florian Lux, 2021 4 | 5 | 6 | import torch 7 | 8 | 9 | class PositionwiseFeedForward(torch.nn.Module): 10 | """ 11 | Args: 12 | idim (int): Input dimension. 13 | hidden_units (int): The number of hidden units. 14 | dropout_rate (float): Dropout rate. 
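        The module computes w_2(dropout(activation(w_1(x)))), i.e. two linear projections with an activation and dropout in between, applied to every position independently.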
15 | 16 | """ 17 | 18 | def __init__(self, idim, hidden_units, dropout_rate, activation=torch.nn.ReLU()): 19 | super(PositionwiseFeedForward, self).__init__() 20 | self.w_1 = torch.nn.Linear(idim, hidden_units) 21 | self.w_2 = torch.nn.Linear(hidden_units, idim) 22 | self.dropout = torch.nn.Dropout(dropout_rate) 23 | self.activation = activation 24 | 25 | def forward(self, x): 26 | return self.w_2(self.dropout(self.activation(self.w_1(x)))) 27 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: [ Flux9665 ] # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] 4 | patreon: # Replace with a single Patreon username 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: # Replace with a single Ko-fi username 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry 12 | polar: # Replace with a single Polar username 13 | buy_me_a_coffee: # Replace with a single Buy Me a Coffee username 14 | thanks_dev: # Replace with a single thanks.dev username 15 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 16 | -------------------------------------------------------------------------------- /Modules/GeneralLayers/LayerNorm.py: -------------------------------------------------------------------------------- 1 | # Written by Shigeki Karita, 2019 2 | # Published under Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 3 | # Adapted by Florian Lux, 2021 4 | 5 | import torch 6 | 7 | 8 | class LayerNorm(torch.nn.LayerNorm): 9 | """ 10 | Layer normalization module. 11 | 12 | Args: 13 | nout (int): Output dim size. 14 | dim (int): Dimension to be normalized. 15 | """ 16 | 17 | def __init__(self, nout, dim=-1, eps=1e-12): 18 | """ 19 | Construct an LayerNorm object. 20 | """ 21 | super(LayerNorm, self).__init__(nout, eps=eps) 22 | self.dim = dim 23 | 24 | def forward(self, x): 25 | """ 26 | Apply layer normalization. 27 | 28 | Args: 29 | x (torch.Tensor): Input tensor. 30 | 31 | Returns: 32 | torch.Tensor: Normalized tensor. 33 | """ 34 | if self.dim == -1: 35 | return super(LayerNorm, self).forward(x) 36 | return super(LayerNorm, self).forward(x.transpose(1, -1)).transpose(1, -1) 37 | -------------------------------------------------------------------------------- /Modules/ToucanTTS/DurationCalculator.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Nagoya University (Tomoki Hayashi) 2 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 3 | # Adapted by Florian Lux 2021 4 | 5 | import matplotlib.pyplot as plt 6 | 7 | import torch 8 | 9 | 10 | class DurationCalculator(torch.nn.Module): 11 | 12 | def __init__(self, reduction_factor=1.0): 13 | super().__init__() 14 | 15 | @torch.no_grad() 16 | def forward(self, att_ws, vis=None): 17 | """ 18 | Convert alignment matrix to durations. 
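        The duration assigned to input token i is the number of output frames whose attention argmax falls on i; if a path is passed via vis, the alignment matrix is additionally saved there as a heatmap.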
19 | """ 20 | if vis is not None: 21 | plt.figure(figsize=(8, 4)) 22 | plt.imshow(att_ws.cpu().numpy(), interpolation='nearest', aspect='auto', origin="lower") 23 | plt.xlabel("Inputs") 24 | plt.ylabel("Outputs") 25 | plt.tight_layout() 26 | plt.savefig(vis) 27 | plt.close() 28 | # calculate duration from 2d alignment matrix 29 | durations = torch.stack([att_ws.argmax(-1).eq(i).sum() for i in range(att_ws.shape[1])]) 30 | return durations.view(-1) 31 | -------------------------------------------------------------------------------- /Modules/Vocoder/SAN_LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Sony Research Inc. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /Modules/Vocoder/HiFiGAN_LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2022 Rishikesh (ऋषिकेश) 4 | Modified 2022 by Florian Lux 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in 14 | all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 | THE SOFTWARE. 
-------------------------------------------------------------------------------- /Modules/Vocoder/Avocodo_LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2020 Tomoki Hayashi 4 | Modified 2021 by Florian Lux 5 | Further code integrated from 2022 Rishikesh (ऋषिकेश), same license applies 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in 15 | all copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 23 | THE SOFTWARE. -------------------------------------------------------------------------------- /Modules/Aligner/Reconstructor.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.multiprocessing 3 | 4 | from Utility.utils import make_non_pad_mask 5 | 6 | 7 | class Reconstructor(torch.nn.Module): 8 | 9 | def __init__(self, 10 | n_features=128, 11 | num_symbols=145, 12 | speaker_embedding_dim=192, 13 | hidden_dim=256): 14 | super().__init__() 15 | self.in_proj = torch.nn.Linear(num_symbols + speaker_embedding_dim, hidden_dim) 16 | self.hidden_proj = torch.nn.Linear(hidden_dim, hidden_dim) 17 | self.out_proj = torch.nn.Linear(hidden_dim, n_features) 18 | self.l1_criterion = torch.nn.L1Loss(reduction="none") 19 | 20 | def forward(self, x, lens, ys): 21 | x = self.in_proj(x) 22 | x = torch.nn.functional.leaky_relu(x) 23 | x = self.hidden_proj(x) 24 | x = torch.nn.functional.leaky_relu(x) 25 | x = self.out_proj(x) 26 | out_masks = make_non_pad_mask(lens).unsqueeze(-1).to(ys.device) 27 | out_weights = out_masks.float() / out_masks.sum(dim=1, keepdim=True).float() 28 | out_weights /= ys.size(0) * ys.size(2) 29 | return self.l1_criterion(x, ys).mul(out_weights).masked_select(out_masks).sum() 30 | 31 | 32 | if __name__ == '__main__': 33 | print(sum(p.numel() for p in Reconstructor().parameters() if p.requires_grad)) 34 | -------------------------------------------------------------------------------- /Modules/ToucanTTS/StochasticToucanTTSLoss.py: -------------------------------------------------------------------------------- 1 | """ 2 | Taken from ESPNet 3 | Adapted by Flux 4 | """ 5 | 6 | import torch 7 | 8 | from Utility.utils import make_non_pad_mask 9 | 10 | 11 | class StochasticToucanTTSLoss(torch.nn.Module): 12 | 13 | def __init__(self): 14 | super().__init__() 15 | self.l1_criterion = torch.nn.L1Loss(reduction="none") 16 | 17 | def forward(self, predicted_features, gold_features, features_lengths): 18 | """ 19 | Args: 20 | predicted_features (Tensor): Batch of outputs (B, Lmax, odim). 
21 | gold_features (Tensor): Batch of target features (B, Lmax, odim). 22 | features_lengths (LongTensor): Batch of the lengths of each target (B,). 23 | 24 | Returns: 25 | Tensor: L1 loss value. 26 | """ 27 | 28 | # calculate loss 29 | l1_loss = self.l1_criterion(predicted_features, gold_features) 30 | 31 | # make weighted mask and apply it 32 | out_masks = make_non_pad_mask(features_lengths).unsqueeze(-1).to(gold_features.device) 33 | out_masks = torch.nn.functional.pad(out_masks.transpose(1, 2), [0, gold_features.size(1) - out_masks.size(1), 0, 0, 0, 0], value=False).transpose(1, 2) 34 | out_weights = out_masks.float() / out_masks.sum(dim=1, keepdim=True).float() 35 | out_weights /= gold_features.size(0) * gold_features.size(2) 36 | 37 | # apply weight 38 | l1_loss = l1_loss.mul(out_weights).masked_select(out_masks).sum() 39 | 40 | return l1_loss 41 | -------------------------------------------------------------------------------- /Modules/ControllabilityGAN/wgan/init_wgan.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from Modules.ControllabilityGAN.wgan.resnet_init import init_resnet 4 | from Modules.ControllabilityGAN.wgan.wgan_qc import WassersteinGanQuadraticCost 5 | 6 | 7 | def create_wgan(parameters, device, optimizer='adam'): 8 | if parameters['model'] == "resnet": 9 | generator, discriminator = init_resnet(parameters) 10 | else: 11 | raise NotImplementedError 12 | 13 | if optimizer == 'adam': 14 | optimizer_g = torch.optim.Adam(generator.parameters(), lr=parameters['learning_rate'], betas=parameters['betas']) 15 | optimizer_d = torch.optim.Adam(discriminator.parameters(), lr=parameters['learning_rate'], betas=parameters['betas']) 16 | elif optimizer == 'rmsprop': 17 | optimizer_g = torch.optim.RMSprop(generator.parameters(), lr=parameters['learning_rate']) 18 | optimizer_d = torch.optim.RMSprop(discriminator.parameters(), lr=parameters['learning_rate']) 19 | 20 | criterion = torch.nn.MSELoss() 21 | 22 | gan = WassersteinGanQuadraticCost(generator, 23 | discriminator, 24 | optimizer_g, 25 | optimizer_d, 26 | criterion=criterion, 27 | data_dimensions=parameters['data_dim'], 28 | epochs=parameters['epochs'], 29 | batch_size=parameters['batch_size'], 30 | device=device, 31 | n_max_iterations=parameters['n_max_iterations'], 32 | gamma=parameters['gamma']) 33 | 34 | return gan 35 | -------------------------------------------------------------------------------- /Preprocessing/multilinguality/README.md: -------------------------------------------------------------------------------- 1 | ## Zero-Shot Approximation of Language Embeddings 2 | 3 | This directory contains all scripts needed to reproduce the zero-shot meta-learning part of our system. 4 | These scripts allow you to predict representations of languages purely based on distances between them, as measured by a 5 | variety of linguistically informed metrics, or even better, a learned combination thereof. 6 | 7 | ### Applying zero-shot approximation to a trained model 8 | 9 | Use `run_zero_shot_lang_emb_injection.py` to update the language embeddings of a trained model for all languages that 10 | were *not* seen during training (by default, `supervised_languages.json` is used to determine which languages *were* 11 | seen). 12 | See the script for arguments that can be passed (e.g. to use a custom model path). 
Here is an example: 13 | 14 | ``` 15 | cd IMS-Toucan/ 16 | python run_zero_shot_lang_emb_injection.py -m <model_path> -d <distance_type> -k <n_closest> 17 | ``` 18 | 19 | By default, the updated model is saved with a modified filename in the same directory. 20 | 21 | ### Cached distance lookups 22 | 23 | In order to apply any zero-shot approximation, cache files for distance lookups are required. 24 | 25 | The ASP lookup file (`asp_dict.pkl`) needs to be downloaded from the release page. All other cache files are 26 | automatically generated as required when running `run_zero_shot_lang_emb_injection.py`. 27 | 28 | **Note:** While the map, tree, and inverse ASP distances are model independent, **the learned distance lookup is only 29 | applicable for the model it was trained on**, i.e., different Toucan models require different learned-distance lookups. 30 | If you want to apply zero-shot approximation to a new model, make sure that you are not using an outdated, pre-existing 31 | learned distance lookup, but instead train a new learned distance metric. 32 | -------------------------------------------------------------------------------- /run_zero_shot_lang_emb_injection.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from huggingface_hub import hf_hub_download 4 | 5 | from Preprocessing.multilinguality.create_distance_lookups import CacheCreator 6 | from Preprocessing.multilinguality.create_lang_dist_dataset import LangDistDatasetCreator 7 | from Preprocessing.multilinguality.generate_zero_shot_lang_embs import approximate_and_inject_language_embeddings 8 | from Utility.storage_config import MODEL_DIR 9 | 10 | if __name__ == "__main__": 11 | default_model_path = hf_hub_download(cache_dir=MODEL_DIR, repo_id="Flux9665/ToucanTTS", filename="ToucanTTS.pt") 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument("--model_path", "-m", type=str, default=default_model_path, help="model path from which to obtain pretrained language embeddings") 14 | parser.add_argument("--distance_type", "-d", type=str, choices=["map", "tree", "asp", "learned", "combined"], default="learned", 15 | help="which type of distance to use for finding nearest languages") 16 | parser.add_argument("--n_closest", "-k", type=int, default=50, help="how many nearest languages to select for language embedding approximation") 17 | 18 | args = parser.parse_args() 19 | 20 | # make sure that cache files exist 21 | cc = CacheCreator(cache_root="Preprocessing/multilinguality") 22 | cc.create_required_files(model_path=hf_hub_download(cache_dir=MODEL_DIR, repo_id="Flux9665/ToucanTTS", filename="ToucanTTS.pt")) 23 | 24 | # create distance dataset 25 | dc = LangDistDatasetCreator(args.model_path, cache_root="Preprocessing/multilinguality") 26 | distance_dataset = dc.create_dataset(args.distance_type, n_closest=args.n_closest, zero_shot=True) 27 | 28 | # generate zero-shot lang embs and inject into pretrained model, then save to modified model path 29 | approximate_and_inject_language_embeddings(model_path=args.model_path, 30 | df=distance_dataset, 31 | iso_lookup=dc.iso_lookup) 32 | -------------------------------------------------------------------------------- /Modules/Vocoder/FeatureMatchingLoss.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Tomoki Hayashi 2 | # MIT License (https://opensource.org/licenses/MIT) 3 | # Adapted by Florian Lux 2021 4 | 5 | 6 | import torch 7 | import torch.nn.functional as F 8 | 9 | 10 | def feature_loss(fmap_r, 
fmap_g): 11 | loss = 0 12 | for dr, dg in zip(fmap_r, fmap_g): 13 | loss += torch.mean(torch.abs(dr - dg)) 14 | 15 | return loss / len(fmap_g) 16 | 17 | 18 | class FeatureMatchLoss(torch.nn.Module): 19 | 20 | def __init__(self, 21 | average_by_layers=True, 22 | average_by_discriminators=False, 23 | include_final_outputs=False, ): 24 | super().__init__() 25 | self.average_by_layers = average_by_layers 26 | self.average_by_discriminators = average_by_discriminators 27 | self.include_final_outputs = include_final_outputs 28 | 29 | def forward(self, feats_hat, feats): 30 | """ 31 | Calculate feature matching loss. 32 | 33 | Args: 34 | feats_hat (list): List of lists of discriminator outputs 35 | calculated from generator outputs. 36 | feats (list): List of lists of discriminator outputs 37 | calculated from ground-truth. 38 | 39 | Returns: 40 | Tensor: Feature matching loss value. 41 | """ 42 | feat_match_loss = 0.0 43 | for i, (feats_hat_, feats_) in enumerate(zip(feats_hat, feats)): 44 | feat_match_loss_ = 0.0 45 | if not self.include_final_outputs: 46 | feats_hat_ = feats_hat_[:-1] 47 | feats_ = feats_[:-1] 48 | for j, (feat_hat_, feat_) in enumerate(zip(feats_hat_, feats_)): 49 | feat_match_loss_ += F.l1_loss(feat_hat_, feat_.detach()) 50 | if self.average_by_layers: 51 | feat_match_loss_ /= j + 1 52 | feat_match_loss += feat_match_loss_ 53 | if self.average_by_discriminators: 54 | feat_match_loss /= i + 1 55 | 56 | return feat_match_loss 57 | -------------------------------------------------------------------------------- /Modules/GeneralLayers/Convolution.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Johns Hopkins University (Shinji Watanabe) 2 | # Northwestern Polytechnical University (Pengcheng Guo) 3 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 4 | # Adapted by Florian Lux 2021 5 | 6 | 7 | from torch import nn 8 | 9 | 10 | class ConvolutionModule(nn.Module): 11 | """ 12 | ConvolutionModule in Conformer model. 13 | 14 | Args: 15 | channels (int): The number of channels of conv layers. 16 | kernel_size (int): Kernel size of conv layers. 17 | 18 | """ 19 | 20 | def __init__(self, channels, kernel_size, activation=nn.ReLU(), bias=True): 21 | super(ConvolutionModule, self).__init__() 22 | # kernel_size should be an odd number for 'SAME' padding 23 | assert (kernel_size - 1) % 2 == 0 24 | 25 | self.pointwise_conv1 = nn.Conv1d(channels, 2 * channels, kernel_size=1, stride=1, padding=0, bias=bias, ) 26 | self.depthwise_conv = nn.Conv1d(channels, channels, kernel_size, stride=1, padding=(kernel_size - 1) // 2, groups=channels, bias=bias, ) 27 | self.norm = nn.SyncBatchNorm.convert_sync_batchnorm(nn.BatchNorm1d(channels)) 28 | self.pointwise_conv2 = nn.Conv1d(channels, channels, kernel_size=1, stride=1, padding=0, bias=bias, ) 29 | self.activation = activation 30 | 31 | def forward(self, x): 32 | """ 33 | Compute convolution module. 34 | 35 | Args: 36 | x (torch.Tensor): Input tensor (#batch, time, channels). 37 | 38 | Returns: 39 | torch.Tensor: Output tensor (#batch, time, channels). 
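        The sequence of operations is: pointwise convolution with GLU gating, depthwise convolution, batch normalization followed by the activation, and a final pointwise convolution, with the time and feature dimensions transposed before and after.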
40 | 41 | """ 42 | # exchange the temporal dimension and the feature dimension 43 | x = x.transpose(1, 2) 44 | 45 | # GLU mechanism 46 | x = self.pointwise_conv1(x) # (batch, 2*channel, dim) 47 | x = nn.functional.glu(x, dim=1) # (batch, channel, dim) 48 | 49 | # 1D Depthwise Conv 50 | x = self.depthwise_conv(x) 51 | x = self.activation(self.norm(x)) 52 | 53 | x = self.pointwise_conv2(x) 54 | 55 | return x.transpose(1, 2) 56 | -------------------------------------------------------------------------------- /Modules/GeneralLayers/LengthRegulator.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Tomoki Hayashi 2 | # MIT License (https://opensource.org/licenses/MIT) 3 | # Adapted by Florian Lux 2021 4 | 5 | from abc import ABC 6 | 7 | import torch 8 | 9 | from Utility.utils import pad_list 10 | 11 | 12 | class LengthRegulator(torch.nn.Module, ABC): 13 | """ 14 | Length regulator module for feed-forward Transformer. 15 | 16 | This is a module of length regulator described in 17 | `FastSpeech: Fast, Robust and Controllable Text to Speech`_. 18 | The length regulator expands char or 19 | phoneme-level embedding features to frame-level by repeating each 20 | feature based on the corresponding predicted durations. 21 | 22 | .. _`FastSpeech: Fast, Robust and Controllable Text to Speech`: 23 | https://arxiv.org/pdf/1905.09263.pdf 24 | 25 | """ 26 | 27 | def __init__(self, pad_value=0.0): 28 | """ 29 | Initialize length regulator module. 30 | 31 | Args: 32 | pad_value (float, optional): Value used for padding. 33 | """ 34 | super(LengthRegulator, self).__init__() 35 | self.pad_value = pad_value 36 | 37 | def forward(self, xs, ds, alpha=1.0): 38 | """ 39 | Calculate forward propagation. 40 | Args: 41 | xs (Tensor): Batch of sequences of char or phoneme embeddings (B, Tmax, D). 42 | ds (LongTensor): Batch of durations of each frame (B, T). 43 | alpha (float, optional): Alpha value to control speed of speech. 44 | Returns: 45 | Tensor: replicated input tensor based on durations (B, T*, D). 46 | """ 47 | 48 | if alpha != 1.0: 49 | assert alpha > 0 50 | ds = torch.round(ds.float() * alpha).long() 51 | 52 | if ds.sum() == 0: 53 | ds[ds.sum(dim=1).eq(0)] = 1 54 | 55 | return pad_list([self._repeat_one_sequence(x, d) for x, d in zip(xs, ds)], self.pad_value) 56 | 57 | def _repeat_one_sequence(self, x, d): 58 | """ 59 | Repeat each frame according to duration 60 | """ 61 | d = torch.clamp(d, min=0) 62 | return torch.repeat_interleave(x, d, dim=0) 63 | -------------------------------------------------------------------------------- /Modules/ToucanTTS/glow_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | MIT License 3 | 4 | Copyright (c) 2022 Yi Ren 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 
15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | """ 24 | 25 | import torch 26 | 27 | 28 | def squeeze(x, nonpadding=None, n_sqz=2): 29 | b, c, t = x.size() 30 | 31 | t = (t // n_sqz) * n_sqz 32 | x = x[:, :, :t] 33 | x_sqz = x.view(b, c, t // n_sqz, n_sqz) 34 | x_sqz = x_sqz.permute(0, 3, 1, 2).contiguous().view(b, c * n_sqz, t // n_sqz) 35 | 36 | if nonpadding is not None: 37 | nonpadding = nonpadding[:, :, n_sqz - 1::n_sqz] 38 | else: 39 | nonpadding = torch.ones(b, 1, t // n_sqz).to(device=x.device, dtype=x.dtype) 40 | return x_sqz * nonpadding, nonpadding 41 | 42 | 43 | def unsqueeze(x, nonpadding=None, n_sqz=2): 44 | b, c, t = x.size() 45 | 46 | x_unsqz = x.view(b, n_sqz, c // n_sqz, t) 47 | x_unsqz = x_unsqz.permute(0, 2, 3, 1).contiguous().view(b, c // n_sqz, t * n_sqz) 48 | 49 | if nonpadding is not None: 50 | nonpadding = nonpadding.unsqueeze(-1).repeat(1, 1, 1, n_sqz).view(b, 1, t * n_sqz) 51 | else: 52 | nonpadding = torch.ones(b, 1, t * n_sqz).to(device=x.device, dtype=x.dtype) 53 | return x_unsqz * nonpadding, nonpadding 54 | -------------------------------------------------------------------------------- /Preprocessing/Codec/encodec.py: -------------------------------------------------------------------------------- 1 | import math 2 | import random 3 | 4 | import numpy as np 5 | import torch.nn as nn 6 | 7 | from Preprocessing.Codec.seanet import SEANetDecoder 8 | from Preprocessing.Codec.seanet import SEANetEncoder 9 | from Preprocessing.Codec.vq import ResidualVectorQuantizer 10 | 11 | 12 | # Generator 13 | class EnCodec(nn.Module): 14 | def __init__(self, 15 | n_filters, 16 | D, 17 | target_bandwidths=[1, 1.5, 2, 4, 6, 12], 18 | ratios=[8, 5, 4, 2], 19 | sample_rate=16000, 20 | bins=1024, 21 | normalize=False): 22 | super().__init__() 23 | self.hop_length = np.prod(ratios) # product of the strides, i.e. the total downsampling factor 24 | self.encoder = SEANetEncoder(n_filters=n_filters, dimension=D, ratios=ratios) 25 | n_q = int(1000 * target_bandwidths[-1] // (math.ceil(sample_rate / self.hop_length) * 10)) 26 | self.frame_rate = math.ceil(sample_rate / np.prod(ratios)) # 50 Hz for 16 kHz audio with these ratios 27 | self.bits_per_codebook = int(math.log2(bins)) 28 | self.target_bandwidths = target_bandwidths 29 | self.quantizer = ResidualVectorQuantizer(dimension=D, n_q=n_q, bins=bins) 30 | self.decoder = SEANetDecoder(n_filters=n_filters, dimension=D, ratios=ratios) 31 | 32 | def get_last_layer(self): 33 | return self.decoder.layers[-1].weight 34 | 35 | def forward(self, x): 36 | e = self.encoder(x) 37 | max_idx = len(self.target_bandwidths) - 1 38 | bw = self.target_bandwidths[random.randint(0, max_idx)] 39 | quantized, codes, bandwidth, commit_loss = self.quantizer(e, self.frame_rate, bw) 40 | o = self.decoder(quantized) 41 | return o, commit_loss, None 42 | 43 | def encode(self, x, target_bw=None, st=None): 44 | e = self.encoder(x) 45 | if target_bw is None: 46 | bw = self.target_bandwidths[-1] 47 | else: 48 | bw = target_bw 49 | if st is None: 50 | st = 0 51 | codes = self.quantizer.encode(e, self.frame_rate, bw, st) 52 | return codes 53 | 54 | def decode(self, codes): 55 | quantized = 
self.quantizer.decode(codes) 56 | o = self.decoder(quantized) 57 | return o 58 | -------------------------------------------------------------------------------- /Modules/GeneralLayers/ResidualStack.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Tomoki Hayashi 2 | # MIT License (https://opensource.org/licenses/MIT) 3 | # Adapted by Florian Lux 2021 4 | 5 | 6 | import torch 7 | 8 | 9 | class ResidualStack(torch.nn.Module): 10 | 11 | def __init__(self, kernel_size=3, channels=32, dilation=1, bias=True, nonlinear_activation="LeakyReLU", nonlinear_activation_params={"negative_slope": 0.2}, 12 | pad="ReflectionPad1d", pad_params={}, ): 13 | """ 14 | Initialize ResidualStack module. 15 | 16 | Args: 17 | kernel_size (int): Kernel size of dilation convolution layer. 18 | channels (int): Number of channels of convolution layers. 19 | dilation (int): Dilation factor. 20 | bias (bool): Whether to add bias parameter in convolution layers. 21 | nonlinear_activation (str): Activation function module name. 22 | nonlinear_activation_params (dict): Hyperparameters for activation function. 23 | pad (str): Padding function module name before dilated convolution layer. 24 | pad_params (dict): Hyperparameters for padding function. 25 | 26 | """ 27 | super(ResidualStack, self).__init__() 28 | 29 | # define residual stack part 30 | assert (kernel_size - 1) % 2 == 0, "Even number kernel size is not supported." 31 | self.stack = torch.nn.Sequential(getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params), 32 | getattr(torch.nn, pad)((kernel_size - 1) // 2 * dilation, **pad_params), 33 | torch.nn.Conv1d(channels, channels, kernel_size, dilation=dilation, bias=bias), 34 | getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params), 35 | torch.nn.Conv1d(channels, channels, 1, bias=bias), ) 36 | 37 | # define extra layer for skip connection 38 | self.skip_layer = torch.nn.Conv1d(channels, channels, 1, bias=bias) 39 | 40 | def forward(self, c): 41 | """ 42 | Calculate forward propagation. 43 | 44 | Args: 45 | c (Tensor): Input tensor (B, channels, T). 46 | 47 | Returns: 48 | Tensor: Output tensor (B, channels, T). 49 | 50 | """ 51 | return self.stack(c) + self.skip_layer(c) 52 | -------------------------------------------------------------------------------- /Modules/Vocoder/Snake.py: -------------------------------------------------------------------------------- 1 | # Implementation adapted from https://github.com/EdwardDixon/snake under the MIT license. 2 | 3 | import torch 4 | from torch import nn 5 | from torch import pow 6 | from torch import sin 7 | from torch.nn import Parameter 8 | 9 | 10 | class SnakeBeta(nn.Module): 11 | """ 12 | A modified Snake function which uses separate parameters for the magnitude of the periodic components 13 | Shape: 14 | - Input: (B, C, T) 15 | - Output: (B, C, T), same shape as the input 16 | Parameters: 17 | - alpha - trainable parameter that controls frequency 18 | - beta - trainable parameter that controls magnitude 19 | References: 20 | - This activation function is a modified version based on this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda: 21 | https://arxiv.org/abs/2006.08195 22 | """ 23 | 24 | def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False): 25 | """ 26 | Initialization. 
27 | INPUT: 28 | - in_features: shape of the input 29 | - alpha - trainable parameter that controls frequency 30 | - beta - trainable parameter that controls magnitude 31 | alpha is initialized to 1 by default, higher values = higher-frequency. 32 | beta is initialized to 1 by default, higher values = higher-magnitude. 33 | alpha will be trained along with the rest of your model. 34 | """ 35 | super(SnakeBeta, self).__init__() 36 | self.in_features = in_features 37 | 38 | # initialize alpha 39 | self.alpha_logscale = alpha_logscale 40 | if self.alpha_logscale: # log scale alphas initialized to zeros 41 | self.alpha = Parameter(torch.zeros(in_features) * alpha) 42 | self.beta = Parameter(torch.zeros(in_features) * alpha) 43 | else: # linear scale alphas initialized to ones 44 | self.alpha = Parameter(torch.ones(in_features) * alpha) 45 | self.beta = Parameter(torch.ones(in_features) * alpha) 46 | 47 | self.alpha.requires_grad = alpha_trainable 48 | self.beta.requires_grad = alpha_trainable 49 | 50 | self.no_div_by_zero = 0.000000001 51 | 52 | def forward(self, x): 53 | """ 54 | Forward pass of the function. 55 | Applies the function to the input elementwise. 56 | SnakeBeta ∶= x + 1/b * sin^2 (xa) 57 | """ 58 | alpha = self.alpha.unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T] 59 | beta = self.beta.unsqueeze(0).unsqueeze(-1) 60 | if self.alpha_logscale: 61 | alpha = torch.exp(alpha) 62 | beta = torch.exp(beta) 63 | x = x + (1.0 / (beta + self.no_div_by_zero)) * pow(sin(x * alpha), 2) 64 | 65 | return x 66 | -------------------------------------------------------------------------------- /Recipes/ToucanTTS_Nancy.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import torch 4 | import wandb 5 | 6 | from Utility.path_to_transcript_dicts import * 7 | 8 | 9 | def run(gpu_id, resume_checkpoint, finetune, model_dir, resume, use_wandb, wandb_resume_id, gpu_count): 10 | from torch.utils.data import ConcatDataset 11 | 12 | from Modules.ToucanTTS.ToucanTTS import ToucanTTS 13 | from Modules.ToucanTTS.toucantts_train_loop_arbiter import train_loop 14 | from Utility.corpus_preparation import prepare_tts_corpus 15 | from Utility.storage_config import MODEL_DIR 16 | from Utility.storage_config import PREPROCESSING_DIR 17 | 18 | if gpu_id == "cpu": 19 | device = torch.device("cpu") 20 | else: 21 | device = torch.device("cuda") 22 | 23 | print("Preparing") 24 | 25 | if model_dir is not None: 26 | save_dir = model_dir 27 | else: 28 | save_dir = os.path.join(MODEL_DIR, "ToucanTTS_Nancy") 29 | os.makedirs(save_dir, exist_ok=True) 30 | 31 | if gpu_count > 1: 32 | rank = int(os.environ["LOCAL_RANK"]) 33 | torch.cuda.set_device(rank) 34 | torch.distributed.init_process_group(backend="nccl") 35 | else: 36 | rank = 0 37 | 38 | train_set = prepare_tts_corpus(transcript_dict=build_path_to_transcript_nancy(), 39 | corpus_dir=os.path.join(PREPROCESSING_DIR, "Nancy"), 40 | lang="eng", 41 | save_imgs=False, 42 | gpu_count=gpu_count, 43 | rank=rank) 44 | 45 | model = ToucanTTS() 46 | 47 | if gpu_count > 1: 48 | model.to(rank) 49 | model = torch.nn.parallel.DistributedDataParallel( 50 | model, 51 | device_ids=[rank], 52 | output_device=rank, 53 | find_unused_parameters=True, 54 | ) 55 | torch.distributed.barrier() 56 | train_sampler = torch.utils.data.RandomSampler(train_set) 57 | 58 | if use_wandb: 59 | if rank == 0: 60 | wandb.init( 61 | name=f"{__name__.split('.')[-1]}_{time.strftime('%Y%m%d-%H%M%S')}" if wandb_resume_id is None else None, 62 | 
id=wandb_resume_id, # this is None if not specified in the command line arguments. 63 | resume="must" if wandb_resume_id is not None else None) 64 | print("Training model") 65 | train_loop(net=model, 66 | datasets=[train_set], 67 | device=device, 68 | warmup_steps=4000, 69 | steps=200000, 70 | batch_size=16, 71 | save_directory=save_dir, 72 | eval_lang="eng", 73 | path_to_checkpoint=resume_checkpoint, 74 | fine_tune=finetune, 75 | resume=resume, 76 | use_wandb=use_wandb, 77 | train_samplers=[train_sampler], 78 | gpu_count=gpu_count) 79 | if use_wandb: 80 | wandb.finish() 81 | -------------------------------------------------------------------------------- /Recipes/ToucanTTS_Massive_English_stage2.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import torch 4 | import wandb 5 | 6 | from Utility.path_to_transcript_dicts import * 7 | 8 | 9 | def run(gpu_id, resume_checkpoint, finetune, model_dir, resume, use_wandb, wandb_resume_id, gpu_count): 10 | from torch.utils.data import ConcatDataset 11 | 12 | from Modules.ToucanTTS.ToucanTTS import ToucanTTS 13 | from Modules.ToucanTTS.toucantts_train_loop_arbiter import train_loop 14 | from Utility.corpus_preparation import prepare_tts_corpus 15 | from Utility.storage_config import MODEL_DIR 16 | from Utility.storage_config import PREPROCESSING_DIR 17 | 18 | if gpu_id == "cpu": 19 | device = torch.device("cpu") 20 | else: 21 | device = torch.device("cuda") 22 | 23 | print("Preparing") 24 | 25 | if model_dir is not None: 26 | save_dir = model_dir 27 | else: 28 | save_dir = os.path.join(MODEL_DIR, "ToucanTTS_English_v4") 29 | os.makedirs(save_dir, exist_ok=True) 30 | 31 | if gpu_count > 1: 32 | rank = int(os.environ["LOCAL_RANK"]) 33 | torch.cuda.set_device(rank) 34 | torch.distributed.init_process_group(backend="nccl") 35 | else: 36 | rank = 0 37 | 38 | datasets = list() 39 | 40 | datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_libritts_all_clean, 41 | corpus_dir=os.path.join(PREPROCESSING_DIR, "libri_all_clean"), 42 | lang="eng", 43 | gpu_count=gpu_count, 44 | rank=rank)) 45 | 46 | train_set = ConcatDataset(datasets) 47 | 48 | model = ToucanTTS() 49 | 50 | if gpu_count > 1: 51 | model.to(rank) 52 | model = torch.nn.parallel.DistributedDataParallel( 53 | model, 54 | device_ids=[rank], 55 | output_device=rank, 56 | find_unused_parameters=True, 57 | ) 58 | torch.distributed.barrier() 59 | train_sampler = torch.utils.data.RandomSampler(train_set) 60 | 61 | if use_wandb: 62 | if rank == 0: 63 | wandb.init( 64 | name=f"{__name__.split('.')[-1]}_{time.strftime('%Y%m%d-%H%M%S')}" if wandb_resume_id is None else None, 65 | id=wandb_resume_id, # this is None if not specified in the command line arguments. 
66 | resume="must" if wandb_resume_id is not None else None) 67 | print("Training model") 68 | train_loop(net=model, 69 | datasets=[train_set], 70 | device=device, 71 | batch_size=12, 72 | steps_per_checkpoint=1000, 73 | save_directory=save_dir, 74 | eval_lang="eng", 75 | path_to_checkpoint=resume_checkpoint, 76 | fine_tune=finetune, 77 | resume=resume, 78 | use_wandb=use_wandb, 79 | train_samplers=[train_sampler], 80 | gpu_count=gpu_count) 81 | if use_wandb: 82 | wandb.finish() 83 | -------------------------------------------------------------------------------- /Recipes/ToucanTTS_IntegrationTest.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is basically an integration test 3 | """ 4 | 5 | import time 6 | 7 | import torch 8 | import wandb 9 | 10 | from Utility.path_to_transcript_dicts import * 11 | 12 | 13 | def run(gpu_id, resume_checkpoint, finetune, model_dir, resume, use_wandb, wandb_resume_id, gpu_count): 14 | from torch.utils.data import ConcatDataset 15 | 16 | from Modules.ToucanTTS.ToucanTTS import ToucanTTS 17 | from Modules.ToucanTTS.toucantts_train_loop_arbiter import train_loop 18 | from Utility.corpus_preparation import prepare_tts_corpus 19 | from Utility.storage_config import MODEL_DIR 20 | from Utility.storage_config import PREPROCESSING_DIR 21 | 22 | if gpu_id == "cpu": 23 | device = torch.device("cpu") 24 | else: 25 | device = torch.device("cuda") 26 | 27 | print("Preparing") 28 | 29 | if model_dir is not None: 30 | save_dir = model_dir 31 | else: 32 | save_dir = os.path.join(MODEL_DIR, "ToucanTTS_IntegrationTest") 33 | os.makedirs(save_dir, exist_ok=True) 34 | 35 | if gpu_count > 1: 36 | rank = int(os.environ["LOCAL_RANK"]) 37 | torch.cuda.set_device(rank) 38 | torch.distributed.init_process_group(backend="nccl") 39 | else: 40 | rank = 0 41 | 42 | train_set = prepare_tts_corpus(transcript_dict=build_path_to_transcript_integration_test(), 43 | corpus_dir=os.path.join(PREPROCESSING_DIR, "IntegrationTest"), 44 | lang="eng", 45 | save_imgs=True, 46 | gpu_count=gpu_count, 47 | rank=rank) 48 | 49 | model = ToucanTTS() 50 | 51 | if gpu_count > 1: 52 | model.to(rank) 53 | model = torch.nn.parallel.DistributedDataParallel( 54 | model, 55 | device_ids=[rank], 56 | output_device=rank, 57 | find_unused_parameters=True, 58 | ) 59 | torch.distributed.barrier() 60 | train_sampler = torch.utils.data.RandomSampler(train_set) 61 | 62 | if use_wandb: 63 | if rank == 0: 64 | wandb.init( 65 | name=f"{__name__.split('.')[-1]}_{time.strftime('%Y%m%d-%H%M%S')}" if wandb_resume_id is None else None, 66 | id=wandb_resume_id, # this is None if not specified in the command line arguments. 
67 | resume="must" if wandb_resume_id is not None else None) 68 | print("Training model") 69 | train_loop(net=model, 70 | datasets=[train_set], 71 | device=device, 72 | save_directory=save_dir, 73 | batch_size=8, 74 | eval_lang="eng", 75 | warmup_steps=500, 76 | path_to_checkpoint=resume_checkpoint, 77 | fine_tune=finetune, 78 | resume=resume, 79 | steps=5000, 80 | use_wandb=use_wandb, 81 | train_samplers=[train_sampler], 82 | gpu_count=gpu_count) 83 | if use_wandb: 84 | wandb.finish() 85 | -------------------------------------------------------------------------------- /Recipes/BigVGAN_e2e.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import torch 4 | import wandb 5 | 6 | from Utility.path_to_transcript_dicts import * 7 | 8 | 9 | def run(gpu_id, resume_checkpoint, finetune, resume, model_dir, use_wandb, wandb_resume_id, gpu_count): 10 | from Modules.Vocoder.BigVGAN import BigVGAN 11 | from Modules.Vocoder.HiFiGAN_Dataset import HiFiGANDataset 12 | from Modules.Vocoder.HiFiGAN_Discriminators import AvocodoHiFiGANJointDiscriminator 13 | from Modules.Vocoder.HiFiGAN_train_loop import train_loop 14 | from Utility.storage_config import MODEL_DIR 15 | 16 | if gpu_id == "cpu": 17 | device = torch.device("cpu") 18 | else: 19 | device = torch.device("cuda") 20 | 21 | if gpu_count > 1: 22 | print("Multi GPU training not supported for BigVGAN!") 23 | import sys 24 | sys.exit() 25 | 26 | print("Preparing") 27 | if model_dir is not None: 28 | model_save_dir = model_dir 29 | else: 30 | model_save_dir = os.path.join(MODEL_DIR, "BigVGAN_e2e") 31 | os.makedirs(model_save_dir, exist_ok=True) 32 | 33 | # To prepare the data, have a look at Modules/Vocoder/run_end-to-end_data_creation 34 | 35 | print("Collecting new data...") 36 | 37 | file_lists_for_this_run_combined = list() 38 | file_lists_for_this_run_combined_synthetic = list() 39 | 40 | fl = list(build_path_to_transcript_libritts_all_clean().keys()) 41 | fisher_yates_shuffle(fl) 42 | fisher_yates_shuffle(fl) 43 | for i, f in enumerate(fl): 44 | if os.path.exists(f.replace(".wav", "_synthetic_spec.pt")): 45 | file_lists_for_this_run_combined.append(f) 46 | file_lists_for_this_run_combined_synthetic.append(f.replace(".wav", "_synthetic_spec.pt")) 47 | print("filepaths collected") 48 | 49 | train_set = HiFiGANDataset(list_of_original_paths=file_lists_for_this_run_combined, 50 | list_of_synthetic_paths=file_lists_for_this_run_combined_synthetic) 51 | 52 | generator = BigVGAN() 53 | discriminator = AvocodoHiFiGANJointDiscriminator() 54 | 55 | print("Training model") 56 | if use_wandb: 57 | wandb.init( 58 | name=f"{__name__.split('.')[-1]}_{time.strftime('%Y%m%d-%H%M%S')}" if wandb_resume_id is None else None, 59 | id=wandb_resume_id, # this is None if not specified in the command line arguments. 
60 | resume="must" if wandb_resume_id is not None else None) 61 | train_loop(batch_size=16, 62 | epochs=5180000, 63 | generator=generator, 64 | discriminator=discriminator, 65 | train_dataset=train_set, 66 | device=device, 67 | epochs_per_save=1, 68 | model_save_dir=model_save_dir, 69 | path_to_checkpoint=resume_checkpoint, 70 | resume=resume, 71 | use_wandb=use_wandb, 72 | finetune=finetune) 73 | if use_wandb: 74 | wandb.finish() 75 | 76 | 77 | def fisher_yates_shuffle(lst): 78 | for i in range(len(lst) - 1, 0, -1): 79 | j = random.randint(0, i) 80 | lst[i], lst[j] = lst[j], lst[i] 81 | -------------------------------------------------------------------------------- /Modules/Vocoder/AMP.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 NVIDIA CORPORATION. 2 | # Licensed under the MIT license. 3 | 4 | # Adapted from https://github.com/jik876/hifi-gan under the MIT license. 5 | # LICENSE is in incl_licenses directory. 6 | 7 | 8 | from alias_free_torch import * 9 | from alias_free_torch import Activation1d 10 | from torch.nn import Conv1d 11 | from torch.nn.utils import remove_weight_norm 12 | from torch.nn.utils import weight_norm 13 | 14 | from Modules.Vocoder.Snake import SnakeBeta 15 | 16 | LRELU_SLOPE = 0.1 17 | 18 | 19 | class AMPBlock1(torch.nn.Module): 20 | def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): 21 | super(AMPBlock1, self).__init__() 22 | 23 | self.convs1 = nn.ModuleList([ 24 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], 25 | padding=get_padding(kernel_size, dilation[0]))), 26 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], 27 | padding=get_padding(kernel_size, dilation[1]))), 28 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], 29 | padding=get_padding(kernel_size, dilation[2]))) 30 | ]) 31 | self.convs1.apply(init_weights) 32 | 33 | self.convs2 = nn.ModuleList([ 34 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 35 | padding=get_padding(kernel_size, 1))), 36 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 37 | padding=get_padding(kernel_size, 1))), 38 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 39 | padding=get_padding(kernel_size, 1))) 40 | ]) 41 | self.convs2.apply(init_weights) 42 | 43 | self.num_layers = len(self.convs1) + len(self.convs2) # total number of conv layers 44 | 45 | self.activations = nn.ModuleList([ 46 | Activation1d( 47 | activation=SnakeBeta(channels, alpha_logscale=True)) 48 | for _ in range(self.num_layers) 49 | ]) 50 | 51 | def forward(self, x): 52 | acts1, acts2 = self.activations[::2], self.activations[1::2] 53 | for c1, c2, a1, a2 in zip(self.convs1, self.convs2, acts1, acts2): 54 | xt = a1(x) 55 | xt = c1(xt) 56 | xt = a2(xt) 57 | xt = c2(xt) 58 | x = xt + x 59 | 60 | return x 61 | 62 | def remove_weight_norm(self): 63 | for l in self.convs1: 64 | remove_weight_norm(l) 65 | for l in self.convs2: 66 | remove_weight_norm(l) 67 | 68 | 69 | def init_weights(m, mean=0.0, std=0.01): 70 | classname = m.__class__.__name__ 71 | if classname.find("Conv") != -1: 72 | m.weight.data.normal_(mean, std) 73 | 74 | 75 | def apply_weight_norm(m): 76 | classname = m.__class__.__name__ 77 | if classname.find("Conv") != -1: 78 | weight_norm(m) 79 | 80 | 81 | def get_padding(kernel_size, dilation=1): 82 | return int((kernel_size * dilation - dilation) / 2) 83 | 
-------------------------------------------------------------------------------- /Recipes/HiFiGAN_e2e.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import torch 4 | import wandb 5 | 6 | from Utility.path_to_transcript_dicts import * 7 | 8 | 9 | def run(gpu_id, resume_checkpoint, finetune, resume, model_dir, use_wandb, wandb_resume_id, gpu_count): 10 | from Modules.Vocoder.HiFiGAN_Discriminators import AvocodoHiFiGANJointDiscriminator 11 | from Modules.Vocoder.HiFiGAN_E2E_Dataset import HiFiGANDataset 12 | from Modules.Vocoder.HiFiGAN_Generator import HiFiGAN 13 | from Modules.Vocoder.HiFiGAN_train_loop import train_loop 14 | from Utility.storage_config import MODEL_DIR 15 | 16 | if gpu_id == "cpu": 17 | device = torch.device("cpu") 18 | else: 19 | device = torch.device("cuda") 20 | 21 | if gpu_count > 1: 22 | print("Multi GPU training not supported for HiFiGAN!") 23 | import sys 24 | sys.exit() 25 | 26 | print("Preparing") 27 | if model_dir is not None: 28 | model_save_dir = model_dir 29 | else: 30 | model_save_dir = os.path.join(MODEL_DIR, "HiFiGAN_e2e_scratch_direct_cont") 31 | os.makedirs(model_save_dir, exist_ok=True) 32 | 33 | # To prepare the data, have a look at Modules/Vocoder/run_end-to-end_data_creation 34 | 35 | print("Collecting new data...") 36 | 37 | file_lists_for_this_run_combined = list() 38 | file_lists_for_this_run_combined_synthetic = list() 39 | 40 | fl = list(build_path_to_transcript_libritts_all_clean().keys()) 41 | fisher_yates_shuffle(fl) 42 | fisher_yates_shuffle(fl) 43 | for i, f in enumerate(fl): 44 | if os.path.exists(f.replace(".wav", "_synthetic_spec.pt")): 45 | file_lists_for_this_run_combined.append(f) 46 | file_lists_for_this_run_combined_synthetic.append(f.replace(".wav", "_synthetic_spec.pt")) 47 | print("filepaths collected") 48 | 49 | train_set = HiFiGANDataset(list_of_original_paths=file_lists_for_this_run_combined, 50 | list_of_synthetic_paths=file_lists_for_this_run_combined_synthetic) 51 | 52 | generator = HiFiGAN() 53 | discriminator = AvocodoHiFiGANJointDiscriminator() 54 | 55 | print("Training model") 56 | if use_wandb: 57 | wandb.init( 58 | name=f"{__name__.split('.')[-1]}_{time.strftime('%Y%m%d-%H%M%S')}" if wandb_resume_id is None else None, 59 | id=wandb_resume_id, # this is None if not specified in the command line arguments. 
60 | resume="must" if wandb_resume_id is not None else None) 61 | train_loop(batch_size=24, 62 | epochs=5180000, 63 | generator=generator, 64 | discriminator=discriminator, 65 | train_dataset=train_set, 66 | device=device, 67 | epochs_per_save=1, 68 | model_save_dir=model_save_dir, 69 | path_to_checkpoint=resume_checkpoint, 70 | resume=resume, 71 | use_wandb=use_wandb, 72 | finetune=finetune) 73 | if use_wandb: 74 | wandb.finish() 75 | 76 | 77 | def fisher_yates_shuffle(lst): 78 | for i in range(len(lst) - 1, 0, -1): 79 | j = random.randint(0, i) 80 | lst[i], lst[j] = lst[j], lst[i] 81 | -------------------------------------------------------------------------------- /Utility/weight_averaging.py: -------------------------------------------------------------------------------- 1 | """ 2 | https://alexander-stasiuk.medium.com/pytorch-weights-averaging-e2c0fa611a0c 3 | """ 4 | 5 | import os 6 | 7 | import torch 8 | 9 | from Modules.ToucanTTS.InferenceToucanTTS import ToucanTTS 10 | from Modules.Vocoder.HiFiGAN_Generator import HiFiGAN 11 | 12 | 13 | def load_net_toucan(path): 14 | check_dict = torch.load(path, map_location=torch.device("cpu")) 15 | net = ToucanTTS(weights=check_dict["model"], config=check_dict["config"]) 16 | return net, check_dict["default_emb"] 17 | 18 | 19 | def load_net_bigvgan(path): 20 | check_dict = torch.load(path, map_location=torch.device("cpu")) 21 | net = HiFiGAN(weights=check_dict["generator"]) 22 | return net, None 23 | 24 | 25 | def get_n_recent_checkpoints_paths(checkpoint_dir, n=5): 26 | print("selecting checkpoints...") 27 | checkpoint_list = list() 28 | for el in os.listdir(checkpoint_dir): 29 | if el.endswith(".pt") and el.startswith("checkpoint_"): 30 | try: 31 | checkpoint_list.append(int(el.split(".")[0].split("_")[1])) 32 | except RuntimeError: 33 | pass 34 | if len(checkpoint_list) == 0: 35 | return None 36 | elif len(checkpoint_list) < n: 37 | n = len(checkpoint_list) 38 | checkpoint_list.sort(reverse=True) 39 | return [os.path.join(checkpoint_dir, "checkpoint_{}.pt".format(step)) for step in checkpoint_list[:n]] 40 | 41 | 42 | def average_checkpoints(list_of_checkpoint_paths, load_func): 43 | # COLLECT CHECKPOINTS 44 | if list_of_checkpoint_paths is None or len(list_of_checkpoint_paths) == 0: 45 | return None 46 | checkpoints_weights = {} 47 | model = None 48 | default_embed = None 49 | 50 | # LOAD CHECKPOINTS 51 | for path_to_checkpoint in list_of_checkpoint_paths: 52 | print("loading model {}".format(path_to_checkpoint)) 53 | model, default_embed = load_func(path=path_to_checkpoint) 54 | checkpoints_weights[path_to_checkpoint] = dict(model.named_parameters()) 55 | 56 | # AVERAGE CHECKPOINTS 57 | params = model.named_parameters() 58 | dict_params = dict(params) 59 | checkpoint_amount = len(checkpoints_weights) 60 | print("averaging...") 61 | for name in dict_params.keys(): 62 | custom_params = None 63 | for _, checkpoint_parameters in checkpoints_weights.items(): 64 | if custom_params is None: 65 | custom_params = checkpoint_parameters[name].data 66 | else: 67 | custom_params += checkpoint_parameters[name].data 68 | dict_params[name].data.copy_(custom_params / checkpoint_amount) 69 | model_dict = model.state_dict() 70 | model_dict.update(dict_params) 71 | model.load_state_dict(model_dict) 72 | model.eval() 73 | return model, default_embed 74 | 75 | 76 | def save_model_for_use(model, name="", default_embed=None, dict_name="model"): 77 | print("saving model...") 78 | torch.save({dict_name: model.state_dict(), "default_emb": default_embed, "config": 
model.config}, name) 79 | print("...done!") 80 | 81 | 82 | def count_parameters(net): 83 | return sum(p.numel() for p in net.parameters() if p.requires_grad) 84 | -------------------------------------------------------------------------------- /Modules/ToucanTTS/ToucanTTSLoss.py: -------------------------------------------------------------------------------- 1 | """ 2 | Taken from ESPNet 3 | Adapted by Flux 4 | """ 5 | 6 | import torch 7 | 8 | from Modules.GeneralLayers.DurationPredictor import DurationPredictorLoss 9 | from Utility.utils import make_non_pad_mask 10 | 11 | 12 | class ToucanTTSLoss(torch.nn.Module): 13 | 14 | def __init__(self): 15 | super().__init__() 16 | self.l1_criterion = torch.nn.L1Loss(reduction="none") 17 | self.l2_criterion = torch.nn.MSELoss(reduction="none") 18 | self.duration_criterion = DurationPredictorLoss(reduction="none") 19 | 20 | def forward(self, predicted_features, gold_features, features_lengths, text_lengths, gold_durations, predicted_durations, predicted_pitch, predicted_energy, gold_pitch, gold_energy): 21 | """ 22 | Args: 23 | predicted_features (Tensor): Batch of outputs before postnets (B, Lmax, odim). 24 | gold_features (Tensor): Batch of target features (B, Lmax, odim). 25 | features_lengths (LongTensor): Batch of the lengths of each target (B,). 26 | gold_durations (LongTensor): Batch of durations (B, Tmax). 27 | gold_pitch (LongTensor): Batch of pitch (B, Tmax). 28 | gold_energy (LongTensor): Batch of energy (B, Tmax). 29 | predicted_durations (LongTensor): Batch of outputs of duration predictor (B, Tmax). 30 | predicted_pitch (LongTensor): Batch of outputs of pitch predictor (B, Tmax). 31 | predicted_energy (LongTensor): Batch of outputs of energy predictor (B, Tmax). 32 | text_lengths (LongTensor): Batch of the lengths of each input (B,). 33 | 34 | Returns: 35 | Tensor: L1 loss value. 
36 | Tensor: Duration loss value 37 | """ 38 | 39 | # calculate losses 40 | distance_loss = self.l1_criterion(predicted_features, gold_features) 41 | duration_loss = self.duration_criterion(predicted_durations, gold_durations) 42 | pitch_loss = self.l2_criterion(predicted_pitch, gold_pitch) 43 | energy_loss = self.l2_criterion(predicted_energy, gold_energy) 44 | 45 | # make weighted masks to ensure that long samples and short samples are all equally important 46 | out_masks = make_non_pad_mask(features_lengths).unsqueeze(-1).to(gold_features.device) 47 | out_weights = out_masks.float() / out_masks.sum(dim=1, keepdim=True).float() 48 | out_weights /= gold_features.size(0) * gold_features.size(-1) 49 | duration_masks = make_non_pad_mask(text_lengths).to(gold_features.device) 50 | duration_weights = (duration_masks.float() / duration_masks.sum(dim=1, keepdim=True).float()) 51 | variance_masks = duration_masks.unsqueeze(-1) 52 | variance_weights = duration_weights.unsqueeze(-1) 53 | 54 | # apply weighted masks 55 | distance_loss = distance_loss.mul(out_weights).masked_select(out_masks).sum() 56 | duration_loss = (duration_loss.mul(duration_weights).masked_select(duration_masks).sum()) 57 | pitch_loss = pitch_loss.mul(variance_weights).masked_select(variance_masks).sum() 58 | energy_loss = (energy_loss.mul(variance_weights).masked_select(variance_masks).sum()) 59 | 60 | return distance_loss, duration_loss, pitch_loss, energy_loss 61 | -------------------------------------------------------------------------------- /Modules/GeneralLayers/MultiLayeredConv1d.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Tomoki Hayashi 2 | # MIT License (https://opensource.org/licenses/MIT) 3 | # Adapted by Florian Lux 2021 4 | 5 | """ 6 | Layer modules for FFT block in FastSpeech (Feed-forward Transformer). 7 | """ 8 | 9 | import torch 10 | 11 | 12 | class MultiLayeredConv1d(torch.nn.Module): 13 | """ 14 | Multi-layered conv1d for Transformer block. 15 | 16 | This is a module of multi-layered conv1d designed 17 | to replace positionwise feed-forward network 18 | in Transformer block, which is introduced in 19 | `FastSpeech: Fast, Robust and Controllable Text to Speech`_. 20 | 21 | .. _`FastSpeech: Fast, Robust and Controllable Text to Speech`: 22 | https://arxiv.org/pdf/1905.09263.pdf 23 | """ 24 | 25 | def __init__(self, in_chans, hidden_chans, kernel_size, dropout_rate): 26 | """ 27 | Initialize MultiLayeredConv1d module. 28 | 29 | Args: 30 | in_chans (int): Number of input channels. 31 | hidden_chans (int): Number of hidden channels. 32 | kernel_size (int): Kernel size of conv1d. 33 | dropout_rate (float): Dropout rate. 34 | """ 35 | super(MultiLayeredConv1d, self).__init__() 36 | self.w_1 = torch.nn.Conv1d(in_chans, hidden_chans, kernel_size, stride=1, padding=(kernel_size - 1) // 2, ) 37 | self.w_2 = torch.nn.Conv1d(hidden_chans, in_chans, kernel_size, stride=1, padding=(kernel_size - 1) // 2, ) 38 | self.dropout = torch.nn.Dropout(dropout_rate) 39 | 40 | def forward(self, x): 41 | """ 42 | Calculate forward propagation. 43 | 44 | Args: 45 | x (torch.Tensor): Batch of input tensors (B, T, in_chans). 46 | 47 | Returns: 48 | torch.Tensor: Batch of output tensors (B, T, hidden_chans). 49 | """ 50 | x = torch.relu(self.w_1(x.transpose(-1, 1))).transpose(-1, 1) 51 | return self.w_2(self.dropout(x).transpose(-1, 1)).transpose(-1, 1) 52 | 53 | 54 | class Conv1dLinear(torch.nn.Module): 55 | """ 56 | Conv1D + Linear for Transformer block. 
57 | 58 | A variant of MultiLayeredConv1d, which replaces second conv-layer to linear. 59 | """ 60 | 61 | def __init__(self, in_chans, hidden_chans, kernel_size, dropout_rate): 62 | """ 63 | Initialize Conv1dLinear module. 64 | 65 | Args: 66 | in_chans (int): Number of input channels. 67 | hidden_chans (int): Number of hidden channels. 68 | kernel_size (int): Kernel size of conv1d. 69 | dropout_rate (float): Dropout rate. 70 | """ 71 | super(Conv1dLinear, self).__init__() 72 | self.w_1 = torch.nn.Conv1d(in_chans, hidden_chans, kernel_size, stride=1, padding=(kernel_size - 1) // 2, ) 73 | self.w_2 = torch.nn.Linear(hidden_chans, in_chans) 74 | self.dropout = torch.nn.Dropout(dropout_rate) 75 | 76 | def forward(self, x): 77 | """ 78 | Calculate forward propagation. 79 | 80 | Args: 81 | x (torch.Tensor): Batch of input tensors (B, T, in_chans). 82 | 83 | Returns: 84 | torch.Tensor: Batch of output tensors (B, T, hidden_chans). 85 | """ 86 | x = torch.relu(self.w_1(x.transpose(-1, 1))).transpose(-1, 1) 87 | return self.w_2(self.dropout(x)) 88 | -------------------------------------------------------------------------------- /Recipes/finetuning_example_simple.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example script for fine-tuning the pretrained model to your own data. 3 | 4 | Comments in ALL CAPS are instructions 5 | """ 6 | 7 | import time 8 | 9 | import torch 10 | import wandb 11 | 12 | from Utility.path_to_transcript_dicts import * 13 | 14 | 15 | def run(gpu_id, resume_checkpoint, finetune, model_dir, resume, use_wandb, wandb_resume_id, gpu_count): 16 | from huggingface_hub import hf_hub_download 17 | from torch.utils.data import ConcatDataset 18 | 19 | from Modules.ToucanTTS.ToucanTTS import ToucanTTS 20 | from Modules.ToucanTTS.toucantts_train_loop_arbiter import train_loop 21 | from Utility.corpus_preparation import prepare_tts_corpus 22 | from Utility.storage_config import MODEL_DIR 23 | from Utility.storage_config import PREPROCESSING_DIR 24 | 25 | if gpu_id == "cpu": 26 | device = torch.device("cpu") 27 | else: 28 | device = torch.device("cuda") 29 | assert gpu_count == 1 # distributed finetuning is not supported 30 | 31 | # IF YOU'RE ADDING A NEW LANGUAGE, YOU MIGHT NEED TO ADD HANDLING FOR IT IN Preprocessing/TextFrontend.py 32 | 33 | print("Preparing") 34 | 35 | if model_dir is not None: 36 | save_dir = model_dir 37 | else: 38 | save_dir = os.path.join(MODEL_DIR, "ToucanTTS_FinetuningExample") # RENAME TO SOMETHING MEANINGFUL FOR YOUR DATA 39 | os.makedirs(save_dir, exist_ok=True) 40 | 41 | train_data = prepare_tts_corpus(transcript_dict=build_path_to_transcript_integration_test(), 42 | corpus_dir=os.path.join(PREPROCESSING_DIR, "integration_test"), 43 | lang="eng") # CHANGE THE TRANSCRIPT DICT, THE NAME OF THE CACHE DIRECTORY AND THE LANGUAGE TO YOUR NEEDS 44 | 45 | model = ToucanTTS() 46 | 47 | if use_wandb: 48 | wandb.init( 49 | name=f"{__name__.split('.')[-1]}_{time.strftime('%Y%m%d-%H%M%S')}" if wandb_resume_id is None else None, 50 | id=wandb_resume_id, # this is None if not specified in the command line arguments. 51 | resume="must" if wandb_resume_id is not None else None) 52 | 53 | print("Training model") 54 | train_loop(net=model, 55 | datasets=[train_data], 56 | device=device, 57 | save_directory=save_dir, 58 | batch_size=12, # YOU MIGHT GET OUT OF MEMORY ISSUES ON SMALL GPUs, IF SO, DECREASE THIS. 
59 | eval_lang="eng", # THE LANGUAGE YOUR PROGRESS PLOTS WILL BE MADE IN 60 | warmup_steps=500, 61 | lr=1e-5, # if you have enough data (over ~1000 datapoints) you can increase this up to 1e-4 and it will still be stable, but learn quicker. 62 | # DOWNLOAD THESE INITIALIZATION MODELS FROM THE RELEASE PAGE OF THE GITHUB OR RUN THE DOWNLOADER SCRIPT TO GET THEM AUTOMATICALLY 63 | path_to_checkpoint=hf_hub_download(cache_dir=MODEL_DIR, repo_id="Flux9665/ToucanTTS", filename="ToucanTTS.pt") if resume_checkpoint is None else resume_checkpoint, 64 | fine_tune=True if resume_checkpoint is None and not resume else finetune, 65 | resume=resume, 66 | steps=5000, 67 | use_wandb=use_wandb, 68 | train_samplers=[torch.utils.data.RandomSampler(train_data)], 69 | gpu_count=1) 70 | if use_wandb: 71 | wandb.finish() 72 | -------------------------------------------------------------------------------- /Modules/EmbeddingModel/StyleEmbedding.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from Modules.EmbeddingModel.GST import GSTStyleEncoder 4 | from Modules.EmbeddingModel.StyleTTSEncoder import StyleEncoder as StyleTTSEncoder 5 | 6 | 7 | class StyleEmbedding(torch.nn.Module): 8 | """ 9 | The style embedding should provide information of the speaker and their speaking style 10 | 11 | The feedback signal for the module will come from the TTS objective, so it doesn't have a dedicated train loop. 12 | The train loop does however supply supervision in the form of a barlow twins objective. 13 | 14 | See the git history for some other approaches for style embedding, like the SWIN transformer 15 | and a simple LSTM baseline. GST turned out to be the best. 16 | """ 17 | 18 | def __init__(self, embedding_dim=16, style_tts_encoder=False): 19 | super().__init__() 20 | self.embedding_dim = embedding_dim 21 | self.use_gst = not style_tts_encoder 22 | if style_tts_encoder: 23 | self.style_encoder = StyleTTSEncoder(style_dim=embedding_dim) 24 | else: 25 | self.style_encoder = GSTStyleEncoder(gst_token_dim=embedding_dim) 26 | 27 | def forward(self, 28 | batch_of_feature_sequences, 29 | batch_of_feature_sequence_lengths): 30 | """ 31 | Args: 32 | batch_of_feature_sequences: b is the batch axis, 128 features per timestep 33 | and l time-steps, which may include padding 34 | for most elements in the batch (b, l, 128) 35 | batch_of_feature_sequence_lengths: indicate for every element in the batch, 36 | what the true length is, since they are 37 | all padded to the length of the longest 38 | element in the batch (b, 1) 39 | Returns: 40 | batch of n dimensional embeddings (b,n) 41 | """ 42 | 43 | minimum_sequence_length = 512 44 | specs = list() 45 | for index, spec_length in enumerate(batch_of_feature_sequence_lengths): 46 | spec = batch_of_feature_sequences[index][:spec_length] 47 | # double the length at least once, then check 48 | spec = spec.repeat((2, 1)) 49 | current_spec_length = len(spec) 50 | while current_spec_length < minimum_sequence_length: 51 | # make it longer 52 | spec = spec.repeat((2, 1)) 53 | current_spec_length = len(spec) 54 | specs.append(spec[:minimum_sequence_length]) 55 | 56 | spec_batch = torch.stack(specs, dim=0) 57 | return self.style_encoder(speech=spec_batch) 58 | 59 | 60 | if __name__ == '__main__': 61 | style_emb = StyleEmbedding(style_tts_encoder=False) 62 | print(f"GST parameter count: {sum(p.numel() for p in style_emb.style_encoder.parameters() if p.requires_grad)}") 63 | 64 | seq_length = 398 65 | print(style_emb(torch.randn(5, 
seq_length, 512), 66 | torch.tensor([seq_length, seq_length, seq_length, seq_length, seq_length])).shape) 67 | 68 | style_emb = StyleEmbedding(style_tts_encoder=True) 69 | print(f"StyleTTS encoder parameter count: {sum(p.numel() for p in style_emb.style_encoder.parameters() if p.requires_grad)}") 70 | 71 | seq_length = 398 72 | print(style_emb(torch.randn(5, seq_length, 512), 73 | torch.tensor([seq_length, seq_length, seq_length, seq_length, seq_length])).shape) 74 | -------------------------------------------------------------------------------- /Utility/WarmupScheduler.py: -------------------------------------------------------------------------------- 1 | from torch.optim.lr_scheduler import _LRScheduler 2 | 3 | 4 | # This is rather suboptimal, because we need to import a protected class. Unfortunately, I don't see another way. 5 | 6 | 7 | class ToucanWarmupScheduler(_LRScheduler): 8 | """ 9 | A warmup scheduler that should be called after every batch. 10 | """ 11 | 12 | def __init__(self, optimizer, peak_lr=0.0002, warmup_steps=20000, max_steps=200000, last_epoch=-1): 13 | self.warmup_steps = warmup_steps 14 | self.peak_lr = peak_lr 15 | self.max_steps = max_steps 16 | self.plateau = self.warmup_steps * 4 17 | self.last_lr = 0.0 18 | # __init__() must be invoked before setting field 19 | # because step() is also invoked in __init__() 20 | super().__init__(optimizer, last_epoch) 21 | 22 | def __repr__(self): 23 | return f"{self.__class__.__name__}(warmup_steps={self.warmup_steps})" 24 | 25 | def get_lr(self): 26 | step_num = self.last_epoch + 1 27 | if step_num <= self.warmup_steps: 28 | lr = self.peak_lr * min(step_num / self.warmup_steps, 1.0) 29 | self.last_lr = lr 30 | return [lr for _ in self.base_lrs] 31 | elif step_num < self.warmup_steps + self.plateau: 32 | self.last_lr = self.peak_lr 33 | return [self.peak_lr for _ in self.base_lrs] 34 | else: 35 | scale = 1 - (((step_num - (self.warmup_steps + self.plateau)) / self.max_steps) / (self.max_steps / 10)) 36 | self.last_lr = max(self.last_lr * scale, 1e-7) 37 | return [self.last_lr for _ in self.base_lrs] 38 | 39 | 40 | class WarmupScheduler(_LRScheduler): 41 | """ 42 | The WarmupLR scheduler 43 | This scheduler is almost same as NoamLR Scheduler except for following difference: 44 | NoamLR: 45 | lr = optimizer.lr * model_size ** -0.5 46 | * min(step ** -0.5, step * warmup_step ** -1.5) 47 | WarmupLR: 48 | lr = optimizer.lr * warmup_step ** 0.5 49 | * min(step ** -0.5, step * warmup_step ** -1.5) 50 | Note that the maximum lr equals to optimizer.lr in this scheduler. 
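A worked example (the base lr is illustrative; warmup_steps=25000 is the default of this class):
with optimizer.lr = 1e-3, the lr ramps up linearly as 1e-3 * step / 25000 until step 25000,
where it reaches exactly 1e-3, and afterwards decays as 1e-3 * (25000 / step) ** 0.5.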
51 | 52 | Taken from ESPnet 53 | """ 54 | 55 | def __init__(self, optimizer, warmup_steps=25000, last_epoch=-1): 56 | self.warmup_steps = warmup_steps 57 | # __init__() must be invoked before setting field 58 | # because step() is also invoked in __init__() 59 | super().__init__(optimizer, last_epoch) 60 | 61 | def __repr__(self): 62 | return f"{self.__class__.__name__}(warmup_steps={self.warmup_steps})" 63 | 64 | def get_lr(self): 65 | step_num = self.last_epoch + 1 66 | return [lr * self.warmup_steps ** 0.5 * min(step_num ** -0.5, step_num * self.warmup_steps ** -1.5) for lr in 67 | self.base_lrs] 68 | 69 | 70 | if __name__ == '__main__': 71 | lrs = list() 72 | warmup_steps = 30000 73 | peak_lr = 0.0005 74 | max_steps = 800000 75 | plateau_size = warmup_steps * 5 76 | for step_num in range(max_steps): 77 | if step_num <= warmup_steps: 78 | lr = peak_lr * min(step_num / warmup_steps, 1.0) 79 | lrs.append(lr) 80 | elif step_num < warmup_steps + plateau_size: 81 | lrs.append(peak_lr) 82 | else: 83 | scale = 1 - (((step_num - (warmup_steps + plateau_size)) / max_steps) / (max_steps / 10)) 84 | lrs.append(max(lrs[-1] * scale, 1e-7)) 85 | import matplotlib.pyplot as plt 86 | 87 | plt.plot(lrs) 88 | plt.show() 89 | -------------------------------------------------------------------------------- /Modules/ControllabilityGAN/dataset/speaker_embeddings_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import torch 5 | 6 | 7 | class SpeakerEmbeddingsDataset(torch.utils.data.Dataset): 8 | 9 | def __init__(self, feature_path, device, mode='utterance'): 10 | super(SpeakerEmbeddingsDataset, self).__init__() 11 | 12 | modes = ['utterance', 'speaker'] 13 | assert mode in modes, f'mode: {mode} is not supported' 14 | if mode == 'utterance': 15 | self.mode = 'utt' 16 | elif mode == 'speaker': 17 | self.mode = 'spk' 18 | 19 | self.device = device 20 | 21 | self.x, self.speakers = self._load_features(feature_path) 22 | # unique_speakers = set(self.speakers) 23 | # spk2class = dict(zip(unique_speakers, range(len(unique_speakers)))) 24 | # #self.x = self._reformat_features(self.x) 25 | # self.y = torch.tensor([spk2class[spk] for spk in self.speakers]).to(self.device) 26 | # self.class2spk = dict(zip(spk2class.values(), spk2class.keys())) 27 | 28 | def __len__(self): 29 | return len(self.speakers) 30 | 31 | def __getitem__(self, index): 32 | embedding = self.normalize_embedding(self.x[index]) 33 | # speaker_id = self.y[index] 34 | return embedding, torch.zeros([0]) 35 | 36 | def normalize_embedding(self, vector): 37 | return torch.sub(vector, self.mean) / self.std 38 | 39 | def get_speaker(self, label): 40 | return self.class2spk[label] 41 | 42 | def get_embedding_dim(self): 43 | return self.x.shape[-1] 44 | 45 | def get_num_speaker(self): 46 | return len(torch.unique((self.y))) 47 | 48 | def set_labels(self, labels): 49 | self.y_old = self.y 50 | self.y = torch.full(size=(len(self),), fill_value=labels).to(self.device) 51 | # if isinstance(labels, int) or isinstance(labels, float): 52 | # self.y = torch.full(size=len(self), fill_value=labels) 53 | # elif len(labels) == len(self): 54 | # self.y = torch.tensor(labels) 55 | 56 | def _load_features(self, feature_path): 57 | if os.path.isfile(feature_path): 58 | vectors = torch.load(feature_path, map_location=self.device) 59 | if isinstance(vectors, list): 60 | vectors = torch.stack(vectors) 61 | 62 | self.mean = torch.mean(vectors) 63 | self.std = torch.std(vectors) 64 | return 
vectors, torch.zeros(vectors.size(0)) 65 | else: 66 | vectors = torch.load(feature_path, map_location=self.device) 67 | 68 | self.mean = torch.mean(vectors) 69 | self.std = torch.std(vectors) 70 | 71 | spk2idx = {} 72 | with open(feature_path / f'{self.mode}2idx', 'r') as f: 73 | for line in f: 74 | split_line = line.strip().split() 75 | if len(split_line) == 2: 76 | spk2idx[split_line[0].strip()] = int(split_line[1]) 77 | 78 | speakers, indices = zip(*spk2idx.items()) 79 | 80 | if (feature_path / 'utt2spk').exists(): # spk2idx contains utt_ids not speaker_ids 81 | utt2spk = {} 82 | with open(feature_path / 'utt2spk', 'r') as f: 83 | for line in f: 84 | split_line = line.strip().split() 85 | if len(split_line) == 2: 86 | utt2spk[split_line[0].strip()] = split_line[1].strip() 87 | 88 | speakers = [utt2spk[utt] for utt in speakers] 89 | 90 | return vectors[np.array(indices)], speakers 91 | 92 | def _reformat_features(self, features): 93 | if len(features.shape) == 2: 94 | return features.reshape(features.shape[0], 1, 1, features.shape[1]) 95 | -------------------------------------------------------------------------------- /Modules/ControllabilityGAN/GAN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from Modules.ControllabilityGAN.wgan.init_wgan import create_wgan 4 | 5 | 6 | class GanWrapper: 7 | 8 | def __init__(self, path_wgan, device, num_cached_voices=10): 9 | self.device = device 10 | self.path_wgan = path_wgan 11 | 12 | self.mean = None 13 | self.std = None 14 | self.wgan = None 15 | self.normalize = True 16 | 17 | torch.manual_seed(160923) 18 | 19 | self.load_model(path_wgan) 20 | 21 | self.U = self.compute_controllability() 22 | 23 | self.z_list = list() 24 | 25 | while len(self.z_list) < num_cached_voices + 2: 26 | z = self.wgan.G.sample_latent(1, self.wgan.G.z_dim, temperature=0.4) 27 | l1_distances = [100.0] 28 | for other_z in self.z_list: 29 | l1_distances.append(torch.nn.functional.l1_loss(z, other_z)) 30 | print("dist: ", min(l1_distances), len(self.z_list)) 31 | if min(l1_distances) > 0.5: 32 | self.z_list.append(z) 33 | self.z = self.z_list[0] 34 | 35 | def set_latent(self, seed): 36 | self.z = self.z_list[seed] 37 | 38 | def load_model(self, path): 39 | gan_checkpoint = torch.load(path, map_location="cpu") 40 | 41 | self.wgan = create_wgan(parameters=gan_checkpoint['model_parameters'], device=self.device) 42 | # Create a new state dict without 'module.' prefix 43 | new_state_dict_G = {} 44 | for key, value in gan_checkpoint['generator_state_dict'].items(): 45 | # Remove 'module.' prefix 46 | new_key = key.replace('module.', '') 47 | new_state_dict_G[new_key] = value 48 | 49 | new_state_dict_D = {} 50 | for key, value in gan_checkpoint['critic_state_dict'].items(): 51 | # Remove 'module.' 
prefix 52 | new_key = key.replace('module.', '') 53 | new_state_dict_D[new_key] = value 54 | 55 | self.wgan.G.load_state_dict(new_state_dict_G) 56 | self.wgan.D.load_state_dict(new_state_dict_D) 57 | 58 | self.mean = gan_checkpoint["dataset_mean"] 59 | self.std = gan_checkpoint["dataset_std"] 60 | 61 | def compute_controllability(self, n_samples=200000): 62 | _, intermediate, z = self.wgan.sample_generator(num_samples=n_samples, nograd=True, return_intermediate=True) 63 | intermediate = intermediate.cpu() 64 | z = z.cpu() 65 | U = self.controllable_speakers(intermediate, z) 66 | return U 67 | 68 | def controllable_speakers(self, intermediate, z): 69 | pca = torch.pca_lowrank(intermediate) 70 | mu = intermediate.mean() 71 | X = torch.matmul((intermediate - mu), pca[2]) 72 | U = torch.linalg.lstsq(X, z) 73 | return U 74 | 75 | def get_original_embed(self): 76 | self.wgan.G.eval() 77 | embed_original = self.wgan.G.module.forward(self.z.to(self.device)) 78 | 79 | if self.normalize: 80 | embed_original = inverse_normalize( 81 | embed_original.cpu(), 82 | self.mean.cpu().unsqueeze(0), 83 | self.std.cpu().unsqueeze(0) 84 | ) 85 | return embed_original 86 | 87 | def modify_embed(self, x): 88 | self.wgan.G.eval() 89 | z_new = self.z.squeeze() + torch.matmul(self.U.solution.t(), x) 90 | embed_modified = self.wgan.G.forward(z_new.unsqueeze(0).to(self.device)) 91 | if self.normalize: 92 | embed_modified = inverse_normalize( 93 | embed_modified.cpu(), 94 | self.mean.cpu().unsqueeze(0), 95 | self.std.cpu().unsqueeze(0) 96 | ) 97 | return embed_modified 98 | 99 | 100 | def inverse_normalize(tensor, mean, std): 101 | return tensor * std + mean 102 | -------------------------------------------------------------------------------- /Utility/corpus_preparation.py: -------------------------------------------------------------------------------- 1 | import torch.multiprocessing 2 | from huggingface_hub import hf_hub_download 3 | 4 | from Modules.Aligner.CodecAlignerDataset import CodecAlignerDataset 5 | from Modules.Aligner.autoaligner_train_loop import train_loop as train_aligner 6 | from Modules.ToucanTTS.TTSDataset import TTSDataset 7 | from Utility.path_to_transcript_dicts import * 8 | from Utility.storage_config import MODEL_DIR 9 | 10 | 11 | def prepare_aligner_corpus(transcript_dict, corpus_dir, lang, device, phone_input=False, 12 | gpu_count=1, 13 | rank=0): 14 | return CodecAlignerDataset(transcript_dict, 15 | cache_dir=corpus_dir, 16 | lang=lang, 17 | loading_processes=5, # this can be increased for massive clusters, but the overheads that are introduced are kind of not really worth it 18 | device=device, 19 | phone_input=phone_input, 20 | gpu_count=gpu_count, 21 | rank=rank) 22 | 23 | 24 | def prepare_tts_corpus(transcript_dict, 25 | corpus_dir, 26 | lang, 27 | # For small datasets it's best to turn this off and instead inspect the data with the scorer, if there are any issues. 28 | fine_tune_aligner=True, 29 | use_reconstruction=True, 30 | phone_input=False, 31 | save_imgs=False, 32 | gpu_count=1, 33 | rank=0): 34 | """ 35 | create an aligner dataset, 36 | fine-tune an aligner, 37 | create a TTS dataset, 38 | return it. 39 | 40 | Automatically skips parts that have been done before. 
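Minimal usage sketch (mirroring the recipes in this repository; the transcript dict, cache directory and language tag are examples to adapt to your own data):

    train_data = prepare_tts_corpus(transcript_dict=build_path_to_transcript_integration_test(),
                                    corpus_dir=os.path.join(PREPROCESSING_DIR, "integration_test"),
                                    lang="eng")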
41 | """ 42 | if not os.path.exists(os.path.join(corpus_dir, "tts_train_cache.pt")): 43 | if fine_tune_aligner: 44 | aligner_dir = os.path.join(corpus_dir, "Aligner") 45 | aligner_loc = os.path.join(corpus_dir, "Aligner", "aligner.pt") 46 | 47 | if not os.path.exists(os.path.join(corpus_dir, "aligner_train_cache.pt")): 48 | prepare_aligner_corpus(transcript_dict, corpus_dir=corpus_dir, lang=lang, phone_input=phone_input, device=torch.device("cuda")) 49 | 50 | if not os.path.exists(os.path.join(aligner_dir, "aligner.pt")): 51 | aligner_datapoints = prepare_aligner_corpus(transcript_dict, corpus_dir=corpus_dir, lang=lang, phone_input=phone_input, device=torch.device("cuda")) 52 | train_aligner(train_dataset=aligner_datapoints, 53 | device=torch.device("cuda"), 54 | save_directory=aligner_dir, 55 | steps=min(len(aligner_datapoints) // 2, 10000), # relatively good finetuning heuristic 56 | batch_size=16 if len(aligner_datapoints) > 16 else len(aligner_datapoints) // 2, 57 | path_to_checkpoint=hf_hub_download(cache_dir=MODEL_DIR, repo_id="Flux9665/ToucanTTS", filename="Aligner.pt"), 58 | fine_tune=True, 59 | debug_img_path=aligner_dir, 60 | resume=False, 61 | use_reconstruction=use_reconstruction) 62 | else: 63 | aligner_loc = hf_hub_download(cache_dir=MODEL_DIR, repo_id="Flux9665/ToucanTTS", filename="Aligner.pt") 64 | else: 65 | aligner_loc = None 66 | return TTSDataset(transcript_dict, 67 | acoustic_checkpoint_path=aligner_loc, 68 | cache_dir=corpus_dir, 69 | device=torch.device("cuda"), 70 | lang=lang, 71 | save_imgs=save_imgs, 72 | gpu_count=gpu_count, 73 | rank=rank) 74 | -------------------------------------------------------------------------------- /Modules/ToucanTTS/LanguageEmbeddingSpaceStructureLoss.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | 3 | import torch 4 | 5 | from Preprocessing.multilinguality.create_distance_lookups import CacheCreator 6 | from Utility.utils import load_json_from_path 7 | 8 | 9 | class LanguageEmbeddingSpaceStructureLoss(torch.nn.Module): 10 | 11 | def __init__(self): 12 | super().__init__() 13 | cc = CacheCreator(cache_root="Preprocessing/multilinguality") 14 | if not os.path.exists('Preprocessing/multilinguality/lang_1_to_lang_2_to_tree_dist.json'): 15 | cc.create_tree_cache(cache_root="Preprocessing/multilinguality") 16 | if not os.path.exists('Preprocessing/multilinguality/lang_1_to_lang_2_to_tree_dist.json'): 17 | cc.create_map_cache(cache_root="Preprocessing/multilinguality") 18 | 19 | self.tree_dist = load_json_from_path('Preprocessing/multilinguality/lang_1_to_lang_2_to_tree_dist.json') 20 | self.map_dist = load_json_from_path('Preprocessing/multilinguality/lang_1_to_lang_2_to_map_dist.json') 21 | # with open("Preprocessing/multilinguality/asp_dict.pkl", 'rb') as dictfile: 22 | # self.asp_sim = pickle.load(dictfile) 23 | # self.lang_list = list(self.asp_sim.keys()) # list of all languages, to get lang_b's index 24 | 25 | self.largest_value_map_dist = 0.0 26 | for _, values in self.map_dist.items(): 27 | for _, value in values.items(): 28 | self.largest_value_map_dist = max(self.largest_value_map_dist, value) 29 | 30 | self.iso_codes_to_ids = load_json_from_path("Preprocessing/multilinguality/iso_lookup.json")[-1] 31 | self.ids_to_iso_codes = {v: k for k, v in self.iso_codes_to_ids.items()} 32 | 33 | def forward(self, language_ids, language_embeddings): 34 | """ 35 | Args: 36 | language_ids (Tensor): IDs of languages in the same order as the embeddings to calculate the distances 
according to the metrics. 37 | language_embeddings (Tensor): Batch of language embeddings, of which the distances will be compared to the distances according to the metrics. 38 | 39 | Returns: 40 | Tensor: Language Embedding Structure Loss Value 41 | """ 42 | 43 | losses = list() 44 | for language_id_1, language_embedding_1 in zip(language_ids, language_embeddings): 45 | for language_id_2, language_embedding_2 in zip(language_ids, language_embeddings): 46 | if language_id_1 != language_id_2: 47 | embed_dist = torch.nn.functional.l1_loss(language_embedding_1, language_embedding_2) 48 | lang_1 = self.ids_to_iso_codes[language_id_1] 49 | lang_2 = self.ids_to_iso_codes[language_id_2] 50 | 51 | # Value Range Normalized Tree Dist 52 | try: 53 | tree_dist = self.tree_dist[lang_1][lang_2] 54 | except KeyError: 55 | tree_dist = self.tree_dist[lang_2][lang_1] 56 | 57 | # Value Range Normalized Map Dist 58 | try: 59 | map_dist = self.map_dist[lang_1][lang_2] / self.largest_value_map_dist 60 | except KeyError: 61 | map_dist = self.map_dist[lang_2][lang_1] / self.largest_value_map_dist 62 | 63 | # Value Range Normalized ASP Dist 64 | # lang_2_idx = self.lang_list.index(lang_2) 65 | # asp_dist = 1.0 - self.asp_sim[lang_1][lang_2_idx] # it's a similarity measure that goes from 0 to 1, so we subtract it from 1 to turn it into a distance 66 | 67 | # Average distance should be similar to embedding distance to bring some structure into the embedding-space 68 | # metric_distance = (torch.tensor(tree_dist) + torch.tensor(map_dist) + torch.tensor(asp_dist)) / 3 69 | metric_distance = (torch.tensor(tree_dist) + torch.tensor(map_dist)) / 2 70 | losses.append(torch.nn.functional.l1_loss(embed_dist, metric_distance)) 71 | 72 | return sum(losses) / len(losses) 73 | -------------------------------------------------------------------------------- /Modules/ToucanTTS/CodecDiscriminator.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | def weights_init_D(m): 6 | classname = m.__class__.__name__ 7 | if classname.find('Conv') != -1: 8 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='leaky_relu') 9 | elif classname.find('BatchNorm') != -1: 10 | nn.init.constant_(m.weight, 1) 11 | nn.init.constant_(m.bias, 0) 12 | 13 | 14 | class SpectrogramDiscriminator(torch.nn.Module): 15 | def __init__(self): 16 | super().__init__() 17 | self.D = DiscriminatorNet() 18 | self.D.apply(weights_init_D) 19 | 20 | def _generator_feedback(self, data_generated, data_real): 21 | for p in self.D.parameters(): 22 | p.requires_grad = False # freeze critic 23 | 24 | score_fake, fmap_fake = self.D(data_generated) 25 | _, fmap_real = self.D(data_real) 26 | 27 | feature_matching_loss = 0.0 28 | for feat_fake, feat_real in zip(fmap_fake, fmap_real): 29 | feature_matching_loss += nn.functional.l1_loss(feat_fake, feat_real.detach()) 30 | 31 | discr_loss = nn.functional.mse_loss(input=score_fake, target=torch.ones(score_fake.shape, device=score_fake.device), reduction="mean") 32 | 33 | return feature_matching_loss + discr_loss 34 | 35 | def _discriminator_feature_matching(self, data_generated, data_real): 36 | for p in self.D.parameters(): 37 | p.requires_grad = True # unfreeze critic 38 | self.D.train() 39 | 40 | score_fake, _ = self.D(data_generated) 41 | score_real, _ = self.D(data_real) 42 | 43 | discr_loss = 0.0 44 | discr_loss = discr_loss + nn.functional.mse_loss(input=score_fake, target=torch.zeros(score_fake.shape, device=score_fake.device), 
reduction="mean") 45 | discr_loss = discr_loss + nn.functional.mse_loss(input=score_real, target=torch.ones(score_real.shape, device=score_real.device), reduction="mean") 46 | 47 | return discr_loss 48 | 49 | def calc_discriminator_loss(self, data_generated, data_real): 50 | return self._discriminator_feature_matching(data_generated.detach(), data_real) 51 | 52 | def calc_generator_feedback(self, data_generated, data_real): 53 | return self._generator_feedback(data_generated, data_real) 54 | 55 | 56 | class DiscriminatorNet(nn.Module): 57 | def __init__(self): 58 | super().__init__() 59 | self.filters = nn.ModuleList([ 60 | nn.utils.weight_norm(nn.Conv2d(1, 32, kernel_size=(3, 9), padding=(1, 4))), 61 | nn.utils.weight_norm(nn.Conv2d(32, 32, kernel_size=(3, 9), stride=(1, 2), padding=(1, 4))), 62 | nn.utils.weight_norm(nn.Conv2d(32, 32, kernel_size=(3, 9), stride=(1, 2), padding=(1, 4))), 63 | nn.utils.weight_norm(nn.Conv2d(32, 32, kernel_size=(3, 9), stride=(1, 2), padding=(1, 4))), 64 | nn.utils.weight_norm(nn.Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))), 65 | ]) 66 | 67 | self.out = nn.utils.weight_norm(nn.Conv2d(32, 1, 3, 1, 1)) 68 | 69 | self.fc = nn.Linear(900, 1) # this needs to be changed everytime the window length is changes. It would be nice if this could be done dynamically. 70 | 71 | def forward(self, y): 72 | feature_maps = list() 73 | feature_maps.append(y) 74 | for d in self.filters: 75 | y = d(y) 76 | feature_maps.append(y) 77 | y = nn.functional.leaky_relu(y, 0.1) 78 | y = self.out(y) 79 | feature_maps.append(y) 80 | y = torch.flatten(y, 1, -1) 81 | y = self.fc(y) 82 | 83 | return y, feature_maps 84 | 85 | 86 | if __name__ == '__main__': 87 | d = SpectrogramDiscriminator() 88 | fake = torch.randn([2, 100, 72]) # [Batch, Sequence Length, Spectrogram Buckets] 89 | real = torch.randn([2, 100, 72]) # [Batch, Sequence Length, Spectrogram Buckets] 90 | 91 | critic_loss = d.calc_discriminator_loss((fake.unsqueeze(1)), real.unsqueeze(1)) 92 | generator_loss = d.calc_generator_feedback(fake.unsqueeze(1), real.unsqueeze(1)) 93 | print(critic_loss) 94 | print(generator_loss) 95 | -------------------------------------------------------------------------------- /Modules/GeneralLayers/VariancePredictor.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Tomoki Hayashi 2 | # MIT License (https://opensource.org/licenses/MIT) 3 | # Adapted by Florian Lux 2023 4 | 5 | from abc import ABC 6 | 7 | import torch 8 | 9 | from Modules.GeneralLayers.ConditionalLayerNorm import AdaIN1d 10 | from Modules.GeneralLayers.ConditionalLayerNorm import ConditionalLayerNorm 11 | from Modules.GeneralLayers.LayerNorm import LayerNorm 12 | from Utility.utils import integrate_with_utt_embed 13 | 14 | 15 | class VariancePredictor(torch.nn.Module, ABC): 16 | """ 17 | Variance predictor module. 18 | 19 | This is a module of variance predictor described in `FastSpeech 2: 20 | Fast and High-Quality End-to-End Text to Speech`_. 21 | 22 | .. _`FastSpeech 2: Fast and High-Quality End-to-End Text to Speech`: 23 | https://arxiv.org/abs/2006.04558 24 | 25 | """ 26 | 27 | def __init__(self, 28 | idim, 29 | n_layers=2, 30 | n_chans=384, 31 | kernel_size=3, 32 | bias=True, 33 | dropout_rate=0.5, 34 | utt_embed_dim=None, 35 | embedding_integration="AdaIN"): 36 | """ 37 | Initialize duration predictor module. 38 | 39 | Args: 40 | idim (int): Input dimension. 41 | n_layers (int, optional): Number of convolutional layers. 
42 | n_chans (int, optional): Number of channels of convolutional layers. 43 | kernel_size (int, optional): Kernel size of convolutional layers. 44 | dropout_rate (float, optional): Dropout rate. 45 | """ 46 | super().__init__() 47 | self.conv = torch.nn.ModuleList() 48 | self.dropouts = torch.nn.ModuleList() 49 | self.norms = torch.nn.ModuleList() 50 | self.embedding_projections = torch.nn.ModuleList() 51 | self.utt_embed_dim = utt_embed_dim 52 | self.use_conditional_layernorm_embedding_integration = embedding_integration in ["AdaIN", "ConditionalLayerNorm"] 53 | 54 | for idx in range(n_layers): 55 | if utt_embed_dim is not None: 56 | if embedding_integration == "AdaIN": 57 | self.embedding_projections += [AdaIN1d(style_dim=utt_embed_dim, num_features=idim)] 58 | elif embedding_integration == "ConditionalLayerNorm": 59 | self.embedding_projections += [ConditionalLayerNorm(speaker_embedding_dim=utt_embed_dim, hidden_dim=idim)] 60 | else: 61 | self.embedding_projections += [torch.nn.Linear(utt_embed_dim + idim, idim)] 62 | else: 63 | self.embedding_projections += [lambda x: x] 64 | in_chans = idim if idx == 0 else n_chans 65 | self.conv += [torch.nn.Sequential(torch.nn.Conv1d(in_chans, n_chans, kernel_size, stride=1, padding=(kernel_size - 1) // 2, bias=bias, ), 66 | torch.nn.ReLU())] 67 | self.norms += [LayerNorm(n_chans, dim=1)] 68 | self.dropouts += [torch.nn.Dropout(dropout_rate)] 69 | 70 | self.linear = torch.nn.Linear(n_chans, 1) 71 | 72 | def forward(self, xs, padding_mask=None, utt_embed=None): 73 | """ 74 | Calculate forward propagation. 75 | 76 | Args: 77 | xs (Tensor): Batch of input sequences (B, Tmax, idim). 78 | padding_mask (ByteTensor, optional): 79 | Batch of masks indicating padded part (B, Tmax). 80 | 81 | Returns: 82 | Tensor: Batch of predicted sequences (B, Tmax, 1). 83 | """ 84 | xs = xs.transpose(1, -1) # (B, idim, Tmax) 85 | 86 | for f, c, d, p in zip(self.conv, self.norms, self.dropouts, self.embedding_projections): 87 | xs = f(xs) # (B, C, Tmax) 88 | if self.utt_embed_dim is not None: 89 | xs = integrate_with_utt_embed(hs=xs.transpose(1, 2), utt_embeddings=utt_embed, projection=p, embedding_training=self.use_conditional_layernorm_embedding_integration).transpose(1, 2) 90 | xs = c(xs) 91 | xs = d(xs) 92 | 93 | xs = self.linear(xs.transpose(1, 2)) # (B, Tmax, 1) 94 | 95 | if padding_mask is not None: 96 | xs = xs.masked_fill(padding_mask, 0.0) 97 | 98 | return xs 99 | -------------------------------------------------------------------------------- /Modules/GeneralLayers/ResidualBlock.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | References: 5 | - https://github.com/jik876/hifi-gan 6 | - https://github.com/kan-bayashi/ParallelWaveGAN 7 | """ 8 | 9 | import torch 10 | 11 | 12 | class Conv1d(torch.nn.Conv1d): 13 | """ 14 | Conv1d module with customized initialization. 15 | """ 16 | 17 | def __init__(self, *args, **kwargs): 18 | super(Conv1d, self).__init__(*args, **kwargs) 19 | 20 | def reset_parameters(self): 21 | torch.nn.init.kaiming_normal_(self.weight, nonlinearity="relu") 22 | if self.bias is not None: 23 | torch.nn.init.constant_(self.bias, 0.0) 24 | 25 | 26 | class Conv1d1x1(Conv1d): 27 | """ 28 | 1x1 Conv1d with customized initialization. 
29 | """ 30 | 31 | def __init__(self, in_channels, out_channels, bias): 32 | super(Conv1d1x1, self).__init__(in_channels, out_channels, kernel_size=1, padding=0, dilation=1, bias=bias) 33 | 34 | 35 | class HiFiGANResidualBlock(torch.nn.Module): 36 | """Residual block module in HiFiGAN.""" 37 | 38 | def __init__(self, 39 | kernel_size=3, 40 | channels=512, 41 | dilations=(1, 3, 5), 42 | bias=True, 43 | use_additional_convs=True, 44 | nonlinear_activation="LeakyReLU", 45 | nonlinear_activation_params={"negative_slope": 0.1}, ): 46 | """ 47 | Initialize HiFiGANResidualBlock module. 48 | 49 | Args: 50 | kernel_size (int): Kernel size of dilation convolution layer. 51 | channels (int): Number of channels for convolution layer. 52 | dilations (List[int]): List of dilation factors. 53 | use_additional_convs (bool): Whether to use additional convolution layers. 54 | bias (bool): Whether to add bias parameter in convolution layers. 55 | nonlinear_activation (str): Activation function module name. 56 | nonlinear_activation_params (dict): Hyperparameters for activation function. 57 | """ 58 | super().__init__() 59 | self.use_additional_convs = use_additional_convs 60 | self.convs1 = torch.nn.ModuleList() 61 | if use_additional_convs: 62 | self.convs2 = torch.nn.ModuleList() 63 | assert kernel_size % 2 == 1, "Kernel size must be odd number." 64 | for dilation in dilations: 65 | self.convs1 += [torch.nn.Sequential(getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params), 66 | torch.nn.Conv1d(channels, 67 | channels, 68 | kernel_size, 69 | 1, 70 | dilation=dilation, 71 | bias=bias, 72 | padding=(kernel_size - 1) // 2 * dilation, ), )] 73 | if use_additional_convs: 74 | self.convs2 += [torch.nn.Sequential(getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params), 75 | torch.nn.Conv1d(channels, 76 | channels, 77 | kernel_size, 78 | 1, 79 | dilation=1, 80 | bias=bias, 81 | padding=(kernel_size - 1) // 2, ), )] 82 | 83 | def forward(self, x): 84 | """ 85 | Calculate forward propagation. 86 | 87 | Args: 88 | x (Tensor): Input tensor (B, channels, T). 89 | 90 | Returns: 91 | Tensor: Output tensor (B, channels, T). 
92 | """ 93 | for idx in range(len(self.convs1)): 94 | xt = self.convs1[idx](x) 95 | if self.use_additional_convs: 96 | xt = self.convs2[idx](xt) 97 | x = xt + x 98 | return x 99 | -------------------------------------------------------------------------------- /Modules/ToucanTTS/EnergyCalculator.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Nagoya University (Tomoki Hayashi) 2 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 3 | # Adapted by Florian Lux 2021 4 | 5 | import torch 6 | import torch.nn.functional as F 7 | 8 | from Modules.GeneralLayers.STFT import STFT 9 | from Utility.utils import pad_list 10 | 11 | 12 | class EnergyCalculator(torch.nn.Module): 13 | 14 | def __init__(self, fs=16000, n_fft=1024, win_length=None, hop_length=256, window="hann", center=True, 15 | normalized=False, onesided=True, use_token_averaged_energy=True, reduction_factor=1): 16 | super().__init__() 17 | 18 | self.fs = fs 19 | self.n_fft = n_fft 20 | self.hop_length = hop_length 21 | self.win_length = win_length 22 | self.window = window 23 | self.use_token_averaged_energy = use_token_averaged_energy 24 | if use_token_averaged_energy: 25 | assert reduction_factor >= 1 26 | self.reduction_factor = reduction_factor 27 | 28 | self.stft = STFT(n_fft=n_fft, win_length=win_length, hop_length=hop_length, window=window, center=center, normalized=normalized, onesided=onesided) 29 | 30 | def output_size(self): 31 | return 1 32 | 33 | def get_parameters(self): 34 | return dict(fs=self.fs, n_fft=self.n_fft, hop_length=self.hop_length, window=self.window, win_length=self.win_length, center=self.stft.center, 35 | normalized=self.stft.normalized, use_token_averaged_energy=self.use_token_averaged_energy, reduction_factor=self.reduction_factor) 36 | 37 | def forward(self, input_waves, input_waves_lengths=None, feats_lengths=None, durations=None, 38 | durations_lengths=None, norm_by_average=True, text=None): 39 | # If not provided, we assume that the inputs have the same length 40 | if input_waves_lengths is None: 41 | input_waves_lengths = (input_waves.new_ones(input_waves.shape[0], dtype=torch.long) * input_waves.shape[1]) 42 | 43 | # Domain-conversion: e.g. 
Stft: time -> time-freq 44 | input_stft, energy_lengths = self.stft(input_waves, input_waves_lengths) 45 | 46 | assert input_stft.dim() >= 4, input_stft.shape 47 | assert input_stft.shape[-1] == 2, input_stft.shape 48 | 49 | # input_stft: (..., F, 2) -> (..., F) 50 | input_power = input_stft[..., 0] ** 2 + input_stft[..., 1] ** 2 51 | # sum over frequency (B, N, F) -> (B, N) 52 | energy = torch.sqrt(torch.clamp(input_power.sum(dim=2), min=1.0e-10)) 53 | 54 | # (Optional): Adjust length to match with the features 55 | if feats_lengths is not None: 56 | energy = [self._adjust_num_frames(e[:el].view(-1), fl) for e, el, fl in zip(energy, energy_lengths, feats_lengths)] 57 | energy_lengths = feats_lengths 58 | 59 | # (Optional): Average by duration to calculate token-wise energy 60 | if self.use_token_averaged_energy: 61 | energy = [self._average_by_duration(e[:el].view(-1), d, text) for e, el, d in zip(energy, energy_lengths, durations)] 62 | energy_lengths = durations_lengths 63 | 64 | # Padding 65 | if isinstance(energy, list): 66 | energy = pad_list(energy, 0.0) 67 | 68 | if norm_by_average: 69 | average = energy[0][energy[0] != 0.0].mean() 70 | energy = energy / average 71 | 72 | # Return with the shape (B, T, 1) 73 | return energy.unsqueeze(-1), energy_lengths 74 | 75 | def _average_by_duration(self, x, d, text=None): 76 | d_cumsum = F.pad(d.cumsum(dim=0), (1, 0)) 77 | x_avg = [x[start:end].mean() if len(x[start:end]) != 0 else x.new_tensor(0.0) for start, end in zip(d_cumsum[:-1], d_cumsum[1:])] 78 | 79 | # find tokens that are not phoneme and set energy to 0 80 | # while this makes sense, it make sit harder to model, so we leave this out 81 | # if text is not None: 82 | # for i, vector in enumerate(text): 83 | # if vector[get_feature_to_index_lookup()["phoneme"]] == 0: 84 | # x_avg[i] = torch.tensor(0.0, device=x.device) 85 | 86 | return torch.stack(x_avg) 87 | 88 | @staticmethod 89 | def _adjust_num_frames(x, num_frames): 90 | if num_frames > len(x): 91 | x = F.pad(x, (0, num_frames - len(x))) 92 | elif num_frames < len(x): 93 | x = x[:num_frames] 94 | return x 95 | -------------------------------------------------------------------------------- /Utility/silence_removal.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import soundfile as sf 3 | import torch 4 | from tqdm import tqdm 5 | 6 | from Preprocessing.TextFrontend import get_feature_to_index_lookup 7 | from Utility.path_to_transcript_dicts import * 8 | 9 | 10 | def make_silence_cleaned_versions(train_sets): 11 | torch.hub._validate_not_a_forked_repo = lambda a, b, c: True # torch 1.9 has a bug in the hub loading, this is a workaround 12 | # careful: assumes 16kHz or 8kHz audio 13 | silero_model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad', 14 | model='silero_vad', 15 | force_reload=False, 16 | onnx=False, 17 | verbose=False) 18 | (get_speech_timestamps, 19 | save_audio, 20 | read_audio, 21 | VADIterator, 22 | collect_chunks) = utils 23 | torch.set_grad_enabled(True) # finding this issue was very infuriating: silero sets 24 | # this to false globally during model loading rather than using inference mode or no_grad 25 | device = "cuda" if torch.cuda.is_available() else "cpu" 26 | silero_model = silero_model.to(device) 27 | 28 | for train_set in train_sets: 29 | for index in tqdm(range(len(train_set))): 30 | filepath = train_set.datapoints[index][8] 31 | phonemes = train_set.datapoints[index][0] 32 | speech_length = train_set.datapoints[index][3] 33 | 
durations = train_set.datapoints[index][4] 34 | cumsum = 0 35 | legal_silences = list() 36 | for phoneme_index, phone in enumerate(phonemes): 37 | if phone[get_feature_to_index_lookup()["silence"]] == 1 or phone[get_feature_to_index_lookup()["end of sentence"]] == 1 or phone[get_feature_to_index_lookup()["questionmark"]] == 1 or phone[get_feature_to_index_lookup()["exclamationmark"]] == 1 or phone[get_feature_to_index_lookup()["fullstop"]] == 1: 38 | legal_silences.append([cumsum, cumsum + durations[phoneme_index]]) 39 | cumsum = cumsum + durations[phoneme_index] 40 | wave, sr = sf.read(filepath) 41 | resampled_wave = librosa.resample(wave, orig_sr=sr, target_sr=16000) 42 | with torch.inference_mode(): 43 | speech_timestamps = get_speech_timestamps(torch.Tensor(resampled_wave).to(device), silero_model, sampling_rate=16000) 44 | silences = list() 45 | prev_end = 0 46 | for speech_segment in speech_timestamps: 47 | if prev_end != 0: 48 | silences.append([prev_end, speech_segment["start"]]) 49 | prev_end = speech_segment["end"] 50 | # at this point we know all the silences and we know the legal silences. 51 | # We have to transform them both into ratios, so we can compare them. 52 | # If a silence overlaps with a legal silence, it can stay. 53 | illegal_silences = list() 54 | for silence in silences: 55 | illegal = True 56 | start = silence[0] / len(resampled_wave) 57 | end = silence[1] / len(resampled_wave) 58 | for legal_silence in legal_silences: 59 | legal_start = legal_silence[0] / speech_length 60 | legal_end = legal_silence[1] / speech_length 61 | if legal_start < start < legal_end or legal_start < end < legal_end: 62 | illegal = False 63 | break 64 | if illegal: 65 | # If it is an illegal silence, it is marked for removal in the original wave, using its ratio positions mapped back to the real sampling rate. 66 | illegal_silences.append([start, end]) 67 | 68 | # print(f"{len(illegal_silences)} illegal silences detected. ({len(silences) - len(illegal_silences)} legal silences left)") 69 | wave = list(wave) 70 | orig_wave_length = len(wave) 71 | for illegal_silence in reversed(illegal_silences): 72 | wave = wave[:int(illegal_silence[0] * orig_wave_length)] + wave[int(illegal_silence[1] * orig_wave_length):] 73 | # Audio with illegal silences removed will be saved into a new directory. 74 | new_filepath_list = filepath.split("/") 75 | new_filepath_list[-2] = new_filepath_list[-2] + "_silence_removed" 76 | os.makedirs("/".join(new_filepath_list[:-1]), exist_ok=True) 77 | sf.write("/".join(new_filepath_list), wave, sr) 78 | -------------------------------------------------------------------------------- /Modules/Vocoder/BigVGAN.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 NVIDIA CORPORATION. 2 | # Licensed under the MIT license. 3 | 4 | # Adapted from https://github.com/jik876/hifi-gan under the MIT license. 5 | 6 | import torch 7 | from alias_free_torch import Activation1d 8 | from torch.nn import Conv1d 9 | from torch.nn import ConvTranspose1d 10 | from torch.nn import ModuleList 11 | from torch.nn.utils import remove_weight_norm 12 | from torch.nn.utils import weight_norm 13 | 14 | from Modules.Vocoder.AMP import AMPBlock1 15 | from Modules.Vocoder.Snake import SnakeBeta 16 | 17 | 18 | class BigVGAN(torch.nn.Module): 19 | # this is the main BigVGAN model. Applies anti-aliased periodic activation for resblocks. 
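    # Reading aid (an editorial summary of the code below, not part of the original implementation):
    # a weight-normed pre-conv projects the spectrogram into the channel dimension, transposed
    # convolutions upsample it towards waveform rate, AMP residual blocks with anti-aliased Snake
    # activations refine each scale, and a final Snake activation plus post-conv with tanh yields the waveform.
    # A usage sketch mirroring the self-test at the bottom of this file:
    #   vocoder = BigVGAN()
    #   wave = vocoder(torch.randn([1, 128, 100]))  # (batch, num_mels, frames) -> (batch, 1, samples)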
20 | 21 | def __init__(self, 22 | num_mels=128, 23 | upsample_initial_channel=1024, 24 | upsample_rates=(8, 6, 2, 2, 2), # CAREFUL: Avocodo discriminator assumes that there are always 4 upsample scales, because it takes intermediate results. 25 | upsample_kernel_sizes=(16, 12, 4, 4, 4), 26 | resblock_kernel_sizes=(3, 7, 11), 27 | resblock_dilation_sizes=((1, 3, 5), (1, 3, 5), (1, 3, 5)), 28 | weights=None 29 | ): 30 | super(BigVGAN, self).__init__() 31 | 32 | self.num_kernels = len(resblock_kernel_sizes) 33 | self.num_upsamples = len(upsample_rates) 34 | 35 | # pre conv 36 | self.conv_pre = weight_norm(Conv1d(num_mels, upsample_initial_channel, 7, 1, padding=3)) 37 | 38 | # transposed conv-based upsamplers. does not apply anti-aliasing 39 | self.ups = ModuleList() 40 | for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): 41 | self.ups.append(ModuleList([ 42 | weight_norm(ConvTranspose1d(upsample_initial_channel // (2 ** i), 43 | upsample_initial_channel // (2 ** (i + 1)), 44 | k, u, padding=(k - u) // 2)) 45 | ])) 46 | 47 | # residual blocks using anti-aliased multi-periodicity composition modules (AMP) 48 | self.resblocks = ModuleList() 49 | for i in range(len(self.ups)): 50 | ch = upsample_initial_channel // (2 ** (i + 1)) 51 | for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)): 52 | self.resblocks.append(AMPBlock1(ch, k, d)) 53 | 54 | # post conv 55 | activation_post = SnakeBeta(ch, alpha_logscale=True) 56 | self.activation_post = Activation1d(activation=activation_post) 57 | 58 | self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3)) 59 | 60 | # weight initialization 61 | for i in range(len(self.ups)): 62 | self.ups[i].apply(init_weights) 63 | self.conv_post.apply(init_weights) 64 | 65 | if weights is not None: 66 | self.load_state_dict(weights) 67 | 68 | def forward(self, x): 69 | # pre conv 70 | x = self.conv_pre(x) 71 | 72 | for i in range(self.num_upsamples): 73 | # upsampling 74 | for i_up in range(len(self.ups[i])): 75 | x = self.ups[i][i_up](x) 76 | # AMP blocks 77 | xs = None 78 | for j in range(self.num_kernels): 79 | if xs is None: 80 | xs = self.resblocks[i * self.num_kernels + j](x) 81 | else: 82 | xs += self.resblocks[i * self.num_kernels + j](x) 83 | x = xs / self.num_kernels 84 | 85 | # post conv 86 | x = self.activation_post(x) 87 | x = self.conv_post(x) 88 | x = torch.tanh(x) 89 | 90 | return x 91 | 92 | def remove_weight_norm(self): 93 | print('Removing weight norm...') 94 | for l in self.ups: 95 | for l_i in l: 96 | remove_weight_norm(l_i) 97 | for l in self.resblocks: 98 | l.remove_weight_norm() 99 | remove_weight_norm(self.conv_pre) 100 | remove_weight_norm(self.conv_post) 101 | 102 | 103 | def init_weights(m, mean=0.0, std=0.01): 104 | classname = m.__class__.__name__ 105 | if classname.find("Conv") != -1: 106 | m.weight.data.normal_(mean, std) 107 | 108 | 109 | def apply_weight_norm(m): 110 | classname = m.__class__.__name__ 111 | if classname.find("Conv") != -1: 112 | weight_norm(m) 113 | 114 | 115 | def get_padding(kernel_size, dilation=1): 116 | return int((kernel_size * dilation - dilation) / 2) 117 | 118 | 119 | if __name__ == '__main__': 120 | vgan = BigVGAN() 121 | print(f"BigVGAN parameter count: {sum(p.numel() for p in vgan.parameters() if p.requires_grad)}") 122 | print(BigVGAN()(torch.randn([1, 128, 100])).shape) 123 | -------------------------------------------------------------------------------- /Modules/ToucanTTS/toucantts_train_loop_arbiter.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from Modules.ToucanTTS.toucantts_meta_train_loop import train_loop as multi_language_loop 4 | from Modules.ToucanTTS.toucantts_train_loop import train_loop as mono_language_loop 5 | 6 | 7 | def train_loop(net, # an already initialized ToucanTTS model that should be trained. 8 | datasets, 9 | # a list of datasets to train on. Every dataset within a language should already be a concat dataset of all the datasets 10 | # in that language. So every list entry here should be a (combined) dataset for each language. For the case of a monolingual model, pass a list 11 | # with only one dataset in it. This will trigger the arbiter to call the train loop for simple one language training runs rather than the complex 12 | # LAML based one. 13 | train_samplers, # the sampler(s) for the dataloader(s) (gpu_count or single GPU use different ones) 14 | gpu_count, # amount of GPUs to use 15 | device, # the device where this training should run on. 16 | save_directory, # directory where the models and visualizations should be saved. 17 | steps_per_checkpoint=None, # how many steps should be trained before a checkpoint is created. This is only relevant for the multilingual case, 18 | # the monolingual case will do this once per epoch, regardless of the steps. 19 | path_to_checkpoint=None, # path to a trained checkpoint to either continue training or fine-tune from. 20 | lr=0.0001, # learning rate of the model. 21 | resume=False, # whether to automatically load the most recent checkpoint and resume training from it. 22 | warmup_steps=4000, # how many steps until the learning rate reaches the specified value and starts decreasing again. 23 | use_wandb=False, # whether to use online experiment tracking with weights and biases. Requires prior CLI login. 24 | batch_size=32, # how many samples to put into one batch. Higher batch size is more stable, but requires more VRAM. 25 | eval_lang="eng", # in which language the evaluation sentence is to be plotted. 26 | fine_tune=False, # whether to use the provided checkpoint as basis for fine-tuning. 27 | steps=200000, # how many updates to run until training is completed 28 | use_less_loss=False, # whether to use the loss that enforces a structure in the language embedding space 29 | freeze_lang_embs=False, # whether to use the language embeddings from a checkpoint without modifying them, to maintain compatibility with the zero-shot method. This treats language embeddings from the given checkpoint as constants. 
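               # A minimal, hypothetical call for the single-language case (dataset, sampler and save path
               # here are placeholders, not names from this repository):
               #   train_loop(net=ToucanTTS(), datasets=[my_dataset],
               #              train_samplers=[torch.utils.data.RandomSampler(my_dataset)],
               #              gpu_count=1, device=torch.device("cuda"), save_directory="Models/my_run")
               # A single dataset in the list makes the arbiter dispatch to the monolingual loop below;
               # more than one entry triggers the multilingual (LAML-based) loop.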
30 | ): 31 | torch.multiprocessing.set_start_method('spawn', force=True) 32 | if type(datasets) != list: 33 | datasets = [datasets] 34 | if len(datasets) > 1: 35 | multi_language_loop(net=net, 36 | datasets=datasets, 37 | train_samplers=train_samplers, 38 | device=device, 39 | save_directory=save_directory, 40 | batch_size=batch_size, 41 | steps=steps, 42 | steps_per_checkpoint=steps_per_checkpoint, 43 | lr=lr, 44 | lang=eval_lang, 45 | path_to_checkpoint=path_to_checkpoint, 46 | resume=resume, 47 | fine_tune=fine_tune, 48 | warmup_steps=warmup_steps, 49 | use_wandb=use_wandb, 50 | gpu_count=gpu_count, 51 | use_less_loss=use_less_loss, 52 | freeze_lang_embs=freeze_lang_embs 53 | ) 54 | else: 55 | mono_language_loop(net=net, 56 | train_dataset=datasets[0], 57 | train_sampler=train_samplers[0], 58 | device=device, 59 | save_directory=save_directory, 60 | batch_size=batch_size, 61 | lang=eval_lang, 62 | lr=lr, 63 | warmup_steps=warmup_steps, 64 | path_to_checkpoint=path_to_checkpoint, 65 | fine_tune=fine_tune, 66 | resume=resume, 67 | steps=steps, 68 | use_wandb=use_wandb, 69 | gpu_count=gpu_count, 70 | steps_per_checkpoint=steps_per_checkpoint 71 | ) 72 | -------------------------------------------------------------------------------- /Recipes/ToucanTTS_Massive_German.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import torch 4 | import wandb 5 | 6 | from Utility.path_to_transcript_dicts import * 7 | 8 | 9 | def run(gpu_id, resume_checkpoint, finetune, model_dir, resume, use_wandb, wandb_resume_id, gpu_count): 10 | from torch.utils.data import ConcatDataset 11 | 12 | from Modules.ToucanTTS.ToucanTTS import ToucanTTS 13 | from Modules.ToucanTTS.toucantts_train_loop_arbiter import train_loop 14 | from Utility.corpus_preparation import prepare_tts_corpus 15 | from Utility.storage_config import MODEL_DIR 16 | from Utility.storage_config import PREPROCESSING_DIR 17 | 18 | if gpu_id == "cpu": 19 | device = torch.device("cpu") 20 | else: 21 | device = torch.device("cuda") 22 | 23 | print("Preparing") 24 | 25 | if model_dir is not None: 26 | save_dir = model_dir 27 | else: 28 | save_dir = os.path.join(MODEL_DIR, "ToucanTTS_German_refined") 29 | os.makedirs(save_dir, exist_ok=True) 30 | 31 | if gpu_count > 1: 32 | rank = int(os.environ["LOCAL_RANK"]) 33 | torch.cuda.set_device(rank) 34 | torch.distributed.init_process_group(backend="nccl") 35 | else: 36 | rank = 0 37 | 38 | datasets = list() 39 | 40 | datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_karlsson, 41 | corpus_dir=os.path.join(PREPROCESSING_DIR, "Karlsson"), 42 | lang="deu", 43 | gpu_count=gpu_count, 44 | rank=rank)) 45 | 46 | datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_eva, 47 | corpus_dir=os.path.join(PREPROCESSING_DIR, "Eva"), 48 | lang="deu", 49 | gpu_count=gpu_count, 50 | rank=rank)) 51 | 52 | datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_hokus, 53 | corpus_dir=os.path.join(PREPROCESSING_DIR, "Hokus"), 54 | lang="deu", 55 | gpu_count=gpu_count, 56 | rank=rank)) 57 | 58 | datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_bernd, 59 | corpus_dir=os.path.join(PREPROCESSING_DIR, "Bernd"), 60 | lang="deu", 61 | gpu_count=gpu_count, 62 | rank=rank)) 63 | 64 | datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_friedrich, 65 | corpus_dir=os.path.join(PREPROCESSING_DIR, "Friedrich"), 66 | lang="deu", 67 | gpu_count=gpu_count, 68 | rank=rank)) 69 
| 70 | datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_hui_others, 71 | corpus_dir=os.path.join(PREPROCESSING_DIR, "hui_others"), 72 | lang="deu", 73 | gpu_count=gpu_count, 74 | rank=rank)) 75 | 76 | train_set = ConcatDataset(datasets) 77 | 78 | model = ToucanTTS() 79 | 80 | if gpu_count > 1: 81 | model.to(rank) 82 | model = torch.nn.parallel.DistributedDataParallel( 83 | model, 84 | device_ids=[rank], 85 | output_device=rank, 86 | find_unused_parameters=True, 87 | ) 88 | torch.distributed.barrier() 89 | train_sampler = torch.utils.data.RandomSampler(train_set) 90 | 91 | if use_wandb: 92 | if rank == 0: 93 | wandb.init( 94 | name=f"{__name__.split('.')[-1]}_{time.strftime('%Y%m%d-%H%M%S')}" if wandb_resume_id is None else None, 95 | id=wandb_resume_id, # this is None if not specified in the command line arguments. 96 | resume="must" if wandb_resume_id is not None else None) 97 | print("Training model") 98 | train_loop(net=model, 99 | datasets=[train_set], 100 | batch_size=12, 101 | steps_per_checkpoint=1000, 102 | device=device, 103 | save_directory=save_dir, 104 | eval_lang="deu", 105 | path_to_checkpoint=resume_checkpoint, 106 | fine_tune=finetune, 107 | resume=resume, 108 | use_wandb=use_wandb, 109 | train_samplers=[train_sampler], 110 | gpu_count=gpu_count) 111 | if use_wandb: 112 | wandb.finish() 113 | -------------------------------------------------------------------------------- /Modules/GeneralLayers/STFT.py: -------------------------------------------------------------------------------- 1 | """ 2 | Taken from ESPNet 3 | """ 4 | 5 | import torch 6 | from torch.functional import stft as torch_stft 7 | from torch_complex.tensor import ComplexTensor 8 | 9 | from Utility.utils import make_pad_mask 10 | 11 | 12 | class STFT(torch.nn.Module): 13 | 14 | def __init__(self, n_fft=512, 15 | win_length=None, 16 | hop_length=128, 17 | window="hann", 18 | center=True, 19 | normalized=False, 20 | onesided=True): 21 | super().__init__() 22 | self.n_fft = n_fft 23 | if win_length is None: 24 | self.win_length = n_fft 25 | else: 26 | self.win_length = win_length 27 | self.hop_length = hop_length 28 | self.center = center 29 | self.normalized = normalized 30 | self.onesided = onesided 31 | self.window = window 32 | 33 | def extra_repr(self): 34 | return (f"n_fft={self.n_fft}, " 35 | f"win_length={self.win_length}, " 36 | f"hop_length={self.hop_length}, " 37 | f"center={self.center}, " 38 | f"normalized={self.normalized}, " 39 | f"onesided={self.onesided}") 40 | 41 | def forward(self, input_wave, ilens=None): 42 | """ 43 | STFT forward function. 
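        Uses torch.stft with the configured window and returns the real and imaginary parts stacked
        in the last dimension; if ilens is given, the number of valid frames is derived from it and
        padded frames are masked with zeros.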
44 | Args: 45 | input_wave: (Batch, Nsamples) or (Batch, Nsample, Channels) 46 | ilens: (Batch) 47 | Returns: 48 | output: (Batch, Frames, Freq, 2) or (Batch, Frames, Channels, Freq, 2) 49 | """ 50 | bs = input_wave.size(0) 51 | 52 | if input_wave.dim() == 3: 53 | multi_channel = True 54 | # input: (Batch, Nsample, Channels) -> (Batch * Channels, Nsample) 55 | input_wave = input_wave.transpose(1, 2).reshape(-1, input_wave.size(1)) 56 | else: 57 | multi_channel = False 58 | 59 | # output: (Batch, Freq, Frames, 2=real_imag) 60 | # or (Batch, Channel, Freq, Frames, 2=real_imag) 61 | if self.window is not None: 62 | window_func = getattr(torch, f"{self.window}_window") 63 | window = window_func(self.win_length, dtype=input_wave.dtype, device=input_wave.device) 64 | else: 65 | window = None 66 | 67 | complex_output = torch_stft(input=input_wave, 68 | n_fft=self.n_fft, 69 | win_length=self.win_length, 70 | hop_length=self.hop_length, 71 | center=self.center, 72 | window=window, 73 | normalized=self.normalized, 74 | onesided=self.onesided, 75 | return_complex=True) 76 | output = torch.view_as_real(complex_output) 77 | # output: (Batch, Freq, Frames, 2=real_imag) 78 | # -> (Batch, Frames, Freq, 2=real_imag) 79 | output = output.transpose(1, 2) 80 | if multi_channel: 81 | # output: (Batch * Channel, Frames, Freq, 2=real_imag) 82 | # -> (Batch, Frame, Channel, Freq, 2=real_imag) 83 | output = output.view(bs, -1, output.size(1), output.size(2), 2).transpose(1, 2) 84 | 85 | if ilens is not None: 86 | if self.center: 87 | pad = self.win_length // 2 88 | ilens = ilens + 2 * pad 89 | 90 | olens = torch.div((ilens - self.win_length), self.hop_length, rounding_mode='trunc') + 1 91 | output.masked_fill_(make_pad_mask(olens, output, 1), 0.0) 92 | else: 93 | olens = None 94 | 95 | return output, olens 96 | 97 | def inverse(self, input, ilens=None): 98 | """ 99 | Inverse STFT. 
100 | Args: 101 | input: Tensor(batch, T, F, 2) or ComplexTensor(batch, T, F) 102 | ilens: (batch,) 103 | Returns: 104 | wavs: (batch, samples) 105 | ilens: (batch,) 106 | """ 107 | istft = torch.functional.istft 108 | 109 | if self.window is not None: 110 | window_func = getattr(torch, f"{self.window}_window") 111 | window = window_func(self.win_length, dtype=input.dtype, device=input.device) 112 | else: 113 | window = None 114 | 115 | if isinstance(input, ComplexTensor): 116 | input = torch.stack([input.real, input.imag], dim=-1) 117 | assert input.shape[-1] == 2 118 | input = input.transpose(1, 2) 119 | 120 | wavs = istft(input, n_fft=self.n_fft, hop_length=self.hop_length, win_length=self.win_length, window=window, center=self.center, 121 | normalized=self.normalized, onesided=self.onesided, length=ilens.max() if ilens is not None else ilens) 122 | 123 | return wavs, ilens 124 | -------------------------------------------------------------------------------- /Modules/ToucanTTS/PitchCalculator.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Nagoya University (Tomoki Hayashi) 2 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 3 | # Adapted by Florian Lux 2021 4 | 5 | import math 6 | 7 | import numpy as np 8 | import parselmouth 9 | import torch 10 | import torch.nn.functional as F 11 | from scipy.interpolate import interp1d 12 | 13 | 14 | class Parselmouth(torch.nn.Module): 15 | """ 16 | F0 estimation with Parselmouth https://parselmouth.readthedocs.io/en/stable/index.html 17 | """ 18 | 19 | def __init__(self, fs=16000, n_fft=1024, hop_length=256, f0min=40, f0max=600, use_token_averaged_f0=True, 20 | use_continuous_f0=True, use_log_f0=False, reduction_factor=1): 21 | super().__init__() 22 | self.fs = fs 23 | self.n_fft = n_fft 24 | self.hop_length = hop_length 25 | self.frame_period = 1000 * hop_length / fs 26 | self.f0min = f0min 27 | self.f0max = f0max 28 | self.use_token_averaged_f0 = use_token_averaged_f0 29 | self.use_continuous_f0 = use_continuous_f0 30 | self.use_log_f0 = use_log_f0 31 | if use_token_averaged_f0: 32 | assert reduction_factor >= 1 33 | self.reduction_factor = reduction_factor 34 | 35 | def output_size(self): 36 | return 1 37 | 38 | def get_parameters(self): 39 | return dict(fs=self.fs, n_fft=self.n_fft, hop_length=self.hop_length, f0min=self.f0min, f0max=self.f0max, 40 | use_token_averaged_f0=self.use_token_averaged_f0, use_continuous_f0=self.use_continuous_f0, use_log_f0=self.use_log_f0, 41 | reduction_factor=self.reduction_factor) 42 | 43 | def forward(self, input_waves, input_waves_lengths=None, feats_lengths=None, durations=None, 44 | durations_lengths=None, norm_by_average=True, text=None): 45 | 46 | # F0 extraction 47 | pitch = self._calculate_f0(input_waves[0]) 48 | 49 | # Adjust length to match with the feature sequences 50 | pitch = self._adjust_num_frames(pitch, feats_lengths[0]).view(-1) 51 | 52 | pitch = self._average_by_duration(pitch, durations[0], text).view(-1) 53 | pitch_lengths = durations_lengths 54 | 55 | if norm_by_average: 56 | average = pitch[pitch != 0.0].mean() 57 | pitch = pitch / average 58 | 59 | # Return with the shape (B, T, 1) 60 | return pitch.unsqueeze(-1), pitch_lengths 61 | 62 | def _calculate_f0(self, input): 63 | x = input.cpu().numpy().astype(np.double) 64 | snd = parselmouth.Sound(values=x, sampling_frequency=self.fs) 65 | f0 = snd.to_pitch(time_step=self.hop_length / self.fs, pitch_floor=self.f0min, pitch_ceiling=self.f0max).selected_array['frequency'] 66 | 
if self.use_continuous_f0: 67 | f0 = self._convert_to_continuous_f0(f0) 68 | if self.use_log_f0: 69 | nonzero_idxs = np.where(f0 != 0)[0] 70 | f0[nonzero_idxs] = np.log(f0[nonzero_idxs]) 71 | return input.new_tensor(f0.reshape(-1), dtype=torch.float) 72 | 73 | @staticmethod 74 | def _adjust_num_frames(x, num_frames): 75 | if num_frames > len(x): 76 | # x = F.pad(x, (0, num_frames - len(x))) 77 | x = F.pad(x, (math.ceil((num_frames - len(x)) / 2), math.floor((num_frames - len(x)) / 2))) 78 | elif num_frames < len(x): 79 | x = x[:num_frames] 80 | return x 81 | 82 | @staticmethod 83 | def _convert_to_continuous_f0(f0: np.array): 84 | if (f0 == 0).all(): 85 | return f0 86 | 87 | # padding start and end of f0 sequence 88 | start_f0 = f0[f0 != 0][0] 89 | end_f0 = f0[f0 != 0][-1] 90 | start_idx = np.where(f0 == start_f0)[0][0] 91 | end_idx = np.where(f0 == end_f0)[0][-1] 92 | f0[:start_idx] = start_f0 93 | f0[end_idx:] = end_f0 94 | 95 | # get non-zero frame index 96 | nonzero_idxs = np.where(f0 != 0)[0] 97 | 98 | # perform linear interpolation 99 | interp_fn = interp1d(nonzero_idxs, f0[nonzero_idxs]) 100 | f0 = interp_fn(np.arange(0, f0.shape[0])) 101 | 102 | return f0 103 | 104 | def _average_by_duration(self, x, d, text=None): 105 | d_cumsum = F.pad(d.cumsum(dim=0), (1, 0)) 106 | x_avg = [ 107 | x[start:end].masked_select(x[start:end].gt(0.0)).mean(dim=0) if len(x[start:end].masked_select(x[start:end].gt(0.0))) != 0 else x.new_tensor(0.0) 108 | for start, end in zip(d_cumsum[:-1], d_cumsum[1:])] 109 | 110 | # find tokens that are not voiced and set pitch to 0 111 | # while this makes sense, it makes it harder for the model to learn, so we leave this out now. 112 | # if text is not None: 113 | # for i, vector in enumerate(text): 114 | # if vector[get_feature_to_index_lookup()["voiced"]] == 0: 115 | # x_avg[i] = torch.tensor(0.0, device=x.device) 116 | 117 | return torch.stack(x_avg) 118 | -------------------------------------------------------------------------------- /Recipes/finetuning_example_multilingual.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example script for fine-tuning the pretrained model to your own data. 3 | 4 | Comments in ALL CAPS are instructions 5 | """ 6 | 7 | import time 8 | 9 | import torch 10 | import wandb 11 | 12 | from Utility.path_to_transcript_dicts import * 13 | 14 | 15 | def run(gpu_id, resume_checkpoint, finetune, model_dir, resume, use_wandb, wandb_resume_id, gpu_count): 16 | from huggingface_hub import hf_hub_download 17 | from torch.utils.data import ConcatDataset 18 | 19 | from Modules.ToucanTTS.ToucanTTS import ToucanTTS 20 | from Modules.ToucanTTS.toucantts_train_loop_arbiter import train_loop 21 | from Utility.corpus_preparation import prepare_tts_corpus 22 | from Utility.storage_config import MODEL_DIR 23 | from Utility.storage_config import PREPROCESSING_DIR 24 | 25 | if gpu_id == "cpu": 26 | device = torch.device("cpu") 27 | else: 28 | device = torch.device("cuda") 29 | assert gpu_count == 1 # distributed finetuning is not supported 30 | 31 | # IF YOU'RE ADDING A NEW LANGUAGE, YOU MIGHT NEED TO ADD HANDLING FOR IT IN Preprocessing/TextFrontend.py 32 | 33 | print("Preparing") 34 | 35 | if model_dir is not None: 36 | save_dir = model_dir 37 | else: 38 | save_dir = os.path.join(MODEL_DIR, "ToucanTTS_German_and_English") # RENAME TO SOMETHING MEANINGFUL FOR YOUR DATA 39 | os.makedirs(save_dir, exist_ok=True) 40 | 41 | all_train_sets = list() # YOU CAN HAVE MULTIPLE LANGUAGES, OR JUST ONE. 
JUST MAKE ONE ConcatDataset PER LANGUAGE AND ADD IT TO THE LIST. 42 | train_samplers = list() 43 | 44 | # ======================= 45 | # = German Data = 46 | # ======================= 47 | german_datasets = list() 48 | german_datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_karlsson(), 49 | corpus_dir=os.path.join(PREPROCESSING_DIR, "Karlsson"), 50 | lang="deu")) # CHANGE THE TRANSCRIPT DICT, THE NAME OF THE CACHE DIRECTORY AND THE LANGUAGE TO YOUR NEEDS 51 | 52 | german_datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_eva(), 53 | corpus_dir=os.path.join(PREPROCESSING_DIR, "Eva"), 54 | lang="deu")) # YOU CAN SIMPLY ADD MORE CORPORA AND DO THE SAME, BUT YOU DON'T HAVE TO, ONE IS ENOUGH 55 | 56 | all_train_sets.append(ConcatDataset(german_datasets)) 57 | 58 | # ======================== 59 | # = English Data = 60 | # ======================== 61 | english_datasets = list() 62 | english_datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_nancy(), 63 | corpus_dir=os.path.join(PREPROCESSING_DIR, "Nancy"), 64 | lang="eng")) 65 | 66 | english_datasets.append(prepare_tts_corpus(transcript_dict=build_path_to_transcript_ljspeech(), 67 | corpus_dir=os.path.join(PREPROCESSING_DIR, "LJSpeech"), 68 | lang="eng")) 69 | 70 | all_train_sets.append(ConcatDataset(english_datasets)) 71 | 72 | model = ToucanTTS() 73 | 74 | for train_set in all_train_sets: 75 | train_samplers.append(torch.utils.data.RandomSampler(train_set)) 76 | 77 | if use_wandb: 78 | wandb.init( 79 | name=f"{__name__.split('.')[-1]}_{time.strftime('%Y%m%d-%H%M%S')}" if wandb_resume_id is None else None, 80 | id=wandb_resume_id, # this is None if not specified in the command line arguments. 81 | resume="must" if wandb_resume_id is not None else None) 82 | 83 | print("Training model") 84 | train_loop(net=model, 85 | datasets=all_train_sets, 86 | device=device, 87 | save_directory=save_dir, 88 | batch_size=12, # YOU MIGHT GET OUT OF MEMORY ISSUES ON SMALL GPUs, IF SO, DECREASE THIS. 89 | eval_lang="deu", # THE LANGUAGE YOUR PROGRESS PLOTS WILL BE MADE IN 90 | warmup_steps=500, 91 | lr=1e-5, # if you have enough data (over ~1000 datapoints) you can increase this up to 1e-4 and it will still be stable, but learn quicker. 
92 | # DOWNLOAD THESE INITIALIZATION MODELS FROM THE RELEASE PAGE OF THE GITHUB OR RUN THE DOWNLOADER SCRIPT TO GET THEM AUTOMATICALLY 93 | path_to_checkpoint=hf_hub_download(cache_dir=MODEL_DIR, repo_id="Flux9665/ToucanTTS", filename="ToucanTTS.pt") if resume_checkpoint is None else resume_checkpoint, 94 | fine_tune=True if resume_checkpoint is None and not resume else finetune, 95 | resume=resume, 96 | steps=5000, 97 | use_wandb=use_wandb, 98 | train_samplers=train_samplers, 99 | gpu_count=1) 100 | if use_wandb: 101 | wandb.finish() 102 | -------------------------------------------------------------------------------- /Modules/GeneralLayers/ConditionalLayerNorm.py: -------------------------------------------------------------------------------- 1 | """ 2 | Code taken from https://github.com/tuanh123789/AdaSpeech/blob/main/model/adaspeech_modules.py 3 | By https://github.com/tuanh123789 4 | No license specified 5 | 6 | Implemented as outlined in AdaSpeech https://arxiv.org/pdf/2103.00993.pdf 7 | Used in this toolkit similar to how it is done in AdaSpeech 4 https://arxiv.org/pdf/2204.00436.pdf 8 | 9 | """ 10 | 11 | import torch 12 | from torch import nn 13 | 14 | 15 | class ConditionalLayerNorm(nn.Module): 16 | 17 | def __init__(self, 18 | hidden_dim, 19 | speaker_embedding_dim, 20 | dim=-1): 21 | super(ConditionalLayerNorm, self).__init__() 22 | self.dim = dim 23 | if isinstance(hidden_dim, int): 24 | self.normal_shape = hidden_dim 25 | self.speaker_embedding_dim = speaker_embedding_dim 26 | self.W_scale = nn.Sequential(nn.Linear(self.speaker_embedding_dim, self.normal_shape), 27 | nn.Tanh(), 28 | nn.Linear(self.normal_shape, self.normal_shape)) 29 | self.W_bias = nn.Sequential(nn.Linear(self.speaker_embedding_dim, self.normal_shape), 30 | nn.Tanh(), 31 | nn.Linear(self.normal_shape, self.normal_shape)) 32 | self.reset_parameters() 33 | 34 | def reset_parameters(self): 35 | torch.nn.init.constant_(self.W_scale[0].weight, 0.0) 36 | torch.nn.init.constant_(self.W_scale[2].weight, 0.0) 37 | torch.nn.init.constant_(self.W_scale[0].bias, 1.0) 38 | torch.nn.init.constant_(self.W_scale[2].bias, 1.0) 39 | torch.nn.init.constant_(self.W_bias[0].weight, 0.0) 40 | torch.nn.init.constant_(self.W_bias[2].weight, 0.0) 41 | torch.nn.init.constant_(self.W_bias[0].bias, 0.0) 42 | torch.nn.init.constant_(self.W_bias[2].bias, 0.0) 43 | 44 | def forward(self, x, speaker_embedding): 45 | 46 | if self.dim != -1: 47 | x = x.transpose(-1, self.dim) 48 | 49 | mean = x.mean(dim=-1, keepdim=True) 50 | var = ((x - mean) ** 2).mean(dim=-1, keepdim=True) 51 | scale = self.W_scale(speaker_embedding) 52 | bias = self.W_bias(speaker_embedding) 53 | 54 | y = scale.unsqueeze(1) * ((x - mean) / var) + bias.unsqueeze(1) 55 | 56 | if self.dim != -1: 57 | y = y.transpose(-1, self.dim) 58 | 59 | return y 60 | 61 | 62 | class SequentialWrappableConditionalLayerNorm(nn.Module): 63 | 64 | def __init__(self, 65 | hidden_dim, 66 | speaker_embedding_dim): 67 | super(SequentialWrappableConditionalLayerNorm, self).__init__() 68 | if isinstance(hidden_dim, int): 69 | self.normal_shape = hidden_dim 70 | self.speaker_embedding_dim = speaker_embedding_dim 71 | self.W_scale = nn.Sequential(nn.Linear(self.speaker_embedding_dim, self.normal_shape), 72 | nn.Tanh(), 73 | nn.Linear(self.normal_shape, self.normal_shape)) 74 | self.W_bias = nn.Sequential(nn.Linear(self.speaker_embedding_dim, self.normal_shape), 75 | nn.Tanh(), 76 | nn.Linear(self.normal_shape, self.normal_shape)) 77 | self.reset_parameters() 78 | 79 | def reset_parameters(self): 
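        # Initialization note: the scale projection starts with zero weights and biases of one, while the
        # bias projection starts entirely at zero, so directly after initialization the module applies a
        # speaker-independent normalization and only learns to condition on the speaker embedding during training.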
80 | torch.nn.init.constant_(self.W_scale[0].weight, 0.0) 81 | torch.nn.init.constant_(self.W_scale[2].weight, 0.0) 82 | torch.nn.init.constant_(self.W_scale[0].bias, 1.0) 83 | torch.nn.init.constant_(self.W_scale[2].bias, 1.0) 84 | torch.nn.init.constant_(self.W_bias[0].weight, 0.0) 85 | torch.nn.init.constant_(self.W_bias[2].weight, 0.0) 86 | torch.nn.init.constant_(self.W_bias[0].bias, 0.0) 87 | torch.nn.init.constant_(self.W_bias[2].bias, 0.0) 88 | 89 | def forward(self, packed_input): 90 | x, speaker_embedding = packed_input 91 | mean = x.mean(dim=-1, keepdim=True) 92 | var = ((x - mean) ** 2).mean(dim=-1, keepdim=True) 93 | scale = self.W_scale(speaker_embedding) 94 | bias = self.W_bias(speaker_embedding) 95 | 96 | y = scale.unsqueeze(1) * ((x - mean) / var) + bias.unsqueeze(1) 97 | 98 | return y 99 | 100 | 101 | class AdaIN1d(nn.Module): 102 | """ 103 | MIT Licensed 104 | 105 | Copyright (c) 2022 Aaron (Yinghao) Li 106 | https://github.com/yl4579/StyleTTS/blob/main/models.py 107 | """ 108 | 109 | def __init__(self, style_dim, num_features): 110 | super().__init__() 111 | self.norm = nn.InstanceNorm1d(num_features, affine=False) 112 | self.fc = nn.Linear(style_dim, num_features * 2) 113 | 114 | def forward(self, x, s): 115 | h = self.fc(s) 116 | h = h.view(h.size(0), h.size(1), 1) 117 | gamma, beta = torch.chunk(h, chunks=2, dim=1) 118 | return (1 + gamma.transpose(1, 2)) * self.norm(x.transpose(1, 2)).transpose(1, 2) + beta.transpose(1, 2) 119 | -------------------------------------------------------------------------------- /Modules/Vocoder/AdversarialLoss.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2021 Tomoki Hayashi 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | 7 | import torch 8 | import torch.nn.functional as F 9 | 10 | 11 | def discriminator_adv_loss(disc_real_outputs, disc_generated_outputs): 12 | loss = 0 13 | for dr, dg in zip(disc_real_outputs, disc_generated_outputs): 14 | dr_fun, dr_dir = dr 15 | dg_fun, dg_dir = dg 16 | r_loss_fun = torch.mean(F.softplus(1 - dr_fun) ** 2) 17 | g_loss_fun = torch.mean(F.softplus(dg_fun) ** 2) 18 | r_loss_dir = torch.mean(F.softplus(1 - dr_dir) ** 2) 19 | g_loss_dir = torch.mean(-F.softplus(1 - dg_dir) ** 2) 20 | r_loss = r_loss_fun + r_loss_dir 21 | g_loss = g_loss_fun + g_loss_dir 22 | loss += (r_loss + g_loss) 23 | 24 | return loss / len(disc_generated_outputs) 25 | 26 | 27 | def generator_adv_loss(disc_outputs): 28 | loss = 0 29 | for dg in disc_outputs: 30 | l = torch.mean(F.softplus(1 - dg) ** 2) 31 | loss += l 32 | 33 | return loss / len(disc_outputs) 34 | 35 | 36 | class GeneratorAdversarialLoss(torch.nn.Module): 37 | 38 | def __init__(self, 39 | average_by_discriminators=True, 40 | loss_type="mse", ): 41 | """Initialize GeneratorAdversarialLoss module.""" 42 | super().__init__() 43 | self.average_by_discriminators = average_by_discriminators 44 | assert loss_type in ["mse", "hinge"], f"{loss_type} is not supported." 45 | if loss_type == "mse": 46 | self.criterion = self._mse_loss 47 | else: 48 | self.criterion = self._hinge_loss 49 | 50 | def forward(self, outputs): 51 | """ 52 | Calculate generator adversarial loss. 53 | 54 | Args: 55 | outputs (Tensor or list): Discriminator outputs or list of 56 | discriminator outputs. 57 | 58 | Returns: 59 | Tensor: Generator adversarial loss value. 
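        Note:
            With the "mse" criterion this is the mean squared error between the discriminator
            outputs and an all-ones target; with "hinge" it is the negated mean of the outputs.
            For a list of discriminator outputs, the per-discriminator losses are summed and, if
            average_by_discriminators is set, divided by the number of discriminators.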
60 | """ 61 | if isinstance(outputs, (tuple, list)): 62 | adv_loss = 0.0 63 | for i, outputs_ in enumerate(outputs): 64 | if isinstance(outputs_, (tuple, list)): 65 | outputs_ = outputs_[-1] 66 | adv_loss = adv_loss + self.criterion(outputs_) 67 | if self.average_by_discriminators: 68 | adv_loss /= i + 1 69 | else: 70 | adv_loss = self.criterion(outputs) 71 | 72 | return adv_loss 73 | 74 | def _mse_loss(self, x): 75 | return F.mse_loss(x, x.new_ones(x.size())) 76 | 77 | def _hinge_loss(self, x): 78 | return -x.mean() 79 | 80 | 81 | class DiscriminatorAdversarialLoss(torch.nn.Module): 82 | 83 | def __init__(self, 84 | average_by_discriminators=True, 85 | loss_type="mse", ): 86 | super().__init__() 87 | self.average_by_discriminators = average_by_discriminators 88 | assert loss_type in ["mse", "hinge"], f"{loss_type} is not supported." 89 | if loss_type == "mse": 90 | self.fake_criterion = self._mse_fake_loss 91 | self.real_criterion = self._mse_real_loss 92 | else: 93 | self.fake_criterion = self._hinge_fake_loss 94 | self.real_criterion = self._hinge_real_loss 95 | 96 | def forward(self, outputs_hat, outputs): 97 | """ 98 | Calculate discriminator adversarial loss. 99 | 100 | Args: 101 | outputs_hat (Tensor or list): Discriminator outputs or list of 102 | discriminator outputs calculated from generator outputs. 103 | outputs (Tensor or list): Discriminator outputs or list of 104 | discriminator outputs calculated from groundtruth. 105 | 106 | Returns: 107 | Tensor: Sum of the discriminator real loss and 108 | the discriminator fake loss. 109 | """ 110 | if isinstance(outputs, (tuple, list)): 111 | real_loss = 0.0 112 | fake_loss = 0.0 113 | for i, (outputs_hat_, outputs_) in enumerate(zip(outputs_hat, outputs)): 114 | if isinstance(outputs_hat_, (tuple, list)): 115 | outputs_hat_ = outputs_hat_[-1] 116 | outputs_ = outputs_[-1] 117 | real_loss = real_loss + self.real_criterion(outputs_) 118 | fake_loss = fake_loss + self.fake_criterion(outputs_hat_) 119 | if self.average_by_discriminators: 120 | fake_loss /= i + 1 121 | real_loss /= i + 1 122 | else: 123 | real_loss = self.real_criterion(outputs) 124 | fake_loss = self.fake_criterion(outputs_hat) 125 | 126 | return real_loss + fake_loss 127 | 128 | def _mse_real_loss(self, x): 129 | return F.mse_loss(x, x.new_ones(x.size())) 130 | 131 | def _mse_fake_loss(self, x): 132 | return F.mse_loss(x, x.new_zeros(x.size())) 133 | 134 | def _hinge_real_loss(self, x): 135 | return -torch.mean(torch.min(x - 1, x.new_zeros(x.size()))) 136 | 137 | def _hinge_fake_loss(self, x): 138 | return -torch.mean(torch.min(-x - 1, x.new_zeros(x.size()))) 139 | -------------------------------------------------------------------------------- /Preprocessing/multilinguality/generate_zero_shot_lang_embs.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import torch 8 | from huggingface_hub import hf_hub_download 9 | from tqdm import tqdm 10 | 11 | from Utility.storage_config import MODEL_DIR 12 | 13 | 14 | def approximate_and_inject_language_embeddings(model_path, df, iso_lookup, min_n_langs=5, max_n_langs=25, threshold_percentile=50): 15 | # load pretrained language_embeddings 16 | model = torch.load(model_path, map_location="cpu") 17 | lang_embs = model["model"]["encoder.language_embedding.weight"] 18 | 19 | features_per_closest_lang = 2 20 | # for combined, df has up to 5 features (if containing individual distances) per 
closest lang + 1 target lang column 21 | if "combined_dist_0" in df.columns: 22 | if "map_dist_0" in df.columns: 23 | features_per_closest_lang += 1 24 | if "asp_dist_0" in df.columns: 25 | features_per_closest_lang += 1 26 | if "tree_dist_0" in df.columns: 27 | features_per_closest_lang += 1 28 | n_closest = len(df.columns) // features_per_closest_lang 29 | distance_type = "combined" 30 | # else, df has 2 features per closest lang + 1 target lang column 31 | else: 32 | n_closest = len(df.columns) // features_per_closest_lang 33 | if "map_dist_0" in df.columns: 34 | distance_type = "map" 35 | elif "tree_dist_0" in df.columns: 36 | distance_type = "tree" 37 | elif "asp_dist_0" in df.columns: 38 | distance_type = "asp" 39 | elif "learned_dist_0" in df.columns: 40 | distance_type = "learned" 41 | else: 42 | distance_type = "random" 43 | 44 | # get relevant columns 45 | closest_lang_columns = [f"closest_lang_{i}" for i in range(n_closest)] 46 | closest_dist_columns = [f"{distance_type}_dist_{i}" for i in range(n_closest)] 47 | closest_lang_columns = closest_lang_columns[:max_n_langs] 48 | closest_dist_columns = closest_dist_columns[:max_n_langs] 49 | assert df[closest_dist_columns[-1]].isna().sum().sum() == 0 50 | 51 | # get threshold based on distance of a certain percentile of the furthest language across all samples 52 | threshold = np.percentile(df[closest_dist_columns[-1]], threshold_percentile) 53 | print(f"threshold: {threshold:.4f}") 54 | for row in tqdm(df.itertuples(), total=df.shape[0], desc="Approximating language embeddings"): 55 | avg_emb = torch.zeros([32]) # If you change the size of the language embedding in the model, you need to change the size here as well. 56 | dists = [getattr(row, d) for i, d in enumerate(closest_dist_columns) if i < min_n_langs or getattr(row, d) < threshold] 57 | langs = [getattr(row, l) for l in closest_lang_columns[:len(dists)]] 58 | 59 | for lang in langs: 60 | lang_emb = lang_embs[iso_lookup[-1][str(lang)]] 61 | avg_emb += lang_emb 62 | avg_emb /= len(langs) # normalize 63 | lang_embs[iso_lookup[-1][str(row.target_lang)]] = avg_emb 64 | 65 | # inject language embeddings into Toucan model and save 66 | model["model"]["encoder.language_embedding.weight"] = lang_embs 67 | modified_model_path = model_path.split(".")[0] + "_zeroshot_lang_embs.pt" 68 | torch.save(model, modified_model_path) 69 | print(f"Replaced unsupervised language embeddings with zero-shot approximations.\nSaved modified model to {modified_model_path}") 70 | 71 | 72 | if __name__ == "__main__": 73 | default_model_path = os.path.join(MODEL_DIR, "ToucanTTS_Meta", "best.pt") # MODEL_DIR must be absolute path, the relative path will fail at this location 74 | default_csv_path = "distance_datasets/dataset_learned_top30.csv" 75 | parser = argparse.ArgumentParser() 76 | parser.add_argument("--model_path", type=str, default=default_model_path, help="path of the model for which the language embeddings should be modified") 77 | parser.add_argument("--dataset_path", type=str, default=default_csv_path, help="path to distance dataset CSV") 78 | parser.add_argument("--min_n_langs", type=int, default=5, help="minimum amount of languages used for averaging") 79 | parser.add_argument("--max_n_langs", type=int, default=25, help="maximum amount of languages used for averaging") 80 | parser.add_argument("--threshold_percentile", type=int, default=50, help="percentile of the furthest used languages \ 81 | used as cutoff threshold (no langs >= the threshold are used for averaging)") 82 | args = 
parser.parse_args() 83 | ISO_LOOKUP_PATH = hf_hub_download(cache_dir=MODEL_DIR, repo_id="Flux9665/ToucanTTS", filename="iso_lookup.json") 84 | with open(ISO_LOOKUP_PATH, "r") as f: 85 | iso_lookup = json.load(f) # iso_lookup[-1] = iso2id mapping 86 | # load language distance dataset 87 | distance_df = pd.read_csv(args.dataset_path, sep="|") 88 | approximate_and_inject_language_embeddings(model_path=args.model_path, 89 | df=distance_df, 90 | iso_lookup=iso_lookup, 91 | min_n_langs=args.min_n_langs, 92 | max_n_langs=args.max_n_langs, 93 | threshold_percentile=args.threshold_percentile) 94 | -------------------------------------------------------------------------------- /Utility/diverse_losses.py: -------------------------------------------------------------------------------- 1 | # adapted from https://github.com/facebookresearch/barlowtwins 2 | 3 | from math import exp 4 | 5 | import torch 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | 9 | 10 | class RedundancyReduction(torch.nn.Module): 11 | 12 | def __init__(self, lambd=1e-5, vector_dimensions=256): 13 | super().__init__() 14 | self.lambd = lambd 15 | self.bn = torch.nn.BatchNorm1d(vector_dimensions, affine=False) 16 | 17 | def forward(self, z1, z2): 18 | c = self.bn(z1).T @ self.bn(z2) 19 | c.div_(z1.size(0)) 20 | off_diag = off_diagonal(c).pow_(2).sum() 21 | return self.lambd * off_diag 22 | 23 | 24 | class BarlowTwinsLoss(torch.nn.Module): 25 | 26 | def __init__(self, lambd=1e-5, vector_dimensions=256): 27 | super().__init__() 28 | self.lambd = lambd 29 | self.bn = torch.nn.BatchNorm1d(vector_dimensions, affine=False) 30 | 31 | def forward(self, z1, z2): 32 | c = self.bn(z1).T @ self.bn(z2) 33 | c.div_(z1.size(0)) 34 | on_diag = torch.diagonal(c).add_(-1).pow_(2).sum() 35 | off_diag = off_diagonal(c).pow_(2).sum() 36 | loss = on_diag + self.lambd * off_diag 37 | return loss 38 | 39 | 40 | def off_diagonal(x): 41 | # return a flattened view of the off-diagonal elements of a square matrix 42 | n, m = x.shape 43 | assert n == m 44 | return x.flatten()[:-1].view(n - 1, n + 1)[:, 1:].flatten() 45 | 46 | 47 | class TripletLoss(torch.nn.Module): 48 | 49 | def __init__(self, margin): 50 | super().__init__() 51 | self.cosine_similarity = torch.nn.CosineSimilarity() 52 | self.margin = margin 53 | 54 | def forward(self, 55 | anchor_embeddings, 56 | positive_embeddings, 57 | negative_embeddings): 58 | positive_distance = 1 - self.cosine_similarity(anchor_embeddings, positive_embeddings) 59 | negative_distance = 1 - self.cosine_similarity(anchor_embeddings, negative_embeddings) 60 | 61 | losses = torch.max(positive_distance - negative_distance + self.margin, 62 | torch.full_like(positive_distance, 0)) 63 | return torch.mean(losses) 64 | 65 | 66 | # The following is taken from https://github.com/NATSpeech/NATSpeech/blob/aef3aa8899c82e40a28e4f59d559b46b18ba87e8/utils/metrics/ssim.py 67 | 68 | def gaussian(window_size, sigma): 69 | gauss = torch.Tensor([exp(-(x - window_size // 2) ** 2 / float(2 * sigma ** 2)) for x in range(window_size)]) 70 | return gauss / gauss.sum() 71 | 72 | 73 | def create_window(window_size, channel): 74 | _1D_window = gaussian(window_size, 1.5).unsqueeze(1) 75 | _2D_window = _1D_window.mm(_1D_window.t()).float().unsqueeze(0).unsqueeze(0) 76 | window = Variable(_2D_window.expand(channel, 1, window_size, window_size).contiguous()) 77 | return window 78 | 79 | 80 | def _ssim(img1, img2, window, window_size, channel, size_average=True): 81 | mu1 = F.conv2d(img1, window, padding=window_size // 2, 
groups=channel) 82 | mu2 = F.conv2d(img2, window, padding=window_size // 2, groups=channel) 83 | 84 | mu1_sq = mu1.pow(2) 85 | mu2_sq = mu2.pow(2) 86 | mu1_mu2 = mu1 * mu2 87 | 88 | sigma1_sq = F.conv2d(img1 * img1, window, padding=window_size // 2, groups=channel) - mu1_sq 89 | sigma2_sq = F.conv2d(img2 * img2, window, padding=window_size // 2, groups=channel) - mu2_sq 90 | sigma12 = F.conv2d(img1 * img2, window, padding=window_size // 2, groups=channel) - mu1_mu2 91 | 92 | C1 = 0.01 ** 2 93 | C2 = 0.03 ** 2 94 | 95 | ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / ((mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2)) 96 | 97 | if size_average: 98 | return ssim_map.mean() 99 | else: 100 | return ssim_map.mean(1) 101 | 102 | 103 | class SSIM(torch.nn.Module): 104 | """ 105 | Adapted from https://github.com/Po-Hsun-Su/pytorch-ssim 106 | """ 107 | 108 | def __init__(self, window_size=11, size_average=True): 109 | super(SSIM, self).__init__() 110 | self.window_size = window_size 111 | self.size_average = size_average 112 | self.channel = 1 113 | self.window = create_window(window_size, self.channel) 114 | 115 | def forward(self, img1, img2): 116 | (_, channel, _, _) = img1.size() 117 | 118 | if channel == self.channel and self.window.data.type() == img1.data.type(): 119 | window = self.window 120 | else: 121 | window = create_window(self.window_size, channel) 122 | 123 | if img1.is_cuda: 124 | window = window.cuda(img1.get_device()) 125 | window = window.type_as(img1) 126 | 127 | self.window = window 128 | self.channel = channel 129 | 130 | return _ssim(img1, img2, window, self.window_size, channel, self.size_average) 131 | 132 | 133 | window = None 134 | 135 | 136 | def ssim(img1, img2, window_size=11, size_average=True): 137 | (_, channel, _, _) = img1.size() 138 | global window 139 | if window is None: 140 | window = create_window(window_size, channel) 141 | if img1.is_cuda: 142 | window = window.cuda(img1.get_device()) 143 | window = window.type_as(img1) 144 | return _ssim(img1, img2, window, window_size, channel, size_average) 145 | -------------------------------------------------------------------------------- /Modules/ToucanTTS/wavenet.py: -------------------------------------------------------------------------------- 1 | """ 2 | MIT License 3 | 4 | Copyright (c) 2022 Yi Ren 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 
23 | """ 24 | 25 | import torch 26 | from torch import nn 27 | 28 | 29 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): 30 | n_channels_int = n_channels[0] 31 | in_act = input_a + input_b 32 | t_act = torch.tanh(in_act[:, :n_channels_int, :]) 33 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) 34 | acts = t_act * s_act 35 | return acts 36 | 37 | 38 | class WN(torch.nn.Module): 39 | 40 | def __init__(self, hidden_size, kernel_size, dilation_rate, n_layers, c_cond=0, 41 | p_dropout=0, share_cond_layers=False, is_BTC=False, use_weightnorm=True): 42 | super(WN, self).__init__() 43 | assert (kernel_size % 2 == 1) 44 | assert (hidden_size % 2 == 0) 45 | self.is_BTC = is_BTC 46 | self.hidden_size = hidden_size 47 | self.kernel_size = kernel_size 48 | self.dilation_rate = dilation_rate 49 | self.n_layers = n_layers 50 | self.gin_channels = c_cond 51 | self.p_dropout = p_dropout 52 | self.share_cond_layers = share_cond_layers 53 | 54 | self.in_layers = torch.nn.ModuleList() 55 | self.res_skip_layers = torch.nn.ModuleList() 56 | self.drop = nn.Dropout(p_dropout) 57 | 58 | if c_cond != 0 and not share_cond_layers: 59 | cond_layer = torch.nn.Conv1d(c_cond, 2 * hidden_size * n_layers, 1) 60 | if use_weightnorm: 61 | self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight') 62 | else: 63 | self.cond_layer = cond_layer 64 | 65 | for i in range(n_layers): 66 | dilation = dilation_rate ** i 67 | padding = int((kernel_size * dilation - dilation) / 2) 68 | in_layer = torch.nn.Conv1d(hidden_size, 2 * hidden_size, kernel_size, 69 | dilation=dilation, padding=padding) 70 | if use_weightnorm: 71 | in_layer = torch.nn.utils.weight_norm(in_layer, name='weight') 72 | self.in_layers.append(in_layer) 73 | 74 | # last one is not necessary 75 | if i < n_layers - 1: 76 | res_skip_channels = 2 * hidden_size 77 | else: 78 | res_skip_channels = hidden_size 79 | 80 | res_skip_layer = torch.nn.Conv1d(hidden_size, res_skip_channels, 1) 81 | if use_weightnorm: 82 | res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight') 83 | self.res_skip_layers.append(res_skip_layer) 84 | 85 | def forward(self, x, nonpadding=None, cond=None): 86 | if self.is_BTC: 87 | x = x.transpose(1, 2) 88 | cond = cond.transpose(1, 2) if cond is not None else None 89 | nonpadding = nonpadding.transpose(1, 2) if nonpadding is not None else None 90 | if nonpadding is None: 91 | nonpadding = 1 92 | output = torch.zeros_like(x) 93 | n_channels_tensor = torch.IntTensor([self.hidden_size]) 94 | 95 | if cond is not None and not self.share_cond_layers: 96 | cond = self.cond_layer(cond) 97 | 98 | for i in range(self.n_layers): 99 | x_in = self.in_layers[i](x) 100 | x_in = self.drop(x_in) 101 | if cond is not None: 102 | cond_offset = i * 2 * self.hidden_size 103 | cond_l = cond[:, cond_offset:cond_offset + 2 * self.hidden_size, :] 104 | else: 105 | cond_l = torch.zeros_like(x_in) 106 | 107 | acts = fused_add_tanh_sigmoid_multiply(x_in, cond_l, n_channels_tensor) 108 | 109 | res_skip_acts = self.res_skip_layers[i](acts) 110 | if i < self.n_layers - 1: 111 | x = (x + res_skip_acts[:, :self.hidden_size, :]) * nonpadding 112 | output = output + res_skip_acts[:, self.hidden_size:, :] 113 | else: 114 | output = output + res_skip_acts 115 | output = output * nonpadding 116 | if self.is_BTC: 117 | output = output.transpose(1, 2) 118 | return output 119 | 120 | def remove_weight_norm(self): 121 | def remove_weight_norm(m): 122 | try: 123 | nn.utils.remove_weight_norm(m) 124 | except ValueError: # this module didn't 
have weight norm 125 | return 126 | 127 | self.apply(remove_weight_norm) 128 | -------------------------------------------------------------------------------- /Modules/Vocoder/MelSpecLoss.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Tomoki Hayashi 2 | # MIT License (https://opensource.org/licenses/MIT) 3 | # Adapted by Florian Lux 2021 4 | 5 | 6 | import librosa 7 | import torch 8 | import torch.nn.functional as F 9 | 10 | 11 | class MelSpectrogram(torch.nn.Module): 12 | 13 | def __init__(self, 14 | fs=24000, 15 | fft_size=1536, 16 | hop_size=384, 17 | win_length=None, 18 | window="hann", 19 | num_mels=100, 20 | fmin=60, 21 | fmax=None, 22 | center=True, 23 | normalized=False, 24 | onesided=True, 25 | eps=1e-10, 26 | log_base=10.0, ): 27 | super().__init__() 28 | self.fft_size = fft_size 29 | if win_length is None: 30 | self.win_length = fft_size 31 | else: 32 | self.win_length = win_length 33 | self.hop_size = hop_size 34 | self.center = center 35 | self.normalized = normalized 36 | self.onesided = onesided 37 | if window is not None and not hasattr(torch, f"{window}_window"): 38 | raise ValueError(f"{window} window is not implemented") 39 | self.window = window 40 | self.eps = eps 41 | 42 | fmin = 0 if fmin is None else fmin 43 | fmax = fs / 2 if fmax is None else fmax 44 | melmat = librosa.filters.mel(sr=fs, 45 | n_fft=fft_size, 46 | n_mels=num_mels, 47 | fmin=fmin, 48 | fmax=fmax, ) 49 | self.register_buffer("melmat", torch.from_numpy(melmat.T).float()) 50 | self.stft_params = { 51 | "n_fft" : self.fft_size, 52 | "win_length": self.win_length, 53 | "hop_length": self.hop_size, 54 | "center" : self.center, 55 | "normalized": self.normalized, 56 | "onesided" : self.onesided, 57 | } 58 | self.stft_params["return_complex"] = False 59 | 60 | self.log_base = log_base 61 | if self.log_base is None: 62 | self.log = torch.log 63 | elif self.log_base == 2.0: 64 | self.log = torch.log2 65 | elif self.log_base == 10.0: 66 | self.log = torch.log10 67 | else: 68 | raise ValueError(f"log_base: {log_base} is not supported.") 69 | 70 | def forward(self, x): 71 | """ 72 | Calculate Mel-spectrogram. 73 | 74 | Args: 75 | x (Tensor): Input waveform tensor (B, T) or (B, 1, T). 76 | 77 | Returns: 78 | Tensor: Mel-spectrogram (B, #mels, #frames). 
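            Note:
                The amplitude spectrogram obtained via torch.stft is projected through the mel
                filterbank registered in __init__, clamped to eps, and returned in the configured
                log base (log10 by default).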
79 | """ 80 | if x.dim() == 3: 81 | # (B, C, T) -> (B*C, T) 82 | x = x.reshape(-1, x.size(2)) 83 | 84 | if self.window is not None: 85 | window_func = getattr(torch, f"{self.window}_window") 86 | window = window_func(self.win_length, dtype=x.dtype, device=x.device) 87 | else: 88 | window = None 89 | 90 | x_stft = torch.stft(x, window=window, **self.stft_params) 91 | # (B, #freqs, #frames, 2) -> (B, $frames, #freqs, 2) 92 | x_stft = x_stft.transpose(1, 2) 93 | x_power = x_stft[..., 0] ** 2 + x_stft[..., 1] ** 2 94 | x_amp = torch.sqrt(torch.clamp(x_power, min=self.eps)) 95 | 96 | x_mel = torch.matmul(x_amp, self.melmat) 97 | x_mel = torch.clamp(x_mel, min=self.eps) 98 | 99 | return self.log(x_mel).transpose(1, 2) 100 | 101 | 102 | class MelSpectrogramLoss(torch.nn.Module): 103 | 104 | def __init__(self, 105 | fs=24000, 106 | fft_size=1024, 107 | hop_size=256, 108 | win_length=None, 109 | window="hann", 110 | num_mels=128, 111 | fmin=20, 112 | fmax=None, 113 | center=True, 114 | normalized=False, 115 | onesided=True, 116 | eps=1e-10, 117 | log_base=10.0, ): 118 | super().__init__() 119 | self.mel_spectrogram = MelSpectrogram(fs=fs, 120 | fft_size=fft_size, 121 | hop_size=hop_size, 122 | win_length=win_length, 123 | window=window, 124 | num_mels=num_mels, 125 | fmin=fmin, 126 | fmax=fmax, 127 | center=center, 128 | normalized=normalized, 129 | onesided=onesided, 130 | eps=eps, 131 | log_base=log_base, ) 132 | 133 | def forward(self, y_hat, y): 134 | """ 135 | Calculate Mel-spectrogram loss. 136 | 137 | Args: 138 | y_hat (Tensor): Generated single tensor (B, 1, T). 139 | y (Tensor): Groundtruth single tensor (B, 1, T). 140 | 141 | Returns: 142 | Tensor: Mel-spectrogram loss value. 143 | """ 144 | mel_hat = self.mel_spectrogram(y_hat) 145 | mel = self.mel_spectrogram(y) 146 | mel_loss = F.l1_loss(mel_hat, mel) 147 | 148 | return mel_loss 149 | -------------------------------------------------------------------------------- /run_simple_GUI_demo.py: -------------------------------------------------------------------------------- 1 | import gradio as gr 2 | import torch.cuda 3 | from huggingface_hub import hf_hub_download 4 | 5 | from InferenceInterfaces.ControllableInterface import ControllableInterface 6 | from Utility.storage_config import MODEL_DIR 7 | from Utility.utils import float2pcm 8 | from Utility.utils import load_json_from_path 9 | 10 | 11 | class TTSWebUI: 12 | 13 | def __init__(self, 14 | gpu_id="cpu", 15 | title="Controllable Text-to-Speech for over 7000 Languages", 16 | article="", 17 | tts_model_path=None, 18 | vocoder_model_path=None, 19 | embedding_gan_path=None, 20 | available_artificial_voices=10 # be careful with this, if you want too many, it might lead to an endless loop 21 | ): 22 | path_to_iso_list = hf_hub_download(cache_dir=MODEL_DIR, repo_id="Flux9665/ToucanTTS", filename="iso_to_fullname.json") 23 | iso_to_name = load_json_from_path(path_to_iso_list) 24 | text_selection = [f"{iso_to_name[iso_code]} ({iso_code})" for iso_code in iso_to_name] 25 | # accent_selection = [f"{iso_to_name[iso_code]} Accent ({iso_code})" for iso_code in iso_to_name] 26 | 27 | self.controllable_ui = ControllableInterface(gpu_id=gpu_id, 28 | available_artificial_voices=available_artificial_voices, 29 | tts_model_path=tts_model_path, 30 | vocoder_model_path=vocoder_model_path, 31 | embedding_gan_path=embedding_gan_path) 32 | self.iface = gr.Interface(fn=self.read, 33 | inputs=[gr.Textbox(lines=2, 34 | placeholder="write what you want the synthesis to read here...", 35 | value="What I cannot 
create, I do not understand.", 36 | label="Text input"), 37 | gr.Dropdown(text_selection, 38 | type="value", 39 | value='English (eng)', 40 | label="Select the Language of the Text (type on your keyboard to find it quickly)"), 41 | gr.Slider(minimum=0.0, maximum=0.8, step=0.1, value=0.5, label="Prosody Creativity"), 42 | gr.Slider(minimum=0.7, maximum=1.3, step=0.1, value=1.0, label="Faster - Slower"), 43 | gr.Slider(minimum=0, maximum=available_artificial_voices, step=1, value=5, label="Random Seed for the artificial Voice"), 44 | gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, value=0.0, label="Gender of artificial Voice"), 45 | gr.Audio(type="filepath", show_label=True, container=True, label="[OPTIONAL] Voice to Clone (if left empty, will use an artificial voice instead)"), 46 | # gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Pitch Variance Scale"), 47 | # gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Energy Variance Scale"), 48 | # gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, value=0.0, label="Voice Depth") 49 | ], 50 | outputs=[gr.Audio(type="numpy", label="Speech"), 51 | gr.Image(label="Visualization")], 52 | title=title, 53 | allow_flagging="never", 54 | article=article, 55 | theme=gr.themes.Ocean(primary_hue="amber", secondary_hue="orange")) 56 | self.iface.launch() 57 | 58 | def read(self, 59 | prompt, 60 | language, 61 | prosody_creativity, 62 | duration_scaling_factor, 63 | voice_seed, 64 | emb1, 65 | reference_audio, 66 | # pitch_variance_scale, 67 | # energy_variance_scale, 68 | # emb2 69 | ): 70 | sr, wav, fig = self.controllable_ui.read(prompt, 71 | reference_audio, 72 | language.split(" ")[-1].split("(")[1].split(")")[0], 73 | language.split(" ")[-1].split("(")[1].split(")")[0], 74 | voice_seed, 75 | prosody_creativity, 76 | duration_scaling_factor, 77 | 1., 78 | 1.0, 79 | 1.0, 80 | emb1, 81 | 0., 82 | 0., 83 | 0., 84 | 0., 85 | 0., 86 | -24.) 
87 | return (sr, float2pcm(wav)), fig 88 | 89 | 90 | if __name__ == '__main__': 91 | TTSWebUI(gpu_id="cuda" if torch.cuda.is_available() else "cpu") 92 | -------------------------------------------------------------------------------- /Modules/ToucanTTS/flow_matching.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copied from https://github.com/KdaiP/StableTTS by https://github.com/KdaiP 3 | 4 | https://github.com/KdaiP/StableTTS/blob/eebb177ebf195fd1246dedabec4ef69d9351a4f8/models/flow_matching.py 5 | 6 | Code is under MIT License 7 | """ 8 | 9 | import imageio 10 | import torch 11 | import torch.nn.functional as F 12 | 13 | from Modules.ToucanTTS.dit_wrapper import Decoder 14 | from Utility.utils import plot_spec_tensor 15 | 16 | 17 | # copied from https://github.com/jaywalnut310/vits/blob/main/commons.py#L121 18 | def sequence_mask(length: torch.Tensor, max_length: int = None) -> torch.Tensor: 19 | if max_length is None: 20 | max_length = length.max() 21 | x = torch.arange(max_length, dtype=length.dtype, device=length.device) 22 | return x.unsqueeze(0) < length.unsqueeze(1) 23 | 24 | 25 | # modified from https://github.com/shivammehta25/Matcha-TTS/blob/main/matcha/models/components/flow_matching.py 26 | class CFMDecoder(torch.nn.Module): 27 | def __init__(self, hidden_channels, out_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, gin_channels): 28 | super().__init__() 29 | self.hidden_channels = hidden_channels 30 | self.out_channels = out_channels 31 | self.filter_channels = filter_channels 32 | self.gin_channels = gin_channels 33 | self.sigma_min = 1e-4 34 | 35 | self.estimator = Decoder(hidden_channels, out_channels, filter_channels, p_dropout, n_layers, n_heads, kernel_size, gin_channels) 36 | 37 | @torch.inference_mode() 38 | def forward(self, mu, mask, n_timesteps, temperature=1.0, c=None): 39 | """Forward diffusion 40 | 41 | Args: 42 | mu (torch.Tensor): output of encoder 43 | shape: (batch_size, n_feats, mel_timesteps) 44 | mask (torch.Tensor): output_mask 45 | shape: (batch_size, 1, mel_timesteps) 46 | n_timesteps (int): number of diffusion steps 47 | temperature (float, optional): temperature for scaling noise. Defaults to 1.0. 48 | c (torch.Tensor, optional): shape: (batch_size, gin_channels) 49 | 50 | Returns: 51 | sample: generated mel-spectrogram 52 | shape: (batch_size, n_feats, mel_timesteps) 53 | """ 54 | size = list(mu.size()) 55 | size[1] = self.out_channels 56 | z = torch.randn(size=size).to(mu.device) * temperature 57 | t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device) 58 | return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, c=c) 59 | 60 | def solve_euler(self, x, t_span, mu, mask, c, plot_solutions=False): 61 | """ 62 | Fixed euler solver for ODEs. 63 | Args: 64 | x (torch.Tensor): random noise 65 | t_span (torch.Tensor): n_timesteps interpolated 66 | shape: (n_timesteps + 1,) 67 | mu (torch.Tensor): output of encoder 68 | shape: (batch_size, n_feats, mel_timesteps) 69 | mask (torch.Tensor): output_mask 70 | shape: (batch_size, 1, mel_timesteps) 71 | c (torch.Tensor, optional): speaker condition. 
72 | shape: (batch_size, gin_channels) 73 | """ 74 | t, _, dt = t_span[0], t_span[-1], t_span[1] - t_span[0] 75 | 76 | sol = [] 77 | 78 | for step in range(1, len(t_span)): 79 | 80 | dphi_dt = self.estimator(x, mask, mu, t, c) 81 | 82 | x = x + dt * dphi_dt 83 | t = t + dt 84 | sol.append(x) 85 | if step < len(t_span) - 1: 86 | dt = t_span[step + 1] - t 87 | 88 | if plot_solutions: 89 | create_plot_of_all_solutions(sol) 90 | 91 | return sol[-1] 92 | 93 | def compute_loss(self, x1, mask, mu, c): 94 | """Computes diffusion loss 95 | 96 | Args: 97 | x1 (torch.Tensor): Target 98 | shape: (batch_size, n_feats, mel_timesteps) 99 | mask (torch.Tensor): target mask 100 | shape: (batch_size, 1, mel_timesteps) 101 | mu (torch.Tensor): output of encoder 102 | shape: (batch_size, n_feats, mel_timesteps) 103 | c (torch.Tensor, optional): speaker condition. 104 | 105 | Returns: 106 | loss: conditional flow matching loss 107 | y: conditional flow 108 | shape: (batch_size, n_feats, mel_timesteps) 109 | """ 110 | b, _, t = mu.shape 111 | 112 | # random timestep 113 | t = torch.rand([b, 1, 1], device=mu.device, dtype=mu.dtype) 114 | # sample noise p(x_0) 115 | z = torch.randn_like(x1) 116 | 117 | y = (1 - (1 - self.sigma_min) * t) * z + t * x1 118 | u = x1 - (1 - self.sigma_min) * z 119 | 120 | loss = F.mse_loss(self.estimator(y, mask, mu, t.squeeze(), c), 121 | u, 122 | reduction="sum") / (torch.sum(mask) * u.shape[1]) 123 | return loss, y 124 | 125 | 126 | def create_plot_of_all_solutions(sol, fps=8): 127 | gif_collector = list() 128 | for step_index, solution in enumerate(sol): 129 | unbatched_solution = solution[0] # remove the batch axis (if there are more than one element in the batch, we only take the first) 130 | plot_spec_tensor(unbatched_solution, "tmp", step_index, title=step_index + 1) 131 | gif_collector.append(imageio.v2.imread(f"tmp/{step_index}.png")) 132 | for _ in range(fps * 2): 133 | gif_collector.append(gif_collector[-1]) # freeze-frame on the final one for two seconds 134 | imageio.mimsave("tmp/animation.gif", gif_collector, fps=fps, loop=0) 135 | -------------------------------------------------------------------------------- /run_training_pipeline.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import random 4 | import sys 5 | 6 | import torch 7 | 8 | from Recipes.AlignerPipeline import run as aligner 9 | from Recipes.BigVGAN_e2e import run as be2e 10 | from Recipes.HiFiGAN_combined import run as HiFiGAN 11 | from Recipes.HiFiGAN_e2e import run as e2e 12 | from Recipes.ToucanTTS_IntegrationTest import run as tt_integration_test 13 | from Recipes.ToucanTTS_Massive_Asian import run as asian 14 | from Recipes.ToucanTTS_Massive_English_stage1 import run as eng1 15 | from Recipes.ToucanTTS_Massive_English_stage2 import run as eng2 16 | from Recipes.ToucanTTS_Massive_German import run as deu 17 | from Recipes.ToucanTTS_Massive_stage1 import run as stage1 18 | from Recipes.ToucanTTS_Massive_stage2 import run as stage2 19 | from Recipes.ToucanTTS_Massive_stage3 import run as stage3 20 | from Recipes.ToucanTTS_Nancy import run as nancy 21 | from Recipes.finetuning_example_multilingual import run as fine_tuning_example_multilingual 22 | from Recipes.finetuning_example_simple import run as fine_tuning_example_simple 23 | 24 | pipeline_dict = { 25 | # the finetuning examples 26 | "finetuning_example_simple" : fine_tuning_example_simple, 27 | "finetuning_example_multilingual": fine_tuning_example_multilingual, 28 | # 
integration test 29 | "tt_it" : tt_integration_test, 30 | # regular ToucanTTS pipelines 31 | "nancy" : nancy, 32 | "eng1" : eng1, 33 | "eng2" : eng2, 34 | "deu" : deu, 35 | "asian": asian, 36 | "stage1" : stage1, 37 | "stage2" : stage2, 38 | "stage3" : stage3, 39 | # training the aligner from scratch (not recommended, best to use provided checkpoint) 40 | "aligner" : aligner, 41 | # vocoder training (not recommended, best to use provided checkpoint) 42 | "hifigan" : HiFiGAN, 43 | "e2e" : e2e, 44 | "be2e" : be2e 45 | } 46 | 47 | if __name__ == '__main__': 48 | 49 | parser = argparse.ArgumentParser(description='Training with the IMS Toucan Speech Synthesis Toolkit') 50 | 51 | parser.add_argument('pipeline', 52 | choices=list(pipeline_dict.keys()), 53 | help="Select pipeline to train.") 54 | 55 | parser.add_argument('--gpu_id', 56 | type=str, 57 | help="Which GPU(s) to run on. If not specified runs on CPU, but other than for integration tests that doesn't make much sense.", 58 | default="cpu") 59 | 60 | parser.add_argument('--resume_checkpoint', 61 | type=str, 62 | help="Path to checkpoint to resume from.", 63 | default=None) 64 | 65 | parser.add_argument('--resume', 66 | action="store_true", 67 | help="Automatically load the highest checkpoint and continue from there.", 68 | default=False) 69 | 70 | parser.add_argument('--finetune', 71 | action="store_true", 72 | help="Whether to fine-tune from the specified checkpoint.", 73 | default=False) 74 | 75 | parser.add_argument('--model_save_dir', 76 | type=str, 77 | help="Directory where the checkpoints should be saved to.", 78 | default=None) 79 | 80 | parser.add_argument('--wandb', 81 | action="store_true", 82 | help="Whether to use weights and biases to track training runs. Requires you to run wandb login and place your auth key before.", 83 | default=False) 84 | 85 | parser.add_argument('--wandb_resume_id', 86 | type=str, 87 | help="ID of a stopped wandb run to continue tracking", 88 | default=None) 89 | 90 | args = parser.parse_args() 91 | 92 | if args.finetune and args.resume_checkpoint is None and not args.resume: 93 | print("Need to provide path to checkpoint to fine-tune from!") 94 | sys.exit() 95 | 96 | if args.gpu_id == "cpu": 97 | os.environ["CUDA_VISIBLE_DEVICES"] = "" 98 | device = torch.device("cpu") 99 | print(f"No GPU specified, using CPU. 
Training will likely not work without GPU.") 100 | gpu_count = 1 # for technical reasons this is set to one, indicating it's not multi-GPU training, even though there is no GPU in this case 101 | else: 102 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 103 | os.environ["CUDA_VISIBLE_DEVICES"] = f"{args.gpu_id}" 104 | device = torch.device("cuda") 105 | print(f"Making GPU {os.environ['CUDA_VISIBLE_DEVICES']} the only visible device(s).") 106 | gpu_count = len(args.gpu_id.replace(",", " ").split()) 107 | # example call for multi-GPU training: 108 | # torchrun --standalone --nproc_per_node=4 --nnodes=1 run_training_pipeline.py nancy --gpu_id "1,2,3" 109 | 110 | torch.manual_seed(9665) 111 | random.seed(9665) 112 | torch.random.manual_seed(9665) 113 | 114 | torch.multiprocessing.set_sharing_strategy('file_system') 115 | 116 | pipeline_dict[args.pipeline](gpu_id=args.gpu_id, 117 | resume_checkpoint=args.resume_checkpoint, 118 | resume=args.resume, 119 | finetune=args.finetune, 120 | model_dir=args.model_save_dir, 121 | use_wandb=args.wandb, 122 | wandb_resume_id=args.wandb_resume_id, 123 | gpu_count=gpu_count) 124 | -------------------------------------------------------------------------------- /Modules/EmbeddingModel/StyleTTSEncoder.py: -------------------------------------------------------------------------------- 1 | """ 2 | MIT Licensed Code 3 | 4 | Copyright (c) 2022 Aaron (Yinghao) Li 5 | 6 | https://github.com/yl4579/StyleTTS/blob/main/models.py 7 | """ 8 | 9 | import math 10 | 11 | import torch 12 | import torch.nn.functional as F 13 | from torch import nn 14 | from torch.nn.utils import spectral_norm 15 | 16 | 17 | class StyleEncoder(nn.Module): 18 | def __init__(self, dim_in=128, style_dim=64, max_conv_dim=384): 19 | super().__init__() 20 | blocks = [] 21 | blocks += [spectral_norm(nn.Conv2d(1, dim_in, 3, 1, 1))] 22 | 23 | repeat_num = 4 24 | for _ in range(repeat_num): 25 | dim_out = min(dim_in * 2, max_conv_dim) 26 | blocks += [ResBlk(dim_in, dim_out, downsample='half')] 27 | dim_in = dim_out 28 | 29 | blocks += [nn.LeakyReLU(0.2)] 30 | blocks += [spectral_norm(nn.Conv2d(dim_out, dim_out, 5, 1, 0))] 31 | blocks += [nn.AdaptiveAvgPool2d(1)] 32 | blocks += [nn.LeakyReLU(0.2)] 33 | self.shared = nn.Sequential(*blocks) 34 | 35 | self.unshared = nn.Linear(dim_out, style_dim) 36 | 37 | def forward(self, speech): 38 | h = self.shared(speech.unsqueeze(1)) 39 | h = h.view(h.size(0), -1) 40 | s = self.unshared(h) 41 | 42 | return s 43 | 44 | 45 | class ResBlk(nn.Module): 46 | def __init__(self, dim_in, dim_out, actv=nn.LeakyReLU(0.2), 47 | normalize=False, downsample='none'): 48 | super().__init__() 49 | self.actv = actv 50 | self.normalize = normalize 51 | self.downsample = DownSample(downsample) 52 | self.downsample_res = LearnedDownSample(downsample, dim_in) 53 | self.learned_sc = dim_in != dim_out 54 | self._build_weights(dim_in, dim_out) 55 | 56 | def _build_weights(self, dim_in, dim_out): 57 | self.conv1 = spectral_norm(nn.Conv2d(dim_in, dim_in, 3, 1, 1)) 58 | self.conv2 = spectral_norm(nn.Conv2d(dim_in, dim_out, 3, 1, 1)) 59 | if self.normalize: 60 | self.norm1 = nn.InstanceNorm2d(dim_in, affine=True) 61 | self.norm2 = nn.InstanceNorm2d(dim_in, affine=True) 62 | if self.learned_sc: 63 | self.conv1x1 = spectral_norm(nn.Conv2d(dim_in, dim_out, 1, 1, 0, bias=False)) 64 | 65 | def _shortcut(self, x): 66 | if self.learned_sc: 67 | x = self.conv1x1(x) 68 | if self.downsample: 69 | x = self.downsample(x) 70 | return x 71 | 72 | def _residual(self, x): 73 | if self.normalize: 74 | x =
self.norm1(x) 75 | x = self.actv(x) 76 | x = self.conv1(x) 77 | x = self.downsample_res(x) 78 | if self.normalize: 79 | x = self.norm2(x) 80 | x = self.actv(x) 81 | x = self.conv2(x) 82 | return x 83 | 84 | def forward(self, x): 85 | x = self._shortcut(x) + self._residual(x) 86 | return x / math.sqrt(2) # unit variance 87 | 88 | 89 | class LearnedDownSample(nn.Module): 90 | def __init__(self, layer_type, dim_in): 91 | super().__init__() 92 | self.layer_type = layer_type 93 | 94 | if self.layer_type == 'none': 95 | self.conv = nn.Identity() 96 | elif self.layer_type == 'timepreserve': 97 | self.conv = spectral_norm(nn.Conv2d(dim_in, dim_in, kernel_size=(3, 1), stride=(2, 1), groups=dim_in, padding=(1, 0))) 98 | elif self.layer_type == 'half': 99 | self.conv = spectral_norm(nn.Conv2d(dim_in, dim_in, kernel_size=(3, 3), stride=(2, 2), groups=dim_in, padding=1)) 100 | else: 101 | raise RuntimeError('Got unexpected downsample type %s, expected one of [none, timepreserve, half]' % self.layer_type) 102 | 103 | def forward(self, x): 104 | return self.conv(x) 105 | 106 | 107 | class LearnedUpSample(nn.Module): 108 | def __init__(self, layer_type, dim_in): 109 | super().__init__() 110 | self.layer_type = layer_type 111 | 112 | if self.layer_type == 'none': 113 | self.conv = nn.Identity() 114 | elif self.layer_type == 'timepreserve': 115 | self.conv = nn.ConvTranspose2d(dim_in, dim_in, kernel_size=(3, 1), stride=(2, 1), groups=dim_in, output_padding=(1, 0), padding=(1, 0)) 116 | elif self.layer_type == 'half': 117 | self.conv = nn.ConvTranspose2d(dim_in, dim_in, kernel_size=(3, 3), stride=(2, 2), groups=dim_in, output_padding=1, padding=1) 118 | else: 119 | raise RuntimeError('Got unexpected upsample type %s, expected one of [none, timepreserve, half]' % self.layer_type) 120 | 121 | def forward(self, x): 122 | return self.conv(x) 123 | 124 | 125 | class DownSample(nn.Module): 126 | def __init__(self, layer_type): 127 | super().__init__() 128 | self.layer_type = layer_type 129 | 130 | def forward(self, x): 131 | if self.layer_type == 'none': 132 | return x 133 | elif self.layer_type == 'timepreserve': 134 | return F.avg_pool2d(x, (2, 1)) 135 | elif self.layer_type == 'half': 136 | if x.shape[-1] % 2 != 0: 137 | x = torch.cat([x, x[..., -1].unsqueeze(-1)], dim=-1) 138 | return F.avg_pool2d(x, 2) 139 | else: 140 | raise RuntimeError('Got unexpected downsample type %s, expected one of [none, timepreserve, half]' % self.layer_type) 141 | 142 | 143 | class UpSample(nn.Module): 144 | def __init__(self, layer_type): 145 | super().__init__() 146 | self.layer_type = layer_type 147 | 148 | def forward(self, x): 149 | if self.layer_type == 'none': 150 | return x 151 | elif self.layer_type == 'timepreserve': 152 | return F.interpolate(x, scale_factor=(2, 1), mode='nearest') 153 | elif self.layer_type == 'half': 154 | return F.interpolate(x, scale_factor=2, mode='nearest') 155 | else: 156 | raise RuntimeError('Got unexpected upsample type %s, expected one of [none, timepreserve, half]' % self.layer_type) 157 | -------------------------------------------------------------------------------- /run_text_to_file_reader.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | 5 | from InferenceInterfaces.ToucanTTSInterface import ToucanTTSInterface 6 | 7 | 8 | def read_texts(sentence, filename, model_id=None, device="cpu", language="eng", speaker_reference=None, duration_scaling_factor=1.0): 9 | tts = ToucanTTSInterface(device=device, tts_model_path=model_id) 10 |
tts.set_language(language) 11 | if speaker_reference is not None: 12 | tts.set_utterance_embedding(speaker_reference) 13 | if type(sentence) == str: 14 | sentence = [sentence] 15 | tts.read_to_file(text_list=sentence, file_location=filename, duration_scaling_factor=duration_scaling_factor, prosody_creativity=0.0) 16 | del tts 17 | 18 | 19 | def english_test(version, model_id=None, exec_device="cpu", speaker_reference=None): 20 | os.makedirs("audios", exist_ok=True) 21 | 22 | read_texts(model_id=model_id, 23 | sentence=["""Once upon a midnight dreary, while I pondered, weak, and weary, 24 | Over many a quaint, and curious volume, of forgotten lore, 25 | While I nodded, nearly napping, suddenly, there came a tapping, 26 | As of someone gently rapping, rapping at my chamber door."""], 27 | filename=f"audios/{model_id}_english_test_{version}.wav", 28 | device=exec_device, 29 | language="eng", 30 | speaker_reference=speaker_reference) 31 | 32 | 33 | def japanese_test(version, model_id=None, exec_device="cpu", speaker_reference=None): 34 | os.makedirs("audios", exist_ok=True) 35 | 36 | read_texts(model_id=model_id, 37 | sentence=["医師会がなくても、近隣の病院なら紹介してくれると思います。"], 38 | filename=f"audios/{model_id}_japanese_test_{version}.wav", 39 | device=exec_device, 40 | language="jpn", 41 | speaker_reference=speaker_reference) 42 | 43 | 44 | def chinese_test(version, model_id=None, exec_device="cpu", speaker_reference=None): 45 | os.makedirs("audios", exist_ok=True) 46 | 47 | read_texts(model_id=model_id, 48 | sentence=["李绅 《悯农》 锄禾日当午, 汗滴禾下土。 谁知盘中餐, 粒粒皆辛苦。"], 49 | filename=f"audios/{model_id}_chinese_test_{version}.wav", 50 | device=exec_device, 51 | language="cmn", 52 | speaker_reference=speaker_reference) 53 | 54 | 55 | def german_test(version, model_id=None, exec_device="cpu", speaker_reference=None): 56 | os.makedirs("audios", exist_ok=True) 57 | 58 | read_texts(model_id=model_id, 59 | sentence=["""Fest gemauert in der Erden, 60 | Steht die Form, aus Lehm gebrannt. 61 | Heute muss die Glocke werden! 62 | Frisch, Gesellen, seid zur Hand!"""], 63 | filename=f"audios/{model_id}_german_test_{version}.wav", 64 | device=exec_device, 65 | language="deu", 66 | speaker_reference=speaker_reference) 67 | 68 | 69 | def vietnamese_test(version, model_id=None, exec_device="cpu", speaker_reference=None): 70 | os.makedirs("audios", exist_ok=True) 71 | 72 | read_texts(model_id=model_id, 73 | sentence=["""Thân phận, 74 | ở một nơi luôn phải nhắc mình, 75 | im miệng, 76 | thân phận, 77 | là khi nói về quá khứ, 78 | ngó trước nhìn sau, 79 | là phải biết nhắm mắt bịt tai làm lơ, 80 | thờ ơ, 81 | với tất cả những điều gai chướng, 82 | thân phận chúng tôi ở đó, 83 | những quyển sách chuyền tay nhau như ăn cắp, 84 | ngôn luận ư? 85 | không có đất cho nghĩa tự do."""], 86 | filename=f"audios/{model_id}_vietnamese_test_{version}.wav", 87 | device=exec_device, 88 | language="vie", 89 | speaker_reference=speaker_reference) 90 | 91 | 92 | def french_test(version, model_id=None, exec_device="cpu", speaker_reference=None): 93 | os.makedirs("audios", exist_ok=True) 94 | 95 | read_texts(model_id=model_id, 96 | sentence=["""Maître corbeau, sur un arbre perché, 97 | Tenait en son bec un fromage. 98 | Maître renard par l'odeur alléché , 99 | Lui tint à peu près ce langage : 100 | «Et bonjour Monsieur du Corbeau. 101 | Que vous êtes joli! 
que vous me semblez beau!"""], 102 | filename=f"audios/{model_id}_french_test_{version}.wav", 103 | device=exec_device, 104 | language="fra", 105 | speaker_reference=speaker_reference) 106 | 107 | 108 | def all_test(version, model_id=None, exec_device="cpu", speaker_reference=None): 109 | english_test(version, model_id, exec_device, speaker_reference) 110 | german_test(version, model_id, exec_device, speaker_reference) 111 | french_test(version, model_id, exec_device, speaker_reference) 112 | vietnamese_test(version, model_id, exec_device, speaker_reference) 113 | japanese_test(version, model_id, exec_device, speaker_reference) 114 | chinese_test(version, model_id, exec_device, speaker_reference) 115 | 116 | 117 | if __name__ == '__main__': 118 | exec_device = "cuda" if torch.cuda.is_available() else "cpu" 119 | print(f"running on {exec_device}") 120 | 121 | os.makedirs("audios/speaker_references/", exist_ok=True) 122 | merged_speaker_references = ["audios/speaker_references/" + ref for ref in os.listdir("audios/speaker_references/")] 123 | 124 | all_test(version="version_11", 125 | model_id=None, # will use the default 126 | exec_device=exec_device, 127 | speaker_reference=merged_speaker_references if merged_speaker_references != [] else None) 128 | -------------------------------------------------------------------------------- /Modules/GeneralLayers/EncoderLayer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Johns Hopkins University (Shinji Watanabe) 2 | # Northwestern Polytechnical University (Pengcheng Guo) 3 | # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 4 | # Adapted by Florian Lux 2021 5 | 6 | 7 | import torch 8 | from torch import nn 9 | 10 | from Modules.GeneralLayers.LayerNorm import LayerNorm 11 | 12 | 13 | class EncoderLayer(nn.Module): 14 | """ 15 | Encoder layer module. 16 | 17 | Args: 18 | size (int): Input dimension. 19 | self_attn (torch.nn.Module): Self-attention module instance. 20 | `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` instance 21 | can be used as the argument. 22 | feed_forward (torch.nn.Module): Feed-forward module instance. 23 | `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance 24 | can be used as the argument. 25 | feed_forward_macaron (torch.nn.Module): Additional feed-forward module instance. 26 | `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance 27 | can be used as the argument. 28 | conv_module (torch.nn.Module): Convolution module instance. 29 | `ConvolutionModule` instance can be used as the argument. 30 | dropout_rate (float): Dropout rate. 31 | normalize_before (bool): Whether to use layer_norm before the first block. 32 | concat_after (bool): Whether to concat attention layer's input and output. 33 | if True, additional linear will be applied. 34 | i.e. x -> x + linear(concat(x, att(x))) 35 | if False, no additional linear will be applied. i.e.
x -> x + att(x) 36 | 37 | """ 38 | 39 | def __init__(self, size, self_attn, feed_forward, feed_forward_macaron, conv_module, dropout_rate, normalize_before=True, concat_after=False, ): 40 | super(EncoderLayer, self).__init__() 41 | self.self_attn = self_attn 42 | self.feed_forward = feed_forward 43 | self.feed_forward_macaron = feed_forward_macaron 44 | self.conv_module = conv_module 45 | self.norm_ff = LayerNorm(size) # for the FNN module 46 | self.norm_mha = LayerNorm(size) # for the MHA module 47 | if feed_forward_macaron is not None: 48 | self.norm_ff_macaron = LayerNorm(size) 49 | self.ff_scale = 0.5 50 | else: 51 | self.ff_scale = 1.0 52 | if self.conv_module is not None: 53 | self.norm_conv = LayerNorm(size) # for the CNN module 54 | self.norm_final = LayerNorm(size) # for the final output of the block 55 | self.dropout = nn.Dropout(dropout_rate) 56 | self.size = size 57 | self.normalize_before = normalize_before 58 | self.concat_after = concat_after 59 | if self.concat_after: 60 | self.concat_linear = nn.Linear(size + size, size) 61 | 62 | def forward(self, x_input, mask, cache=None): 63 | """ 64 | Compute encoded features. 65 | 66 | Args: 67 | x_input (Union[Tuple, torch.Tensor]): Input tensor w/ or w/o pos emb. 68 | - w/ pos emb: Tuple of tensors [(#batch, time, size), (1, time, size)]. 69 | - w/o pos emb: Tensor (#batch, time, size). 70 | mask (torch.Tensor): Mask tensor for the input (#batch, time). 71 | cache (torch.Tensor): Cache tensor of the input (#batch, time - 1, size). 72 | 73 | Returns: 74 | torch.Tensor: Output tensor (#batch, time, size). 75 | torch.Tensor: Mask tensor (#batch, time). 76 | 77 | """ 78 | if isinstance(x_input, tuple): 79 | x, pos_emb = x_input[0], x_input[1] 80 | else: 81 | x, pos_emb = x_input, None 82 | 83 | # whether to use macaron style 84 | if self.feed_forward_macaron is not None: 85 | residual = x 86 | if self.normalize_before: 87 | x = self.norm_ff_macaron(x) 88 | x = residual + self.ff_scale * self.dropout(self.feed_forward_macaron(x)) 89 | if not self.normalize_before: 90 | x = self.norm_ff_macaron(x) 91 | 92 | # multi-headed self-attention module 93 | residual = x 94 | if self.normalize_before: 95 | x = self.norm_mha(x) 96 | 97 | if cache is None: 98 | x_q = x 99 | else: 100 | assert cache.shape == (x.shape[0], x.shape[1] - 1, self.size) 101 | x_q = x[:, -1:, :] 102 | residual = residual[:, -1:, :] 103 | mask = None if mask is None else mask[:, -1:, :] 104 | 105 | if pos_emb is not None: 106 | x_att = self.self_attn(x_q, x, x, pos_emb, mask) 107 | else: 108 | x_att = self.self_attn(x_q, x, x, mask) 109 | 110 | if self.concat_after: 111 | x_concat = torch.cat((x, x_att), dim=-1) 112 | x = residual + self.concat_linear(x_concat) 113 | else: 114 | x = residual + self.dropout(x_att) 115 | if not self.normalize_before: 116 | x = self.norm_mha(x) 117 | 118 | # convolution module 119 | if self.conv_module is not None: 120 | residual = x 121 | if self.normalize_before: 122 | x = self.norm_conv(x) 123 | x = residual + self.dropout(self.conv_module(x)) 124 | if not self.normalize_before: 125 | x = self.norm_conv(x) 126 | 127 | # feed forward module 128 | residual = x 129 | if self.normalize_before: 130 | x = self.norm_ff(x) 131 | x = residual + self.ff_scale * self.dropout(self.feed_forward(x)) 132 | if not self.normalize_before: 133 | x = self.norm_ff(x) 134 | 135 | if self.conv_module is not None: 136 | x = self.norm_final(x) 137 | 138 | if cache is not None: 139 | x = torch.cat([cache, x], dim=1) 140 | 141 | if pos_emb is not None: 142 | return 
(x, pos_emb), mask 143 | 144 | return x, mask 145 | -------------------------------------------------------------------------------- /Preprocessing/multilinguality/visualize_distances.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | import matplotlib.pyplot as plt 4 | import networkx as nx 5 | import torch 6 | from huggingface_hub import hf_hub_download 7 | from tqdm import tqdm 8 | 9 | from Modules.ToucanTTS.InferenceToucanTTS import ToucanTTS 10 | from Utility.storage_config import MODEL_DIR 11 | from Utility.utils import load_json_from_path 12 | 13 | distance_types = ["tree", "asp", "map", "learned", "l1"] 14 | distance_type = distance_types[2] # switch here 15 | edge_threshold = 0.1 16 | 17 | cache_root = "." 18 | supervised_iso_codes = load_json_from_path(hf_hub_download(cache_dir=MODEL_DIR, repo_id="Flux9665/ToucanTTS", filename="supervised_languages.json")) 19 | 20 | if distance_type == "l1": 21 | iso_codes_to_ids = load_json_from_path(hf_hub_download(cache_dir=MODEL_DIR, repo_id="Flux9665/ToucanTTS", filename="iso_lookup.json"))[-1] 22 | model_path = hf_hub_download(cache_dir=MODEL_DIR, repo_id="Flux9665/ToucanTTS", filename="ToucanTTS.pt") 23 | checkpoint = torch.load(model_path, map_location='cpu') 24 | embedding_provider = ToucanTTS(weights=checkpoint["model"], config=checkpoint["config"]).encoder.language_embedding 25 | embedding_provider.requires_grad_(False) 26 | l1_dist = dict() 27 | seen_langs = set() 28 | for lang_1 in supervised_iso_codes: 29 | if lang_1 not in seen_langs: 30 | seen_langs.add(lang_1) 31 | l1_dist[lang_1] = dict() 32 | for lang_2 in supervised_iso_codes: 33 | if lang_2 not in seen_langs: # it's symmetric 34 | l1_dist[lang_1][lang_2] = torch.nn.functional.mse_loss(embedding_provider(torch.LongTensor([iso_codes_to_ids[lang_1]])).squeeze(), embedding_provider(torch.LongTensor([iso_codes_to_ids[lang_2]])).squeeze()) 35 | largest_value_l1_dist = 0.0 36 | for _, values in l1_dist.items(): 37 | for _, value in values.items(): 38 | largest_value_l1_dist = max(largest_value_l1_dist, value) 39 | for key1 in l1_dist: 40 | for key2 in l1_dist[key1]: 41 | l1_dist[key1][key2] = l1_dist[key1][key2] / largest_value_l1_dist 42 | distance_measure = l1_dist 43 | 44 | if distance_type == "tree": 45 | tree_lookup_path = hf_hub_download(cache_dir=MODEL_DIR, repo_id="Flux9665/ToucanTTS", filename="lang_1_to_lang_2_to_tree_dist.json") 46 | tree_dist = load_json_from_path(tree_lookup_path) 47 | distance_measure = tree_dist 48 | 49 | if distance_type == "map": 50 | map_lookup_path = hf_hub_download(cache_dir=MODEL_DIR, repo_id="Flux9665/ToucanTTS", filename="lang_1_to_lang_2_to_map_dist.json") 51 | map_dist = load_json_from_path(map_lookup_path) 52 | largest_value_map_dist = 0.0 53 | for _, values in map_dist.items(): 54 | for _, value in values.items(): 55 | largest_value_map_dist = max(largest_value_map_dist, value) 56 | for key1 in map_dist: 57 | for key2 in map_dist[key1]: 58 | map_dist[key1][key2] = map_dist[key1][key2] / largest_value_map_dist 59 | distance_measure = map_dist 60 | 61 | if distance_type == "learned": 62 | learned_lookup_path = hf_hub_download(cache_dir=MODEL_DIR, repo_id="Flux9665/ToucanTTS", filename="lang_1_to_lang_2_to_learned_dist.json") 63 | learned_dist = load_json_from_path(learned_lookup_path) 64 | largest_value_learned_dist = 0.0 65 | for _, values in learned_dist.items(): 66 | for _, value in values.items(): 67 | largest_value_learned_dist = max(largest_value_learned_dist, value) 68 | for 
key1 in learned_dist: 69 | for key2 in learned_dist[key1]: 70 | learned_dist[key1][key2] = learned_dist[key1][key2] / largest_value_learned_dist 71 | distance_measure = learned_dist 72 | 73 | if distance_type == "asp": 74 | asp_dict_path = hf_hub_download(cache_dir=MODEL_DIR, repo_id="Flux9665/ToucanTTS", filename="asp_dict.pkl") 75 | with open(asp_dict_path, 'rb') as dictfile: 76 | asp_sim = pickle.load(dictfile) 77 | lang_list = list(asp_sim.keys()) 78 | asp_dist = dict() 79 | seen_langs = set() 80 | for lang_1 in lang_list: 81 | if lang_1 not in seen_langs: 82 | seen_langs.add(lang_1) 83 | asp_dist[lang_1] = dict() 84 | for index, lang_2 in enumerate(lang_list): 85 | if lang_2 not in seen_langs: # it's symmetric 86 | asp_dist[lang_1][lang_2] = 1 - asp_sim[lang_1][index] 87 | distance_measure = asp_dist 88 | 89 | iso_codes_to_names = load_json_from_path(hf_hub_download(cache_dir=MODEL_DIR, repo_id="Flux9665/ToucanTTS", filename="iso_to_fullname.json")) 90 | distances = list() 91 | 92 | for lang_1 in distance_measure: 93 | if lang_1 not in iso_codes_to_names: 94 | continue 95 | for lang_2 in distance_measure[lang_1]: 96 | distances.append((iso_codes_to_names[lang_1], iso_codes_to_names[lang_2], distance_measure[lang_1][lang_2])) 97 | 98 | # Create a graph 99 | G = nx.Graph() 100 | 101 | # Add edges along with distances as weights 102 | min_dist = min(d for _, _, d in distances) 103 | max_dist = max(d for _, _, d in distances) 104 | normalized_distances = [(entity1, entity2, (d - min_dist) / (max_dist - min_dist)) for entity1, entity2, d in distances] 105 | 106 | for entity1, entity2, d in tqdm(normalized_distances): 107 | if d <= edge_threshold and entity1 != entity2: 108 | spring_tension = edge_threshold - d 109 | G.add_edge(entity1, entity2, weight=spring_tension * 10) 110 | 111 | # Draw the graph 112 | pos = nx.spring_layout(G, weight="weight") # Positions for all nodes 113 | edges = G.edges(data=True) 114 | 115 | # Draw nodes 116 | nx.draw_networkx_nodes(G, pos, node_size=1, alpha=0.01) 117 | 118 | # Draw edges with labels 119 | nx.draw_networkx_edges(G, pos, alpha=0.01, edge_color="gray") 120 | # nx.draw_networkx_edge_labels(G, pos, edge_labels={(u, v): d['weight'] for u, v, d in edges}) 121 | 122 | # Draw node labels 123 | nx.draw_networkx_labels(G, pos, font_size=10, font_family='sans-serif') 124 | 125 | plt.title(f'Graph of {distance_type} Distances') 126 | 127 | plt.subplots_adjust(left=0, right=1, top=1, bottom=0) 128 | plt.tight_layout(pad=0) 129 | 130 | plt.show() 131 | --------------------------------------------------------------------------------