├── .gitignore
├── Readme.md
├── TTS
├── .models.json
├── VERSION
├── __init__.py
├── api.py
├── bin
│ ├── __init__.py
│ ├── collect_env_info.py
│ ├── compute_attention_masks.py
│ ├── compute_embeddings.py
│ ├── compute_statistics.py
│ ├── eval_encoder.py
│ ├── extract_tts_spectrograms.py
│ ├── find_unique_chars.py
│ ├── find_unique_phonemes.py
│ ├── remove_silence_using_vad.py
│ ├── resample.py
│ ├── synthesize.py
│ ├── train_encoder.py
│ ├── train_tts.py
│ ├── train_vocoder.py
│ └── tune_wavegrad.py
├── config
│ ├── __init__.py
│ └── shared_configs.py
├── demos
│ └── xtts_ft_demo
│ │ ├── requirements.txt
│ │ ├── utils
│ │ │ ├── formatter.py
│ │ │ └── gpt_train.py
│ │ └── xtts_demo.py
├── encoder
│ ├── README.md
│ ├── __init__.py
│ ├── configs
│ │ ├── base_encoder_config.py
│ │ ├── emotion_encoder_config.py
│ │ └── speaker_encoder_config.py
│ ├── dataset.py
│ ├── losses.py
│ ├── models
│ │ ├── base_encoder.py
│ │ ├── lstm.py
│ │ └── resnet.py
│ ├── requirements.txt
│ └── utils
│ │ ├── __init__.py
│ │ ├── generic_utils.py
│ │ ├── prepare_voxceleb.py
│ │ ├── training.py
│ │ └── visual.py
├── model.py
├── server
│ ├── README.md
│ ├── __init__.py
│ ├── conf.json
│ ├── server.py
│ ├── static
│ │ └── coqui-log-green-TTS.png
│ └── templates
│ │ ├── details.html
│ │ └── index.html
├── tts
│ ├── __init__.py
│ ├── configs
│ │ ├── __init__.py
│ │ ├── align_tts_config.py
│ │ ├── bark_config.py
│ │ ├── delightful_tts_config.py
│ │ ├── fast_pitch_config.py
│ │ ├── fast_speech_config.py
│ │ ├── fastspeech2_config.py
│ │ ├── glow_tts_config.py
│ │ ├── neuralhmm_tts_config.py
│ │ ├── overflow_config.py
│ │ ├── shared_configs.py
│ │ ├── speedy_speech_config.py
│ │ ├── tacotron2_config.py
│ │ ├── tacotron_config.py
│ │ ├── tortoise_config.py
│ │ ├── vits_config.py
│ │ └── xtts_config.py
│ ├── datasets
│ │ ├── __init__.py
│ │ ├── dataset.py
│ │ └── formatters.py
│ ├── layers
│ │ ├── __init__.py
│ │ ├── align_tts
│ │ │ ├── __init__.py
│ │ │ ├── duration_predictor.py
│ │ │ └── mdn.py
│ │ ├── bark
│ │ │ ├── __init__.py
│ │ │ ├── hubert
│ │ │ │ ├── __init__.py
│ │ │ │ ├── hubert_manager.py
│ │ │ │ ├── kmeans_hubert.py
│ │ │ │ └── tokenizer.py
│ │ │ ├── inference_funcs.py
│ │ │ ├── load_model.py
│ │ │ ├── model.py
│ │ │ └── model_fine.py
│ │ ├── delightful_tts
│ │ │ ├── __init__.py
│ │ │ ├── acoustic_model.py
│ │ │ ├── conformer.py
│ │ │ ├── conv_layers.py
│ │ │ ├── encoders.py
│ │ │ ├── energy_adaptor.py
│ │ │ ├── kernel_predictor.py
│ │ │ ├── networks.py
│ │ │ ├── phoneme_prosody_predictor.py
│ │ │ ├── pitch_adaptor.py
│ │ │ └── variance_predictor.py
│ │ ├── feed_forward
│ │ │ ├── __init__.py
│ │ │ ├── decoder.py
│ │ │ ├── duration_predictor.py
│ │ │ └── encoder.py
│ │ ├── generic
│ │ │ ├── __init__.py
│ │ │ ├── aligner.py
│ │ │ ├── gated_conv.py
│ │ │ ├── normalization.py
│ │ │ ├── pos_encoding.py
│ │ │ ├── res_conv_bn.py
│ │ │ ├── time_depth_sep_conv.py
│ │ │ ├── transformer.py
│ │ │ └── wavenet.py
│ │ ├── glow_tts
│ │ │ ├── __init__.py
│ │ │ ├── decoder.py
│ │ │ ├── duration_predictor.py
│ │ │ ├── encoder.py
│ │ │ ├── glow.py
│ │ │ └── transformer.py
│ │ ├── losses.py
│ │ ├── overflow
│ │ │ ├── __init__.py
│ │ │ ├── common_layers.py
│ │ │ ├── decoder.py
│ │ │ ├── neural_hmm.py
│ │ │ └── plotting_utils.py
│ │ ├── tacotron
│ │ │ ├── __init__.py
│ │ │ ├── attentions.py
│ │ │ ├── capacitron_layers.py
│ │ │ ├── common_layers.py
│ │ │ ├── gst_layers.py
│ │ │ ├── tacotron.py
│ │ │ └── tacotron2.py
│ │ ├── tortoise
│ │ │ ├── arch_utils.py
│ │ │ ├── audio_utils.py
│ │ │ ├── autoregressive.py
│ │ │ ├── classifier.py
│ │ │ ├── clvp.py
│ │ │ ├── diffusion.py
│ │ │ ├── diffusion_decoder.py
│ │ │ ├── dpm_solver.py
│ │ │ ├── random_latent_generator.py
│ │ │ ├── tokenizer.py
│ │ │ ├── transformer.py
│ │ │ ├── utils.py
│ │ │ ├── vocoder.py
│ │ │ ├── wav2vec_alignment.py
│ │ │ └── xtransformers.py
│ │ ├── vits
│ │ │ ├── discriminator.py
│ │ │ ├── networks.py
│ │ │ ├── stochastic_duration_predictor.py
│ │ │ └── transforms.py
│ │ └── xtts
│ │ │ ├── dvae.py
│ │ │ ├── gpt.py
│ │ │ ├── gpt_inference.py
│ │ │ ├── hifigan_decoder.py
│ │ │ ├── latent_encoder.py
│ │ │ ├── perceiver_encoder.py
│ │ │ ├── stream_generator.py
│ │ │ ├── tokenizer.py
│ │ │ ├── trainer
│ │ │ │ ├── dataset.py
│ │ │ │ ├── dvae_dataset.py
│ │ │ │ └── gpt_trainer.py
│ │ │ ├── xtts_manager.py
│ │ │ └── zh_num2words.py
│ ├── models
│ │ ├── __init__.py
│ │ ├── align_tts.py
│ │ ├── bark.py
│ │ ├── base_tacotron.py
│ │ ├── base_tts.py
│ │ ├── delightful_tts.py
│ │ ├── forward_tts.py
│ │ ├── glow_tts.py
│ │ ├── neuralhmm_tts.py
│ │ ├── overflow.py
│ │ ├── tacotron.py
│ │ ├── tacotron2.py
│ │ ├── tortoise.py
│ │ ├── vits.py
│ │ └── xtts.py
│ └── utils
│ │ ├── __init__.py
│ │ ├── assets
│ │ └── tortoise
│ │ │ └── tokenizer.json
│ │ ├── data.py
│ │ ├── fairseq.py
│ │ ├── helpers.py
│ │ ├── languages.py
│ │ ├── managers.py
│ │ ├── measures.py
│ │ ├── monotonic_align
│ │ ├── __init__.py
│ │ ├── core.pyx
│ │ └── setup.py
│ │ ├── speakers.py
│ │ ├── ssim.py
│ │ ├── synthesis.py
│ │ ├── text
│ │ ├── __init__.py
│ │ ├── bangla
│ │ │ ├── __init__.py
│ │ │ └── phonemizer.py
│ │ ├── belarusian
│ │ │ ├── __init__.py
│ │ │ └── phonemizer.py
│ │ ├── characters.py
│ │ ├── chinese_mandarin
│ │ │ ├── __init__.py
│ │ │ ├── numbers.py
│ │ │ ├── phonemizer.py
│ │ │ └── pinyinToPhonemes.py
│ │ ├── cleaners.py
│ │ ├── cmudict.py
│ │ ├── english
│ │ │ ├── __init__.py
│ │ │ ├── abbreviations.py
│ │ │ ├── number_norm.py
│ │ │ └── time_norm.py
│ │ ├── french
│ │ │ ├── __init__.py
│ │ │ └── abbreviations.py
│ │ ├── japanese
│ │ │ ├── __init__.py
│ │ │ └── phonemizer.py
│ │ ├── korean
│ │ │ ├── __init__.py
│ │ │ ├── ko_dictionary.py
│ │ │ ├── korean.py
│ │ │ └── phonemizer.py
│ │ ├── phonemizers
│ │ │ ├── __init__.py
│ │ │ ├── bangla_phonemizer.py
│ │ │ ├── base.py
│ │ │ ├── belarusian_phonemizer.py
│ │ │ ├── espeak_wrapper.py
│ │ │ ├── gruut_wrapper.py
│ │ │ ├── ja_jp_phonemizer.py
│ │ │ ├── ko_kr_phonemizer.py
│ │ │ ├── multi_phonemizer.py
│ │ │ └── zh_cn_phonemizer.py
│ │ ├── punctuation.py
│ │ └── tokenizer.py
│ │ └── visual.py
├── utils
│ ├── __init__.py
│ ├── audio
│ │ ├── __init__.py
│ │ ├── numpy_transforms.py
│ │ ├── processor.py
│ │ └── torch_transforms.py
│ ├── callbacks.py
│ ├── capacitron_optimizer.py
│ ├── distribute.py
│ ├── download.py
│ ├── downloaders.py
│ ├── generic_utils.py
│ ├── io.py
│ ├── manage.py
│ ├── radam.py
│ ├── samplers.py
│ ├── synthesizer.py
│ ├── training.py
│ └── vad.py
├── vc
│ ├── configs
│ │ ├── __init__.py
│ │ ├── freevc_config.py
│ │ └── shared_configs.py
│ ├── models
│ │ ├── __init__.py
│ │ ├── base_vc.py
│ │ └── freevc.py
│ └── modules
│ │ ├── __init__.py
│ │ └── freevc
│ │ ├── __init__.py
│ │ ├── commons.py
│ │ ├── mel_processing.py
│ │ ├── modules.py
│ │ ├── speaker_encoder
│ │ ├── __init__.py
│ │ ├── audio.py
│ │ ├── hparams.py
│ │ └── speaker_encoder.py
│ │ └── wavlm
│ │ ├── __init__.py
│ │ ├── config.json
│ │ ├── modules.py
│ │ └── wavlm.py
└── vocoder
│ ├── README.md
│ ├── __init__.py
│ ├── configs
│ ├── __init__.py
│ ├── fullband_melgan_config.py
│ ├── hifigan_config.py
│ ├── melgan_config.py
│ ├── multiband_melgan_config.py
│ ├── parallel_wavegan_config.py
│ ├── shared_configs.py
│ ├── univnet_config.py
│ ├── wavegrad_config.py
│ └── wavernn_config.py
│ ├── datasets
│ ├── __init__.py
│ ├── gan_dataset.py
│ ├── preprocess.py
│ ├── wavegrad_dataset.py
│ └── wavernn_dataset.py
│ ├── layers
│ ├── __init__.py
│ ├── hifigan.py
│ ├── losses.py
│ ├── lvc_block.py
│ ├── melgan.py
│ ├── parallel_wavegan.py
│ ├── pqmf.py
│ ├── qmf.dat
│ ├── upsample.py
│ └── wavegrad.py
│ ├── models
│ ├── __init__.py
│ ├── base_vocoder.py
│ ├── fullband_melgan_generator.py
│ ├── gan.py
│ ├── hifigan_discriminator.py
│ ├── hifigan_generator.py
│ ├── melgan_discriminator.py
│ ├── melgan_generator.py
│ ├── melgan_multiscale_discriminator.py
│ ├── multiband_melgan_generator.py
│ ├── parallel_wavegan_discriminator.py
│ ├── parallel_wavegan_generator.py
│ ├── random_window_discriminator.py
│ ├── univnet_discriminator.py
│ ├── univnet_generator.py
│ ├── wavegrad.py
│ └── wavernn.py
│ ├── pqmf_output.wav
│ └── utils
│ ├── __init__.py
│ ├── distribution.py
│ └── generic_utils.py
├── download_checkpoint.py
├── extend_vocab_config.py
├── recipes
├── README.md
├── bel-alex73
│ ├── .gitignore
│ ├── README.md
│ ├── choose_speaker.ipynb
│ ├── docker-prepare-start.sh
│ ├── docker-prepare
│ │ ├── Dockerfile
│ │ └── runtime.sh
│ ├── dump_config.py
│ ├── train_glowtts.py
│ └── train_hifigan.py
├── blizzard2013
│ ├── README.md
│ ├── tacotron1-Capacitron
│ │ └── train_capacitron_t1.py
│ └── tacotron2-Capacitron
│ │ └── train_capacitron_t2.py
├── kokoro
│ └── tacotron2-DDC
│ │ ├── run.sh
│ │ └── tacotron2-DDC.json
├── ljspeech
│ ├── README.md
│ ├── align_tts
│ │ └── train_aligntts.py
│ ├── delightful_tts
│ │ └── train_delightful_tts.py
│ ├── download_ljspeech.sh
│ ├── fast_pitch
│ │ └── train_fast_pitch.py
│ ├── fast_speech
│ │ └── train_fast_speech.py
│ ├── fastspeech2
│ │ └── train_fastspeech2.py
│ ├── glow_tts
│ │ └── train_glowtts.py
│ ├── hifigan
│ │ └── train_hifigan.py
│ ├── multiband_melgan
│ │ └── train_multiband_melgan.py
│ ├── neuralhmm_tts
│ │ └── train_neuralhmmtts.py
│ ├── overflow
│ │ ├── lj_parameters.pt
│ │ └── train_overflow.py
│ ├── speedy_speech
│ │ └── train_speedy_speech.py
│ ├── tacotron2-Capacitron
│ │ └── train_capacitron_t2.py
│ ├── tacotron2-DCA
│ │ └── train_tacotron_dca.py
│ ├── tacotron2-DDC
│ │ └── train_tacotron_ddc.py
│ ├── univnet
│ │ └── train.py
│ ├── vits_tts
│ │ └── train_vits.py
│ ├── wavegrad
│ │ └── train_wavegrad.py
│ ├── wavernn
│ │ └── train_wavernn.py
│ ├── xtts_v1
│ │ └── train_gpt_xtts.py
│ └── xtts_v2
│ │ └── train_gpt_xtts.py
├── multilingual
│ ├── cml_yourtts
│ │ └── train_yourtts.py
│ └── vits_tts
│ │ ├── train_vits_tts.py
│ │ └── train_vits_tts_phonemes.py
├── thorsten_DE
│ ├── README.md
│ ├── align_tts
│ │ └── train_aligntts.py
│ ├── download_thorsten_DE.sh
│ ├── glow_tts
│ │ └── train_glowtts.py
│ ├── hifigan
│ │ └── train_hifigan.py
│ ├── multiband_melgan
│ │ └── train_multiband_melgan.py
│ ├── speedy_speech
│ │ └── train_speedy_speech.py
│ ├── tacotron2-DDC
│ │ └── train_tacotron_ddc.py
│ ├── univnet
│ │ └── train_univnet.py
│ ├── vits_tts
│ │ └── train_vits.py
│ ├── wavegrad
│ │ └── train_wavegrad.py
│ └── wavernn
│ │ └── train_wavernn.py
└── vctk
│ ├── delightful_tts
│ └── train_delightful_tts.py
│ ├── download_vctk.sh
│ ├── fast_pitch
│ └── train_fast_pitch.py
│ ├── fast_speech
│ └── train_fast_speech.py
│ ├── glow_tts
│ └── train_glow_tts.py
│ ├── resnet_speaker_encoder
│ └── train_encoder.py
│ ├── speedy_speech
│ └── train_speedy_speech.py
│ ├── tacotron-DDC
│ └── train_tacotron-DDC.py
│ ├── tacotron2-DDC
│ └── train_tacotron2-ddc.py
│ ├── tacotron2
│ └── train_tacotron2.py
│ ├── vits
│ └── train_vits.py
│ └── yourtts
│ └── train_yourtts.py
├── requirements.txt
├── train_dvae_xtts.py
├── train_dvae_xtts.sh
├── train_gpt_xtts.py
└── train_gpt_xtts.sh
/.gitignore:
--------------------------------------------------------------------------------
1 | .venv/
2 | hub/
3 | *.pth
4 | __pycache__/
5 | checkpoints/
6 | datasets/
7 | large-datasets/
8 | wandb/
9 | *.ipynb
10 | *.wav
11 | test.py
12 | cps/
13 | vivoice-datasets/
14 | output/
--------------------------------------------------------------------------------
/TTS/VERSION:
--------------------------------------------------------------------------------
1 | 0.22.0
2 |
--------------------------------------------------------------------------------
/TTS/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | with open(os.path.join(os.path.dirname(__file__), "VERSION"), "r", encoding="utf-8") as f:
4 | version = f.read().strip()
5 |
6 | __version__ = version
7 |
--------------------------------------------------------------------------------
/TTS/bin/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/bin/__init__.py
--------------------------------------------------------------------------------
/TTS/bin/collect_env_info.py:
--------------------------------------------------------------------------------
1 | """Get detailed info about the working environment."""
2 | import os
3 | import platform
4 | import sys
5 |
6 | import numpy
7 | import torch
8 |
9 | sys.path += [os.path.abspath(".."), os.path.abspath(".")]
10 | import json
11 |
12 | import TTS
13 |
14 |
15 | def system_info():
16 | return {
17 | "OS": platform.system(),
18 | "architecture": platform.architecture(),
19 | "version": platform.version(),
20 | "processor": platform.processor(),
21 | "python": platform.python_version(),
22 | }
23 |
24 |
25 | def cuda_info():
26 | return {
27 | "GPU": [torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())],
28 | "available": torch.cuda.is_available(),
29 | "version": torch.version.cuda,
30 | }
31 |
32 |
33 | def package_info():
34 | return {
35 | "numpy": numpy.__version__,
36 | "PyTorch_version": torch.__version__,
37 | "PyTorch_debug": torch.version.debug,
38 | "TTS": TTS.__version__,
39 | }
40 |
41 |
42 | def main():
43 | details = {"System": system_info(), "CUDA": cuda_info(), "Packages": package_info()}
44 | print(json.dumps(details, indent=4, sort_keys=True))
45 |
46 |
47 | if __name__ == "__main__":
48 | main()
49 |
--------------------------------------------------------------------------------
/TTS/bin/find_unique_chars.py:
--------------------------------------------------------------------------------
1 | """Find all the unique characters in a dataset"""
2 | import argparse
3 | from argparse import RawTextHelpFormatter
4 |
5 | from TTS.config import load_config
6 | from TTS.tts.datasets import load_tts_samples
7 |
8 |
9 | def main():
10 | # pylint: disable=bad-option-value
11 | parser = argparse.ArgumentParser(
12 | description="""Find all the unique characters or phonemes in a dataset.\n\n"""
13 | """
14 | Example runs:
15 |
16 | python TTS/bin/find_unique_chars.py --config_path config.json
17 | """,
18 | formatter_class=RawTextHelpFormatter,
19 | )
20 | parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True)
21 | args = parser.parse_args()
22 |
23 | c = load_config(args.config_path)
24 |
25 | # load all datasets
26 | train_items, eval_items = load_tts_samples(
27 | c.datasets, eval_split=True, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size
28 | )
29 |
30 | items = train_items + eval_items
31 |
32 | texts = "".join(item["text"] for item in items)
33 | chars = set(texts)
34 | lower_chars = filter(lambda c: c.islower(), chars)
35 | chars_force_lower = [c.lower() for c in chars]
36 | chars_force_lower = set(chars_force_lower)
37 |
38 | print(f" > Number of unique characters: {len(chars)}")
39 | print(f" > Unique characters: {''.join(sorted(chars))}")
40 | print(f" > Unique lower characters: {''.join(sorted(lower_chars))}")
41 | print(f" > Unique all forced to lower characters: {''.join(sorted(chars_force_lower))}")
42 |
43 |
44 | if __name__ == "__main__":
45 | main()
46 |
--------------------------------------------------------------------------------
/TTS/bin/find_unique_phonemes.py:
--------------------------------------------------------------------------------
1 | """Find all the unique characters in a dataset"""
2 | import argparse
3 | import multiprocessing
4 | from argparse import RawTextHelpFormatter
5 |
6 | from tqdm.contrib.concurrent import process_map
7 |
8 | from TTS.config import load_config
9 | from TTS.tts.datasets import load_tts_samples
10 | from TTS.tts.utils.text.phonemizers import Gruut
11 |
12 |
13 | def compute_phonemes(item):
14 | text = item["text"]
15 | ph = phonemizer.phonemize(text).replace("|", "")
16 | return set(list(ph))
17 |
18 |
19 | def main():
20 | # pylint: disable=W0601
21 | global c, phonemizer
22 | # pylint: disable=bad-option-value
23 | parser = argparse.ArgumentParser(
24 | description="""Find all the unique characters or phonemes in a dataset.\n\n"""
25 | """
26 | Example runs:
27 |
28 | python TTS/bin/find_unique_phonemes.py --config_path config.json
29 | """,
30 | formatter_class=RawTextHelpFormatter,
31 | )
32 | parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True)
33 | args = parser.parse_args()
34 |
35 | c = load_config(args.config_path)
36 |
37 | # load all datasets
38 | train_items, eval_items = load_tts_samples(
39 | c.datasets, eval_split=True, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size
40 | )
41 | items = train_items + eval_items
42 | print("Num items:", len(items))
43 |
44 | language_list = [item["language"] for item in items]
45 | is_lang_def = all(language_list)
46 |
47 | if not c.phoneme_language or not is_lang_def:
48 | raise ValueError("Phoneme language must be defined in config.")
49 |
50 | if not language_list.count(language_list[0]) == len(language_list):
51 | raise ValueError(
52 | "Currently, just one phoneme language per config file is supported !! Please split the dataset config into different configs and run it individually for each language !!"
53 | )
54 |
55 | phonemizer = Gruut(language=language_list[0], keep_puncs=True)
56 |
57 | phonemes = process_map(compute_phonemes, items, max_workers=multiprocessing.cpu_count(), chunksize=15)
58 | phones = []
59 | for ph in phonemes:
60 | phones.extend(ph)
61 |
62 | phones = set(phones)
63 | lower_phones = filter(lambda c: c.islower(), phones)
64 | phones_force_lower = [c.lower() for c in phones]
65 | phones_force_lower = set(phones_force_lower)
66 |
67 | print(f" > Number of unique phonemes: {len(phones)}")
68 | print(f" > Unique phonemes: {''.join(sorted(phones))}")
69 | print(f" > Unique lower phonemes: {''.join(sorted(lower_phones))}")
70 | print(f" > Unique all forced to lower phonemes: {''.join(sorted(phones_force_lower))}")
71 |
72 |
73 | if __name__ == "__main__":
74 | main()
75 |
--------------------------------------------------------------------------------
/TTS/bin/resample.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import glob
3 | import os
4 | from argparse import RawTextHelpFormatter
5 | from multiprocessing import Pool
6 | from shutil import copytree
7 |
8 | import librosa
9 | import soundfile as sf
10 | from tqdm import tqdm
11 |
12 |
13 | def resample_file(func_args):
14 | filename, output_sr = func_args
15 | y, sr = librosa.load(filename, sr=output_sr)
16 | sf.write(filename, y, sr)
17 |
18 |
19 | def resample_files(input_dir, output_sr, output_dir=None, file_ext="wav", n_jobs=10):
20 | if output_dir:
21 | print("Recursively copying the input folder...")
22 | copytree(input_dir, output_dir)
23 | input_dir = output_dir
24 |
25 | print("Resampling the audio files...")
26 | audio_files = glob.glob(os.path.join(input_dir, f"**/*.{file_ext}"), recursive=True)
27 | print(f"Found {len(audio_files)} files...")
28 | audio_files = list(zip(audio_files, len(audio_files) * [output_sr]))
29 | with Pool(processes=n_jobs) as p:
30 | with tqdm(total=len(audio_files)) as pbar:
31 | for _, _ in enumerate(p.imap_unordered(resample_file, audio_files)):
32 | pbar.update()
33 |
34 | print("Done !")
35 |
36 |
37 | if __name__ == "__main__":
38 | parser = argparse.ArgumentParser(
39 | description="""Resample a folder recursively with librosa
40 | Can be used in place or create a copy of the folder as an output.\n\n
41 | Example run:
42 | python TTS/bin/resample.py
43 | --input_dir /root/LJSpeech-1.1/
44 | --output_sr 22050
45 | --output_dir /root/resampled_LJSpeech-1.1/
46 | --file_ext wav
47 | --n_jobs 24
48 | """,
49 | formatter_class=RawTextHelpFormatter,
50 | )
51 |
52 | parser.add_argument(
53 | "--input_dir",
54 | type=str,
55 | default=None,
56 | required=True,
57 | help="Path of the folder containing the audio files to resample",
58 | )
59 |
60 | parser.add_argument(
61 | "--output_sr",
62 | type=int,
63 | default=22050,
64 | required=False,
65 | help="Samlple rate to which the audio files should be resampled",
66 | )
67 |
68 | parser.add_argument(
69 | "--output_dir",
70 | type=str,
71 | default=None,
72 | required=False,
73 | help="Path of the destination folder. If not defined, the operation is done in place",
74 | )
75 |
76 | parser.add_argument(
77 | "--file_ext",
78 | type=str,
79 | default="wav",
80 | required=False,
81 | help="Extension of the audio files to resample",
82 | )
83 |
84 | parser.add_argument(
85 | "--n_jobs", type=int, default=None, help="Number of threads to use, by default it uses all cores"
86 | )
87 |
88 | args = parser.parse_args()
89 |
90 | resample_files(args.input_dir, args.output_sr, args.output_dir, args.file_ext, args.n_jobs)
91 |
--------------------------------------------------------------------------------
/TTS/bin/train_tts.py:
--------------------------------------------------------------------------------
1 | import os
2 | from dataclasses import dataclass, field
3 |
4 | from trainer import Trainer, TrainerArgs
5 |
6 | from TTS.config import load_config, register_config
7 | from TTS.tts.datasets import load_tts_samples
8 | from TTS.tts.models import setup_model
9 |
10 |
11 | @dataclass
12 | class TrainTTSArgs(TrainerArgs):
13 | config_path: str = field(default=None, metadata={"help": "Path to the config file."})
14 |
15 |
16 | def main():
17 | """Run `tts` model training directly by a `config.json` file."""
18 | # init trainer args
19 | train_args = TrainTTSArgs()
20 | parser = train_args.init_argparse(arg_prefix="")
21 |
22 | # override trainer args from command-line args
23 | args, config_overrides = parser.parse_known_args()
24 | train_args.parse_args(args)
25 |
26 | # load config.json and register
27 | if args.config_path or args.continue_path:
28 | if args.config_path:
29 | # init from a file
30 | config = load_config(args.config_path)
31 | if len(config_overrides) > 0:
32 | config.parse_known_args(config_overrides, relaxed_parser=True)
33 | elif args.continue_path:
34 | # continue from a prev experiment
35 | config = load_config(os.path.join(args.continue_path, "config.json"))
36 | if len(config_overrides) > 0:
37 | config.parse_known_args(config_overrides, relaxed_parser=True)
38 | else:
39 | # init from console args
40 | from TTS.config.shared_configs import BaseTrainingConfig # pylint: disable=import-outside-toplevel
41 |
42 | config_base = BaseTrainingConfig()
43 | config_base.parse_known_args(config_overrides)
44 | config = register_config(config_base.model)()
45 |
46 | # load training samples
47 | train_samples, eval_samples = load_tts_samples(
48 | config.datasets,
49 | eval_split=True,
50 | eval_split_max_size=config.eval_split_max_size,
51 | eval_split_size=config.eval_split_size,
52 | )
53 |
54 | # init the model from config
55 | model = setup_model(config, train_samples + eval_samples)
56 |
57 | # init the trainer and 🚀
58 | trainer = Trainer(
59 | train_args,
60 | model.config,
61 | config.output_path,
62 | model=model,
63 | train_samples=train_samples,
64 | eval_samples=eval_samples,
65 | parse_command_line_args=False,
66 | )
67 | trainer.fit()
68 |
69 |
70 | if __name__ == "__main__":
71 | main()
72 |
--------------------------------------------------------------------------------
/TTS/bin/train_vocoder.py:
--------------------------------------------------------------------------------
1 | import os
2 | from dataclasses import dataclass, field
3 |
4 | from trainer import Trainer, TrainerArgs
5 |
6 | from TTS.config import load_config, register_config
7 | from TTS.utils.audio import AudioProcessor
8 | from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data
9 | from TTS.vocoder.models import setup_model
10 |
11 |
12 | @dataclass
13 | class TrainVocoderArgs(TrainerArgs):
14 | config_path: str = field(default=None, metadata={"help": "Path to the config file."})
15 |
16 |
17 | def main():
18 | """Run `tts` model training directly by a `config.json` file."""
19 | # init trainer args
20 | train_args = TrainVocoderArgs()
21 | parser = train_args.init_argparse(arg_prefix="")
22 |
23 | # override trainer args from command-line args
24 | args, config_overrides = parser.parse_known_args()
25 | train_args.parse_args(args)
26 |
27 | # load config.json and register
28 | if args.config_path or args.continue_path:
29 | if args.config_path:
30 | # init from a file
31 | config = load_config(args.config_path)
32 | if len(config_overrides) > 0:
33 | config.parse_known_args(config_overrides, relaxed_parser=True)
34 | elif args.continue_path:
35 | # continue from a prev experiment
36 | config = load_config(os.path.join(args.continue_path, "config.json"))
37 | if len(config_overrides) > 0:
38 | config.parse_known_args(config_overrides, relaxed_parser=True)
39 | else:
40 | # init from console args
41 | from TTS.config.shared_configs import BaseTrainingConfig # pylint: disable=import-outside-toplevel
42 |
43 | config_base = BaseTrainingConfig()
44 | config_base.parse_known_args(config_overrides)
45 | config = register_config(config_base.model)()
46 |
47 | # load training samples
48 | if "feature_path" in config and config.feature_path:
49 | # load pre-computed features
50 | print(f" > Loading features from: {config.feature_path}")
51 | eval_samples, train_samples = load_wav_feat_data(config.data_path, config.feature_path, config.eval_split_size)
52 | else:
53 | # load data raw wav files
54 | eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)
55 |
56 | # setup audio processor
57 | ap = AudioProcessor(**config.audio)
58 |
59 | # init the model from config
60 | model = setup_model(config)
61 |
62 | # init the trainer and 🚀
63 | trainer = Trainer(
64 | train_args,
65 | config,
66 | config.output_path,
67 | model=model,
68 | train_samples=train_samples,
69 | eval_samples=eval_samples,
70 | training_assets={"audio_processor": ap},
71 | parse_command_line_args=False,
72 | )
73 | trainer.fit()
74 |
75 |
76 | if __name__ == "__main__":
77 | main()
78 |
--------------------------------------------------------------------------------
/TTS/demos/xtts_ft_demo/requirements.txt:
--------------------------------------------------------------------------------
1 | faster_whisper==0.9.0
2 | gradio==4.7.1
--------------------------------------------------------------------------------
/TTS/encoder/README.md:
--------------------------------------------------------------------------------
1 | ### Speaker Encoder
2 |
3 | This is an implementation of https://arxiv.org/abs/1710.10467. This model can be used for voice and speaker embedding.
4 |
5 | With the code here you can generate d-vectors for both multi-speaker and single-speaker TTS datasets, then visualise and explore them along with the associated audio files in an interactive chart.
6 |
7 | Below is an example showing embedding results of various speakers. You can generate the same plot with the provided notebook as demonstrated in [this video](https://youtu.be/KW3oO7JVa7Q).
8 |
9 | 
10 |
11 | Download a pretrained model from the [Released Models](https://github.com/mozilla/TTS/wiki/Released-Models) page.
12 |
13 | To run the code, you need to follow the same flow as in TTS.
14 |
15 | - Define 'config.json' for your needs. Note that audio parameters should match your TTS model.
16 | - Example training call ```python speaker_encoder/train.py --config_path speaker_encoder/config.json --data_path ~/Data/Libri-TTS/train-clean-360```
17 | - Generate embedding vectors ```python speaker_encoder/compute_embeddings.py --use_cuda true /model/path/best_model.pth model/config/path/config.json dataset/path/ output_path``` . This code parses all .wav files at the given dataset path and generates the same folder structure under the output path with the generated embedding files.
18 | - Watch training on Tensorboard as in TTS
19 |
--------------------------------------------------------------------------------
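For reference, a minimal sketch (not part of the repository) of reproducing the UMAP chart mentioned in the README with the helper defined in `TTS/encoder/utils/visual.py`. The random embeddings are placeholders for real d-vectors produced by `compute_embeddings.py`, and `umap-learn` plus `matplotlib` must be installed.

```python
# Hypothetical illustration: random vectors stand in for real d-vectors.
import numpy as np

from TTS.encoder.utils.visual import plot_embeddings

num_classes, utter_per_class, dim = 8, 10, 256
embeddings = np.random.randn(num_classes * utter_per_class, dim)

fig = plot_embeddings(embeddings, num_classes_in_batch=num_classes)
fig.savefig("umap_demo.png")  # plot_embeddings also calls plt.savefig("umap") internally
```
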
/TTS/encoder/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/encoder/__init__.py
--------------------------------------------------------------------------------
/TTS/encoder/configs/base_encoder_config.py:
--------------------------------------------------------------------------------
1 | from dataclasses import asdict, dataclass, field
2 | from typing import Dict, List
3 |
4 | from coqpit import MISSING
5 |
6 | from TTS.config.shared_configs import BaseAudioConfig, BaseDatasetConfig, BaseTrainingConfig
7 |
8 |
9 | @dataclass
10 | class BaseEncoderConfig(BaseTrainingConfig):
11 | """Defines parameters for a Generic Encoder model."""
12 |
13 | model: str = None
14 | audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
15 | datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()])
16 | # model params
17 | model_params: Dict = field(
18 | default_factory=lambda: {
19 | "model_name": "lstm",
20 | "input_dim": 80,
21 | "proj_dim": 256,
22 | "lstm_dim": 768,
23 | "num_lstm_layers": 3,
24 | "use_lstm_with_projection": True,
25 | }
26 | )
27 |
28 | audio_augmentation: Dict = field(default_factory=lambda: {})
29 |
30 | # training params
31 | epochs: int = 10000
32 | loss: str = "angleproto"
33 | grad_clip: float = 3.0
34 | lr: float = 0.0001
35 | optimizer: str = "radam"
36 | optimizer_params: Dict = field(default_factory=lambda: {"betas": [0.9, 0.999], "weight_decay": 0})
37 | lr_decay: bool = False
38 | warmup_steps: int = 4000
39 |
40 | # logging params
41 | tb_model_param_stats: bool = False
42 | steps_plot_stats: int = 10
43 | save_step: int = 1000
44 | print_step: int = 20
45 | run_eval: bool = False
46 |
47 | # data loader
48 | num_classes_in_batch: int = MISSING
49 | num_utter_per_class: int = MISSING
50 | eval_num_classes_in_batch: int = None
51 | eval_num_utter_per_class: int = None
52 |
53 | num_loader_workers: int = MISSING
54 | voice_len: float = 1.6
55 |
56 | def check_values(self):
57 | super().check_values()
58 | c = asdict(self)
59 | assert (
60 | c["model_params"]["input_dim"] == self.audio.num_mels
61 | ), " [!] model input dimendion must be equal to melspectrogram dimension."
62 |
--------------------------------------------------------------------------------
/TTS/encoder/configs/emotion_encoder_config.py:
--------------------------------------------------------------------------------
1 | from dataclasses import asdict, dataclass
2 |
3 | from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig
4 |
5 |
6 | @dataclass
7 | class EmotionEncoderConfig(BaseEncoderConfig):
8 | """Defines parameters for Emotion Encoder model."""
9 |
10 | model: str = "emotion_encoder"
11 | map_classid_to_classname: dict = None
12 | class_name_key: str = "emotion_name"
13 |
--------------------------------------------------------------------------------
/TTS/encoder/configs/speaker_encoder_config.py:
--------------------------------------------------------------------------------
1 | from dataclasses import asdict, dataclass
2 |
3 | from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig
4 |
5 |
6 | @dataclass
7 | class SpeakerEncoderConfig(BaseEncoderConfig):
8 | """Defines parameters for Speaker Encoder model."""
9 |
10 | model: str = "speaker_encoder"
11 | class_name_key: str = "speaker_name"
12 |
--------------------------------------------------------------------------------
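A small usage sketch of the encoder configs above, assuming the default `BaseAudioConfig` (whose `num_mels` matches the default `input_dim` of 80). Only the `MISSING` data-loader fields are filled in; everything else keeps its defaults.

```python
# Minimal sketch: fill the MISSING loader fields, then validate the config.
from TTS.encoder.configs.speaker_encoder_config import SpeakerEncoderConfig

config = SpeakerEncoderConfig(
    num_classes_in_batch=32,
    num_utter_per_class=4,
    num_loader_workers=4,
)
# check_values() (defined in base_encoder_config.py) asserts that
# model_params["input_dim"] equals audio.num_mels.
config.check_values()
print(config.model, config.loss, config.lr)  # speaker_encoder angleproto 0.0001
```
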
/TTS/encoder/requirements.txt:
--------------------------------------------------------------------------------
1 | umap-learn
2 | numpy>=1.17.0
3 |
--------------------------------------------------------------------------------
/TTS/encoder/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/encoder/utils/__init__.py
--------------------------------------------------------------------------------
/TTS/encoder/utils/visual.py:
--------------------------------------------------------------------------------
1 | import matplotlib
2 | import matplotlib.pyplot as plt
3 | import numpy as np
4 | import umap
5 |
6 | matplotlib.use("Agg")
7 |
8 |
9 | colormap = (
10 | np.array(
11 | [
12 | [76, 255, 0],
13 | [0, 127, 70],
14 | [255, 0, 0],
15 | [255, 217, 38],
16 | [0, 135, 255],
17 | [165, 0, 165],
18 | [255, 167, 255],
19 | [0, 255, 255],
20 | [255, 96, 38],
21 | [142, 76, 0],
22 | [33, 0, 127],
23 | [0, 0, 0],
24 | [183, 183, 183],
25 | ],
26 | dtype=float,
27 | )
28 | / 255
29 | )
30 |
31 |
32 | def plot_embeddings(embeddings, num_classes_in_batch):
33 | num_utter_per_class = embeddings.shape[0] // num_classes_in_batch
34 |
35 | # if necessary get just the first 10 classes
36 | if num_classes_in_batch > 10:
37 | num_classes_in_batch = 10
38 | embeddings = embeddings[: num_classes_in_batch * num_utter_per_class]
39 |
40 | model = umap.UMAP()
41 | projection = model.fit_transform(embeddings)
42 | ground_truth = np.repeat(np.arange(num_classes_in_batch), num_utter_per_class)
43 | colors = [colormap[i] for i in ground_truth]
44 | fig, ax = plt.subplots(figsize=(16, 10))
45 | _ = ax.scatter(projection[:, 0], projection[:, 1], c=colors)
46 | plt.gca().set_aspect("equal", "datalim")
47 | plt.title("UMAP projection")
48 | plt.tight_layout()
49 | plt.savefig("umap")
50 | return fig
51 |
--------------------------------------------------------------------------------
/TTS/model.py:
--------------------------------------------------------------------------------
1 | from abc import abstractmethod
2 | from typing import Dict
3 |
4 | import torch
5 | from coqpit import Coqpit
6 | from trainer import TrainerModel
7 |
8 | # pylint: skip-file
9 |
10 |
11 | class BaseTrainerModel(TrainerModel):
12 | """BaseTrainerModel model expanding TrainerModel with required functions by 🐸TTS.
13 |
14 | Every new 🐸TTS model must inherit it.
15 | """
16 |
17 | @staticmethod
18 | @abstractmethod
19 | def init_from_config(config: Coqpit):
20 | """Init the model and all its attributes from the given config.
21 |
22 | Override this depending on your model.
23 | """
24 | ...
25 |
26 | @abstractmethod
27 | def inference(self, input: torch.Tensor, aux_input={}) -> Dict:
28 | """Forward pass for inference.
29 |
30 | It must return a dictionary with the main model output and all the auxiliary outputs. The key ```model_outputs```
31 | is considered to be the main output and you can add any other auxiliary outputs as you want.
32 |
33 | We don't use `*kwargs` since it is problematic with the TorchScript API.
34 |
35 | Args:
36 | input (torch.Tensor): [description]
37 | aux_input (Dict): Auxiliary inputs like speaker embeddings, durations etc.
38 |
39 | Returns:
40 | Dict: [description]
41 | """
42 | outputs_dict = {"model_outputs": None}
43 | ...
44 | return outputs_dict
45 |
46 | @abstractmethod
47 | def load_checkpoint(
48 | self, config: Coqpit, checkpoint_path: str, eval: bool = False, strict: bool = True, cache=False
49 | ) -> None:
50 | """Load a model checkpoint gile and get ready for training or inference.
51 |
52 | Args:
53 | config (Coqpit): Model configuration.
54 | checkpoint_path (str): Path to the model checkpoint file.
55 | eval (bool, optional): If true, init model for inference else for training. Defaults to False.
56 | strict (bool, optional): Match all checkpoint keys to model's keys. Defaults to True.
57 | cache (bool, optional): If True, cache the file locally for subsequent calls. It is cached under `get_user_data_dir()/tts_cache`. Defaults to False.
58 | """
59 | ...
60 |
--------------------------------------------------------------------------------
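To illustrate the contract above, a hedged sketch of a subclass: `ToyModel` and its single linear layer are hypothetical, and the rest of the `TrainerModel` interface (forward, train_step, and friends) is omitted.

```python
# Hypothetical minimal subclass showing only the three abstract methods.
from typing import Dict

import torch
from coqpit import Coqpit

from TTS.model import BaseTrainerModel


class ToyModel(BaseTrainerModel):
    def __init__(self, config: Coqpit):
        super().__init__()
        self.net = torch.nn.Linear(80, 80)

    @staticmethod
    def init_from_config(config: Coqpit):
        return ToyModel(config)

    def inference(self, input: torch.Tensor, aux_input={}) -> Dict:
        # "model_outputs" is the main output key expected by 🐸TTS.
        return {"model_outputs": self.net(input)}

    def load_checkpoint(self, config, checkpoint_path, eval=False, strict=True, cache=False) -> None:
        state = torch.load(checkpoint_path, map_location="cpu")
        self.load_state_dict(state["model"], strict=strict)
        if eval:
            self.eval()
```
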
/TTS/server/README.md:
--------------------------------------------------------------------------------
1 | # :frog: TTS demo server
2 | Before you use the server, make sure you [install](https://github.com/coqui-ai/TTS/tree/dev#install-tts) :frog: TTS properly. Then, you can follow the steps below.
3 |
4 | **Note:** If you install :frog:TTS using ```pip```, you can also use the ```tts-server``` end point on the terminal.
5 |
6 | Example runs:
7 |
8 | List officially released models.
9 | ```python TTS/server/server.py --list_models ```
10 |
11 | Run the server with the official models.
12 | ```python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/multiband-melgan```
13 |
14 | Run the server with the official models on a GPU.
15 | ```CUDA_VISIBLE_DEVICES="0" python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/multiband-melgan --use_cuda True```
16 |
17 | Run the server with custom models.
18 | ```python TTS/server/server.py --tts_checkpoint /path/to/tts/model.pth --tts_config /path/to/tts/config.json --vocoder_checkpoint /path/to/vocoder/model.pth --vocoder_config /path/to/vocoder/config.json```
19 |
--------------------------------------------------------------------------------
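Once a server started as above is running, it can be queried over HTTP. A hedged sketch follows, assuming the upstream `/api/tts` GET endpoint with a `text` query parameter (check `TTS/server/server.py` for the exact routes); the port matches the default `5002` in `conf.json`.

```python
# Assumed endpoint and parameter names; verify them against TTS/server/server.py.
import requests

resp = requests.get(
    "http://localhost:5002/api/tts",
    params={"text": "Hello from the demo server."},
    timeout=120,
)
resp.raise_for_status()
with open("out.wav", "wb") as f:
    f.write(resp.content)  # the server returns a wav payload
```
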
/TTS/server/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/server/__init__.py
--------------------------------------------------------------------------------
/TTS/server/conf.json:
--------------------------------------------------------------------------------
1 | {
2 | "tts_path":"/media/erogol/data_ssd/Models/libri_tts/5049/", // tts model root folder
3 | "tts_file":"best_model.pth", // tts checkpoint file
4 | "tts_config":"config.json", // tts config.json file
5 | "tts_speakers": null, // json file listing speaker ids. null if no speaker embedding.
6 | "vocoder_config":null,
7 | "vocoder_file": null,
8 | "is_wavernn_batched":true,
9 | "port": 5002,
10 | "use_cuda": true,
11 | "debug": true
12 | }
13 |
--------------------------------------------------------------------------------
/TTS/server/static/coqui-log-green-TTS.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/server/static/coqui-log-green-TTS.png
--------------------------------------------------------------------------------
/TTS/server/templates/details.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 | TTS engine
12 |
13 |
14 |
17 |
18 |
19 |
30 |
31 |
32 |
33 |
35 |
36 | {% if show_details == true %}
37 |
38 |
39 | Model details
40 |
41 |
42 |
43 |
44 | CLI arguments:
45 |
46 |
47 | CLI key |
48 | Value |
49 |
50 |
51 | {% for key, value in args.items() %}
52 |
53 |
54 | {{ key }} |
55 | {{ value }} |
56 |
57 |
58 | {% endfor %}
59 |
60 |
61 |
62 |
63 |
64 |
65 | {% if model_config != None %}
66 |
67 |
68 | Model config:
69 |
70 |
71 |
72 | Key |
73 | Value |
74 |
75 |
76 |
77 | {% for key, value in model_config.items() %}
78 |
79 |
80 | {{ key }} |
81 | {{ value }} |
82 |
83 |
84 | {% endfor %}
85 |
86 |
87 |
88 |
89 | {% endif %}
90 |
91 |
92 |
93 |
94 |
95 |
96 | {% if vocoder_config != None %}
97 |
98 | Vocoder model config:
99 |
100 |
101 |
102 | Key |
103 | Value |
104 |
105 |
106 |
107 | {% for key, value in vocoder_config.items() %}
108 |
109 |
110 | {{ key }} |
111 | {{ value }} |
112 |
113 |
114 | {% endfor %}
115 |
116 |
117 |
118 |
119 | {% endif %}
120 |
121 |
122 | {% else %}
123 |
124 | Please start server with --show_details=true to see details.
125 |
126 |
127 | {% endif %}
128 |
129 |
130 |
131 |
--------------------------------------------------------------------------------
/TTS/tts/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/tts/__init__.py
--------------------------------------------------------------------------------
/TTS/tts/configs/__init__.py:
--------------------------------------------------------------------------------
1 | import importlib
2 | import os
3 | from inspect import isclass
4 |
5 | # import all files under configs/
6 | # configs_dir = os.path.dirname(__file__)
7 | # for file in os.listdir(configs_dir):
8 | # path = os.path.join(configs_dir, file)
9 | # if not file.startswith("_") and not file.startswith(".") and (file.endswith(".py") or os.path.isdir(path)):
10 | # config_name = file[: file.find(".py")] if file.endswith(".py") else file
11 | # module = importlib.import_module("TTS.tts.configs." + config_name)
12 | # for attribute_name in dir(module):
13 | # attribute = getattr(module, attribute_name)
14 |
15 | # if isclass(attribute):
16 | # # Add the class to this package's variables
17 | # globals()[attribute_name] = attribute
18 |
--------------------------------------------------------------------------------
/TTS/tts/configs/tacotron2_config.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 |
3 | from TTS.tts.configs.tacotron_config import TacotronConfig
4 |
5 |
6 | @dataclass
7 | class Tacotron2Config(TacotronConfig):
8 | """Defines parameters for Tacotron2 based models.
9 |
10 | Example:
11 |
12 | >>> from TTS.tts.configs.tacotron2_config import Tacotron2Config
13 | >>> config = Tacotron2Config()
14 |
15 | Check `TacotronConfig` for argument descriptions.
16 | """
17 |
18 | model: str = "tacotron2"
19 | out_channels: int = 80
20 | encoder_in_features: int = 512
21 | decoder_in_features: int = 512
22 |
--------------------------------------------------------------------------------
/TTS/tts/layers/__init__.py:
--------------------------------------------------------------------------------
1 | from TTS.tts.layers.losses import *
2 |
--------------------------------------------------------------------------------
/TTS/tts/layers/align_tts/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/tts/layers/align_tts/__init__.py
--------------------------------------------------------------------------------
/TTS/tts/layers/align_tts/duration_predictor.py:
--------------------------------------------------------------------------------
1 | from torch import nn
2 |
3 | from TTS.tts.layers.generic.pos_encoding import PositionalEncoding
4 | from TTS.tts.layers.generic.transformer import FFTransformerBlock
5 |
6 |
7 | class DurationPredictor(nn.Module):
8 | def __init__(self, num_chars, hidden_channels, hidden_channels_ffn, num_heads):
9 | super().__init__()
10 | self.embed = nn.Embedding(num_chars, hidden_channels)
11 | self.pos_enc = PositionalEncoding(hidden_channels, dropout_p=0.1)
12 | self.FFT = FFTransformerBlock(hidden_channels, num_heads, hidden_channels_ffn, 2, 0.1)
13 | self.out_layer = nn.Conv1d(hidden_channels, 1, 1)
14 |
15 | def forward(self, text, text_lengths):
16 | # B, L -> B, L
17 | emb = self.embed(text)
18 | emb = self.pos_enc(emb.transpose(1, 2))
19 | x = self.FFT(emb, text_lengths)
20 | x = self.out_layer(x).squeeze(-1)
21 | return x
22 |
--------------------------------------------------------------------------------
/TTS/tts/layers/align_tts/mdn.py:
--------------------------------------------------------------------------------
1 | from torch import nn
2 |
3 |
4 | class MDNBlock(nn.Module):
5 | """Mixture of Density Network implementation
6 | https://arxiv.org/pdf/2003.01950.pdf
7 | """
8 |
9 | def __init__(self, in_channels, out_channels):
10 | super().__init__()
11 | self.out_channels = out_channels
12 | self.conv1 = nn.Conv1d(in_channels, in_channels, 1)
13 | self.norm = nn.LayerNorm(in_channels)
14 | self.relu = nn.ReLU()
15 | self.dropout = nn.Dropout(0.1)
16 | self.conv2 = nn.Conv1d(in_channels, out_channels, 1)
17 |
18 | def forward(self, x):
19 | o = self.conv1(x)
20 | o = o.transpose(1, 2)
21 | o = self.norm(o)
22 | o = o.transpose(1, 2)
23 | o = self.relu(o)
24 | o = self.dropout(o)
25 | mu_sigma = self.conv2(o)
26 | # TODO: check this sigmoid
27 | # mu = torch.sigmoid(mu_sigma[:, :self.out_channels//2, :])
28 | mu = mu_sigma[:, : self.out_channels // 2, :]
29 | log_sigma = mu_sigma[:, self.out_channels // 2 :, :]
30 | return mu, log_sigma
31 |
--------------------------------------------------------------------------------
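A short usage sketch of the MDN block above; the channel sizes are made up for illustration, and `out_channels` is twice the spectrogram dimension because the output is split into `mu` and `log_sigma`.

```python
# Illustrative shapes only; 128 hidden channels and 80 mels are assumptions.
import torch

from TTS.tts.layers.align_tts.mdn import MDNBlock

mdn = MDNBlock(in_channels=128, out_channels=2 * 80)
x = torch.randn(2, 128, 50)   # [B, C_in, T]
mu, log_sigma = mdn(x)        # each is [B, 80, T]
print(mu.shape, log_sigma.shape)
```
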
/TTS/tts/layers/bark/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/tts/layers/bark/__init__.py
--------------------------------------------------------------------------------
/TTS/tts/layers/bark/hubert/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/tts/layers/bark/hubert/__init__.py
--------------------------------------------------------------------------------
/TTS/tts/layers/bark/hubert/hubert_manager.py:
--------------------------------------------------------------------------------
1 | # From https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer
2 |
3 | import os.path
4 | import shutil
5 | import urllib.request
6 |
7 | import huggingface_hub
8 |
9 |
10 | class HubertManager:
11 | @staticmethod
12 | def make_sure_hubert_installed(
13 | download_url: str = "https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt", model_path: str = ""
14 | ):
15 | if not os.path.isfile(model_path):
16 | print("Downloading HuBERT base model")
17 | urllib.request.urlretrieve(download_url, model_path)
18 | print("Downloaded HuBERT")
19 | return model_path
20 | return None
21 |
22 | @staticmethod
23 | def make_sure_tokenizer_installed(
24 | model: str = "quantifier_hubert_base_ls960_14.pth",
25 | repo: str = "GitMylo/bark-voice-cloning",
26 | model_path: str = "",
27 | ):
28 | model_dir = os.path.dirname(model_path)
29 | if not os.path.isfile(model_path):
30 | print("Downloading HuBERT custom tokenizer")
31 | huggingface_hub.hf_hub_download(repo, model, local_dir=model_dir, local_dir_use_symlinks=False)
32 | shutil.move(os.path.join(model_dir, model), model_path)
33 | print("Downloaded tokenizer")
34 | return model_path
35 | return None
36 |
--------------------------------------------------------------------------------
/TTS/tts/layers/bark/hubert/kmeans_hubert.py:
--------------------------------------------------------------------------------
1 | """
2 | Modified HuBERT model without kmeans.
3 | Original author: https://github.com/lucidrains/
4 | Modified by: https://www.github.com/gitmylo/
5 | License: MIT
6 | """
7 |
8 | # Modified code from https://github.com/lucidrains/audiolm-pytorch/blob/main/audiolm_pytorch/hubert_kmeans.py
9 |
10 | import logging
11 | from pathlib import Path
12 |
13 | import torch
14 | from einops import pack, unpack
15 | from torch import nn
16 | from torchaudio.functional import resample
17 | from transformers import HubertModel
18 |
19 |
20 | def round_down_nearest_multiple(num, divisor):
21 | return num // divisor * divisor
22 |
23 |
24 | def curtail_to_multiple(t, mult, from_left=False):
25 | data_len = t.shape[-1]
26 | rounded_seq_len = round_down_nearest_multiple(data_len, mult)
27 | seq_slice = slice(None, rounded_seq_len) if not from_left else slice(-rounded_seq_len, None)
28 | return t[..., seq_slice]
29 |
30 |
31 | def exists(val):
32 | return val is not None
33 |
34 |
35 | def default(val, d):
36 | return val if exists(val) else d
37 |
38 |
39 | class CustomHubert(nn.Module):
40 | """
41 | checkpoint and kmeans can be downloaded at https://github.com/facebookresearch/fairseq/tree/main/examples/hubert
42 | or you can train your own
43 | """
44 |
45 | def __init__(self, checkpoint_path, target_sample_hz=16000, seq_len_multiple_of=None, output_layer=9, device=None):
46 | super().__init__()
47 | self.target_sample_hz = target_sample_hz
48 | self.seq_len_multiple_of = seq_len_multiple_of
49 | self.output_layer = output_layer
50 | if device is not None:
51 | self.to(device)
52 | self.model = HubertModel.from_pretrained("facebook/hubert-base-ls960")
53 | if device is not None:
54 | self.model.to(device)
55 | self.model.eval()
56 |
57 | @property
58 | def groups(self):
59 | return 1
60 |
61 | @torch.no_grad()
62 | def forward(self, wav_input, flatten=True, input_sample_hz=None):
63 | device = wav_input.device
64 |
65 | if exists(input_sample_hz):
66 | wav_input = resample(wav_input, input_sample_hz, self.target_sample_hz)
67 |
68 | if exists(self.seq_len_multiple_of):
69 | wav_input = curtail_to_multiple(wav_input, self.seq_len_multiple_of)
70 |
71 | outputs = self.model.forward(
72 | wav_input,
73 | output_hidden_states=True,
74 | )
75 | embed = outputs["hidden_states"][self.output_layer]
76 | embed, packed_shape = pack([embed], "* d")
77 | codebook_indices = torch.from_numpy(embed.cpu().detach().numpy()).to(device)
78 | if flatten:
79 | return codebook_indices
80 |
81 | (codebook_indices,) = unpack(codebook_indices, packed_shape, "*")
82 | return codebook_indices
83 |
--------------------------------------------------------------------------------
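A usage sketch for the wrapper above; it downloads `facebook/hubert-base-ls960` from the Hugging Face Hub on first use, and the one-second random waveform is a placeholder for real 16 kHz audio. Note that, despite the name, the returned "codebook indices" are continuous hidden-state features from the chosen output layer.

```python
# Sketch: random audio stands in for a real 16 kHz waveform.
import torch

from TTS.tts.layers.bark.hubert.kmeans_hubert import CustomHubert

hubert = CustomHubert(checkpoint_path=None)    # weights come from the HF Hub, not this path
wav = torch.randn(1, 16000)                    # [B, samples] at 16 kHz
features = hubert(wav, input_sample_hz=16000)  # flattened hidden states from layer 9
print(features.shape)                          # roughly [num_frames, 768]
```
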
/TTS/tts/layers/delightful_tts/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/tts/layers/delightful_tts/__init__.py
--------------------------------------------------------------------------------
/TTS/tts/layers/delightful_tts/phoneme_prosody_predictor.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn # pylint: disable=consider-using-from-import
3 |
4 | from TTS.tts.layers.delightful_tts.conv_layers import ConvTransposed
5 |
6 |
7 | class PhonemeProsodyPredictor(nn.Module):
8 | """Non-parallel Prosody Predictor inspired by: https://arxiv.org/pdf/2102.00851.pdf
9 | It consists of 2 layers of 1D convolutions, each followed by a leaky ReLU activation, layer norm
10 | and dropout, then finally a linear layer.
11 |
12 | Args:
13 | hidden_size (int): Size of hidden channels.
14 | kernel_size (int): Kernel size for the conv layers.
15 | dropout: (float): Probability of dropout.
16 | bottleneck_size (int): bottleneck size for last linear layer.
17 | lrelu_slope (float): Slope of the leaky relu.
18 | """
19 |
20 | def __init__(
21 | self,
22 | hidden_size: int,
23 | kernel_size: int,
24 | dropout: float,
25 | bottleneck_size: int,
26 | lrelu_slope: float,
27 | ):
28 | super().__init__()
29 | self.d_model = hidden_size
30 | self.layers = nn.ModuleList(
31 | [
32 | ConvTransposed(
33 | self.d_model,
34 | self.d_model,
35 | kernel_size=kernel_size,
36 | padding=(kernel_size - 1) // 2,
37 | ),
38 | nn.LeakyReLU(lrelu_slope),
39 | nn.LayerNorm(self.d_model),
40 | nn.Dropout(dropout),
41 | ConvTransposed(
42 | self.d_model,
43 | self.d_model,
44 | kernel_size=kernel_size,
45 | padding=(kernel_size - 1) // 2,
46 | ),
47 | nn.LeakyReLU(lrelu_slope),
48 | nn.LayerNorm(self.d_model),
49 | nn.Dropout(dropout),
50 | ]
51 | )
52 | self.predictor_bottleneck = nn.Linear(self.d_model, bottleneck_size)
53 |
54 | def forward(self, x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
55 | """
56 | Shapes:
57 | x: :math: `[B, T, D]`
58 | mask: :math: `[B, T]`
59 | """
60 | mask = mask.unsqueeze(2)
61 | for layer in self.layers:
62 | x = layer(x)
63 | x = x.masked_fill(mask, 0.0)
64 | x = self.predictor_bottleneck(x)
65 | return x
66 |
--------------------------------------------------------------------------------
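A shape-level usage sketch of the prosody predictor above; the sizes are illustrative and the boolean mask marks positions to be zeroed.

```python
# Illustrative sizes; True entries in the mask are zeroed in the output.
import torch

from TTS.tts.layers.delightful_tts.phoneme_prosody_predictor import PhonemeProsodyPredictor

predictor = PhonemeProsodyPredictor(
    hidden_size=256, kernel_size=5, dropout=0.1, bottleneck_size=32, lrelu_slope=0.3
)
x = torch.randn(2, 40, 256)                  # [B, T, D]
mask = torch.zeros(2, 40, dtype=torch.bool)  # [B, T], True = masked position
out = predictor(x, mask)                     # [B, T, bottleneck_size]
print(out.shape)
```
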
/TTS/tts/layers/delightful_tts/variance_predictor.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn # pylint: disable=consider-using-from-import
3 |
4 | from TTS.tts.layers.delightful_tts.conv_layers import ConvTransposed
5 |
6 |
7 | class VariancePredictor(nn.Module):
8 | """
9 | The network is two 1D convolution layers, each followed by a leaky ReLU
10 | activation, layer normalization and dropout, with a final linear layer that
11 | projects the hidden states into the output sequence.
12 |
13 | Args:
14 | channels_in (int): Number of in channels for conv layers.
15 | channels_out (int): Number of out channels for the last linear layer.
16 | kernel_size (int): Size the kernel for the conv layers.
17 | p_dropout (float): Probability of dropout.
18 | lrelu_slope (float): Slope for the leaky relu.
19 |
20 | Inputs: inputs, mask
21 | - **inputs** (batch, time, dim): Tensor containing input vector
22 | - **mask** (batch, time): Tensor containing indices to be masked
23 | Returns:
24 | - **outputs** (batch, time): Tensor produced by last linear layer.
25 | """
26 |
27 | def __init__(
28 | self, channels_in: int, channels: int, channels_out: int, kernel_size: int, p_dropout: float, lrelu_slope: float
29 | ):
30 | super().__init__()
31 |
32 | self.layers = nn.ModuleList(
33 | [
34 | ConvTransposed(
35 | channels_in,
36 | channels,
37 | kernel_size=kernel_size,
38 | padding=(kernel_size - 1) // 2,
39 | ),
40 | nn.LeakyReLU(lrelu_slope),
41 | nn.LayerNorm(channels),
42 | nn.Dropout(p_dropout),
43 | ConvTransposed(
44 | channels,
45 | channels,
46 | kernel_size=kernel_size,
47 | padding=(kernel_size - 1) // 2,
48 | ),
49 | nn.LeakyReLU(lrelu_slope),
50 | nn.LayerNorm(channels),
51 | nn.Dropout(p_dropout),
52 | ]
53 | )
54 |
55 | self.linear_layer = nn.Linear(channels, channels_out)
56 |
57 | def forward(self, x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
58 | """
59 | Shapes:
60 | x: :math: `[B, T_src, C]`
61 | mask: :math: `[B, T_src]`
62 | """
63 | for layer in self.layers:
64 | x = layer(x)
65 | x = self.linear_layer(x)
66 | x = x.squeeze(-1)
67 | x = x.masked_fill(mask, 0.0)
68 | return x
69 |
--------------------------------------------------------------------------------
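A shape-level usage sketch of the variance predictor above (e.g. as a pitch or energy predictor); the channel sizes are illustrative.

```python
# Illustrative sizes; True entries in the mask are zeroed in the output.
import torch

from TTS.tts.layers.delightful_tts.variance_predictor import VariancePredictor

predictor = VariancePredictor(
    channels_in=256, channels=256, channels_out=1,
    kernel_size=5, p_dropout=0.1, lrelu_slope=0.3,
)
x = torch.randn(2, 40, 256)                  # [B, T_src, C]
mask = torch.zeros(2, 40, dtype=torch.bool)  # [B, T_src], True = padded
out = predictor(x, mask)                     # [B, T_src]
print(out.shape)
```
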
/TTS/tts/layers/feed_forward/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/tts/layers/feed_forward/__init__.py
--------------------------------------------------------------------------------
/TTS/tts/layers/feed_forward/duration_predictor.py:
--------------------------------------------------------------------------------
1 | from torch import nn
2 |
3 | from TTS.tts.layers.generic.res_conv_bn import Conv1dBN
4 |
5 |
6 | class DurationPredictor(nn.Module):
7 | """Speedy Speech duration predictor model.
8 | Predicts phoneme durations from encoder outputs.
9 |
10 | Note:
11 | Outputs interpreted as log(durations)
12 | To get actual durations, do exp transformation
13 |
14 | conv_BN_4x1 -> conv_BN_3x1 -> conv_BN_1x1 -> conv_1x1
15 |
16 | Args:
17 | hidden_channels (int): number of channels in the inner layers.
18 | """
19 |
20 | def __init__(self, hidden_channels):
21 | super().__init__()
22 |
23 | self.layers = nn.ModuleList(
24 | [
25 | Conv1dBN(hidden_channels, hidden_channels, 4, 1),
26 | Conv1dBN(hidden_channels, hidden_channels, 3, 1),
27 | Conv1dBN(hidden_channels, hidden_channels, 1, 1),
28 | nn.Conv1d(hidden_channels, 1, 1),
29 | ]
30 | )
31 |
32 | def forward(self, x, x_mask):
33 | """
34 | Shapes:
35 | x: [B, C, T]
36 | x_mask: [B, 1, T]
37 | """
38 | o = x
39 | for layer in self.layers:
40 | o = layer(o) * x_mask
41 | return o
42 |
--------------------------------------------------------------------------------
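A usage sketch for the duration predictor above; the sizes are illustrative, and the `exp` follows the note in the docstring that the outputs are log-durations.

```python
# Illustrative sizes; outputs are log-durations per the docstring above.
import torch

from TTS.tts.layers.feed_forward.duration_predictor import DurationPredictor

predictor = DurationPredictor(hidden_channels=128)
x = torch.randn(2, 128, 50)           # [B, C, T] encoder outputs
x_mask = torch.ones(2, 1, 50)         # [B, 1, T]
log_durations = predictor(x, x_mask)  # [B, 1, T]
durations = torch.exp(log_durations)  # convert log-durations to durations
print(durations.shape)
```
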
/TTS/tts/layers/generic/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/tts/layers/generic/__init__.py
--------------------------------------------------------------------------------
/TTS/tts/layers/generic/gated_conv.py:
--------------------------------------------------------------------------------
1 | from torch import nn
2 |
3 | from .normalization import LayerNorm
4 |
5 |
6 | class GatedConvBlock(nn.Module):
7 | """Gated convolutional block as in https://arxiv.org/pdf/1612.08083.pdf
8 | Args:
9 | in_out_channels (int): number of input/output channels.
10 | kernel_size (int): convolution kernel size.
11 | dropout_p (float): dropout rate.
12 | """
13 |
14 | def __init__(self, in_out_channels, kernel_size, dropout_p, num_layers):
15 | super().__init__()
16 | # class arguments
17 | self.dropout_p = dropout_p
18 | self.num_layers = num_layers
19 | # define layers
20 | self.conv_layers = nn.ModuleList()
21 | self.norm_layers = nn.ModuleList()
22 | self.layers = nn.ModuleList()
23 | for _ in range(num_layers):
24 | self.conv_layers += [nn.Conv1d(in_out_channels, 2 * in_out_channels, kernel_size, padding=kernel_size // 2)]
25 | self.norm_layers += [LayerNorm(2 * in_out_channels)]
26 |
27 | def forward(self, x, x_mask):
28 | o = x
29 | res = x
30 | for idx in range(self.num_layers):
31 | o = nn.functional.dropout(o, p=self.dropout_p, training=self.training)
32 | o = self.conv_layers[idx](o * x_mask)
33 | o = self.norm_layers[idx](o)
34 | o = nn.functional.glu(o, dim=1)
35 | o = res + o
36 | res = o
37 | return o
38 |
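
A short usage sketch (not part of the source file); each layer doubles the channels with a 1D convolution, gates them back down with a GLU, and adds a residual, so the channel count is preserved. The sizes below are arbitrary:

import torch

from TTS.tts.layers.generic.gated_conv import GatedConvBlock

x = torch.randn(2, 64, 100)      # [B, C, T]
x_mask = torch.ones(2, 1, 100)   # [B, 1, T]

block = GatedConvBlock(in_out_channels=64, kernel_size=3, dropout_p=0.1, num_layers=4)
y = block(x, x_mask)             # [B, 64, 100]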
--------------------------------------------------------------------------------
/TTS/tts/layers/generic/pos_encoding.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 | import torch
4 | from torch import nn
5 |
6 |
7 | class PositionalEncoding(nn.Module):
8 | """Sinusoidal positional encoding for non-recurrent neural networks.
9 | Implementation based on "Attention Is All You Need"
10 |
11 | Args:
12 | channels (int): embedding size
13 | dropout_p (float): dropout rate applied to the output.
14 | max_len (int): maximum sequence length.
15 | use_scale (bool): whether to use a learnable scaling coefficient.
16 | """
17 |
18 | def __init__(self, channels, dropout_p=0.0, max_len=5000, use_scale=False):
19 | super().__init__()
20 | if channels % 2 != 0:
21 | raise ValueError(
22 | "Cannot use sin/cos positional encoding with " "odd channels (got channels={:d})".format(channels)
23 | )
24 | self.use_scale = use_scale
25 | if use_scale:
26 | self.scale = torch.nn.Parameter(torch.ones(1))
27 | pe = torch.zeros(max_len, channels)
28 | position = torch.arange(0, max_len).unsqueeze(1)
29 | div_term = torch.pow(10000, torch.arange(0, channels, 2).float() / channels)
30 | pe[:, 0::2] = torch.sin(position.float() * div_term)
31 | pe[:, 1::2] = torch.cos(position.float() * div_term)
32 | pe = pe.unsqueeze(0).transpose(1, 2)
33 | self.register_buffer("pe", pe)
34 | if dropout_p > 0:
35 | self.dropout = nn.Dropout(p=dropout_p)
36 | self.channels = channels
37 |
38 | def forward(self, x, mask=None, first_idx=None, last_idx=None):
39 | """
40 | Shapes:
41 | x: [B, C, T]
42 | mask: [B, 1, T]
43 | first_idx: int
44 | last_idx: int
45 | """
46 |
47 | x = x * math.sqrt(self.channels)
48 | if first_idx is None:
49 | if self.pe.size(2) < x.size(2):
50 | raise RuntimeError(
51 | f"Sequence is {x.size(2)} but PositionalEncoding is"
52 | f" limited to {self.pe.size(2)}. See max_len argument."
53 | )
54 | if mask is not None:
55 | pos_enc = self.pe[:, :, : x.size(2)] * mask
56 | else:
57 | pos_enc = self.pe[:, :, : x.size(2)]
58 | if self.use_scale:
59 | x = x + self.scale * pos_enc
60 | else:
61 | x = x + pos_enc
62 | else:
63 | if self.use_scale:
64 | x = x + self.scale * self.pe[:, :, first_idx:last_idx]
65 | else:
66 | x = x + self.pe[:, :, first_idx:last_idx]
67 | if hasattr(self, "dropout"):
68 | x = self.dropout(x)
69 | return x
70 |
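
A usage sketch (not part of the source file); the channel count must be even, and the optional mask zeroes the encodings at padded positions:

import torch

from TTS.tts.layers.generic.pos_encoding import PositionalEncoding

x = torch.randn(2, 128, 50)                           # [B, C, T]
pos_encoder = PositionalEncoding(channels=128, dropout_p=0.1)
y = pos_encoder(x)                                    # scaled input + sinusoidal encodings, same shape
y_masked = pos_encoder(x, mask=torch.ones(2, 1, 50))  # encodings zeroed where the mask is 0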
--------------------------------------------------------------------------------
/TTS/tts/layers/generic/time_depth_sep_conv.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 |
4 |
5 | class TimeDepthSeparableConv(nn.Module):
6 | """Time depth separable convolution as in https://arxiv.org/pdf/1904.02619.pdf
7 |     It shows competitive results with less computation and a smaller memory footprint."""
8 |
9 | def __init__(self, in_channels, hid_channels, out_channels, kernel_size, bias=True):
10 | super().__init__()
11 |
12 | self.in_channels = in_channels
13 | self.out_channels = out_channels
14 | self.hid_channels = hid_channels
15 | self.kernel_size = kernel_size
16 |
17 | self.time_conv = nn.Conv1d(
18 | in_channels,
19 | 2 * hid_channels,
20 | kernel_size=1,
21 | stride=1,
22 | padding=0,
23 | bias=bias,
24 | )
25 | self.norm1 = nn.BatchNorm1d(2 * hid_channels)
26 | self.depth_conv = nn.Conv1d(
27 | hid_channels,
28 | hid_channels,
29 | kernel_size,
30 | stride=1,
31 | padding=(kernel_size - 1) // 2,
32 | groups=hid_channels,
33 | bias=bias,
34 | )
35 | self.norm2 = nn.BatchNorm1d(hid_channels)
36 | self.time_conv2 = nn.Conv1d(
37 | hid_channels,
38 | out_channels,
39 | kernel_size=1,
40 | stride=1,
41 | padding=0,
42 | bias=bias,
43 | )
44 | self.norm3 = nn.BatchNorm1d(out_channels)
45 |
46 | def forward(self, x):
47 | x_res = x
48 | x = self.time_conv(x)
49 | x = self.norm1(x)
50 | x = nn.functional.glu(x, dim=1)
51 | x = self.depth_conv(x)
52 | x = self.norm2(x)
53 | x = x * torch.sigmoid(x)
54 | x = self.time_conv2(x)
55 | x = self.norm3(x)
56 | x = x_res + x
57 | return x
58 |
59 |
60 | class TimeDepthSeparableConvBlock(nn.Module):
61 | def __init__(self, in_channels, hid_channels, out_channels, num_layers, kernel_size, bias=True):
62 | super().__init__()
63 | assert (kernel_size - 1) % 2 == 0
64 | assert num_layers > 1
65 |
66 | self.layers = nn.ModuleList()
67 | layer = TimeDepthSeparableConv(
68 | in_channels, hid_channels, out_channels if num_layers == 1 else hid_channels, kernel_size, bias
69 | )
70 | self.layers.append(layer)
71 | for idx in range(num_layers - 1):
72 | layer = TimeDepthSeparableConv(
73 | hid_channels,
74 | hid_channels,
75 | out_channels if (idx + 1) == (num_layers - 1) else hid_channels,
76 | kernel_size,
77 | bias,
78 | )
79 | self.layers.append(layer)
80 |
81 | def forward(self, x, mask):
82 | for layer in self.layers:
83 | x = layer(x * mask)
84 | return x
85 |
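
A usage sketch (not part of the source file). Every layer ends with a residual addition (`x_res + x`), so the sketch keeps `in_channels == hid_channels == out_channels` to make the shapes line up; the kernel size must be odd:

import torch

from TTS.tts.layers.generic.time_depth_sep_conv import TimeDepthSeparableConvBlock

x = torch.randn(2, 128, 120)    # [B, C, T]
mask = torch.ones(2, 1, 120)    # [B, 1, T]

block = TimeDepthSeparableConvBlock(
    in_channels=128, hid_channels=128, out_channels=128, num_layers=3, kernel_size=5
)
y = block(x, mask)              # [B, 128, 120]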
--------------------------------------------------------------------------------
/TTS/tts/layers/glow_tts/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/tts/layers/glow_tts/__init__.py
--------------------------------------------------------------------------------
/TTS/tts/layers/glow_tts/duration_predictor.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 |
4 | from ..generic.normalization import LayerNorm
5 |
6 |
7 | class DurationPredictor(nn.Module):
8 | """Glow-TTS duration prediction model.
9 |
10 | ::
11 |
12 | [2 x (conv1d_kxk -> relu -> layer_norm -> dropout)] -> conv1d_1x1 -> durs
13 |
14 | Args:
15 | in_channels (int): Number of channels of the input tensor.
16 | hidden_channels (int): Number of hidden channels of the network.
17 | kernel_size (int): Kernel size for the conv layers.
18 | dropout_p (float): Dropout rate used after each conv layer.
19 | """
20 |
21 | def __init__(self, in_channels, hidden_channels, kernel_size, dropout_p, cond_channels=None, language_emb_dim=None):
22 | super().__init__()
23 |
24 | # add language embedding dim in the input
25 | if language_emb_dim:
26 | in_channels += language_emb_dim
27 |
28 | # class arguments
29 | self.in_channels = in_channels
30 | self.filter_channels = hidden_channels
31 | self.kernel_size = kernel_size
32 | self.dropout_p = dropout_p
33 | # layers
34 | self.drop = nn.Dropout(dropout_p)
35 | self.conv_1 = nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size // 2)
36 | self.norm_1 = LayerNorm(hidden_channels)
37 | self.conv_2 = nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size // 2)
38 | self.norm_2 = LayerNorm(hidden_channels)
39 | # output layer
40 | self.proj = nn.Conv1d(hidden_channels, 1, 1)
41 | if cond_channels is not None and cond_channels != 0:
42 | self.cond = nn.Conv1d(cond_channels, in_channels, 1)
43 |
44 | if language_emb_dim != 0 and language_emb_dim is not None:
45 | self.cond_lang = nn.Conv1d(language_emb_dim, in_channels, 1)
46 |
47 | def forward(self, x, x_mask, g=None, lang_emb=None):
48 | """
49 | Shapes:
50 | - x: :math:`[B, C, T]`
51 | - x_mask: :math:`[B, 1, T]`
52 | - g: :math:`[B, C, 1]`
53 | """
54 | if g is not None:
55 | x = x + self.cond(g)
56 |
57 | if lang_emb is not None:
58 | x = x + self.cond_lang(lang_emb)
59 |
60 | x = self.conv_1(x * x_mask)
61 | x = torch.relu(x)
62 | x = self.norm_1(x)
63 | x = self.drop(x)
64 | x = self.conv_2(x * x_mask)
65 | x = torch.relu(x)
66 | x = self.norm_2(x)
67 | x = self.drop(x)
68 | x = self.proj(x * x_mask)
69 | return x * x_mask
70 |
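
A usage sketch (not part of the source file); the channel sizes are arbitrary, and the second predictor shows the optional speaker conditioning through `cond_channels`:

import torch

from TTS.tts.layers.glow_tts.duration_predictor import DurationPredictor

x = torch.randn(2, 192, 60)    # encoder outputs [B, C, T]
x_mask = torch.ones(2, 1, 60)  # [B, 1, T]

dp = DurationPredictor(in_channels=192, hidden_channels=256, kernel_size=3, dropout_p=0.1)
log_durations = dp(x, x_mask)  # [B, 1, T]

dp_cond = DurationPredictor(192, 256, 3, 0.1, cond_channels=256)
g = torch.randn(2, 256, 1)     # speaker embedding [B, C, 1]
log_durations_cond = dp_cond(x, x_mask, g=g)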
--------------------------------------------------------------------------------
/TTS/tts/layers/overflow/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/tts/layers/overflow/__init__.py
--------------------------------------------------------------------------------
/TTS/tts/layers/overflow/decoder.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 |
4 | from TTS.tts.layers.glow_tts.decoder import Decoder as GlowDecoder
5 | from TTS.tts.utils.helpers import sequence_mask
6 |
7 |
8 | class Decoder(nn.Module):
9 | """Uses glow decoder with some modifications.
10 | ::
11 |
12 | Squeeze -> ActNorm -> InvertibleConv1x1 -> AffineCoupling -> Unsqueeze
13 |
14 | Args:
15 | in_channels (int): channels of input tensor.
16 | hidden_channels (int): hidden decoder channels.
17 | kernel_size (int): Coupling block kernel size. (Wavenet filter kernel size.)
18 | dilation_rate (int): rate to increase dilation by each layer in a decoder block.
19 | num_flow_blocks (int): number of decoder blocks.
20 | num_coupling_layers (int): number coupling layers. (number of wavenet layers.)
21 | dropout_p (float): wavenet dropout rate.
22 | sigmoid_scale (bool): enable/disable sigmoid scaling in coupling layer.
23 | """
24 |
25 | def __init__(
26 | self,
27 | in_channels,
28 | hidden_channels,
29 | kernel_size,
30 | dilation_rate,
31 | num_flow_blocks,
32 | num_coupling_layers,
33 | dropout_p=0.0,
34 | num_splits=4,
35 | num_squeeze=2,
36 | sigmoid_scale=False,
37 | c_in_channels=0,
38 | ):
39 | super().__init__()
40 |
41 | self.glow_decoder = GlowDecoder(
42 | in_channels,
43 | hidden_channels,
44 | kernel_size,
45 | dilation_rate,
46 | num_flow_blocks,
47 | num_coupling_layers,
48 | dropout_p,
49 | num_splits,
50 | num_squeeze,
51 | sigmoid_scale,
52 | c_in_channels,
53 | )
54 | self.n_sqz = num_squeeze
55 |
56 | def forward(self, x, x_len, g=None, reverse=False):
57 | """
58 | Input shapes:
59 | - x: :math:`[B, C, T]`
60 | - x_len :math:`[B]`
61 | - g: :math:`[B, C]`
62 |
63 | Output shapes:
64 | - x: :math:`[B, C, T]`
65 | - x_len :math:`[B]`
66 |             - logdet_tot :math:`[B]`
67 | """
68 | x, x_len, x_max_len = self.preprocess(x, x_len, x_len.max())
69 | x_mask = torch.unsqueeze(sequence_mask(x_len, x_max_len), 1).to(x.dtype)
70 | x, logdet_tot = self.glow_decoder(x, x_mask, g, reverse)
71 | return x, x_len, logdet_tot
72 |
73 | def preprocess(self, y, y_lengths, y_max_length):
74 | if y_max_length is not None:
75 | y_max_length = torch.div(y_max_length, self.n_sqz, rounding_mode="floor") * self.n_sqz
76 | y = y[:, :, :y_max_length]
77 | y_lengths = torch.div(y_lengths, self.n_sqz, rounding_mode="floor") * self.n_sqz
78 | return y, y_lengths, y_max_length
79 |
80 | def store_inverse(self):
81 | self.glow_decoder.store_inverse()
82 |
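
A usage sketch (not part of the source file); the hyper-parameters are arbitrary example values, not the OverFlow defaults. The forward direction maps mel frames to latents, and `reverse=True` decodes latents back (time lengths are floored to a multiple of `num_squeeze`):

import torch

from TTS.tts.layers.overflow.decoder import Decoder

decoder = Decoder(
    in_channels=80,         # mel channels
    hidden_channels=192,
    kernel_size=5,
    dilation_rate=1,
    num_flow_blocks=12,
    num_coupling_layers=4,
)

mel = torch.randn(2, 80, 100)              # [B, C, T]
mel_len = torch.tensor([100, 80])
z, z_len, logdet = decoder(mel, mel_len)   # encode mels into latents
mel_rec, _, _ = decoder(z, z_len, reverse=True)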
--------------------------------------------------------------------------------
/TTS/tts/layers/overflow/plotting_utils.py:
--------------------------------------------------------------------------------
1 | from typing import Any
2 |
3 | import matplotlib.pyplot as plt
4 | import numpy as np
5 | import torch
6 |
7 |
8 | def validate_numpy_array(value: Any):
9 | r"""
10 |     Validates the input and makes sure it returns a numpy array (i.e. on CPU)
11 |
12 | Args:
13 | value (Any): the input value
14 |
15 | Raises:
16 | TypeError: if the value is not a numpy array or torch tensor
17 |
18 | Returns:
19 | np.ndarray: numpy array of the value
20 | """
21 | if isinstance(value, np.ndarray):
22 | pass
23 | elif isinstance(value, list):
24 | value = np.array(value)
25 | elif torch.is_tensor(value):
26 | value = value.cpu().numpy()
27 | else:
28 | raise TypeError("Value must be a numpy array, a torch tensor or a list")
29 |
30 | return value
31 |
32 |
33 | def get_spec_from_most_probable_state(log_alpha_scaled, means, decoder=None):
34 | """Get the most probable state means from the log_alpha_scaled.
35 |
36 | Args:
37 | log_alpha_scaled (torch.Tensor): Log alpha scaled values.
38 | - Shape: :math:`(T, N)`
39 | means (torch.Tensor): Means of the states.
40 | - Shape: :math:`(N, T, D_out)`
41 | decoder (torch.nn.Module): Decoder module to decode the latent to melspectrogram. Defaults to None.
42 | """
43 | max_state_numbers = torch.max(log_alpha_scaled, dim=1)[1]
44 | max_len = means.shape[0]
45 | n_mel_channels = means.shape[2]
46 | max_state_numbers = max_state_numbers.unsqueeze(1).unsqueeze(1).expand(max_len, 1, n_mel_channels)
47 | means = torch.gather(means, 1, max_state_numbers).squeeze(1).to(log_alpha_scaled.dtype)
48 | if decoder is not None:
49 | mel = (
50 | decoder(means.T.unsqueeze(0), torch.tensor([means.shape[0]], device=means.device), reverse=True)[0]
51 | .squeeze(0)
52 | .T
53 | )
54 | else:
55 | mel = means
56 | return mel
57 |
58 |
59 | def plot_transition_probabilities_to_numpy(states, transition_probabilities, output_fig=False):
60 |     """Generates a transition probabilities plot for the states and the probability of transition.
61 |
62 | Args:
63 | states (torch.IntTensor): the states
64 | transition_probabilities (torch.FloatTensor): the transition probabilities
65 | """
66 | states = validate_numpy_array(states)
67 | transition_probabilities = validate_numpy_array(transition_probabilities)
68 |
69 | fig, ax = plt.subplots(figsize=(30, 3))
70 | ax.plot(transition_probabilities, "o")
71 | ax.set_title("Transition probability of state")
72 | ax.set_xlabel("hidden state")
73 | ax.set_ylabel("probability")
74 | ax.set_xticks([i for i in range(len(transition_probabilities))]) # pylint: disable=unnecessary-comprehension
75 | ax.set_xticklabels([int(x) for x in states], rotation=90)
76 | plt.tight_layout()
77 | if not output_fig:
78 | plt.close()
79 | return fig
80 |
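
A usage sketch (not part of the source file); `output_fig=True` keeps the matplotlib figure open so it can be logged or saved by the caller:

import torch

from TTS.tts.layers.overflow.plotting_utils import (
    plot_transition_probabilities_to_numpy,
    validate_numpy_array,
)

states = torch.arange(10)
transition_probs = torch.sigmoid(torch.randn(10))

probs_np = validate_numpy_array(transition_probs)  # torch tensor -> numpy array
fig = plot_transition_probabilities_to_numpy(states, transition_probs, output_fig=True)
fig.savefig("transition_probs.png")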
--------------------------------------------------------------------------------
/TTS/tts/layers/tacotron/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/tts/layers/tacotron/__init__.py
--------------------------------------------------------------------------------
/TTS/tts/layers/tortoise/random_latent_generator.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 | import torch
4 | import torch.nn as nn
5 | import torch.nn.functional as F
6 |
7 |
8 | def fused_leaky_relu(input, bias=None, negative_slope=0.2, scale=2**0.5):
9 | if bias is not None:
10 | rest_dim = [1] * (input.ndim - bias.ndim - 1)
11 | return (
12 | F.leaky_relu(
13 | input + bias.view(1, bias.shape[0], *rest_dim),
14 | negative_slope=negative_slope,
15 | )
16 | * scale
17 | )
18 | else:
19 |         return F.leaky_relu(input, negative_slope=negative_slope) * scale
20 |
21 |
22 | class EqualLinear(nn.Module):
23 | def __init__(self, in_dim, out_dim, bias=True, bias_init=0, lr_mul=1):
24 | super().__init__()
25 | self.weight = nn.Parameter(torch.randn(out_dim, in_dim).div_(lr_mul))
26 | if bias:
27 | self.bias = nn.Parameter(torch.zeros(out_dim).fill_(bias_init))
28 | else:
29 | self.bias = None
30 | self.scale = (1 / math.sqrt(in_dim)) * lr_mul
31 | self.lr_mul = lr_mul
32 |
33 | def forward(self, input):
34 | out = F.linear(input, self.weight * self.scale)
35 | out = fused_leaky_relu(out, self.bias * self.lr_mul)
36 | return out
37 |
38 |
39 | class RandomLatentConverter(nn.Module):
40 | def __init__(self, channels):
41 | super().__init__()
42 | self.layers = nn.Sequential(
43 | *[EqualLinear(channels, channels, lr_mul=0.1) for _ in range(5)], nn.Linear(channels, channels)
44 | )
45 | self.channels = channels
46 |
47 | def forward(self, ref):
48 | r = torch.randn(ref.shape[0], self.channels, device=ref.device)
49 | y = self.layers(r)
50 | return y
51 |
52 |
53 | if __name__ == "__main__":
54 | model = RandomLatentConverter(512)
55 | model(torch.randn(5, 512))
56 |
--------------------------------------------------------------------------------
/TTS/tts/layers/tortoise/tokenizer.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import torch
4 | from tokenizers import Tokenizer
5 |
6 | from TTS.tts.utils.text.cleaners import english_cleaners
7 |
8 | DEFAULT_VOCAB_FILE = os.path.join(
9 | os.path.dirname(os.path.realpath(__file__)), "../../utils/assets/tortoise/tokenizer.json"
10 | )
11 |
12 |
13 | class VoiceBpeTokenizer:
14 | def __init__(self, vocab_file=DEFAULT_VOCAB_FILE, vocab_str=None):
15 | self.tokenizer = None
16 | if vocab_file is not None:
17 | self.tokenizer = Tokenizer.from_file(vocab_file)
18 | if vocab_str is not None:
19 | self.tokenizer = Tokenizer.from_str(vocab_str)
20 |
21 | def preprocess_text(self, txt):
22 | txt = english_cleaners(txt)
23 | return txt
24 |
25 | def encode(self, txt):
26 | txt = self.preprocess_text(txt)
27 | txt = txt.replace(" ", "[SPACE]")
28 | return self.tokenizer.encode(txt).ids
29 |
30 | def decode(self, seq):
31 | if isinstance(seq, torch.Tensor):
32 | seq = seq.cpu().numpy()
33 | txt = self.tokenizer.decode(seq, skip_special_tokens=False).replace(" ", "")
34 | txt = txt.replace("[SPACE]", " ")
35 | txt = txt.replace("[STOP]", "")
36 | txt = txt.replace("[UNK]", "")
37 | return txt
38 |
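
A usage sketch (not part of the source file); it assumes the bundled `tokenizer.json` vocabulary is present at the default path:

from TTS.tts.layers.tortoise.tokenizer import VoiceBpeTokenizer

tokenizer = VoiceBpeTokenizer()          # loads DEFAULT_VOCAB_FILE
ids = tokenizer.encode("Hello, world!")  # text is cleaned and spaces become [SPACE]
text = tokenizer.decode(ids)             # [SPACE]/[STOP]/[UNK] handling restores plain text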
--------------------------------------------------------------------------------
/TTS/tts/layers/tortoise/utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | from urllib import request
3 |
4 | from tqdm import tqdm
5 |
6 | DEFAULT_MODELS_DIR = os.path.join(os.path.expanduser("~"), ".cache", "tortoise", "models")
7 | MODELS_DIR = os.environ.get("TORTOISE_MODELS_DIR", DEFAULT_MODELS_DIR)
8 | # MODELS_DIR = "/data/speech_synth/models/"  # machine-specific path; use TORTOISE_MODELS_DIR or the default above
9 | MODELS = {
10 | "autoregressive.pth": "https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/autoregressive.pth",
11 | "classifier.pth": "https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/classifier.pth",
12 | "clvp2.pth": "https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/clvp2.pth",
13 | "diffusion_decoder.pth": "https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/diffusion_decoder.pth",
14 | "vocoder.pth": "https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/vocoder.pth",
15 | "rlg_auto.pth": "https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/rlg_auto.pth",
16 | "rlg_diffuser.pth": "https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/rlg_diffuser.pth",
17 | }
18 |
19 |
20 | def download_models(specific_models=None):
21 | """
22 | Call to download all the models that Tortoise uses.
23 | """
24 | os.makedirs(MODELS_DIR, exist_ok=True)
25 | for model_name, url in MODELS.items():
26 | if specific_models is not None and model_name not in specific_models:
27 | continue
28 | model_path = os.path.join(MODELS_DIR, model_name)
29 | if os.path.exists(model_path):
30 | continue
31 | print(f"Downloading {model_name} from {url}...")
32 | with tqdm(unit="B", unit_scale=True, unit_divisor=1024, miniters=1) as t:
33 | request.urlretrieve(url, model_path, lambda nb, bs, fs, t=t: t.update(nb * bs - t.n))
34 | print("Done.")
35 |
36 |
37 | def get_model_path(model_name, models_dir=MODELS_DIR):
38 | """
39 | Get path to given model, download it if it doesn't exist.
40 | """
41 | if model_name not in MODELS:
42 | raise ValueError(f"Model {model_name} not found in available models.")
43 | model_path = os.path.join(models_dir, model_name)
44 | if not os.path.exists(model_path) and models_dir == MODELS_DIR:
45 | download_models([model_name])
46 | return model_path
47 |
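
A usage sketch (not part of the source file); the first call downloads the checkpoint into `MODELS_DIR` (or the directory given by the `TORTOISE_MODELS_DIR` environment variable) and later calls return the cached path:

from TTS.tts.layers.tortoise.utils import get_model_path

ar_path = get_model_path("autoregressive.pth")   # downloads on first use, then cached
vocoder_path = get_model_path("vocoder.pth")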
--------------------------------------------------------------------------------
/TTS/tts/layers/xtts/xtts_manager.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | class SpeakerManager():
4 | def __init__(self, speaker_file_path=None):
5 | self.speakers = torch.load(speaker_file_path)
6 |
7 | @property
8 | def name_to_id(self):
9 |         return self.speakers
10 |
11 | @property
12 | def num_speakers(self):
13 | return len(self.name_to_id)
14 |
15 | @property
16 | def speaker_names(self):
17 | return list(self.name_to_id.keys())
18 |
19 |
20 | class LanguageManager():
21 | def __init__(self, config):
22 | self.langs = config["languages"]
23 |
24 | @property
25 | def name_to_id(self):
26 | return self.langs
27 |
28 | @property
29 | def num_languages(self):
30 | return len(self.name_to_id)
31 |
32 | @property
33 | def language_names(self):
34 | return list(self.name_to_id)
35 |
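
A usage sketch (not part of the source file); `speakers_xtts.pth` is a hypothetical path to the XTTS speaker file (a torch-saved dict keyed by speaker name), and the language list is an arbitrary example:

from TTS.tts.layers.xtts.xtts_manager import LanguageManager, SpeakerManager

speaker_manager = SpeakerManager("speakers_xtts.pth")   # hypothetical path
print(speaker_manager.num_speakers, speaker_manager.speaker_names[:3])

language_manager = LanguageManager({"languages": ["en", "vi"]})
print(language_manager.num_languages, language_manager.language_names)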
--------------------------------------------------------------------------------
/TTS/tts/models/__init__.py:
--------------------------------------------------------------------------------
1 | from typing import Dict, List, Union
2 |
3 | from TTS.utils.generic_utils import find_module
4 |
5 |
6 | def setup_model(config: "Coqpit", samples: Union[List[List], List[Dict]] = None) -> "BaseTTS":
7 | print(" > Using model: {}".format(config.model))
8 | # fetch the right model implementation.
9 | if "base_model" in config and config["base_model"] is not None:
10 | MyModel = find_module("TTS.tts.models", config.base_model.lower())
11 | else:
12 | MyModel = find_module("TTS.tts.models", config.model.lower())
13 | model = MyModel.init_from_config(config=config, samples=samples)
14 | return model
15 |
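
A minimal sketch (not part of the source file) of how a model can be instantiated from its config; it assumes the audio and tokenizer defaults of `VitsConfig` and disables phonemes so no external phonemizer backend is needed:

from TTS.tts.configs.vits_config import VitsConfig
from TTS.tts.models import setup_model

config = VitsConfig()
config.use_phonemes = False   # avoid requiring an espeak/gruut install for this sketch
model = setup_model(config)   # resolves TTS.tts.models.vits.Vits and calls init_from_config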
--------------------------------------------------------------------------------
/TTS/tts/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/tts/utils/__init__.py
--------------------------------------------------------------------------------
/TTS/tts/utils/data.py:
--------------------------------------------------------------------------------
1 | import bisect
2 |
3 | import numpy as np
4 | import torch
5 |
6 |
7 | def _pad_data(x, length):
8 | _pad = 0
9 | assert x.ndim == 1
10 | return np.pad(x, (0, length - x.shape[0]), mode="constant", constant_values=_pad)
11 |
12 |
13 | def prepare_data(inputs):
14 | max_len = max((len(x) for x in inputs))
15 | return np.stack([_pad_data(x, max_len) for x in inputs])
16 |
17 |
18 | def _pad_tensor(x, length):
19 | _pad = 0.0
20 | assert x.ndim == 2
21 | x = np.pad(x, [[0, 0], [0, length - x.shape[1]]], mode="constant", constant_values=_pad)
22 | return x
23 |
24 |
25 | def prepare_tensor(inputs, out_steps):
26 | max_len = max((x.shape[1] for x in inputs))
27 | remainder = max_len % out_steps
28 | pad_len = max_len + (out_steps - remainder) if remainder > 0 else max_len
29 | return np.stack([_pad_tensor(x, pad_len) for x in inputs])
30 |
31 |
32 | def _pad_stop_target(x: np.ndarray, length: int, pad_val=1) -> np.ndarray:
33 | """Pad stop target array.
34 |
35 | Args:
36 | x (np.ndarray): Stop target array.
37 | length (int): Length after padding.
38 | pad_val (int, optional): Padding value. Defaults to 1.
39 |
40 | Returns:
41 | np.ndarray: Padded stop target array.
42 | """
43 | assert x.ndim == 1
44 | return np.pad(x, (0, length - x.shape[0]), mode="constant", constant_values=pad_val)
45 |
46 |
47 | def prepare_stop_target(inputs, out_steps):
48 | """Pad row vectors with 1."""
49 | max_len = max((x.shape[0] for x in inputs))
50 | remainder = max_len % out_steps
51 | pad_len = max_len + (out_steps - remainder) if remainder > 0 else max_len
52 | return np.stack([_pad_stop_target(x, pad_len) for x in inputs])
53 |
54 |
55 | def pad_per_step(inputs, pad_len):
56 | return np.pad(inputs, [[0, 0], [0, 0], [0, pad_len]], mode="constant", constant_values=0.0)
57 |
58 |
59 | def get_length_balancer_weights(items: list, num_buckets=10):
60 | # get all durations
61 | audio_lengths = np.array([item["audio_length"] for item in items])
62 |     # create $num_buckets bucket classes based on the dataset max and min lengths
63 | max_length = int(max(audio_lengths))
64 | min_length = int(min(audio_lengths))
65 | step = int((max_length - min_length) / num_buckets) + 1
66 | buckets_classes = [i + step for i in range(min_length, (max_length - step) + num_buckets + 1, step)]
67 | # add each sample in their respective length bucket
68 | buckets_names = np.array(
69 | [buckets_classes[bisect.bisect_left(buckets_classes, item["audio_length"])] for item in items]
70 | )
71 | # count and compute the weights_bucket for each sample
72 | unique_buckets_names = np.unique(buckets_names).tolist()
73 | bucket_ids = [unique_buckets_names.index(l) for l in buckets_names]
74 | bucket_count = np.array([len(np.where(buckets_names == l)[0]) for l in unique_buckets_names])
75 | weight_bucket = 1.0 / bucket_count
76 | dataset_samples_weight = np.array([weight_bucket[l] for l in bucket_ids])
77 | # normalize
78 | dataset_samples_weight = dataset_samples_weight / np.linalg.norm(dataset_samples_weight)
79 | return torch.from_numpy(dataset_samples_weight).float()
80 |
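
A usage sketch (not part of the source file) for the batch padding helpers; the arrays are arbitrary examples:

import numpy as np

from TTS.tts.utils.data import prepare_data, prepare_tensor

token_ids = [np.array([1, 2, 3]), np.array([4, 5])]
id_batch = prepare_data(token_ids)               # zero-padded to shape [2, 3]

specs = [np.random.rand(80, 95), np.random.rand(80, 120)]
spec_batch = prepare_tensor(specs, out_steps=2)  # padded to a multiple of out_steps -> [2, 80, 120]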
--------------------------------------------------------------------------------
/TTS/tts/utils/fairseq.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 |
4 | def rehash_fairseq_vits_checkpoint(checkpoint_file):
5 | chk = torch.load(checkpoint_file, map_location=torch.device("cpu"))["model"]
6 | new_chk = {}
7 | for k, v in chk.items():
8 | if "enc_p." in k:
9 | new_chk[k.replace("enc_p.", "text_encoder.")] = v
10 | elif "dec." in k:
11 | new_chk[k.replace("dec.", "waveform_decoder.")] = v
12 | elif "enc_q." in k:
13 | new_chk[k.replace("enc_q.", "posterior_encoder.")] = v
14 | elif "flow.flows.2." in k:
15 | new_chk[k.replace("flow.flows.2.", "flow.flows.1.")] = v
16 | elif "flow.flows.4." in k:
17 | new_chk[k.replace("flow.flows.4.", "flow.flows.2.")] = v
18 | elif "flow.flows.6." in k:
19 | new_chk[k.replace("flow.flows.6.", "flow.flows.3.")] = v
20 | elif "dp.flows.0.m" in k:
21 | new_chk[k.replace("dp.flows.0.m", "duration_predictor.flows.0.translation")] = v
22 | elif "dp.flows.0.logs" in k:
23 | new_chk[k.replace("dp.flows.0.logs", "duration_predictor.flows.0.log_scale")] = v
24 | elif "dp.flows.1" in k:
25 | new_chk[k.replace("dp.flows.1", "duration_predictor.flows.1")] = v
26 | elif "dp.flows.3" in k:
27 | new_chk[k.replace("dp.flows.3", "duration_predictor.flows.2")] = v
28 | elif "dp.flows.5" in k:
29 | new_chk[k.replace("dp.flows.5", "duration_predictor.flows.3")] = v
30 | elif "dp.flows.7" in k:
31 | new_chk[k.replace("dp.flows.7", "duration_predictor.flows.4")] = v
32 | elif "dp.post_flows.0.m" in k:
33 | new_chk[k.replace("dp.post_flows.0.m", "duration_predictor.post_flows.0.translation")] = v
34 | elif "dp.post_flows.0.logs" in k:
35 | new_chk[k.replace("dp.post_flows.0.logs", "duration_predictor.post_flows.0.log_scale")] = v
36 | elif "dp.post_flows.1" in k:
37 | new_chk[k.replace("dp.post_flows.1", "duration_predictor.post_flows.1")] = v
38 | elif "dp.post_flows.3" in k:
39 | new_chk[k.replace("dp.post_flows.3", "duration_predictor.post_flows.2")] = v
40 | elif "dp.post_flows.5" in k:
41 | new_chk[k.replace("dp.post_flows.5", "duration_predictor.post_flows.3")] = v
42 | elif "dp.post_flows.7" in k:
43 | new_chk[k.replace("dp.post_flows.7", "duration_predictor.post_flows.4")] = v
44 | elif "dp." in k:
45 | new_chk[k.replace("dp.", "duration_predictor.")] = v
46 | else:
47 | new_chk[k] = v
48 | return new_chk
49 |
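
A usage sketch (not part of the source file); `G_100000.pth` is a hypothetical fairseq (MMS) VITS checkpoint path, and the rehashed state dict uses 🐸TTS VITS module names:

import torch

from TTS.tts.utils.fairseq import rehash_fairseq_vits_checkpoint

state_dict = rehash_fairseq_vits_checkpoint("G_100000.pth")  # hypothetical checkpoint path
torch.save({"model": state_dict}, "vits_rehashed.pth")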
--------------------------------------------------------------------------------
/TTS/tts/utils/measures.py:
--------------------------------------------------------------------------------
1 | def alignment_diagonal_score(alignments, binary=False):
2 | """
3 | Compute how diagonal alignment predictions are. It is useful
4 | to measure the alignment consistency of a model
5 | Args:
6 | alignments (torch.Tensor): batch of alignments.
7 | binary (bool): if True, ignore scores and consider attention
8 | as a binary mask.
9 | Shape:
10 | - alignments : :math:`[B, T_de, T_en]`
11 | """
12 | maxs = alignments.max(dim=1)[0]
13 | if binary:
14 | maxs[maxs > 0] = 1
15 | return maxs.mean(dim=1).mean(dim=0).item()
16 |
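
A usage sketch (not part of the source file); the alignment tensor is random and only illustrates the expected shape:

import torch

from TTS.tts.utils.measures import alignment_diagonal_score

alignments = torch.softmax(torch.randn(2, 100, 40), dim=2)      # [B, T_de, T_en]
soft_score = alignment_diagonal_score(alignments)               # mean of per-encoder-step max attention
hard_score = alignment_diagonal_score(alignments, binary=True)  # treat attention as a binary mask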
--------------------------------------------------------------------------------
/TTS/tts/utils/monotonic_align/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/tts/utils/monotonic_align/__init__.py
--------------------------------------------------------------------------------
/TTS/tts/utils/monotonic_align/core.pyx:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | cimport cython
4 | cimport numpy as np
5 |
6 | from cython.parallel import prange
7 |
8 |
9 | @cython.boundscheck(False)
10 | @cython.wraparound(False)
11 | cdef void maximum_path_each(int[:,::1] path, float[:,::1] value, int t_x, int t_y, float max_neg_val) nogil:
12 | cdef int x
13 | cdef int y
14 | cdef float v_prev
15 | cdef float v_cur
16 | cdef float tmp
17 | cdef int index = t_x - 1
18 |
19 | for y in range(t_y):
20 | for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)):
21 | if x == y:
22 | v_cur = max_neg_val
23 | else:
24 | v_cur = value[x, y-1]
25 | if x == 0:
26 | if y == 0:
27 | v_prev = 0.
28 | else:
29 | v_prev = max_neg_val
30 | else:
31 | v_prev = value[x-1, y-1]
32 | value[x, y] = max(v_cur, v_prev) + value[x, y]
33 |
34 | for y in range(t_y - 1, -1, -1):
35 | path[index, y] = 1
36 | if index != 0 and (index == y or value[index, y-1] < value[index-1, y-1]):
37 | index = index - 1
38 |
39 |
40 | @cython.boundscheck(False)
41 | @cython.wraparound(False)
42 | cpdef void maximum_path_c(int[:,:,::1] paths, float[:,:,::1] values, int[::1] t_xs, int[::1] t_ys, float max_neg_val=-1e9) nogil:
43 | cdef int b = values.shape[0]
44 |
45 | cdef int i
46 | for i in prange(b, nogil=True):
47 | maximum_path_each(paths[i], values[i], t_xs[i], t_ys[i], max_neg_val)
48 |
--------------------------------------------------------------------------------
/TTS/tts/utils/monotonic_align/setup.py:
--------------------------------------------------------------------------------
1 | # from distutils.core import setup
2 | # from Cython.Build import cythonize
3 | # import numpy
4 |
5 | # setup(name='monotonic_align',
6 | # ext_modules=cythonize("core.pyx"),
7 | # include_dirs=[numpy.get_include()])
8 |
--------------------------------------------------------------------------------
/TTS/tts/utils/text/__init__.py:
--------------------------------------------------------------------------------
1 | from TTS.tts.utils.text.tokenizer import TTSTokenizer
2 |
--------------------------------------------------------------------------------
/TTS/tts/utils/text/bangla/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/tts/utils/text/bangla/__init__.py
--------------------------------------------------------------------------------
/TTS/tts/utils/text/belarusian/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/tts/utils/text/belarusian/__init__.py
--------------------------------------------------------------------------------
/TTS/tts/utils/text/belarusian/phonemizer.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | finder = None
4 |
5 |
6 | def init():
7 | try:
8 | import jpype
9 | import jpype.imports
10 | except ModuleNotFoundError:
11 | raise ModuleNotFoundError(
12 |             "Belarusian phonemizer requires the 'jpype1' module to be installed manually. Try `pip install jpype1`."
13 | )
14 |
15 | try:
16 | jar_path = os.environ["BEL_FANETYKA_JAR"]
17 | except KeyError:
18 | raise KeyError("You need to define 'BEL_FANETYKA_JAR' environment variable as path to the fanetyka.jar file")
19 |
20 | jpype.startJVM(classpath=[jar_path])
21 |
22 | # import the Java modules
23 | from org.alex73.korpus.base import GrammarDB2, GrammarFinder
24 |
25 | grammar_db = GrammarDB2.initializeFromJar()
26 | global finder
27 | finder = GrammarFinder(grammar_db)
28 |
29 |
30 | def belarusian_text_to_phonemes(text: str) -> str:
31 | # Initialize only on first run
32 | if finder is None:
33 | init()
34 |
35 | from org.alex73.fanetyka.impl import FanetykaText
36 |
37 | return str(FanetykaText(finder, text).ipa)
38 |
--------------------------------------------------------------------------------
/TTS/tts/utils/text/chinese_mandarin/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/tts/utils/text/chinese_mandarin/__init__.py
--------------------------------------------------------------------------------
/TTS/tts/utils/text/chinese_mandarin/phonemizer.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 |
3 | import jieba
4 | import pypinyin
5 |
6 | from .pinyinToPhonemes import PINYIN_DICT
7 |
8 |
9 | def _chinese_character_to_pinyin(text: str) -> List[str]:
10 | pinyins = pypinyin.pinyin(text, style=pypinyin.Style.TONE3, heteronym=False, neutral_tone_with_five=True)
11 | pinyins_flat_list = [item for sublist in pinyins for item in sublist]
12 | return pinyins_flat_list
13 |
14 |
15 | def _chinese_pinyin_to_phoneme(pinyin: str) -> str:
16 | segment = pinyin[:-1]
17 | tone = pinyin[-1]
18 | phoneme = PINYIN_DICT.get(segment, [""])[0]
19 | return phoneme + tone
20 |
21 |
22 | def chinese_text_to_phonemes(text: str, seperator: str = "|") -> str:
23 | tokenized_text = jieba.cut(text, HMM=False)
24 | tokenized_text = " ".join(tokenized_text)
25 | pinyined_text: List[str] = _chinese_character_to_pinyin(tokenized_text)
26 |
27 | results: List[str] = []
28 |
29 | for token in pinyined_text:
30 | if token[-1] in "12345": # TODO transform to is_pinyin()
31 | pinyin_phonemes = _chinese_pinyin_to_phoneme(token)
32 |
33 | results += list(pinyin_phonemes)
34 |         else:  # is punctuation or other
35 | results += list(token)
36 |
37 | return seperator.join(results)
38 |
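
A usage sketch (not part of the source file); it requires `jieba` and `pypinyin` to be installed:

from TTS.tts.utils.text.chinese_mandarin.phonemizer import chinese_text_to_phonemes

# segment with jieba, convert to pinyin with tone numbers, then map pinyin to phonemes
print(chinese_text_to_phonemes("这是一个例子。"))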
--------------------------------------------------------------------------------
/TTS/tts/utils/text/english/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/tts/utils/text/english/__init__.py
--------------------------------------------------------------------------------
/TTS/tts/utils/text/english/abbreviations.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | # List of (regular expression, replacement) pairs for abbreviations in english:
4 | abbreviations_en = [
5 | (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
6 | for x in [
7 | ("mrs", "misess"),
8 | ("mr", "mister"),
9 | ("dr", "doctor"),
10 | ("st", "saint"),
11 | ("co", "company"),
12 | ("jr", "junior"),
13 | ("maj", "major"),
14 | ("gen", "general"),
15 | ("drs", "doctors"),
16 | ("rev", "reverend"),
17 | ("lt", "lieutenant"),
18 | ("hon", "honorable"),
19 | ("sgt", "sergeant"),
20 | ("capt", "captain"),
21 | ("esq", "esquire"),
22 | ("ltd", "limited"),
23 | ("col", "colonel"),
24 | ("ft", "fort"),
25 | ]
26 | ]
27 |
--------------------------------------------------------------------------------
/TTS/tts/utils/text/english/number_norm.py:
--------------------------------------------------------------------------------
1 | """ from https://github.com/keithito/tacotron """
2 |
3 | import re
4 | from typing import Dict
5 |
6 | import inflect
7 |
8 | _inflect = inflect.engine()
9 | _comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")
10 | _decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)")
11 | _currency_re = re.compile(r"(£|\$|¥)([0-9\,\.]*[0-9]+)")
12 | _ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")
13 | _number_re = re.compile(r"-?[0-9]+")
14 |
15 |
16 | def _remove_commas(m):
17 | return m.group(1).replace(",", "")
18 |
19 |
20 | def _expand_decimal_point(m):
21 | return m.group(1).replace(".", " point ")
22 |
23 |
24 | def __expand_currency(value: str, inflection: Dict[float, str]) -> str:
25 | parts = value.replace(",", "").split(".")
26 | if len(parts) > 2:
27 | return f"{value} {inflection[2]}" # Unexpected format
28 | text = []
29 | integer = int(parts[0]) if parts[0] else 0
30 | if integer > 0:
31 | integer_unit = inflection.get(integer, inflection[2])
32 | text.append(f"{integer} {integer_unit}")
33 | fraction = int(parts[1]) if len(parts) > 1 and parts[1] else 0
34 | if fraction > 0:
35 | fraction_unit = inflection.get(fraction / 100, inflection[0.02])
36 | text.append(f"{fraction} {fraction_unit}")
37 | if len(text) == 0:
38 | return f"zero {inflection[2]}"
39 | return " ".join(text)
40 |
41 |
42 | def _expand_currency(m: "re.Match") -> str:
43 | currencies = {
44 | "$": {
45 | 0.01: "cent",
46 | 0.02: "cents",
47 | 1: "dollar",
48 | 2: "dollars",
49 | },
50 | "€": {
51 | 0.01: "cent",
52 | 0.02: "cents",
53 | 1: "euro",
54 | 2: "euros",
55 | },
56 | "£": {
57 | 0.01: "penny",
58 | 0.02: "pence",
59 | 1: "pound sterling",
60 | 2: "pounds sterling",
61 | },
62 | "¥": {
63 | # TODO rin
64 | 0.02: "sen",
65 | 2: "yen",
66 | },
67 | }
68 | unit = m.group(1)
69 | currency = currencies[unit]
70 | value = m.group(2)
71 | return __expand_currency(value, currency)
72 |
73 |
74 | def _expand_ordinal(m):
75 | return _inflect.number_to_words(m.group(0))
76 |
77 |
78 | def _expand_number(m):
79 | num = int(m.group(0))
80 | if 1000 < num < 3000:
81 | if num == 2000:
82 | return "two thousand"
83 | if 2000 < num < 2010:
84 | return "two thousand " + _inflect.number_to_words(num % 100)
85 | if num % 100 == 0:
86 | return _inflect.number_to_words(num // 100) + " hundred"
87 | return _inflect.number_to_words(num, andword="", zero="oh", group=2).replace(", ", " ")
88 | return _inflect.number_to_words(num, andword="")
89 |
90 |
91 | def normalize_numbers(text):
92 | text = re.sub(_comma_number_re, _remove_commas, text)
93 | text = re.sub(_currency_re, _expand_currency, text)
94 | text = re.sub(_decimal_number_re, _expand_decimal_point, text)
95 | text = re.sub(_ordinal_re, _expand_ordinal, text)
96 | text = re.sub(_number_re, _expand_number, text)
97 | return text
98 |
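
A usage sketch (not part of the source file) showing the kind of expansion performed; the exact wording comes from `inflect`:

from TTS.tts.utils.text.english.number_norm import normalize_numbers

text = normalize_numbers("The $3.50 ticket was bought on the 3rd day of 1999.")
# currencies, ordinals, and years are spelled out, e.g. "three dollars fifty cents",
# "third", "nineteen ninety-nine"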
--------------------------------------------------------------------------------
/TTS/tts/utils/text/english/time_norm.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | import inflect
4 |
5 | _inflect = inflect.engine()
6 |
7 | _time_re = re.compile(
8 | r"""\b
9 | ((0?[0-9])|(1[0-1])|(1[2-9])|(2[0-3])) # hours
10 | :
11 | ([0-5][0-9]) # minutes
12 | \s*(a\.m\.|am|pm|p\.m\.|a\.m|p\.m)? # am/pm
13 | \b""",
14 | re.IGNORECASE | re.X,
15 | )
16 |
17 |
18 | def _expand_num(n: int) -> str:
19 | return _inflect.number_to_words(n)
20 |
21 |
22 | def _expand_time_english(match: "re.Match") -> str:
23 | hour = int(match.group(1))
24 | past_noon = hour >= 12
25 | time = []
26 | if hour > 12:
27 | hour -= 12
28 | elif hour == 0:
29 | hour = 12
30 | past_noon = True
31 | time.append(_expand_num(hour))
32 |
33 | minute = int(match.group(6))
34 | if minute > 0:
35 | if minute < 10:
36 | time.append("oh")
37 | time.append(_expand_num(minute))
38 | am_pm = match.group(7)
39 | if am_pm is None:
40 | time.append("p m" if past_noon else "a m")
41 | else:
42 | time.extend(list(am_pm.replace(".", "")))
43 | return " ".join(time)
44 |
45 |
46 | def expand_time_english(text: str) -> str:
47 | return re.sub(_time_re, _expand_time_english, text)
48 |
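
A usage sketch (not part of the source file); hours and minutes are spelled out, and an "a m"/"p m" suffix is inferred when none is written:

from TTS.tts.utils.text.english.time_norm import expand_time_english

print(expand_time_english("The meeting is at 10:30 am."))  # "... ten thirty a m."
print(expand_time_english("Dinner is at 19:05."))          # "... seven oh five p m."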
--------------------------------------------------------------------------------
/TTS/tts/utils/text/french/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/tts/utils/text/french/__init__.py
--------------------------------------------------------------------------------
/TTS/tts/utils/text/french/abbreviations.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | # List of (regular expression, replacement) pairs for abbreviations in french:
4 | abbreviations_fr = [
5 | (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
6 | for x in [
7 | ("M", "monsieur"),
8 | ("Mlle", "mademoiselle"),
9 | ("Mlles", "mesdemoiselles"),
10 | ("Mme", "Madame"),
11 | ("Mmes", "Mesdames"),
12 | ("N.B", "nota bene"),
13 | ("M", "monsieur"),
14 | ("p.c.q", "parce que"),
15 | ("Pr", "professeur"),
16 | ("qqch", "quelque chose"),
17 | ("rdv", "rendez-vous"),
18 | ("max", "maximum"),
19 | ("min", "minimum"),
20 | ("no", "numéro"),
21 | ("adr", "adresse"),
22 | ("dr", "docteur"),
23 | ("st", "saint"),
24 | ("co", "companie"),
25 | ("jr", "junior"),
26 | ("sgt", "sergent"),
27 | ("capt", "capitain"),
28 | ("col", "colonel"),
29 | ("av", "avenue"),
30 | ("av. J.-C", "avant Jésus-Christ"),
31 | ("apr. J.-C", "après Jésus-Christ"),
32 | ("art", "article"),
33 | ("boul", "boulevard"),
34 | ("c.-à-d", "c’est-à-dire"),
35 | ("etc", "et cetera"),
36 | ("ex", "exemple"),
37 | ("excl", "exclusivement"),
38 | ("boul", "boulevard"),
39 | ]
40 | ] + [
41 | (re.compile("\\b%s" % x[0]), x[1])
42 | for x in [
43 | ("Mlle", "mademoiselle"),
44 | ("Mlles", "mesdemoiselles"),
45 | ("Mme", "Madame"),
46 | ("Mmes", "Mesdames"),
47 | ]
48 | ]
49 |
--------------------------------------------------------------------------------
/TTS/tts/utils/text/japanese/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/tts/utils/text/japanese/__init__.py
--------------------------------------------------------------------------------
/TTS/tts/utils/text/korean/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/tts/utils/text/korean/__init__.py
--------------------------------------------------------------------------------
/TTS/tts/utils/text/korean/ko_dictionary.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # Add the word you want to the dictionary.
3 | etc_dictionary = {"1+1": "원플러스원", "2+1": "투플러스원"}
4 |
5 |
6 | english_dictionary = {
7 | "KOREA": "코리아",
8 | "IDOL": "아이돌",
9 | "IT": "아이티",
10 | "IQ": "아이큐",
11 | "UP": "업",
12 | "DOWN": "다운",
13 | "PC": "피씨",
14 | "CCTV": "씨씨티비",
15 | "SNS": "에스엔에스",
16 | "AI": "에이아이",
17 | "CEO": "씨이오",
18 | "A": "에이",
19 | "B": "비",
20 | "C": "씨",
21 | "D": "디",
22 | "E": "이",
23 | "F": "에프",
24 | "G": "지",
25 | "H": "에이치",
26 | "I": "아이",
27 | "J": "제이",
28 | "K": "케이",
29 | "L": "엘",
30 | "M": "엠",
31 | "N": "엔",
32 | "O": "오",
33 | "P": "피",
34 | "Q": "큐",
35 | "R": "알",
36 | "S": "에스",
37 | "T": "티",
38 | "U": "유",
39 | "V": "브이",
40 | "W": "더블유",
41 | "X": "엑스",
42 | "Y": "와이",
43 | "Z": "제트",
44 | }
45 |
--------------------------------------------------------------------------------
/TTS/tts/utils/text/korean/korean.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # Code based on https://github.com/carpedm20/multi-speaker-tacotron-tensorflow/blob/master/text/korean.py
3 | import re
4 |
5 | from TTS.tts.utils.text.korean.ko_dictionary import english_dictionary, etc_dictionary
6 |
7 |
8 | def normalize(text):
9 | text = text.strip()
10 | text = re.sub("[⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]", "", text)
11 | text = normalize_with_dictionary(text, etc_dictionary)
12 | text = normalize_english(text)
13 | text = text.lower()
14 | return text
15 |
16 |
17 | def normalize_with_dictionary(text, dic):
18 | if any(key in text for key in dic.keys()):
19 | pattern = re.compile("|".join(re.escape(key) for key in dic.keys()))
20 | return pattern.sub(lambda x: dic[x.group()], text)
21 | return text
22 |
23 |
24 | def normalize_english(text):
25 | def fn(m):
26 | word = m.group()
27 | if word in english_dictionary:
28 | return english_dictionary.get(word)
29 | return word
30 |
31 | text = re.sub("([A-Za-z]+)", fn, text)
32 | return text
33 |
--------------------------------------------------------------------------------
/TTS/tts/utils/text/korean/phonemizer.py:
--------------------------------------------------------------------------------
1 | from jamo import hangul_to_jamo
2 |
3 | from TTS.tts.utils.text.korean.korean import normalize
4 |
5 | g2p = None
6 |
7 |
8 | def korean_text_to_phonemes(text, character: str = "hangeul") -> str:
9 | """
10 |
11 | The input and output values look the same, but they are different in Unicode.
12 |
13 | example :
14 |
15 | input = '하늘' (Unicode : \ud558\ub298), (하 + 늘)
16 | output = '하늘' (Unicode :\u1112\u1161\u1102\u1173\u11af), (ᄒ + ᅡ + ᄂ + ᅳ + ᆯ)
17 |
18 | """
19 | global g2p # pylint: disable=global-statement
20 | if g2p is None:
21 | from g2pkk import G2p
22 |
23 | g2p = G2p()
24 |
25 | if character == "english":
26 | from anyascii import anyascii
27 |
28 | text = normalize(text)
29 | text = g2p(text)
30 | text = anyascii(text)
31 | return text
32 |
33 | text = normalize(text)
34 | text = g2p(text)
35 | text = list(hangul_to_jamo(text)) # '하늘' --> ['ᄒ', 'ᅡ', 'ᄂ', 'ᅳ', 'ᆯ']
36 | return "".join(text)
37 |
--------------------------------------------------------------------------------
/TTS/tts/utils/text/phonemizers/__init__.py:
--------------------------------------------------------------------------------
1 | from TTS.tts.utils.text.phonemizers.bangla_phonemizer import BN_Phonemizer
2 | from TTS.tts.utils.text.phonemizers.base import BasePhonemizer
3 | from TTS.tts.utils.text.phonemizers.belarusian_phonemizer import BEL_Phonemizer
4 | from TTS.tts.utils.text.phonemizers.espeak_wrapper import ESpeak
5 | from TTS.tts.utils.text.phonemizers.gruut_wrapper import Gruut
6 | from TTS.tts.utils.text.phonemizers.ko_kr_phonemizer import KO_KR_Phonemizer
7 | from TTS.tts.utils.text.phonemizers.zh_cn_phonemizer import ZH_CN_Phonemizer
8 |
9 | try:
10 | from TTS.tts.utils.text.phonemizers.ja_jp_phonemizer import JA_JP_Phonemizer
11 | except ImportError:
12 | JA_JP_Phonemizer = None
13 | pass
14 |
15 | PHONEMIZERS = {b.name(): b for b in (ESpeak, Gruut, KO_KR_Phonemizer, BN_Phonemizer)}
16 |
17 |
18 | ESPEAK_LANGS = list(ESpeak.supported_languages().keys())
19 | GRUUT_LANGS = list(Gruut.supported_languages())
20 |
21 |
22 | # Dict setting default phonemizers for each language
23 | # Add Gruut languages
24 | _ = [Gruut.name()] * len(GRUUT_LANGS)
25 | DEF_LANG_TO_PHONEMIZER = dict(list(zip(GRUUT_LANGS, _)))
26 |
27 |
28 | # Add ESpeak languages and override any existing ones
29 | _ = [ESpeak.name()] * len(ESPEAK_LANGS)
30 | _new_dict = dict(list(zip(list(ESPEAK_LANGS), _)))
31 | DEF_LANG_TO_PHONEMIZER.update(_new_dict)
32 |
33 |
34 | # Force default for some languages
35 | DEF_LANG_TO_PHONEMIZER["en"] = DEF_LANG_TO_PHONEMIZER["en-us"]
36 | DEF_LANG_TO_PHONEMIZER["zh-cn"] = ZH_CN_Phonemizer.name()
37 | DEF_LANG_TO_PHONEMIZER["ko-kr"] = KO_KR_Phonemizer.name()
38 | DEF_LANG_TO_PHONEMIZER["bn"] = BN_Phonemizer.name()
39 | DEF_LANG_TO_PHONEMIZER["be"] = BEL_Phonemizer.name()
40 |
41 |
42 | # JA phonemizer has deal breaking dependencies like MeCab for some systems.
43 | # So we only have it when we have it.
44 | if JA_JP_Phonemizer is not None:
45 | PHONEMIZERS[JA_JP_Phonemizer.name()] = JA_JP_Phonemizer
46 | DEF_LANG_TO_PHONEMIZER["ja-jp"] = JA_JP_Phonemizer.name()
47 |
48 |
49 | def get_phonemizer_by_name(name: str, **kwargs) -> BasePhonemizer:
50 | """Initiate a phonemizer by name
51 |
52 | Args:
53 | name (str):
54 | Name of the phonemizer that should match `phonemizer.name()`.
55 |
56 | kwargs (dict):
57 | Extra keyword arguments that should be passed to the phonemizer.
58 | """
59 | if name == "espeak":
60 | return ESpeak(**kwargs)
61 | if name == "gruut":
62 | return Gruut(**kwargs)
63 | if name == "zh_cn_phonemizer":
64 | return ZH_CN_Phonemizer(**kwargs)
65 | if name == "ja_jp_phonemizer":
66 | if JA_JP_Phonemizer is None:
67 | raise ValueError(" ❗ You need to install JA phonemizer dependencies. Try `pip install TTS[ja]`.")
68 | return JA_JP_Phonemizer(**kwargs)
69 | if name == "ko_kr_phonemizer":
70 | return KO_KR_Phonemizer(**kwargs)
71 | if name == "bn_phonemizer":
72 | return BN_Phonemizer(**kwargs)
73 | if name == "be_phonemizer":
74 | return BEL_Phonemizer(**kwargs)
75 | raise ValueError(f"Phonemizer {name} not found")
76 |
77 |
78 | if __name__ == "__main__":
79 | print(DEF_LANG_TO_PHONEMIZER)
80 |
--------------------------------------------------------------------------------
/TTS/tts/utils/text/phonemizers/bangla_phonemizer.py:
--------------------------------------------------------------------------------
1 | from typing import Dict
2 |
3 | from TTS.tts.utils.text.bangla.phonemizer import bangla_text_to_phonemes
4 | from TTS.tts.utils.text.phonemizers.base import BasePhonemizer
5 |
6 | _DEF_ZH_PUNCS = "、.,[]()?!〽~『』「」【】"
7 |
8 |
9 | class BN_Phonemizer(BasePhonemizer):
10 | """🐸TTS bn phonemizer using functions in `TTS.tts.utils.text.bangla.phonemizer`
11 |
12 | Args:
13 | punctuations (str):
14 | Set of characters to be treated as punctuation. Defaults to `_DEF_ZH_PUNCS`.
15 |
16 | keep_puncs (bool):
17 | If True, keep the punctuations after phonemization. Defaults to False.
18 |
19 | Example ::
20 |
21 | "这是,样本中文。" -> `d|ʒ|ø|4| |ʂ|ʏ|4| |,| |i|ɑ|ŋ|4|b|œ|n|3| |d|ʒ|o|ŋ|1|w|œ|n|2| |。`
22 |
23 | TODO: someone with Bangla knowledge should check this implementation
24 | """
25 |
26 | language = "bn"
27 |
28 | def __init__(self, punctuations=_DEF_ZH_PUNCS, keep_puncs=False, **kwargs): # pylint: disable=unused-argument
29 | super().__init__(self.language, punctuations=punctuations, keep_puncs=keep_puncs)
30 |
31 | @staticmethod
32 | def name():
33 | return "bn_phonemizer"
34 |
35 | @staticmethod
36 | def phonemize_bn(text: str, separator: str = "|") -> str: # pylint: disable=unused-argument
37 | ph = bangla_text_to_phonemes(text)
38 | return ph
39 |
40 | def _phonemize(self, text, separator):
41 | return self.phonemize_bn(text, separator)
42 |
43 | @staticmethod
44 | def supported_languages() -> Dict:
45 | return {"bn": "Bangla"}
46 |
47 | def version(self) -> str:
48 | return "0.0.1"
49 |
50 | def is_available(self) -> bool:
51 | return True
52 |
53 |
54 | if __name__ == "__main__":
55 | txt = "রাসূলুল্লাহ সাল্লাল্লাহু আলাইহি ওয়া সাল্লাম শিক্ষা দিয়েছেন যে, কেউ যদি কোন খারাপ কিছুর সম্মুখীন হয়, তখনও যেন বলে."
56 | e = BN_Phonemizer()
57 | print(e.supported_languages())
58 | print(e.version())
59 | print(e.language)
60 | print(e.name())
61 | print(e.is_available())
62 | print("`" + e.phonemize(txt) + "`")
63 |
--------------------------------------------------------------------------------
/TTS/tts/utils/text/phonemizers/belarusian_phonemizer.py:
--------------------------------------------------------------------------------
1 | from typing import Dict
2 |
3 | from TTS.tts.utils.text.belarusian.phonemizer import belarusian_text_to_phonemes
4 | from TTS.tts.utils.text.phonemizers.base import BasePhonemizer
5 |
6 | _DEF_BE_PUNCS = ",!." # TODO
7 |
8 |
9 | class BEL_Phonemizer(BasePhonemizer):
10 | """🐸TTS be phonemizer using functions in `TTS.tts.utils.text.belarusian.phonemizer`
11 |
12 | Args:
13 | punctuations (str):
14 | Set of characters to be treated as punctuation. Defaults to `_DEF_BE_PUNCS`.
15 |
16 | keep_puncs (bool):
17 | If True, keep the punctuations after phonemization. Defaults to False.
18 | """
19 |
20 | language = "be"
21 |
22 | def __init__(self, punctuations=_DEF_BE_PUNCS, keep_puncs=True, **kwargs): # pylint: disable=unused-argument
23 | super().__init__(self.language, punctuations=punctuations, keep_puncs=keep_puncs)
24 |
25 | @staticmethod
26 | def name():
27 | return "be_phonemizer"
28 |
29 | @staticmethod
30 | def phonemize_be(text: str, separator: str = "|") -> str: # pylint: disable=unused-argument
31 | return belarusian_text_to_phonemes(text)
32 |
33 | def _phonemize(self, text, separator):
34 | return self.phonemize_be(text, separator)
35 |
36 | @staticmethod
37 | def supported_languages() -> Dict:
38 | return {"be": "Belarusian"}
39 |
40 | def version(self) -> str:
41 | return "0.0.1"
42 |
43 | def is_available(self) -> bool:
44 | return True
45 |
46 |
47 | if __name__ == "__main__":
48 | txt = "тэст"
49 | e = BEL_Phonemizer()
50 | print(e.supported_languages())
51 | print(e.version())
52 | print(e.language)
53 | print(e.name())
54 | print(e.is_available())
55 | print("`" + e.phonemize(txt) + "`")
56 |
--------------------------------------------------------------------------------
/TTS/tts/utils/text/phonemizers/ja_jp_phonemizer.py:
--------------------------------------------------------------------------------
1 | from typing import Dict
2 |
3 | from TTS.tts.utils.text.japanese.phonemizer import japanese_text_to_phonemes
4 | from TTS.tts.utils.text.phonemizers.base import BasePhonemizer
5 |
6 | _DEF_JA_PUNCS = "、.,[]()?!〽~『』「」【】"
7 |
8 | _TRANS_TABLE = {"、": ","}
9 |
10 |
11 | def trans(text):
12 | for i, j in _TRANS_TABLE.items():
13 | text = text.replace(i, j)
14 | return text
15 |
16 |
17 | class JA_JP_Phonemizer(BasePhonemizer):
18 | """🐸TTS Ja-Jp phonemizer using functions in `TTS.tts.utils.text.japanese.phonemizer`
19 |
20 | TODO: someone with JA knowledge should check this implementation
21 |
22 | Example:
23 |
24 | >>> from TTS.tts.utils.text.phonemizers import JA_JP_Phonemizer
25 | >>> phonemizer = JA_JP_Phonemizer()
26 | >>> phonemizer.phonemize("どちらに行きますか?", separator="|")
27 | 'd|o|c|h|i|r|a|n|i|i|k|i|m|a|s|u|k|a|?'
28 |
29 | """
30 |
31 | language = "ja-jp"
32 |
33 | def __init__(self, punctuations=_DEF_JA_PUNCS, keep_puncs=True, **kwargs): # pylint: disable=unused-argument
34 | super().__init__(self.language, punctuations=punctuations, keep_puncs=keep_puncs)
35 |
36 | @staticmethod
37 | def name():
38 | return "ja_jp_phonemizer"
39 |
40 | def _phonemize(self, text: str, separator: str = "|") -> str:
41 | ph = japanese_text_to_phonemes(text)
42 |         if separator is not None and separator != "":
43 | return separator.join(ph)
44 | return ph
45 |
46 | def phonemize(self, text: str, separator="|", language=None) -> str:
47 | """Custom phonemize for JP_JA
48 |
49 | Skip pre-post processing steps used by the other phonemizers.
50 | """
51 | return self._phonemize(text, separator)
52 |
53 | @staticmethod
54 | def supported_languages() -> Dict:
55 | return {"ja-jp": "Japanese (Japan)"}
56 |
57 | def version(self) -> str:
58 | return "0.0.1"
59 |
60 | def is_available(self) -> bool:
61 | return True
62 |
63 |
64 | # if __name__ == "__main__":
65 | # text = "これは、電話をかけるための私の日本語の例のテキストです。"
66 | # e = JA_JP_Phonemizer()
67 | # print(e.supported_languages())
68 | # print(e.version())
69 | # print(e.language)
70 | # print(e.name())
71 | # print(e.is_available())
72 | # print("`" + e.phonemize(text) + "`")
73 |
--------------------------------------------------------------------------------
/TTS/tts/utils/text/phonemizers/ko_kr_phonemizer.py:
--------------------------------------------------------------------------------
1 | from typing import Dict
2 |
3 | from TTS.tts.utils.text.korean.phonemizer import korean_text_to_phonemes
4 | from TTS.tts.utils.text.phonemizers.base import BasePhonemizer
5 |
6 | _DEF_KO_PUNCS = "、.,[]()?!〽~『』「」【】"
7 |
8 |
9 | class KO_KR_Phonemizer(BasePhonemizer):
10 | """🐸TTS ko_kr_phonemizer using functions in `TTS.tts.utils.text.korean.phonemizer`
11 |
12 | TODO: Add Korean to character (ᄀᄁᄂᄃᄄᄅᄆᄇᄈᄉᄊᄋᄌᄍᄎᄏᄐᄑ하ᅢᅣᅤᅥᅦᅧᅨᅩᅪᅫᅬᅭᅮᅯᅰᅱᅲᅳᅴᅵᆨᆩᆪᆫᆬᆭᆮᆯᆰᆱᆲᆳᆴᆵᆶᆷᆸᆹᆺᆻᆼᆽᆾᆿᇀᇁᇂ)
13 |
14 | Example:
15 |
16 | >>> from TTS.tts.utils.text.phonemizers import KO_KR_Phonemizer
17 | >>> phonemizer = KO_KR_Phonemizer()
18 | >>> phonemizer.phonemize("이 문장은 음성합성 테스트를 위한 문장입니다.", separator="|")
19 | 'ᄋ|ᅵ| |ᄆ|ᅮ|ᆫ|ᄌ|ᅡ|ᆼ|ᄋ|ᅳ| |ᄂ|ᅳ|ᆷ|ᄉ|ᅥ|ᆼ|ᄒ|ᅡ|ᆸ|ᄊ|ᅥ|ᆼ| |ᄐ|ᅦ|ᄉ|ᅳ|ᄐ|ᅳ|ᄅ|ᅳ| |ᄅ|ᅱ|ᄒ|ᅡ|ᆫ| |ᄆ|ᅮ|ᆫ|ᄌ|ᅡ|ᆼ|ᄋ|ᅵ|ᆷ|ᄂ|ᅵ|ᄃ|ᅡ|.'
20 |
21 | >>> from TTS.tts.utils.text.phonemizers import KO_KR_Phonemizer
22 | >>> phonemizer = KO_KR_Phonemizer()
23 | >>> phonemizer.phonemize("이 문장은 음성합성 테스트를 위한 문장입니다.", separator="|", character='english')
24 | 'I| |M|u|n|J|a|n|g|E|u| |N|e|u|m|S|e|o|n|g|H|a|b|S|s|e|o|n|g| |T|e|S|e|u|T|e|u|L|e|u| |L|w|i|H|a|n| |M|u|n|J|a|n|g|I|m|N|i|D|a|.'
25 |
26 | """
27 |
28 | language = "ko-kr"
29 |
30 | def __init__(self, punctuations=_DEF_KO_PUNCS, keep_puncs=True, **kwargs): # pylint: disable=unused-argument
31 | super().__init__(self.language, punctuations=punctuations, keep_puncs=keep_puncs)
32 |
33 | @staticmethod
34 | def name():
35 | return "ko_kr_phonemizer"
36 |
37 | def _phonemize(self, text: str, separator: str = "", character: str = "hangeul") -> str:
38 | ph = korean_text_to_phonemes(text, character=character)
39 | if separator is not None and separator != "":
40 | return separator.join(ph)
41 | return ph
42 |
43 | def phonemize(self, text: str, separator: str = "", character: str = "hangeul", language=None) -> str:
44 | return self._phonemize(text, separator, character)
45 |
46 | @staticmethod
47 | def supported_languages() -> Dict:
48 | return {"ko-kr": "hangeul(korean)"}
49 |
50 | def version(self) -> str:
51 | return "0.0.2"
52 |
53 | def is_available(self) -> bool:
54 | return True
55 |
56 |
57 | if __name__ == "__main__":
58 | texts = "이 문장은 음성합성 테스트를 위한 문장입니다."
59 | e = KO_KR_Phonemizer()
60 | print(e.supported_languages())
61 | print(e.version())
62 | print(e.language)
63 | print(e.name())
64 | print(e.is_available())
65 | print(e.phonemize(texts))
66 |
--------------------------------------------------------------------------------
/TTS/tts/utils/text/phonemizers/multi_phonemizer.py:
--------------------------------------------------------------------------------
1 | from typing import Dict, List
2 |
3 | from TTS.tts.utils.text.phonemizers import DEF_LANG_TO_PHONEMIZER, get_phonemizer_by_name
4 |
5 |
6 | class MultiPhonemizer:
7 | """🐸TTS multi-phonemizer that operates phonemizers for multiple langugages
8 |
9 | Args:
10 | lang_to_phonemizer_name (Dict):
11 | Custom phonemizer mapping if you want to change the defaults. In the format of
12 | `{"lang_code": "phonemizer_name"}`. If a language is mapped to an empty string, `DEF_LANG_TO_PHONEMIZER` is used for it. Defaults to `{}`.
13 |
14 | TODO: find a way to pass custom kwargs to the phonemizers
15 | """
16 |
17 | lang_to_phonemizer = {}
18 |
19 | def __init__(self, lang_to_phonemizer_name: Dict = {}) -> None: # pylint: disable=dangerous-default-value
20 | for k, v in lang_to_phonemizer_name.items():
21 | if v == "" and k in DEF_LANG_TO_PHONEMIZER.keys():
22 | lang_to_phonemizer_name[k] = DEF_LANG_TO_PHONEMIZER[k]
23 | elif v == "":
24 | raise ValueError(f"Phonemizer wasn't set for language {k} and doesn't have a default.")
25 | self.lang_to_phonemizer_name = lang_to_phonemizer_name
26 | self.lang_to_phonemizer = self.init_phonemizers(self.lang_to_phonemizer_name)
27 |
28 | @staticmethod
29 | def init_phonemizers(lang_to_phonemizer_name: Dict) -> Dict:
30 | lang_to_phonemizer = {}
31 | for k, v in lang_to_phonemizer_name.items():
32 | lang_to_phonemizer[k] = get_phonemizer_by_name(v, language=k)
33 | return lang_to_phonemizer
34 |
35 | @staticmethod
36 | def name():
37 | return "multi-phonemizer"
38 |
39 | def phonemize(self, text, separator="|", language=""):
40 | if language == "":
41 | raise ValueError("Language must be set for multi-phonemizer to phonemize.")
42 | return self.lang_to_phonemizer[language].phonemize(text, separator)
43 |
44 | def supported_languages(self) -> List:
45 | return list(self.lang_to_phonemizer.keys())
46 |
47 | def print_logs(self, level: int = 0):
48 | indent = "\t" * level
49 | print(f"{indent}| > phoneme language: {self.supported_languages()}")
50 | print(f"{indent}| > phoneme backend: {self.name()}")
51 |
52 |
53 | # if __name__ == "__main__":
54 | # texts = {
55 | # "tr": "Merhaba, bu Türkçe bit örnek!",
56 | # "en-us": "Hello, this is English example!",
57 | # "de": "Hallo, das ist ein Deutches Beipiel!",
58 | # "zh-cn": "这是中国的例子",
59 | # }
60 | # phonemes = {}
61 | # ph = MultiPhonemizer({"tr": "espeak", "en-us": "", "de": "gruut", "zh-cn": ""})
62 | # for lang, text in texts.items():
63 | # phoneme = ph.phonemize(text, language=lang)
64 | # phonemes[lang] = phoneme
65 | # print(phonemes)
66 |
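As a complement to the commented example above, here is a minimal usage sketch (not part of the original file); it assumes the phonemizer backends pulled in by the package (e.g. espeak-ng or gruut) are installed.

```python
# Minimal MultiPhonemizer sketch; an empty string falls back to DEF_LANG_TO_PHONEMIZER.
from TTS.tts.utils.text.phonemizers.multi_phonemizer import MultiPhonemizer

ph = MultiPhonemizer({"en-us": "", "de": "gruut"})
print(ph.supported_languages())                     # ['en-us', 'de']
print(ph.phonemize("Hello world!", separator="|", language="en-us"))
```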
--------------------------------------------------------------------------------
/TTS/tts/utils/text/phonemizers/zh_cn_phonemizer.py:
--------------------------------------------------------------------------------
1 | from typing import Dict
2 |
3 | from TTS.tts.utils.text.chinese_mandarin.phonemizer import chinese_text_to_phonemes
4 | from TTS.tts.utils.text.phonemizers.base import BasePhonemizer
5 |
6 | _DEF_ZH_PUNCS = "、.,[]()?!〽~『』「」【】"
7 |
8 |
9 | class ZH_CN_Phonemizer(BasePhonemizer):
10 | """🐸TTS Zh-Cn phonemizer using functions in `TTS.tts.utils.text.chinese_mandarin.phonemizer`
11 |
12 | Args:
13 | punctuations (str):
14 | Set of characters to be treated as punctuation. Defaults to `_DEF_ZH_PUNCS`.
15 |
16 | keep_puncs (bool):
17 | If True, keep the punctuations after phonemization. Defaults to False.
18 |
19 | Example ::
20 |
21 | "这是,样本中文。" -> `d|ʒ|ø|4| |ʂ|ʏ|4| |,| |i|ɑ|ŋ|4|b|œ|n|3| |d|ʒ|o|ŋ|1|w|œ|n|2| |。`
22 |
23 | TODO: someone with Mandarin knowledge should check this implementation
24 | """
25 |
26 | language = "zh-cn"
27 |
28 | def __init__(self, punctuations=_DEF_ZH_PUNCS, keep_puncs=False, **kwargs): # pylint: disable=unused-argument
29 | super().__init__(self.language, punctuations=punctuations, keep_puncs=keep_puncs)
30 |
31 | @staticmethod
32 | def name():
33 | return "zh_cn_phonemizer"
34 |
35 | @staticmethod
36 | def phonemize_zh_cn(text: str, separator: str = "|") -> str:
37 | ph = chinese_text_to_phonemes(text, separator)
38 | return ph
39 |
40 | def _phonemize(self, text, separator):
41 | return self.phonemize_zh_cn(text, separator)
42 |
43 | @staticmethod
44 | def supported_languages() -> Dict:
45 | return {"zh-cn": "Chinese (China)"}
46 |
47 | def version(self) -> str:
48 | return "0.0.1"
49 |
50 | def is_available(self) -> bool:
51 | return True
52 |
53 |
54 | # if __name__ == "__main__":
55 | # text = "这是,样本中文。"
56 | # e = ZH_CN_Phonemizer()
57 | # print(e.supported_languages())
58 | # print(e.version())
59 | # print(e.language)
60 | # print(e.name())
61 | # print(e.is_available())
62 | # print("`" + e.phonemize(text) + "`")
63 |
--------------------------------------------------------------------------------
/TTS/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/utils/__init__.py
--------------------------------------------------------------------------------
/TTS/utils/audio/__init__.py:
--------------------------------------------------------------------------------
1 | from TTS.utils.audio.processor import AudioProcessor
2 |
--------------------------------------------------------------------------------
/TTS/utils/capacitron_optimizer.py:
--------------------------------------------------------------------------------
1 | from typing import Generator
2 |
3 | from trainer.trainer_utils import get_optimizer
4 |
5 |
6 | class CapacitronOptimizer:
7 | """Double optimizer class for the Capacitron model."""
8 |
9 | def __init__(self, config: dict, model_params: Generator) -> None:
10 | self.primary_params, self.secondary_params = self.split_model_parameters(model_params)
11 |
12 | optimizer_names = list(config.optimizer_params.keys())
13 | optimizer_parameters = list(config.optimizer_params.values())
14 |
15 | self.primary_optimizer = get_optimizer(
16 | optimizer_names[0],
17 | optimizer_parameters[0],
18 | config.lr,
19 | parameters=self.primary_params,
20 | )
21 |
22 | self.secondary_optimizer = get_optimizer(
23 | optimizer_names[1],
24 | self.extract_optimizer_parameters(optimizer_parameters[1]),
25 | optimizer_parameters[1]["lr"],
26 | parameters=self.secondary_params,
27 | )
28 |
29 | self.param_groups = self.primary_optimizer.param_groups
30 |
31 | def first_step(self):
32 | self.secondary_optimizer.step()
33 | self.secondary_optimizer.zero_grad()
34 | self.primary_optimizer.zero_grad()
35 |
36 | def step(self):
37 | # Update param groups to display the correct learning rate
38 | self.param_groups = self.primary_optimizer.param_groups
39 | self.primary_optimizer.step()
40 |
41 | def zero_grad(self, set_to_none=False):
42 | self.primary_optimizer.zero_grad(set_to_none)
43 | self.secondary_optimizer.zero_grad(set_to_none)
44 |
45 | def load_state_dict(self, state_dict):
46 | self.primary_optimizer.load_state_dict(state_dict[0])
47 | self.secondary_optimizer.load_state_dict(state_dict[1])
48 |
49 | def state_dict(self):
50 | return [self.primary_optimizer.state_dict(), self.secondary_optimizer.state_dict()]
51 |
52 | @staticmethod
53 | def split_model_parameters(model_params: Generator) -> list:
54 | primary_params = []
55 | secondary_params = []
56 | for name, param in model_params:
57 | if param.requires_grad:
58 | if name == "capacitron_vae_layer.beta":
59 | secondary_params.append(param)
60 | else:
61 | primary_params.append(param)
62 | return [iter(primary_params), iter(secondary_params)]
63 |
64 | @staticmethod
65 | def extract_optimizer_parameters(params: dict) -> dict:
66 | """Extract parameters that are not the learning rate"""
67 | return {k: v for k, v in params.items() if k != "lr"}
68 |
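For illustration only, a small self-contained sketch (assumed module and parameter names, `torch` only) of how `split_model_parameters` above separates the Capacitron `beta` parameter from the rest of the model:

```python
import torch
from torch import nn

class TinyCapacitronLike(nn.Module):
    """Toy model with one parameter named like the Capacitron beta."""
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(4, 4)
        self.capacitron_vae_layer = nn.Module()
        self.capacitron_vae_layer.beta = nn.Parameter(torch.tensor(1.0))

model = TinyCapacitronLike()
primary, secondary = [], []
for name, param in model.named_parameters():
    if param.requires_grad:
        # beta goes to the secondary optimizer, everything else to the primary one
        (secondary if name == "capacitron_vae_layer.beta" else primary).append(param)
print(len(primary), len(secondary))  # 2 1
```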
--------------------------------------------------------------------------------
/TTS/utils/distribute.py:
--------------------------------------------------------------------------------
1 | # edited from https://github.com/fastai/imagenet-fast/blob/master/imagenet_nv/distributed.py
2 | import torch
3 | import torch.distributed as dist
4 |
5 |
6 | def reduce_tensor(tensor, num_gpus):
7 | rt = tensor.clone()
8 | dist.all_reduce(rt, op=dist.ReduceOp.SUM)
9 | rt /= num_gpus
10 | return rt
11 |
12 |
13 | def init_distributed(rank, num_gpus, group_name, dist_backend, dist_url):
14 | assert torch.cuda.is_available(), "Distributed mode requires CUDA."
15 |
16 | # Set cuda device so everything is done on the right GPU.
17 | torch.cuda.set_device(rank % torch.cuda.device_count())
18 |
19 | # Initialize distributed communication
20 | dist.init_process_group(dist_backend, init_method=dist_url, world_size=num_gpus, rank=rank, group_name=group_name)
21 |
--------------------------------------------------------------------------------
/TTS/utils/io.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pickle as pickle_tts
3 | from typing import Any, Callable, Dict, Union
4 |
5 | import fsspec
6 | import torch
7 |
8 | from TTS.utils.generic_utils import get_user_data_dir
9 |
10 |
11 | class RenamingUnpickler(pickle_tts.Unpickler):
12 | """Overload default pickler to solve module renaming problem"""
13 |
14 | def find_class(self, module, name):
15 | return super().find_class(module.replace("mozilla_voice_tts", "TTS"), name)
16 |
17 |
18 | class AttrDict(dict):
19 | """A custom dict which converts dict keys
20 | to class attributes"""
21 |
22 | def __init__(self, *args, **kwargs):
23 | super().__init__(*args, **kwargs)
24 | self.__dict__ = self
25 |
26 |
27 | def load_fsspec(
28 | path: str,
29 | map_location: Union[str, Callable, torch.device, Dict[Union[str, torch.device], Union[str, torch.device]]] = None,
30 | cache: bool = True,
31 | **kwargs,
32 | ) -> Any:
33 | """Like torch.load but can load from other locations (e.g. s3:// , gs://).
34 |
35 | Args:
36 | path: Any path or url supported by fsspec.
37 | map_location: torch.device or str.
38 | cache: If True, cache a remote file locally for subsequent calls. It is cached under `get_user_data_dir()/tts_cache`. Defaults to True.
39 | **kwargs: Keyword arguments forwarded to torch.load.
40 |
41 | Returns:
42 | Object stored in path.
43 | """
44 | is_local = os.path.isdir(path) or os.path.isfile(path)
45 | if cache and not is_local:
46 | with fsspec.open(
47 | f"filecache::{path}",
48 | filecache={"cache_storage": str(get_user_data_dir("tts_cache"))},
49 | mode="rb",
50 | ) as f:
51 | return torch.load(f, map_location=map_location, **kwargs)
52 | else:
53 | with fsspec.open(path, "rb") as f:
54 | return torch.load(f, map_location=map_location, **kwargs)
55 |
56 |
57 | def load_checkpoint(
58 | model, checkpoint_path, use_cuda=False, eval=False, cache=False
59 | ): # pylint: disable=redefined-builtin
60 | try:
61 | state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
62 | except ModuleNotFoundError:
63 | pickle_tts.Unpickler = RenamingUnpickler
64 | state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), pickle_module=pickle_tts, cache=cache)
65 | model.load_state_dict(state["model"])
66 | if use_cuda:
67 | model.cuda()
68 | if eval:
69 | model.eval()
70 | return model, state
71 |
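A minimal local round-trip sketch for `load_fsspec` (assumes the TTS package, `fsspec` and `torch` are installed); remote paths such as `s3://` behave the same way and are cached locally when `cache=True`:

```python
import torch
from TTS.utils.io import load_fsspec

# save a dummy checkpoint, then load it back through fsspec
torch.save({"model": {"w": torch.zeros(2)}}, "/tmp/dummy_ckpt.pth")
state = load_fsspec("/tmp/dummy_ckpt.pth", map_location="cpu")
print(state["model"]["w"].shape)  # torch.Size([2])
```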
--------------------------------------------------------------------------------
/TTS/utils/training.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 |
4 |
5 | def check_update(model, grad_clip, ignore_stopnet=False, amp_opt_params=None):
6 | r"""Check model gradient against unexpected jumps and failures"""
7 | skip_flag = False
8 | if ignore_stopnet:
9 | if not amp_opt_params:
10 | grad_norm = torch.nn.utils.clip_grad_norm_(
11 | [param for name, param in model.named_parameters() if "stopnet" not in name], grad_clip
12 | )
13 | else:
14 | grad_norm = torch.nn.utils.clip_grad_norm_(amp_opt_params, grad_clip)
15 | else:
16 | if not amp_opt_params:
17 | grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
18 | else:
19 | grad_norm = torch.nn.utils.clip_grad_norm_(amp_opt_params, grad_clip)
20 |
21 | # compatibility with different torch versions
22 | if isinstance(grad_norm, float):
23 | if np.isinf(grad_norm):
24 | print(" | > Gradient is INF !!")
25 | skip_flag = True
26 | else:
27 | if torch.isinf(grad_norm):
28 | print(" | > Gradient is INF !!")
29 | skip_flag = True
30 | return grad_norm, skip_flag
31 |
32 |
33 | def gradual_training_scheduler(global_step, config):
34 | """Setup the gradual training schedule wrt number
35 | of active GPUs"""
36 | num_gpus = torch.cuda.device_count()
37 | if num_gpus == 0:
38 | num_gpus = 1
39 | new_values = None
40 | # we set the scheduling wrt num_gpus
41 | for values in config.gradual_training:
42 | if global_step * num_gpus >= values[0]:
43 | new_values = values
44 | return new_values[1], new_values[2]
45 |
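A quick sanity-check sketch for `check_update` (assumes `torch` and this module are importable), clipping the gradients of a tiny linear model:

```python
import torch
from TTS.utils.training import check_update

model = torch.nn.Linear(3, 1)
loss = model(torch.randn(4, 3)).sum()
loss.backward()
# returns the gradient norm and whether the update should be skipped
grad_norm, skip = check_update(model, grad_clip=1.0)
print(float(grad_norm), skip)  # finite norm, skip == False
```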
--------------------------------------------------------------------------------
/TTS/utils/vad.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torchaudio
3 |
4 |
5 | def read_audio(path):
6 | wav, sr = torchaudio.load(path)
7 |
8 | if wav.size(0) > 1:
9 | wav = wav.mean(dim=0, keepdim=True)
10 |
11 | return wav.squeeze(0), sr
12 |
13 |
14 | def resample_wav(wav, sr, new_sr):
15 | wav = wav.unsqueeze(0)
16 | transform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=new_sr)
17 | wav = transform(wav)
18 | return wav.squeeze(0)
19 |
20 |
21 | def map_timestamps_to_new_sr(vad_sr, new_sr, timestamps, just_begging_end=False):
22 | factor = new_sr / vad_sr
23 | new_timestamps = []
24 | if just_begging_end and timestamps:
25 | # get just the start and end timestamps
26 | new_dict = {"start": int(timestamps[0]["start"] * factor), "end": int(timestamps[-1]["end"] * factor)}
27 | new_timestamps.append(new_dict)
28 | else:
29 | for ts in timestamps:
30 | # map to the new SR
31 | new_dict = {"start": int(ts["start"] * factor), "end": int(ts["end"] * factor)}
32 | new_timestamps.append(new_dict)
33 |
34 | return new_timestamps
35 |
36 |
37 | def get_vad_model_and_utils(use_cuda=False, use_onnx=False):
38 | model, utils = torch.hub.load(
39 | repo_or_dir="snakers4/silero-vad", model="silero_vad", force_reload=True, onnx=use_onnx, force_onnx_cpu=True
40 | )
41 | if use_cuda:
42 | model = model.cuda()
43 |
44 | get_speech_timestamps, save_audio, _, _, collect_chunks = utils
45 | return model, get_speech_timestamps, save_audio, collect_chunks
46 |
47 |
48 | def remove_silence(
49 | model_and_utils, audio_path, out_path, vad_sample_rate=8000, trim_just_beginning_and_end=True, use_cuda=False
50 | ):
51 | # get the VAD model and utils functions
52 | model, get_speech_timestamps, _, collect_chunks = model_and_utils
53 |
54 | # read ground truth wav and resample the audio for the VAD
55 | try:
56 | wav, gt_sample_rate = read_audio(audio_path)
57 | except Exception:
58 | print(f"> ❗ Failed to read {audio_path}")
59 | return None, False
60 |
61 | # if needed, resample the audio for the VAD model
62 | if gt_sample_rate != vad_sample_rate:
63 | wav_vad = resample_wav(wav, gt_sample_rate, vad_sample_rate)
64 | else:
65 | wav_vad = wav
66 |
67 | if use_cuda:
68 | wav_vad = wav_vad.cuda()
69 |
70 | # get speech timestamps from full audio file
71 | speech_timestamps = get_speech_timestamps(wav_vad, model, sampling_rate=vad_sample_rate, window_size_samples=768)
72 |
73 | # map the current speech_timestamps to the sample rate of the ground truth audio
74 | new_speech_timestamps = map_timestamps_to_new_sr(
75 | vad_sample_rate, gt_sample_rate, speech_timestamps, trim_just_beginning_and_end
76 | )
77 |
78 | # if speech timestamps were found, keep only the speech chunks; otherwise save the original wav
79 | if new_speech_timestamps:
80 | wav = collect_chunks(new_speech_timestamps, wav)
81 | is_speech = True
82 | else:
83 | print(f"> The file {audio_path} probably does not have speech please check it !!")
84 | is_speech = False
85 |
86 | # save
87 | torchaudio.save(out_path, wav[None, :], gt_sample_rate)
88 | return out_path, is_speech
89 |
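A small sketch of `map_timestamps_to_new_sr` with made-up timestamps (importing this module requires `torch`/`torchaudio`): VAD timestamps taken at 8 kHz are mapped back to a 16 kHz ground-truth file, i.e. scaled by a factor of 2.

```python
from TTS.utils.vad import map_timestamps_to_new_sr

ts = [{"start": 800, "end": 4000}, {"start": 5600, "end": 7200}]
print(map_timestamps_to_new_sr(8000, 16000, ts))
# [{'start': 1600, 'end': 8000}, {'start': 11200, 'end': 14400}]
print(map_timestamps_to_new_sr(8000, 16000, ts, just_begging_end=True))
# [{'start': 1600, 'end': 14400}] -> only the outermost start/end are kept
```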
--------------------------------------------------------------------------------
/TTS/vc/configs/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/vc/configs/__init__.py
--------------------------------------------------------------------------------
/TTS/vc/models/__init__.py:
--------------------------------------------------------------------------------
1 | import importlib
2 | import re
3 | from typing import Dict, List, Union
4 |
5 |
6 | def to_camel(text):
7 | text = text.capitalize()
8 | return re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text)
9 |
10 |
11 | def setup_model(config: "Coqpit", samples: Union[List[List], List[Dict]] = None) -> "BaseVC":
12 | print(" > Using model: {}".format(config.model))
13 | # fetch the right model implementation.
14 | if "model" in config and config["model"].lower() == "freevc":
15 | MyModel = importlib.import_module("TTS.vc.models.freevc").FreeVC
16 | model = MyModel.init_from_config(config, samples)
17 | return model
18 |
--------------------------------------------------------------------------------
/TTS/vc/modules/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/vc/modules/__init__.py
--------------------------------------------------------------------------------
/TTS/vc/modules/freevc/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/vc/modules/freevc/__init__.py
--------------------------------------------------------------------------------
/TTS/vc/modules/freevc/speaker_encoder/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/vc/modules/freevc/speaker_encoder/__init__.py
--------------------------------------------------------------------------------
/TTS/vc/modules/freevc/speaker_encoder/audio.py:
--------------------------------------------------------------------------------
1 | import struct
2 | from pathlib import Path
3 | from typing import Optional, Union
4 |
5 | # import webrtcvad
6 | import librosa
7 | import numpy as np
8 | from scipy.ndimage import binary_dilation
9 |
10 | from TTS.vc.modules.freevc.speaker_encoder.hparams import *
11 |
12 | int16_max = (2**15) - 1
13 |
14 |
15 | def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray], source_sr: Optional[int] = None):
16 | """
17 | Applies the preprocessing operations used in training the Speaker Encoder to a waveform
18 | either on disk or in memory. The waveform will be resampled to match the data hyperparameters.
19 |
20 | :param fpath_or_wav: either a filepath to an audio file (many extensions are supported, not
21 | just .wav), or the waveform as a numpy array of floats.
22 | :param source_sr: if passing an audio waveform, the sampling rate of the waveform before
23 | preprocessing. After preprocessing, the waveform's sampling rate will match the data
24 | hyperparameters. If passing a filepath, the sampling rate will be automatically detected and
25 | this argument will be ignored.
26 | """
27 | # Load the wav from disk if needed
28 | if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path):
29 | wav, source_sr = librosa.load(fpath_or_wav, sr=None)
30 | else:
31 | wav = fpath_or_wav
32 |
33 | # Resample the wav if needed
34 | if source_sr is not None and source_sr != sampling_rate:
35 | wav = librosa.resample(wav, orig_sr=source_sr, target_sr=sampling_rate)
36 |
37 | # Apply the preprocessing: normalize volume and shorten long silences
38 | wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True)
39 | wav = trim_long_silences(wav)
40 |
41 | return wav
42 |
43 |
44 | def wav_to_mel_spectrogram(wav):
45 | """
46 | Derives a mel spectrogram ready to be used by the encoder from a preprocessed audio waveform.
47 | Note: this is not a log-mel spectrogram.
48 | """
49 | frames = librosa.feature.melspectrogram(
50 | y=wav,
51 | sr=sampling_rate,
52 | n_fft=int(sampling_rate * mel_window_length / 1000),
53 | hop_length=int(sampling_rate * mel_window_step / 1000),
54 | n_mels=mel_n_channels,
55 | )
56 | return frames.astype(np.float32).T
57 |
58 |
59 | def normalize_volume(wav, target_dBFS, increase_only=False, decrease_only=False):
60 | if increase_only and decrease_only:
61 | raise ValueError("Both increase only and decrease only are set")
62 | dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav**2))
63 | if (dBFS_change < 0 and increase_only) or (dBFS_change > 0 and decrease_only):
64 | return wav
65 | return wav * (10 ** (dBFS_change / 20))
66 |
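A numpy-only check (not from the original file) of the dBFS math used by `normalize_volume` above, raising a quiet sine to the `-30` dBFS target defined in `hparams`:

```python
import numpy as np

wav = 0.01 * np.sin(2 * np.pi * 440 * np.arange(16000) / 16000)
dbfs = 10 * np.log10(np.mean(wav**2))          # roughly -43 dBFS for this quiet sine
gain_db = -30 - dbfs                           # gain needed to hit the target
louder = wav * (10 ** (gain_db / 20))
print(round(dbfs, 1), round(10 * np.log10(np.mean(louder**2)), 1))  # ... -30.0
```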
--------------------------------------------------------------------------------
/TTS/vc/modules/freevc/speaker_encoder/hparams.py:
--------------------------------------------------------------------------------
1 | ## Mel-filterbank
2 | mel_window_length = 25 # In milliseconds
3 | mel_window_step = 10 # In milliseconds
4 | mel_n_channels = 40
5 |
6 |
7 | ## Audio
8 | sampling_rate = 16000
9 | # Number of spectrogram frames in a partial utterance
10 | partials_n_frames = 160 # 1600 ms
11 |
12 |
13 | ## Voice Activation Detection
14 | # Window size of the VAD. Must be either 10, 20 or 30 milliseconds.
15 | # This sets the granularity of the VAD. Should not need to be changed.
16 | vad_window_length = 30 # In milliseconds
17 | # Number of frames to average together when performing the moving average smoothing.
18 | # The larger this value, the larger the VAD variations must be to not get smoothed out.
19 | vad_moving_average_width = 8
20 | # Maximum number of consecutive silent frames a segment can have.
21 | vad_max_silence_length = 6
22 |
23 |
24 | ## Audio volume normalization
25 | audio_norm_target_dBFS = -30
26 |
27 |
28 | ## Model parameters
29 | model_hidden_size = 256
30 | model_embedding_size = 256
31 | model_num_layers = 3
32 |
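For reference, a short calculation (not part of the original file) of the STFT parameters these millisecond settings imply at the 16 kHz sampling rate:

```python
sampling_rate = 16000
mel_window_length = 25   # ms
mel_window_step = 10     # ms
n_fft = int(sampling_rate * mel_window_length / 1000)      # 400 samples
hop_length = int(sampling_rate * mel_window_step / 1000)   # 160 samples
print(n_fft, hop_length)  # 400 160
```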
--------------------------------------------------------------------------------
/TTS/vc/modules/freevc/wavlm/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | import urllib.request
3 |
4 | import torch
5 |
6 | from TTS.utils.generic_utils import get_user_data_dir
7 | from TTS.vc.modules.freevc.wavlm.wavlm import WavLM, WavLMConfig
8 |
9 | model_uri = "https://github.com/coqui-ai/TTS/releases/download/v0.13.0_models/WavLM-Large.pt"
10 |
11 |
12 | def get_wavlm(device="cpu"):
13 | """Download the model and return the model object."""
14 |
15 | output_path = get_user_data_dir("tts")
16 |
17 | output_path = os.path.join(output_path, "wavlm")
18 | if not os.path.exists(output_path):
19 | os.makedirs(output_path)
20 |
21 | output_path = os.path.join(output_path, "WavLM-Large.pt")
22 | if not os.path.exists(output_path):
23 | print(f" > Downloading WavLM model to {output_path} ...")
24 | urllib.request.urlretrieve(model_uri, output_path)
25 |
26 | checkpoint = torch.load(output_path, map_location=torch.device(device))
27 | cfg = WavLMConfig(checkpoint["cfg"])
28 | wavlm = WavLM(cfg).to(device)
29 | wavlm.load_state_dict(checkpoint["model"])
30 | wavlm.eval()
31 | return wavlm
32 |
33 |
34 | if __name__ == "__main__":
35 | wavlm = get_wavlm()
36 |
--------------------------------------------------------------------------------
/TTS/vc/modules/freevc/wavlm/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "_name_or_path": "./wavlm-large/",
3 | "activation_dropout": 0.0,
4 | "adapter_kernel_size": 3,
5 | "adapter_stride": 2,
6 | "add_adapter": false,
7 | "apply_spec_augment": true,
8 | "architectures": [
9 | "WavLMModel"
10 | ],
11 | "attention_dropout": 0.1,
12 | "bos_token_id": 1,
13 | "classifier_proj_size": 256,
14 | "codevector_dim": 768,
15 | "contrastive_logits_temperature": 0.1,
16 | "conv_bias": false,
17 | "conv_dim": [
18 | 512,
19 | 512,
20 | 512,
21 | 512,
22 | 512,
23 | 512,
24 | 512
25 | ],
26 | "conv_kernel": [
27 | 10,
28 | 3,
29 | 3,
30 | 3,
31 | 3,
32 | 2,
33 | 2
34 | ],
35 | "conv_stride": [
36 | 5,
37 | 2,
38 | 2,
39 | 2,
40 | 2,
41 | 2,
42 | 2
43 | ],
44 | "ctc_loss_reduction": "sum",
45 | "ctc_zero_infinity": false,
46 | "diversity_loss_weight": 0.1,
47 | "do_stable_layer_norm": true,
48 | "eos_token_id": 2,
49 | "feat_extract_activation": "gelu",
50 | "feat_extract_dropout": 0.0,
51 | "feat_extract_norm": "layer",
52 | "feat_proj_dropout": 0.1,
53 | "feat_quantizer_dropout": 0.0,
54 | "final_dropout": 0.0,
55 | "gradient_checkpointing": false,
56 | "hidden_act": "gelu",
57 | "hidden_dropout": 0.1,
58 | "hidden_size": 1024,
59 | "initializer_range": 0.02,
60 | "intermediate_size": 4096,
61 | "layer_norm_eps": 1e-05,
62 | "layerdrop": 0.1,
63 | "mask_channel_length": 10,
64 | "mask_channel_min_space": 1,
65 | "mask_channel_other": 0.0,
66 | "mask_channel_prob": 0.0,
67 | "mask_channel_selection": "static",
68 | "mask_feature_length": 10,
69 | "mask_feature_min_masks": 0,
70 | "mask_feature_prob": 0.0,
71 | "mask_time_length": 10,
72 | "mask_time_min_masks": 2,
73 | "mask_time_min_space": 1,
74 | "mask_time_other": 0.0,
75 | "mask_time_prob": 0.075,
76 | "mask_time_selection": "static",
77 | "max_bucket_distance": 800,
78 | "model_type": "wavlm",
79 | "num_adapter_layers": 3,
80 | "num_attention_heads": 16,
81 | "num_buckets": 320,
82 | "num_codevector_groups": 2,
83 | "num_codevectors_per_group": 320,
84 | "num_conv_pos_embedding_groups": 16,
85 | "num_conv_pos_embeddings": 128,
86 | "num_ctc_classes": 80,
87 | "num_feat_extract_layers": 7,
88 | "num_hidden_layers": 24,
89 | "num_negatives": 100,
90 | "output_hidden_size": 1024,
91 | "pad_token_id": 0,
92 | "proj_codevector_dim": 768,
93 | "replace_prob": 0.5,
94 | "tokenizer_class": "Wav2Vec2CTCTokenizer",
95 | "torch_dtype": "float32",
96 | "transformers_version": "4.15.0.dev0",
97 | "use_weighted_layer_sum": false,
98 | "vocab_size": 32
99 | }
--------------------------------------------------------------------------------
/TTS/vocoder/README.md:
--------------------------------------------------------------------------------
1 | # Mozilla TTS Vocoders (Experimental)
2 |
3 | Here you can find vocoder model implementations which can be combined with the other TTS models.
4 |
5 | Currently, the following models are implemented:
6 |
7 | - Melgan
8 | - MultiBand-Melgan
9 | - ParallelWaveGAN
10 | - GAN-TTS (Discriminator Only)
11 |
12 | It is also very easy to adapt different vocoder models as we provide a flexible and modular (but not too modular) framework.
13 |
14 | ## Training a model
15 |
16 | An example Colab notebook training MelGAN on the LJSpeech dataset will be added soon.
17 | 
18 | In order to train a new model, you need to gather all wav files into a folder and set this folder as `data_path` in `config.json`.
19 | 
20 | You need to define the other relevant parameters in your `config.json` and then start training with the following command.
21 |
22 | ```CUDA_VISIBLE_DEVICES='0' python TTS/bin/train_vocoder.py --config_path path/to/config.json```
23 |
24 | Example config files can be found under the `TTS/vocoder/configs/` folder.
25 |
26 | You can continue a previous training run by the following command.
27 |
28 | ```CUDA_VISIBLE_DEVICES='0' python TTS/bin/train_vocoder.py --continue_path path/to/your/model/folder```
29 |
30 | You can fine-tune a pre-trained model by the following command.
31 |
32 | ```CUDA_VISIBLE_DEVICES='0' python TTS/bin/train_vocoder.py --restore_path path/to/your/model.pth```
33 |
34 | Restoring a model starts a new training run in a new folder and only loads the model weights from the given checkpoint file. Continuing a training run, on the other hand, resumes from the same directory where the previous run left off.
35 |
36 | You can also follow your training runs on Tensorboard as you do with our TTS models.
37 |
38 | ## Acknowledgement
39 | Thanks to @kan-bayashi for his [repository](https://github.com/kan-bayashi/ParallelWaveGAN) being the start point of our work.
40 |
--------------------------------------------------------------------------------
/TTS/vocoder/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/vocoder/__init__.py
--------------------------------------------------------------------------------
/TTS/vocoder/configs/__init__.py:
--------------------------------------------------------------------------------
1 | import importlib
2 | import os
3 | from inspect import isclass
4 |
5 | # import all files under configs/
6 | configs_dir = os.path.dirname(__file__)
7 | for file in os.listdir(configs_dir):
8 | path = os.path.join(configs_dir, file)
9 | if not file.startswith("_") and not file.startswith(".") and (file.endswith(".py") or os.path.isdir(path)):
10 | config_name = file[: file.find(".py")] if file.endswith(".py") else file
11 | module = importlib.import_module("TTS.vocoder.configs." + config_name)
12 | for attribute_name in dir(module):
13 | attribute = getattr(module, attribute_name)
14 |
15 | if isclass(attribute):
16 | # Add the class to this package's variables
17 | globals()[attribute_name] = attribute
18 |
--------------------------------------------------------------------------------
/TTS/vocoder/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 |
3 | from coqpit import Coqpit
4 | from torch.utils.data import Dataset
5 |
6 | from TTS.utils.audio import AudioProcessor
7 | from TTS.vocoder.datasets.gan_dataset import GANDataset
8 | from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data
9 | from TTS.vocoder.datasets.wavegrad_dataset import WaveGradDataset
10 | from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset
11 |
12 |
13 | def setup_dataset(config: Coqpit, ap: AudioProcessor, is_eval: bool, data_items: List, verbose: bool) -> Dataset:
14 | if config.model.lower() == "gan":
15 | dataset = GANDataset(
16 | ap=ap,
17 | items=data_items,
18 | seq_len=config.seq_len,
19 | hop_len=ap.hop_length,
20 | pad_short=config.pad_short,
21 | conv_pad=config.conv_pad,
22 | return_pairs=config.diff_samples_for_G_and_D if "diff_samples_for_G_and_D" in config else False,
23 | is_training=not is_eval,
24 | return_segments=not is_eval,
25 | use_noise_augment=config.use_noise_augment,
26 | use_cache=config.use_cache,
27 | verbose=verbose,
28 | )
29 | dataset.shuffle_mapping()
30 | elif config.model.lower() == "wavegrad":
31 | dataset = WaveGradDataset(
32 | ap=ap,
33 | items=data_items,
34 | seq_len=config.seq_len,
35 | hop_len=ap.hop_length,
36 | pad_short=config.pad_short,
37 | conv_pad=config.conv_pad,
38 | is_training=not is_eval,
39 | return_segments=True,
40 | use_noise_augment=False,
41 | use_cache=config.use_cache,
42 | verbose=verbose,
43 | )
44 | elif config.model.lower() == "wavernn":
45 | dataset = WaveRNNDataset(
46 | ap=ap,
47 | items=data_items,
48 | seq_len=config.seq_len,
49 | hop_len=ap.hop_length,
50 | pad=config.model_params.pad,
51 | mode=config.model_params.mode,
52 | mulaw=config.model_params.mulaw,
53 | is_training=not is_eval,
54 | verbose=verbose,
55 | )
56 | else:
57 | raise ValueError(f" [!] Dataset for model {config.model.lower()} cannot be found.")
58 | return dataset
59 |
--------------------------------------------------------------------------------
/TTS/vocoder/datasets/preprocess.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import os
3 | from pathlib import Path
4 |
5 | import numpy as np
6 | from coqpit import Coqpit
7 | from tqdm import tqdm
8 |
9 | from TTS.utils.audio import AudioProcessor
10 | from TTS.utils.audio.numpy_transforms import mulaw_encode, quantize
11 |
12 |
13 | def preprocess_wav_files(out_path: str, config: Coqpit, ap: AudioProcessor):
14 | """Process wav and compute mel and quantized wave signal.
15 | It is mainly used by WaveRNN dataloader.
16 |
17 | Args:
18 | out_path (str): Parent folder path to save the files.
19 | config (Coqpit): Model config.
20 | ap (AudioProcessor): Audio processor.
21 | """
22 | os.makedirs(os.path.join(out_path, "quant"), exist_ok=True)
23 | os.makedirs(os.path.join(out_path, "mel"), exist_ok=True)
24 | wav_files = find_wav_files(config.data_path)
25 | for path in tqdm(wav_files):
26 | wav_name = Path(path).stem
27 | quant_path = os.path.join(out_path, "quant", wav_name + ".npy")
28 | mel_path = os.path.join(out_path, "mel", wav_name + ".npy")
29 | y = ap.load_wav(path)
30 | mel = ap.melspectrogram(y)
31 | np.save(mel_path, mel)
32 | if isinstance(config.mode, int):
33 | quant = (
34 | mulaw_encode(wav=y, mulaw_qc=config.mode)
35 | if config.model_args.mulaw
36 | else quantize(x=y, quantize_bits=config.mode)
37 | )
38 | np.save(quant_path, quant)
39 |
40 |
41 | def find_wav_files(data_path, file_ext="wav"):
42 | wav_paths = glob.glob(os.path.join(data_path, "**", f"*.{file_ext}"), recursive=True)
43 | return wav_paths
44 |
45 |
46 | def find_feat_files(data_path):
47 | feat_paths = glob.glob(os.path.join(data_path, "**", "*.npy"), recursive=True)
48 | return feat_paths
49 |
50 |
51 | def load_wav_data(data_path, eval_split_size, file_ext="wav"):
52 | wav_paths = find_wav_files(data_path, file_ext=file_ext)
53 | assert len(wav_paths) > 0, f" [!] {data_path} is empty."
54 | np.random.seed(0)
55 | np.random.shuffle(wav_paths)
56 | return wav_paths[:eval_split_size], wav_paths[eval_split_size:]
57 |
58 |
59 | def load_wav_feat_data(data_path, feat_path, eval_split_size):
60 | wav_paths = find_wav_files(data_path)
61 | feat_paths = find_feat_files(feat_path)
62 |
63 | wav_paths.sort(key=lambda x: Path(x).stem)
64 | feat_paths.sort(key=lambda x: Path(x).stem)
65 |
66 | assert len(wav_paths) == len(feat_paths), f" [!] {len(wav_paths)} vs {len(feat_paths)}"
67 | for wav, feat in zip(wav_paths, feat_paths):
68 | wav_name = Path(wav).stem
69 | feat_name = Path(feat).stem
70 | assert wav_name == feat_name
71 |
72 | items = list(zip(wav_paths, feat_paths))
73 | np.random.seed(0)
74 | np.random.shuffle(items)
75 | return items[:eval_split_size], items[eval_split_size:]
76 |
--------------------------------------------------------------------------------
/TTS/vocoder/layers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/vocoder/layers/__init__.py
--------------------------------------------------------------------------------
/TTS/vocoder/layers/hifigan.py:
--------------------------------------------------------------------------------
1 | from torch import nn
2 | from torch.nn.utils.parametrize import remove_parametrizations
3 |
4 |
5 | # pylint: disable=dangerous-default-value
6 | class ResStack(nn.Module):
7 | def __init__(self, kernel, channel, padding, dilations=[1, 3, 5]):
8 | super().__init__()
9 | resstack = []
10 | for dilation in dilations:
11 | resstack += [
12 | nn.LeakyReLU(0.2),
13 | nn.ReflectionPad1d(dilation),
14 | nn.utils.parametrizations.weight_norm(
15 | nn.Conv1d(channel, channel, kernel_size=kernel, dilation=dilation)
16 | ),
17 | nn.LeakyReLU(0.2),
18 | nn.ReflectionPad1d(padding),
19 | nn.utils.parametrizations.weight_norm(nn.Conv1d(channel, channel, kernel_size=1)),
20 | ]
21 | self.resstack = nn.Sequential(*resstack)
22 |
23 | self.shortcut = nn.utils.parametrizations.weight_norm(nn.Conv1d(channel, channel, kernel_size=1))
24 |
25 | def forward(self, x):
26 | x1 = self.shortcut(x)
27 | x2 = self.resstack(x)
28 | return x1 + x2
29 |
30 | def remove_weight_norm(self):
31 | remove_parametrizations(self.shortcut, "weight")
32 | remove_parametrizations(self.resstack[2], "weight")
33 | remove_parametrizations(self.resstack[5], "weight")
34 | remove_parametrizations(self.resstack[8], "weight")
35 | remove_parametrizations(self.resstack[11], "weight")
36 | remove_parametrizations(self.resstack[14], "weight")
37 | remove_parametrizations(self.resstack[17], "weight")
38 |
39 |
40 | class MRF(nn.Module):
41 | def __init__(self, kernels, channel, dilations=[1, 3, 5]): # # pylint: disable=dangerous-default-value
42 | super().__init__()
43 | self.resblock1 = ResStack(kernels[0], channel, 0, dilations)
44 | self.resblock2 = ResStack(kernels[1], channel, 6, dilations)
45 | self.resblock3 = ResStack(kernels[2], channel, 12, dilations)
46 |
47 | def forward(self, x):
48 | x1 = self.resblock1(x)
49 | x2 = self.resblock2(x)
50 | x3 = self.resblock3(x)
51 | return x1 + x2 + x3
52 |
53 | def remove_weight_norm(self):
54 | self.resblock1.remove_weight_norm()
55 | self.resblock2.remove_weight_norm()
56 | self.resblock3.remove_weight_norm()
57 |
--------------------------------------------------------------------------------
/TTS/vocoder/layers/melgan.py:
--------------------------------------------------------------------------------
1 | from torch import nn
2 | from torch.nn.utils.parametrizations import weight_norm
3 | from torch.nn.utils.parametrize import remove_parametrizations
4 |
5 |
6 | class ResidualStack(nn.Module):
7 | def __init__(self, channels, num_res_blocks, kernel_size):
8 | super().__init__()
9 |
10 | assert (kernel_size - 1) % 2 == 0, " [!] kernel_size has to be odd."
11 | base_padding = (kernel_size - 1) // 2
12 |
13 | self.blocks = nn.ModuleList()
14 | for idx in range(num_res_blocks):
15 | layer_kernel_size = kernel_size
16 | layer_dilation = layer_kernel_size**idx
17 | layer_padding = base_padding * layer_dilation
18 | self.blocks += [
19 | nn.Sequential(
20 | nn.LeakyReLU(0.2),
21 | nn.ReflectionPad1d(layer_padding),
22 | weight_norm(
23 | nn.Conv1d(channels, channels, kernel_size=kernel_size, dilation=layer_dilation, bias=True)
24 | ),
25 | nn.LeakyReLU(0.2),
26 | weight_norm(nn.Conv1d(channels, channels, kernel_size=1, bias=True)),
27 | )
28 | ]
29 |
30 | self.shortcuts = nn.ModuleList(
31 | [weight_norm(nn.Conv1d(channels, channels, kernel_size=1, bias=True)) for _ in range(num_res_blocks)]
32 | )
33 |
34 | def forward(self, x):
35 | for block, shortcut in zip(self.blocks, self.shortcuts):
36 | x = shortcut(x) + block(x)
37 | return x
38 |
39 | def remove_weight_norm(self):
40 | for block, shortcut in zip(self.blocks, self.shortcuts):
41 | remove_parametrizations(block[2], "weight")
42 | remove_parametrizations(block[4], "weight")
43 | remove_parametrizations(shortcut, "weight")
44 |
--------------------------------------------------------------------------------
/TTS/vocoder/layers/parallel_wavegan.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.nn import functional as F
3 |
4 |
5 | class ResidualBlock(torch.nn.Module):
6 | """Residual block module in WaveNet."""
7 |
8 | def __init__(
9 | self,
10 | kernel_size=3,
11 | res_channels=64,
12 | gate_channels=128,
13 | skip_channels=64,
14 | aux_channels=80,
15 | dropout=0.0,
16 | dilation=1,
17 | bias=True,
18 | use_causal_conv=False,
19 | ):
20 | super().__init__()
21 | self.dropout = dropout
22 | # no future time stamps available
23 | if use_causal_conv:
24 | padding = (kernel_size - 1) * dilation
25 | else:
26 | assert (kernel_size - 1) % 2 == 0, "Even kernel sizes are not supported."
27 | padding = (kernel_size - 1) // 2 * dilation
28 | self.use_causal_conv = use_causal_conv
29 |
30 | # dilation conv
31 | self.conv = torch.nn.Conv1d(
32 | res_channels, gate_channels, kernel_size, padding=padding, dilation=dilation, bias=bias
33 | )
34 |
35 | # local conditioning
36 | if aux_channels > 0:
37 | self.conv1x1_aux = torch.nn.Conv1d(aux_channels, gate_channels, 1, bias=False)
38 | else:
39 | self.conv1x1_aux = None
40 |
41 | # conv output is split into two groups
42 | gate_out_channels = gate_channels // 2
43 | self.conv1x1_out = torch.nn.Conv1d(gate_out_channels, res_channels, 1, bias=bias)
44 | self.conv1x1_skip = torch.nn.Conv1d(gate_out_channels, skip_channels, 1, bias=bias)
45 |
46 | def forward(self, x, c):
47 | """
48 | x: B x D_res x T
49 | c: B x D_aux x T
50 | """
51 | residual = x
52 | x = F.dropout(x, p=self.dropout, training=self.training)
53 | x = self.conv(x)
54 |
55 | # remove future time steps if using causal conv
56 | x = x[:, :, : residual.size(-1)] if self.use_causal_conv else x
57 |
58 | # split into two parts for gated activation
59 | splitdim = 1
60 | xa, xb = x.split(x.size(splitdim) // 2, dim=splitdim)
61 |
62 | # local conditioning
63 | if c is not None:
64 | assert self.conv1x1_aux is not None
65 | c = self.conv1x1_aux(c)
66 | ca, cb = c.split(c.size(splitdim) // 2, dim=splitdim)
67 | xa, xb = xa + ca, xb + cb
68 |
69 | x = torch.tanh(xa) * torch.sigmoid(xb)
70 |
71 | # for skip connection
72 | s = self.conv1x1_skip(x)
73 |
74 | # for residual connection
75 | x = (self.conv1x1_out(x) + residual) * (0.5**2)
76 |
77 | return x, s
78 |
--------------------------------------------------------------------------------
/TTS/vocoder/layers/pqmf.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | import torch.nn.functional as F
4 | from scipy import signal as sig
5 |
6 |
7 | # adapted from
8 | # https://github.com/kan-bayashi/ParallelWaveGAN/tree/master/parallel_wavegan
9 | class PQMF(torch.nn.Module):
10 | def __init__(self, N=4, taps=62, cutoff=0.15, beta=9.0):
11 | super().__init__()
12 |
13 | self.N = N
14 | self.taps = taps
15 | self.cutoff = cutoff
16 | self.beta = beta
17 |
18 | QMF = sig.firwin(taps + 1, cutoff, window=("kaiser", beta))
19 | H = np.zeros((N, len(QMF)))
20 | G = np.zeros((N, len(QMF)))
21 | for k in range(N):
22 | constant_factor = (
23 | (2 * k + 1) * (np.pi / (2 * N)) * (np.arange(taps + 1) - ((taps - 1) / 2))
24 | ) # TODO: (taps - 1) -> taps
25 | phase = (-1) ** k * np.pi / 4
26 | H[k] = 2 * QMF * np.cos(constant_factor + phase)
27 |
28 | G[k] = 2 * QMF * np.cos(constant_factor - phase)
29 |
30 | H = torch.from_numpy(H[:, None, :]).float()
31 | G = torch.from_numpy(G[None, :, :]).float()
32 |
33 | self.register_buffer("H", H)
34 | self.register_buffer("G", G)
35 |
36 | updown_filter = torch.zeros((N, N, N)).float()
37 | for k in range(N):
38 | updown_filter[k, k, 0] = 1.0
39 | self.register_buffer("updown_filter", updown_filter)
40 | self.N = N
41 |
42 | self.pad_fn = torch.nn.ConstantPad1d(taps // 2, 0.0)
43 |
44 | def forward(self, x):
45 | return self.analysis(x)
46 |
47 | def analysis(self, x):
48 | return F.conv1d(x, self.H, padding=self.taps // 2, stride=self.N)
49 |
50 | def synthesis(self, x):
51 | x = F.conv_transpose1d(x, self.updown_filter * self.N, stride=self.N)
52 | x = F.conv1d(x, self.G, padding=self.taps // 2)
53 | return x
54 |
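A minimal round-trip sketch of the PQMF filter bank above (the module itself requires `torch`, `numpy` and `scipy`): analysis splits a waveform into N sub-bands at 1/N rate, and synthesis reconstructs the full-band signal.

```python
import torch
from TTS.vocoder.layers.pqmf import PQMF

pqmf = PQMF(N=4)
wav = torch.randn(1, 1, 16000)     # B x 1 x T
subbands = pqmf.analysis(wav)      # B x 4 x T/4
recon = pqmf.synthesis(subbands)   # B x 1 x T
print(subbands.shape, recon.shape)
```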
--------------------------------------------------------------------------------
/TTS/vocoder/models/base_vocoder.py:
--------------------------------------------------------------------------------
1 | from coqpit import Coqpit
2 |
3 | from TTS.model import BaseTrainerModel
4 |
5 | # pylint: skip-file
6 |
7 |
8 | class BaseVocoder(BaseTrainerModel):
9 | """Base `vocoder` class. Every new `vocoder` model must inherit this.
10 |
11 | It defines `vocoder` specific functions on top of `Model`.
12 |
13 | Notes on input/output tensor shapes:
14 | Any input or output tensor of the model must be shaped as
15 |
16 | - 3D tensors `batch x time x channels`
17 | - 2D tensors `batch x channels`
18 | - 1D tensors `batch x 1`
19 | """
20 |
21 | MODEL_TYPE = "vocoder"
22 |
23 | def __init__(self, config):
24 | super().__init__()
25 | self._set_model_args(config)
26 |
27 | def _set_model_args(self, config: Coqpit):
28 | """Setup model args based on the config type.
29 |
30 | If the config is for training with a name like "*Config", then the model args are embedded in the
31 | config.model_args
32 |
33 | If the config is for the model with a name like "*Args", then we assign it directly.
34 | """
35 | # don't use isinstance to avoid importing recursively
36 | if "Config" in config.__class__.__name__:
37 | if "characters" in config:
38 | _, self.config, num_chars = self.get_characters(config)
39 | self.config.num_chars = num_chars
40 | if hasattr(self.config, "model_args"):
41 | config.model_args.num_chars = num_chars
42 | if "model_args" in config:
43 | self.args = self.config.model_args
44 | # This is for backward compatibility
45 | if "model_params" in config:
46 | self.args = self.config.model_params
47 | else:
48 | self.config = config
49 | if "model_args" in config:
50 | self.args = self.config.model_args
51 | # This is for backward compatibility
52 | if "model_params" in config:
53 | self.args = self.config.model_params
54 | else:
55 | raise ValueError("config must be either a *Config or *Args")
56 |
--------------------------------------------------------------------------------
/TTS/vocoder/models/fullband_melgan_generator.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | from TTS.vocoder.models.melgan_generator import MelganGenerator
4 |
5 |
6 | class FullbandMelganGenerator(MelganGenerator):
7 | def __init__(
8 | self,
9 | in_channels=80,
10 | out_channels=1,
11 | proj_kernel=7,
12 | base_channels=512,
13 | upsample_factors=(2, 8, 2, 2),
14 | res_kernel=3,
15 | num_res_blocks=4,
16 | ):
17 | super().__init__(
18 | in_channels=in_channels,
19 | out_channels=out_channels,
20 | proj_kernel=proj_kernel,
21 | base_channels=base_channels,
22 | upsample_factors=upsample_factors,
23 | res_kernel=res_kernel,
24 | num_res_blocks=num_res_blocks,
25 | )
26 |
27 | @torch.no_grad()
28 | def inference(self, cond_features):
29 | cond_features = cond_features.to(self.layers[1].weight.device)
30 | cond_features = torch.nn.functional.pad(
31 | cond_features, (self.inference_padding, self.inference_padding), "replicate"
32 | )
33 | return self.layers(cond_features)
34 |
--------------------------------------------------------------------------------
/TTS/vocoder/models/melgan_discriminator.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from torch import nn
3 | from torch.nn.utils.parametrizations import weight_norm
4 |
5 |
6 | class MelganDiscriminator(nn.Module):
7 | def __init__(
8 | self,
9 | in_channels=1,
10 | out_channels=1,
11 | kernel_sizes=(5, 3),
12 | base_channels=16,
13 | max_channels=1024,
14 | downsample_factors=(4, 4, 4, 4),
15 | groups_denominator=4,
16 | ):
17 | super().__init__()
18 | self.layers = nn.ModuleList()
19 |
20 | layer_kernel_size = np.prod(kernel_sizes)
21 | layer_padding = (layer_kernel_size - 1) // 2
22 |
23 | # initial layer
24 | self.layers += [
25 | nn.Sequential(
26 | nn.ReflectionPad1d(layer_padding),
27 | weight_norm(nn.Conv1d(in_channels, base_channels, layer_kernel_size, stride=1)),
28 | nn.LeakyReLU(0.2, inplace=True),
29 | )
30 | ]
31 |
32 | # downsampling layers
33 | layer_in_channels = base_channels
34 | for downsample_factor in downsample_factors:
35 | layer_out_channels = min(layer_in_channels * downsample_factor, max_channels)
36 | layer_kernel_size = downsample_factor * 10 + 1
37 | layer_padding = (layer_kernel_size - 1) // 2
38 | layer_groups = layer_in_channels // groups_denominator
39 | self.layers += [
40 | nn.Sequential(
41 | weight_norm(
42 | nn.Conv1d(
43 | layer_in_channels,
44 | layer_out_channels,
45 | kernel_size=layer_kernel_size,
46 | stride=downsample_factor,
47 | padding=layer_padding,
48 | groups=layer_groups,
49 | )
50 | ),
51 | nn.LeakyReLU(0.2, inplace=True),
52 | )
53 | ]
54 | layer_in_channels = layer_out_channels
55 |
56 | # last 2 layers
57 | layer_padding1 = (kernel_sizes[0] - 1) // 2
58 | layer_padding2 = (kernel_sizes[1] - 1) // 2
59 | self.layers += [
60 | nn.Sequential(
61 | weight_norm(
62 | nn.Conv1d(
63 | layer_out_channels,
64 | layer_out_channels,
65 | kernel_size=kernel_sizes[0],
66 | stride=1,
67 | padding=layer_padding1,
68 | )
69 | ),
70 | nn.LeakyReLU(0.2, inplace=True),
71 | ),
72 | weight_norm(
73 | nn.Conv1d(
74 | layer_out_channels, out_channels, kernel_size=kernel_sizes[1], stride=1, padding=layer_padding2
75 | )
76 | ),
77 | ]
78 |
79 | def forward(self, x):
80 | feats = []
81 | for layer in self.layers:
82 | x = layer(x)
83 | feats.append(x)
84 | return x, feats
85 |
--------------------------------------------------------------------------------
/TTS/vocoder/models/melgan_multiscale_discriminator.py:
--------------------------------------------------------------------------------
1 | from torch import nn
2 |
3 | from TTS.vocoder.models.melgan_discriminator import MelganDiscriminator
4 |
5 |
6 | class MelganMultiscaleDiscriminator(nn.Module):
7 | def __init__(
8 | self,
9 | in_channels=1,
10 | out_channels=1,
11 | num_scales=3,
12 | kernel_sizes=(5, 3),
13 | base_channels=16,
14 | max_channels=1024,
15 | downsample_factors=(4, 4, 4),
16 | pooling_kernel_size=4,
17 | pooling_stride=2,
18 | pooling_padding=2,
19 | groups_denominator=4,
20 | ):
21 | super().__init__()
22 |
23 | self.discriminators = nn.ModuleList(
24 | [
25 | MelganDiscriminator(
26 | in_channels=in_channels,
27 | out_channels=out_channels,
28 | kernel_sizes=kernel_sizes,
29 | base_channels=base_channels,
30 | max_channels=max_channels,
31 | downsample_factors=downsample_factors,
32 | groups_denominator=groups_denominator,
33 | )
34 | for _ in range(num_scales)
35 | ]
36 | )
37 |
38 | self.pooling = nn.AvgPool1d(
39 | kernel_size=pooling_kernel_size, stride=pooling_stride, padding=pooling_padding, count_include_pad=False
40 | )
41 |
42 | def forward(self, x):
43 | scores = []
44 | feats = []
45 | for disc in self.discriminators:
46 | score, feat = disc(x)
47 | scores.append(score)
48 | feats.append(feat)
49 | x = self.pooling(x)
50 | return scores, feats
51 |
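A quick forward-pass sketch (made-up input size, `torch` only) showing what the multi-scale discriminator returns: one score map plus a list of intermediate features per scale.

```python
import torch
from TTS.vocoder.models.melgan_multiscale_discriminator import MelganMultiscaleDiscriminator

disc = MelganMultiscaleDiscriminator()
scores, feats = disc(torch.randn(1, 1, 8192))   # B x 1 x T waveform
print(len(scores), len(feats))                  # 3 3 -> one entry per scale
print([tuple(s.shape) for s in scores])
```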
--------------------------------------------------------------------------------
/TTS/vocoder/models/multiband_melgan_generator.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | from TTS.vocoder.layers.pqmf import PQMF
4 | from TTS.vocoder.models.melgan_generator import MelganGenerator
5 |
6 |
7 | class MultibandMelganGenerator(MelganGenerator):
8 | def __init__(
9 | self,
10 | in_channels=80,
11 | out_channels=4,
12 | proj_kernel=7,
13 | base_channels=384,
14 | upsample_factors=(2, 8, 2, 2),
15 | res_kernel=3,
16 | num_res_blocks=3,
17 | ):
18 | super().__init__(
19 | in_channels=in_channels,
20 | out_channels=out_channels,
21 | proj_kernel=proj_kernel,
22 | base_channels=base_channels,
23 | upsample_factors=upsample_factors,
24 | res_kernel=res_kernel,
25 | num_res_blocks=num_res_blocks,
26 | )
27 | self.pqmf_layer = PQMF(N=4, taps=62, cutoff=0.15, beta=9.0)
28 |
29 | def pqmf_analysis(self, x):
30 | return self.pqmf_layer.analysis(x)
31 |
32 | def pqmf_synthesis(self, x):
33 | return self.pqmf_layer.synthesis(x)
34 |
35 | @torch.no_grad()
36 | def inference(self, cond_features):
37 | cond_features = cond_features.to(self.layers[1].weight.device)
38 | cond_features = torch.nn.functional.pad(
39 | cond_features, (self.inference_padding, self.inference_padding), "replicate"
40 | )
41 | return self.pqmf_synthesis(self.layers(cond_features))
42 |
--------------------------------------------------------------------------------
/TTS/vocoder/pqmf_output.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/vocoder/pqmf_output.wav
--------------------------------------------------------------------------------
/TTS/vocoder/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/vocoder/utils/__init__.py
--------------------------------------------------------------------------------
/TTS/vocoder/utils/generic_utils.py:
--------------------------------------------------------------------------------
1 | from typing import Dict
2 |
3 | import numpy as np
4 | import torch
5 | from matplotlib import pyplot as plt
6 |
7 | from TTS.tts.utils.visual import plot_spectrogram
8 | from TTS.utils.audio import AudioProcessor
9 |
10 |
11 | def interpolate_vocoder_input(scale_factor, spec):
12 | """Interpolate spectrogram by the scale factor.
13 | It is mainly used to match the sampling rates of
14 | the tts and vocoder models.
15 |
16 | Args:
17 | scale_factor (float): scale factor to interpolate the spectrogram
18 | spec (np.array): spectrogram to be interpolated
19 |
20 | Returns:
21 | torch.tensor: interpolated spectrogram.
22 | """
23 | print(" > before interpolation :", spec.shape)
24 | spec = torch.tensor(spec).unsqueeze(0).unsqueeze(0) # pylint: disable=not-callable
25 | spec = torch.nn.functional.interpolate(
26 | spec, scale_factor=scale_factor, recompute_scale_factor=True, mode="bilinear", align_corners=False
27 | ).squeeze(0)
28 | print(" > after interpolation :", spec.shape)
29 | return spec
30 |
31 |
32 | def plot_results(y_hat: torch.tensor, y: torch.tensor, ap: AudioProcessor, name_prefix: str = None) -> Dict:
33 | """Plot the predicted and the real waveform and their spectrograms.
34 |
35 | Args:
36 | y_hat (torch.tensor): Predicted waveform.
37 | y (torch.tensor): Real waveform.
38 | ap (AudioProcessor): Audio processor used to process the waveform.
39 | name_prefix (str, optional): Name prefix used to name the figures. Defaults to None.
40 |
41 | Returns:
42 | Dict: output figures keyed by the name of the figures.
43 | """ """Plot vocoder model results"""
44 | if name_prefix is None:
45 | name_prefix = ""
46 |
47 | # select an instance from batch
48 | y_hat = y_hat[0].squeeze().detach().cpu().numpy()
49 | y = y[0].squeeze().detach().cpu().numpy()
50 |
51 | spec_fake = ap.melspectrogram(y_hat).T
52 | spec_real = ap.melspectrogram(y).T
53 | spec_diff = np.abs(spec_fake - spec_real)
54 |
55 | # plot figure and save it
56 | fig_wave = plt.figure()
57 | plt.subplot(2, 1, 1)
58 | plt.plot(y)
59 | plt.title("groundtruth speech")
60 | plt.subplot(2, 1, 2)
61 | plt.plot(y_hat)
62 | plt.title("generated speech")
63 | plt.tight_layout()
64 | plt.close()
65 |
66 | figures = {
67 | name_prefix + "spectrogram/fake": plot_spectrogram(spec_fake),
68 | name_prefix + "spectrogram/real": plot_spectrogram(spec_real),
69 | name_prefix + "spectrogram/diff": plot_spectrogram(spec_diff),
70 | name_prefix + "speech_comparison": fig_wave,
71 | }
72 | return figures
73 |
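Note: a minimal usage sketch for `interpolate_vocoder_input`, assuming an illustrative 16 kHz TTS model feeding a 22.05 kHz vocoder (the sample rates, mel size, and frame count below are made-up values, not taken from this repository). The mel axis is kept at scale 1.0 and only the time axis is stretched.

```python
import numpy as np

from TTS.vocoder.utils.generic_utils import interpolate_vocoder_input

# Illustrative, assumed sample rates: a 16 kHz TTS model feeding a 22.05 kHz vocoder.
tts_sample_rate = 16000
vocoder_sample_rate = 22050

mel = np.random.rand(80, 200).astype(np.float32)             # dummy [n_mels, n_frames] spectrogram
scale_factor = [1.0, vocoder_sample_rate / tts_sample_rate]   # keep mel bins, stretch the time axis
vocoder_input = interpolate_vocoder_input(scale_factor, mel)  # torch tensor of shape [1, n_mels, n_frames * ratio]
```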
--------------------------------------------------------------------------------
/download_checkpoint.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass, field
2 | from transformers import HfArgumentParser
3 | from typing import Optional
4 | from TTS.utils.manage import ModelManager
5 | import os
6 |
7 | @dataclass
8 | class DownloadArgs:
9 | output_path: str = field(
10 | default="checkpoints",
11 | metadata={"help": "Path to pretrained + checkpoint model"}
12 | )
13 |
14 | def download(output_path: str = "checkpoints"):
15 | CHECKPOINTS_OUT_PATH = os.path.join(output_path, "XTTS_v2.0_original_model_files/")
16 | os.makedirs(CHECKPOINTS_OUT_PATH, exist_ok=True)
17 |
18 | # DVAE files
19 | DVAE_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/dvae.pth"
20 | MEL_NORM_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/mel_stats.pth"
21 |
22 | # Set the path to the downloaded files
23 | DVAE_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(DVAE_CHECKPOINT_LINK))
24 | MEL_NORM_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(MEL_NORM_LINK))
25 |
26 | # download DVAE files if needed
27 | if not os.path.isfile(DVAE_CHECKPOINT) or not os.path.isfile(MEL_NORM_FILE):
28 | print(" > Downloading DVAE files!")
29 | ModelManager._download_model_files([MEL_NORM_LINK, DVAE_CHECKPOINT_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True)
30 |
31 | # Download XTTS v2.0 checkpoint if needed
32 | TOKENIZER_FILE_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json"
33 | XTTS_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/model.pth"
34 | XTTS_CONFIG_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/config.json"
35 |
36 | # XTTS transfer learning parameters
37 | TOKENIZER_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(TOKENIZER_FILE_LINK))
38 | XTTS_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(XTTS_CHECKPOINT_LINK))
39 | XTTS_CONFIG_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(XTTS_CONFIG_LINK))
40 |
41 | # download XTTS v2.0 files if needed
42 | if not os.path.isfile(TOKENIZER_FILE) or not os.path.isfile(XTTS_CHECKPOINT):
43 | print(" > Downloading XTTS v2.0 files!")
44 | ModelManager._download_model_files(
45 | [TOKENIZER_FILE_LINK, XTTS_CONFIG_LINK, XTTS_CHECKPOINT_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True
46 | # [TOKENIZER_FILE_LINK, XTTS_CONFIG_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True
47 | )
48 |
49 | if __name__ == "__main__":
50 | parser = HfArgumentParser(DownloadArgs)
51 | args = parser.parse_args()
52 | download(output_path=args.output_path)
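For reference, a minimal sketch of calling the helper above directly from Python (run from the repository root so `download_checkpoint` is importable); it is equivalent to the CLI call `python download_checkpoint.py --output_path checkpoints`.

```python
from download_checkpoint import download

# Downloads dvae.pth, mel_stats.pth, vocab.json, config.json and model.pth
# into checkpoints/XTTS_v2.0_original_model_files/ unless they are already present.
download(output_path="checkpoints")
```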
--------------------------------------------------------------------------------
/recipes/README.md:
--------------------------------------------------------------------------------
1 | # 🐸💬 TTS Training Recipes
2 |
3 | TTS recipes are intended to host scripts that run all the necessary steps to train a TTS model on a particular dataset.
4 |
5 | For each dataset, you need to download the dataset once. Then you run the training for the model you want.
6 |
7 | Run each script from the root TTS folder as follows.
8 |
9 | ```console
10 | $ sh ./recipes/<dataset>/download_<dataset>.sh
11 | $ python recipes/<dataset>/<model_name>/train.py
12 | ```
13 |
14 | For some datasets you might need to resample the audio files. For example, the VCTK dataset can be resampled to 22050 Hz as follows.
15 |
16 | ```console
17 | python TTS/bin/resample.py --input_dir recipes/vctk/VCTK/wav48_silence_trimmed --output_sr 22050 --output_dir recipes/vctk/VCTK/wav48_silence_trimmed --n_jobs 8 --file_ext flac
18 | ```
19 |
20 | If you train a new model using TTS, feel free to share your training to expand the list of recipes.
21 |
22 | You can also open a new discussion and share your progress with the 🐸 community.
--------------------------------------------------------------------------------
/recipes/bel-alex73/.gitignore:
--------------------------------------------------------------------------------
1 | /docker-prepare/*.txt
2 |
--------------------------------------------------------------------------------
/recipes/bel-alex73/docker-prepare-start.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -x
3 |
4 | cd $( dirname -- "$0"; )
5 |
6 | cp ../../requirements*.txt docker-prepare/
7 |
8 | docker build -t tts-learn -f docker-prepare/Dockerfile docker-prepare/
9 |
10 | mkdir -p ../../../storage
11 | docker run --rm -it \
12 | -p 2525:2525 \
13 | --shm-size=256M \
14 | --name tts-learn-run \
15 | -v $(pwd)/../../:/a/TTS \
16 | -v $(pwd)/../../../cv-corpus:/a/cv-corpus \
17 | -v $(pwd)/../../../fanetyka/:/a/fanetyka/ \
18 | -v $(pwd)/../../../storage:/storage \
19 | tts-learn
20 |
--------------------------------------------------------------------------------
/recipes/bel-alex73/docker-prepare/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM ubuntu:22.04
2 |
3 | RUN apt -y update
4 | RUN apt -y upgrade
5 | RUN apt -y install --no-install-recommends pip ffmpeg openjdk-19-jre-headless
6 |
7 | RUN mkdir /a/
8 | ADD requirements*.txt /a/
9 | WORKDIR /a/
10 | RUN pip install -r requirements.txt -r requirements.dev.txt -r requirements.notebooks.txt
11 | RUN pip install seaborn pydub notebook
12 |
13 | RUN apt -y install --no-install-recommends gcc libpython3.10-dev
14 |
15 | ADD runtime.sh /a/
16 |
17 | WORKDIR /a/TTS/
18 | CMD /a/runtime.sh
19 |
--------------------------------------------------------------------------------
/recipes/bel-alex73/docker-prepare/runtime.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | cd /a/TTS
4 | pip install -e .[all,dev,notebooks]
5 |
6 | LANG=C.utf8 bash
7 |
--------------------------------------------------------------------------------
/recipes/bel-alex73/dump_config.py:
--------------------------------------------------------------------------------
1 | import json
2 | import re
3 |
4 | from train_glowtts import config
5 |
6 | s = json.dumps(config, default=vars, indent=2)
7 | s = re.sub(r'"test_sentences":\s*\[\],', "", s)
8 | print(s)
9 |
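Note: `dump_config.py` imports the `config` object from `train_glowtts.py`, so it is meant to be run from the same directory; redirecting its stdout to a file (e.g. `python dump_config.py > config.json`, a hypothetical file name) yields a JSON dump of the training configuration with the empty `test_sentences` entry stripped.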
--------------------------------------------------------------------------------
/recipes/bel-alex73/train_hifigan.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from coqpit import Coqpit
4 | from trainer import Trainer, TrainerArgs
5 |
6 | from TTS.tts.configs.shared_configs import BaseAudioConfig
7 | from TTS.utils.audio import AudioProcessor
8 | from TTS.vocoder.configs.hifigan_config import *
9 | from TTS.vocoder.datasets.preprocess import load_wav_data
10 | from TTS.vocoder.models.gan import GAN
11 |
12 | output_path = "/storage/output-hifigan/"
13 |
14 | audio_config = BaseAudioConfig(
15 | mel_fmin=50,
16 | mel_fmax=8000,
17 | hop_length=256,
18 | stats_path="/storage/TTS/scale_stats.npy",
19 | )
20 |
21 | config = HifiganConfig(
22 | batch_size=74,
23 | eval_batch_size=16,
24 | num_loader_workers=8,
25 | num_eval_loader_workers=8,
26 | lr_disc=0.0002,
27 | lr_gen=0.0002,
28 | run_eval=True,
29 | test_delay_epochs=5,
30 | epochs=1000,
31 | use_noise_augment=True,
32 | seq_len=8192,
33 | pad_short=2000,
34 | save_step=5000,
35 | print_step=50,
36 | print_eval=True,
37 | mixed_precision=False,
38 | eval_split_size=30,
39 | save_n_checkpoints=2,
40 | save_best_after=5000,
41 | data_path="/storage/filtered_dataset",
42 | output_path=output_path,
43 | audio=audio_config,
44 | )
45 |
46 | # init audio processor
47 | ap = AudioProcessor.init_from_config(config)
48 |
49 | # load training samples
50 | print("config.eval_split_size = ", config.eval_split_size)
51 | eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)
52 |
53 | # init model
54 | model = GAN(config, ap)
55 |
56 | # init the trainer and 🚀
57 | trainer = Trainer(
58 | TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
59 | )
60 | trainer.fit()
61 |
--------------------------------------------------------------------------------
/recipes/blizzard2013/README.md:
--------------------------------------------------------------------------------
1 | # How to get the Blizzard 2013 Dataset
2 |
3 | The Capacitron model is a variational encoder extension of standard Tacotron-based models to model prosody.
4 |
5 | To take full advantage of the model, it is advised to train the model with a dataset that contains a significant amount of prosodic information in the utterances. A tested candidate for such applications is the blizzard2013 dataset from the Blizzard Challenge, containing many hours of high quality audio book recordings.
6 |
7 | To get a license and download link for this dataset, you need to visit the [website](https://www.cstr.ed.ac.uk/projects/blizzard/2013/lessac_blizzard2013/license.html) of the Centre for Speech Technology Research of the University of Edinburgh.
8 |
9 | You should get access to the raw dataset within a couple of days. There are a few preprocessing steps you need to complete before you can use the high fidelity dataset.
10 |
11 | 1. Get the forced time alignments for the blizzard dataset from [here](https://github.com/mueller91/tts_alignments).
12 | 2. Segment the high fidelity audio-book files based on the instructions [here](https://github.com/Tomiinek/Blizzard2013_Segmentation).
--------------------------------------------------------------------------------
/recipes/kokoro/tacotron2-DDC/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # take the scripts's parent's directory to prefix all the output paths.
3 | RUN_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
4 | CORPUS=kokoro-speech-v1_1-small
5 | echo $RUN_DIR
6 | if [ \! -d $RUN_DIR/$CORPUS ] ; then
7 | echo "$RUN_DIR/$CORPUS doesn't exist."
8 | echo "Follow the instruction of https://github.com/kaiidams/Kokoro-Speech-Dataset to make the corpus."
9 | exit 1
10 | fi
11 | # create train-val splits
12 | shuf $RUN_DIR/$CORPUS/metadata.csv > $RUN_DIR/$CORPUS/metadata_shuf.csv
13 | head -n 8000 $RUN_DIR/$CORPUS/metadata_shuf.csv > $RUN_DIR/$CORPUS/metadata_train.csv
14 | tail -n 812 $RUN_DIR/$CORPUS/metadata_shuf.csv > $RUN_DIR/$CORPUS/metadata_val.csv
15 | # compute dataset mean and variance for normalization
16 | python TTS/bin/compute_statistics.py $RUN_DIR/tacotron2-DDC.json $RUN_DIR/scale_stats.npy --data_path $RUN_DIR/$CORPUS/wavs/
17 | # training ....
18 | # change the GPU id if needed
19 | CUDA_VISIBLE_DEVICES="0" python TTS/bin/train_tts.py --config_path $RUN_DIR/tacotron2-DDC.json \
20 | --coqpit.output_path $RUN_DIR \
21 | --coqpit.datasets.0.path $RUN_DIR/$CORPUS \
22 | --coqpit.audio.stats_path $RUN_DIR/scale_stats.npy \
23 | --coqpit.phoneme_cache_path $RUN_DIR/phoneme_cache \
--------------------------------------------------------------------------------
/recipes/ljspeech/README.md:
--------------------------------------------------------------------------------
1 | # 🐸💬 TTS LJspeech Recipes
2 |
3 | For running the recipes:
4 |
5 | 1. Download the LJSpeech dataset either manually from [its official website](https://keithito.com/LJ-Speech-Dataset/) or using ```download_ljspeech.sh```.
6 | 2. Go to your desired model folder and run the training.
7 |
8 | Running Python files. (Choose the desired GPU ID for your run and set ```CUDA_VISIBLE_DEVICES```)
9 | ```terminal
10 | CUDA_VISIBLE_DEVICES="0" python train_modelX.py
11 | ```
12 |
13 | Running bash scripts.
14 | ```terminal
15 | bash run.sh
16 | ```
17 |
18 | 💡 Note that these runs are just templates to help you start training your first model. They are not optimized for the best
19 | result. Double-check the configurations and feel free to share your experiments to find better parameters together 💪.
20 |
--------------------------------------------------------------------------------
/recipes/ljspeech/align_tts/train_aligntts.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from trainer import Trainer, TrainerArgs
4 |
5 | from TTS.tts.configs.align_tts_config import AlignTTSConfig
6 | from TTS.tts.configs.shared_configs import BaseDatasetConfig
7 | from TTS.tts.datasets import load_tts_samples
8 | from TTS.tts.models.align_tts import AlignTTS
9 | from TTS.tts.utils.text.tokenizer import TTSTokenizer
10 | from TTS.utils.audio import AudioProcessor
11 |
12 | output_path = os.path.dirname(os.path.abspath(__file__))
13 |
14 | # init configs
15 | dataset_config = BaseDatasetConfig(
16 | formatter="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
17 | )
18 | config = AlignTTSConfig(
19 | batch_size=32,
20 | eval_batch_size=16,
21 | num_loader_workers=4,
22 | num_eval_loader_workers=4,
23 | run_eval=True,
24 | test_delay_epochs=-1,
25 | epochs=1000,
26 | text_cleaner="english_cleaners",
27 | use_phonemes=False,
28 | phoneme_language="en-us",
29 | phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
30 | print_step=25,
31 | print_eval=True,
32 | mixed_precision=False,
33 | output_path=output_path,
34 | datasets=[dataset_config],
35 | )
36 |
37 | # INITIALIZE THE AUDIO PROCESSOR
38 | # Audio processor is used for feature extraction and audio I/O.
39 | # It mainly serves to the dataloader and the training loggers.
40 | ap = AudioProcessor.init_from_config(config)
41 |
42 | # INITIALIZE THE TOKENIZER
43 | # Tokenizer is used to convert text to sequences of token IDs.
44 | # If characters are not defined in the config, default characters are passed to the config
45 | tokenizer, config = TTSTokenizer.init_from_config(config)
46 |
47 | # LOAD DATA SAMPLES
48 | # Each sample is a list of ```[text, audio_file_path, speaker_name]```
49 | # You can define your custom sample loader returning the list of samples.
50 | # Or define your custom formatter and pass it to the `load_tts_samples`.
51 | # Check `TTS.tts.datasets.load_tts_samples` for more details.
52 | train_samples, eval_samples = load_tts_samples(
53 | dataset_config,
54 | eval_split=True,
55 | eval_split_max_size=config.eval_split_max_size,
56 | eval_split_size=config.eval_split_size,
57 | )
58 |
59 | # init model
60 | model = AlignTTS(config, ap, tokenizer)
61 |
62 | # INITIALIZE THE TRAINER
63 | # Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training,
64 | # distributed training, etc.
65 | trainer = Trainer(
66 | TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
67 | )
68 |
69 | # AND... 3,2,1... 🚀
70 | trainer.fit()
71 |
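The comment block above mentions passing a custom formatter to `load_tts_samples`. Below is a minimal sketch of such a formatter, assuming a hypothetical metadata file with `wav_name|transcription` rows and the same return fields used by the bundled formatters in `TTS.tts.datasets.formatters`; the speaker name and `wavs/` layout are assumptions for illustration.

```python
import os

def my_formatter(root_path, meta_file, **kwargs):  # pylint: disable=unused-argument
    """Hypothetical formatter for `wav_name|transcription` rows."""
    items = []
    with open(os.path.join(root_path, meta_file), "r", encoding="utf-8") as f:
        for line in f:
            wav_name, text = line.strip().split("|", maxsplit=1)
            items.append(
                {
                    "text": text,
                    "audio_file": os.path.join(root_path, "wavs", f"{wav_name}.wav"),
                    "speaker_name": "ljspeech",
                    "root_path": root_path,
                }
            )
    return items

# It would then replace the name-based formatter lookup:
# train_samples, eval_samples = load_tts_samples(
#     dataset_config, eval_split=True, formatter=my_formatter
# )
```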
--------------------------------------------------------------------------------
/recipes/ljspeech/delightful_tts/train_delightful_tts.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from trainer import Trainer, TrainerArgs
4 |
5 | from TTS.config.shared_configs import BaseDatasetConfig
6 | from TTS.tts.configs.delightful_tts_config import DelightfulTtsAudioConfig, DelightfulTTSConfig
7 | from TTS.tts.datasets import load_tts_samples
8 | from TTS.tts.models.delightful_tts import DelightfulTTS, DelightfulTtsArgs, VocoderConfig
9 | from TTS.tts.utils.text.tokenizer import TTSTokenizer
10 | from TTS.utils.audio.processor import AudioProcessor
11 |
12 | data_path = ""
13 | output_path = os.path.dirname(os.path.abspath(__file__))
14 |
15 | dataset_config = BaseDatasetConfig(
16 | dataset_name="ljspeech", formatter="ljspeech", meta_file_train="metadata.csv", path=data_path
17 | )
18 |
19 | audio_config = DelightfulTtsAudioConfig()
20 | model_args = DelightfulTtsArgs()
21 |
22 | vocoder_config = VocoderConfig()
23 |
24 | delightful_tts_config = DelightfulTTSConfig(
25 | run_name="delightful_tts_ljspeech",
26 | run_description="Train like in delightful tts paper.",
27 | model_args=model_args,
28 | audio=audio_config,
29 | vocoder=vocoder_config,
30 | batch_size=32,
31 | eval_batch_size=16,
32 | num_loader_workers=10,
33 | num_eval_loader_workers=10,
34 | precompute_num_workers=10,
35 | batch_group_size=2,
36 | compute_input_seq_cache=True,
37 | compute_f0=True,
38 | f0_cache_path=os.path.join(output_path, "f0_cache"),
39 | run_eval=True,
40 | test_delay_epochs=-1,
41 | epochs=1000,
42 | text_cleaner="english_cleaners",
43 | use_phonemes=True,
44 | phoneme_language="en-us",
45 | phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
46 | print_step=50,
47 | print_eval=False,
48 | mixed_precision=True,
49 | output_path=output_path,
50 | datasets=[dataset_config],
51 | start_by_longest=False,
52 | eval_split_size=0.1,
53 | binary_align_loss_alpha=0.0,
54 | use_attn_priors=False,
55 | lr_gen=4e-1,
56 | lr=4e-1,
57 | lr_disc=4e-1,
58 | max_text_len=130,
59 | )
60 |
61 | tokenizer, config = TTSTokenizer.init_from_config(delightful_tts_config)
62 |
63 | ap = AudioProcessor.init_from_config(config)
64 |
65 |
66 | train_samples, eval_samples = load_tts_samples(
67 | dataset_config,
68 | eval_split=True,
69 | eval_split_max_size=config.eval_split_max_size,
70 | eval_split_size=config.eval_split_size,
71 | )
72 |
73 | model = DelightfulTTS(ap=ap, config=config, tokenizer=tokenizer, speaker_manager=None)
74 |
75 | trainer = Trainer(
76 | TrainerArgs(),
77 | config,
78 | output_path,
79 | model=model,
80 | train_samples=train_samples,
81 | eval_samples=eval_samples,
82 | )
83 |
84 | trainer.fit()
85 |
--------------------------------------------------------------------------------
/recipes/ljspeech/download_ljspeech.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # take the scripts's parent's directory to prefix all the output paths.
3 | RUN_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
4 | echo $RUN_DIR
5 | # download LJSpeech dataset
6 | wget http://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
7 | # extract
8 | tar -xjf LJSpeech-1.1.tar.bz2
9 | # create train-val splits
10 | shuf LJSpeech-1.1/metadata.csv > LJSpeech-1.1/metadata_shuf.csv
11 | head -n 12000 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_train.csv
12 | tail -n 1100 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_val.csv
13 | mv LJSpeech-1.1 $RUN_DIR/recipes/ljspeech/
14 | rm LJSpeech-1.1.tar.bz2
--------------------------------------------------------------------------------
/recipes/ljspeech/hifigan/train_hifigan.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from trainer import Trainer, TrainerArgs
4 |
5 | from TTS.utils.audio import AudioProcessor
6 | from TTS.vocoder.configs import HifiganConfig
7 | from TTS.vocoder.datasets.preprocess import load_wav_data
8 | from TTS.vocoder.models.gan import GAN
9 |
10 | output_path = os.path.dirname(os.path.abspath(__file__))
11 |
12 | config = HifiganConfig(
13 | batch_size=32,
14 | eval_batch_size=16,
15 | num_loader_workers=4,
16 | num_eval_loader_workers=4,
17 | run_eval=True,
18 | test_delay_epochs=5,
19 | epochs=1000,
20 | seq_len=8192,
21 | pad_short=2000,
22 | use_noise_augment=True,
23 | eval_split_size=10,
24 | print_step=25,
25 | print_eval=False,
26 | mixed_precision=False,
27 | lr_gen=1e-4,
28 | lr_disc=1e-4,
29 | data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"),
30 | output_path=output_path,
31 | )
32 |
33 | # init audio processor
34 | ap = AudioProcessor(**config.audio.to_dict())
35 |
36 | # load training samples
37 | eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)
38 |
39 | # init model
40 | model = GAN(config, ap)
41 |
42 | # init the trainer and 🚀
43 | trainer = Trainer(
44 | TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
45 | )
46 | trainer.fit()
47 |
--------------------------------------------------------------------------------
/recipes/ljspeech/multiband_melgan/train_multiband_melgan.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from trainer import Trainer, TrainerArgs
4 |
5 | from TTS.utils.audio import AudioProcessor
6 | from TTS.vocoder.configs import MultibandMelganConfig
7 | from TTS.vocoder.datasets.preprocess import load_wav_data
8 | from TTS.vocoder.models.gan import GAN
9 |
10 | output_path = os.path.dirname(os.path.abspath(__file__))
11 |
12 | config = MultibandMelganConfig(
13 | batch_size=32,
14 | eval_batch_size=16,
15 | num_loader_workers=4,
16 | num_eval_loader_workers=4,
17 | run_eval=True,
18 | test_delay_epochs=5,
19 | epochs=1000,
20 | seq_len=8192,
21 | pad_short=2000,
22 | use_noise_augment=True,
23 | eval_split_size=10,
24 | print_step=25,
25 | print_eval=False,
26 | mixed_precision=False,
27 | lr_gen=1e-4,
28 | lr_disc=1e-4,
29 | data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"),
30 | output_path=output_path,
31 | )
32 |
33 | # init audio processor
34 | ap = AudioProcessor(**config.audio.to_dict())
35 |
36 | # load training samples
37 | eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)
38 |
39 | # init model
40 | model = GAN(config, ap)
41 |
42 | # init the trainer and 🚀
43 | trainer = Trainer(
44 | TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
45 | )
46 | trainer.fit()
47 |
--------------------------------------------------------------------------------
/recipes/ljspeech/overflow/lj_parameters.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/recipes/ljspeech/overflow/lj_parameters.pt
--------------------------------------------------------------------------------
/recipes/ljspeech/overflow/train_overflow.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from trainer import Trainer, TrainerArgs
4 |
5 | from TTS.config.shared_configs import BaseAudioConfig
6 | from TTS.tts.configs.overflow_config import OverflowConfig
7 | from TTS.tts.configs.shared_configs import BaseDatasetConfig
8 | from TTS.tts.datasets import load_tts_samples
9 | from TTS.tts.models.overflow import Overflow
10 | from TTS.tts.utils.text.tokenizer import TTSTokenizer
11 | from TTS.utils.audio import AudioProcessor
12 |
13 | output_path = os.path.dirname(os.path.abspath(__file__))
14 |
15 | # init configs
16 | dataset_config = BaseDatasetConfig(
17 | formatter="ljspeech", meta_file_train="metadata.csv", path=os.path.join("data", "LJSpeech-1.1/")
18 | )
19 |
20 | audio_config = BaseAudioConfig(
21 | sample_rate=22050,
22 | do_trim_silence=True,
23 | trim_db=60.0,
24 | signal_norm=False,
25 | mel_fmin=0.0,
26 | mel_fmax=8000,
27 | spec_gain=1.0,
28 | log_func="np.log",
29 | ref_level_db=20,
30 | preemphasis=0.0,
31 | )
32 |
33 | config = OverflowConfig( # This is the config that is saved for the future use
34 | run_name="overflow_ljspeech",
35 | audio=audio_config,
36 | batch_size=30,
37 | eval_batch_size=16,
38 | num_loader_workers=4,
39 | num_eval_loader_workers=4,
40 | run_eval=True,
41 | test_delay_epochs=-1,
42 | epochs=1000,
43 | text_cleaner="phoneme_cleaners",
44 | use_phonemes=True,
45 | phoneme_language="en-us",
46 | phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
47 | precompute_num_workers=8,
48 | mel_statistics_parameter_path=os.path.join(output_path, "lj_parameters.pt"),
49 | force_generate_statistics=False,
50 | print_step=1,
51 | print_eval=True,
52 | mixed_precision=True,
53 | output_path=output_path,
54 | datasets=[dataset_config],
55 | )
56 |
57 | # INITIALIZE THE AUDIO PROCESSOR
58 | # Audio processor is used for feature extraction and audio I/O.
59 | # It mainly serves to the dataloader and the training loggers.
60 | ap = AudioProcessor.init_from_config(config)
61 |
62 | # INITIALIZE THE TOKENIZER
63 | # Tokenizer is used to convert text to sequences of token IDs.
64 | # If characters are not defined in the config, default characters are passed to the config
65 | tokenizer, config = TTSTokenizer.init_from_config(config)
66 |
67 | # LOAD DATA SAMPLES
68 | # Each sample is a list of ```[text, audio_file_path, speaker_name]```
69 | # You can define your custom sample loader returning the list of samples.
70 | # Or define your custom formatter and pass it to the `load_tts_samples`.
71 | # Check `TTS.tts.datasets.load_tts_samples` for more details.
72 | train_samples, eval_samples = load_tts_samples(
73 | dataset_config,
74 | eval_split=True,
75 | eval_split_max_size=config.eval_split_max_size,
76 | eval_split_size=config.eval_split_size,
77 | )
78 |
79 | # INITIALIZE THE MODEL
80 | # Models take a config object and a speaker manager as input
81 | # Config defines the details of the model like the number of layers, the size of the embedding, etc.
82 | # Speaker manager is used by multi-speaker models.
83 | model = Overflow(config, ap, tokenizer)
84 |
85 |
86 | # init the trainer and 🚀
87 | trainer = Trainer(
88 | TrainerArgs(),
89 | config,
90 | output_path,
91 | model=model,
92 | train_samples=train_samples,
93 | eval_samples=eval_samples,
94 | gpu=1,
95 | )
96 | trainer.fit()
97 |
--------------------------------------------------------------------------------
/recipes/ljspeech/speedy_speech/train_speedy_speech.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from trainer import Trainer, TrainerArgs
4 |
5 | from TTS.config import BaseAudioConfig, BaseDatasetConfig
6 | from TTS.tts.configs.speedy_speech_config import SpeedySpeechConfig
7 | from TTS.tts.datasets import load_tts_samples
8 | from TTS.tts.models.forward_tts import ForwardTTS
9 | from TTS.tts.utils.text.tokenizer import TTSTokenizer
10 | from TTS.utils.audio import AudioProcessor
11 |
12 | output_path = os.path.dirname(os.path.abspath(__file__))
13 | dataset_config = BaseDatasetConfig(
14 | formatter="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
15 | )
16 |
17 | audio_config = BaseAudioConfig(
18 | sample_rate=22050,
19 | do_trim_silence=True,
20 | trim_db=60.0,
21 | signal_norm=False,
22 | mel_fmin=0.0,
23 | mel_fmax=8000,
24 | spec_gain=1.0,
25 | log_func="np.log",
26 | ref_level_db=20,
27 | preemphasis=0.0,
28 | )
29 |
30 | config = SpeedySpeechConfig(
31 | run_name="speedy_speech_ljspeech",
32 | audio=audio_config,
33 | batch_size=32,
34 | eval_batch_size=16,
35 | num_loader_workers=4,
36 | num_eval_loader_workers=4,
37 | compute_input_seq_cache=True,
38 | run_eval=True,
39 | test_delay_epochs=-1,
40 | epochs=1000,
41 | text_cleaner="english_cleaners",
42 | use_phonemes=True,
43 | phoneme_language="en-us",
44 | phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
45 | precompute_num_workers=4,
46 | print_step=50,
47 | print_eval=False,
48 | mixed_precision=False,
49 | max_seq_len=500000,
50 | output_path=output_path,
51 | datasets=[dataset_config],
52 | )
53 |
54 | # INITIALIZE THE AUDIO PROCESSOR
55 | # Audio processor is used for feature extraction and audio I/O.
56 | # It mainly serves to the dataloader and the training loggers.
57 | ap = AudioProcessor.init_from_config(config)
58 |
59 | # INITIALIZE THE TOKENIZER
60 | # Tokenizer is used to convert text to sequences of token IDs.
61 | # If characters are not defined in the config, default characters are passed to the config
62 | tokenizer, config = TTSTokenizer.init_from_config(config)
63 |
64 | # LOAD DATA SAMPLES
65 | # Each sample is a list of ```[text, audio_file_path, speaker_name]```
66 | # You can define your custom sample loader returning the list of samples.
67 | # Or define your custom formatter and pass it to the `load_tts_samples`.
68 | # Check `TTS.tts.datasets.load_tts_samples` for more details.
69 | train_samples, eval_samples = load_tts_samples(
70 | dataset_config,
71 | eval_split=True,
72 | eval_split_max_size=config.eval_split_max_size,
73 | eval_split_size=config.eval_split_size,
74 | )
75 |
76 | # init model
77 | model = ForwardTTS(config, ap, tokenizer)
78 |
79 | # INITIALIZE THE TRAINER
80 | # Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training,
81 | # distributed training, etc.
82 | trainer = Trainer(
83 | TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
84 | )
85 |
86 | # AND... 3,2,1... 🚀
87 | trainer.fit()
88 |
--------------------------------------------------------------------------------
/recipes/ljspeech/univnet/train.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from trainer import Trainer, TrainerArgs
4 |
5 | from TTS.utils.audio import AudioProcessor
6 | from TTS.vocoder.configs import UnivnetConfig
7 | from TTS.vocoder.datasets.preprocess import load_wav_data
8 | from TTS.vocoder.models.gan import GAN
9 |
10 | output_path = os.path.dirname(os.path.abspath(__file__))
11 | config = UnivnetConfig(
12 | batch_size=64,
13 | eval_batch_size=16,
14 | num_loader_workers=4,
15 | num_eval_loader_workers=4,
16 | run_eval=True,
17 | test_delay_epochs=-1,
18 | epochs=1000,
19 | seq_len=8192,
20 | pad_short=2000,
21 | use_noise_augment=True,
22 | eval_split_size=10,
23 | print_step=25,
24 | print_eval=False,
25 | mixed_precision=False,
26 | lr_gen=1e-4,
27 | lr_disc=1e-4,
28 | data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"),
29 | output_path=output_path,
30 | )
31 |
32 | # init audio processor
33 | ap = AudioProcessor(**config.audio.to_dict())
34 |
35 | # load training samples
36 | eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)
37 |
38 | # init model
39 | model = GAN(config, ap)
40 |
41 | # init the trainer and 🚀
42 | trainer = Trainer(
43 | TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
44 | )
45 | trainer.fit()
46 |
--------------------------------------------------------------------------------
/recipes/ljspeech/vits_tts/train_vits.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from trainer import Trainer, TrainerArgs
4 |
5 | from TTS.tts.configs.shared_configs import BaseDatasetConfig
6 | from TTS.tts.configs.vits_config import VitsConfig
7 | from TTS.tts.datasets import load_tts_samples
8 | from TTS.tts.models.vits import Vits, VitsAudioConfig
9 | from TTS.tts.utils.text.tokenizer import TTSTokenizer
10 | from TTS.utils.audio import AudioProcessor
11 |
12 | output_path = os.path.dirname(os.path.abspath(__file__))
13 | dataset_config = BaseDatasetConfig(
14 | formatter="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
15 | )
16 | audio_config = VitsAudioConfig(
17 | sample_rate=22050, win_length=1024, hop_length=256, num_mels=80, mel_fmin=0, mel_fmax=None
18 | )
19 |
20 | config = VitsConfig(
21 | audio=audio_config,
22 | run_name="vits_ljspeech",
23 | batch_size=32,
24 | eval_batch_size=16,
25 | batch_group_size=5,
26 | num_loader_workers=8,
27 | num_eval_loader_workers=4,
28 | run_eval=True,
29 | test_delay_epochs=-1,
30 | epochs=1000,
31 | text_cleaner="english_cleaners",
32 | use_phonemes=True,
33 | phoneme_language="en-us",
34 | phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
35 | compute_input_seq_cache=True,
36 | print_step=25,
37 | print_eval=True,
38 | mixed_precision=True,
39 | output_path=output_path,
40 | datasets=[dataset_config],
41 | cudnn_benchmark=False,
42 | )
43 |
44 | # INITIALIZE THE AUDIO PROCESSOR
45 | # Audio processor is used for feature extraction and audio I/O.
46 | # It mainly serves to the dataloader and the training loggers.
47 | ap = AudioProcessor.init_from_config(config)
48 |
49 | # INITIALIZE THE TOKENIZER
50 | # Tokenizer is used to convert text to sequences of token IDs.
51 | # config is updated with the default characters if not defined in the config.
52 | tokenizer, config = TTSTokenizer.init_from_config(config)
53 |
54 | # LOAD DATA SAMPLES
55 | # Each sample is a list of ```[text, audio_file_path, speaker_name]```
56 | # You can define your custom sample loader returning the list of samples.
57 | # Or define your custom formatter and pass it to the `load_tts_samples`.
58 | # Check `TTS.tts.datasets.load_tts_samples` for more details.
59 | train_samples, eval_samples = load_tts_samples(
60 | dataset_config,
61 | eval_split=True,
62 | eval_split_max_size=config.eval_split_max_size,
63 | eval_split_size=config.eval_split_size,
64 | )
65 |
66 | # init model
67 | model = Vits(config, ap, tokenizer, speaker_manager=None)
68 |
69 | # init the trainer and 🚀
70 | trainer = Trainer(
71 | TrainerArgs(),
72 | config,
73 | output_path,
74 | model=model,
75 | train_samples=train_samples,
76 | eval_samples=eval_samples,
77 | )
78 | trainer.fit()
79 |
--------------------------------------------------------------------------------
/recipes/ljspeech/wavegrad/train_wavegrad.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from trainer import Trainer, TrainerArgs
4 |
5 | from TTS.utils.audio import AudioProcessor
6 | from TTS.vocoder.configs import WavegradConfig
7 | from TTS.vocoder.datasets.preprocess import load_wav_data
8 | from TTS.vocoder.models.wavegrad import Wavegrad
9 |
10 | output_path = os.path.dirname(os.path.abspath(__file__))
11 | config = WavegradConfig(
12 | batch_size=32,
13 | eval_batch_size=16,
14 | num_loader_workers=4,
15 | num_eval_loader_workers=4,
16 | run_eval=True,
17 | test_delay_epochs=-1,
18 | epochs=1000,
19 | seq_len=6144,
20 | pad_short=2000,
21 | use_noise_augment=True,
22 | eval_split_size=50,
23 | print_step=50,
24 | print_eval=True,
25 | mixed_precision=False,
26 | data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"),
27 | output_path=output_path,
28 | )
29 |
30 | # init audio processor
31 | ap = AudioProcessor(**config.audio.to_dict())
32 |
33 | # load training samples
34 | eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)
35 |
36 | # init model
37 | model = Wavegrad(config)
38 |
39 | # init the trainer and 🚀
40 | trainer = Trainer(
41 | TrainerArgs(),
42 | config,
43 | output_path,
44 | model=model,
45 | train_samples=train_samples,
46 | eval_samples=eval_samples,
47 | training_assets={"audio_processor": ap},
48 | )
49 | trainer.fit()
50 |
--------------------------------------------------------------------------------
/recipes/ljspeech/wavernn/train_wavernn.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from trainer import Trainer, TrainerArgs
4 |
5 | from TTS.utils.audio import AudioProcessor
6 | from TTS.vocoder.configs import WavernnConfig
7 | from TTS.vocoder.datasets.preprocess import load_wav_data
8 | from TTS.vocoder.models.wavernn import Wavernn
9 |
10 | output_path = os.path.dirname(os.path.abspath(__file__))
11 | config = WavernnConfig(
12 | batch_size=64,
13 | eval_batch_size=16,
14 | num_loader_workers=4,
15 | num_eval_loader_workers=4,
16 | run_eval=True,
17 | test_delay_epochs=-1,
18 | epochs=10000,
19 | seq_len=1280,
20 | pad_short=2000,
21 | use_noise_augment=False,
22 | eval_split_size=10,
23 | print_step=25,
24 | print_eval=True,
25 | mixed_precision=False,
26 | lr=1e-4,
27 | grad_clip=4,
28 | data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"),
29 | output_path=output_path,
30 | )
31 |
32 | # init audio processor
33 | ap = AudioProcessor(**config.audio.to_dict())
34 |
35 | # load training samples
36 | eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)
37 |
38 | # init model
39 | model = Wavernn(config)
40 |
41 | # init the trainer and 🚀
42 | trainer = Trainer(
43 | TrainerArgs(),
44 | config,
45 | output_path,
46 | model=model,
47 | train_samples=train_samples,
48 | eval_samples=eval_samples,
49 | training_assets={"audio_processor": ap},
50 | )
51 | trainer.fit()
52 |
--------------------------------------------------------------------------------
/recipes/thorsten_DE/README.md:
--------------------------------------------------------------------------------
1 | # 🐸💬 TTS Thorsten Recipes
2 |
3 | For running the recipes you need the [Thorsten-Voice](https://github.com/thorstenMueller/Thorsten-Voice) dataset.
4 |
5 | You can download it manually from [the official website](https://www.thorsten-voice.de/) or use ```download_thorsten_DE.sh```. Alternatively, running any of the **train_modelX.py** scripts will download the dataset if not already present.
6 |
7 | Then, go to your desired model folder and run the training.
8 |
9 | Running Python files. (Choose the desired GPU ID for your run and set ```CUDA_VISIBLE_DEVICES```)
10 | ```terminal
11 | CUDA_VISIBLE_DEVICES="0" python train_modelX.py
12 | ```
13 |
14 | 💡 Note that these runs are just templates to help you start training your first model. They are not optimized for the best
15 | result. Double-check the configurations and feel free to share your experiments to find better parameters together 💪.
16 |
--------------------------------------------------------------------------------
/recipes/thorsten_DE/align_tts/train_aligntts.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from trainer import Trainer, TrainerArgs
4 |
5 | from TTS.tts.configs.align_tts_config import AlignTTSConfig
6 | from TTS.tts.configs.shared_configs import BaseDatasetConfig
7 | from TTS.tts.datasets import load_tts_samples
8 | from TTS.tts.models.align_tts import AlignTTS
9 | from TTS.tts.utils.text.tokenizer import TTSTokenizer
10 | from TTS.utils.audio import AudioProcessor
11 | from TTS.utils.downloaders import download_thorsten_de
12 |
13 | output_path = os.path.dirname(os.path.abspath(__file__))
14 |
15 | # init configs
16 | dataset_config = BaseDatasetConfig(
17 | formatter="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/")
18 | )
19 |
20 | # download dataset if not already present
21 | if not os.path.exists(dataset_config.path):
22 | print("Downloading dataset")
23 | download_thorsten_de(os.path.split(os.path.abspath(dataset_config.path))[0])
24 |
25 | config = AlignTTSConfig(
26 | batch_size=32,
27 | eval_batch_size=16,
28 | num_loader_workers=4,
29 | num_eval_loader_workers=4,
30 | run_eval=True,
31 | test_delay_epochs=-1,
32 | epochs=1000,
33 | text_cleaner="phoneme_cleaners",
34 | use_phonemes=False,
35 | phoneme_language="de",
36 | phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
37 | print_step=25,
38 | print_eval=True,
39 | mixed_precision=False,
40 | test_sentences=[
41 |         "Es hat mich viel Zeit gekostet eine Stimme zu entwickeln, jetzt wo ich sie habe werde ich nicht mehr schweigen.",
42 | "Sei eine Stimme, kein Echo.",
43 | "Es tut mir Leid David. Das kann ich leider nicht machen.",
44 | "Dieser Kuchen ist großartig. Er ist so lecker und feucht.",
45 | "Vor dem 22. November 1963.",
46 | ],
47 | output_path=output_path,
48 | datasets=[dataset_config],
49 | )
50 |
51 | # INITIALIZE THE AUDIO PROCESSOR
52 | # Audio processor is used for feature extraction and audio I/O.
53 | # It mainly serves to the dataloader and the training loggers.
54 | ap = AudioProcessor.init_from_config(config)
55 |
56 | # INITIALIZE THE TOKENIZER
57 | # Tokenizer is used to convert text to sequences of token IDs.
58 | # If characters are not defined in the config, default characters are passed to the config
59 | tokenizer, config = TTSTokenizer.init_from_config(config)
60 |
61 | # LOAD DATA SAMPLES
62 | # Each sample is a list of ```[text, audio_file_path, speaker_name]```
63 | # You can define your custom sample loader returning the list of samples.
64 | # Or define your custom formatter and pass it to the `load_tts_samples`.
65 | # Check `TTS.tts.datasets.load_tts_samples` for more details.
66 | train_samples, eval_samples = load_tts_samples(
67 | dataset_config,
68 | eval_split=True,
69 | eval_split_max_size=config.eval_split_max_size,
70 | eval_split_size=config.eval_split_size,
71 | )
72 |
73 | # init model
74 | model = AlignTTS(config, ap, tokenizer)
75 |
76 | # INITIALIZE THE TRAINER
77 | # Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training,
78 | # distributed training, etc.
79 | trainer = Trainer(
80 | TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
81 | )
82 |
83 | # AND... 3,2,1... 🚀
84 | trainer.fit()
85 |
--------------------------------------------------------------------------------
/recipes/thorsten_DE/download_thorsten_DE.sh:
--------------------------------------------------------------------------------
1 | # create venv
2 | python3 -m venv env
3 | source env/bin/activate
4 | pip install pip --upgrade
5 |
6 | # download Thorsten_DE dataset
7 | pip install gdown
8 | gdown --id 1yKJM1LAOQpRVojKunD9r8WN_p5KzBxjc -O dataset.tgz
9 | tar -xzf dataset.tgz
10 |
11 | # create train-val splits
12 | shuf LJSpeech-1.1/metadata.csv > LJSpeech-1.1/metadata_shuf.csv
13 | head -n 20668 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_train.csv
14 | tail -n 2000 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_val.csv
15 |
16 | # rename dataset and remove archive
17 | mv LJSpeech-1.1 thorsten-de
18 | rm dataset.tgz
19 |
20 | # destroy venv
21 | rm -rf env
22 |
--------------------------------------------------------------------------------
/recipes/thorsten_DE/hifigan/train_hifigan.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from trainer import Trainer, TrainerArgs
4 |
5 | from TTS.utils.audio import AudioProcessor
6 | from TTS.utils.downloaders import download_thorsten_de
7 | from TTS.vocoder.configs import HifiganConfig
8 | from TTS.vocoder.datasets.preprocess import load_wav_data
9 | from TTS.vocoder.models.gan import GAN
10 |
11 | output_path = os.path.dirname(os.path.abspath(__file__))
12 |
13 | config = HifiganConfig(
14 | batch_size=32,
15 | eval_batch_size=16,
16 | num_loader_workers=4,
17 | num_eval_loader_workers=4,
18 | run_eval=True,
19 | test_delay_epochs=5,
20 | epochs=1000,
21 | seq_len=8192,
22 | pad_short=2000,
23 | use_noise_augment=True,
24 | eval_split_size=10,
25 | print_step=25,
26 | print_eval=False,
27 | mixed_precision=False,
28 | lr_gen=1e-4,
29 | lr_disc=1e-4,
30 | data_path=os.path.join(output_path, "../thorsten-de/wavs/"),
31 | output_path=output_path,
32 | )
33 |
34 | # download dataset if not already present
35 | if not os.path.exists(config.data_path):
36 | print("Downloading dataset")
37 | download_path = os.path.abspath(os.path.join(os.path.abspath(config.data_path), "../../"))
38 | download_thorsten_de(download_path)
39 |
40 | # init audio processor
41 | ap = AudioProcessor(**config.audio.to_dict())
42 |
43 | # load training samples
44 | eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)
45 |
46 | # init model
47 | model = GAN(config, ap)
48 |
49 | # init the trainer and 🚀
50 | trainer = Trainer(
51 | TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
52 | )
53 | trainer.fit()
54 |
--------------------------------------------------------------------------------
/recipes/thorsten_DE/multiband_melgan/train_multiband_melgan.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from trainer import Trainer, TrainerArgs
4 |
5 | from TTS.utils.audio import AudioProcessor
6 | from TTS.utils.downloaders import download_thorsten_de
7 | from TTS.vocoder.configs import MultibandMelganConfig
8 | from TTS.vocoder.datasets.preprocess import load_wav_data
9 | from TTS.vocoder.models.gan import GAN
10 |
11 | output_path = os.path.dirname(os.path.abspath(__file__))
12 |
13 | config = MultibandMelganConfig(
14 | batch_size=32,
15 | eval_batch_size=16,
16 | num_loader_workers=4,
17 | num_eval_loader_workers=4,
18 | run_eval=True,
19 | test_delay_epochs=5,
20 | epochs=1000,
21 | seq_len=8192,
22 | pad_short=2000,
23 | use_noise_augment=True,
24 | eval_split_size=10,
25 | print_step=25,
26 | print_eval=False,
27 | mixed_precision=False,
28 | lr_gen=1e-4,
29 | lr_disc=1e-4,
30 | data_path=os.path.join(output_path, "../thorsten-de/wavs/"),
31 | output_path=output_path,
32 | )
33 |
34 | # download dataset if not already present
35 | if not os.path.exists(config.data_path):
36 | print("Downloading dataset")
37 | download_path = os.path.abspath(os.path.join(os.path.abspath(config.data_path), "../../"))
38 | download_thorsten_de(download_path)
39 |
40 | # init audio processor
41 | ap = AudioProcessor(**config.audio.to_dict())
42 |
43 | # load training samples
44 | eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)
45 |
46 | # init model
47 | model = GAN(config, ap)
48 |
49 | # init the trainer and 🚀
50 | trainer = Trainer(
51 | TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
52 | )
53 | trainer.fit()
54 |
--------------------------------------------------------------------------------
/recipes/thorsten_DE/univnet/train_univnet.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from trainer import Trainer, TrainerArgs
4 |
5 | from TTS.utils.audio import AudioProcessor
6 | from TTS.utils.downloaders import download_thorsten_de
7 | from TTS.vocoder.configs import UnivnetConfig
8 | from TTS.vocoder.datasets.preprocess import load_wav_data
9 | from TTS.vocoder.models.gan import GAN
10 |
11 | output_path = os.path.dirname(os.path.abspath(__file__))
12 | config = UnivnetConfig(
13 | batch_size=64,
14 | eval_batch_size=16,
15 | num_loader_workers=4,
16 | num_eval_loader_workers=4,
17 | run_eval=True,
18 | test_delay_epochs=-1,
19 | epochs=1000,
20 | seq_len=8192,
21 | pad_short=2000,
22 | use_noise_augment=True,
23 | eval_split_size=10,
24 | print_step=25,
25 | print_eval=False,
26 | mixed_precision=False,
27 | lr_gen=1e-4,
28 | lr_disc=1e-4,
29 | data_path=os.path.join(output_path, "../thorsten-de/wavs/"),
30 | output_path=output_path,
31 | )
32 |
33 | # download dataset if not already present
34 | if not os.path.exists(config.data_path):
35 | print("Downloading dataset")
36 | download_path = os.path.abspath(os.path.join(os.path.abspath(config.data_path), "../../"))
37 | download_thorsten_de(download_path)
38 |
39 | # init audio processor
40 | ap = AudioProcessor(**config.audio.to_dict())
41 |
42 | # load training samples
43 | eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)
44 |
45 | # init model
46 | model = GAN(config, ap)
47 |
48 | # init the trainer and 🚀
49 | trainer = Trainer(
50 | TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
51 | )
52 | trainer.fit()
53 |
--------------------------------------------------------------------------------
/recipes/thorsten_DE/wavegrad/train_wavegrad.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from trainer import Trainer, TrainerArgs
4 |
5 | from TTS.utils.audio import AudioProcessor
6 | from TTS.utils.downloaders import download_thorsten_de
7 | from TTS.vocoder.configs import WavegradConfig
8 | from TTS.vocoder.datasets.preprocess import load_wav_data
9 | from TTS.vocoder.models.wavegrad import Wavegrad
10 |
11 | output_path = os.path.dirname(os.path.abspath(__file__))
12 | config = WavegradConfig(
13 | batch_size=32,
14 | eval_batch_size=16,
15 | num_loader_workers=4,
16 | num_eval_loader_workers=4,
17 | run_eval=True,
18 | test_delay_epochs=-1,
19 | epochs=1000,
20 | seq_len=6144,
21 | pad_short=2000,
22 | use_noise_augment=True,
23 | eval_split_size=50,
24 | print_step=50,
25 | print_eval=True,
26 | mixed_precision=False,
27 | data_path=os.path.join(output_path, "../thorsten-de/wavs/"),
28 | output_path=output_path,
29 | )
30 |
31 | # download dataset if not already present
32 | if not os.path.exists(config.data_path):
33 | print("Downloading dataset")
34 | download_path = os.path.abspath(os.path.join(os.path.abspath(config.data_path), "../../"))
35 | download_thorsten_de(download_path)
36 |
37 | # init audio processor
38 | ap = AudioProcessor(**config.audio.to_dict())
39 |
40 | # load training samples
41 | eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)
42 |
43 | # init model
44 | model = Wavegrad(config)
45 |
46 | # init the trainer and 🚀
47 | trainer = Trainer(
48 | TrainerArgs(),
49 | config,
50 | output_path,
51 | model=model,
52 | train_samples=train_samples,
53 | eval_samples=eval_samples,
54 | training_assets={"audio_processor": ap},
55 | )
56 | trainer.fit()
57 |
--------------------------------------------------------------------------------
/recipes/thorsten_DE/wavernn/train_wavernn.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from trainer import Trainer, TrainerArgs
4 |
5 | from TTS.utils.audio import AudioProcessor
6 | from TTS.utils.downloaders import download_thorsten_de
7 | from TTS.vocoder.configs import WavernnConfig
8 | from TTS.vocoder.datasets.preprocess import load_wav_data
9 | from TTS.vocoder.models.wavernn import Wavernn
10 |
11 | output_path = os.path.dirname(os.path.abspath(__file__))
12 | config = WavernnConfig(
13 | batch_size=64,
14 | eval_batch_size=16,
15 | num_loader_workers=4,
16 | num_eval_loader_workers=4,
17 | run_eval=True,
18 | test_delay_epochs=-1,
19 | epochs=10000,
20 | seq_len=1280,
21 | pad_short=2000,
22 | use_noise_augment=False,
23 | eval_split_size=10,
24 | print_step=25,
25 | print_eval=True,
26 | mixed_precision=False,
27 | lr=1e-4,
28 | grad_clip=4,
29 | data_path=os.path.join(output_path, "../thorsten-de/wavs/"),
30 | output_path=output_path,
31 | )
32 |
33 | # download dataset if not already present
34 | if not os.path.exists(config.data_path):
35 | print("Downloading dataset")
36 | download_path = os.path.abspath(os.path.join(os.path.abspath(config.data_path), "../../"))
37 | download_thorsten_de(download_path)
38 |
39 | # init audio processor
40 | ap = AudioProcessor(**config.audio.to_dict())
41 |
42 | # load training samples
43 | eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)
44 |
45 | # init model
46 | model = Wavernn(config)
47 |
48 | # init the trainer and 🚀
49 | trainer = Trainer(
50 | TrainerArgs(),
51 | config,
52 | output_path,
53 | model=model,
54 | train_samples=train_samples,
55 | eval_samples=eval_samples,
56 | training_assets={"audio_processor": ap},
57 | )
58 | trainer.fit()
59 |
--------------------------------------------------------------------------------
/recipes/vctk/delightful_tts/train_delightful_tts.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from trainer import Trainer, TrainerArgs
4 |
5 | from TTS.config.shared_configs import BaseDatasetConfig
6 | from TTS.tts.configs.delightful_tts_config import DelightfulTtsAudioConfig, DelightfulTTSConfig
7 | from TTS.tts.datasets import load_tts_samples
8 | from TTS.tts.models.delightful_tts import DelightfulTTS, DelightfulTtsArgs, VocoderConfig
9 | from TTS.tts.utils.speakers import SpeakerManager
10 | from TTS.tts.utils.text.tokenizer import TTSTokenizer
11 | from TTS.utils.audio.processor import AudioProcessor
12 |
13 | data_path = "/raid/datasets/vctk_v092_48khz_removed_silence_silero_vad"
14 | output_path = os.path.dirname(os.path.abspath(__file__))
15 |
16 |
17 | dataset_config = BaseDatasetConfig(
18 | dataset_name="vctk", formatter="vctk", meta_file_train="", path=data_path, language="en-us"
19 | )
20 |
21 | audio_config = DelightfulTtsAudioConfig()
22 |
23 | model_args = DelightfulTtsArgs()
24 |
25 | vocoder_config = VocoderConfig()
26 |
27 | delightful_tts_config = DelightfulTTSConfig(
28 | run_name="delightful_tts_vctk",
29 | run_description="Train like in delightful tts paper.",
30 | model_args=model_args,
31 | audio=audio_config,
32 | vocoder=vocoder_config,
33 | batch_size=32,
34 | eval_batch_size=16,
35 | num_loader_workers=10,
36 | num_eval_loader_workers=10,
37 | precompute_num_workers=40,
38 | compute_input_seq_cache=True,
39 | compute_f0=True,
40 | f0_cache_path=os.path.join(output_path, "f0_cache"),
41 | run_eval=True,
42 | test_delay_epochs=-1,
43 | epochs=1000,
44 | text_cleaner="english_cleaners",
45 | use_phonemes=True,
46 | phoneme_language="en-us",
47 | phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
48 | print_step=50,
49 | print_eval=False,
50 | mixed_precision=True,
51 | output_path=output_path,
52 | datasets=[dataset_config],
53 | start_by_longest=True,
54 | binary_align_loss_alpha=0.0,
55 | use_attn_priors=False,
56 | max_text_len=60,
57 | steps_to_start_discriminator=10000,
58 | )
59 |
60 | tokenizer, config = TTSTokenizer.init_from_config(delightful_tts_config)
61 |
62 | ap = AudioProcessor.init_from_config(config)
63 |
64 |
65 | train_samples, eval_samples = load_tts_samples(
66 | dataset_config,
67 | eval_split=True,
68 | eval_split_max_size=config.eval_split_max_size,
69 | eval_split_size=config.eval_split_size,
70 | )
71 |
72 |
73 | speaker_manager = SpeakerManager()
74 | speaker_manager.set_ids_from_data(train_samples + eval_samples, parse_key="speaker_name")
75 | config.model_args.num_speakers = speaker_manager.num_speakers
76 |
77 |
78 | model = DelightfulTTS(ap=ap, config=config, tokenizer=tokenizer, speaker_manager=speaker_manager, emotion_manager=None)
79 |
80 | trainer = Trainer(
81 | TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
82 | )
83 |
84 | trainer.fit()
85 |
--------------------------------------------------------------------------------
/recipes/vctk/download_vctk.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # take the scripts's parent's directory to prefix all the output paths.
3 | RUN_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
4 | echo $RUN_DIR
5 | # download VCTK dataset
6 | wget https://datashare.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip -O VCTK-Corpus-0.92.zip
7 | # extract
8 | mkdir VCTK
9 | unzip VCTK-Corpus-0.92 -d VCTK
10 | # create train-val splits
11 | mv VCTK $RUN_DIR/recipes/vctk/
12 | rm VCTK-Corpus-0.92.zip
13 |
--------------------------------------------------------------------------------
/recipes/vctk/vits/train_vits.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from trainer import Trainer, TrainerArgs
4 |
5 | from TTS.tts.configs.shared_configs import BaseDatasetConfig
6 | from TTS.tts.configs.vits_config import VitsConfig
7 | from TTS.tts.datasets import load_tts_samples
8 | from TTS.tts.models.vits import Vits, VitsArgs, VitsAudioConfig
9 | from TTS.tts.utils.speakers import SpeakerManager
10 | from TTS.tts.utils.text.tokenizer import TTSTokenizer
11 | from TTS.utils.audio import AudioProcessor
12 |
13 | output_path = os.path.dirname(os.path.abspath(__file__))
14 | dataset_config = BaseDatasetConfig(
15 | formatter="vctk", meta_file_train="", language="en-us", path=os.path.join(output_path, "../VCTK/")
16 | )
17 |
18 |
19 | audio_config = VitsAudioConfig(
20 | sample_rate=22050, win_length=1024, hop_length=256, num_mels=80, mel_fmin=0, mel_fmax=None
21 | )
22 |
23 | vitsArgs = VitsArgs(
24 | use_speaker_embedding=True,
25 | )
26 |
27 | config = VitsConfig(
28 | model_args=vitsArgs,
29 | audio=audio_config,
30 | run_name="vits_vctk",
31 | batch_size=32,
32 | eval_batch_size=16,
33 | batch_group_size=5,
34 | num_loader_workers=4,
35 | num_eval_loader_workers=4,
36 | run_eval=True,
37 | test_delay_epochs=-1,
38 | epochs=1000,
39 | text_cleaner="english_cleaners",
40 | use_phonemes=True,
41 | phoneme_language="en",
42 | phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
43 | compute_input_seq_cache=True,
44 | print_step=25,
45 | print_eval=False,
46 | mixed_precision=True,
47 |     max_text_len=325, # change this if you have more than 16 GB of VRAM
48 | output_path=output_path,
49 | datasets=[dataset_config],
50 | cudnn_benchmark=False,
51 | )
52 |
53 | # INITIALIZE THE AUDIO PROCESSOR
54 | # Audio processor is used for feature extraction and audio I/O.
55 | # It mainly serves the data loader and the training loggers.
56 | ap = AudioProcessor.init_from_config(config)
57 |
58 | # INITIALIZE THE TOKENIZER
59 | # Tokenizer is used to convert text to sequences of token IDs.
60 | # The config is updated with the default characters if they are not already defined.
61 | tokenizer, config = TTSTokenizer.init_from_config(config)
62 |
63 | # LOAD DATA SAMPLES
64 | # Each sample is a list of `[text, audio_file_path, speaker_name]`.
65 | # You can define a custom sample loader returning the list of samples,
66 | # or define a custom formatter and pass it to `load_tts_samples` (a sketch follows after this file).
67 | # Check `TTS.tts.datasets.load_tts_samples` for more details.
68 | train_samples, eval_samples = load_tts_samples(
69 | dataset_config,
70 | eval_split=True,
71 | eval_split_max_size=config.eval_split_max_size,
72 | eval_split_size=config.eval_split_size,
73 | )
74 |
75 | # init speaker manager for multi-speaker training
76 | # it maps speaker names to the integer IDs used by the model and the data loader
77 | speaker_manager = SpeakerManager()
78 | speaker_manager.set_ids_from_data(train_samples + eval_samples, parse_key="speaker_name")
79 | config.model_args.num_speakers = speaker_manager.num_speakers
80 |
81 | # init model
82 | model = Vits(config, ap, tokenizer, speaker_manager)
83 |
84 | # init the trainer and 🚀
85 | trainer = Trainer(
86 | TrainerArgs(),
87 | config,
88 | output_path,
89 | model=model,
90 | train_samples=train_samples,
91 | eval_samples=eval_samples,
92 | )
93 | trainer.fit()
94 |
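The comments in this recipe mention passing a custom formatter to load_tts_samples. Below is a minimal sketch; the metadata layout (wav_name|text|speaker) is hypothetical, and the sample keys (text, audio_file, speaker_name, root_path) mirror the built-in formatters, so verify them against TTS.tts.datasets.formatters before relying on this.

    import os

    from TTS.tts.datasets import load_tts_samples

    def my_formatter(root_path, meta_file, **kwargs):
        """Hypothetical formatter for a pipe-separated `wav_name|text|speaker` metadata file."""
        samples = []
        with open(os.path.join(root_path, meta_file), encoding="utf-8") as f:
            for line in f:
                wav_name, text, speaker = line.strip().split("|")
                samples.append(
                    {
                        "text": text,
                        "audio_file": os.path.join(root_path, "wavs", wav_name + ".wav"),
                        "speaker_name": speaker,
                        "root_path": root_path,
                    }
                )
        return samples

    # used in place of the registered "vctk" formatter
    train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, formatter=my_formatter)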
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | # core deps
2 | numpy==1.22.0;python_version<="3.10"
3 | numpy>=1.24.3;python_version>"3.10"
4 | cython>=0.29.30
5 | scipy>=1.11.2
6 | torch>=2.1
7 | torchaudio
8 | soundfile>=0.12.0
9 | librosa>=0.10.0
10 | scikit-learn>=1.3.0
11 | numba==0.55.1;python_version<"3.9"
12 | numba>=0.57.0;python_version>="3.9"
13 | inflect>=5.6.0
14 | tqdm>=4.64.1
15 | anyascii>=0.3.0
16 | pyyaml>=6.0
17 | fsspec>=2023.6.0 # <= 2023.9.1 makes aux tests fail
18 | aiohttp>=3.8.1
19 | packaging>=23.1
20 | mutagen==1.47.0
21 | # deps for examples
22 | flask>=2.0.1
23 | # deps for inference
24 | pysbd>=0.3.4
25 | # deps for notebooks
26 | umap-learn>=0.5.1
27 | pandas>=1.4,<2.0
28 | # deps for training
29 | matplotlib>=3.7.0
30 | # coqui stack
31 | trainer>=0.0.36
32 | # config management
33 | coqpit>=0.0.16
34 | # chinese g2p deps
35 | jieba
36 | pypinyin
37 | # korean
38 | hangul_romanize
39 | # gruut+supported langs
40 | gruut[de,es,fr]==2.2.3
41 | # deps for korean
42 | jamo
43 | nltk
44 | g2pkk>=0.1.1
45 | # deps for bangla
46 | bangla
47 | bnnumerizer
48 | bnunicodenormalizer
49 | # deps for tortoise
50 | einops>=0.6.0
51 | transformers>=4.45.2
52 | # deps for bark
53 | encodec>=0.1.1
54 | # deps for XTTS
55 | unidecode>=1.3.2
56 | num2words
57 | spacy[ja]>=3
58 | tokenizers==0.20.1
59 | vinorm==2.0.7
60 | underthesea==6.8.4
61 |
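A quick way to confirm that the core pins above resolved in the active environment is to import them and print their versions. This is only a convenience sketch, not part of the repository.

    # Dependency smoke test for the core pins above.
    import numpy, scipy, torch, torchaudio, librosa, sklearn

    for mod in (numpy, scipy, torch, torchaudio, librosa, sklearn):
        print(f"{mod.__name__:<12} {mod.__version__}")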
--------------------------------------------------------------------------------
/train_dvae_xtts.sh:
--------------------------------------------------------------------------------
1 | CUDA_VISIBLE_DEVICES=1 python train_dvae_xtts.py \
2 | --output_path=checkpoints/ \
3 | --train_csv_path=datasets/metadata_train.csv \
4 | --eval_csv_path=datasets/metadata_eval.csv \
5 | --language="vi" \
6 | --num_epochs=5 \
7 | --batch_size=512 \
8 | --lr=5e-6
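train_dvae_xtts.py is driven by the train/eval metadata CSVs referenced above; their exact schema is defined in that script. The sketch below assumes the pipe-separated audio_file|text|speaker_name layout used by demos/xtts_ft_demo/utils/formatter.py elsewhere in this repository, with hypothetical rows.

    # Hedged sketch: writing a metadata CSV in the assumed pipe-separated layout.
    import os
    import pandas as pd

    rows = [
        {"audio_file": "wavs/clip_0001.wav", "text": "xin chào", "speaker_name": "spk_01"},
        {"audio_file": "wavs/clip_0002.wav", "text": "tạm biệt", "speaker_name": "spk_01"},
    ]
    os.makedirs("datasets", exist_ok=True)
    pd.DataFrame(rows).to_csv("datasets/metadata_train.csv", sep="|", index=False)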
--------------------------------------------------------------------------------
/train_gpt_xtts.sh:
--------------------------------------------------------------------------------
1 | CUDA_VISIBLE_DEVICES=0 python train_gpt_xtts.py \
2 | --output_path checkpoints/ \
3 | --metadatas datasets/metadata_train_v2.csv,datasets/metadata_eval_v2.csv,vi large-datasets/metadata_train.csv,large-datasets/metadata_eval.csv,vi \
4 | --num_epochs 3 \
5 | --batch_size 8 \
6 | --grad_acumm 4 \
7 | --max_text_length 400 \
8 | --max_audio_length 330750 \
9 | --weight_decay 1e-2 \
10 | --lr 5e-6 \
11 | --save_step 50000
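The --metadatas flag packs several datasets into one argument: space-separated groups, each a comma-separated train_csv,eval_csv,language triplet. How train_gpt_xtts.py consumes this is defined in that script; the sketch below only illustrates one plausible way to split such a value, and the helper name is hypothetical.

    # Hedged sketch: splitting --metadatas values into (train_csv, eval_csv, language) triplets.
    def parse_metadatas(values):
        """values: list of "train_csv,eval_csv,language" strings (e.g. from argparse with nargs="+")."""
        triplets = []
        for value in values:
            train_csv, eval_csv, language = value.split(",")
            triplets.append((train_csv, eval_csv, language))
        return triplets

    print(parse_metadatas([
        "datasets/metadata_train_v2.csv,datasets/metadata_eval_v2.csv,vi",
        "large-datasets/metadata_train.csv,large-datasets/metadata_eval.csv,vi",
    ]))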
--------------------------------------------------------------------------------