├── .gitignore ├── Readme.md ├── TTS ├── .models.json ├── VERSION ├── __init__.py ├── api.py ├── bin │ ├── __init__.py │ ├── collect_env_info.py │ ├── compute_attention_masks.py │ ├── compute_embeddings.py │ ├── compute_statistics.py │ ├── eval_encoder.py │ ├── extract_tts_spectrograms.py │ ├── find_unique_chars.py │ ├── find_unique_phonemes.py │ ├── remove_silence_using_vad.py │ ├── resample.py │ ├── synthesize.py │ ├── train_encoder.py │ ├── train_tts.py │ ├── train_vocoder.py │ └── tune_wavegrad.py ├── config │ ├── __init__.py │ └── shared_configs.py ├── demos │ └── xtts_ft_demo │ │ ├── requirements.txt │ │ ├── utils │ │ ├── formatter.py │ │ └── gpt_train.py │ │ └── xtts_demo.py ├── encoder │ ├── README.md │ ├── __init__.py │ ├── configs │ │ ├── base_encoder_config.py │ │ ├── emotion_encoder_config.py │ │ └── speaker_encoder_config.py │ ├── dataset.py │ ├── losses.py │ ├── models │ │ ├── base_encoder.py │ │ ├── lstm.py │ │ └── resnet.py │ ├── requirements.txt │ └── utils │ │ ├── __init__.py │ │ ├── generic_utils.py │ │ ├── prepare_voxceleb.py │ │ ├── training.py │ │ └── visual.py ├── model.py ├── server │ ├── README.md │ ├── __init__.py │ ├── conf.json │ ├── server.py │ ├── static │ │ └── coqui-log-green-TTS.png │ └── templates │ │ ├── details.html │ │ └── index.html ├── tts │ ├── __init__.py │ ├── configs │ │ ├── __init__.py │ │ ├── align_tts_config.py │ │ ├── bark_config.py │ │ ├── delightful_tts_config.py │ │ ├── fast_pitch_config.py │ │ ├── fast_speech_config.py │ │ ├── fastspeech2_config.py │ │ ├── glow_tts_config.py │ │ ├── neuralhmm_tts_config.py │ │ ├── overflow_config.py │ │ ├── shared_configs.py │ │ ├── speedy_speech_config.py │ │ ├── tacotron2_config.py │ │ ├── tacotron_config.py │ │ ├── tortoise_config.py │ │ ├── vits_config.py │ │ └── xtts_config.py │ ├── datasets │ │ ├── __init__.py │ │ ├── dataset.py │ │ └── formatters.py │ ├── layers │ │ ├── __init__.py │ │ ├── align_tts │ │ │ ├── __init__.py │ │ │ ├── duration_predictor.py │ │ │ └── mdn.py │ │ ├── bark │ │ │ ├── __init__.py │ │ │ ├── hubert │ │ │ │ ├── __init__.py │ │ │ │ ├── hubert_manager.py │ │ │ │ ├── kmeans_hubert.py │ │ │ │ └── tokenizer.py │ │ │ ├── inference_funcs.py │ │ │ ├── load_model.py │ │ │ ├── model.py │ │ │ └── model_fine.py │ │ ├── delightful_tts │ │ │ ├── __init__.py │ │ │ ├── acoustic_model.py │ │ │ ├── conformer.py │ │ │ ├── conv_layers.py │ │ │ ├── encoders.py │ │ │ ├── energy_adaptor.py │ │ │ ├── kernel_predictor.py │ │ │ ├── networks.py │ │ │ ├── phoneme_prosody_predictor.py │ │ │ ├── pitch_adaptor.py │ │ │ └── variance_predictor.py │ │ ├── feed_forward │ │ │ ├── __init__.py │ │ │ ├── decoder.py │ │ │ ├── duration_predictor.py │ │ │ └── encoder.py │ │ ├── generic │ │ │ ├── __init__.py │ │ │ ├── aligner.py │ │ │ ├── gated_conv.py │ │ │ ├── normalization.py │ │ │ ├── pos_encoding.py │ │ │ ├── res_conv_bn.py │ │ │ ├── time_depth_sep_conv.py │ │ │ ├── transformer.py │ │ │ └── wavenet.py │ │ ├── glow_tts │ │ │ ├── __init__.py │ │ │ ├── decoder.py │ │ │ ├── duration_predictor.py │ │ │ ├── encoder.py │ │ │ ├── glow.py │ │ │ └── transformer.py │ │ ├── losses.py │ │ ├── overflow │ │ │ ├── __init__.py │ │ │ ├── common_layers.py │ │ │ ├── decoder.py │ │ │ ├── neural_hmm.py │ │ │ └── plotting_utils.py │ │ ├── tacotron │ │ │ ├── __init__.py │ │ │ ├── attentions.py │ │ │ ├── capacitron_layers.py │ │ │ ├── common_layers.py │ │ │ ├── gst_layers.py │ │ │ ├── tacotron.py │ │ │ └── tacotron2.py │ │ ├── tortoise │ │ │ ├── arch_utils.py │ │ │ ├── audio_utils.py │ │ │ ├── autoregressive.py │ │ │ ├── classifier.py │ │ │ 
├── clvp.py │ │ │ ├── diffusion.py │ │ │ ├── diffusion_decoder.py │ │ │ ├── dpm_solver.py │ │ │ ├── random_latent_generator.py │ │ │ ├── tokenizer.py │ │ │ ├── transformer.py │ │ │ ├── utils.py │ │ │ ├── vocoder.py │ │ │ ├── wav2vec_alignment.py │ │ │ └── xtransformers.py │ │ ├── vits │ │ │ ├── discriminator.py │ │ │ ├── networks.py │ │ │ ├── stochastic_duration_predictor.py │ │ │ └── transforms.py │ │ └── xtts │ │ │ ├── dvae.py │ │ │ ├── gpt.py │ │ │ ├── gpt_inference.py │ │ │ ├── hifigan_decoder.py │ │ │ ├── latent_encoder.py │ │ │ ├── perceiver_encoder.py │ │ │ ├── stream_generator.py │ │ │ ├── tokenizer.py │ │ │ ├── trainer │ │ │ ├── dataset.py │ │ │ ├── dvae_dataset.py │ │ │ └── gpt_trainer.py │ │ │ ├── xtts_manager.py │ │ │ └── zh_num2words.py │ ├── models │ │ ├── __init__.py │ │ ├── align_tts.py │ │ ├── bark.py │ │ ├── base_tacotron.py │ │ ├── base_tts.py │ │ ├── delightful_tts.py │ │ ├── forward_tts.py │ │ ├── glow_tts.py │ │ ├── neuralhmm_tts.py │ │ ├── overflow.py │ │ ├── tacotron.py │ │ ├── tacotron2.py │ │ ├── tortoise.py │ │ ├── vits.py │ │ └── xtts.py │ └── utils │ │ ├── __init__.py │ │ ├── assets │ │ └── tortoise │ │ │ └── tokenizer.json │ │ ├── data.py │ │ ├── fairseq.py │ │ ├── helpers.py │ │ ├── languages.py │ │ ├── managers.py │ │ ├── measures.py │ │ ├── monotonic_align │ │ ├── __init__.py │ │ ├── core.pyx │ │ └── setup.py │ │ ├── speakers.py │ │ ├── ssim.py │ │ ├── synthesis.py │ │ ├── text │ │ ├── __init__.py │ │ ├── bangla │ │ │ ├── __init__.py │ │ │ └── phonemizer.py │ │ ├── belarusian │ │ │ ├── __init__.py │ │ │ └── phonemizer.py │ │ ├── characters.py │ │ ├── chinese_mandarin │ │ │ ├── __init__.py │ │ │ ├── numbers.py │ │ │ ├── phonemizer.py │ │ │ └── pinyinToPhonemes.py │ │ ├── cleaners.py │ │ ├── cmudict.py │ │ ├── english │ │ │ ├── __init__.py │ │ │ ├── abbreviations.py │ │ │ ├── number_norm.py │ │ │ └── time_norm.py │ │ ├── french │ │ │ ├── __init__.py │ │ │ └── abbreviations.py │ │ ├── japanese │ │ │ ├── __init__.py │ │ │ └── phonemizer.py │ │ ├── korean │ │ │ ├── __init__.py │ │ │ ├── ko_dictionary.py │ │ │ ├── korean.py │ │ │ └── phonemizer.py │ │ ├── phonemizers │ │ │ ├── __init__.py │ │ │ ├── bangla_phonemizer.py │ │ │ ├── base.py │ │ │ ├── belarusian_phonemizer.py │ │ │ ├── espeak_wrapper.py │ │ │ ├── gruut_wrapper.py │ │ │ ├── ja_jp_phonemizer.py │ │ │ ├── ko_kr_phonemizer.py │ │ │ ├── multi_phonemizer.py │ │ │ └── zh_cn_phonemizer.py │ │ ├── punctuation.py │ │ └── tokenizer.py │ │ └── visual.py ├── utils │ ├── __init__.py │ ├── audio │ │ ├── __init__.py │ │ ├── numpy_transforms.py │ │ ├── processor.py │ │ └── torch_transforms.py │ ├── callbacks.py │ ├── capacitron_optimizer.py │ ├── distribute.py │ ├── download.py │ ├── downloaders.py │ ├── generic_utils.py │ ├── io.py │ ├── manage.py │ ├── radam.py │ ├── samplers.py │ ├── synthesizer.py │ ├── training.py │ └── vad.py ├── vc │ ├── configs │ │ ├── __init__.py │ │ ├── freevc_config.py │ │ └── shared_configs.py │ ├── models │ │ ├── __init__.py │ │ ├── base_vc.py │ │ └── freevc.py │ └── modules │ │ ├── __init__.py │ │ └── freevc │ │ ├── __init__.py │ │ ├── commons.py │ │ ├── mel_processing.py │ │ ├── modules.py │ │ ├── speaker_encoder │ │ ├── __init__.py │ │ ├── audio.py │ │ ├── hparams.py │ │ └── speaker_encoder.py │ │ └── wavlm │ │ ├── __init__.py │ │ ├── config.json │ │ ├── modules.py │ │ └── wavlm.py └── vocoder │ ├── README.md │ ├── __init__.py │ ├── configs │ ├── __init__.py │ ├── fullband_melgan_config.py │ ├── hifigan_config.py │ ├── melgan_config.py │ ├── multiband_melgan_config.py │ ├── 
parallel_wavegan_config.py │ ├── shared_configs.py │ ├── univnet_config.py │ ├── wavegrad_config.py │ └── wavernn_config.py │ ├── datasets │ ├── __init__.py │ ├── gan_dataset.py │ ├── preprocess.py │ ├── wavegrad_dataset.py │ └── wavernn_dataset.py │ ├── layers │ ├── __init__.py │ ├── hifigan.py │ ├── losses.py │ ├── lvc_block.py │ ├── melgan.py │ ├── parallel_wavegan.py │ ├── pqmf.py │ ├── qmf.dat │ ├── upsample.py │ └── wavegrad.py │ ├── models │ ├── __init__.py │ ├── base_vocoder.py │ ├── fullband_melgan_generator.py │ ├── gan.py │ ├── hifigan_discriminator.py │ ├── hifigan_generator.py │ ├── melgan_discriminator.py │ ├── melgan_generator.py │ ├── melgan_multiscale_discriminator.py │ ├── multiband_melgan_generator.py │ ├── parallel_wavegan_discriminator.py │ ├── parallel_wavegan_generator.py │ ├── random_window_discriminator.py │ ├── univnet_discriminator.py │ ├── univnet_generator.py │ ├── wavegrad.py │ └── wavernn.py │ ├── pqmf_output.wav │ └── utils │ ├── __init__.py │ ├── distribution.py │ └── generic_utils.py ├── download_checkpoint.py ├── extend_vocab_config.py ├── recipes ├── README.md ├── bel-alex73 │ ├── .gitignore │ ├── README.md │ ├── choose_speaker.ipynb │ ├── docker-prepare-start.sh │ ├── docker-prepare │ │ ├── Dockerfile │ │ └── runtime.sh │ ├── dump_config.py │ ├── train_glowtts.py │ └── train_hifigan.py ├── blizzard2013 │ ├── README.md │ ├── tacotron1-Capacitron │ │ └── train_capacitron_t1.py │ └── tacotron2-Capacitron │ │ └── train_capacitron_t2.py ├── kokoro │ └── tacotron2-DDC │ │ ├── run.sh │ │ └── tacotron2-DDC.json ├── ljspeech │ ├── README.md │ ├── align_tts │ │ └── train_aligntts.py │ ├── delightful_tts │ │ └── train_delightful_tts.py │ ├── download_ljspeech.sh │ ├── fast_pitch │ │ └── train_fast_pitch.py │ ├── fast_speech │ │ └── train_fast_speech.py │ ├── fastspeech2 │ │ └── train_fastspeech2.py │ ├── glow_tts │ │ └── train_glowtts.py │ ├── hifigan │ │ └── train_hifigan.py │ ├── multiband_melgan │ │ └── train_multiband_melgan.py │ ├── neuralhmm_tts │ │ └── train_neuralhmmtts.py │ ├── overflow │ │ ├── lj_parameters.pt │ │ └── train_overflow.py │ ├── speedy_speech │ │ └── train_speedy_speech.py │ ├── tacotron2-Capacitron │ │ └── train_capacitron_t2.py │ ├── tacotron2-DCA │ │ └── train_tacotron_dca.py │ ├── tacotron2-DDC │ │ └── train_tacotron_ddc.py │ ├── univnet │ │ └── train.py │ ├── vits_tts │ │ └── train_vits.py │ ├── wavegrad │ │ └── train_wavegrad.py │ ├── wavernn │ │ └── train_wavernn.py │ ├── xtts_v1 │ │ └── train_gpt_xtts.py │ └── xtts_v2 │ │ └── train_gpt_xtts.py ├── multilingual │ ├── cml_yourtts │ │ └── train_yourtts.py │ └── vits_tts │ │ ├── train_vits_tts.py │ │ └── train_vits_tts_phonemes.py ├── thorsten_DE │ ├── README.md │ ├── align_tts │ │ └── train_aligntts.py │ ├── download_thorsten_DE.sh │ ├── glow_tts │ │ └── train_glowtts.py │ ├── hifigan │ │ └── train_hifigan.py │ ├── multiband_melgan │ │ └── train_multiband_melgan.py │ ├── speedy_speech │ │ └── train_speedy_speech.py │ ├── tacotron2-DDC │ │ └── train_tacotron_ddc.py │ ├── univnet │ │ └── train_univnet.py │ ├── vits_tts │ │ └── train_vits.py │ ├── wavegrad │ │ └── train_wavegrad.py │ └── wavernn │ │ └── train_wavernn.py └── vctk │ ├── delightful_tts │ └── train_delightful_tts.py │ ├── download_vctk.sh │ ├── fast_pitch │ └── train_fast_pitch.py │ ├── fast_speech │ └── train_fast_speech.py │ ├── glow_tts │ └── train_glow_tts.py │ ├── resnet_speaker_encoder │ └── train_encoder.py │ ├── speedy_speech │ └── train_speedy_speech.py │ ├── tacotron-DDC │ └── train_tacotron-DDC.py │ ├── 
tacotron2-DDC │ └── train_tacotron2-ddc.py │ ├── tacotron2 │ └── train_tacotron2.py │ ├── vits │ └── train_vits.py │ └── yourtts │ └── train_yourtts.py ├── requirements.txt ├── train_dvae_xtts.py ├── train_dvae_xtts.sh ├── train_gpt_xtts.py └── train_gpt_xtts.sh /.gitignore: -------------------------------------------------------------------------------- 1 | .venv/ 2 | hub/ 3 | *.pth 4 | __pycache__/ 5 | checkpoints/ 6 | datasets/ 7 | large-datasets/ 8 | wandb/ 9 | *.ipynb 10 | *.wav 11 | test.py 12 | cps/ 13 | vivoice-datasets/ 14 | output/ -------------------------------------------------------------------------------- /TTS/VERSION: -------------------------------------------------------------------------------- 1 | 0.22.0 2 | -------------------------------------------------------------------------------- /TTS/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | with open(os.path.join(os.path.dirname(__file__), "VERSION"), "r", encoding="utf-8") as f: 4 | version = f.read().strip() 5 | 6 | __version__ = version 7 | -------------------------------------------------------------------------------- /TTS/bin/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/bin/__init__.py -------------------------------------------------------------------------------- /TTS/bin/collect_env_info.py: -------------------------------------------------------------------------------- 1 | """Get detailed info about the working environment.""" 2 | import os 3 | import platform 4 | import sys 5 | 6 | import numpy 7 | import torch 8 | 9 | sys.path += [os.path.abspath(".."), os.path.abspath(".")] 10 | import json 11 | 12 | import TTS 13 | 14 | 15 | def system_info(): 16 | return { 17 | "OS": platform.system(), 18 | "architecture": platform.architecture(), 19 | "version": platform.version(), 20 | "processor": platform.processor(), 21 | "python": platform.python_version(), 22 | } 23 | 24 | 25 | def cuda_info(): 26 | return { 27 | "GPU": [torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())], 28 | "available": torch.cuda.is_available(), 29 | "version": torch.version.cuda, 30 | } 31 | 32 | 33 | def package_info(): 34 | return { 35 | "numpy": numpy.__version__, 36 | "PyTorch_version": torch.__version__, 37 | "PyTorch_debug": torch.version.debug, 38 | "TTS": TTS.__version__, 39 | } 40 | 41 | 42 | def main(): 43 | details = {"System": system_info(), "CUDA": cuda_info(), "Packages": package_info()} 44 | print(json.dumps(details, indent=4, sort_keys=True)) 45 | 46 | 47 | if __name__ == "__main__": 48 | main() 49 | -------------------------------------------------------------------------------- /TTS/bin/find_unique_chars.py: -------------------------------------------------------------------------------- 1 | """Find all the unique characters in a dataset""" 2 | import argparse 3 | from argparse import RawTextHelpFormatter 4 | 5 | from TTS.config import load_config 6 | from TTS.tts.datasets import load_tts_samples 7 | 8 | 9 | def main(): 10 | # pylint: disable=bad-option-value 11 | parser = argparse.ArgumentParser( 12 | description="""Find all the unique characters or phonemes in a dataset.\n\n""" 13 | """ 14 | Example runs: 15 | 16 | python TTS/bin/find_unique_chars.py --config_path config.json 17 | """, 18 | formatter_class=RawTextHelpFormatter, 19 | ) 20 | 
parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True) 21 | args = parser.parse_args() 22 | 23 | c = load_config(args.config_path) 24 | 25 | # load all datasets 26 | train_items, eval_items = load_tts_samples( 27 | c.datasets, eval_split=True, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size 28 | ) 29 | 30 | items = train_items + eval_items 31 | 32 | texts = "".join(item["text"] for item in items) 33 | chars = set(texts) 34 | lower_chars = filter(lambda c: c.islower(), chars) 35 | chars_force_lower = [c.lower() for c in chars] 36 | chars_force_lower = set(chars_force_lower) 37 | 38 | print(f" > Number of unique characters: {len(chars)}") 39 | print(f" > Unique characters: {''.join(sorted(chars))}") 40 | print(f" > Unique lower characters: {''.join(sorted(lower_chars))}") 41 | print(f" > Unique all forced to lower characters: {''.join(sorted(chars_force_lower))}") 42 | 43 | 44 | if __name__ == "__main__": 45 | main() 46 | -------------------------------------------------------------------------------- /TTS/bin/find_unique_phonemes.py: -------------------------------------------------------------------------------- 1 | """Find all the unique characters in a dataset""" 2 | import argparse 3 | import multiprocessing 4 | from argparse import RawTextHelpFormatter 5 | 6 | from tqdm.contrib.concurrent import process_map 7 | 8 | from TTS.config import load_config 9 | from TTS.tts.datasets import load_tts_samples 10 | from TTS.tts.utils.text.phonemizers import Gruut 11 | 12 | 13 | def compute_phonemes(item): 14 | text = item["text"] 15 | ph = phonemizer.phonemize(text).replace("|", "") 16 | return set(list(ph)) 17 | 18 | 19 | def main(): 20 | # pylint: disable=W0601 21 | global c, phonemizer 22 | # pylint: disable=bad-option-value 23 | parser = argparse.ArgumentParser( 24 | description="""Find all the unique characters or phonemes in a dataset.\n\n""" 25 | """ 26 | Example runs: 27 | 28 | python TTS/bin/find_unique_phonemes.py --config_path config.json 29 | """, 30 | formatter_class=RawTextHelpFormatter, 31 | ) 32 | parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True) 33 | args = parser.parse_args() 34 | 35 | c = load_config(args.config_path) 36 | 37 | # load all datasets 38 | train_items, eval_items = load_tts_samples( 39 | c.datasets, eval_split=True, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size 40 | ) 41 | items = train_items + eval_items 42 | print("Num items:", len(items)) 43 | 44 | language_list = [item["language"] for item in items] 45 | is_lang_def = all(language_list) 46 | 47 | if not c.phoneme_language or not is_lang_def: 48 | raise ValueError("Phoneme language must be defined in config.") 49 | 50 | if not language_list.count(language_list[0]) == len(language_list): 51 | raise ValueError( 52 | "Currently, just one phoneme language per config file is supported !! Please split the dataset config into different configs and run it individually for each language !!" 
53 | ) 54 | 55 | phonemizer = Gruut(language=language_list[0], keep_puncs=True) 56 | 57 | phonemes = process_map(compute_phonemes, items, max_workers=multiprocessing.cpu_count(), chunksize=15) 58 | phones = [] 59 | for ph in phonemes: 60 | phones.extend(ph) 61 | 62 | phones = set(phones) 63 | lower_phones = filter(lambda c: c.islower(), phones) 64 | phones_force_lower = [c.lower() for c in phones] 65 | phones_force_lower = set(phones_force_lower) 66 | 67 | print(f" > Number of unique phonemes: {len(phones)}") 68 | print(f" > Unique phonemes: {''.join(sorted(phones))}") 69 | print(f" > Unique lower phonemes: {''.join(sorted(lower_phones))}") 70 | print(f" > Unique all forced to lower phonemes: {''.join(sorted(phones_force_lower))}") 71 | 72 | 73 | if __name__ == "__main__": 74 | main() 75 | -------------------------------------------------------------------------------- /TTS/bin/resample.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import glob 3 | import os 4 | from argparse import RawTextHelpFormatter 5 | from multiprocessing import Pool 6 | from shutil import copytree 7 | 8 | import librosa 9 | import soundfile as sf 10 | from tqdm import tqdm 11 | 12 | 13 | def resample_file(func_args): 14 | filename, output_sr = func_args 15 | y, sr = librosa.load(filename, sr=output_sr) 16 | sf.write(filename, y, sr) 17 | 18 | 19 | def resample_files(input_dir, output_sr, output_dir=None, file_ext="wav", n_jobs=10): 20 | if output_dir: 21 | print("Recursively copying the input folder...") 22 | copytree(input_dir, output_dir) 23 | input_dir = output_dir 24 | 25 | print("Resampling the audio files...") 26 | audio_files = glob.glob(os.path.join(input_dir, f"**/*.{file_ext}"), recursive=True) 27 | print(f"Found {len(audio_files)} files...") 28 | audio_files = list(zip(audio_files, len(audio_files) * [output_sr])) 29 | with Pool(processes=n_jobs) as p: 30 | with tqdm(total=len(audio_files)) as pbar: 31 | for _, _ in enumerate(p.imap_unordered(resample_file, audio_files)): 32 | pbar.update() 33 | 34 | print("Done !") 35 | 36 | 37 | if __name__ == "__main__": 38 | parser = argparse.ArgumentParser( 39 | description="""Resample a folder recusively with librosa 40 | Can be used in place or create a copy of the folder as an output.\n\n 41 | Example run: 42 | python TTS/bin/resample.py 43 | --input_dir /root/LJSpeech-1.1/ 44 | --output_sr 22050 45 | --output_dir /root/resampled_LJSpeech-1.1/ 46 | --file_ext wav 47 | --n_jobs 24 48 | """, 49 | formatter_class=RawTextHelpFormatter, 50 | ) 51 | 52 | parser.add_argument( 53 | "--input_dir", 54 | type=str, 55 | default=None, 56 | required=True, 57 | help="Path of the folder containing the audio files to resample", 58 | ) 59 | 60 | parser.add_argument( 61 | "--output_sr", 62 | type=int, 63 | default=22050, 64 | required=False, 65 | help="Samlple rate to which the audio files should be resampled", 66 | ) 67 | 68 | parser.add_argument( 69 | "--output_dir", 70 | type=str, 71 | default=None, 72 | required=False, 73 | help="Path of the destination folder. 
If not defined, the operation is done in place", 74 | ) 75 | 76 | parser.add_argument( 77 | "--file_ext", 78 | type=str, 79 | default="wav", 80 | required=False, 81 | help="Extension of the audio files to resample", 82 | ) 83 | 84 | parser.add_argument( 85 | "--n_jobs", type=int, default=None, help="Number of threads to use, by default it uses all cores" 86 | ) 87 | 88 | args = parser.parse_args() 89 | 90 | resample_files(args.input_dir, args.output_sr, args.output_dir, args.file_ext, args.n_jobs) 91 | -------------------------------------------------------------------------------- /TTS/bin/train_tts.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dataclasses import dataclass, field 3 | 4 | from trainer import Trainer, TrainerArgs 5 | 6 | from TTS.config import load_config, register_config 7 | from TTS.tts.datasets import load_tts_samples 8 | from TTS.tts.models import setup_model 9 | 10 | 11 | @dataclass 12 | class TrainTTSArgs(TrainerArgs): 13 | config_path: str = field(default=None, metadata={"help": "Path to the config file."}) 14 | 15 | 16 | def main(): 17 | """Run `tts` model training directly by a `config.json` file.""" 18 | # init trainer args 19 | train_args = TrainTTSArgs() 20 | parser = train_args.init_argparse(arg_prefix="") 21 | 22 | # override trainer args from comman-line args 23 | args, config_overrides = parser.parse_known_args() 24 | train_args.parse_args(args) 25 | 26 | # load config.json and register 27 | if args.config_path or args.continue_path: 28 | if args.config_path: 29 | # init from a file 30 | config = load_config(args.config_path) 31 | if len(config_overrides) > 0: 32 | config.parse_known_args(config_overrides, relaxed_parser=True) 33 | elif args.continue_path: 34 | # continue from a prev experiment 35 | config = load_config(os.path.join(args.continue_path, "config.json")) 36 | if len(config_overrides) > 0: 37 | config.parse_known_args(config_overrides, relaxed_parser=True) 38 | else: 39 | # init from console args 40 | from TTS.config.shared_configs import BaseTrainingConfig # pylint: disable=import-outside-toplevel 41 | 42 | config_base = BaseTrainingConfig() 43 | config_base.parse_known_args(config_overrides) 44 | config = register_config(config_base.model)() 45 | 46 | # load training samples 47 | train_samples, eval_samples = load_tts_samples( 48 | config.datasets, 49 | eval_split=True, 50 | eval_split_max_size=config.eval_split_max_size, 51 | eval_split_size=config.eval_split_size, 52 | ) 53 | 54 | # init the model from config 55 | model = setup_model(config, train_samples + eval_samples) 56 | 57 | # init the trainer and 🚀 58 | trainer = Trainer( 59 | train_args, 60 | model.config, 61 | config.output_path, 62 | model=model, 63 | train_samples=train_samples, 64 | eval_samples=eval_samples, 65 | parse_command_line_args=False, 66 | ) 67 | trainer.fit() 68 | 69 | 70 | if __name__ == "__main__": 71 | main() 72 | -------------------------------------------------------------------------------- /TTS/bin/train_vocoder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dataclasses import dataclass, field 3 | 4 | from trainer import Trainer, TrainerArgs 5 | 6 | from TTS.config import load_config, register_config 7 | from TTS.utils.audio import AudioProcessor 8 | from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data 9 | from TTS.vocoder.models import setup_model 10 | 11 | 12 | @dataclass 13 | class TrainVocoderArgs(TrainerArgs): 
14 | config_path: str = field(default=None, metadata={"help": "Path to the config file."}) 15 | 16 | 17 | def main(): 18 | """Run `tts` model training directly by a `config.json` file.""" 19 | # init trainer args 20 | train_args = TrainVocoderArgs() 21 | parser = train_args.init_argparse(arg_prefix="") 22 | 23 | # override trainer args from comman-line args 24 | args, config_overrides = parser.parse_known_args() 25 | train_args.parse_args(args) 26 | 27 | # load config.json and register 28 | if args.config_path or args.continue_path: 29 | if args.config_path: 30 | # init from a file 31 | config = load_config(args.config_path) 32 | if len(config_overrides) > 0: 33 | config.parse_known_args(config_overrides, relaxed_parser=True) 34 | elif args.continue_path: 35 | # continue from a prev experiment 36 | config = load_config(os.path.join(args.continue_path, "config.json")) 37 | if len(config_overrides) > 0: 38 | config.parse_known_args(config_overrides, relaxed_parser=True) 39 | else: 40 | # init from console args 41 | from TTS.config.shared_configs import BaseTrainingConfig # pylint: disable=import-outside-toplevel 42 | 43 | config_base = BaseTrainingConfig() 44 | config_base.parse_known_args(config_overrides) 45 | config = register_config(config_base.model)() 46 | 47 | # load training samples 48 | if "feature_path" in config and config.feature_path: 49 | # load pre-computed features 50 | print(f" > Loading features from: {config.feature_path}") 51 | eval_samples, train_samples = load_wav_feat_data(config.data_path, config.feature_path, config.eval_split_size) 52 | else: 53 | # load data raw wav files 54 | eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size) 55 | 56 | # setup audio processor 57 | ap = AudioProcessor(**config.audio) 58 | 59 | # init the model from config 60 | model = setup_model(config) 61 | 62 | # init the trainer and 🚀 63 | trainer = Trainer( 64 | train_args, 65 | config, 66 | config.output_path, 67 | model=model, 68 | train_samples=train_samples, 69 | eval_samples=eval_samples, 70 | training_assets={"audio_processor": ap}, 71 | parse_command_line_args=False, 72 | ) 73 | trainer.fit() 74 | 75 | 76 | if __name__ == "__main__": 77 | main() 78 | -------------------------------------------------------------------------------- /TTS/demos/xtts_ft_demo/requirements.txt: -------------------------------------------------------------------------------- 1 | faster_whisper==0.9.0 2 | gradio==4.7.1 -------------------------------------------------------------------------------- /TTS/encoder/README.md: -------------------------------------------------------------------------------- 1 | ### Speaker Encoder 2 | 3 | This is an implementation of https://arxiv.org/abs/1710.10467. This model can be used for voice and speaker embedding. 4 | 5 | With the code here you can generate d-vectors for both multi-speaker and single-speaker TTS datasets, then visualise and explore them along with the associated audio files in an interactive chart. 6 | 7 | Below is an example showing embedding results of various speakers. You can generate the same plot with the provided notebook as demonstrated in [this video](https://youtu.be/KW3oO7JVa7Q). 8 | 9 | ![](umap.png) 10 | 11 | Download a pretrained model from [Released Models](https://github.com/mozilla/TTS/wiki/Released-Models) page. 12 | 13 | To run the code, you need to follow the same flow as in TTS. 14 | 15 | - Define 'config.json' for your needs. Note that, audio parameters should match your TTS model. 
16 | - Example training call ```python speaker_encoder/train.py --config_path speaker_encoder/config.json --data_path ~/Data/Libri-TTS/train-clean-360``` 17 | - Generate embedding vectors ```python speaker_encoder/compute_embeddings.py --use_cuda true /model/path/best_model.pth model/config/path/config.json dataset/path/ output_path``` . This code parses all .wav files at the given dataset path and generates the same folder structure under the output path with the generated embedding files. 18 | - Watch training on Tensorboard as in TTS 19 | -------------------------------------------------------------------------------- /TTS/encoder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/encoder/__init__.py -------------------------------------------------------------------------------- /TTS/encoder/configs/base_encoder_config.py: -------------------------------------------------------------------------------- 1 | from dataclasses import asdict, dataclass, field 2 | from typing import Dict, List 3 | 4 | from coqpit import MISSING 5 | 6 | from TTS.config.shared_configs import BaseAudioConfig, BaseDatasetConfig, BaseTrainingConfig 7 | 8 | 9 | @dataclass 10 | class BaseEncoderConfig(BaseTrainingConfig): 11 | """Defines parameters for a Generic Encoder model.""" 12 | 13 | model: str = None 14 | audio: BaseAudioConfig = field(default_factory=BaseAudioConfig) 15 | datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()]) 16 | # model params 17 | model_params: Dict = field( 18 | default_factory=lambda: { 19 | "model_name": "lstm", 20 | "input_dim": 80, 21 | "proj_dim": 256, 22 | "lstm_dim": 768, 23 | "num_lstm_layers": 3, 24 | "use_lstm_with_projection": True, 25 | } 26 | ) 27 | 28 | audio_augmentation: Dict = field(default_factory=lambda: {}) 29 | 30 | # training params 31 | epochs: int = 10000 32 | loss: str = "angleproto" 33 | grad_clip: float = 3.0 34 | lr: float = 0.0001 35 | optimizer: str = "radam" 36 | optimizer_params: Dict = field(default_factory=lambda: {"betas": [0.9, 0.999], "weight_decay": 0}) 37 | lr_decay: bool = False 38 | warmup_steps: int = 4000 39 | 40 | # logging params 41 | tb_model_param_stats: bool = False 42 | steps_plot_stats: int = 10 43 | save_step: int = 1000 44 | print_step: int = 20 45 | run_eval: bool = False 46 | 47 | # data loader 48 | num_classes_in_batch: int = MISSING 49 | num_utter_per_class: int = MISSING 50 | eval_num_classes_in_batch: int = None 51 | eval_num_utter_per_class: int = None 52 | 53 | num_loader_workers: int = MISSING 54 | voice_len: float = 1.6 55 | 56 | def check_values(self): 57 | super().check_values() 58 | c = asdict(self) 59 | assert ( 60 | c["model_params"]["input_dim"] == self.audio.num_mels 61 | ), " [!] model input dimendion must be equal to melspectrogram dimension." 
62 | -------------------------------------------------------------------------------- /TTS/encoder/configs/emotion_encoder_config.py: -------------------------------------------------------------------------------- 1 | from dataclasses import asdict, dataclass 2 | 3 | from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig 4 | 5 | 6 | @dataclass 7 | class EmotionEncoderConfig(BaseEncoderConfig): 8 | """Defines parameters for Emotion Encoder model.""" 9 | 10 | model: str = "emotion_encoder" 11 | map_classid_to_classname: dict = None 12 | class_name_key: str = "emotion_name" 13 | -------------------------------------------------------------------------------- /TTS/encoder/configs/speaker_encoder_config.py: -------------------------------------------------------------------------------- 1 | from dataclasses import asdict, dataclass 2 | 3 | from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig 4 | 5 | 6 | @dataclass 7 | class SpeakerEncoderConfig(BaseEncoderConfig): 8 | """Defines parameters for Speaker Encoder model.""" 9 | 10 | model: str = "speaker_encoder" 11 | class_name_key: str = "speaker_name" 12 | -------------------------------------------------------------------------------- /TTS/encoder/requirements.txt: -------------------------------------------------------------------------------- 1 | umap-learn 2 | numpy>=1.17.0 3 | -------------------------------------------------------------------------------- /TTS/encoder/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/encoder/utils/__init__.py -------------------------------------------------------------------------------- /TTS/encoder/utils/visual.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | import umap 5 | 6 | matplotlib.use("Agg") 7 | 8 | 9 | colormap = ( 10 | np.array( 11 | [ 12 | [76, 255, 0], 13 | [0, 127, 70], 14 | [255, 0, 0], 15 | [255, 217, 38], 16 | [0, 135, 255], 17 | [165, 0, 165], 18 | [255, 167, 255], 19 | [0, 255, 255], 20 | [255, 96, 38], 21 | [142, 76, 0], 22 | [33, 0, 127], 23 | [0, 0, 0], 24 | [183, 183, 183], 25 | ], 26 | dtype=float, 27 | ) 28 | / 255 29 | ) 30 | 31 | 32 | def plot_embeddings(embeddings, num_classes_in_batch): 33 | num_utter_per_class = embeddings.shape[0] // num_classes_in_batch 34 | 35 | # if necessary get just the first 10 classes 36 | if num_classes_in_batch > 10: 37 | num_classes_in_batch = 10 38 | embeddings = embeddings[: num_classes_in_batch * num_utter_per_class] 39 | 40 | model = umap.UMAP() 41 | projection = model.fit_transform(embeddings) 42 | ground_truth = np.repeat(np.arange(num_classes_in_batch), num_utter_per_class) 43 | colors = [colormap[i] for i in ground_truth] 44 | fig, ax = plt.subplots(figsize=(16, 10)) 45 | _ = ax.scatter(projection[:, 0], projection[:, 1], c=colors) 46 | plt.gca().set_aspect("equal", "datalim") 47 | plt.title("UMAP projection") 48 | plt.tight_layout() 49 | plt.savefig("umap") 50 | return fig 51 | -------------------------------------------------------------------------------- /TTS/model.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | from typing import Dict 3 | 4 | import torch 5 | from coqpit import Coqpit 6 | from trainer import TrainerModel 7 | 8 | # 
pylint: skip-file 9 | 10 | 11 | class BaseTrainerModel(TrainerModel): 12 | """BaseTrainerModel model expanding TrainerModel with required functions by 🐸TTS. 13 | 14 | Every new 🐸TTS model must inherit it. 15 | """ 16 | 17 | @staticmethod 18 | @abstractmethod 19 | def init_from_config(config: Coqpit): 20 | """Init the model and all its attributes from the given config. 21 | 22 | Override this depending on your model. 23 | """ 24 | ... 25 | 26 | @abstractmethod 27 | def inference(self, input: torch.Tensor, aux_input={}) -> Dict: 28 | """Forward pass for inference. 29 | 30 | It must return a dictionary with the main model output and all the auxiliary outputs. The key ```model_outputs``` 31 | is considered to be the main output and you can add any other auxiliary outputs as you want. 32 | 33 | We don't use `*kwargs` since it is problematic with the TorchScript API. 34 | 35 | Args: 36 | input (torch.Tensor): [description] 37 | aux_input (Dict): Auxiliary inputs like speaker embeddings, durations etc. 38 | 39 | Returns: 40 | Dict: [description] 41 | """ 42 | outputs_dict = {"model_outputs": None} 43 | ... 44 | return outputs_dict 45 | 46 | @abstractmethod 47 | def load_checkpoint( 48 | self, config: Coqpit, checkpoint_path: str, eval: bool = False, strict: bool = True, cache=False 49 | ) -> None: 50 | """Load a model checkpoint file and get ready for training or inference. 51 | 52 | Args: 53 | config (Coqpit): Model configuration. 54 | checkpoint_path (str): Path to the model checkpoint file. 55 | eval (bool, optional): If true, init model for inference else for training. Defaults to False. 56 | strict (bool, optional): Match all checkpoint keys to model's keys. Defaults to True. 57 | cache (bool, optional): If True, cache the file locally for subsequent calls. It is cached under `get_user_data_dir()/tts_cache`. Defaults to False. 58 | """ 59 | ... 60 | -------------------------------------------------------------------------------- /TTS/server/README.md: -------------------------------------------------------------------------------- 1 | # :frog: TTS demo server 2 | Before you use the server, make sure you [install](https://github.com/coqui-ai/TTS/tree/dev#install-tts) :frog: TTS properly. Then, you can follow the steps below. 3 | 4 | **Note:** If you install :frog:TTS using ```pip```, you can also use the ```tts-server``` endpoint on the terminal. 5 | 6 | Example runs: 7 | 8 | List officially released models. 9 | ```python TTS/server/server.py --list_models ``` 10 | 11 | Run the server with the official models. 12 | ```python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/multiband-melgan``` 13 | 14 | Run the server with the official models on a GPU. 15 | ```CUDA_VISIBLE_DEVICES="0" python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/multiband-melgan --use_cuda True``` 16 | 17 | Run the server with a custom model.
18 | ```python TTS/server/server.py --tts_checkpoint /path/to/tts/model.pth --tts_config /path/to/tts/config.json --vocoder_checkpoint /path/to/vocoder/model.pth --vocoder_config /path/to/vocoder/config.json``` 19 | -------------------------------------------------------------------------------- /TTS/server/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/server/__init__.py -------------------------------------------------------------------------------- /TTS/server/conf.json: -------------------------------------------------------------------------------- 1 | { 2 | "tts_path":"/media/erogol/data_ssd/Models/libri_tts/5049/", // tts model root folder 3 | "tts_file":"best_model.pth", // tts checkpoint file 4 | "tts_config":"config.json", // tts config.json file 5 | "tts_speakers": null, // json file listing speaker ids. null if no speaker embedding. 6 | "vocoder_config":null, 7 | "vocoder_file": null, 8 | "is_wavernn_batched":true, 9 | "port": 5002, 10 | "use_cuda": true, 11 | "debug": true 12 | } 13 | -------------------------------------------------------------------------------- /TTS/server/static/coqui-log-green-TTS.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/server/static/coqui-log-green-TTS.png -------------------------------------------------------------------------------- /TTS/server/templates/details.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | TTS engine 12 | 13 | 14 | 17 | 18 | 19 | 30 | 31 | 32 | 33 | Fork me on GitHub 35 | 36 | {% if show_details == true %} 37 | 38 |
[details.html body — HTML table markup stripped during extraction. When `show_details` is true, the template renders a "Model details" panel with three key/value tables: CLI arguments (rows from `args.items()`), Model config (rows from `model_config.items()`, shown only when `model_config != None`), and Vocoder model config (rows from `vocoder_config.items()`, shown only when `vocoder_config != None`). Otherwise it shows the message "Please start server with --show_details=true to see details."]
126 | 127 | {% endif %} 128 | 129 | 130 | 131 | -------------------------------------------------------------------------------- /TTS/tts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/tts/__init__.py -------------------------------------------------------------------------------- /TTS/tts/configs/__init__.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import os 3 | from inspect import isclass 4 | 5 | # import all files under configs/ 6 | # configs_dir = os.path.dirname(__file__) 7 | # for file in os.listdir(configs_dir): 8 | # path = os.path.join(configs_dir, file) 9 | # if not file.startswith("_") and not file.startswith(".") and (file.endswith(".py") or os.path.isdir(path)): 10 | # config_name = file[: file.find(".py")] if file.endswith(".py") else file 11 | # module = importlib.import_module("TTS.tts.configs." + config_name) 12 | # for attribute_name in dir(module): 13 | # attribute = getattr(module, attribute_name) 14 | 15 | # if isclass(attribute): 16 | # # Add the class to this package's variables 17 | # globals()[attribute_name] = attribute 18 | -------------------------------------------------------------------------------- /TTS/tts/configs/tacotron2_config.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | from TTS.tts.configs.tacotron_config import TacotronConfig 4 | 5 | 6 | @dataclass 7 | class Tacotron2Config(TacotronConfig): 8 | """Defines parameters for Tacotron2 based models. 9 | 10 | Example: 11 | 12 | >>> from TTS.tts.configs.tacotron2_config import Tacotron2Config 13 | >>> config = Tacotron2Config() 14 | 15 | Check `TacotronConfig` for argument descriptions. 
16 | """ 17 | 18 | model: str = "tacotron2" 19 | out_channels: int = 80 20 | encoder_in_features: int = 512 21 | decoder_in_features: int = 512 22 | -------------------------------------------------------------------------------- /TTS/tts/layers/__init__.py: -------------------------------------------------------------------------------- 1 | from TTS.tts.layers.losses import * 2 | -------------------------------------------------------------------------------- /TTS/tts/layers/align_tts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/tts/layers/align_tts/__init__.py -------------------------------------------------------------------------------- /TTS/tts/layers/align_tts/duration_predictor.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | from TTS.tts.layers.generic.pos_encoding import PositionalEncoding 4 | from TTS.tts.layers.generic.transformer import FFTransformerBlock 5 | 6 | 7 | class DurationPredictor(nn.Module): 8 | def __init__(self, num_chars, hidden_channels, hidden_channels_ffn, num_heads): 9 | super().__init__() 10 | self.embed = nn.Embedding(num_chars, hidden_channels) 11 | self.pos_enc = PositionalEncoding(hidden_channels, dropout_p=0.1) 12 | self.FFT = FFTransformerBlock(hidden_channels, num_heads, hidden_channels_ffn, 2, 0.1) 13 | self.out_layer = nn.Conv1d(hidden_channels, 1, 1) 14 | 15 | def forward(self, text, text_lengths): 16 | # B, L -> B, L 17 | emb = self.embed(text) 18 | emb = self.pos_enc(emb.transpose(1, 2)) 19 | x = self.FFT(emb, text_lengths) 20 | x = self.out_layer(x).squeeze(-1) 21 | return x 22 | -------------------------------------------------------------------------------- /TTS/tts/layers/align_tts/mdn.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | 4 | class MDNBlock(nn.Module): 5 | """Mixture of Density Network implementation 6 | https://arxiv.org/pdf/2003.01950.pdf 7 | """ 8 | 9 | def __init__(self, in_channels, out_channels): 10 | super().__init__() 11 | self.out_channels = out_channels 12 | self.conv1 = nn.Conv1d(in_channels, in_channels, 1) 13 | self.norm = nn.LayerNorm(in_channels) 14 | self.relu = nn.ReLU() 15 | self.dropout = nn.Dropout(0.1) 16 | self.conv2 = nn.Conv1d(in_channels, out_channels, 1) 17 | 18 | def forward(self, x): 19 | o = self.conv1(x) 20 | o = o.transpose(1, 2) 21 | o = self.norm(o) 22 | o = o.transpose(1, 2) 23 | o = self.relu(o) 24 | o = self.dropout(o) 25 | mu_sigma = self.conv2(o) 26 | # TODO: check this sigmoid 27 | # mu = torch.sigmoid(mu_sigma[:, :self.out_channels//2, :]) 28 | mu = mu_sigma[:, : self.out_channels // 2, :] 29 | log_sigma = mu_sigma[:, self.out_channels // 2 :, :] 30 | return mu, log_sigma 31 | -------------------------------------------------------------------------------- /TTS/tts/layers/bark/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/tts/layers/bark/__init__.py -------------------------------------------------------------------------------- /TTS/tts/layers/bark/hubert/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/tts/layers/bark/hubert/__init__.py -------------------------------------------------------------------------------- /TTS/tts/layers/bark/hubert/hubert_manager.py: -------------------------------------------------------------------------------- 1 | # From https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer 2 | 3 | import os.path 4 | import shutil 5 | import urllib.request 6 | 7 | import huggingface_hub 8 | 9 | 10 | class HubertManager: 11 | @staticmethod 12 | def make_sure_hubert_installed( 13 | download_url: str = "https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt", model_path: str = "" 14 | ): 15 | if not os.path.isfile(model_path): 16 | print("Downloading HuBERT base model") 17 | urllib.request.urlretrieve(download_url, model_path) 18 | print("Downloaded HuBERT") 19 | return model_path 20 | return None 21 | 22 | @staticmethod 23 | def make_sure_tokenizer_installed( 24 | model: str = "quantifier_hubert_base_ls960_14.pth", 25 | repo: str = "GitMylo/bark-voice-cloning", 26 | model_path: str = "", 27 | ): 28 | model_dir = os.path.dirname(model_path) 29 | if not os.path.isfile(model_path): 30 | print("Downloading HuBERT custom tokenizer") 31 | huggingface_hub.hf_hub_download(repo, model, local_dir=model_dir, local_dir_use_symlinks=False) 32 | shutil.move(os.path.join(model_dir, model), model_path) 33 | print("Downloaded tokenizer") 34 | return model_path 35 | return None 36 | -------------------------------------------------------------------------------- /TTS/tts/layers/bark/hubert/kmeans_hubert.py: -------------------------------------------------------------------------------- 1 | """ 2 | Modified HuBERT model without kmeans. 
3 | Original author: https://github.com/lucidrains/ 4 | Modified by: https://www.github.com/gitmylo/ 5 | License: MIT 6 | """ 7 | 8 | # Modified code from https://github.com/lucidrains/audiolm-pytorch/blob/main/audiolm_pytorch/hubert_kmeans.py 9 | 10 | import logging 11 | from pathlib import Path 12 | 13 | import torch 14 | from einops import pack, unpack 15 | from torch import nn 16 | from torchaudio.functional import resample 17 | from transformers import HubertModel 18 | 19 | 20 | def round_down_nearest_multiple(num, divisor): 21 | return num // divisor * divisor 22 | 23 | 24 | def curtail_to_multiple(t, mult, from_left=False): 25 | data_len = t.shape[-1] 26 | rounded_seq_len = round_down_nearest_multiple(data_len, mult) 27 | seq_slice = slice(None, rounded_seq_len) if not from_left else slice(-rounded_seq_len, None) 28 | return t[..., seq_slice] 29 | 30 | 31 | def exists(val): 32 | return val is not None 33 | 34 | 35 | def default(val, d): 36 | return val if exists(val) else d 37 | 38 | 39 | class CustomHubert(nn.Module): 40 | """ 41 | checkpoint and kmeans can be downloaded at https://github.com/facebookresearch/fairseq/tree/main/examples/hubert 42 | or you can train your own 43 | """ 44 | 45 | def __init__(self, checkpoint_path, target_sample_hz=16000, seq_len_multiple_of=None, output_layer=9, device=None): 46 | super().__init__() 47 | self.target_sample_hz = target_sample_hz 48 | self.seq_len_multiple_of = seq_len_multiple_of 49 | self.output_layer = output_layer 50 | if device is not None: 51 | self.to(device) 52 | self.model = HubertModel.from_pretrained("facebook/hubert-base-ls960") 53 | if device is not None: 54 | self.model.to(device) 55 | self.model.eval() 56 | 57 | @property 58 | def groups(self): 59 | return 1 60 | 61 | @torch.no_grad() 62 | def forward(self, wav_input, flatten=True, input_sample_hz=None): 63 | device = wav_input.device 64 | 65 | if exists(input_sample_hz): 66 | wav_input = resample(wav_input, input_sample_hz, self.target_sample_hz) 67 | 68 | if exists(self.seq_len_multiple_of): 69 | wav_input = curtail_to_multiple(wav_input, self.seq_len_multiple_of) 70 | 71 | outputs = self.model.forward( 72 | wav_input, 73 | output_hidden_states=True, 74 | ) 75 | embed = outputs["hidden_states"][self.output_layer] 76 | embed, packed_shape = pack([embed], "* d") 77 | codebook_indices = torch.from_numpy(embed.cpu().detach().numpy()).to(device) 78 | if flatten: 79 | return codebook_indices 80 | 81 | (codebook_indices,) = unpack(codebook_indices, packed_shape, "*") 82 | return codebook_indices 83 | -------------------------------------------------------------------------------- /TTS/tts/layers/delightful_tts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/tts/layers/delightful_tts/__init__.py -------------------------------------------------------------------------------- /TTS/tts/layers/delightful_tts/phoneme_prosody_predictor.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn # pylint: disable=consider-using-from-import 3 | 4 | from TTS.tts.layers.delightful_tts.conv_layers import ConvTransposed 5 | 6 | 7 | class PhonemeProsodyPredictor(nn.Module): 8 | """Non-parallel Prosody Predictor inspired by: https://arxiv.org/pdf/2102.00851.pdf 9 | It consists of 2 layers of 1D convolutions each followed by a relu activation, layer norm 10 
| and dropout, then finally a linear layer. 11 | 12 | Args: 13 | hidden_size (int): Size of hidden channels. 14 | kernel_size (int): Kernel size for the conv layers. 15 | dropout: (float): Probability of dropout. 16 | bottleneck_size (int): bottleneck size for last linear layer. 17 | lrelu_slope (float): Slope of the leaky relu. 18 | """ 19 | 20 | def __init__( 21 | self, 22 | hidden_size: int, 23 | kernel_size: int, 24 | dropout: float, 25 | bottleneck_size: int, 26 | lrelu_slope: float, 27 | ): 28 | super().__init__() 29 | self.d_model = hidden_size 30 | self.layers = nn.ModuleList( 31 | [ 32 | ConvTransposed( 33 | self.d_model, 34 | self.d_model, 35 | kernel_size=kernel_size, 36 | padding=(kernel_size - 1) // 2, 37 | ), 38 | nn.LeakyReLU(lrelu_slope), 39 | nn.LayerNorm(self.d_model), 40 | nn.Dropout(dropout), 41 | ConvTransposed( 42 | self.d_model, 43 | self.d_model, 44 | kernel_size=kernel_size, 45 | padding=(kernel_size - 1) // 2, 46 | ), 47 | nn.LeakyReLU(lrelu_slope), 48 | nn.LayerNorm(self.d_model), 49 | nn.Dropout(dropout), 50 | ] 51 | ) 52 | self.predictor_bottleneck = nn.Linear(self.d_model, bottleneck_size) 53 | 54 | def forward(self, x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor: 55 | """ 56 | Shapes: 57 | x: :math: `[B, T, D]` 58 | mask: :math: `[B, T]` 59 | """ 60 | mask = mask.unsqueeze(2) 61 | for layer in self.layers: 62 | x = layer(x) 63 | x = x.masked_fill(mask, 0.0) 64 | x = self.predictor_bottleneck(x) 65 | return x 66 | -------------------------------------------------------------------------------- /TTS/tts/layers/delightful_tts/variance_predictor.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn # pylint: disable=consider-using-from-import 3 | 4 | from TTS.tts.layers.delightful_tts.conv_layers import ConvTransposed 5 | 6 | 7 | class VariancePredictor(nn.Module): 8 | """ 9 | Network is 2-layer 1D convolutions with leaky relu activation and then 10 | followed by layer normalization then a dropout layer and finally an 11 | extra linear layer to project the hidden states into the output sequence. 12 | 13 | Args: 14 | channels_in (int): Number of in channels for conv layers. 15 | channels_out (int): Number of out channels for the last linear layer. 16 | kernel_size (int): Size the kernel for the conv layers. 17 | p_dropout (float): Probability of dropout. 18 | lrelu_slope (float): Slope for the leaky relu. 19 | 20 | Inputs: inputs, mask 21 | - **inputs** (batch, time, dim): Tensor containing input vector 22 | - **mask** (batch, time): Tensor containing indices to be masked 23 | Returns: 24 | - **outputs** (batch, time): Tensor produced by last linear layer. 
25 | """ 26 | 27 | def __init__( 28 | self, channels_in: int, channels: int, channels_out: int, kernel_size: int, p_dropout: float, lrelu_slope: float 29 | ): 30 | super().__init__() 31 | 32 | self.layers = nn.ModuleList( 33 | [ 34 | ConvTransposed( 35 | channels_in, 36 | channels, 37 | kernel_size=kernel_size, 38 | padding=(kernel_size - 1) // 2, 39 | ), 40 | nn.LeakyReLU(lrelu_slope), 41 | nn.LayerNorm(channels), 42 | nn.Dropout(p_dropout), 43 | ConvTransposed( 44 | channels, 45 | channels, 46 | kernel_size=kernel_size, 47 | padding=(kernel_size - 1) // 2, 48 | ), 49 | nn.LeakyReLU(lrelu_slope), 50 | nn.LayerNorm(channels), 51 | nn.Dropout(p_dropout), 52 | ] 53 | ) 54 | 55 | self.linear_layer = nn.Linear(channels, channels_out) 56 | 57 | def forward(self, x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor: 58 | """ 59 | Shapes: 60 | x: :math: `[B, T_src, C]` 61 | mask: :math: `[B, T_src]` 62 | """ 63 | for layer in self.layers: 64 | x = layer(x) 65 | x = self.linear_layer(x) 66 | x = x.squeeze(-1) 67 | x = x.masked_fill(mask, 0.0) 68 | return x 69 | -------------------------------------------------------------------------------- /TTS/tts/layers/feed_forward/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/tts/layers/feed_forward/__init__.py -------------------------------------------------------------------------------- /TTS/tts/layers/feed_forward/duration_predictor.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | from TTS.tts.layers.generic.res_conv_bn import Conv1dBN 4 | 5 | 6 | class DurationPredictor(nn.Module): 7 | """Speedy Speech duration predictor model. 8 | Predicts phoneme durations from encoder outputs. 9 | 10 | Note: 11 | Outputs interpreted as log(durations) 12 | To get actual durations, do exp transformation 13 | 14 | conv_BN_4x1 -> conv_BN_3x1 -> conv_BN_1x1 -> conv_1x1 15 | 16 | Args: 17 | hidden_channels (int): number of channels in the inner layers. 18 | """ 19 | 20 | def __init__(self, hidden_channels): 21 | super().__init__() 22 | 23 | self.layers = nn.ModuleList( 24 | [ 25 | Conv1dBN(hidden_channels, hidden_channels, 4, 1), 26 | Conv1dBN(hidden_channels, hidden_channels, 3, 1), 27 | Conv1dBN(hidden_channels, hidden_channels, 1, 1), 28 | nn.Conv1d(hidden_channels, 1, 1), 29 | ] 30 | ) 31 | 32 | def forward(self, x, x_mask): 33 | """ 34 | Shapes: 35 | x: [B, C, T] 36 | x_mask: [B, 1, T] 37 | """ 38 | o = x 39 | for layer in self.layers: 40 | o = layer(o) * x_mask 41 | return o 42 | -------------------------------------------------------------------------------- /TTS/tts/layers/generic/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/tts/layers/generic/__init__.py -------------------------------------------------------------------------------- /TTS/tts/layers/generic/gated_conv.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | from .normalization import LayerNorm 4 | 5 | 6 | class GatedConvBlock(nn.Module): 7 | """Gated convolutional block as in https://arxiv.org/pdf/1612.08083.pdf 8 | Args: 9 | in_out_channels (int): number of input/output channels. 10 | kernel_size (int): convolution kernel size. 
11 | dropout_p (float): dropout rate. 12 | """ 13 | 14 | def __init__(self, in_out_channels, kernel_size, dropout_p, num_layers): 15 | super().__init__() 16 | # class arguments 17 | self.dropout_p = dropout_p 18 | self.num_layers = num_layers 19 | # define layers 20 | self.conv_layers = nn.ModuleList() 21 | self.norm_layers = nn.ModuleList() 22 | self.layers = nn.ModuleList() 23 | for _ in range(num_layers): 24 | self.conv_layers += [nn.Conv1d(in_out_channels, 2 * in_out_channels, kernel_size, padding=kernel_size // 2)] 25 | self.norm_layers += [LayerNorm(2 * in_out_channels)] 26 | 27 | def forward(self, x, x_mask): 28 | o = x 29 | res = x 30 | for idx in range(self.num_layers): 31 | o = nn.functional.dropout(o, p=self.dropout_p, training=self.training) 32 | o = self.conv_layers[idx](o * x_mask) 33 | o = self.norm_layers[idx](o) 34 | o = nn.functional.glu(o, dim=1) 35 | o = res + o 36 | res = o 37 | return o 38 | -------------------------------------------------------------------------------- /TTS/tts/layers/generic/pos_encoding.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | from torch import nn 5 | 6 | 7 | class PositionalEncoding(nn.Module): 8 | """Sinusoidal positional encoding for non-recurrent neural networks. 9 | Implementation based on "Attention Is All You Need" 10 | 11 | Args: 12 | channels (int): embedding size 13 | dropout_p (float): dropout rate applied to the output. 14 | max_len (int): maximum sequence length. 15 | use_scale (bool): whether to use a learnable scaling coefficient. 16 | """ 17 | 18 | def __init__(self, channels, dropout_p=0.0, max_len=5000, use_scale=False): 19 | super().__init__() 20 | if channels % 2 != 0: 21 | raise ValueError( 22 | "Cannot use sin/cos positional encoding with " "odd channels (got channels={:d})".format(channels) 23 | ) 24 | self.use_scale = use_scale 25 | if use_scale: 26 | self.scale = torch.nn.Parameter(torch.ones(1)) 27 | pe = torch.zeros(max_len, channels) 28 | position = torch.arange(0, max_len).unsqueeze(1) 29 | div_term = torch.pow(10000, torch.arange(0, channels, 2).float() / channels) 30 | pe[:, 0::2] = torch.sin(position.float() * div_term) 31 | pe[:, 1::2] = torch.cos(position.float() * div_term) 32 | pe = pe.unsqueeze(0).transpose(1, 2) 33 | self.register_buffer("pe", pe) 34 | if dropout_p > 0: 35 | self.dropout = nn.Dropout(p=dropout_p) 36 | self.channels = channels 37 | 38 | def forward(self, x, mask=None, first_idx=None, last_idx=None): 39 | """ 40 | Shapes: 41 | x: [B, C, T] 42 | mask: [B, 1, T] 43 | first_idx: int 44 | last_idx: int 45 | """ 46 | 47 | x = x * math.sqrt(self.channels) 48 | if first_idx is None: 49 | if self.pe.size(2) < x.size(2): 50 | raise RuntimeError( 51 | f"Sequence is {x.size(2)} but PositionalEncoding is" 52 | f" limited to {self.pe.size(2)}. See max_len argument." 
53 | ) 54 | if mask is not None: 55 | pos_enc = self.pe[:, :, : x.size(2)] * mask 56 | else: 57 | pos_enc = self.pe[:, :, : x.size(2)] 58 | if self.use_scale: 59 | x = x + self.scale * pos_enc 60 | else: 61 | x = x + pos_enc 62 | else: 63 | if self.use_scale: 64 | x = x + self.scale * self.pe[:, :, first_idx:last_idx] 65 | else: 66 | x = x + self.pe[:, :, first_idx:last_idx] 67 | if hasattr(self, "dropout"): 68 | x = self.dropout(x) 69 | return x 70 | -------------------------------------------------------------------------------- /TTS/tts/layers/generic/time_depth_sep_conv.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | 5 | class TimeDepthSeparableConv(nn.Module): 6 | """Time depth separable convolution as in https://arxiv.org/pdf/1904.02619.pdf 7 | It shows competative results with less computation and memory footprint.""" 8 | 9 | def __init__(self, in_channels, hid_channels, out_channels, kernel_size, bias=True): 10 | super().__init__() 11 | 12 | self.in_channels = in_channels 13 | self.out_channels = out_channels 14 | self.hid_channels = hid_channels 15 | self.kernel_size = kernel_size 16 | 17 | self.time_conv = nn.Conv1d( 18 | in_channels, 19 | 2 * hid_channels, 20 | kernel_size=1, 21 | stride=1, 22 | padding=0, 23 | bias=bias, 24 | ) 25 | self.norm1 = nn.BatchNorm1d(2 * hid_channels) 26 | self.depth_conv = nn.Conv1d( 27 | hid_channels, 28 | hid_channels, 29 | kernel_size, 30 | stride=1, 31 | padding=(kernel_size - 1) // 2, 32 | groups=hid_channels, 33 | bias=bias, 34 | ) 35 | self.norm2 = nn.BatchNorm1d(hid_channels) 36 | self.time_conv2 = nn.Conv1d( 37 | hid_channels, 38 | out_channels, 39 | kernel_size=1, 40 | stride=1, 41 | padding=0, 42 | bias=bias, 43 | ) 44 | self.norm3 = nn.BatchNorm1d(out_channels) 45 | 46 | def forward(self, x): 47 | x_res = x 48 | x = self.time_conv(x) 49 | x = self.norm1(x) 50 | x = nn.functional.glu(x, dim=1) 51 | x = self.depth_conv(x) 52 | x = self.norm2(x) 53 | x = x * torch.sigmoid(x) 54 | x = self.time_conv2(x) 55 | x = self.norm3(x) 56 | x = x_res + x 57 | return x 58 | 59 | 60 | class TimeDepthSeparableConvBlock(nn.Module): 61 | def __init__(self, in_channels, hid_channels, out_channels, num_layers, kernel_size, bias=True): 62 | super().__init__() 63 | assert (kernel_size - 1) % 2 == 0 64 | assert num_layers > 1 65 | 66 | self.layers = nn.ModuleList() 67 | layer = TimeDepthSeparableConv( 68 | in_channels, hid_channels, out_channels if num_layers == 1 else hid_channels, kernel_size, bias 69 | ) 70 | self.layers.append(layer) 71 | for idx in range(num_layers - 1): 72 | layer = TimeDepthSeparableConv( 73 | hid_channels, 74 | hid_channels, 75 | out_channels if (idx + 1) == (num_layers - 1) else hid_channels, 76 | kernel_size, 77 | bias, 78 | ) 79 | self.layers.append(layer) 80 | 81 | def forward(self, x, mask): 82 | for layer in self.layers: 83 | x = layer(x * mask) 84 | return x 85 | -------------------------------------------------------------------------------- /TTS/tts/layers/glow_tts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/tts/layers/glow_tts/__init__.py -------------------------------------------------------------------------------- /TTS/tts/layers/glow_tts/duration_predictor.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from 
torch import nn 3 | 4 | from ..generic.normalization import LayerNorm 5 | 6 | 7 | class DurationPredictor(nn.Module): 8 | """Glow-TTS duration prediction model. 9 | 10 | :: 11 | 12 | [2 x (conv1d_kxk -> relu -> layer_norm -> dropout)] -> conv1d_1x1 -> durs 13 | 14 | Args: 15 | in_channels (int): Number of channels of the input tensor. 16 | hidden_channels (int): Number of hidden channels of the network. 17 | kernel_size (int): Kernel size for the conv layers. 18 | dropout_p (float): Dropout rate used after each conv layer. 19 | """ 20 | 21 | def __init__(self, in_channels, hidden_channels, kernel_size, dropout_p, cond_channels=None, language_emb_dim=None): 22 | super().__init__() 23 | 24 | # add language embedding dim in the input 25 | if language_emb_dim: 26 | in_channels += language_emb_dim 27 | 28 | # class arguments 29 | self.in_channels = in_channels 30 | self.filter_channels = hidden_channels 31 | self.kernel_size = kernel_size 32 | self.dropout_p = dropout_p 33 | # layers 34 | self.drop = nn.Dropout(dropout_p) 35 | self.conv_1 = nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size // 2) 36 | self.norm_1 = LayerNorm(hidden_channels) 37 | self.conv_2 = nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size // 2) 38 | self.norm_2 = LayerNorm(hidden_channels) 39 | # output layer 40 | self.proj = nn.Conv1d(hidden_channels, 1, 1) 41 | if cond_channels is not None and cond_channels != 0: 42 | self.cond = nn.Conv1d(cond_channels, in_channels, 1) 43 | 44 | if language_emb_dim != 0 and language_emb_dim is not None: 45 | self.cond_lang = nn.Conv1d(language_emb_dim, in_channels, 1) 46 | 47 | def forward(self, x, x_mask, g=None, lang_emb=None): 48 | """ 49 | Shapes: 50 | - x: :math:`[B, C, T]` 51 | - x_mask: :math:`[B, 1, T]` 52 | - g: :math:`[B, C, 1]` 53 | """ 54 | if g is not None: 55 | x = x + self.cond(g) 56 | 57 | if lang_emb is not None: 58 | x = x + self.cond_lang(lang_emb) 59 | 60 | x = self.conv_1(x * x_mask) 61 | x = torch.relu(x) 62 | x = self.norm_1(x) 63 | x = self.drop(x) 64 | x = self.conv_2(x * x_mask) 65 | x = torch.relu(x) 66 | x = self.norm_2(x) 67 | x = self.drop(x) 68 | x = self.proj(x * x_mask) 69 | return x * x_mask 70 | -------------------------------------------------------------------------------- /TTS/tts/layers/overflow/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/tts/layers/overflow/__init__.py -------------------------------------------------------------------------------- /TTS/tts/layers/overflow/decoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | from TTS.tts.layers.glow_tts.decoder import Decoder as GlowDecoder 5 | from TTS.tts.utils.helpers import sequence_mask 6 | 7 | 8 | class Decoder(nn.Module): 9 | """Uses glow decoder with some modifications. 10 | :: 11 | 12 | Squeeze -> ActNorm -> InvertibleConv1x1 -> AffineCoupling -> Unsqueeze 13 | 14 | Args: 15 | in_channels (int): channels of input tensor. 16 | hidden_channels (int): hidden decoder channels. 17 | kernel_size (int): Coupling block kernel size. (Wavenet filter kernel size.) 18 | dilation_rate (int): rate to increase dilation by each layer in a decoder block. 19 | num_flow_blocks (int): number of decoder blocks. 20 | num_coupling_layers (int): number coupling layers. (number of wavenet layers.) 
21 | dropout_p (float): wavenet dropout rate. 22 | sigmoid_scale (bool): enable/disable sigmoid scaling in coupling layer. 23 | """ 24 | 25 | def __init__( 26 | self, 27 | in_channels, 28 | hidden_channels, 29 | kernel_size, 30 | dilation_rate, 31 | num_flow_blocks, 32 | num_coupling_layers, 33 | dropout_p=0.0, 34 | num_splits=4, 35 | num_squeeze=2, 36 | sigmoid_scale=False, 37 | c_in_channels=0, 38 | ): 39 | super().__init__() 40 | 41 | self.glow_decoder = GlowDecoder( 42 | in_channels, 43 | hidden_channels, 44 | kernel_size, 45 | dilation_rate, 46 | num_flow_blocks, 47 | num_coupling_layers, 48 | dropout_p, 49 | num_splits, 50 | num_squeeze, 51 | sigmoid_scale, 52 | c_in_channels, 53 | ) 54 | self.n_sqz = num_squeeze 55 | 56 | def forward(self, x, x_len, g=None, reverse=False): 57 | """ 58 | Input shapes: 59 | - x: :math:`[B, C, T]` 60 | - x_len :math:`[B]` 61 | - g: :math:`[B, C]` 62 | 63 | Output shapes: 64 | - x: :math:`[B, C, T]` 65 | - x_len :math:`[B]` 66 | - logget_tot :math:`[B]` 67 | """ 68 | x, x_len, x_max_len = self.preprocess(x, x_len, x_len.max()) 69 | x_mask = torch.unsqueeze(sequence_mask(x_len, x_max_len), 1).to(x.dtype) 70 | x, logdet_tot = self.glow_decoder(x, x_mask, g, reverse) 71 | return x, x_len, logdet_tot 72 | 73 | def preprocess(self, y, y_lengths, y_max_length): 74 | if y_max_length is not None: 75 | y_max_length = torch.div(y_max_length, self.n_sqz, rounding_mode="floor") * self.n_sqz 76 | y = y[:, :, :y_max_length] 77 | y_lengths = torch.div(y_lengths, self.n_sqz, rounding_mode="floor") * self.n_sqz 78 | return y, y_lengths, y_max_length 79 | 80 | def store_inverse(self): 81 | self.glow_decoder.store_inverse() 82 | -------------------------------------------------------------------------------- /TTS/tts/layers/overflow/plotting_utils.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | import torch 6 | 7 | 8 | def validate_numpy_array(value: Any): 9 | r""" 10 | Validates the input and makes sure it returns a numpy array (i.e on CPU) 11 | 12 | Args: 13 | value (Any): the input value 14 | 15 | Raises: 16 | TypeError: if the value is not a numpy array or torch tensor 17 | 18 | Returns: 19 | np.ndarray: numpy array of the value 20 | """ 21 | if isinstance(value, np.ndarray): 22 | pass 23 | elif isinstance(value, list): 24 | value = np.array(value) 25 | elif torch.is_tensor(value): 26 | value = value.cpu().numpy() 27 | else: 28 | raise TypeError("Value must be a numpy array, a torch tensor or a list") 29 | 30 | return value 31 | 32 | 33 | def get_spec_from_most_probable_state(log_alpha_scaled, means, decoder=None): 34 | """Get the most probable state means from the log_alpha_scaled. 35 | 36 | Args: 37 | log_alpha_scaled (torch.Tensor): Log alpha scaled values. 38 | - Shape: :math:`(T, N)` 39 | means (torch.Tensor): Means of the states. 40 | - Shape: :math:`(N, T, D_out)` 41 | decoder (torch.nn.Module): Decoder module to decode the latent to melspectrogram. Defaults to None. 
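    Returns:
        torch.Tensor: Mel frames decoded from the most probable state means, or the raw means when no decoder is given.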
42 | """ 43 | max_state_numbers = torch.max(log_alpha_scaled, dim=1)[1] 44 | max_len = means.shape[0] 45 | n_mel_channels = means.shape[2] 46 | max_state_numbers = max_state_numbers.unsqueeze(1).unsqueeze(1).expand(max_len, 1, n_mel_channels) 47 | means = torch.gather(means, 1, max_state_numbers).squeeze(1).to(log_alpha_scaled.dtype) 48 | if decoder is not None: 49 | mel = ( 50 | decoder(means.T.unsqueeze(0), torch.tensor([means.shape[0]], device=means.device), reverse=True)[0] 51 | .squeeze(0) 52 | .T 53 | ) 54 | else: 55 | mel = means 56 | return mel 57 | 58 | 59 | def plot_transition_probabilities_to_numpy(states, transition_probabilities, output_fig=False): 60 | """Generates trainsition probabilities plot for the states and the probability of transition. 61 | 62 | Args: 63 | states (torch.IntTensor): the states 64 | transition_probabilities (torch.FloatTensor): the transition probabilities 65 | """ 66 | states = validate_numpy_array(states) 67 | transition_probabilities = validate_numpy_array(transition_probabilities) 68 | 69 | fig, ax = plt.subplots(figsize=(30, 3)) 70 | ax.plot(transition_probabilities, "o") 71 | ax.set_title("Transition probability of state") 72 | ax.set_xlabel("hidden state") 73 | ax.set_ylabel("probability") 74 | ax.set_xticks([i for i in range(len(transition_probabilities))]) # pylint: disable=unnecessary-comprehension 75 | ax.set_xticklabels([int(x) for x in states], rotation=90) 76 | plt.tight_layout() 77 | if not output_fig: 78 | plt.close() 79 | return fig 80 | -------------------------------------------------------------------------------- /TTS/tts/layers/tacotron/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/tts/layers/tacotron/__init__.py -------------------------------------------------------------------------------- /TTS/tts/layers/tortoise/random_latent_generator.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | 8 | def fused_leaky_relu(input, bias=None, negative_slope=0.2, scale=2**0.5): 9 | if bias is not None: 10 | rest_dim = [1] * (input.ndim - bias.ndim - 1) 11 | return ( 12 | F.leaky_relu( 13 | input + bias.view(1, bias.shape[0], *rest_dim), 14 | negative_slope=negative_slope, 15 | ) 16 | * scale 17 | ) 18 | else: 19 | return F.leaky_relu(input, negative_slope=0.2) * scale 20 | 21 | 22 | class EqualLinear(nn.Module): 23 | def __init__(self, in_dim, out_dim, bias=True, bias_init=0, lr_mul=1): 24 | super().__init__() 25 | self.weight = nn.Parameter(torch.randn(out_dim, in_dim).div_(lr_mul)) 26 | if bias: 27 | self.bias = nn.Parameter(torch.zeros(out_dim).fill_(bias_init)) 28 | else: 29 | self.bias = None 30 | self.scale = (1 / math.sqrt(in_dim)) * lr_mul 31 | self.lr_mul = lr_mul 32 | 33 | def forward(self, input): 34 | out = F.linear(input, self.weight * self.scale) 35 | out = fused_leaky_relu(out, self.bias * self.lr_mul) 36 | return out 37 | 38 | 39 | class RandomLatentConverter(nn.Module): 40 | def __init__(self, channels): 41 | super().__init__() 42 | self.layers = nn.Sequential( 43 | *[EqualLinear(channels, channels, lr_mul=0.1) for _ in range(5)], nn.Linear(channels, channels) 44 | ) 45 | self.channels = channels 46 | 47 | def forward(self, ref): 48 | r = torch.randn(ref.shape[0], self.channels, device=ref.device) 49 | y = 
self.layers(r) 50 | return y 51 | 52 | 53 | if __name__ == "__main__": 54 | model = RandomLatentConverter(512) 55 | model(torch.randn(5, 512)) 56 | -------------------------------------------------------------------------------- /TTS/tts/layers/tortoise/tokenizer.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | from tokenizers import Tokenizer 5 | 6 | from TTS.tts.utils.text.cleaners import english_cleaners 7 | 8 | DEFAULT_VOCAB_FILE = os.path.join( 9 | os.path.dirname(os.path.realpath(__file__)), "../../utils/assets/tortoise/tokenizer.json" 10 | ) 11 | 12 | 13 | class VoiceBpeTokenizer: 14 | def __init__(self, vocab_file=DEFAULT_VOCAB_FILE, vocab_str=None): 15 | self.tokenizer = None 16 | if vocab_file is not None: 17 | self.tokenizer = Tokenizer.from_file(vocab_file) 18 | if vocab_str is not None: 19 | self.tokenizer = Tokenizer.from_str(vocab_str) 20 | 21 | def preprocess_text(self, txt): 22 | txt = english_cleaners(txt) 23 | return txt 24 | 25 | def encode(self, txt): 26 | txt = self.preprocess_text(txt) 27 | txt = txt.replace(" ", "[SPACE]") 28 | return self.tokenizer.encode(txt).ids 29 | 30 | def decode(self, seq): 31 | if isinstance(seq, torch.Tensor): 32 | seq = seq.cpu().numpy() 33 | txt = self.tokenizer.decode(seq, skip_special_tokens=False).replace(" ", "") 34 | txt = txt.replace("[SPACE]", " ") 35 | txt = txt.replace("[STOP]", "") 36 | txt = txt.replace("[UNK]", "") 37 | return txt 38 | -------------------------------------------------------------------------------- /TTS/tts/layers/tortoise/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | from urllib import request 3 | 4 | from tqdm import tqdm 5 | 6 | DEFAULT_MODELS_DIR = os.path.join(os.path.expanduser("~"), ".cache", "tortoise", "models") 7 | MODELS_DIR = os.environ.get("TORTOISE_MODELS_DIR", DEFAULT_MODELS_DIR) 8 | MODELS_DIR = "/data/speech_synth/models/" 9 | MODELS = { 10 | "autoregressive.pth": "https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/autoregressive.pth", 11 | "classifier.pth": "https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/classifier.pth", 12 | "clvp2.pth": "https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/clvp2.pth", 13 | "diffusion_decoder.pth": "https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/diffusion_decoder.pth", 14 | "vocoder.pth": "https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/vocoder.pth", 15 | "rlg_auto.pth": "https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/rlg_auto.pth", 16 | "rlg_diffuser.pth": "https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/rlg_diffuser.pth", 17 | } 18 | 19 | 20 | def download_models(specific_models=None): 21 | """ 22 | Call to download all the models that Tortoise uses. 
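    Args:
        specific_models (list, optional): If given, only the listed model files are downloaded. Defaults to None.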
23 | """ 24 | os.makedirs(MODELS_DIR, exist_ok=True) 25 | for model_name, url in MODELS.items(): 26 | if specific_models is not None and model_name not in specific_models: 27 | continue 28 | model_path = os.path.join(MODELS_DIR, model_name) 29 | if os.path.exists(model_path): 30 | continue 31 | print(f"Downloading {model_name} from {url}...") 32 | with tqdm(unit="B", unit_scale=True, unit_divisor=1024, miniters=1) as t: 33 | request.urlretrieve(url, model_path, lambda nb, bs, fs, t=t: t.update(nb * bs - t.n)) 34 | print("Done.") 35 | 36 | 37 | def get_model_path(model_name, models_dir=MODELS_DIR): 38 | """ 39 | Get path to given model, download it if it doesn't exist. 40 | """ 41 | if model_name not in MODELS: 42 | raise ValueError(f"Model {model_name} not found in available models.") 43 | model_path = os.path.join(models_dir, model_name) 44 | if not os.path.exists(model_path) and models_dir == MODELS_DIR: 45 | download_models([model_name]) 46 | return model_path 47 | -------------------------------------------------------------------------------- /TTS/tts/layers/xtts/xtts_manager.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | class SpeakerManager(): 4 | def __init__(self, speaker_file_path=None): 5 | self.speakers = torch.load(speaker_file_path) 6 | 7 | @property 8 | def name_to_id(self): 9 | return self.speakers.keys() 10 | 11 | @property 12 | def num_speakers(self): 13 | return len(self.name_to_id) 14 | 15 | @property 16 | def speaker_names(self): 17 | return list(self.name_to_id.keys()) 18 | 19 | 20 | class LanguageManager(): 21 | def __init__(self, config): 22 | self.langs = config["languages"] 23 | 24 | @property 25 | def name_to_id(self): 26 | return self.langs 27 | 28 | @property 29 | def num_languages(self): 30 | return len(self.name_to_id) 31 | 32 | @property 33 | def language_names(self): 34 | return list(self.name_to_id) 35 | -------------------------------------------------------------------------------- /TTS/tts/models/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, Union 2 | 3 | from TTS.utils.generic_utils import find_module 4 | 5 | 6 | def setup_model(config: "Coqpit", samples: Union[List[List], List[Dict]] = None) -> "BaseTTS": 7 | print(" > Using model: {}".format(config.model)) 8 | # fetch the right model implementation. 
9 | if "base_model" in config and config["base_model"] is not None: 10 | MyModel = find_module("TTS.tts.models", config.base_model.lower()) 11 | else: 12 | MyModel = find_module("TTS.tts.models", config.model.lower()) 13 | model = MyModel.init_from_config(config=config, samples=samples) 14 | return model 15 | -------------------------------------------------------------------------------- /TTS/tts/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/tts/utils/__init__.py -------------------------------------------------------------------------------- /TTS/tts/utils/data.py: -------------------------------------------------------------------------------- 1 | import bisect 2 | 3 | import numpy as np 4 | import torch 5 | 6 | 7 | def _pad_data(x, length): 8 | _pad = 0 9 | assert x.ndim == 1 10 | return np.pad(x, (0, length - x.shape[0]), mode="constant", constant_values=_pad) 11 | 12 | 13 | def prepare_data(inputs): 14 | max_len = max((len(x) for x in inputs)) 15 | return np.stack([_pad_data(x, max_len) for x in inputs]) 16 | 17 | 18 | def _pad_tensor(x, length): 19 | _pad = 0.0 20 | assert x.ndim == 2 21 | x = np.pad(x, [[0, 0], [0, length - x.shape[1]]], mode="constant", constant_values=_pad) 22 | return x 23 | 24 | 25 | def prepare_tensor(inputs, out_steps): 26 | max_len = max((x.shape[1] for x in inputs)) 27 | remainder = max_len % out_steps 28 | pad_len = max_len + (out_steps - remainder) if remainder > 0 else max_len 29 | return np.stack([_pad_tensor(x, pad_len) for x in inputs]) 30 | 31 | 32 | def _pad_stop_target(x: np.ndarray, length: int, pad_val=1) -> np.ndarray: 33 | """Pad stop target array. 34 | 35 | Args: 36 | x (np.ndarray): Stop target array. 37 | length (int): Length after padding. 38 | pad_val (int, optional): Padding value. Defaults to 1. 39 | 40 | Returns: 41 | np.ndarray: Padded stop target array. 
42 | """ 43 | assert x.ndim == 1 44 | return np.pad(x, (0, length - x.shape[0]), mode="constant", constant_values=pad_val) 45 | 46 | 47 | def prepare_stop_target(inputs, out_steps): 48 | """Pad row vectors with 1.""" 49 | max_len = max((x.shape[0] for x in inputs)) 50 | remainder = max_len % out_steps 51 | pad_len = max_len + (out_steps - remainder) if remainder > 0 else max_len 52 | return np.stack([_pad_stop_target(x, pad_len) for x in inputs]) 53 | 54 | 55 | def pad_per_step(inputs, pad_len): 56 | return np.pad(inputs, [[0, 0], [0, 0], [0, pad_len]], mode="constant", constant_values=0.0) 57 | 58 | 59 | def get_length_balancer_weights(items: list, num_buckets=10): 60 | # get all durations 61 | audio_lengths = np.array([item["audio_length"] for item in items]) 62 | # create the $num_buckets buckets classes based in the dataset max and min length 63 | max_length = int(max(audio_lengths)) 64 | min_length = int(min(audio_lengths)) 65 | step = int((max_length - min_length) / num_buckets) + 1 66 | buckets_classes = [i + step for i in range(min_length, (max_length - step) + num_buckets + 1, step)] 67 | # add each sample in their respective length bucket 68 | buckets_names = np.array( 69 | [buckets_classes[bisect.bisect_left(buckets_classes, item["audio_length"])] for item in items] 70 | ) 71 | # count and compute the weights_bucket for each sample 72 | unique_buckets_names = np.unique(buckets_names).tolist() 73 | bucket_ids = [unique_buckets_names.index(l) for l in buckets_names] 74 | bucket_count = np.array([len(np.where(buckets_names == l)[0]) for l in unique_buckets_names]) 75 | weight_bucket = 1.0 / bucket_count 76 | dataset_samples_weight = np.array([weight_bucket[l] for l in bucket_ids]) 77 | # normalize 78 | dataset_samples_weight = dataset_samples_weight / np.linalg.norm(dataset_samples_weight) 79 | return torch.from_numpy(dataset_samples_weight).float() 80 | -------------------------------------------------------------------------------- /TTS/tts/utils/fairseq.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def rehash_fairseq_vits_checkpoint(checkpoint_file): 5 | chk = torch.load(checkpoint_file, map_location=torch.device("cpu"))["model"] 6 | new_chk = {} 7 | for k, v in chk.items(): 8 | if "enc_p." in k: 9 | new_chk[k.replace("enc_p.", "text_encoder.")] = v 10 | elif "dec." in k: 11 | new_chk[k.replace("dec.", "waveform_decoder.")] = v 12 | elif "enc_q." in k: 13 | new_chk[k.replace("enc_q.", "posterior_encoder.")] = v 14 | elif "flow.flows.2." in k: 15 | new_chk[k.replace("flow.flows.2.", "flow.flows.1.")] = v 16 | elif "flow.flows.4." in k: 17 | new_chk[k.replace("flow.flows.4.", "flow.flows.2.")] = v 18 | elif "flow.flows.6." 
in k: 19 | new_chk[k.replace("flow.flows.6.", "flow.flows.3.")] = v 20 | elif "dp.flows.0.m" in k: 21 | new_chk[k.replace("dp.flows.0.m", "duration_predictor.flows.0.translation")] = v 22 | elif "dp.flows.0.logs" in k: 23 | new_chk[k.replace("dp.flows.0.logs", "duration_predictor.flows.0.log_scale")] = v 24 | elif "dp.flows.1" in k: 25 | new_chk[k.replace("dp.flows.1", "duration_predictor.flows.1")] = v 26 | elif "dp.flows.3" in k: 27 | new_chk[k.replace("dp.flows.3", "duration_predictor.flows.2")] = v 28 | elif "dp.flows.5" in k: 29 | new_chk[k.replace("dp.flows.5", "duration_predictor.flows.3")] = v 30 | elif "dp.flows.7" in k: 31 | new_chk[k.replace("dp.flows.7", "duration_predictor.flows.4")] = v 32 | elif "dp.post_flows.0.m" in k: 33 | new_chk[k.replace("dp.post_flows.0.m", "duration_predictor.post_flows.0.translation")] = v 34 | elif "dp.post_flows.0.logs" in k: 35 | new_chk[k.replace("dp.post_flows.0.logs", "duration_predictor.post_flows.0.log_scale")] = v 36 | elif "dp.post_flows.1" in k: 37 | new_chk[k.replace("dp.post_flows.1", "duration_predictor.post_flows.1")] = v 38 | elif "dp.post_flows.3" in k: 39 | new_chk[k.replace("dp.post_flows.3", "duration_predictor.post_flows.2")] = v 40 | elif "dp.post_flows.5" in k: 41 | new_chk[k.replace("dp.post_flows.5", "duration_predictor.post_flows.3")] = v 42 | elif "dp.post_flows.7" in k: 43 | new_chk[k.replace("dp.post_flows.7", "duration_predictor.post_flows.4")] = v 44 | elif "dp." in k: 45 | new_chk[k.replace("dp.", "duration_predictor.")] = v 46 | else: 47 | new_chk[k] = v 48 | return new_chk 49 | -------------------------------------------------------------------------------- /TTS/tts/utils/measures.py: -------------------------------------------------------------------------------- 1 | def alignment_diagonal_score(alignments, binary=False): 2 | """ 3 | Compute how diagonal alignment predictions are. It is useful 4 | to measure the alignment consistency of a model 5 | Args: 6 | alignments (torch.Tensor): batch of alignments. 7 | binary (bool): if True, ignore scores and consider attention 8 | as a binary mask. 9 | Shape: 10 | - alignments : :math:`[B, T_de, T_en]` 11 | """ 12 | maxs = alignments.max(dim=1)[0] 13 | if binary: 14 | maxs[maxs > 0] = 1 15 | return maxs.mean(dim=1).mean(dim=0).item() 16 | -------------------------------------------------------------------------------- /TTS/tts/utils/monotonic_align/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/tts/utils/monotonic_align/__init__.py -------------------------------------------------------------------------------- /TTS/tts/utils/monotonic_align/core.pyx: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | cimport cython 4 | cimport numpy as np 5 | 6 | from cython.parallel import prange 7 | 8 | 9 | @cython.boundscheck(False) 10 | @cython.wraparound(False) 11 | cdef void maximum_path_each(int[:,::1] path, float[:,::1] value, int t_x, int t_y, float max_neg_val) nogil: 12 | cdef int x 13 | cdef int y 14 | cdef float v_prev 15 | cdef float v_cur 16 | cdef float tmp 17 | cdef int index = t_x - 1 18 | 19 | for y in range(t_y): 20 | for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)): 21 | if x == y: 22 | v_cur = max_neg_val 23 | else: 24 | v_cur = value[x, y-1] 25 | if x == 0: 26 | if y == 0: 27 | v_prev = 0. 
28 | else: 29 | v_prev = max_neg_val 30 | else: 31 | v_prev = value[x-1, y-1] 32 | value[x, y] = max(v_cur, v_prev) + value[x, y] 33 | 34 | for y in range(t_y - 1, -1, -1): 35 | path[index, y] = 1 36 | if index != 0 and (index == y or value[index, y-1] < value[index-1, y-1]): 37 | index = index - 1 38 | 39 | 40 | @cython.boundscheck(False) 41 | @cython.wraparound(False) 42 | cpdef void maximum_path_c(int[:,:,::1] paths, float[:,:,::1] values, int[::1] t_xs, int[::1] t_ys, float max_neg_val=-1e9) nogil: 43 | cdef int b = values.shape[0] 44 | 45 | cdef int i 46 | for i in prange(b, nogil=True): 47 | maximum_path_each(paths[i], values[i], t_xs[i], t_ys[i], max_neg_val) 48 | -------------------------------------------------------------------------------- /TTS/tts/utils/monotonic_align/setup.py: -------------------------------------------------------------------------------- 1 | # from distutils.core import setup 2 | # from Cython.Build import cythonize 3 | # import numpy 4 | 5 | # setup(name='monotonic_align', 6 | # ext_modules=cythonize("core.pyx"), 7 | # include_dirs=[numpy.get_include()]) 8 | -------------------------------------------------------------------------------- /TTS/tts/utils/text/__init__.py: -------------------------------------------------------------------------------- 1 | from TTS.tts.utils.text.tokenizer import TTSTokenizer 2 | -------------------------------------------------------------------------------- /TTS/tts/utils/text/bangla/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/tts/utils/text/bangla/__init__.py -------------------------------------------------------------------------------- /TTS/tts/utils/text/belarusian/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/tts/utils/text/belarusian/__init__.py -------------------------------------------------------------------------------- /TTS/tts/utils/text/belarusian/phonemizer.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | finder = None 4 | 5 | 6 | def init(): 7 | try: 8 | import jpype 9 | import jpype.imports 10 | except ModuleNotFoundError: 11 | raise ModuleNotFoundError( 12 | "Belarusian phonemizer requires to install module 'jpype1' manually. Try `pip install jpype1`." 
13 | ) 14 | 15 | try: 16 | jar_path = os.environ["BEL_FANETYKA_JAR"] 17 | except KeyError: 18 | raise KeyError("You need to define 'BEL_FANETYKA_JAR' environment variable as path to the fanetyka.jar file") 19 | 20 | jpype.startJVM(classpath=[jar_path]) 21 | 22 | # import the Java modules 23 | from org.alex73.korpus.base import GrammarDB2, GrammarFinder 24 | 25 | grammar_db = GrammarDB2.initializeFromJar() 26 | global finder 27 | finder = GrammarFinder(grammar_db) 28 | 29 | 30 | def belarusian_text_to_phonemes(text: str) -> str: 31 | # Initialize only on first run 32 | if finder is None: 33 | init() 34 | 35 | from org.alex73.fanetyka.impl import FanetykaText 36 | 37 | return str(FanetykaText(finder, text).ipa) 38 | -------------------------------------------------------------------------------- /TTS/tts/utils/text/chinese_mandarin/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/tts/utils/text/chinese_mandarin/__init__.py -------------------------------------------------------------------------------- /TTS/tts/utils/text/chinese_mandarin/phonemizer.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import jieba 4 | import pypinyin 5 | 6 | from .pinyinToPhonemes import PINYIN_DICT 7 | 8 | 9 | def _chinese_character_to_pinyin(text: str) -> List[str]: 10 | pinyins = pypinyin.pinyin(text, style=pypinyin.Style.TONE3, heteronym=False, neutral_tone_with_five=True) 11 | pinyins_flat_list = [item for sublist in pinyins for item in sublist] 12 | return pinyins_flat_list 13 | 14 | 15 | def _chinese_pinyin_to_phoneme(pinyin: str) -> str: 16 | segment = pinyin[:-1] 17 | tone = pinyin[-1] 18 | phoneme = PINYIN_DICT.get(segment, [""])[0] 19 | return phoneme + tone 20 | 21 | 22 | def chinese_text_to_phonemes(text: str, seperator: str = "|") -> str: 23 | tokenized_text = jieba.cut(text, HMM=False) 24 | tokenized_text = " ".join(tokenized_text) 25 | pinyined_text: List[str] = _chinese_character_to_pinyin(tokenized_text) 26 | 27 | results: List[str] = [] 28 | 29 | for token in pinyined_text: 30 | if token[-1] in "12345": # TODO transform to is_pinyin() 31 | pinyin_phonemes = _chinese_pinyin_to_phoneme(token) 32 | 33 | results += list(pinyin_phonemes) 34 | else: # is ponctuation or other 35 | results += list(token) 36 | 37 | return seperator.join(results) 38 | -------------------------------------------------------------------------------- /TTS/tts/utils/text/english/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/tts/utils/text/english/__init__.py -------------------------------------------------------------------------------- /TTS/tts/utils/text/english/abbreviations.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | # List of (regular expression, replacement) pairs for abbreviations in english: 4 | abbreviations_en = [ 5 | (re.compile("\\b%s\\." 
% x[0], re.IGNORECASE), x[1]) 6 | for x in [ 7 | ("mrs", "misess"), 8 | ("mr", "mister"), 9 | ("dr", "doctor"), 10 | ("st", "saint"), 11 | ("co", "company"), 12 | ("jr", "junior"), 13 | ("maj", "major"), 14 | ("gen", "general"), 15 | ("drs", "doctors"), 16 | ("rev", "reverend"), 17 | ("lt", "lieutenant"), 18 | ("hon", "honorable"), 19 | ("sgt", "sergeant"), 20 | ("capt", "captain"), 21 | ("esq", "esquire"), 22 | ("ltd", "limited"), 23 | ("col", "colonel"), 24 | ("ft", "fort"), 25 | ] 26 | ] 27 | -------------------------------------------------------------------------------- /TTS/tts/utils/text/english/number_norm.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | import re 4 | from typing import Dict 5 | 6 | import inflect 7 | 8 | _inflect = inflect.engine() 9 | _comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])") 10 | _decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)") 11 | _currency_re = re.compile(r"(£|\$|¥)([0-9\,\.]*[0-9]+)") 12 | _ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)") 13 | _number_re = re.compile(r"-?[0-9]+") 14 | 15 | 16 | def _remove_commas(m): 17 | return m.group(1).replace(",", "") 18 | 19 | 20 | def _expand_decimal_point(m): 21 | return m.group(1).replace(".", " point ") 22 | 23 | 24 | def __expand_currency(value: str, inflection: Dict[float, str]) -> str: 25 | parts = value.replace(",", "").split(".") 26 | if len(parts) > 2: 27 | return f"{value} {inflection[2]}" # Unexpected format 28 | text = [] 29 | integer = int(parts[0]) if parts[0] else 0 30 | if integer > 0: 31 | integer_unit = inflection.get(integer, inflection[2]) 32 | text.append(f"{integer} {integer_unit}") 33 | fraction = int(parts[1]) if len(parts) > 1 and parts[1] else 0 34 | if fraction > 0: 35 | fraction_unit = inflection.get(fraction / 100, inflection[0.02]) 36 | text.append(f"{fraction} {fraction_unit}") 37 | if len(text) == 0: 38 | return f"zero {inflection[2]}" 39 | return " ".join(text) 40 | 41 | 42 | def _expand_currency(m: "re.Match") -> str: 43 | currencies = { 44 | "$": { 45 | 0.01: "cent", 46 | 0.02: "cents", 47 | 1: "dollar", 48 | 2: "dollars", 49 | }, 50 | "€": { 51 | 0.01: "cent", 52 | 0.02: "cents", 53 | 1: "euro", 54 | 2: "euros", 55 | }, 56 | "£": { 57 | 0.01: "penny", 58 | 0.02: "pence", 59 | 1: "pound sterling", 60 | 2: "pounds sterling", 61 | }, 62 | "¥": { 63 | # TODO rin 64 | 0.02: "sen", 65 | 2: "yen", 66 | }, 67 | } 68 | unit = m.group(1) 69 | currency = currencies[unit] 70 | value = m.group(2) 71 | return __expand_currency(value, currency) 72 | 73 | 74 | def _expand_ordinal(m): 75 | return _inflect.number_to_words(m.group(0)) 76 | 77 | 78 | def _expand_number(m): 79 | num = int(m.group(0)) 80 | if 1000 < num < 3000: 81 | if num == 2000: 82 | return "two thousand" 83 | if 2000 < num < 2010: 84 | return "two thousand " + _inflect.number_to_words(num % 100) 85 | if num % 100 == 0: 86 | return _inflect.number_to_words(num // 100) + " hundred" 87 | return _inflect.number_to_words(num, andword="", zero="oh", group=2).replace(", ", " ") 88 | return _inflect.number_to_words(num, andword="") 89 | 90 | 91 | def normalize_numbers(text): 92 | text = re.sub(_comma_number_re, _remove_commas, text) 93 | text = re.sub(_currency_re, _expand_currency, text) 94 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 95 | text = re.sub(_ordinal_re, _expand_ordinal, text) 96 | text = re.sub(_number_re, _expand_number, text) 97 | return text 98 | 
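# Usage sketch (illustrative only): normalize_numbers applies the passes above in order:
# strip commas, then expand currency, decimal points, ordinals, and any remaining numbers.
if __name__ == "__main__":
    print(normalize_numbers("The fee was $3.50 on the 2nd of May, 1999."))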
-------------------------------------------------------------------------------- /TTS/tts/utils/text/english/time_norm.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import inflect 4 | 5 | _inflect = inflect.engine() 6 | 7 | _time_re = re.compile( 8 | r"""\b 9 | ((0?[0-9])|(1[0-1])|(1[2-9])|(2[0-3])) # hours 10 | : 11 | ([0-5][0-9]) # minutes 12 | \s*(a\\.m\\.|am|pm|p\\.m\\.|a\\.m|p\\.m)? # am/pm 13 | \b""", 14 | re.IGNORECASE | re.X, 15 | ) 16 | 17 | 18 | def _expand_num(n: int) -> str: 19 | return _inflect.number_to_words(n) 20 | 21 | 22 | def _expand_time_english(match: "re.Match") -> str: 23 | hour = int(match.group(1)) 24 | past_noon = hour >= 12 25 | time = [] 26 | if hour > 12: 27 | hour -= 12 28 | elif hour == 0: 29 | hour = 12 30 | past_noon = True 31 | time.append(_expand_num(hour)) 32 | 33 | minute = int(match.group(6)) 34 | if minute > 0: 35 | if minute < 10: 36 | time.append("oh") 37 | time.append(_expand_num(minute)) 38 | am_pm = match.group(7) 39 | if am_pm is None: 40 | time.append("p m" if past_noon else "a m") 41 | else: 42 | time.extend(list(am_pm.replace(".", ""))) 43 | return " ".join(time) 44 | 45 | 46 | def expand_time_english(text: str) -> str: 47 | return re.sub(_time_re, _expand_time_english, text) 48 | -------------------------------------------------------------------------------- /TTS/tts/utils/text/french/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/tts/utils/text/french/__init__.py -------------------------------------------------------------------------------- /TTS/tts/utils/text/french/abbreviations.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | # List of (regular expression, replacement) pairs for abbreviations in french: 4 | abbreviations_fr = [ 5 | (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) 6 | for x in [ 7 | ("M", "monsieur"), 8 | ("Mlle", "mademoiselle"), 9 | ("Mlles", "mesdemoiselles"), 10 | ("Mme", "Madame"), 11 | ("Mmes", "Mesdames"), 12 | ("N.B", "nota bene"), 13 | ("M", "monsieur"), 14 | ("p.c.q", "parce que"), 15 | ("Pr", "professeur"), 16 | ("qqch", "quelque chose"), 17 | ("rdv", "rendez-vous"), 18 | ("max", "maximum"), 19 | ("min", "minimum"), 20 | ("no", "numéro"), 21 | ("adr", "adresse"), 22 | ("dr", "docteur"), 23 | ("st", "saint"), 24 | ("co", "companie"), 25 | ("jr", "junior"), 26 | ("sgt", "sergent"), 27 | ("capt", "capitain"), 28 | ("col", "colonel"), 29 | ("av", "avenue"), 30 | ("av. J.-C", "avant Jésus-Christ"), 31 | ("apr. 
J.-C", "après Jésus-Christ"), 32 | ("art", "article"), 33 | ("boul", "boulevard"), 34 | ("c.-à-d", "c’est-à-dire"), 35 | ("etc", "et cetera"), 36 | ("ex", "exemple"), 37 | ("excl", "exclusivement"), 38 | ("boul", "boulevard"), 39 | ] 40 | ] + [ 41 | (re.compile("\\b%s" % x[0]), x[1]) 42 | for x in [ 43 | ("Mlle", "mademoiselle"), 44 | ("Mlles", "mesdemoiselles"), 45 | ("Mme", "Madame"), 46 | ("Mmes", "Mesdames"), 47 | ] 48 | ] 49 | -------------------------------------------------------------------------------- /TTS/tts/utils/text/japanese/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/tts/utils/text/japanese/__init__.py -------------------------------------------------------------------------------- /TTS/tts/utils/text/korean/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/tts/utils/text/korean/__init__.py -------------------------------------------------------------------------------- /TTS/tts/utils/text/korean/ko_dictionary.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # Add the word you want to the dictionary. 3 | etc_dictionary = {"1+1": "원플러스원", "2+1": "투플러스원"} 4 | 5 | 6 | english_dictionary = { 7 | "KOREA": "코리아", 8 | "IDOL": "아이돌", 9 | "IT": "아이티", 10 | "IQ": "아이큐", 11 | "UP": "업", 12 | "DOWN": "다운", 13 | "PC": "피씨", 14 | "CCTV": "씨씨티비", 15 | "SNS": "에스엔에스", 16 | "AI": "에이아이", 17 | "CEO": "씨이오", 18 | "A": "에이", 19 | "B": "비", 20 | "C": "씨", 21 | "D": "디", 22 | "E": "이", 23 | "F": "에프", 24 | "G": "지", 25 | "H": "에이치", 26 | "I": "아이", 27 | "J": "제이", 28 | "K": "케이", 29 | "L": "엘", 30 | "M": "엠", 31 | "N": "엔", 32 | "O": "오", 33 | "P": "피", 34 | "Q": "큐", 35 | "R": "알", 36 | "S": "에스", 37 | "T": "티", 38 | "U": "유", 39 | "V": "브이", 40 | "W": "더블유", 41 | "X": "엑스", 42 | "Y": "와이", 43 | "Z": "제트", 44 | } 45 | -------------------------------------------------------------------------------- /TTS/tts/utils/text/korean/korean.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # Code based on https://github.com/carpedm20/multi-speaker-tacotron-tensorflow/blob/master/text/korean.py 3 | import re 4 | 5 | from TTS.tts.utils.text.korean.ko_dictionary import english_dictionary, etc_dictionary 6 | 7 | 8 | def normalize(text): 9 | text = text.strip() 10 | text = re.sub("[⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]", "", text) 11 | text = normalize_with_dictionary(text, etc_dictionary) 12 | text = normalize_english(text) 13 | text = text.lower() 14 | return text 15 | 16 | 17 | def normalize_with_dictionary(text, dic): 18 | if any(key in text for key in dic.keys()): 19 | pattern = re.compile("|".join(re.escape(key) for key in dic.keys())) 20 | return pattern.sub(lambda x: dic[x.group()], text) 21 | return text 22 | 23 | 24 | def normalize_english(text): 25 | def fn(m): 26 | word = m.group() 27 | if word in english_dictionary: 28 | return english_dictionary.get(word) 29 | return word 30 | 31 | text = re.sub("([A-Za-z]+)", fn, text) 32 | return text 33 | -------------------------------------------------------------------------------- /TTS/tts/utils/text/korean/phonemizer.py: -------------------------------------------------------------------------------- 1 
| from jamo import hangul_to_jamo 2 | 3 | from TTS.tts.utils.text.korean.korean import normalize 4 | 5 | g2p = None 6 | 7 | 8 | def korean_text_to_phonemes(text, character: str = "hangeul") -> str: 9 | """ 10 | 11 | The input and output values look the same, but they are different in Unicode. 12 | 13 | example : 14 | 15 | input = '하늘' (Unicode : \ud558\ub298), (하 + 늘) 16 | output = '하늘' (Unicode :\u1112\u1161\u1102\u1173\u11af), (ᄒ + ᅡ + ᄂ + ᅳ + ᆯ) 17 | 18 | """ 19 | global g2p # pylint: disable=global-statement 20 | if g2p is None: 21 | from g2pkk import G2p 22 | 23 | g2p = G2p() 24 | 25 | if character == "english": 26 | from anyascii import anyascii 27 | 28 | text = normalize(text) 29 | text = g2p(text) 30 | text = anyascii(text) 31 | return text 32 | 33 | text = normalize(text) 34 | text = g2p(text) 35 | text = list(hangul_to_jamo(text)) # '하늘' --> ['ᄒ', 'ᅡ', 'ᄂ', 'ᅳ', 'ᆯ'] 36 | return "".join(text) 37 | -------------------------------------------------------------------------------- /TTS/tts/utils/text/phonemizers/__init__.py: -------------------------------------------------------------------------------- 1 | from TTS.tts.utils.text.phonemizers.bangla_phonemizer import BN_Phonemizer 2 | from TTS.tts.utils.text.phonemizers.base import BasePhonemizer 3 | from TTS.tts.utils.text.phonemizers.belarusian_phonemizer import BEL_Phonemizer 4 | from TTS.tts.utils.text.phonemizers.espeak_wrapper import ESpeak 5 | from TTS.tts.utils.text.phonemizers.gruut_wrapper import Gruut 6 | from TTS.tts.utils.text.phonemizers.ko_kr_phonemizer import KO_KR_Phonemizer 7 | from TTS.tts.utils.text.phonemizers.zh_cn_phonemizer import ZH_CN_Phonemizer 8 | 9 | try: 10 | from TTS.tts.utils.text.phonemizers.ja_jp_phonemizer import JA_JP_Phonemizer 11 | except ImportError: 12 | JA_JP_Phonemizer = None 13 | pass 14 | 15 | PHONEMIZERS = {b.name(): b for b in (ESpeak, Gruut, KO_KR_Phonemizer, BN_Phonemizer)} 16 | 17 | 18 | ESPEAK_LANGS = list(ESpeak.supported_languages().keys()) 19 | GRUUT_LANGS = list(Gruut.supported_languages()) 20 | 21 | 22 | # Dict setting default phonemizers for each language 23 | # Add Gruut languages 24 | _ = [Gruut.name()] * len(GRUUT_LANGS) 25 | DEF_LANG_TO_PHONEMIZER = dict(list(zip(GRUUT_LANGS, _))) 26 | 27 | 28 | # Add ESpeak languages and override any existing ones 29 | _ = [ESpeak.name()] * len(ESPEAK_LANGS) 30 | _new_dict = dict(list(zip(list(ESPEAK_LANGS), _))) 31 | DEF_LANG_TO_PHONEMIZER.update(_new_dict) 32 | 33 | 34 | # Force default for some languages 35 | DEF_LANG_TO_PHONEMIZER["en"] = DEF_LANG_TO_PHONEMIZER["en-us"] 36 | DEF_LANG_TO_PHONEMIZER["zh-cn"] = ZH_CN_Phonemizer.name() 37 | DEF_LANG_TO_PHONEMIZER["ko-kr"] = KO_KR_Phonemizer.name() 38 | DEF_LANG_TO_PHONEMIZER["bn"] = BN_Phonemizer.name() 39 | DEF_LANG_TO_PHONEMIZER["be"] = BEL_Phonemizer.name() 40 | 41 | 42 | # JA phonemizer has deal breaking dependencies like MeCab for some systems. 43 | # So we only have it when we have it. 44 | if JA_JP_Phonemizer is not None: 45 | PHONEMIZERS[JA_JP_Phonemizer.name()] = JA_JP_Phonemizer 46 | DEF_LANG_TO_PHONEMIZER["ja-jp"] = JA_JP_Phonemizer.name() 47 | 48 | 49 | def get_phonemizer_by_name(name: str, **kwargs) -> BasePhonemizer: 50 | """Initiate a phonemizer by name 51 | 52 | Args: 53 | name (str): 54 | Name of the phonemizer that should match `phonemizer.name()`. 55 | 56 | kwargs (dict): 57 | Extra keyword arguments that should be passed to the phonemizer. 
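
    Example (a minimal sketch; assumes the requested backend, e.g. `gruut`, is installed):

        >>> phonemizer = get_phonemizer_by_name("gruut", language="en-us")
        >>> phonemizer.phonemize("hello world", separator="|")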
58 | """ 59 | if name == "espeak": 60 | return ESpeak(**kwargs) 61 | if name == "gruut": 62 | return Gruut(**kwargs) 63 | if name == "zh_cn_phonemizer": 64 | return ZH_CN_Phonemizer(**kwargs) 65 | if name == "ja_jp_phonemizer": 66 | if JA_JP_Phonemizer is None: 67 | raise ValueError(" ❗ You need to install JA phonemizer dependencies. Try `pip install TTS[ja]`.") 68 | return JA_JP_Phonemizer(**kwargs) 69 | if name == "ko_kr_phonemizer": 70 | return KO_KR_Phonemizer(**kwargs) 71 | if name == "bn_phonemizer": 72 | return BN_Phonemizer(**kwargs) 73 | if name == "be_phonemizer": 74 | return BEL_Phonemizer(**kwargs) 75 | raise ValueError(f"Phonemizer {name} not found") 76 | 77 | 78 | if __name__ == "__main__": 79 | print(DEF_LANG_TO_PHONEMIZER) 80 | -------------------------------------------------------------------------------- /TTS/tts/utils/text/phonemizers/bangla_phonemizer.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | from TTS.tts.utils.text.bangla.phonemizer import bangla_text_to_phonemes 4 | from TTS.tts.utils.text.phonemizers.base import BasePhonemizer 5 | 6 | _DEF_ZH_PUNCS = "、.,[]()?!〽~『』「」【】" 7 | 8 | 9 | class BN_Phonemizer(BasePhonemizer): 10 | """🐸TTS bn phonemizer using functions in `TTS.tts.utils.text.bangla.phonemizer` 11 | 12 | Args: 13 | punctuations (str): 14 | Set of characters to be treated as punctuation. Defaults to `_DEF_ZH_PUNCS`. 15 | 16 | keep_puncs (bool): 17 | If True, keep the punctuations after phonemization. Defaults to False. 18 | 19 | Example :: 20 | 21 | "这是,样本中文。" -> `d|ʒ|ø|4| |ʂ|ʏ|4| |,| |i|ɑ|ŋ|4|b|œ|n|3| |d|ʒ|o|ŋ|1|w|œ|n|2| |。` 22 | 23 | TODO: someone with Bangla knowledge should check this implementation 24 | """ 25 | 26 | language = "bn" 27 | 28 | def __init__(self, punctuations=_DEF_ZH_PUNCS, keep_puncs=False, **kwargs): # pylint: disable=unused-argument 29 | super().__init__(self.language, punctuations=punctuations, keep_puncs=keep_puncs) 30 | 31 | @staticmethod 32 | def name(): 33 | return "bn_phonemizer" 34 | 35 | @staticmethod 36 | def phonemize_bn(text: str, separator: str = "|") -> str: # pylint: disable=unused-argument 37 | ph = bangla_text_to_phonemes(text) 38 | return ph 39 | 40 | def _phonemize(self, text, separator): 41 | return self.phonemize_bn(text, separator) 42 | 43 | @staticmethod 44 | def supported_languages() -> Dict: 45 | return {"bn": "Bangla"} 46 | 47 | def version(self) -> str: 48 | return "0.0.1" 49 | 50 | def is_available(self) -> bool: 51 | return True 52 | 53 | 54 | if __name__ == "__main__": 55 | txt = "রাসূলুল্লাহ সাল্লাল্লাহু আলাইহি ওয়া সাল্লাম শিক্ষা দিয়েছেন যে, কেউ যদি কোন খারাপ কিছুর সম্মুখীন হয়, তখনও যেন বলে." 56 | e = BN_Phonemizer() 57 | print(e.supported_languages()) 58 | print(e.version()) 59 | print(e.language) 60 | print(e.name()) 61 | print(e.is_available()) 62 | print("`" + e.phonemize(txt) + "`") 63 | -------------------------------------------------------------------------------- /TTS/tts/utils/text/phonemizers/belarusian_phonemizer.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | from TTS.tts.utils.text.belarusian.phonemizer import belarusian_text_to_phonemes 4 | from TTS.tts.utils.text.phonemizers.base import BasePhonemizer 5 | 6 | _DEF_BE_PUNCS = ",!." 
# TODO 7 | 8 | 9 | class BEL_Phonemizer(BasePhonemizer): 10 | """🐸TTS be phonemizer using functions in `TTS.tts.utils.text.belarusian.phonemizer` 11 | 12 | Args: 13 | punctuations (str): 14 | Set of characters to be treated as punctuation. Defaults to `_DEF_BE_PUNCS`. 15 | 16 | keep_puncs (bool): 17 | If True, keep the punctuations after phonemization. Defaults to False. 18 | """ 19 | 20 | language = "be" 21 | 22 | def __init__(self, punctuations=_DEF_BE_PUNCS, keep_puncs=True, **kwargs): # pylint: disable=unused-argument 23 | super().__init__(self.language, punctuations=punctuations, keep_puncs=keep_puncs) 24 | 25 | @staticmethod 26 | def name(): 27 | return "be_phonemizer" 28 | 29 | @staticmethod 30 | def phonemize_be(text: str, separator: str = "|") -> str: # pylint: disable=unused-argument 31 | return belarusian_text_to_phonemes(text) 32 | 33 | def _phonemize(self, text, separator): 34 | return self.phonemize_be(text, separator) 35 | 36 | @staticmethod 37 | def supported_languages() -> Dict: 38 | return {"be": "Belarusian"} 39 | 40 | def version(self) -> str: 41 | return "0.0.1" 42 | 43 | def is_available(self) -> bool: 44 | return True 45 | 46 | 47 | if __name__ == "__main__": 48 | txt = "тэст" 49 | e = BEL_Phonemizer() 50 | print(e.supported_languages()) 51 | print(e.version()) 52 | print(e.language) 53 | print(e.name()) 54 | print(e.is_available()) 55 | print("`" + e.phonemize(txt) + "`") 56 | -------------------------------------------------------------------------------- /TTS/tts/utils/text/phonemizers/ja_jp_phonemizer.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | from TTS.tts.utils.text.japanese.phonemizer import japanese_text_to_phonemes 4 | from TTS.tts.utils.text.phonemizers.base import BasePhonemizer 5 | 6 | _DEF_JA_PUNCS = "、.,[]()?!〽~『』「」【】" 7 | 8 | _TRANS_TABLE = {"、": ","} 9 | 10 | 11 | def trans(text): 12 | for i, j in _TRANS_TABLE.items(): 13 | text = text.replace(i, j) 14 | return text 15 | 16 | 17 | class JA_JP_Phonemizer(BasePhonemizer): 18 | """🐸TTS Ja-Jp phonemizer using functions in `TTS.tts.utils.text.japanese.phonemizer` 19 | 20 | TODO: someone with JA knowledge should check this implementation 21 | 22 | Example: 23 | 24 | >>> from TTS.tts.utils.text.phonemizers import JA_JP_Phonemizer 25 | >>> phonemizer = JA_JP_Phonemizer() 26 | >>> phonemizer.phonemize("どちらに行きますか?", separator="|") 27 | 'd|o|c|h|i|r|a|n|i|i|k|i|m|a|s|u|k|a|?' 28 | 29 | """ 30 | 31 | language = "ja-jp" 32 | 33 | def __init__(self, punctuations=_DEF_JA_PUNCS, keep_puncs=True, **kwargs): # pylint: disable=unused-argument 34 | super().__init__(self.language, punctuations=punctuations, keep_puncs=keep_puncs) 35 | 36 | @staticmethod 37 | def name(): 38 | return "ja_jp_phonemizer" 39 | 40 | def _phonemize(self, text: str, separator: str = "|") -> str: 41 | ph = japanese_text_to_phonemes(text) 42 | if separator is not None or separator != "": 43 | return separator.join(ph) 44 | return ph 45 | 46 | def phonemize(self, text: str, separator="|", language=None) -> str: 47 | """Custom phonemize for JP_JA 48 | 49 | Skip pre-post processing steps used by the other phonemizers. 
50 | """ 51 | return self._phonemize(text, separator) 52 | 53 | @staticmethod 54 | def supported_languages() -> Dict: 55 | return {"ja-jp": "Japanese (Japan)"} 56 | 57 | def version(self) -> str: 58 | return "0.0.1" 59 | 60 | def is_available(self) -> bool: 61 | return True 62 | 63 | 64 | # if __name__ == "__main__": 65 | # text = "これは、電話をかけるための私の日本語の例のテキストです。" 66 | # e = JA_JP_Phonemizer() 67 | # print(e.supported_languages()) 68 | # print(e.version()) 69 | # print(e.language) 70 | # print(e.name()) 71 | # print(e.is_available()) 72 | # print("`" + e.phonemize(text) + "`") 73 | -------------------------------------------------------------------------------- /TTS/tts/utils/text/phonemizers/ko_kr_phonemizer.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | from TTS.tts.utils.text.korean.phonemizer import korean_text_to_phonemes 4 | from TTS.tts.utils.text.phonemizers.base import BasePhonemizer 5 | 6 | _DEF_KO_PUNCS = "、.,[]()?!〽~『』「」【】" 7 | 8 | 9 | class KO_KR_Phonemizer(BasePhonemizer): 10 | """🐸TTS ko_kr_phonemizer using functions in `TTS.tts.utils.text.korean.phonemizer` 11 | 12 | TODO: Add Korean to character (ᄀᄁᄂᄃᄄᄅᄆᄇᄈᄉᄊᄋᄌᄍᄎᄏᄐᄑ하ᅢᅣᅤᅥᅦᅧᅨᅩᅪᅫᅬᅭᅮᅯᅰᅱᅲᅳᅴᅵᆨᆩᆪᆫᆬᆭᆮᆯᆰᆱᆲᆳᆴᆵᆶᆷᆸᆹᆺᆻᆼᆽᆾᆿᇀᇁᇂ) 13 | 14 | Example: 15 | 16 | >>> from TTS.tts.utils.text.phonemizers import KO_KR_Phonemizer 17 | >>> phonemizer = KO_KR_Phonemizer() 18 | >>> phonemizer.phonemize("이 문장은 음성합성 테스트를 위한 문장입니다.", separator="|") 19 | 'ᄋ|ᅵ| |ᄆ|ᅮ|ᆫ|ᄌ|ᅡ|ᆼ|ᄋ|ᅳ| |ᄂ|ᅳ|ᆷ|ᄉ|ᅥ|ᆼ|ᄒ|ᅡ|ᆸ|ᄊ|ᅥ|ᆼ| |ᄐ|ᅦ|ᄉ|ᅳ|ᄐ|ᅳ|ᄅ|ᅳ| |ᄅ|ᅱ|ᄒ|ᅡ|ᆫ| |ᄆ|ᅮ|ᆫ|ᄌ|ᅡ|ᆼ|ᄋ|ᅵ|ᆷ|ᄂ|ᅵ|ᄃ|ᅡ|.' 20 | 21 | >>> from TTS.tts.utils.text.phonemizers import KO_KR_Phonemizer 22 | >>> phonemizer = KO_KR_Phonemizer() 23 | >>> phonemizer.phonemize("이 문장은 음성합성 테스트를 위한 문장입니다.", separator="|", character='english') 24 | 'I| |M|u|n|J|a|n|g|E|u| |N|e|u|m|S|e|o|n|g|H|a|b|S|s|e|o|n|g| |T|e|S|e|u|T|e|u|L|e|u| |L|w|i|H|a|n| |M|u|n|J|a|n|g|I|m|N|i|D|a|.' 25 | 26 | """ 27 | 28 | language = "ko-kr" 29 | 30 | def __init__(self, punctuations=_DEF_KO_PUNCS, keep_puncs=True, **kwargs): # pylint: disable=unused-argument 31 | super().__init__(self.language, punctuations=punctuations, keep_puncs=keep_puncs) 32 | 33 | @staticmethod 34 | def name(): 35 | return "ko_kr_phonemizer" 36 | 37 | def _phonemize(self, text: str, separator: str = "", character: str = "hangeul") -> str: 38 | ph = korean_text_to_phonemes(text, character=character) 39 | if separator is not None or separator != "": 40 | return separator.join(ph) 41 | return ph 42 | 43 | def phonemize(self, text: str, separator: str = "", character: str = "hangeul", language=None) -> str: 44 | return self._phonemize(text, separator, character) 45 | 46 | @staticmethod 47 | def supported_languages() -> Dict: 48 | return {"ko-kr": "hangeul(korean)"} 49 | 50 | def version(self) -> str: 51 | return "0.0.2" 52 | 53 | def is_available(self) -> bool: 54 | return True 55 | 56 | 57 | if __name__ == "__main__": 58 | texts = "이 문장은 음성합성 테스트를 위한 문장입니다." 
59 | e = KO_KR_Phonemizer() 60 | print(e.supported_languages()) 61 | print(e.version()) 62 | print(e.language) 63 | print(e.name()) 64 | print(e.is_available()) 65 | print(e.phonemize(texts)) 66 | -------------------------------------------------------------------------------- /TTS/tts/utils/text/phonemizers/multi_phonemizer.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List 2 | 3 | from TTS.tts.utils.text.phonemizers import DEF_LANG_TO_PHONEMIZER, get_phonemizer_by_name 4 | 5 | 6 | class MultiPhonemizer: 7 | """🐸TTS multi-phonemizer that operates phonemizers for multiple langugages 8 | 9 | Args: 10 | custom_lang_to_phonemizer (Dict): 11 | Custom phonemizer mapping if you want to change the defaults. In the format of 12 | `{"lang_code", "phonemizer_name"}`. When it is None, `DEF_LANG_TO_PHONEMIZER` is used. Defaults to `{}`. 13 | 14 | TODO: find a way to pass custom kwargs to the phonemizers 15 | """ 16 | 17 | lang_to_phonemizer = {} 18 | 19 | def __init__(self, lang_to_phonemizer_name: Dict = {}) -> None: # pylint: disable=dangerous-default-value 20 | for k, v in lang_to_phonemizer_name.items(): 21 | if v == "" and k in DEF_LANG_TO_PHONEMIZER.keys(): 22 | lang_to_phonemizer_name[k] = DEF_LANG_TO_PHONEMIZER[k] 23 | elif v == "": 24 | raise ValueError(f"Phonemizer wasn't set for language {k} and doesn't have a default.") 25 | self.lang_to_phonemizer_name = lang_to_phonemizer_name 26 | self.lang_to_phonemizer = self.init_phonemizers(self.lang_to_phonemizer_name) 27 | 28 | @staticmethod 29 | def init_phonemizers(lang_to_phonemizer_name: Dict) -> Dict: 30 | lang_to_phonemizer = {} 31 | for k, v in lang_to_phonemizer_name.items(): 32 | lang_to_phonemizer[k] = get_phonemizer_by_name(v, language=k) 33 | return lang_to_phonemizer 34 | 35 | @staticmethod 36 | def name(): 37 | return "multi-phonemizer" 38 | 39 | def phonemize(self, text, separator="|", language=""): 40 | if language == "": 41 | raise ValueError("Language must be set for multi-phonemizer to phonemize.") 42 | return self.lang_to_phonemizer[language].phonemize(text, separator) 43 | 44 | def supported_languages(self) -> List: 45 | return list(self.lang_to_phonemizer.keys()) 46 | 47 | def print_logs(self, level: int = 0): 48 | indent = "\t" * level 49 | print(f"{indent}| > phoneme language: {self.supported_languages()}") 50 | print(f"{indent}| > phoneme backend: {self.name()}") 51 | 52 | 53 | # if __name__ == "__main__": 54 | # texts = { 55 | # "tr": "Merhaba, bu Türkçe bit örnek!", 56 | # "en-us": "Hello, this is English example!", 57 | # "de": "Hallo, das ist ein Deutches Beipiel!", 58 | # "zh-cn": "这是中国的例子", 59 | # } 60 | # phonemes = {} 61 | # ph = MultiPhonemizer({"tr": "espeak", "en-us": "", "de": "gruut", "zh-cn": ""}) 62 | # for lang, text in texts.items(): 63 | # phoneme = ph.phonemize(text, lang) 64 | # phonemes[lang] = phoneme 65 | # print(phonemes) 66 | -------------------------------------------------------------------------------- /TTS/tts/utils/text/phonemizers/zh_cn_phonemizer.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | from TTS.tts.utils.text.chinese_mandarin.phonemizer import chinese_text_to_phonemes 4 | from TTS.tts.utils.text.phonemizers.base import BasePhonemizer 5 | 6 | _DEF_ZH_PUNCS = "、.,[]()?!〽~『』「」【】" 7 | 8 | 9 | class ZH_CN_Phonemizer(BasePhonemizer): 10 | """🐸TTS Zh-Cn phonemizer using functions in `TTS.tts.utils.text.chinese_mandarin.phonemizer` 11 | 12 | Args: 13 | 
punctuations (str): 14 | Set of characters to be treated as punctuation. Defaults to `_DEF_ZH_PUNCS`. 15 | 16 | keep_puncs (bool): 17 | If True, keep the punctuations after phonemization. Defaults to False. 18 | 19 | Example :: 20 | 21 | "这是,样本中文。" -> `d|ʒ|ø|4| |ʂ|ʏ|4| |,| |i|ɑ|ŋ|4|b|œ|n|3| |d|ʒ|o|ŋ|1|w|œ|n|2| |。` 22 | 23 | TODO: someone with Mandarin knowledge should check this implementation 24 | """ 25 | 26 | language = "zh-cn" 27 | 28 | def __init__(self, punctuations=_DEF_ZH_PUNCS, keep_puncs=False, **kwargs): # pylint: disable=unused-argument 29 | super().__init__(self.language, punctuations=punctuations, keep_puncs=keep_puncs) 30 | 31 | @staticmethod 32 | def name(): 33 | return "zh_cn_phonemizer" 34 | 35 | @staticmethod 36 | def phonemize_zh_cn(text: str, separator: str = "|") -> str: 37 | ph = chinese_text_to_phonemes(text, separator) 38 | return ph 39 | 40 | def _phonemize(self, text, separator): 41 | return self.phonemize_zh_cn(text, separator) 42 | 43 | @staticmethod 44 | def supported_languages() -> Dict: 45 | return {"zh-cn": "Chinese (China)"} 46 | 47 | def version(self) -> str: 48 | return "0.0.1" 49 | 50 | def is_available(self) -> bool: 51 | return True 52 | 53 | 54 | # if __name__ == "__main__": 55 | # text = "这是,样本中文。" 56 | # e = ZH_CN_Phonemizer() 57 | # print(e.supported_languages()) 58 | # print(e.version()) 59 | # print(e.language) 60 | # print(e.name()) 61 | # print(e.is_available()) 62 | # print("`" + e.phonemize(text) + "`") 63 | -------------------------------------------------------------------------------- /TTS/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/utils/__init__.py -------------------------------------------------------------------------------- /TTS/utils/audio/__init__.py: -------------------------------------------------------------------------------- 1 | from TTS.utils.audio.processor import AudioProcessor 2 | -------------------------------------------------------------------------------- /TTS/utils/capacitron_optimizer.py: -------------------------------------------------------------------------------- 1 | from typing import Generator 2 | 3 | from trainer.trainer_utils import get_optimizer 4 | 5 | 6 | class CapacitronOptimizer: 7 | """Double optimizer class for the Capacitron model.""" 8 | 9 | def __init__(self, config: dict, model_params: Generator) -> None: 10 | self.primary_params, self.secondary_params = self.split_model_parameters(model_params) 11 | 12 | optimizer_names = list(config.optimizer_params.keys()) 13 | optimizer_parameters = list(config.optimizer_params.values()) 14 | 15 | self.primary_optimizer = get_optimizer( 16 | optimizer_names[0], 17 | optimizer_parameters[0], 18 | config.lr, 19 | parameters=self.primary_params, 20 | ) 21 | 22 | self.secondary_optimizer = get_optimizer( 23 | optimizer_names[1], 24 | self.extract_optimizer_parameters(optimizer_parameters[1]), 25 | optimizer_parameters[1]["lr"], 26 | parameters=self.secondary_params, 27 | ) 28 | 29 | self.param_groups = self.primary_optimizer.param_groups 30 | 31 | def first_step(self): 32 | self.secondary_optimizer.step() 33 | self.secondary_optimizer.zero_grad() 34 | self.primary_optimizer.zero_grad() 35 | 36 | def step(self): 37 | # Update param groups to display the correct learning rate 38 | self.param_groups = self.primary_optimizer.param_groups 39 | self.primary_optimizer.step() 40 | 41 | def 
zero_grad(self, set_to_none=False): 42 | self.primary_optimizer.zero_grad(set_to_none) 43 | self.secondary_optimizer.zero_grad(set_to_none) 44 | 45 | def load_state_dict(self, state_dict): 46 | self.primary_optimizer.load_state_dict(state_dict[0]) 47 | self.secondary_optimizer.load_state_dict(state_dict[1]) 48 | 49 | def state_dict(self): 50 | return [self.primary_optimizer.state_dict(), self.secondary_optimizer.state_dict()] 51 | 52 | @staticmethod 53 | def split_model_parameters(model_params: Generator) -> list: 54 | primary_params = [] 55 | secondary_params = [] 56 | for name, param in model_params: 57 | if param.requires_grad: 58 | if name == "capacitron_vae_layer.beta": 59 | secondary_params.append(param) 60 | else: 61 | primary_params.append(param) 62 | return [iter(primary_params), iter(secondary_params)] 63 | 64 | @staticmethod 65 | def extract_optimizer_parameters(params: dict) -> dict: 66 | """Extract parameters that are not the learning rate""" 67 | return {k: v for k, v in params.items() if k != "lr"} 68 | -------------------------------------------------------------------------------- /TTS/utils/distribute.py: -------------------------------------------------------------------------------- 1 | # edited from https://github.com/fastai/imagenet-fast/blob/master/imagenet_nv/distributed.py 2 | import torch 3 | import torch.distributed as dist 4 | 5 | 6 | def reduce_tensor(tensor, num_gpus): 7 | rt = tensor.clone() 8 | dist.all_reduce(rt, op=dist.reduce_op.SUM) 9 | rt /= num_gpus 10 | return rt 11 | 12 | 13 | def init_distributed(rank, num_gpus, group_name, dist_backend, dist_url): 14 | assert torch.cuda.is_available(), "Distributed mode requires CUDA." 15 | 16 | # Set cuda device so everything is done on the right GPU. 17 | torch.cuda.set_device(rank % torch.cuda.device_count()) 18 | 19 | # Initialize distributed communication 20 | dist.init_process_group(dist_backend, init_method=dist_url, world_size=num_gpus, rank=rank, group_name=group_name) 21 | -------------------------------------------------------------------------------- /TTS/utils/io.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle as pickle_tts 3 | from typing import Any, Callable, Dict, Union 4 | 5 | import fsspec 6 | import torch 7 | 8 | from TTS.utils.generic_utils import get_user_data_dir 9 | 10 | 11 | class RenamingUnpickler(pickle_tts.Unpickler): 12 | """Overload default pickler to solve module renaming problem""" 13 | 14 | def find_class(self, module, name): 15 | return super().find_class(module.replace("mozilla_voice_tts", "TTS"), name) 16 | 17 | 18 | class AttrDict(dict): 19 | """A custom dict which converts dict keys 20 | to class attributes""" 21 | 22 | def __init__(self, *args, **kwargs): 23 | super().__init__(*args, **kwargs) 24 | self.__dict__ = self 25 | 26 | 27 | def load_fsspec( 28 | path: str, 29 | map_location: Union[str, Callable, torch.device, Dict[Union[str, torch.device], Union[str, torch.device]]] = None, 30 | cache: bool = True, 31 | **kwargs, 32 | ) -> Any: 33 | """Like torch.load but can load from other locations (e.g. s3:// , gs://). 34 | 35 | Args: 36 | path: Any path or url supported by fsspec. 37 | map_location: torch.device or str. 38 | cache: If True, cache a remote file locally for subsequent calls. It is cached under `get_user_data_dir()/tts_cache`. Defaults to True. 39 | **kwargs: Keyword arguments forwarded to torch.load. 40 | 41 | Returns: 42 | Object stored in path. 
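        Example (illustrative path; loading from a remote location also requires the matching fsspec backend, e.g. `s3fs`)::

            state = load_fsspec("s3://my-bucket/model.pth", map_location="cpu")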
43 | """ 44 | is_local = os.path.isdir(path) or os.path.isfile(path) 45 | if cache and not is_local: 46 | with fsspec.open( 47 | f"filecache::{path}", 48 | filecache={"cache_storage": str(get_user_data_dir("tts_cache"))}, 49 | mode="rb", 50 | ) as f: 51 | return torch.load(f, map_location=map_location, **kwargs) 52 | else: 53 | with fsspec.open(path, "rb") as f: 54 | return torch.load(f, map_location=map_location, **kwargs) 55 | 56 | 57 | def load_checkpoint( 58 | model, checkpoint_path, use_cuda=False, eval=False, cache=False 59 | ): # pylint: disable=redefined-builtin 60 | try: 61 | state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache) 62 | except ModuleNotFoundError: 63 | pickle_tts.Unpickler = RenamingUnpickler 64 | state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), pickle_module=pickle_tts, cache=cache) 65 | model.load_state_dict(state["model"]) 66 | if use_cuda: 67 | model.cuda() 68 | if eval: 69 | model.eval() 70 | return model, state 71 | -------------------------------------------------------------------------------- /TTS/utils/training.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | 5 | def check_update(model, grad_clip, ignore_stopnet=False, amp_opt_params=None): 6 | r"""Check model gradient against unexpected jumps and failures""" 7 | skip_flag = False 8 | if ignore_stopnet: 9 | if not amp_opt_params: 10 | grad_norm = torch.nn.utils.clip_grad_norm_( 11 | [param for name, param in model.named_parameters() if "stopnet" not in name], grad_clip 12 | ) 13 | else: 14 | grad_norm = torch.nn.utils.clip_grad_norm_(amp_opt_params, grad_clip) 15 | else: 16 | if not amp_opt_params: 17 | grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip) 18 | else: 19 | grad_norm = torch.nn.utils.clip_grad_norm_(amp_opt_params, grad_clip) 20 | 21 | # compatibility with different torch versions 22 | if isinstance(grad_norm, float): 23 | if np.isinf(grad_norm): 24 | print(" | > Gradient is INF !!") 25 | skip_flag = True 26 | else: 27 | if torch.isinf(grad_norm): 28 | print(" | > Gradient is INF !!") 29 | skip_flag = True 30 | return grad_norm, skip_flag 31 | 32 | 33 | def gradual_training_scheduler(global_step, config): 34 | """Setup the gradual training schedule wrt number 35 | of active GPUs""" 36 | num_gpus = torch.cuda.device_count() 37 | if num_gpus == 0: 38 | num_gpus = 1 39 | new_values = None 40 | # we set the scheduling wrt num_gpus 41 | for values in config.gradual_training: 42 | if global_step * num_gpus >= values[0]: 43 | new_values = values 44 | return new_values[1], new_values[2] 45 | -------------------------------------------------------------------------------- /TTS/utils/vad.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchaudio 3 | 4 | 5 | def read_audio(path): 6 | wav, sr = torchaudio.load(path) 7 | 8 | if wav.size(0) > 1: 9 | wav = wav.mean(dim=0, keepdim=True) 10 | 11 | return wav.squeeze(0), sr 12 | 13 | 14 | def resample_wav(wav, sr, new_sr): 15 | wav = wav.unsqueeze(0) 16 | transform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=new_sr) 17 | wav = transform(wav) 18 | return wav.squeeze(0) 19 | 20 | 21 | def map_timestamps_to_new_sr(vad_sr, new_sr, timestamps, just_begging_end=False): 22 | factor = new_sr / vad_sr 23 | new_timestamps = [] 24 | if just_begging_end and timestamps: 25 | # get just the start and end timestamps 26 | new_dict = {"start": 
int(timestamps[0]["start"] * factor), "end": int(timestamps[-1]["end"] * factor)} 27 | new_timestamps.append(new_dict) 28 | else: 29 | for ts in timestamps: 30 | # map to the new SR 31 | new_dict = {"start": int(ts["start"] * factor), "end": int(ts["end"] * factor)} 32 | new_timestamps.append(new_dict) 33 | 34 | return new_timestamps 35 | 36 | 37 | def get_vad_model_and_utils(use_cuda=False, use_onnx=False): 38 | model, utils = torch.hub.load( 39 | repo_or_dir="snakers4/silero-vad", model="silero_vad", force_reload=True, onnx=use_onnx, force_onnx_cpu=True 40 | ) 41 | if use_cuda: 42 | model = model.cuda() 43 | 44 | get_speech_timestamps, save_audio, _, _, collect_chunks = utils 45 | return model, get_speech_timestamps, save_audio, collect_chunks 46 | 47 | 48 | def remove_silence( 49 | model_and_utils, audio_path, out_path, vad_sample_rate=8000, trim_just_beginning_and_end=True, use_cuda=False 50 | ): 51 | # get the VAD model and utils functions 52 | model, get_speech_timestamps, _, collect_chunks = model_and_utils 53 | 54 | # read ground truth wav and resample the audio for the VAD 55 | try: 56 | wav, gt_sample_rate = read_audio(audio_path) 57 | except: 58 | print(f"> ❗ Failed to read {audio_path}") 59 | return None, False 60 | 61 | # if needed, resample the audio for the VAD model 62 | if gt_sample_rate != vad_sample_rate: 63 | wav_vad = resample_wav(wav, gt_sample_rate, vad_sample_rate) 64 | else: 65 | wav_vad = wav 66 | 67 | if use_cuda: 68 | wav_vad = wav_vad.cuda() 69 | 70 | # get speech timestamps from full audio file 71 | speech_timestamps = get_speech_timestamps(wav_vad, model, sampling_rate=vad_sample_rate, window_size_samples=768) 72 | 73 | # map the current speech_timestamps to the sample rate of the ground truth audio 74 | new_speech_timestamps = map_timestamps_to_new_sr( 75 | vad_sample_rate, gt_sample_rate, speech_timestamps, trim_just_beginning_and_end 76 | ) 77 | 78 | # if have speech timestamps else save the wav 79 | if new_speech_timestamps: 80 | wav = collect_chunks(new_speech_timestamps, wav) 81 | is_speech = True 82 | else: 83 | print(f"> The file {audio_path} probably does not have speech please check it !!") 84 | is_speech = False 85 | 86 | # save 87 | torchaudio.save(out_path, wav[None, :], gt_sample_rate) 88 | return out_path, is_speech 89 | -------------------------------------------------------------------------------- /TTS/vc/configs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/vc/configs/__init__.py -------------------------------------------------------------------------------- /TTS/vc/models/__init__.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import re 3 | from typing import Dict, List, Union 4 | 5 | 6 | def to_camel(text): 7 | text = text.capitalize() 8 | return re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text) 9 | 10 | 11 | def setup_model(config: "Coqpit", samples: Union[List[List], List[Dict]] = None) -> "BaseVC": 12 | print(" > Using model: {}".format(config.model)) 13 | # fetch the right model implementation. 
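    # NOTE: FreeVC is the only voice-conversion model handled below; for any other
    # `config.model` value the function falls through and implicitly returns None,
    # so callers should check the returned object.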
14 | if "model" in config and config["model"].lower() == "freevc": 15 | MyModel = importlib.import_module("TTS.vc.models.freevc").FreeVC 16 | model = MyModel.init_from_config(config, samples) 17 | return model 18 | -------------------------------------------------------------------------------- /TTS/vc/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/vc/modules/__init__.py -------------------------------------------------------------------------------- /TTS/vc/modules/freevc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/vc/modules/freevc/__init__.py -------------------------------------------------------------------------------- /TTS/vc/modules/freevc/speaker_encoder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/vc/modules/freevc/speaker_encoder/__init__.py -------------------------------------------------------------------------------- /TTS/vc/modules/freevc/speaker_encoder/audio.py: -------------------------------------------------------------------------------- 1 | import struct 2 | from pathlib import Path 3 | from typing import Optional, Union 4 | 5 | # import webrtcvad 6 | import librosa 7 | import numpy as np 8 | from scipy.ndimage.morphology import binary_dilation 9 | 10 | from TTS.vc.modules.freevc.speaker_encoder.hparams import * 11 | 12 | int16_max = (2**15) - 1 13 | 14 | 15 | def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray], source_sr: Optional[int] = None): 16 | """ 17 | Applies the preprocessing operations used in training the Speaker Encoder to a waveform 18 | either on disk or in memory. The waveform will be resampled to match the data hyperparameters. 19 | 20 | :param fpath_or_wav: either a filepath to an audio file (many extensions are supported, not 21 | just .wav), either the waveform as a numpy array of floats. 22 | :param source_sr: if passing an audio waveform, the sampling rate of the waveform before 23 | preprocessing. After preprocessing, the waveform's sampling rate will match the data 24 | hyperparameters. If passing a filepath, the sampling rate will be automatically detected and 25 | this argument will be ignored. 26 | """ 27 | # Load the wav from disk if needed 28 | if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path): 29 | wav, source_sr = librosa.load(fpath_or_wav, sr=None) 30 | else: 31 | wav = fpath_or_wav 32 | 33 | # Resample the wav if needed 34 | if source_sr is not None and source_sr != sampling_rate: 35 | wav = librosa.resample(wav, source_sr, sampling_rate) 36 | 37 | # Apply the preprocessing: normalize volume and shorten long silences 38 | wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True) 39 | wav = trim_long_silences(wav) 40 | 41 | return wav 42 | 43 | 44 | def wav_to_mel_spectrogram(wav): 45 | """ 46 | Derives a mel spectrogram ready to be used by the encoder from a preprocessed audio waveform. 47 | Note: this not a log-mel spectrogram. 
48 | """ 49 | frames = librosa.feature.melspectrogram( 50 | y=wav, 51 | sr=sampling_rate, 52 | n_fft=int(sampling_rate * mel_window_length / 1000), 53 | hop_length=int(sampling_rate * mel_window_step / 1000), 54 | n_mels=mel_n_channels, 55 | ) 56 | return frames.astype(np.float32).T 57 | 58 | 59 | def normalize_volume(wav, target_dBFS, increase_only=False, decrease_only=False): 60 | if increase_only and decrease_only: 61 | raise ValueError("Both increase only and decrease only are set") 62 | dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav**2)) 63 | if (dBFS_change < 0 and increase_only) or (dBFS_change > 0 and decrease_only): 64 | return wav 65 | return wav * (10 ** (dBFS_change / 20)) 66 | -------------------------------------------------------------------------------- /TTS/vc/modules/freevc/speaker_encoder/hparams.py: -------------------------------------------------------------------------------- 1 | ## Mel-filterbank 2 | mel_window_length = 25 # In milliseconds 3 | mel_window_step = 10 # In milliseconds 4 | mel_n_channels = 40 5 | 6 | 7 | ## Audio 8 | sampling_rate = 16000 9 | # Number of spectrogram frames in a partial utterance 10 | partials_n_frames = 160 # 1600 ms 11 | 12 | 13 | ## Voice Activation Detection 14 | # Window size of the VAD. Must be either 10, 20 or 30 milliseconds. 15 | # This sets the granularity of the VAD. Should not need to be changed. 16 | vad_window_length = 30 # In milliseconds 17 | # Number of frames to average together when performing the moving average smoothing. 18 | # The larger this value, the larger the VAD variations must be to not get smoothed out. 19 | vad_moving_average_width = 8 20 | # Maximum number of consecutive silent frames a segment can have. 21 | vad_max_silence_length = 6 22 | 23 | 24 | ## Audio volume normalization 25 | audio_norm_target_dBFS = -30 26 | 27 | 28 | ## Model parameters 29 | model_hidden_size = 256 30 | model_embedding_size = 256 31 | model_num_layers = 3 32 | -------------------------------------------------------------------------------- /TTS/vc/modules/freevc/wavlm/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import urllib.request 3 | 4 | import torch 5 | 6 | from TTS.utils.generic_utils import get_user_data_dir 7 | from TTS.vc.modules.freevc.wavlm.wavlm import WavLM, WavLMConfig 8 | 9 | model_uri = "https://github.com/coqui-ai/TTS/releases/download/v0.13.0_models/WavLM-Large.pt" 10 | 11 | 12 | def get_wavlm(device="cpu"): 13 | """Download the model and return the model object.""" 14 | 15 | output_path = get_user_data_dir("tts") 16 | 17 | output_path = os.path.join(output_path, "wavlm") 18 | if not os.path.exists(output_path): 19 | os.makedirs(output_path) 20 | 21 | output_path = os.path.join(output_path, "WavLM-Large.pt") 22 | if not os.path.exists(output_path): 23 | print(f" > Downloading WavLM model to {output_path} ...") 24 | urllib.request.urlretrieve(model_uri, output_path) 25 | 26 | checkpoint = torch.load(output_path, map_location=torch.device(device)) 27 | cfg = WavLMConfig(checkpoint["cfg"]) 28 | wavlm = WavLM(cfg).to(device) 29 | wavlm.load_state_dict(checkpoint["model"]) 30 | wavlm.eval() 31 | return wavlm 32 | 33 | 34 | if __name__ == "__main__": 35 | wavlm = get_wavlm() 36 | -------------------------------------------------------------------------------- /TTS/vc/modules/freevc/wavlm/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_name_or_path": "./wavlm-large/", 3 | 
"activation_dropout": 0.0, 4 | "adapter_kernel_size": 3, 5 | "adapter_stride": 2, 6 | "add_adapter": false, 7 | "apply_spec_augment": true, 8 | "architectures": [ 9 | "WavLMModel" 10 | ], 11 | "attention_dropout": 0.1, 12 | "bos_token_id": 1, 13 | "classifier_proj_size": 256, 14 | "codevector_dim": 768, 15 | "contrastive_logits_temperature": 0.1, 16 | "conv_bias": false, 17 | "conv_dim": [ 18 | 512, 19 | 512, 20 | 512, 21 | 512, 22 | 512, 23 | 512, 24 | 512 25 | ], 26 | "conv_kernel": [ 27 | 10, 28 | 3, 29 | 3, 30 | 3, 31 | 3, 32 | 2, 33 | 2 34 | ], 35 | "conv_stride": [ 36 | 5, 37 | 2, 38 | 2, 39 | 2, 40 | 2, 41 | 2, 42 | 2 43 | ], 44 | "ctc_loss_reduction": "sum", 45 | "ctc_zero_infinity": false, 46 | "diversity_loss_weight": 0.1, 47 | "do_stable_layer_norm": true, 48 | "eos_token_id": 2, 49 | "feat_extract_activation": "gelu", 50 | "feat_extract_dropout": 0.0, 51 | "feat_extract_norm": "layer", 52 | "feat_proj_dropout": 0.1, 53 | "feat_quantizer_dropout": 0.0, 54 | "final_dropout": 0.0, 55 | "gradient_checkpointing": false, 56 | "hidden_act": "gelu", 57 | "hidden_dropout": 0.1, 58 | "hidden_size": 1024, 59 | "initializer_range": 0.02, 60 | "intermediate_size": 4096, 61 | "layer_norm_eps": 1e-05, 62 | "layerdrop": 0.1, 63 | "mask_channel_length": 10, 64 | "mask_channel_min_space": 1, 65 | "mask_channel_other": 0.0, 66 | "mask_channel_prob": 0.0, 67 | "mask_channel_selection": "static", 68 | "mask_feature_length": 10, 69 | "mask_feature_min_masks": 0, 70 | "mask_feature_prob": 0.0, 71 | "mask_time_length": 10, 72 | "mask_time_min_masks": 2, 73 | "mask_time_min_space": 1, 74 | "mask_time_other": 0.0, 75 | "mask_time_prob": 0.075, 76 | "mask_time_selection": "static", 77 | "max_bucket_distance": 800, 78 | "model_type": "wavlm", 79 | "num_adapter_layers": 3, 80 | "num_attention_heads": 16, 81 | "num_buckets": 320, 82 | "num_codevector_groups": 2, 83 | "num_codevectors_per_group": 320, 84 | "num_conv_pos_embedding_groups": 16, 85 | "num_conv_pos_embeddings": 128, 86 | "num_ctc_classes": 80, 87 | "num_feat_extract_layers": 7, 88 | "num_hidden_layers": 24, 89 | "num_negatives": 100, 90 | "output_hidden_size": 1024, 91 | "pad_token_id": 0, 92 | "proj_codevector_dim": 768, 93 | "replace_prob": 0.5, 94 | "tokenizer_class": "Wav2Vec2CTCTokenizer", 95 | "torch_dtype": "float32", 96 | "transformers_version": "4.15.0.dev0", 97 | "use_weighted_layer_sum": false, 98 | "vocab_size": 32 99 | } -------------------------------------------------------------------------------- /TTS/vocoder/README.md: -------------------------------------------------------------------------------- 1 | # Mozilla TTS Vocoders (Experimental) 2 | 3 | Here there are vocoder model implementations which can be combined with the other TTS models. 4 | 5 | Currently, following models are implemented: 6 | 7 | - Melgan 8 | - MultiBand-Melgan 9 | - ParallelWaveGAN 10 | - GAN-TTS (Discriminator Only) 11 | 12 | It is also very easy to adapt different vocoder models as we provide a flexible and modular (but not too modular) framework. 13 | 14 | ## Training a model 15 | 16 | You can see here an example (Soon)[Colab Notebook]() training MelGAN with LJSpeech dataset. 17 | 18 | In order to train a new model, you need to gather all wav files into a folder and give this folder to `data_path` in '''config.json''' 19 | 20 | You need to define other relevant parameters in your ```config.json``` and then start traning with the following command. 
21 | 22 | ```CUDA_VISIBLE_DEVICES='0' python tts/bin/train_vocoder.py --config_path path/to/config.json``` 23 | 24 | Example config files can be found under `tts/vocoder/configs/` folder. 25 | 26 | You can continue a previous training run by the following command. 27 | 28 | ```CUDA_VISIBLE_DEVICES='0' python tts/bin/train_vocoder.py --continue_path path/to/your/model/folder``` 29 | 30 | You can fine-tune a pre-trained model by the following command. 31 | 32 | ```CUDA_VISIBLE_DEVICES='0' python tts/bin/train_vocoder.py --restore_path path/to/your/model.pth``` 33 | 34 | Restoring a model starts a new training in a different folder. It only restores model weights with the given checkpoint file. However, continuing a training starts from the same directory where the previous training run left off. 35 | 36 | You can also follow your training runs on Tensorboard as you do with our TTS models. 37 | 38 | ## Acknowledgement 39 | Thanks to @kan-bayashi for his [repository](https://github.com/kan-bayashi/ParallelWaveGAN) being the start point of our work. 40 | -------------------------------------------------------------------------------- /TTS/vocoder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/vocoder/__init__.py -------------------------------------------------------------------------------- /TTS/vocoder/configs/__init__.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import os 3 | from inspect import isclass 4 | 5 | # import all files under configs/ 6 | configs_dir = os.path.dirname(__file__) 7 | for file in os.listdir(configs_dir): 8 | path = os.path.join(configs_dir, file) 9 | if not file.startswith("_") and not file.startswith(".") and (file.endswith(".py") or os.path.isdir(path)): 10 | config_name = file[: file.find(".py")] if file.endswith(".py") else file 11 | module = importlib.import_module("TTS.vocoder.configs." 
+ config_name) 12 | for attribute_name in dir(module): 13 | attribute = getattr(module, attribute_name) 14 | 15 | if isclass(attribute): 16 | # Add the class to this package's variables 17 | globals()[attribute_name] = attribute 18 | -------------------------------------------------------------------------------- /TTS/vocoder/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from coqpit import Coqpit 4 | from torch.utils.data import Dataset 5 | 6 | from TTS.utils.audio import AudioProcessor 7 | from TTS.vocoder.datasets.gan_dataset import GANDataset 8 | from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data 9 | from TTS.vocoder.datasets.wavegrad_dataset import WaveGradDataset 10 | from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset 11 | 12 | 13 | def setup_dataset(config: Coqpit, ap: AudioProcessor, is_eval: bool, data_items: List, verbose: bool) -> Dataset: 14 | if config.model.lower() in "gan": 15 | dataset = GANDataset( 16 | ap=ap, 17 | items=data_items, 18 | seq_len=config.seq_len, 19 | hop_len=ap.hop_length, 20 | pad_short=config.pad_short, 21 | conv_pad=config.conv_pad, 22 | return_pairs=config.diff_samples_for_G_and_D if "diff_samples_for_G_and_D" in config else False, 23 | is_training=not is_eval, 24 | return_segments=not is_eval, 25 | use_noise_augment=config.use_noise_augment, 26 | use_cache=config.use_cache, 27 | verbose=verbose, 28 | ) 29 | dataset.shuffle_mapping() 30 | elif config.model.lower() == "wavegrad": 31 | dataset = WaveGradDataset( 32 | ap=ap, 33 | items=data_items, 34 | seq_len=config.seq_len, 35 | hop_len=ap.hop_length, 36 | pad_short=config.pad_short, 37 | conv_pad=config.conv_pad, 38 | is_training=not is_eval, 39 | return_segments=True, 40 | use_noise_augment=False, 41 | use_cache=config.use_cache, 42 | verbose=verbose, 43 | ) 44 | elif config.model.lower() == "wavernn": 45 | dataset = WaveRNNDataset( 46 | ap=ap, 47 | items=data_items, 48 | seq_len=config.seq_len, 49 | hop_len=ap.hop_length, 50 | pad=config.model_params.pad, 51 | mode=config.model_params.mode, 52 | mulaw=config.model_params.mulaw, 53 | is_training=not is_eval, 54 | verbose=verbose, 55 | ) 56 | else: 57 | raise ValueError(f" [!] Dataset for model {config.model.lower()} cannot be found.") 58 | return dataset 59 | -------------------------------------------------------------------------------- /TTS/vocoder/datasets/preprocess.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | from pathlib import Path 4 | 5 | import numpy as np 6 | from coqpit import Coqpit 7 | from tqdm import tqdm 8 | 9 | from TTS.utils.audio import AudioProcessor 10 | from TTS.utils.audio.numpy_transforms import mulaw_encode, quantize 11 | 12 | 13 | def preprocess_wav_files(out_path: str, config: Coqpit, ap: AudioProcessor): 14 | """Process wav and compute mel and quantized wave signal. 15 | It is mainly used by WaveRNN dataloader. 16 | 17 | Args: 18 | out_path (str): Parent folder path to save the files. 19 | config (Coqpit): Model config. 20 | ap (AudioProcessor): Audio processor. 
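    Example (sketch; assumes `config` is a WaveRNN-style config whose `data_path` points at a folder of wav files)::

        ap = AudioProcessor.init_from_config(config)
        preprocess_wav_files("wavernn_cache/", config, ap)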
21 | """ 22 | os.makedirs(os.path.join(out_path, "quant"), exist_ok=True) 23 | os.makedirs(os.path.join(out_path, "mel"), exist_ok=True) 24 | wav_files = find_wav_files(config.data_path) 25 | for path in tqdm(wav_files): 26 | wav_name = Path(path).stem 27 | quant_path = os.path.join(out_path, "quant", wav_name + ".npy") 28 | mel_path = os.path.join(out_path, "mel", wav_name + ".npy") 29 | y = ap.load_wav(path) 30 | mel = ap.melspectrogram(y) 31 | np.save(mel_path, mel) 32 | if isinstance(config.mode, int): 33 | quant = ( 34 | mulaw_encode(wav=y, mulaw_qc=config.mode) 35 | if config.model_args.mulaw 36 | else quantize(x=y, quantize_bits=config.mode) 37 | ) 38 | np.save(quant_path, quant) 39 | 40 | 41 | def find_wav_files(data_path, file_ext="wav"): 42 | wav_paths = glob.glob(os.path.join(data_path, "**", f"*.{file_ext}"), recursive=True) 43 | return wav_paths 44 | 45 | 46 | def find_feat_files(data_path): 47 | feat_paths = glob.glob(os.path.join(data_path, "**", "*.npy"), recursive=True) 48 | return feat_paths 49 | 50 | 51 | def load_wav_data(data_path, eval_split_size, file_ext="wav"): 52 | wav_paths = find_wav_files(data_path, file_ext=file_ext) 53 | assert len(wav_paths) > 0, f" [!] {data_path} is empty." 54 | np.random.seed(0) 55 | np.random.shuffle(wav_paths) 56 | return wav_paths[:eval_split_size], wav_paths[eval_split_size:] 57 | 58 | 59 | def load_wav_feat_data(data_path, feat_path, eval_split_size): 60 | wav_paths = find_wav_files(data_path) 61 | feat_paths = find_feat_files(feat_path) 62 | 63 | wav_paths.sort(key=lambda x: Path(x).stem) 64 | feat_paths.sort(key=lambda x: Path(x).stem) 65 | 66 | assert len(wav_paths) == len(feat_paths), f" [!] {len(wav_paths)} vs {feat_paths}" 67 | for wav, feat in zip(wav_paths, feat_paths): 68 | wav_name = Path(wav).stem 69 | feat_name = Path(feat).stem 70 | assert wav_name == feat_name 71 | 72 | items = list(zip(wav_paths, feat_paths)) 73 | np.random.seed(0) 74 | np.random.shuffle(items) 75 | return items[:eval_split_size], items[eval_split_size:] 76 | -------------------------------------------------------------------------------- /TTS/vocoder/layers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/vocoder/layers/__init__.py -------------------------------------------------------------------------------- /TTS/vocoder/layers/hifigan.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from torch.nn.utils.parametrize import remove_parametrizations 3 | 4 | 5 | # pylint: disable=dangerous-default-value 6 | class ResStack(nn.Module): 7 | def __init__(self, kernel, channel, padding, dilations=[1, 3, 5]): 8 | super().__init__() 9 | resstack = [] 10 | for dilation in dilations: 11 | resstack += [ 12 | nn.LeakyReLU(0.2), 13 | nn.ReflectionPad1d(dilation), 14 | nn.utils.parametrizations.weight_norm( 15 | nn.Conv1d(channel, channel, kernel_size=kernel, dilation=dilation) 16 | ), 17 | nn.LeakyReLU(0.2), 18 | nn.ReflectionPad1d(padding), 19 | nn.utils.parametrizations.weight_norm(nn.Conv1d(channel, channel, kernel_size=1)), 20 | ] 21 | self.resstack = nn.Sequential(*resstack) 22 | 23 | self.shortcut = nn.utils.parametrizations.weight_norm(nn.Conv1d(channel, channel, kernel_size=1)) 24 | 25 | def forward(self, x): 26 | x1 = self.shortcut(x) 27 | x2 = self.resstack(x) 28 | return x1 + x2 29 | 30 | def remove_weight_norm(self): 31 | 
remove_parametrizations(self.shortcut, "weight") 32 | remove_parametrizations(self.resstack[2], "weight") 33 | remove_parametrizations(self.resstack[5], "weight") 34 | remove_parametrizations(self.resstack[8], "weight") 35 | remove_parametrizations(self.resstack[11], "weight") 36 | remove_parametrizations(self.resstack[14], "weight") 37 | remove_parametrizations(self.resstack[17], "weight") 38 | 39 | 40 | class MRF(nn.Module): 41 | def __init__(self, kernels, channel, dilations=[1, 3, 5]): # # pylint: disable=dangerous-default-value 42 | super().__init__() 43 | self.resblock1 = ResStack(kernels[0], channel, 0, dilations) 44 | self.resblock2 = ResStack(kernels[1], channel, 6, dilations) 45 | self.resblock3 = ResStack(kernels[2], channel, 12, dilations) 46 | 47 | def forward(self, x): 48 | x1 = self.resblock1(x) 49 | x2 = self.resblock2(x) 50 | x3 = self.resblock3(x) 51 | return x1 + x2 + x3 52 | 53 | def remove_weight_norm(self): 54 | self.resblock1.remove_weight_norm() 55 | self.resblock2.remove_weight_norm() 56 | self.resblock3.remove_weight_norm() 57 | -------------------------------------------------------------------------------- /TTS/vocoder/layers/melgan.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from torch.nn.utils.parametrizations import weight_norm 3 | from torch.nn.utils.parametrize import remove_parametrizations 4 | 5 | 6 | class ResidualStack(nn.Module): 7 | def __init__(self, channels, num_res_blocks, kernel_size): 8 | super().__init__() 9 | 10 | assert (kernel_size - 1) % 2 == 0, " [!] kernel_size has to be odd." 11 | base_padding = (kernel_size - 1) // 2 12 | 13 | self.blocks = nn.ModuleList() 14 | for idx in range(num_res_blocks): 15 | layer_kernel_size = kernel_size 16 | layer_dilation = layer_kernel_size**idx 17 | layer_padding = base_padding * layer_dilation 18 | self.blocks += [ 19 | nn.Sequential( 20 | nn.LeakyReLU(0.2), 21 | nn.ReflectionPad1d(layer_padding), 22 | weight_norm( 23 | nn.Conv1d(channels, channels, kernel_size=kernel_size, dilation=layer_dilation, bias=True) 24 | ), 25 | nn.LeakyReLU(0.2), 26 | weight_norm(nn.Conv1d(channels, channels, kernel_size=1, bias=True)), 27 | ) 28 | ] 29 | 30 | self.shortcuts = nn.ModuleList( 31 | [weight_norm(nn.Conv1d(channels, channels, kernel_size=1, bias=True)) for _ in range(num_res_blocks)] 32 | ) 33 | 34 | def forward(self, x): 35 | for block, shortcut in zip(self.blocks, self.shortcuts): 36 | x = shortcut(x) + block(x) 37 | return x 38 | 39 | def remove_weight_norm(self): 40 | for block, shortcut in zip(self.blocks, self.shortcuts): 41 | remove_parametrizations(block[2], "weight") 42 | remove_parametrizations(block[4], "weight") 43 | remove_parametrizations(shortcut, "weight") 44 | -------------------------------------------------------------------------------- /TTS/vocoder/layers/parallel_wavegan.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import functional as F 3 | 4 | 5 | class ResidualBlock(torch.nn.Module): 6 | """Residual block module in WaveNet.""" 7 | 8 | def __init__( 9 | self, 10 | kernel_size=3, 11 | res_channels=64, 12 | gate_channels=128, 13 | skip_channels=64, 14 | aux_channels=80, 15 | dropout=0.0, 16 | dilation=1, 17 | bias=True, 18 | use_causal_conv=False, 19 | ): 20 | super().__init__() 21 | self.dropout = dropout 22 | # no future time stamps available 23 | if use_causal_conv: 24 | padding = (kernel_size - 1) * dilation 25 | else: 26 | assert (kernel_size - 
1) % 2 == 0, "Not support even number kernel size." 27 | padding = (kernel_size - 1) // 2 * dilation 28 | self.use_causal_conv = use_causal_conv 29 | 30 | # dilation conv 31 | self.conv = torch.nn.Conv1d( 32 | res_channels, gate_channels, kernel_size, padding=padding, dilation=dilation, bias=bias 33 | ) 34 | 35 | # local conditioning 36 | if aux_channels > 0: 37 | self.conv1x1_aux = torch.nn.Conv1d(aux_channels, gate_channels, 1, bias=False) 38 | else: 39 | self.conv1x1_aux = None 40 | 41 | # conv output is split into two groups 42 | gate_out_channels = gate_channels // 2 43 | self.conv1x1_out = torch.nn.Conv1d(gate_out_channels, res_channels, 1, bias=bias) 44 | self.conv1x1_skip = torch.nn.Conv1d(gate_out_channels, skip_channels, 1, bias=bias) 45 | 46 | def forward(self, x, c): 47 | """ 48 | x: B x D_res x T 49 | c: B x D_aux x T 50 | """ 51 | residual = x 52 | x = F.dropout(x, p=self.dropout, training=self.training) 53 | x = self.conv(x) 54 | 55 | # remove future time steps if use_causal_conv conv 56 | x = x[:, :, : residual.size(-1)] if self.use_causal_conv else x 57 | 58 | # split into two part for gated activation 59 | splitdim = 1 60 | xa, xb = x.split(x.size(splitdim) // 2, dim=splitdim) 61 | 62 | # local conditioning 63 | if c is not None: 64 | assert self.conv1x1_aux is not None 65 | c = self.conv1x1_aux(c) 66 | ca, cb = c.split(c.size(splitdim) // 2, dim=splitdim) 67 | xa, xb = xa + ca, xb + cb 68 | 69 | x = torch.tanh(xa) * torch.sigmoid(xb) 70 | 71 | # for skip connection 72 | s = self.conv1x1_skip(x) 73 | 74 | # for residual connection 75 | x = (self.conv1x1_out(x) + residual) * (0.5**2) 76 | 77 | return x, s 78 | -------------------------------------------------------------------------------- /TTS/vocoder/layers/pqmf.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn.functional as F 4 | from scipy import signal as sig 5 | 6 | 7 | # adapted from 8 | # https://github.com/kan-bayashi/ParallelWaveGAN/tree/master/parallel_wavegan 9 | class PQMF(torch.nn.Module): 10 | def __init__(self, N=4, taps=62, cutoff=0.15, beta=9.0): 11 | super().__init__() 12 | 13 | self.N = N 14 | self.taps = taps 15 | self.cutoff = cutoff 16 | self.beta = beta 17 | 18 | QMF = sig.firwin(taps + 1, cutoff, window=("kaiser", beta)) 19 | H = np.zeros((N, len(QMF))) 20 | G = np.zeros((N, len(QMF))) 21 | for k in range(N): 22 | constant_factor = ( 23 | (2 * k + 1) * (np.pi / (2 * N)) * (np.arange(taps + 1) - ((taps - 1) / 2)) 24 | ) # TODO: (taps - 1) -> taps 25 | phase = (-1) ** k * np.pi / 4 26 | H[k] = 2 * QMF * np.cos(constant_factor + phase) 27 | 28 | G[k] = 2 * QMF * np.cos(constant_factor - phase) 29 | 30 | H = torch.from_numpy(H[:, None, :]).float() 31 | G = torch.from_numpy(G[None, :, :]).float() 32 | 33 | self.register_buffer("H", H) 34 | self.register_buffer("G", G) 35 | 36 | updown_filter = torch.zeros((N, N, N)).float() 37 | for k in range(N): 38 | updown_filter[k, k, 0] = 1.0 39 | self.register_buffer("updown_filter", updown_filter) 40 | self.N = N 41 | 42 | self.pad_fn = torch.nn.ConstantPad1d(taps // 2, 0.0) 43 | 44 | def forward(self, x): 45 | return self.analysis(x) 46 | 47 | def analysis(self, x): 48 | return F.conv1d(x, self.H, padding=self.taps // 2, stride=self.N) 49 | 50 | def synthesis(self, x): 51 | x = F.conv_transpose1d(x, self.updown_filter * self.N, stride=self.N) 52 | x = F.conv1d(x, self.G, padding=self.taps // 2) 53 | return x 54 | 
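The `PQMF` filter bank above is what lets a multi-band generator predict `N` low-rate sub-band signals and still emit a full-band waveform: `analysis` splits the audio into `N` critically sampled bands and `synthesis` folds them back together. A minimal shape check, as a sketch (random noise input; the tensor sizes are only illustrative):

```python
import torch

from TTS.vocoder.layers.pqmf import PQMF

# 4-band PQMF with the same defaults as the class definition above
pqmf = PQMF(N=4, taps=62, cutoff=0.15, beta=9.0)

wav = torch.randn(2, 1, 16384)      # (batch, 1, samples) dummy waveform
subbands = pqmf.analysis(wav)       # -> (batch, 4, samples // 4)
recon = pqmf.synthesis(subbands)    # -> (batch, 1, samples)

print(subbands.shape)  # torch.Size([2, 4, 4096])
print(recon.shape)     # torch.Size([2, 1, 16384])
```

`MultibandMelganGenerator` further down wraps exactly these two calls in its `pqmf_analysis` and `pqmf_synthesis` helpers.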
-------------------------------------------------------------------------------- /TTS/vocoder/models/base_vocoder.py: -------------------------------------------------------------------------------- 1 | from coqpit import Coqpit 2 | 3 | from TTS.model import BaseTrainerModel 4 | 5 | # pylint: skip-file 6 | 7 | 8 | class BaseVocoder(BaseTrainerModel): 9 | """Base `vocoder` class. Every new `vocoder` model must inherit this. 10 | 11 | It defines `vocoder` specific functions on top of `Model`. 12 | 13 | Notes on input/output tensor shapes: 14 | Any input or output tensor of the model must be shaped as 15 | 16 | - 3D tensors `batch x time x channels` 17 | - 2D tensors `batch x channels` 18 | - 1D tensors `batch x 1` 19 | """ 20 | 21 | MODEL_TYPE = "vocoder" 22 | 23 | def __init__(self, config): 24 | super().__init__() 25 | self._set_model_args(config) 26 | 27 | def _set_model_args(self, config: Coqpit): 28 | """Set up model args based on the config type. 29 | 30 | If the config is for training with a name like "*Config", then the model args are embedded in the 31 | config.model_args 32 | 33 | If the config is for the model with a name like "*Args", then we assign it directly. 34 | """ 35 | # don't use isinstance, to avoid a recursive import 36 | if "Config" in config.__class__.__name__: 37 | if "characters" in config: 38 | _, self.config, num_chars = self.get_characters(config) 39 | self.config.num_chars = num_chars 40 | if hasattr(self.config, "model_args"): 41 | config.model_args.num_chars = num_chars 42 | if "model_args" in config: 43 | self.args = self.config.model_args 44 | # This is for backward compatibility 45 | if "model_params" in config: 46 | self.args = self.config.model_params 47 | else: 48 | self.config = config 49 | if "model_args" in config: 50 | self.args = self.config.model_args 51 | # This is for backward compatibility 52 | if "model_params" in config: 53 | self.args = self.config.model_params 54 | else: 55 | raise ValueError("config must be either a *Config or *Args") 56 | -------------------------------------------------------------------------------- /TTS/vocoder/models/fullband_melgan_generator.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from TTS.vocoder.models.melgan_generator import MelganGenerator 4 | 5 | 6 | class FullbandMelganGenerator(MelganGenerator): 7 | def __init__( 8 | self, 9 | in_channels=80, 10 | out_channels=1, 11 | proj_kernel=7, 12 | base_channels=512, 13 | upsample_factors=(2, 8, 2, 2), 14 | res_kernel=3, 15 | num_res_blocks=4, 16 | ): 17 | super().__init__( 18 | in_channels=in_channels, 19 | out_channels=out_channels, 20 | proj_kernel=proj_kernel, 21 | base_channels=base_channels, 22 | upsample_factors=upsample_factors, 23 | res_kernel=res_kernel, 24 | num_res_blocks=num_res_blocks, 25 | ) 26 | 27 | @torch.no_grad() 28 | def inference(self, cond_features): 29 | cond_features = cond_features.to(self.layers[1].weight.device) 30 | cond_features = torch.nn.functional.pad( 31 | cond_features, (self.inference_padding, self.inference_padding), "replicate" 32 | ) 33 | return self.layers(cond_features) 34 | -------------------------------------------------------------------------------- /TTS/vocoder/models/melgan_discriminator.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from torch import nn 3 | from torch.nn.utils.parametrizations import weight_norm 4 | 5 | 6 | class MelganDiscriminator(nn.Module): 7 | def __init__( 8 | self, 9 |
in_channels=1, 10 | out_channels=1, 11 | kernel_sizes=(5, 3), 12 | base_channels=16, 13 | max_channels=1024, 14 | downsample_factors=(4, 4, 4, 4), 15 | groups_denominator=4, 16 | ): 17 | super().__init__() 18 | self.layers = nn.ModuleList() 19 | 20 | layer_kernel_size = np.prod(kernel_sizes) 21 | layer_padding = (layer_kernel_size - 1) // 2 22 | 23 | # initial layer 24 | self.layers += [ 25 | nn.Sequential( 26 | nn.ReflectionPad1d(layer_padding), 27 | weight_norm(nn.Conv1d(in_channels, base_channels, layer_kernel_size, stride=1)), 28 | nn.LeakyReLU(0.2, inplace=True), 29 | ) 30 | ] 31 | 32 | # downsampling layers 33 | layer_in_channels = base_channels 34 | for downsample_factor in downsample_factors: 35 | layer_out_channels = min(layer_in_channels * downsample_factor, max_channels) 36 | layer_kernel_size = downsample_factor * 10 + 1 37 | layer_padding = (layer_kernel_size - 1) // 2 38 | layer_groups = layer_in_channels // groups_denominator 39 | self.layers += [ 40 | nn.Sequential( 41 | weight_norm( 42 | nn.Conv1d( 43 | layer_in_channels, 44 | layer_out_channels, 45 | kernel_size=layer_kernel_size, 46 | stride=downsample_factor, 47 | padding=layer_padding, 48 | groups=layer_groups, 49 | ) 50 | ), 51 | nn.LeakyReLU(0.2, inplace=True), 52 | ) 53 | ] 54 | layer_in_channels = layer_out_channels 55 | 56 | # last 2 layers 57 | layer_padding1 = (kernel_sizes[0] - 1) // 2 58 | layer_padding2 = (kernel_sizes[1] - 1) // 2 59 | self.layers += [ 60 | nn.Sequential( 61 | weight_norm( 62 | nn.Conv1d( 63 | layer_out_channels, 64 | layer_out_channels, 65 | kernel_size=kernel_sizes[0], 66 | stride=1, 67 | padding=layer_padding1, 68 | ) 69 | ), 70 | nn.LeakyReLU(0.2, inplace=True), 71 | ), 72 | weight_norm( 73 | nn.Conv1d( 74 | layer_out_channels, out_channels, kernel_size=kernel_sizes[1], stride=1, padding=layer_padding2 75 | ) 76 | ), 77 | ] 78 | 79 | def forward(self, x): 80 | feats = [] 81 | for layer in self.layers: 82 | x = layer(x) 83 | feats.append(x) 84 | return x, feats 85 | -------------------------------------------------------------------------------- /TTS/vocoder/models/melgan_multiscale_discriminator.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | from TTS.vocoder.models.melgan_discriminator import MelganDiscriminator 4 | 5 | 6 | class MelganMultiscaleDiscriminator(nn.Module): 7 | def __init__( 8 | self, 9 | in_channels=1, 10 | out_channels=1, 11 | num_scales=3, 12 | kernel_sizes=(5, 3), 13 | base_channels=16, 14 | max_channels=1024, 15 | downsample_factors=(4, 4, 4), 16 | pooling_kernel_size=4, 17 | pooling_stride=2, 18 | pooling_padding=2, 19 | groups_denominator=4, 20 | ): 21 | super().__init__() 22 | 23 | self.discriminators = nn.ModuleList( 24 | [ 25 | MelganDiscriminator( 26 | in_channels=in_channels, 27 | out_channels=out_channels, 28 | kernel_sizes=kernel_sizes, 29 | base_channels=base_channels, 30 | max_channels=max_channels, 31 | downsample_factors=downsample_factors, 32 | groups_denominator=groups_denominator, 33 | ) 34 | for _ in range(num_scales) 35 | ] 36 | ) 37 | 38 | self.pooling = nn.AvgPool1d( 39 | kernel_size=pooling_kernel_size, stride=pooling_stride, padding=pooling_padding, count_include_pad=False 40 | ) 41 | 42 | def forward(self, x): 43 | scores = [] 44 | feats = [] 45 | for disc in self.discriminators: 46 | score, feat = disc(x) 47 | scores.append(score) 48 | feats.append(feat) 49 | x = self.pooling(x) 50 | return scores, feats 51 | 
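During GAN training each scale of the discriminator above returns a score map plus its intermediate feature maps: the scores feed the adversarial loss, and the features are typically compared against those of real audio for a feature-matching loss. A rough usage sketch with random audio (shapes are illustrative only):

```python
import torch

from TTS.vocoder.models.melgan_multiscale_discriminator import MelganMultiscaleDiscriminator

# three discriminators, each seeing an average-pooled (downsampled) copy of the input
disc = MelganMultiscaleDiscriminator(num_scales=3)

fake_wav = torch.randn(2, 1, 8192)  # (batch, 1, samples) stand-in for generator output
scores, feats = disc(fake_wav)

print(len(scores), len(feats))      # 3 and 3, one entry per scale
print(scores[0].shape)              # score map of the full-resolution scale
# feats[i] is the list of layer activations for scale i; a feature-matching loss
# compares these against the activations computed on real audio.
```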
-------------------------------------------------------------------------------- /TTS/vocoder/models/multiband_melgan_generator.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from TTS.vocoder.layers.pqmf import PQMF 4 | from TTS.vocoder.models.melgan_generator import MelganGenerator 5 | 6 | 7 | class MultibandMelganGenerator(MelganGenerator): 8 | def __init__( 9 | self, 10 | in_channels=80, 11 | out_channels=4, 12 | proj_kernel=7, 13 | base_channels=384, 14 | upsample_factors=(2, 8, 2, 2), 15 | res_kernel=3, 16 | num_res_blocks=3, 17 | ): 18 | super().__init__( 19 | in_channels=in_channels, 20 | out_channels=out_channels, 21 | proj_kernel=proj_kernel, 22 | base_channels=base_channels, 23 | upsample_factors=upsample_factors, 24 | res_kernel=res_kernel, 25 | num_res_blocks=num_res_blocks, 26 | ) 27 | self.pqmf_layer = PQMF(N=4, taps=62, cutoff=0.15, beta=9.0) 28 | 29 | def pqmf_analysis(self, x): 30 | return self.pqmf_layer.analysis(x) 31 | 32 | def pqmf_synthesis(self, x): 33 | return self.pqmf_layer.synthesis(x) 34 | 35 | @torch.no_grad() 36 | def inference(self, cond_features): 37 | cond_features = cond_features.to(self.layers[1].weight.device) 38 | cond_features = torch.nn.functional.pad( 39 | cond_features, (self.inference_padding, self.inference_padding), "replicate" 40 | ) 41 | return self.pqmf_synthesis(self.layers(cond_features)) 42 | -------------------------------------------------------------------------------- /TTS/vocoder/pqmf_output.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/vocoder/pqmf_output.wav -------------------------------------------------------------------------------- /TTS/vocoder/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/TTS/vocoder/utils/__init__.py -------------------------------------------------------------------------------- /TTS/vocoder/utils/generic_utils.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | import numpy as np 4 | import torch 5 | from matplotlib import pyplot as plt 6 | 7 | from TTS.tts.utils.visual import plot_spectrogram 8 | from TTS.utils.audio import AudioProcessor 9 | 10 | 11 | def interpolate_vocoder_input(scale_factor, spec): 12 | """Interpolate spectrogram by the scale factor. 13 | It is mainly used to match the sampling rates of 14 | the tts and vocoder models. 15 | 16 | Args: 17 | scale_factor (float): scale factor to interpolate the spectrogram 18 | spec (np.array): spectrogram to be interpolated 19 | 20 | Returns: 21 | torch.tensor: interpolated spectrogram. 22 | """ 23 | print(" > before interpolation :", spec.shape) 24 | spec = torch.tensor(spec).unsqueeze(0).unsqueeze(0) # pylint: disable=not-callable 25 | spec = torch.nn.functional.interpolate( 26 | spec, scale_factor=scale_factor, recompute_scale_factor=True, mode="bilinear", align_corners=False 27 | ).squeeze(0) 28 | print(" > after interpolation :", spec.shape) 29 | return spec 30 | 31 | 32 | def plot_results(y_hat: torch.tensor, y: torch.tensor, ap: AudioProcessor, name_prefix: str = None) -> Dict: 33 | """Plot the predicted and the real waveform and their spectrograms. 
34 | 35 | Args: 36 | y_hat (torch.tensor): Predicted waveform. 37 | y (torch.tensor): Real waveform. 38 | ap (AudioProcessor): Audio processor used to process the waveform. 39 | name_prefix (str, optional): Name prefix used to name the figures. Defaults to None. 40 | 41 | Returns: 42 | Dict: output figures keyed by the name of the figures. 43 | """ """Plot vocoder model results""" 44 | if name_prefix is None: 45 | name_prefix = "" 46 | 47 | # select an instance from batch 48 | y_hat = y_hat[0].squeeze().detach().cpu().numpy() 49 | y = y[0].squeeze().detach().cpu().numpy() 50 | 51 | spec_fake = ap.melspectrogram(y_hat).T 52 | spec_real = ap.melspectrogram(y).T 53 | spec_diff = np.abs(spec_fake - spec_real) 54 | 55 | # plot figure and save it 56 | fig_wave = plt.figure() 57 | plt.subplot(2, 1, 1) 58 | plt.plot(y) 59 | plt.title("groundtruth speech") 60 | plt.subplot(2, 1, 2) 61 | plt.plot(y_hat) 62 | plt.title("generated speech") 63 | plt.tight_layout() 64 | plt.close() 65 | 66 | figures = { 67 | name_prefix + "spectrogram/fake": plot_spectrogram(spec_fake), 68 | name_prefix + "spectrogram/real": plot_spectrogram(spec_real), 69 | name_prefix + "spectrogram/diff": plot_spectrogram(spec_diff), 70 | name_prefix + "speech_comparison": fig_wave, 71 | } 72 | return figures 73 | -------------------------------------------------------------------------------- /download_checkpoint.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from transformers import HfArgumentParser 3 | from typing import Optional 4 | from TTS.utils.manage import ModelManager 5 | import os 6 | 7 | @dataclass 8 | class DownloadArgs: 9 | output_path: str = field( 10 | default="checkpoints", 11 | metadata={"help": "Path to pretrained + checkpoint model"} 12 | ) 13 | 14 | def download(output_path: str = "checkpoints"): 15 | CHECKPOINTS_OUT_PATH = os.path.join(output_path, "XTTS_v2.0_original_model_files/") 16 | os.makedirs(CHECKPOINTS_OUT_PATH, exist_ok=True) 17 | 18 | # DVAE files 19 | DVAE_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/dvae.pth" 20 | MEL_NORM_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/mel_stats.pth" 21 | 22 | # Set the path to the downloaded files 23 | DVAE_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(DVAE_CHECKPOINT_LINK)) 24 | MEL_NORM_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(MEL_NORM_LINK)) 25 | 26 | # download DVAE files if needed 27 | if not os.path.isfile(DVAE_CHECKPOINT) or not os.path.isfile(MEL_NORM_FILE): 28 | print(" > Downloading DVAE files!") 29 | ModelManager._download_model_files([MEL_NORM_LINK, DVAE_CHECKPOINT_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True) 30 | 31 | # Download XTTS v2.0 checkpoint if needed 32 | TOKENIZER_FILE_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json" 33 | XTTS_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/model.pth" 34 | XTTS_CONFIG_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/config.json" 35 | 36 | # XTTS transfer learning parameters 37 | TOKENIZER_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(TOKENIZER_FILE_LINK)) 38 | XTTS_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(XTTS_CHECKPOINT_LINK)) 39 | XTTS_CONFIG_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(XTTS_CONFIG_LINK)) 40 | 41 | # download XTTS v2.0 files if needed 42 | if not os.path.isfile(TOKENIZER_FILE) or not os.path.isfile(XTTS_CHECKPOINT): 
43 | print(" > Downloading XTTS v2.0 files!") 44 | ModelManager._download_model_files( 45 | [TOKENIZER_FILE_LINK, XTTS_CONFIG_LINK, XTTS_CHECKPOINT_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True 46 | # [TOKENIZER_FILE_LINK, XTTS_CONFIG_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True 47 | ) 48 | 49 | if __name__ == "__main__": 50 | parser = HfArgumentParser(DownloadArgs) 51 | args = parser.parse_args() 52 | download(output_path=args.output_path) -------------------------------------------------------------------------------- /recipes/README.md: -------------------------------------------------------------------------------- 1 | # 🐸💬 TTS Training Recipes 2 | 3 | TTS recipes intended to host scripts running all the necessary steps to train a TTS model on a particular dataset. 4 | 5 | For each dataset, you need to download the dataset once. Then you run the training for the model you want. 6 | 7 | Run each script from the root TTS folder as follows. 8 | 9 | ```console 10 | $ sh ./recipes//download_.sh 11 | $ python recipes///train.py 12 | ``` 13 | 14 | For some datasets you might need to resample the audio files. For example, VCTK dataset can be resampled to 22050Hz as follows. 15 | 16 | ```console 17 | python TTS/bin/resample.py --input_dir recipes/vctk/VCTK/wav48_silence_trimmed --output_sr 22050 --output_dir recipes/vctk/VCTK/wav48_silence_trimmed --n_jobs 8 --file_ext flac 18 | ``` 19 | 20 | If you train a new model using TTS, feel free to share your training to expand the list of recipes. 21 | 22 | You can also open a new discussion and share your progress with the 🐸 community. -------------------------------------------------------------------------------- /recipes/bel-alex73/.gitignore: -------------------------------------------------------------------------------- 1 | /docker-prepare/*.txt 2 | -------------------------------------------------------------------------------- /recipes/bel-alex73/docker-prepare-start.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | cd $( dirname -- "$0"; ) 5 | 6 | cp ../../requirements*.txt docker-prepare/ 7 | 8 | docker build -t tts-learn -f docker-prepare/Dockerfile docker-prepare/ 9 | 10 | mkdir -p ../../../storage 11 | docker run --rm -it \ 12 | -p 2525:2525 \ 13 | --shm-size=256M \ 14 | --name tts-learn-run \ 15 | -v $(pwd)/../../:/a/TTS \ 16 | -v $(pwd)/../../../cv-corpus:/a/cv-corpus \ 17 | -v $(pwd)/../../../fanetyka/:/a/fanetyka/ \ 18 | -v $(pwd)/../../../storage:/storage \ 19 | tts-learn 20 | -------------------------------------------------------------------------------- /recipes/bel-alex73/docker-prepare/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:22.04 2 | 3 | RUN apt -y update 4 | RUN apt -y upgrade 5 | RUN apt -y install --no-install-recommends pip ffmpeg openjdk-19-jre-headless 6 | 7 | RUN mkdir /a/ 8 | ADD requirements*.txt /a/ 9 | WORKDIR /a/ 10 | RUN pip install -r requirements.txt -r requirements.dev.txt -r requirements.notebooks.txt 11 | RUN pip install seaborn pydub notebook 12 | 13 | RUN apt -y install --no-install-recommends gcc libpython3.10-dev 14 | 15 | ADD runtime.sh /a/ 16 | 17 | WORKDIR /a/TTS/ 18 | CMD /a/runtime.sh 19 | -------------------------------------------------------------------------------- /recipes/bel-alex73/docker-prepare/runtime.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd /a/TTS 4 | pip install -e 
.[all,dev,notebooks] 5 | 6 | LANG=C.utf8 bash 7 | -------------------------------------------------------------------------------- /recipes/bel-alex73/dump_config.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | 4 | from train_glowtts import config 5 | 6 | s = json.dumps(config, default=vars, indent=2) 7 | s = re.sub(r'"test_sentences":\s*\[\],', "", s) 8 | print(s) 9 | -------------------------------------------------------------------------------- /recipes/bel-alex73/train_hifigan.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from coqpit import Coqpit 4 | from trainer import Trainer, TrainerArgs 5 | 6 | from TTS.tts.configs.shared_configs import BaseAudioConfig 7 | from TTS.utils.audio import AudioProcessor 8 | from TTS.vocoder.configs.hifigan_config import * 9 | from TTS.vocoder.datasets.preprocess import load_wav_data 10 | from TTS.vocoder.models.gan import GAN 11 | 12 | output_path = "/storage/output-hifigan/" 13 | 14 | audio_config = BaseAudioConfig( 15 | mel_fmin=50, 16 | mel_fmax=8000, 17 | hop_length=256, 18 | stats_path="/storage/TTS/scale_stats.npy", 19 | ) 20 | 21 | config = HifiganConfig( 22 | batch_size=74, 23 | eval_batch_size=16, 24 | num_loader_workers=8, 25 | num_eval_loader_workers=8, 26 | lr_disc=0.0002, 27 | lr_gen=0.0002, 28 | run_eval=True, 29 | test_delay_epochs=5, 30 | epochs=1000, 31 | use_noise_augment=True, 32 | seq_len=8192, 33 | pad_short=2000, 34 | save_step=5000, 35 | print_step=50, 36 | print_eval=True, 37 | mixed_precision=False, 38 | eval_split_size=30, 39 | save_n_checkpoints=2, 40 | save_best_after=5000, 41 | data_path="/storage/filtered_dataset", 42 | output_path=output_path, 43 | audio=audio_config, 44 | ) 45 | 46 | # init audio processor 47 | ap = AudioProcessor.init_from_config(config) 48 | 49 | # load training samples 50 | print("config.eval_split_size = ", config.eval_split_size) 51 | eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size) 52 | 53 | # init model 54 | model = GAN(config, ap) 55 | 56 | # init the trainer and 🚀 57 | trainer = Trainer( 58 | TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples 59 | ) 60 | trainer.fit() 61 | -------------------------------------------------------------------------------- /recipes/blizzard2013/README.md: -------------------------------------------------------------------------------- 1 | # How to get the Blizzard 2013 Dataset 2 | 3 | The Capacitron model is a variational encoder extension of standard Tacotron based models to model prosody. 4 | 5 | To take full advantage of the model, it is advised to train the model with a dataset that contains a significant amount of prosodic information in the utterances. A tested candidate for such applications is the blizzard2013 dataset from the Blizzard Challenge, containing many hours of high quality audio book recordings. 6 | 7 | To get a license and download link for this dataset, you need to visit the [website](https://www.cstr.ed.ac.uk/projects/blizzard/2013/lessac_blizzard2013/license.html) of the Centre for Speech Technology Research of the University of Edinburgh. 8 | 9 | You get access to the raw dataset in a couple of days. There are a few preprocessing steps you need to do to be able to use the high fidelity dataset. 10 | 11 | 1. Get the forced time alignments for the blizzard dataset from [here](https://github.com/mueller91/tts_alignments). 
12 | 2. Segment the high fidelity audio-book files based on the instructions [here](https://github.com/Tomiinek/Blizzard2013_Segmentation). -------------------------------------------------------------------------------- /recipes/kokoro/tacotron2-DDC/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # take the scripts's parent's directory to prefix all the output paths. 3 | RUN_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" 4 | CORPUS=kokoro-speech-v1_1-small 5 | echo $RUN_DIR 6 | if [ \! -d $RUN_DIR/$CORPUS ] ; then 7 | echo "$RUN_DIR/$CORPUS doesn't exist." 8 | echo "Follow the instruction of https://github.com/kaiidams/Kokoro-Speech-Dataset to make the corpus." 9 | exit 1 10 | fi 11 | # create train-val splits 12 | shuf $RUN_DIR/$CORPUS/metadata.csv > $RUN_DIR/$CORPUS/metadata_shuf.csv 13 | head -n 8000 $RUN_DIR/$CORPUS/metadata_shuf.csv > $RUN_DIR/$CORPUS/metadata_train.csv 14 | tail -n 812 $RUN_DIR/$CORPUS/metadata_shuf.csv > $RUN_DIR/$CORPUS/metadata_val.csv 15 | # compute dataset mean and variance for normalization 16 | python TTS/bin/compute_statistics.py $RUN_DIR/tacotron2-DDC.json $RUN_DIR/scale_stats.npy --data_path $RUN_DIR/$CORPUS/wavs/ 17 | # training .... 18 | # change the GPU id if needed 19 | CUDA_VISIBLE_DEVICES="0" python TTS/bin/train_tts.py --config_path $RUN_DIR/tacotron2-DDC.json \ 20 | --coqpit.output_path $RUN_DIR \ 21 | --coqpit.datasets.0.path $RUN_DIR/$CORPUS \ 22 | --coqpit.audio.stats_path $RUN_DIR/scale_stats.npy \ 23 | --coqpit.phoneme_cache_path $RUN_DIR/phoneme_cache \ -------------------------------------------------------------------------------- /recipes/ljspeech/README.md: -------------------------------------------------------------------------------- 1 | # 🐸💬 TTS LJspeech Recipes 2 | 3 | For running the recipes 4 | 5 | 1. Download the LJSpeech dataset here either manually from [its official website](https://keithito.com/LJ-Speech-Dataset/) or using ```download_ljspeech.sh```. 6 | 2. Go to your desired model folder and run the training. 7 | 8 | Running Python files. (Choose the desired GPU ID for your run and set ```CUDA_VISIBLE_DEVICES```) 9 | ```terminal 10 | CUDA_VISIBLE_DEVICES="0" python train_modelX.py 11 | ``` 12 | 13 | Running bash scripts. 14 | ```terminal 15 | bash run.sh 16 | ``` 17 | 18 | 💡 Note that these runs are just templates to help you start training your first model. They are not optimized for the best 19 | result. Double-check the configurations and feel free to share your experiments to find better parameters together 💪. 
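For instance, a run of the VITS recipe (one of the model folders here; the other `train_*.py` scripts follow the same pattern) would look roughly like this, assuming LJSpeech was extracted to `recipes/ljspeech/LJSpeech-1.1/`:

```terminal
cd recipes/ljspeech/vits_tts
CUDA_VISIBLE_DEVICES="0" python train_vits.py
```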
20 | -------------------------------------------------------------------------------- /recipes/ljspeech/align_tts/train_aligntts.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from trainer import Trainer, TrainerArgs 4 | 5 | from TTS.tts.configs.align_tts_config import AlignTTSConfig 6 | from TTS.tts.configs.shared_configs import BaseDatasetConfig 7 | from TTS.tts.datasets import load_tts_samples 8 | from TTS.tts.models.align_tts import AlignTTS 9 | from TTS.tts.utils.text.tokenizer import TTSTokenizer 10 | from TTS.utils.audio import AudioProcessor 11 | 12 | output_path = os.path.dirname(os.path.abspath(__file__)) 13 | 14 | # init configs 15 | dataset_config = BaseDatasetConfig( 16 | formatter="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/") 17 | ) 18 | config = AlignTTSConfig( 19 | batch_size=32, 20 | eval_batch_size=16, 21 | num_loader_workers=4, 22 | num_eval_loader_workers=4, 23 | run_eval=True, 24 | test_delay_epochs=-1, 25 | epochs=1000, 26 | text_cleaner="english_cleaners", 27 | use_phonemes=False, 28 | phoneme_language="en-us", 29 | phoneme_cache_path=os.path.join(output_path, "phoneme_cache"), 30 | print_step=25, 31 | print_eval=True, 32 | mixed_precision=False, 33 | output_path=output_path, 34 | datasets=[dataset_config], 35 | ) 36 | 37 | # INITIALIZE THE AUDIO PROCESSOR 38 | # Audio processor is used for feature extraction and audio I/O. 39 | # It mainly serves to the dataloader and the training loggers. 40 | ap = AudioProcessor.init_from_config(config) 41 | 42 | # INITIALIZE THE TOKENIZER 43 | # Tokenizer is used to convert text to sequences of token IDs. 44 | # If characters are not defined in the config, default characters are passed to the config 45 | tokenizer, config = TTSTokenizer.init_from_config(config) 46 | 47 | # LOAD DATA SAMPLES 48 | # Each sample is a list of ```[text, audio_file_path, speaker_name]``` 49 | # You can define your custom sample loader returning the list of samples. 50 | # Or define your custom formatter and pass it to the `load_tts_samples`. 51 | # Check `TTS.tts.datasets.load_tts_samples` for more details. 52 | train_samples, eval_samples = load_tts_samples( 53 | dataset_config, 54 | eval_split=True, 55 | eval_split_max_size=config.eval_split_max_size, 56 | eval_split_size=config.eval_split_size, 57 | ) 58 | 59 | # init model 60 | model = AlignTTS(config, ap, tokenizer) 61 | 62 | # INITIALIZE THE TRAINER 63 | # Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training, 64 | # distributed training, etc. 65 | trainer = Trainer( 66 | TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples 67 | ) 68 | 69 | # AND... 3,2,1... 
🚀 70 | trainer.fit() 71 | -------------------------------------------------------------------------------- /recipes/ljspeech/delightful_tts/train_delightful_tts.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from trainer import Trainer, TrainerArgs 4 | 5 | from TTS.config.shared_configs import BaseDatasetConfig 6 | from TTS.tts.configs.delightful_tts_config import DelightfulTtsAudioConfig, DelightfulTTSConfig 7 | from TTS.tts.datasets import load_tts_samples 8 | from TTS.tts.models.delightful_tts import DelightfulTTS, DelightfulTtsArgs, VocoderConfig 9 | from TTS.tts.utils.text.tokenizer import TTSTokenizer 10 | from TTS.utils.audio.processor import AudioProcessor 11 | 12 | data_path = "" 13 | output_path = os.path.dirname(os.path.abspath(__file__)) 14 | 15 | dataset_config = BaseDatasetConfig( 16 | dataset_name="ljspeech", formatter="ljspeech", meta_file_train="metadata.csv", path=data_path 17 | ) 18 | 19 | audio_config = DelightfulTtsAudioConfig() 20 | model_args = DelightfulTtsArgs() 21 | 22 | vocoder_config = VocoderConfig() 23 | 24 | delightful_tts_config = DelightfulTTSConfig( 25 | run_name="delightful_tts_ljspeech", 26 | run_description="Train like in delightful tts paper.", 27 | model_args=model_args, 28 | audio=audio_config, 29 | vocoder=vocoder_config, 30 | batch_size=32, 31 | eval_batch_size=16, 32 | num_loader_workers=10, 33 | num_eval_loader_workers=10, 34 | precompute_num_workers=10, 35 | batch_group_size=2, 36 | compute_input_seq_cache=True, 37 | compute_f0=True, 38 | f0_cache_path=os.path.join(output_path, "f0_cache"), 39 | run_eval=True, 40 | test_delay_epochs=-1, 41 | epochs=1000, 42 | text_cleaner="english_cleaners", 43 | use_phonemes=True, 44 | phoneme_language="en-us", 45 | phoneme_cache_path=os.path.join(output_path, "phoneme_cache"), 46 | print_step=50, 47 | print_eval=False, 48 | mixed_precision=True, 49 | output_path=output_path, 50 | datasets=[dataset_config], 51 | start_by_longest=False, 52 | eval_split_size=0.1, 53 | binary_align_loss_alpha=0.0, 54 | use_attn_priors=False, 55 | lr_gen=4e-1, 56 | lr=4e-1, 57 | lr_disc=4e-1, 58 | max_text_len=130, 59 | ) 60 | 61 | tokenizer, config = TTSTokenizer.init_from_config(delightful_tts_config) 62 | 63 | ap = AudioProcessor.init_from_config(config) 64 | 65 | 66 | train_samples, eval_samples = load_tts_samples( 67 | dataset_config, 68 | eval_split=True, 69 | eval_split_max_size=config.eval_split_max_size, 70 | eval_split_size=config.eval_split_size, 71 | ) 72 | 73 | model = DelightfulTTS(ap=ap, config=config, tokenizer=tokenizer, speaker_manager=None) 74 | 75 | trainer = Trainer( 76 | TrainerArgs(), 77 | config, 78 | output_path, 79 | model=model, 80 | train_samples=train_samples, 81 | eval_samples=eval_samples, 82 | ) 83 | 84 | trainer.fit() 85 | -------------------------------------------------------------------------------- /recipes/ljspeech/download_ljspeech.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # take the scripts's parent's directory to prefix all the output paths. 
3 | RUN_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" 4 | echo $RUN_DIR 5 | # download LJSpeech dataset 6 | wget http://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2 7 | # extract 8 | tar -xjf LJSpeech-1.1.tar.bz2 9 | # create train-val splits 10 | shuf LJSpeech-1.1/metadata.csv > LJSpeech-1.1/metadata_shuf.csv 11 | head -n 12000 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_train.csv 12 | tail -n 1100 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_val.csv 13 | mv LJSpeech-1.1 $RUN_DIR/recipes/ljspeech/ 14 | rm LJSpeech-1.1.tar.bz2 -------------------------------------------------------------------------------- /recipes/ljspeech/hifigan/train_hifigan.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from trainer import Trainer, TrainerArgs 4 | 5 | from TTS.utils.audio import AudioProcessor 6 | from TTS.vocoder.configs import HifiganConfig 7 | from TTS.vocoder.datasets.preprocess import load_wav_data 8 | from TTS.vocoder.models.gan import GAN 9 | 10 | output_path = os.path.dirname(os.path.abspath(__file__)) 11 | 12 | config = HifiganConfig( 13 | batch_size=32, 14 | eval_batch_size=16, 15 | num_loader_workers=4, 16 | num_eval_loader_workers=4, 17 | run_eval=True, 18 | test_delay_epochs=5, 19 | epochs=1000, 20 | seq_len=8192, 21 | pad_short=2000, 22 | use_noise_augment=True, 23 | eval_split_size=10, 24 | print_step=25, 25 | print_eval=False, 26 | mixed_precision=False, 27 | lr_gen=1e-4, 28 | lr_disc=1e-4, 29 | data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"), 30 | output_path=output_path, 31 | ) 32 | 33 | # init audio processor 34 | ap = AudioProcessor(**config.audio.to_dict()) 35 | 36 | # load training samples 37 | eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size) 38 | 39 | # init model 40 | model = GAN(config, ap) 41 | 42 | # init the trainer and 🚀 43 | trainer = Trainer( 44 | TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples 45 | ) 46 | trainer.fit() 47 | -------------------------------------------------------------------------------- /recipes/ljspeech/multiband_melgan/train_multiband_melgan.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from trainer import Trainer, TrainerArgs 4 | 5 | from TTS.utils.audio import AudioProcessor 6 | from TTS.vocoder.configs import MultibandMelganConfig 7 | from TTS.vocoder.datasets.preprocess import load_wav_data 8 | from TTS.vocoder.models.gan import GAN 9 | 10 | output_path = os.path.dirname(os.path.abspath(__file__)) 11 | 12 | config = MultibandMelganConfig( 13 | batch_size=32, 14 | eval_batch_size=16, 15 | num_loader_workers=4, 16 | num_eval_loader_workers=4, 17 | run_eval=True, 18 | test_delay_epochs=5, 19 | epochs=1000, 20 | seq_len=8192, 21 | pad_short=2000, 22 | use_noise_augment=True, 23 | eval_split_size=10, 24 | print_step=25, 25 | print_eval=False, 26 | mixed_precision=False, 27 | lr_gen=1e-4, 28 | lr_disc=1e-4, 29 | data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"), 30 | output_path=output_path, 31 | ) 32 | 33 | # init audio processor 34 | ap = AudioProcessor(**config.audio.to_dict()) 35 | 36 | # load training samples 37 | eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size) 38 | 39 | # init model 40 | model = GAN(config, ap) 41 | 42 | # init the trainer and 🚀 43 | trainer = Trainer( 44 | TrainerArgs(), config, output_path, model=model, 
train_samples=train_samples, eval_samples=eval_samples 45 | ) 46 | trainer.fit() 47 | -------------------------------------------------------------------------------- /recipes/ljspeech/overflow/lj_parameters.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhnh2002/XTTSv2-Finetuning-for-New-Languages/8e59ec37e8c9cf9343503ca7430f5b23729f6389/recipes/ljspeech/overflow/lj_parameters.pt -------------------------------------------------------------------------------- /recipes/ljspeech/overflow/train_overflow.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from trainer import Trainer, TrainerArgs 4 | 5 | from TTS.config.shared_configs import BaseAudioConfig 6 | from TTS.tts.configs.overflow_config import OverflowConfig 7 | from TTS.tts.configs.shared_configs import BaseDatasetConfig 8 | from TTS.tts.datasets import load_tts_samples 9 | from TTS.tts.models.overflow import Overflow 10 | from TTS.tts.utils.text.tokenizer import TTSTokenizer 11 | from TTS.utils.audio import AudioProcessor 12 | 13 | output_path = os.path.dirname(os.path.abspath(__file__)) 14 | 15 | # init configs 16 | dataset_config = BaseDatasetConfig( 17 | formatter="ljspeech", meta_file_train="metadata.csv", path=os.path.join("data", "LJSpeech-1.1/") 18 | ) 19 | 20 | audio_config = BaseAudioConfig( 21 | sample_rate=22050, 22 | do_trim_silence=True, 23 | trim_db=60.0, 24 | signal_norm=False, 25 | mel_fmin=0.0, 26 | mel_fmax=8000, 27 | spec_gain=1.0, 28 | log_func="np.log", 29 | ref_level_db=20, 30 | preemphasis=0.0, 31 | ) 32 | 33 | config = OverflowConfig( # This is the config that is saved for the future use 34 | run_name="overflow_ljspeech", 35 | audio=audio_config, 36 | batch_size=30, 37 | eval_batch_size=16, 38 | num_loader_workers=4, 39 | num_eval_loader_workers=4, 40 | run_eval=True, 41 | test_delay_epochs=-1, 42 | epochs=1000, 43 | text_cleaner="phoneme_cleaners", 44 | use_phonemes=True, 45 | phoneme_language="en-us", 46 | phoneme_cache_path=os.path.join(output_path, "phoneme_cache"), 47 | precompute_num_workers=8, 48 | mel_statistics_parameter_path=os.path.join(output_path, "lj_parameters.pt"), 49 | force_generate_statistics=False, 50 | print_step=1, 51 | print_eval=True, 52 | mixed_precision=True, 53 | output_path=output_path, 54 | datasets=[dataset_config], 55 | ) 56 | 57 | # INITIALIZE THE AUDIO PROCESSOR 58 | # Audio processor is used for feature extraction and audio I/O. 59 | # It mainly serves to the dataloader and the training loggers. 60 | ap = AudioProcessor.init_from_config(config) 61 | 62 | # INITIALIZE THE TOKENIZER 63 | # Tokenizer is used to convert text to sequences of token IDs. 64 | # If characters are not defined in the config, default characters are passed to the config 65 | tokenizer, config = TTSTokenizer.init_from_config(config) 66 | 67 | # LOAD DATA SAMPLES 68 | # Each sample is a list of ```[text, audio_file_path, speaker_name]``` 69 | # You can define your custom sample loader returning the list of samples. 70 | # Or define your custom formatter and pass it to the `load_tts_samples`. 71 | # Check `TTS.tts.datasets.load_tts_samples` for more details. 
72 | train_samples, eval_samples = load_tts_samples( 73 | dataset_config, 74 | eval_split=True, 75 | eval_split_max_size=config.eval_split_max_size, 76 | eval_split_size=config.eval_split_size, 77 | ) 78 | 79 | # INITIALIZE THE MODEL 80 | # Models take a config object and a speaker manager as input 81 | # Config defines the details of the model like the number of layers, the size of the embedding, etc. 82 | # Speaker manager is used by multi-speaker models. 83 | model = Overflow(config, ap, tokenizer) 84 | 85 | 86 | # init the trainer and 🚀 87 | trainer = Trainer( 88 | TrainerArgs(), 89 | config, 90 | output_path, 91 | model=model, 92 | train_samples=train_samples, 93 | eval_samples=eval_samples, 94 | gpu=1, 95 | ) 96 | trainer.fit() 97 | -------------------------------------------------------------------------------- /recipes/ljspeech/speedy_speech/train_speedy_speech.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from trainer import Trainer, TrainerArgs 4 | 5 | from TTS.config import BaseAudioConfig, BaseDatasetConfig 6 | from TTS.tts.configs.speedy_speech_config import SpeedySpeechConfig 7 | from TTS.tts.datasets import load_tts_samples 8 | from TTS.tts.models.forward_tts import ForwardTTS 9 | from TTS.tts.utils.text.tokenizer import TTSTokenizer 10 | from TTS.utils.audio import AudioProcessor 11 | 12 | output_path = os.path.dirname(os.path.abspath(__file__)) 13 | dataset_config = BaseDatasetConfig( 14 | formatter="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/") 15 | ) 16 | 17 | audio_config = BaseAudioConfig( 18 | sample_rate=22050, 19 | do_trim_silence=True, 20 | trim_db=60.0, 21 | signal_norm=False, 22 | mel_fmin=0.0, 23 | mel_fmax=8000, 24 | spec_gain=1.0, 25 | log_func="np.log", 26 | ref_level_db=20, 27 | preemphasis=0.0, 28 | ) 29 | 30 | config = SpeedySpeechConfig( 31 | run_name="speedy_speech_ljspeech", 32 | audio=audio_config, 33 | batch_size=32, 34 | eval_batch_size=16, 35 | num_loader_workers=4, 36 | num_eval_loader_workers=4, 37 | compute_input_seq_cache=True, 38 | run_eval=True, 39 | test_delay_epochs=-1, 40 | epochs=1000, 41 | text_cleaner="english_cleaners", 42 | use_phonemes=True, 43 | phoneme_language="en-us", 44 | phoneme_cache_path=os.path.join(output_path, "phoneme_cache"), 45 | precompute_num_workers=4, 46 | print_step=50, 47 | print_eval=False, 48 | mixed_precision=False, 49 | max_seq_len=500000, 50 | output_path=output_path, 51 | datasets=[dataset_config], 52 | ) 53 | 54 | # INITIALIZE THE AUDIO PROCESSOR 55 | # Audio processor is used for feature extraction and audio I/O. 56 | # It mainly serves to the dataloader and the training loggers. 57 | ap = AudioProcessor.init_from_config(config) 58 | 59 | # INITIALIZE THE TOKENIZER 60 | # Tokenizer is used to convert text to sequences of token IDs. 61 | # If characters are not defined in the config, default characters are passed to the config 62 | tokenizer, config = TTSTokenizer.init_from_config(config) 63 | 64 | # LOAD DATA SAMPLES 65 | # Each sample is a list of ```[text, audio_file_path, speaker_name]``` 66 | # You can define your custom sample loader returning the list of samples. 67 | # Or define your custom formatter and pass it to the `load_tts_samples`. 68 | # Check `TTS.tts.datasets.load_tts_samples` for more details. 
69 | train_samples, eval_samples = load_tts_samples( 70 | dataset_config, 71 | eval_split=True, 72 | eval_split_max_size=config.eval_split_max_size, 73 | eval_split_size=config.eval_split_size, 74 | ) 75 | 76 | # init model 77 | model = ForwardTTS(config, ap, tokenizer) 78 | 79 | # INITIALIZE THE TRAINER 80 | # Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training, 81 | # distributed training, etc. 82 | trainer = Trainer( 83 | TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples 84 | ) 85 | 86 | # AND... 3,2,1... 🚀 87 | trainer.fit() 88 | -------------------------------------------------------------------------------- /recipes/ljspeech/univnet/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from trainer import Trainer, TrainerArgs 4 | 5 | from TTS.utils.audio import AudioProcessor 6 | from TTS.vocoder.configs import UnivnetConfig 7 | from TTS.vocoder.datasets.preprocess import load_wav_data 8 | from TTS.vocoder.models.gan import GAN 9 | 10 | output_path = os.path.dirname(os.path.abspath(__file__)) 11 | config = UnivnetConfig( 12 | batch_size=64, 13 | eval_batch_size=16, 14 | num_loader_workers=4, 15 | num_eval_loader_workers=4, 16 | run_eval=True, 17 | test_delay_epochs=-1, 18 | epochs=1000, 19 | seq_len=8192, 20 | pad_short=2000, 21 | use_noise_augment=True, 22 | eval_split_size=10, 23 | print_step=25, 24 | print_eval=False, 25 | mixed_precision=False, 26 | lr_gen=1e-4, 27 | lr_disc=1e-4, 28 | data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"), 29 | output_path=output_path, 30 | ) 31 | 32 | # init audio processor 33 | ap = AudioProcessor(**config.audio.to_dict()) 34 | 35 | # load training samples 36 | eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size) 37 | 38 | # init model 39 | model = GAN(config, ap) 40 | 41 | # init the trainer and 🚀 42 | trainer = Trainer( 43 | TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples 44 | ) 45 | trainer.fit() 46 | -------------------------------------------------------------------------------- /recipes/ljspeech/vits_tts/train_vits.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from trainer import Trainer, TrainerArgs 4 | 5 | from TTS.tts.configs.shared_configs import BaseDatasetConfig 6 | from TTS.tts.configs.vits_config import VitsConfig 7 | from TTS.tts.datasets import load_tts_samples 8 | from TTS.tts.models.vits import Vits, VitsAudioConfig 9 | from TTS.tts.utils.text.tokenizer import TTSTokenizer 10 | from TTS.utils.audio import AudioProcessor 11 | 12 | output_path = os.path.dirname(os.path.abspath(__file__)) 13 | dataset_config = BaseDatasetConfig( 14 | formatter="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/") 15 | ) 16 | audio_config = VitsAudioConfig( 17 | sample_rate=22050, win_length=1024, hop_length=256, num_mels=80, mel_fmin=0, mel_fmax=None 18 | ) 19 | 20 | config = VitsConfig( 21 | audio=audio_config, 22 | run_name="vits_ljspeech", 23 | batch_size=32, 24 | eval_batch_size=16, 25 | batch_group_size=5, 26 | num_loader_workers=8, 27 | num_eval_loader_workers=4, 28 | run_eval=True, 29 | test_delay_epochs=-1, 30 | epochs=1000, 31 | text_cleaner="english_cleaners", 32 | use_phonemes=True, 33 | phoneme_language="en-us", 34 | phoneme_cache_path=os.path.join(output_path, 
"phoneme_cache"), 35 | compute_input_seq_cache=True, 36 | print_step=25, 37 | print_eval=True, 38 | mixed_precision=True, 39 | output_path=output_path, 40 | datasets=[dataset_config], 41 | cudnn_benchmark=False, 42 | ) 43 | 44 | # INITIALIZE THE AUDIO PROCESSOR 45 | # Audio processor is used for feature extraction and audio I/O. 46 | # It mainly serves to the dataloader and the training loggers. 47 | ap = AudioProcessor.init_from_config(config) 48 | 49 | # INITIALIZE THE TOKENIZER 50 | # Tokenizer is used to convert text to sequences of token IDs. 51 | # config is updated with the default characters if not defined in the config. 52 | tokenizer, config = TTSTokenizer.init_from_config(config) 53 | 54 | # LOAD DATA SAMPLES 55 | # Each sample is a list of ```[text, audio_file_path, speaker_name]``` 56 | # You can define your custom sample loader returning the list of samples. 57 | # Or define your custom formatter and pass it to the `load_tts_samples`. 58 | # Check `TTS.tts.datasets.load_tts_samples` for more details. 59 | train_samples, eval_samples = load_tts_samples( 60 | dataset_config, 61 | eval_split=True, 62 | eval_split_max_size=config.eval_split_max_size, 63 | eval_split_size=config.eval_split_size, 64 | ) 65 | 66 | # init model 67 | model = Vits(config, ap, tokenizer, speaker_manager=None) 68 | 69 | # init the trainer and 🚀 70 | trainer = Trainer( 71 | TrainerArgs(), 72 | config, 73 | output_path, 74 | model=model, 75 | train_samples=train_samples, 76 | eval_samples=eval_samples, 77 | ) 78 | trainer.fit() 79 | -------------------------------------------------------------------------------- /recipes/ljspeech/wavegrad/train_wavegrad.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from trainer import Trainer, TrainerArgs 4 | 5 | from TTS.utils.audio import AudioProcessor 6 | from TTS.vocoder.configs import WavegradConfig 7 | from TTS.vocoder.datasets.preprocess import load_wav_data 8 | from TTS.vocoder.models.wavegrad import Wavegrad 9 | 10 | output_path = os.path.dirname(os.path.abspath(__file__)) 11 | config = WavegradConfig( 12 | batch_size=32, 13 | eval_batch_size=16, 14 | num_loader_workers=4, 15 | num_eval_loader_workers=4, 16 | run_eval=True, 17 | test_delay_epochs=-1, 18 | epochs=1000, 19 | seq_len=6144, 20 | pad_short=2000, 21 | use_noise_augment=True, 22 | eval_split_size=50, 23 | print_step=50, 24 | print_eval=True, 25 | mixed_precision=False, 26 | data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"), 27 | output_path=output_path, 28 | ) 29 | 30 | # init audio processor 31 | ap = AudioProcessor(**config.audio.to_dict()) 32 | 33 | # load training samples 34 | eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size) 35 | 36 | # init model 37 | model = Wavegrad(config) 38 | 39 | # init the trainer and 🚀 40 | trainer = Trainer( 41 | TrainerArgs(), 42 | config, 43 | output_path, 44 | model=model, 45 | train_samples=train_samples, 46 | eval_samples=eval_samples, 47 | training_assets={"audio_processor": ap}, 48 | ) 49 | trainer.fit() 50 | -------------------------------------------------------------------------------- /recipes/ljspeech/wavernn/train_wavernn.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from trainer import Trainer, TrainerArgs 4 | 5 | from TTS.utils.audio import AudioProcessor 6 | from TTS.vocoder.configs import WavernnConfig 7 | from TTS.vocoder.datasets.preprocess import load_wav_data 8 | from 
TTS.vocoder.models.wavernn import Wavernn 9 | 10 | output_path = os.path.dirname(os.path.abspath(__file__)) 11 | config = WavernnConfig( 12 | batch_size=64, 13 | eval_batch_size=16, 14 | num_loader_workers=4, 15 | num_eval_loader_workers=4, 16 | run_eval=True, 17 | test_delay_epochs=-1, 18 | epochs=10000, 19 | seq_len=1280, 20 | pad_short=2000, 21 | use_noise_augment=False, 22 | eval_split_size=10, 23 | print_step=25, 24 | print_eval=True, 25 | mixed_precision=False, 26 | lr=1e-4, 27 | grad_clip=4, 28 | data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"), 29 | output_path=output_path, 30 | ) 31 | 32 | # init audio processor 33 | ap = AudioProcessor(**config.audio.to_dict()) 34 | 35 | # load training samples 36 | eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size) 37 | 38 | # init model 39 | model = Wavernn(config) 40 | 41 | # init the trainer and 🚀 42 | trainer = Trainer( 43 | TrainerArgs(), 44 | config, 45 | output_path, 46 | model=model, 47 | train_samples=train_samples, 48 | eval_samples=eval_samples, 49 | training_assets={"audio_processor": ap}, 50 | ) 51 | trainer.fit() 52 | -------------------------------------------------------------------------------- /recipes/thorsten_DE/README.md: -------------------------------------------------------------------------------- 1 | # 🐸💬 TTS Thorsten Recipes 2 | 3 | For running the recipes you need the [Thorsten-Voice](https://github.com/thorstenMueller/Thorsten-Voice) dataset. 4 | 5 | You can download it manually from [the official website](https://www.thorsten-voice.de/) or use ```download_thorsten_DE.sh```. Alternatively, running any of the **train_modelX.py** scripts will download the dataset if it is not already present. 6 | 7 | Then, go to your desired model folder and run the training. 8 | 9 | Running Python files. (Choose the desired GPU ID for your run and set ```CUDA_VISIBLE_DEVICES```) 10 | ```terminal 11 | CUDA_VISIBLE_DEVICES="0" python train_modelX.py 12 | ``` 13 | 14 | 💡 Note that these runs are just templates to help you start training your first model. They are not optimized for the best 15 | result. Double-check the configurations and feel free to share your experiments to find better parameters together 💪.
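For instance, the HiFi-GAN recipe (one of the model folders here; the others follow the same pattern) can be launched roughly like this, and it fetches the `thorsten-de` data on the first run if it is missing:

```terminal
cd recipes/thorsten_DE/hifigan
CUDA_VISIBLE_DEVICES="0" python train_hifigan.py
```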
16 | -------------------------------------------------------------------------------- /recipes/thorsten_DE/align_tts/train_aligntts.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from trainer import Trainer, TrainerArgs 4 | 5 | from TTS.tts.configs.align_tts_config import AlignTTSConfig 6 | from TTS.tts.configs.shared_configs import BaseDatasetConfig 7 | from TTS.tts.datasets import load_tts_samples 8 | from TTS.tts.models.align_tts import AlignTTS 9 | from TTS.tts.utils.text.tokenizer import TTSTokenizer 10 | from TTS.utils.audio import AudioProcessor 11 | from TTS.utils.downloaders import download_thorsten_de 12 | 13 | output_path = os.path.dirname(os.path.abspath(__file__)) 14 | 15 | # init configs 16 | dataset_config = BaseDatasetConfig( 17 | formatter="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/") 18 | ) 19 | 20 | # download dataset if not already present 21 | if not os.path.exists(dataset_config.path): 22 | print("Downloading dataset") 23 | download_thorsten_de(os.path.split(os.path.abspath(dataset_config.path))[0]) 24 | 25 | config = AlignTTSConfig( 26 | batch_size=32, 27 | eval_batch_size=16, 28 | num_loader_workers=4, 29 | num_eval_loader_workers=4, 30 | run_eval=True, 31 | test_delay_epochs=-1, 32 | epochs=1000, 33 | text_cleaner="phoneme_cleaners", 34 | use_phonemes=False, 35 | phoneme_language="de", 36 | phoneme_cache_path=os.path.join(output_path, "phoneme_cache"), 37 | print_step=25, 38 | print_eval=True, 39 | mixed_precision=False, 40 | test_sentences=[ 41 | "Es hat mich viel Zeit gekostet ein Stimme zu entwickeln, jetzt wo ich sie habe werde ich nicht mehr schweigen.", 42 | "Sei eine Stimme, kein Echo.", 43 | "Es tut mir Leid David. Das kann ich leider nicht machen.", 44 | "Dieser Kuchen ist großartig. Er ist so lecker und feucht.", 45 | "Vor dem 22. November 1963.", 46 | ], 47 | output_path=output_path, 48 | datasets=[dataset_config], 49 | ) 50 | 51 | # INITIALIZE THE AUDIO PROCESSOR 52 | # Audio processor is used for feature extraction and audio I/O. 53 | # It mainly serves to the dataloader and the training loggers. 54 | ap = AudioProcessor.init_from_config(config) 55 | 56 | # INITIALIZE THE TOKENIZER 57 | # Tokenizer is used to convert text to sequences of token IDs. 58 | # If characters are not defined in the config, default characters are passed to the config 59 | tokenizer, config = TTSTokenizer.init_from_config(config) 60 | 61 | # LOAD DATA SAMPLES 62 | # Each sample is a list of ```[text, audio_file_path, speaker_name]``` 63 | # You can define your custom sample loader returning the list of samples. 64 | # Or define your custom formatter and pass it to the `load_tts_samples`. 65 | # Check `TTS.tts.datasets.load_tts_samples` for more details. 66 | train_samples, eval_samples = load_tts_samples( 67 | dataset_config, 68 | eval_split=True, 69 | eval_split_max_size=config.eval_split_max_size, 70 | eval_split_size=config.eval_split_size, 71 | ) 72 | 73 | # init model 74 | model = AlignTTS(config, ap, tokenizer) 75 | 76 | # INITIALIZE THE TRAINER 77 | # Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training, 78 | # distributed training, etc. 79 | trainer = Trainer( 80 | TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples 81 | ) 82 | 83 | # AND... 3,2,1... 
🚀 84 | trainer.fit() 85 | -------------------------------------------------------------------------------- /recipes/thorsten_DE/download_thorsten_DE.sh: -------------------------------------------------------------------------------- 1 | # create venv 2 | python3 -m venv env 3 | source .env/bin/activate 4 | pip install pip --upgrade 5 | 6 | # download Thorsten_DE dataset 7 | pip install gdown 8 | gdown --id 1yKJM1LAOQpRVojKunD9r8WN_p5KzBxjc -O dataset.tgz 9 | tar -xzf dataset.tgz 10 | 11 | # create train-val splits 12 | shuf LJSpeech-1.1/metadata.csv > LJSpeech-1.1/metadata_shuf.csv 13 | head -n 20668 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_train.csv 14 | tail -n 2000 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_val.csv 15 | 16 | # rename dataset and remove archive 17 | mv LJSpeech-1.1 thorsten-de 18 | rm dataset.tgz 19 | 20 | # destry venv 21 | rm -rf env 22 | -------------------------------------------------------------------------------- /recipes/thorsten_DE/hifigan/train_hifigan.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from trainer import Trainer, TrainerArgs 4 | 5 | from TTS.utils.audio import AudioProcessor 6 | from TTS.utils.downloaders import download_thorsten_de 7 | from TTS.vocoder.configs import HifiganConfig 8 | from TTS.vocoder.datasets.preprocess import load_wav_data 9 | from TTS.vocoder.models.gan import GAN 10 | 11 | output_path = os.path.dirname(os.path.abspath(__file__)) 12 | 13 | config = HifiganConfig( 14 | batch_size=32, 15 | eval_batch_size=16, 16 | num_loader_workers=4, 17 | num_eval_loader_workers=4, 18 | run_eval=True, 19 | test_delay_epochs=5, 20 | epochs=1000, 21 | seq_len=8192, 22 | pad_short=2000, 23 | use_noise_augment=True, 24 | eval_split_size=10, 25 | print_step=25, 26 | print_eval=False, 27 | mixed_precision=False, 28 | lr_gen=1e-4, 29 | lr_disc=1e-4, 30 | data_path=os.path.join(output_path, "../thorsten-de/wavs/"), 31 | output_path=output_path, 32 | ) 33 | 34 | # download dataset if not already present 35 | if not os.path.exists(config.data_path): 36 | print("Downloading dataset") 37 | download_path = os.path.abspath(os.path.join(os.path.abspath(config.data_path), "../../")) 38 | download_thorsten_de(download_path) 39 | 40 | # init audio processor 41 | ap = AudioProcessor(**config.audio.to_dict()) 42 | 43 | # load training samples 44 | eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size) 45 | 46 | # init model 47 | model = GAN(config, ap) 48 | 49 | # init the trainer and 🚀 50 | trainer = Trainer( 51 | TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples 52 | ) 53 | trainer.fit() 54 | -------------------------------------------------------------------------------- /recipes/thorsten_DE/multiband_melgan/train_multiband_melgan.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from trainer import Trainer, TrainerArgs 4 | 5 | from TTS.utils.audio import AudioProcessor 6 | from TTS.utils.downloaders import download_thorsten_de 7 | from TTS.vocoder.configs import MultibandMelganConfig 8 | from TTS.vocoder.datasets.preprocess import load_wav_data 9 | from TTS.vocoder.models.gan import GAN 10 | 11 | output_path = os.path.dirname(os.path.abspath(__file__)) 12 | 13 | config = MultibandMelganConfig( 14 | batch_size=32, 15 | eval_batch_size=16, 16 | num_loader_workers=4, 17 | num_eval_loader_workers=4, 18 | run_eval=True, 19 | 
test_delay_epochs=5, 20 | epochs=1000, 21 | seq_len=8192, 22 | pad_short=2000, 23 | use_noise_augment=True, 24 | eval_split_size=10, 25 | print_step=25, 26 | print_eval=False, 27 | mixed_precision=False, 28 | lr_gen=1e-4, 29 | lr_disc=1e-4, 30 | data_path=os.path.join(output_path, "../thorsten-de/wavs/"), 31 | output_path=output_path, 32 | ) 33 | 34 | # download dataset if not already present 35 | if not os.path.exists(config.data_path): 36 | print("Downloading dataset") 37 | download_path = os.path.abspath(os.path.join(os.path.abspath(config.data_path), "../../")) 38 | download_thorsten_de(download_path) 39 | 40 | # init audio processor 41 | ap = AudioProcessor(**config.audio.to_dict()) 42 | 43 | # load training samples 44 | eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size) 45 | 46 | # init model 47 | model = GAN(config, ap) 48 | 49 | # init the trainer and 🚀 50 | trainer = Trainer( 51 | TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples 52 | ) 53 | trainer.fit() 54 | -------------------------------------------------------------------------------- /recipes/thorsten_DE/univnet/train_univnet.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from trainer import Trainer, TrainerArgs 4 | 5 | from TTS.utils.audio import AudioProcessor 6 | from TTS.utils.downloaders import download_thorsten_de 7 | from TTS.vocoder.configs import UnivnetConfig 8 | from TTS.vocoder.datasets.preprocess import load_wav_data 9 | from TTS.vocoder.models.gan import GAN 10 | 11 | output_path = os.path.dirname(os.path.abspath(__file__)) 12 | config = UnivnetConfig( 13 | batch_size=64, 14 | eval_batch_size=16, 15 | num_loader_workers=4, 16 | num_eval_loader_workers=4, 17 | run_eval=True, 18 | test_delay_epochs=-1, 19 | epochs=1000, 20 | seq_len=8192, 21 | pad_short=2000, 22 | use_noise_augment=True, 23 | eval_split_size=10, 24 | print_step=25, 25 | print_eval=False, 26 | mixed_precision=False, 27 | lr_gen=1e-4, 28 | lr_disc=1e-4, 29 | data_path=os.path.join(output_path, "../thorsten-de/wavs/"), 30 | output_path=output_path, 31 | ) 32 | 33 | # download dataset if not already present 34 | if not os.path.exists(config.data_path): 35 | print("Downloading dataset") 36 | download_path = os.path.abspath(os.path.join(os.path.abspath(config.data_path), "../../")) 37 | download_thorsten_de(download_path) 38 | 39 | # init audio processor 40 | ap = AudioProcessor(**config.audio.to_dict()) 41 | 42 | # load training samples 43 | eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size) 44 | 45 | # init model 46 | model = GAN(config, ap) 47 | 48 | # init the trainer and 🚀 49 | trainer = Trainer( 50 | TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples 51 | ) 52 | trainer.fit() 53 | -------------------------------------------------------------------------------- /recipes/thorsten_DE/wavegrad/train_wavegrad.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from trainer import Trainer, TrainerArgs 4 | 5 | from TTS.utils.audio import AudioProcessor 6 | from TTS.utils.downloaders import download_thorsten_de 7 | from TTS.vocoder.configs import WavegradConfig 8 | from TTS.vocoder.datasets.preprocess import load_wav_data 9 | from TTS.vocoder.models.wavegrad import Wavegrad 10 | 11 | output_path = os.path.dirname(os.path.abspath(__file__)) 12 | config = 
WavegradConfig( 13 | batch_size=32, 14 | eval_batch_size=16, 15 | num_loader_workers=4, 16 | num_eval_loader_workers=4, 17 | run_eval=True, 18 | test_delay_epochs=-1, 19 | epochs=1000, 20 | seq_len=6144, 21 | pad_short=2000, 22 | use_noise_augment=True, 23 | eval_split_size=50, 24 | print_step=50, 25 | print_eval=True, 26 | mixed_precision=False, 27 | data_path=os.path.join(output_path, "../thorsten-de/wavs/"), 28 | output_path=output_path, 29 | ) 30 | 31 | # download dataset if not already present 32 | if not os.path.exists(config.data_path): 33 | print("Downloading dataset") 34 | download_path = os.path.abspath(os.path.join(os.path.abspath(config.data_path), "../../")) 35 | download_thorsten_de(download_path) 36 | 37 | # init audio processor 38 | ap = AudioProcessor(**config.audio.to_dict()) 39 | 40 | # load training samples 41 | eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size) 42 | 43 | # init model 44 | model = Wavegrad(config) 45 | 46 | # init the trainer and 🚀 47 | trainer = Trainer( 48 | TrainerArgs(), 49 | config, 50 | output_path, 51 | model=model, 52 | train_samples=train_samples, 53 | eval_samples=eval_samples, 54 | training_assets={"audio_processor": ap}, 55 | ) 56 | trainer.fit() 57 | -------------------------------------------------------------------------------- /recipes/thorsten_DE/wavernn/train_wavernn.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from trainer import Trainer, TrainerArgs 4 | 5 | from TTS.utils.audio import AudioProcessor 6 | from TTS.utils.downloaders import download_thorsten_de 7 | from TTS.vocoder.configs import WavernnConfig 8 | from TTS.vocoder.datasets.preprocess import load_wav_data 9 | from TTS.vocoder.models.wavernn import Wavernn 10 | 11 | output_path = os.path.dirname(os.path.abspath(__file__)) 12 | config = WavernnConfig( 13 | batch_size=64, 14 | eval_batch_size=16, 15 | num_loader_workers=4, 16 | num_eval_loader_workers=4, 17 | run_eval=True, 18 | test_delay_epochs=-1, 19 | epochs=10000, 20 | seq_len=1280, 21 | pad_short=2000, 22 | use_noise_augment=False, 23 | eval_split_size=10, 24 | print_step=25, 25 | print_eval=True, 26 | mixed_precision=False, 27 | lr=1e-4, 28 | grad_clip=4, 29 | data_path=os.path.join(output_path, "../thorsten-de/wavs/"), 30 | output_path=output_path, 31 | ) 32 | 33 | # download dataset if not already present 34 | if not os.path.exists(config.data_path): 35 | print("Downloading dataset") 36 | download_path = os.path.abspath(os.path.join(os.path.abspath(config.data_path), "../../")) 37 | download_thorsten_de(download_path) 38 | 39 | # init audio processor 40 | ap = AudioProcessor(**config.audio.to_dict()) 41 | 42 | # load training samples 43 | eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size) 44 | 45 | # init model 46 | model = Wavernn(config) 47 | 48 | # init the trainer and 🚀 49 | trainer = Trainer( 50 | TrainerArgs(), 51 | config, 52 | output_path, 53 | model=model, 54 | train_samples=train_samples, 55 | eval_samples=eval_samples, 56 | training_assets={"audio_processor": ap}, 57 | ) 58 | trainer.fit() 59 | -------------------------------------------------------------------------------- /recipes/vctk/delightful_tts/train_delightful_tts.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from trainer import Trainer, TrainerArgs 4 | 5 | from TTS.config.shared_configs import BaseDatasetConfig 6 | from 
TTS.tts.configs.delightful_tts_config import DelightfulTtsAudioConfig, DelightfulTTSConfig 7 | from TTS.tts.datasets import load_tts_samples 8 | from TTS.tts.models.delightful_tts import DelightfulTTS, DelightfulTtsArgs, VocoderConfig 9 | from TTS.tts.utils.speakers import SpeakerManager 10 | from TTS.tts.utils.text.tokenizer import TTSTokenizer 11 | from TTS.utils.audio.processor import AudioProcessor 12 | 13 | data_path = "/raid/datasets/vctk_v092_48khz_removed_silence_silero_vad" 14 | output_path = os.path.dirname(os.path.abspath(__file__)) 15 | 16 | 17 | dataset_config = BaseDatasetConfig( 18 | dataset_name="vctk", formatter="vctk", meta_file_train="", path=data_path, language="en-us" 19 | ) 20 | 21 | audio_config = DelightfulTtsAudioConfig() 22 | 23 | model_args = DelightfulTtsArgs() 24 | 25 | vocoder_config = VocoderConfig() 26 | 27 | something_tts_config = DelightfulTTSConfig( 28 | run_name="delightful_tts_vctk", 29 | run_description="Train like in delightful tts paper.", 30 | model_args=model_args, 31 | audio=audio_config, 32 | vocoder=vocoder_config, 33 | batch_size=32, 34 | eval_batch_size=16, 35 | num_loader_workers=10, 36 | num_eval_loader_workers=10, 37 | precompute_num_workers=40, 38 | compute_input_seq_cache=True, 39 | compute_f0=True, 40 | f0_cache_path=os.path.join(output_path, "f0_cache"), 41 | run_eval=True, 42 | test_delay_epochs=-1, 43 | epochs=1000, 44 | text_cleaner="english_cleaners", 45 | use_phonemes=True, 46 | phoneme_language="en-us", 47 | phoneme_cache_path=os.path.join(output_path, "phoneme_cache"), 48 | print_step=50, 49 | print_eval=False, 50 | mixed_precision=True, 51 | output_path=output_path, 52 | datasets=[dataset_config], 53 | start_by_longest=True, 54 | binary_align_loss_alpha=0.0, 55 | use_attn_priors=False, 56 | max_text_len=60, 57 | steps_to_start_discriminator=10000, 58 | ) 59 | 60 | tokenizer, config = TTSTokenizer.init_from_config(something_tts_config) 61 | 62 | ap = AudioProcessor.init_from_config(config) 63 | 64 | 65 | train_samples, eval_samples = load_tts_samples( 66 | dataset_config, 67 | eval_split=True, 68 | eval_split_max_size=config.eval_split_max_size, 69 | eval_split_size=config.eval_split_size, 70 | ) 71 | 72 | 73 | speaker_manager = SpeakerManager() 74 | speaker_manager.set_ids_from_data(train_samples + eval_samples, parse_key="speaker_name") 75 | config.model_args.num_speakers = speaker_manager.num_speakers 76 | 77 | 78 | model = DelightfulTTS(ap=ap, config=config, tokenizer=tokenizer, speaker_manager=speaker_manager, emotion_manager=None) 79 | 80 | trainer = Trainer( 81 | TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples 82 | ) 83 | 84 | trainer.fit() 85 | -------------------------------------------------------------------------------- /recipes/vctk/download_vctk.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # take the scripts's parent's directory to prefix all the output paths. 
3 | RUN_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" 4 | echo $RUN_DIR 5 | # download VCTK dataset 6 | wget https://datashare.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip -O VCTK-Corpus-0.92.zip 7 | # extract 8 | mkdir VCTK 9 | unzip VCTK-Corpus-0.92 -d VCTK 10 | # create train-val splits 11 | mv VCTK $RUN_DIR/recipes/vctk/ 12 | rm VCTK-Corpus-0.92.zip 13 | -------------------------------------------------------------------------------- /recipes/vctk/vits/train_vits.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from trainer import Trainer, TrainerArgs 4 | 5 | from TTS.tts.configs.shared_configs import BaseDatasetConfig 6 | from TTS.tts.configs.vits_config import VitsConfig 7 | from TTS.tts.datasets import load_tts_samples 8 | from TTS.tts.models.vits import Vits, VitsArgs, VitsAudioConfig 9 | from TTS.tts.utils.speakers import SpeakerManager 10 | from TTS.tts.utils.text.tokenizer import TTSTokenizer 11 | from TTS.utils.audio import AudioProcessor 12 | 13 | output_path = os.path.dirname(os.path.abspath(__file__)) 14 | dataset_config = BaseDatasetConfig( 15 | formatter="vctk", meta_file_train="", language="en-us", path=os.path.join(output_path, "../VCTK/") 16 | ) 17 | 18 | 19 | audio_config = VitsAudioConfig( 20 | sample_rate=22050, win_length=1024, hop_length=256, num_mels=80, mel_fmin=0, mel_fmax=None 21 | ) 22 | 23 | vitsArgs = VitsArgs( 24 | use_speaker_embedding=True, 25 | ) 26 | 27 | config = VitsConfig( 28 | model_args=vitsArgs, 29 | audio=audio_config, 30 | run_name="vits_vctk", 31 | batch_size=32, 32 | eval_batch_size=16, 33 | batch_group_size=5, 34 | num_loader_workers=4, 35 | num_eval_loader_workers=4, 36 | run_eval=True, 37 | test_delay_epochs=-1, 38 | epochs=1000, 39 | text_cleaner="english_cleaners", 40 | use_phonemes=True, 41 | phoneme_language="en", 42 | phoneme_cache_path=os.path.join(output_path, "phoneme_cache"), 43 | compute_input_seq_cache=True, 44 | print_step=25, 45 | print_eval=False, 46 | mixed_precision=True, 47 | max_text_len=325, # change this if you have a larger VRAM than 16GB 48 | output_path=output_path, 49 | datasets=[dataset_config], 50 | cudnn_benchmark=False, 51 | ) 52 | 53 | # INITIALIZE THE AUDIO PROCESSOR 54 | # Audio processor is used for feature extraction and audio I/O. 55 | # It mainly serves to the dataloader and the training loggers. 56 | ap = AudioProcessor.init_from_config(config) 57 | 58 | # INITIALIZE THE TOKENIZER 59 | # Tokenizer is used to convert text to sequences of token IDs. 60 | # config is updated with the default characters if not defined in the config. 61 | tokenizer, config = TTSTokenizer.init_from_config(config) 62 | 63 | # LOAD DATA SAMPLES 64 | # Each sample is a list of ```[text, audio_file_path, speaker_name]``` 65 | # You can define your custom sample loader returning the list of samples. 66 | # Or define your custom formatter and pass it to the `load_tts_samples`. 67 | # Check `TTS.tts.datasets.load_tts_samples` for more details. 
68 | train_samples, eval_samples = load_tts_samples( 69 | dataset_config, 70 | eval_split=True, 71 | eval_split_max_size=config.eval_split_max_size, 72 | eval_split_size=config.eval_split_size, 73 | ) 74 | 75 | # init speaker manager for multi-speaker training 76 | # it maps speaker-id to speaker-name in the model and data-loader 77 | speaker_manager = SpeakerManager() 78 | speaker_manager.set_ids_from_data(train_samples + eval_samples, parse_key="speaker_name") 79 | config.model_args.num_speakers = speaker_manager.num_speakers 80 | 81 | # init model 82 | model = Vits(config, ap, tokenizer, speaker_manager) 83 | 84 | # init the trainer and 🚀 85 | trainer = Trainer( 86 | TrainerArgs(), 87 | config, 88 | output_path, 89 | model=model, 90 | train_samples=train_samples, 91 | eval_samples=eval_samples, 92 | ) 93 | trainer.fit() 94 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # core deps 2 | numpy==1.22.0;python_version<="3.10" 3 | numpy>=1.24.3;python_version>"3.10" 4 | cython>=0.29.30 5 | scipy>=1.11.2 6 | torch>=2.1 7 | torchaudio 8 | soundfile>=0.12.0 9 | librosa>=0.10.0 10 | scikit-learn>=1.3.0 11 | numba==0.55.1;python_version<"3.9" 12 | numba>=0.57.0;python_version>="3.9" 13 | inflect>=5.6.0 14 | tqdm>=4.64.1 15 | anyascii>=0.3.0 16 | pyyaml>=6.0 17 | fsspec>=2023.6.0 # <= 2023.9.1 makes aux tests fail 18 | aiohttp>=3.8.1 19 | packaging>=23.1 20 | mutagen==1.47.0 21 | # deps for examples 22 | flask>=2.0.1 23 | # deps for inference 24 | pysbd>=0.3.4 25 | # deps for notebooks 26 | umap-learn>=0.5.1 27 | pandas>=1.4,<2.0 28 | # deps for training 29 | matplotlib>=3.7.0 30 | # coqui stack 31 | trainer>=0.0.36 32 | # config management 33 | coqpit>=0.0.16 34 | # chinese g2p deps 35 | jieba 36 | pypinyin 37 | # korean 38 | hangul_romanize 39 | # gruut+supported langs 40 | gruut[de,es,fr]==2.2.3 41 | # deps for korean 42 | jamo 43 | nltk 44 | g2pkk>=0.1.1 45 | # deps for bangla 46 | bangla 47 | bnnumerizer 48 | bnunicodenormalizer 49 | #deps for tortoise 50 | einops>=0.6.0 51 | transformers>=4.45.2 52 | #deps for bark 53 | encodec>=0.1.1 54 | # deps for XTTS 55 | unidecode>=1.3.2 56 | num2words 57 | spacy[ja]>=3 58 | tokenizers==0.20.1 59 | vinorm==2.0.7 60 | underthesea==6.8.4 61 | -------------------------------------------------------------------------------- /train_dvae_xtts.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=1 python train_dvae_xtts.py \ 2 | --output_path=checkpoints/ \ 3 | --train_csv_path=datasets/metadata_train.csv \ 4 | --eval_csv_path=datasets/metadata_eval.csv \ 5 | --language="vi" \ 6 | --num_epochs=5 \ 7 | --batch_size=512 \ 8 | --lr=5e-6 -------------------------------------------------------------------------------- /train_gpt_xtts.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0 python train_gpt_xtts.py \ 2 | --output_path checkpoints/ \ 3 | --metadatas datasets/metadata_train_v2.csv,datasets/metadata_eval_v2.csv,vi large-datasets/metadata_train.csv,large-datasets/metadata_eval.csv,vi \ 4 | --num_epochs 3 \ 5 | --batch_size 8 \ 6 | --grad_acumm 4 \ 7 | --max_text_length 400 \ 8 | --max_audio_length 330750 \ 9 | --weight_decay 1e-2 \ 10 | --lr 5e-6 \ 11 | --save_step 50000 --------------------------------------------------------------------------------
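Note on the `--metadatas` flag in `train_gpt_xtts.sh` above: it appears to take one or more space-separated `train_csv,eval_csv,language` triplets. A minimal single-dataset sketch (the CSV paths are illustrative) would be:

```sh
CUDA_VISIBLE_DEVICES=0 python train_gpt_xtts.py \
    --output_path checkpoints/ \
    --metadatas datasets/metadata_train.csv,datasets/metadata_eval.csv,vi \
    --num_epochs 3 \
    --batch_size 8 \
    --grad_acumm 4 \
    --lr 5e-6
```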