├── .gitignore
├── README.md
├── TTS
├── .models.json
├── VERSION
├── __init__.py
├── api.py
├── bin
│ ├── __init__.py
│ ├── collect_env_info.py
│ ├── compute_attention_masks.py
│ ├── compute_embeddings.py
│ ├── compute_statistics.py
│ ├── eval_encoder.py
│ ├── extract_tts_spectrograms.py
│ ├── find_unique_chars.py
│ ├── find_unique_phonemes.py
│ ├── remove_silence_using_vad.py
│ ├── resample.py
│ ├── synthesize.py
│ ├── train_encoder.py
│ ├── train_tts.py
│ ├── train_vocoder.py
│ └── tune_wavegrad.py
├── config
│ ├── __init__.py
│ └── shared_configs.py
├── cs_api.py
├── demos
│ └── xtts_ft_demo
│ │ ├── requirements.txt
│ │ ├── utils
│ │ │ ├── formatter.py
│ │ │ └── gpt_train.py
│ │ └── xtts_demo.py
├── encoder
│ ├── README.md
│ ├── __init__.py
│ ├── configs
│ │ ├── base_encoder_config.py
│ │ ├── emotion_encoder_config.py
│ │ └── speaker_encoder_config.py
│ ├── dataset.py
│ ├── losses.py
│ ├── models
│ │ ├── base_encoder.py
│ │ ├── lstm.py
│ │ └── resnet.py
│ ├── requirements.txt
│ └── utils
│ │ ├── __init__.py
│ │ ├── generic_utils.py
│ │ ├── prepare_voxceleb.py
│ │ ├── training.py
│ │ └── visual.py
├── model.py
├── server
│ ├── README.md
│ ├── __init__.py
│ ├── conf.json
│ ├── server.py
│ ├── static
│ │ └── coqui-log-green-TTS.png
│ └── templates
│ │ ├── details.html
│ │ └── index.html
├── tts
│ ├── __init__.py
│ ├── configs
│ │ ├── __init__.py
│ │ ├── align_tts_config.py
│ │ ├── bark_config.py
│ │ ├── delightful_tts_config.py
│ │ ├── fast_pitch_config.py
│ │ ├── fast_speech_config.py
│ │ ├── fastspeech2_config.py
│ │ ├── glow_tts_config.py
│ │ ├── neuralhmm_tts_config.py
│ │ ├── overflow_config.py
│ │ ├── shared_configs.py
│ │ ├── speedy_speech_config.py
│ │ ├── tacotron2_config.py
│ │ ├── tacotron_config.py
│ │ ├── tortoise_config.py
│ │ ├── vits_config.py
│ │ └── xtts_config.py
│ ├── datasets
│ │ ├── __init__.py
│ │ ├── dataset.py
│ │ └── formatters.py
│ ├── layers
│ │ ├── __init__.py
│ │ ├── align_tts
│ │ │ ├── __init__.py
│ │ │ ├── duration_predictor.py
│ │ │ └── mdn.py
│ │ ├── bark
│ │ │ ├── __init__.py
│ │ │ ├── hubert
│ │ │ │ ├── __init__.py
│ │ │ │ ├── hubert_manager.py
│ │ │ │ ├── kmeans_hubert.py
│ │ │ │ └── tokenizer.py
│ │ │ ├── inference_funcs.py
│ │ │ ├── load_model.py
│ │ │ ├── model.py
│ │ │ └── model_fine.py
│ │ ├── delightful_tts
│ │ │ ├── __init__.py
│ │ │ ├── acoustic_model.py
│ │ │ ├── conformer.py
│ │ │ ├── conv_layers.py
│ │ │ ├── encoders.py
│ │ │ ├── energy_adaptor.py
│ │ │ ├── kernel_predictor.py
│ │ │ ├── networks.py
│ │ │ ├── phoneme_prosody_predictor.py
│ │ │ ├── pitch_adaptor.py
│ │ │ └── variance_predictor.py
│ │ ├── feed_forward
│ │ │ ├── __init__.py
│ │ │ ├── decoder.py
│ │ │ ├── duration_predictor.py
│ │ │ └── encoder.py
│ │ ├── generic
│ │ │ ├── __init__.py
│ │ │ ├── aligner.py
│ │ │ ├── gated_conv.py
│ │ │ ├── normalization.py
│ │ │ ├── pos_encoding.py
│ │ │ ├── res_conv_bn.py
│ │ │ ├── time_depth_sep_conv.py
│ │ │ ├── transformer.py
│ │ │ └── wavenet.py
│ │ ├── glow_tts
│ │ │ ├── __init__.py
│ │ │ ├── decoder.py
│ │ │ ├── duration_predictor.py
│ │ │ ├── encoder.py
│ │ │ ├── glow.py
│ │ │ └── transformer.py
│ │ ├── losses.py
│ │ ├── overflow
│ │ │ ├── __init__.py
│ │ │ ├── common_layers.py
│ │ │ ├── decoder.py
│ │ │ ├── neural_hmm.py
│ │ │ └── plotting_utils.py
│ │ ├── tacotron
│ │ │ ├── __init__.py
│ │ │ ├── attentions.py
│ │ │ ├── capacitron_layers.py
│ │ │ ├── common_layers.py
│ │ │ ├── gst_layers.py
│ │ │ ├── tacotron.py
│ │ │ └── tacotron2.py
│ │ ├── tortoise
│ │ │ ├── arch_utils.py
│ │ │ ├── audio_utils.py
│ │ │ ├── autoregressive.py
│ │ │ ├── classifier.py
│ │ │ ├── clvp.py
│ │ │ ├── diffusion.py
│ │ │ ├── diffusion_decoder.py
│ │ │ ├── dpm_solver.py
│ │ │ ├── random_latent_generator.py
│ │ │ ├── tokenizer.py
│ │ │ ├── transformer.py
│ │ │ ├── utils.py
│ │ │ ├── vocoder.py
│ │ │ ├── wav2vec_alignment.py
│ │ │ └── xtransformers.py
│ │ ├── vits
│ │ │ ├── discriminator.py
│ │ │ ├── networks.py
│ │ │ ├── stochastic_duration_predictor.py
│ │ │ └── transforms.py
│ │ └── xtts
│ │ │ ├── dvae.py
│ │ │ ├── gpt.py
│ │ │ ├── gpt_inference.py
│ │ │ ├── hifigan_decoder.py
│ │ │ ├── latent_encoder.py
│ │ │ ├── perceiver_encoder.py
│ │ │ ├── stream_generator.py
│ │ │ ├── tokenizer.py
│ │ │ ├── trainer
│ │ │ │ ├── dataset.py
│ │ │ │ └── gpt_trainer.py
│ │ │ └── zh_num2words.py
│ ├── models
│ │ ├── __init__.py
│ │ ├── align_tts.py
│ │ ├── bark.py
│ │ ├── base_tacotron.py
│ │ ├── base_tts.py
│ │ ├── delightful_tts.py
│ │ ├── forward_tts.py
│ │ ├── glow_tts.py
│ │ ├── neuralhmm_tts.py
│ │ ├── overflow.py
│ │ ├── tacotron.py
│ │ ├── tacotron2.py
│ │ ├── tortoise.py
│ │ ├── vits.py
│ │ └── xtts.py
│ └── utils
│ │ ├── __init__.py
│ │ ├── assets
│ │ │ └── tortoise
│ │ │ │ └── tokenizer.json
│ │ ├── data.py
│ │ ├── fairseq.py
│ │ ├── helpers.py
│ │ ├── languages.py
│ │ ├── managers.py
│ │ ├── measures.py
│ │ ├── monotonic_align
│ │ │ ├── __init__.py
│ │ │ ├── core.pyx
│ │ │ └── setup.py
│ │ ├── speakers.py
│ │ ├── ssim.py
│ │ ├── synthesis.py
│ │ ├── text
│ │ │ ├── __init__.py
│ │ │ ├── bangla
│ │ │ │ ├── __init__.py
│ │ │ │ └── phonemizer.py
│ │ │ ├── belarusian
│ │ │ │ ├── __init__.py
│ │ │ │ └── phonemizer.py
│ │ │ ├── characters.py
│ │ │ ├── chinese_mandarin
│ │ │ │ ├── __init__.py
│ │ │ │ ├── numbers.py
│ │ │ │ ├── phonemizer.py
│ │ │ │ └── pinyinToPhonemes.py
│ │ │ ├── cleaners.py
│ │ │ ├── cmudict.py
│ │ │ ├── english
│ │ │ │ ├── __init__.py
│ │ │ │ ├── abbreviations.py
│ │ │ │ ├── number_norm.py
│ │ │ │ └── time_norm.py
│ │ │ ├── french
│ │ │ │ ├── __init__.py
│ │ │ │ └── abbreviations.py
│ │ │ ├── japanese
│ │ │ │ ├── __init__.py
│ │ │ │ └── phonemizer.py
│ │ │ ├── korean
│ │ │ │ ├── __init__.py
│ │ │ │ ├── ko_dictionary.py
│ │ │ │ ├── korean.py
│ │ │ │ └── phonemizer.py
│ │ │ ├── phonemizers
│ │ │ │ ├── __init__.py
│ │ │ │ ├── bangla_phonemizer.py
│ │ │ │ ├── base.py
│ │ │ │ ├── belarusian_phonemizer.py
│ │ │ │ ├── espeak_wrapper.py
│ │ │ │ ├── gruut_wrapper.py
│ │ │ │ ├── ja_jp_phonemizer.py
│ │ │ │ ├── ko_kr_phonemizer.py
│ │ │ │ ├── multi_phonemizer.py
│ │ │ │ └── zh_cn_phonemizer.py
│ │ │ ├── punctuation.py
│ │ │ └── tokenizer.py
│ │ └── visual.py
├── utils
│ ├── __init__.py
│ ├── audio
│ │ ├── __init__.py
│ │ ├── numpy_transforms.py
│ │ ├── processor.py
│ │ └── torch_transforms.py
│ ├── callbacks.py
│ ├── capacitron_optimizer.py
│ ├── distribute.py
│ ├── download.py
│ ├── downloaders.py
│ ├── generic_utils.py
│ ├── io.py
│ ├── manage.py
│ ├── radam.py
│ ├── samplers.py
│ ├── synthesizer.py
│ ├── training.py
│ └── vad.py
├── vc
│ ├── configs
│ │ ├── __init__.py
│ │ ├── freevc_config.py
│ │ └── shared_configs.py
│ ├── models
│ │ ├── __init__.py
│ │ ├── base_vc.py
│ │ └── freevc.py
│ └── modules
│ │ ├── __init__.py
│ │ └── freevc
│ │ │ ├── __init__.py
│ │ │ ├── commons.py
│ │ │ ├── mel_processing.py
│ │ │ ├── modules.py
│ │ │ ├── speaker_encoder
│ │ │ │ ├── __init__.py
│ │ │ │ ├── audio.py
│ │ │ │ ├── hparams.py
│ │ │ │ └── speaker_encoder.py
│ │ │ └── wavlm
│ │ │ │ ├── __init__.py
│ │ │ │ ├── config.json
│ │ │ │ ├── modules.py
│ │ │ │ └── wavlm.py
└── vocoder
│ ├── README.md
│ ├── __init__.py
│ ├── configs
│ │ ├── __init__.py
│ │ ├── fullband_melgan_config.py
│ │ ├── hifigan_config.py
│ │ ├── melgan_config.py
│ │ ├── multiband_melgan_config.py
│ │ ├── parallel_wavegan_config.py
│ │ ├── shared_configs.py
│ │ ├── univnet_config.py
│ │ ├── wavegrad_config.py
│ │ └── wavernn_config.py
│ ├── datasets
│ │ ├── __init__.py
│ │ ├── gan_dataset.py
│ │ ├── preprocess.py
│ │ ├── wavegrad_dataset.py
│ │ └── wavernn_dataset.py
│ ├── layers
│ │ ├── __init__.py
│ │ ├── hifigan.py
│ │ ├── losses.py
│ │ ├── lvc_block.py
│ │ ├── melgan.py
│ │ ├── parallel_wavegan.py
│ │ ├── pqmf.py
│ │ ├── qmf.dat
│ │ ├── upsample.py
│ │ └── wavegrad.py
│ ├── models
│ │ ├── __init__.py
│ │ ├── base_vocoder.py
│ │ ├── fullband_melgan_generator.py
│ │ ├── gan.py
│ │ ├── hifigan_discriminator.py
│ │ ├── hifigan_generator.py
│ │ ├── melgan_discriminator.py
│ │ ├── melgan_generator.py
│ │ ├── melgan_multiscale_discriminator.py
│ │ ├── multiband_melgan_generator.py
│ │ ├── parallel_wavegan_discriminator.py
│ │ ├── parallel_wavegan_generator.py
│ │ ├── random_window_discriminator.py
│ │ ├── univnet_discriminator.py
│ │ ├── univnet_generator.py
│ │ ├── wavegrad.py
│ │ └── wavernn.py
│ ├── pqmf_output.wav
│ └── utils
│ │ ├── __init__.py
│ │ ├── distribution.py
│ │ └── generic_utils.py
├── app.py
├── config.txt
├── rename_tool.py
├── requirements.txt
├── source
│ ├── asset
│ │ └── 1.png
│ └── model_v2
│ │ ├── config.json
│ │ ├── hash.md5
│ │ └── vocab.json
├── tts_v2.py
└── tts_v2_api.py
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pth
2 | *.onnx
3 | *.pyc
4 | .idea/
5 | venv/
6 | outpou/
7 | .gitignore.rej
8 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # XTTS_v2
2 |
3 | ## Description
4 |
5 | A simple voice-cloning GUI tool based on coqui-ai TTS (XTTS v2). It supports multiple languages and clones a target voice's timbre without any additional training.
6 |
7 |
8 | ## Example
9 |
10 |
11 |
12 | ## Local model download
13 | [Hugging Face link](https://huggingface.co/coqui/XTTS-v2/tree/main)
14 |
15 | ## Code adapted from:
16 |
17 | [coqui-ai/TTS](https://github.com/coqui-ai/TTS)
--------------------------------------------------------------------------------
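The README above describes a thin GUI around the Coqui TTS Python API bundled in this repo. As a rough illustration of the underlying call (not the repo's exact `app.py` logic), here is a minimal sketch of zero-shot cloning with a locally downloaded XTTS-v2 model; the model folder, reference clip, and output path are placeholders:

```python
# Minimal sketch of zero-shot voice cloning with a local XTTS-v2 checkpoint.
# Paths are placeholders: download model.pth / config.json / vocab.json from the
# Hugging Face link above into a local folder first.
from TTS.api import TTS

tts = TTS(model_path="model_v2/", config_path="model_v2/config.json")

tts.tts_to_file(
    text="Hello, this is a cloned voice.",
    speaker_wav="reference_speaker.wav",  # a few seconds of the target voice
    language="en",
    file_path="output.wav",
)
```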
/TTS/VERSION:
--------------------------------------------------------------------------------
1 | 0.21.3
2 |
--------------------------------------------------------------------------------
/TTS/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | with open(os.path.join(os.path.dirname(__file__), "VERSION"), "r", encoding="utf-8") as f:
4 | version = f.read().strip()
5 |
6 | __version__ = version
7 |
--------------------------------------------------------------------------------
/TTS/bin/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/douhaohaode/xtts_v2/782de23d4ee58d25d1b1208c27336d6f665b6872/TTS/bin/__init__.py
--------------------------------------------------------------------------------
/TTS/bin/collect_env_info.py:
--------------------------------------------------------------------------------
1 | """Get detailed info about the working environment."""
2 | import os
3 | import platform
4 | import sys
5 |
6 | import numpy
7 | import torch
8 |
9 | sys.path += [os.path.abspath(".."), os.path.abspath(".")]
10 | import json
11 |
12 | import TTS
13 |
14 |
15 | def system_info():
16 | return {
17 | "OS": platform.system(),
18 | "architecture": platform.architecture(),
19 | "version": platform.version(),
20 | "processor": platform.processor(),
21 | "python": platform.python_version(),
22 | }
23 |
24 |
25 | def cuda_info():
26 | return {
27 | "GPU": [torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())],
28 | "available": torch.cuda.is_available(),
29 | "version": torch.version.cuda,
30 | }
31 |
32 |
33 | def package_info():
34 | return {
35 | "numpy": numpy.__version__,
36 | "PyTorch_version": torch.__version__,
37 | "PyTorch_debug": torch.version.debug,
38 | "TTS": TTS.__version__,
39 | }
40 |
41 |
42 | def main():
43 | details = {"System": system_info(), "CUDA": cuda_info(), "Packages": package_info()}
44 | print(json.dumps(details, indent=4, sort_keys=True))
45 |
46 |
47 | if __name__ == "__main__":
48 | main()
49 |
--------------------------------------------------------------------------------
/TTS/bin/compute_statistics.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import argparse
5 | import glob
6 | import os
7 |
8 | import numpy as np
9 | from tqdm import tqdm
10 |
11 | # from TTS.utils.io import load_config
12 | from TTS.config import load_config
13 | from TTS.tts.datasets import load_tts_samples
14 | from TTS.utils.audio import AudioProcessor
15 |
16 |
17 | def main():
18 | """Run preprocessing process."""
19 | parser = argparse.ArgumentParser(description="Compute mean and variance of spectrogram features.")
20 | parser.add_argument("config_path", type=str, help="TTS config file path to define audio processing parameters.")
21 | parser.add_argument("out_path", type=str, help="save path (directory and filename).")
22 | parser.add_argument(
23 | "--data_path",
24 | type=str,
25 | required=False,
26 | help="folder including the target set of wavs overriding dataset config.",
27 | )
28 | args, overrides = parser.parse_known_args()
29 |
30 | CONFIG = load_config(args.config_path)
31 | CONFIG.parse_known_args(overrides, relaxed_parser=True)
32 |
33 | # load config
34 | CONFIG.audio.signal_norm = False # do not apply earlier normalization
35 | CONFIG.audio.stats_path = None # discard pre-defined stats
36 |
37 | # load audio processor
38 | ap = AudioProcessor(**CONFIG.audio.to_dict())
39 |
40 | # load the meta data of target dataset
41 | if args.data_path:
42 | dataset_items = glob.glob(os.path.join(args.data_path, "**", "*.wav"), recursive=True)
43 | else:
44 | dataset_items = load_tts_samples(CONFIG.datasets)[0] # take only train data
45 | print(f" > There are {len(dataset_items)} files.")
46 |
47 | mel_sum = 0
48 | mel_square_sum = 0
49 | linear_sum = 0
50 | linear_square_sum = 0
51 | N = 0
52 | for item in tqdm(dataset_items):
53 | # compute features
54 | wav = ap.load_wav(item if isinstance(item, str) else item["audio_file"])
55 | linear = ap.spectrogram(wav)
56 | mel = ap.melspectrogram(wav)
57 |
58 | # compute stats
59 | N += mel.shape[1]
60 | mel_sum += mel.sum(1)
61 | linear_sum += linear.sum(1)
62 | mel_square_sum += (mel**2).sum(axis=1)
63 | linear_square_sum += (linear**2).sum(axis=1)
64 |
65 | mel_mean = mel_sum / N
66 | mel_scale = np.sqrt(mel_square_sum / N - mel_mean**2)
67 | linear_mean = linear_sum / N
68 | linear_scale = np.sqrt(linear_square_sum / N - linear_mean**2)
69 |
70 | output_file_path = args.out_path
71 | stats = {}
72 | stats["mel_mean"] = mel_mean
73 | stats["mel_std"] = mel_scale
74 | stats["linear_mean"] = linear_mean
75 | stats["linear_std"] = linear_scale
76 |
77 | print(f" > Avg mel spec mean: {mel_mean.mean()}")
78 | print(f" > Avg mel spec scale: {mel_scale.mean()}")
79 | print(f" > Avg linear spec mean: {linear_mean.mean()}")
80 | print(f" > Avg linear spec scale: {linear_scale.mean()}")
81 |
82 | # set default config values for mean-var scaling
83 | CONFIG.audio.stats_path = output_file_path
84 | CONFIG.audio.signal_norm = True
85 | # remove redundant values
86 | del CONFIG.audio.max_norm
87 | del CONFIG.audio.min_level_db
88 | del CONFIG.audio.symmetric_norm
89 | del CONFIG.audio.clip_norm
90 | stats["audio_config"] = CONFIG.audio.to_dict()
91 | np.save(output_file_path, stats, allow_pickle=True)
92 | print(f" > stats saved to {output_file_path}")
93 |
94 |
95 | if __name__ == "__main__":
96 | main()
97 |
--------------------------------------------------------------------------------
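For context, the `mel_mean`/`mel_std` (and linear) arrays written by the script above are later used for mean-variance scaling of spectrogram features. A small sketch of what that normalization amounts to, assuming the stats were saved as `scale_stats.npy` and a `(num_mels, T)` mel spectrogram:

```python
# Sketch: mean-variance normalization with the stats file from compute_statistics.py.
import numpy as np

stats = np.load("scale_stats.npy", allow_pickle=True).item()
mel = np.random.rand(80, 200)  # placeholder (num_mels, T) mel spectrogram

# Scale each mel band to zero mean / unit variance ...
mel_norm = (mel - stats["mel_mean"][:, None]) / stats["mel_std"][:, None]
# ... and invert the scaling to recover the original values.
mel_orig = mel_norm * stats["mel_std"][:, None] + stats["mel_mean"][:, None]
```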
/TTS/bin/eval_encoder.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from argparse import RawTextHelpFormatter
3 |
4 | import torch
5 | from tqdm import tqdm
6 |
7 | from TTS.config import load_config
8 | from TTS.tts.datasets import load_tts_samples
9 | from TTS.tts.utils.speakers import SpeakerManager
10 |
11 |
12 | def compute_encoder_accuracy(dataset_items, encoder_manager):
13 | class_name_key = encoder_manager.encoder_config.class_name_key
14 | map_classid_to_classname = getattr(encoder_manager.encoder_config, "map_classid_to_classname", None)
15 |
16 | class_acc_dict = {}
17 |
18 | # compute embeddings for all wav_files
19 | for item in tqdm(dataset_items):
20 | class_name = item[class_name_key]
21 | wav_file = item["audio_file"]
22 |
23 | # extract the embedding
24 | embedd = encoder_manager.compute_embedding_from_clip(wav_file)
25 | if encoder_manager.encoder_criterion is not None and map_classid_to_classname is not None:
26 | embedding = torch.FloatTensor(embedd).unsqueeze(0)
27 | if encoder_manager.use_cuda:
28 | embedding = embedding.cuda()
29 |
30 | class_id = encoder_manager.encoder_criterion.softmax.inference(embedding).item()
31 | predicted_label = map_classid_to_classname[str(class_id)]
32 | else:
33 | predicted_label = None
34 |
35 | if class_name is not None and predicted_label is not None:
36 | is_equal = int(class_name == predicted_label)
37 | if class_name not in class_acc_dict:
38 | class_acc_dict[class_name] = [is_equal]
39 | else:
40 | class_acc_dict[class_name].append(is_equal)
41 | else:
42 | raise RuntimeError("Error: class_name and/or predicted_label are None")
43 |
44 | acc_avg = 0
45 | for key, values in class_acc_dict.items():
46 | acc = sum(values) / len(values)
47 | print("Class", key, "Accuracy:", acc)
48 | acc_avg += acc
49 |
50 | print("Average Accuracy:", acc_avg / len(class_acc_dict))
51 |
52 |
53 | if __name__ == "__main__":
54 | parser = argparse.ArgumentParser(
55 | description="""Compute the accuracy of the encoder.\n\n"""
56 | """
57 | Example runs:
58 | python TTS/bin/eval_encoder.py emotion_encoder_model.pth emotion_encoder_config.json dataset_config.json
59 | """,
60 | formatter_class=RawTextHelpFormatter,
61 | )
62 | parser.add_argument("model_path", type=str, help="Path to model checkpoint file.")
63 | parser.add_argument(
64 | "config_path",
65 | type=str,
66 | help="Path to model config file.",
67 | )
68 |
69 | parser.add_argument(
70 | "config_dataset_path",
71 | type=str,
72 | help="Path to dataset config file.",
73 | )
74 | parser.add_argument("--use_cuda", type=bool, help="flag to set cuda.", default=True)
75 | parser.add_argument("--eval", type=bool, help="compute eval.", default=True)
76 |
77 | args = parser.parse_args()
78 |
79 | c_dataset = load_config(args.config_dataset_path)
80 |
81 | meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=args.eval)
82 | items = meta_data_train + meta_data_eval
83 |
84 | enc_manager = SpeakerManager(
85 | encoder_model_path=args.model_path, encoder_config_path=args.config_path, use_cuda=args.use_cuda
86 | )
87 |
88 | compute_encoder_accuracy(items, enc_manager)
89 |
--------------------------------------------------------------------------------
/TTS/bin/find_unique_chars.py:
--------------------------------------------------------------------------------
1 | """Find all the unique characters in a dataset"""
2 | import argparse
3 | from argparse import RawTextHelpFormatter
4 |
5 | from TTS.config import load_config
6 | from TTS.tts.datasets import load_tts_samples
7 |
8 |
9 | def main():
10 | # pylint: disable=bad-option-value
11 | parser = argparse.ArgumentParser(
12 | description="""Find all the unique characters or phonemes in a dataset.\n\n"""
13 | """
14 | Example runs:
15 |
16 | python TTS/bin/find_unique_chars.py --config_path config.json
17 | """,
18 | formatter_class=RawTextHelpFormatter,
19 | )
20 | parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True)
21 | args = parser.parse_args()
22 |
23 | c = load_config(args.config_path)
24 |
25 | # load all datasets
26 | train_items, eval_items = load_tts_samples(
27 | c.datasets, eval_split=True, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size
28 | )
29 |
30 | items = train_items + eval_items
31 |
32 | texts = "".join(item["text"] for item in items)
33 | chars = set(texts)
34 | lower_chars = filter(lambda c: c.islower(), chars)
35 | chars_force_lower = [c.lower() for c in chars]
36 | chars_force_lower = set(chars_force_lower)
37 |
38 | print(f" > Number of unique characters: {len(chars)}")
39 | print(f" > Unique characters: {''.join(sorted(chars))}")
40 | print(f" > Unique lower characters: {''.join(sorted(lower_chars))}")
41 | print(f" > Unique all forced to lower characters: {''.join(sorted(chars_force_lower))}")
42 |
43 |
44 | if __name__ == "__main__":
45 | main()
46 |
--------------------------------------------------------------------------------
/TTS/bin/find_unique_phonemes.py:
--------------------------------------------------------------------------------
1 | """Find all the unique characters in a dataset"""
2 | import argparse
3 | import multiprocessing
4 | from argparse import RawTextHelpFormatter
5 |
6 | from tqdm.contrib.concurrent import process_map
7 |
8 | from TTS.config import load_config
9 | from TTS.tts.datasets import load_tts_samples
10 | from TTS.tts.utils.text.phonemizers import Gruut
11 |
12 |
13 | def compute_phonemes(item):
14 | text = item["text"]
15 | ph = phonemizer.phonemize(text).replace("|", "")
16 | return set(list(ph))
17 |
18 |
19 | def main():
20 | # pylint: disable=W0601
21 | global c, phonemizer
22 | # pylint: disable=bad-option-value
23 | parser = argparse.ArgumentParser(
24 | description="""Find all the unique characters or phonemes in a dataset.\n\n"""
25 | """
26 | Example runs:
27 |
28 | python TTS/bin/find_unique_phonemes.py --config_path config.json
29 | """,
30 | formatter_class=RawTextHelpFormatter,
31 | )
32 | parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True)
33 | args = parser.parse_args()
34 |
35 | c = load_config(args.config_path)
36 |
37 | # load all datasets
38 | train_items, eval_items = load_tts_samples(
39 | c.datasets, eval_split=True, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size
40 | )
41 | items = train_items + eval_items
42 | print("Num items:", len(items))
43 |
44 | language_list = [item["language"] for item in items]
45 | is_lang_def = all(language_list)
46 |
47 | if not c.phoneme_language or not is_lang_def:
48 | raise ValueError("Phoneme language must be defined in config.")
49 |
50 | if not language_list.count(language_list[0]) == len(language_list):
51 | raise ValueError(
52 | "Currently, just one phoneme language per config file is supported !! Please split the dataset config into different configs and run it individually for each language !!"
53 | )
54 |
55 | phonemizer = Gruut(language=language_list[0], keep_puncs=True)
56 |
57 | phonemes = process_map(compute_phonemes, items, max_workers=multiprocessing.cpu_count(), chunksize=15)
58 | phones = []
59 | for ph in phonemes:
60 | phones.extend(ph)
61 |
62 | phones = set(phones)
63 | lower_phones = filter(lambda c: c.islower(), phones)
64 | phones_force_lower = [c.lower() for c in phones]
65 | phones_force_lower = set(phones_force_lower)
66 |
67 | print(f" > Number of unique phonemes: {len(phones)}")
68 | print(f" > Unique phonemes: {''.join(sorted(phones))}")
69 | print(f" > Unique lower phonemes: {''.join(sorted(lower_phones))}")
70 | print(f" > Unique all forced to lower phonemes: {''.join(sorted(phones_force_lower))}")
71 |
72 |
73 | if __name__ == "__main__":
74 | main()
75 |
--------------------------------------------------------------------------------
/TTS/bin/resample.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import glob
3 | import os
4 | from argparse import RawTextHelpFormatter
5 | from multiprocessing import Pool
6 | from shutil import copytree
7 |
8 | import librosa
9 | import soundfile as sf
10 | from tqdm import tqdm
11 |
12 |
13 | def resample_file(func_args):
14 | filename, output_sr = func_args
15 | y, sr = librosa.load(filename, sr=output_sr)
16 | sf.write(filename, y, sr)
17 |
18 |
19 | def resample_files(input_dir, output_sr, output_dir=None, file_ext="wav", n_jobs=10):
20 | if output_dir:
21 | print("Recursively copying the input folder...")
22 | copytree(input_dir, output_dir)
23 | input_dir = output_dir
24 |
25 | print("Resampling the audio files...")
26 | audio_files = glob.glob(os.path.join(input_dir, f"**/*.{file_ext}"), recursive=True)
27 | print(f"Found {len(audio_files)} files...")
28 | audio_files = list(zip(audio_files, len(audio_files) * [output_sr]))
29 | with Pool(processes=n_jobs) as p:
30 | with tqdm(total=len(audio_files)) as pbar:
31 | for _, _ in enumerate(p.imap_unordered(resample_file, audio_files)):
32 | pbar.update()
33 |
34 | print("Done !")
35 |
36 |
37 | if __name__ == "__main__":
38 | parser = argparse.ArgumentParser(
39 | description="""Resample a folder recusively with librosa
40 | Can be used in place or create a copy of the folder as an output.\n\n
41 | Example run:
42 | python TTS/bin/resample.py
43 | --input_dir /root/LJSpeech-1.1/
44 | --output_sr 22050
45 | --output_dir /root/resampled_LJSpeech-1.1/
46 | --file_ext wav
47 | --n_jobs 24
48 | """,
49 | formatter_class=RawTextHelpFormatter,
50 | )
51 |
52 | parser.add_argument(
53 | "--input_dir",
54 | type=str,
55 | default=None,
56 | required=True,
57 | help="Path of the folder containing the audio files to resample",
58 | )
59 |
60 | parser.add_argument(
61 | "--output_sr",
62 | type=int,
63 | default=22050,
64 | required=False,
65 | help="Samlple rate to which the audio files should be resampled",
66 | )
67 |
68 | parser.add_argument(
69 | "--output_dir",
70 | type=str,
71 | default=None,
72 | required=False,
73 | help="Path of the destination folder. If not defined, the operation is done in place",
74 | )
75 |
76 | parser.add_argument(
77 | "--file_ext",
78 | type=str,
79 | default="wav",
80 | required=False,
81 | help="Extension of the audio files to resample",
82 | )
83 |
84 | parser.add_argument(
85 | "--n_jobs", type=int, default=None, help="Number of threads to use, by default it uses all cores"
86 | )
87 |
88 | args = parser.parse_args()
89 |
90 | resample_files(args.input_dir, args.output_sr, args.output_dir, args.file_ext, args.n_jobs)
91 |
--------------------------------------------------------------------------------
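Besides the CLI shown in the argparse description, the same helper can be called from Python. A short sketch with placeholder paths:

```python
# Sketch: driving the resampling helper from Python instead of the command line.
from TTS.bin.resample import resample_files

# Copy the dataset to a new folder and resample every .wav in it to 22050 Hz.
resample_files(
    input_dir="/data/LJSpeech-1.1/",        # placeholder input folder
    output_sr=22050,
    output_dir="/data/LJSpeech-1.1-22k/",   # omit to resample in place
    file_ext="wav",
    n_jobs=8,
)
```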
/TTS/bin/train_tts.py:
--------------------------------------------------------------------------------
1 | import os
2 | from dataclasses import dataclass, field
3 |
4 | from trainer import Trainer, TrainerArgs
5 |
6 | from TTS.config import load_config, register_config
7 | from TTS.tts.datasets import load_tts_samples
8 | from TTS.tts.models import setup_model
9 |
10 |
11 | @dataclass
12 | class TrainTTSArgs(TrainerArgs):
13 | config_path: str = field(default=None, metadata={"help": "Path to the config file."})
14 |
15 |
16 | def main():
17 | """Run `tts` model training directly by a `config.json` file."""
18 | # init trainer args
19 | train_args = TrainTTSArgs()
20 | parser = train_args.init_argparse(arg_prefix="")
21 |
22 | # override trainer args from command-line args
23 | args, config_overrides = parser.parse_known_args()
24 | train_args.parse_args(args)
25 |
26 | # load config.json and register
27 | if args.config_path or args.continue_path:
28 | if args.config_path:
29 | # init from a file
30 | config = load_config(args.config_path)
31 | if len(config_overrides) > 0:
32 | config.parse_known_args(config_overrides, relaxed_parser=True)
33 | elif args.continue_path:
34 | # continue from a prev experiment
35 | config = load_config(os.path.join(args.continue_path, "config.json"))
36 | if len(config_overrides) > 0:
37 | config.parse_known_args(config_overrides, relaxed_parser=True)
38 | else:
39 | # init from console args
40 | from TTS.config.shared_configs import BaseTrainingConfig # pylint: disable=import-outside-toplevel
41 |
42 | config_base = BaseTrainingConfig()
43 | config_base.parse_known_args(config_overrides)
44 | config = register_config(config_base.model)()
45 |
46 | # load training samples
47 | train_samples, eval_samples = load_tts_samples(
48 | config.datasets,
49 | eval_split=True,
50 | eval_split_max_size=config.eval_split_max_size,
51 | eval_split_size=config.eval_split_size,
52 | )
53 |
54 | # init the model from config
55 | model = setup_model(config, train_samples + eval_samples)
56 |
57 | # init the trainer and 🚀
58 | trainer = Trainer(
59 | train_args,
60 | model.config,
61 | config.output_path,
62 | model=model,
63 | train_samples=train_samples,
64 | eval_samples=eval_samples,
65 | parse_command_line_args=False,
66 | )
67 | trainer.fit()
68 |
69 |
70 | if __name__ == "__main__":
71 | main()
72 |
--------------------------------------------------------------------------------
/TTS/bin/train_vocoder.py:
--------------------------------------------------------------------------------
1 | import os
2 | from dataclasses import dataclass, field
3 |
4 | from trainer import Trainer, TrainerArgs
5 |
6 | from TTS.config import load_config, register_config
7 | from TTS.utils.audio import AudioProcessor
8 | from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data
9 | from TTS.vocoder.models import setup_model
10 |
11 |
12 | @dataclass
13 | class TrainVocoderArgs(TrainerArgs):
14 | config_path: str = field(default=None, metadata={"help": "Path to the config file."})
15 |
16 |
17 | def main():
18 | """Run `tts` model training directly by a `config.json` file."""
19 | # init trainer args
20 | train_args = TrainVocoderArgs()
21 | parser = train_args.init_argparse(arg_prefix="")
22 |
23 | # override trainer args from command-line args
24 | args, config_overrides = parser.parse_known_args()
25 | train_args.parse_args(args)
26 |
27 | # load config.json and register
28 | if args.config_path or args.continue_path:
29 | if args.config_path:
30 | # init from a file
31 | config = load_config(args.config_path)
32 | if len(config_overrides) > 0:
33 | config.parse_known_args(config_overrides, relaxed_parser=True)
34 | elif args.continue_path:
35 | # continue from a prev experiment
36 | config = load_config(os.path.join(args.continue_path, "config.json"))
37 | if len(config_overrides) > 0:
38 | config.parse_known_args(config_overrides, relaxed_parser=True)
39 | else:
40 | # init from console args
41 | from TTS.config.shared_configs import BaseTrainingConfig # pylint: disable=import-outside-toplevel
42 |
43 | config_base = BaseTrainingConfig()
44 | config_base.parse_known_args(config_overrides)
45 | config = register_config(config_base.model)()
46 |
47 | # load training samples
48 | if "feature_path" in config and config.feature_path:
49 | # load pre-computed features
50 | print(f" > Loading features from: {config.feature_path}")
51 | eval_samples, train_samples = load_wav_feat_data(config.data_path, config.feature_path, config.eval_split_size)
52 | else:
53 | # load data raw wav files
54 | eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)
55 |
56 | # setup audio processor
57 | ap = AudioProcessor(**config.audio)
58 |
59 | # init the model from config
60 | model = setup_model(config)
61 |
62 | # init the trainer and 🚀
63 | trainer = Trainer(
64 | train_args,
65 | config,
66 | config.output_path,
67 | model=model,
68 | train_samples=train_samples,
69 | eval_samples=eval_samples,
70 | training_assets={"audio_processor": ap},
71 | parse_command_line_args=False,
72 | )
73 | trainer.fit()
74 |
75 |
76 | if __name__ == "__main__":
77 | main()
78 |
--------------------------------------------------------------------------------
/TTS/demos/xtts_ft_demo/requirements.txt:
--------------------------------------------------------------------------------
1 | faster_whisper==0.9.0
2 | gradio==4.7.1
--------------------------------------------------------------------------------
/TTS/encoder/README.md:
--------------------------------------------------------------------------------
1 | ### Speaker Encoder
2 |
3 | This is an implementation of https://arxiv.org/abs/1710.10467. This model can be used for voice and speaker embedding.
4 |
5 | With the code here you can generate d-vectors for both multi-speaker and single-speaker TTS datasets, then visualise and explore them along with the associated audio files in an interactive chart.
6 |
7 | Below is an example showing embedding results of various speakers. You can generate the same plot with the provided notebook as demonstrated in [this video](https://youtu.be/KW3oO7JVa7Q).
8 |
9 | 
10 |
11 | Download a pretrained model from [Released Models](https://github.com/mozilla/TTS/wiki/Released-Models) page.
12 |
13 | To run the code, you need to follow the same flow as in TTS.
14 |
15 | - Define 'config.json' for your needs. Note that audio parameters should match those of your TTS model.
16 | - Example training call ```python speaker_encoder/train.py --config_path speaker_encoder/config.json --data_path ~/Data/Libri-TTS/train-clean-360```
17 | - Generate embedding vectors ```python speaker_encoder/compute_embeddings.py --use_cuda true /model/path/best_model.pth model/config/path/config.json dataset/path/ output_path``` . This code parses all .wav files at the given dataset path and generates the same folder structure under the output path with the generated embedding files.
18 | - Watch training on Tensorboard as in TTS
19 |
--------------------------------------------------------------------------------
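To make the d-vector workflow from the README concrete, here is a minimal sketch of computing an embedding for a single clip with a trained encoder checkpoint, using the same `SpeakerManager` API that `TTS/bin/eval_encoder.py` relies on; the checkpoint, config, and wav paths are placeholders:

```python
# Sketch: computing a d-vector for one clip with a trained speaker encoder.
from TTS.tts.utils.speakers import SpeakerManager

manager = SpeakerManager(
    encoder_model_path="best_model.pth",  # placeholder checkpoint path
    encoder_config_path="config.json",    # placeholder encoder config
    use_cuda=False,
)
embedding = manager.compute_embedding_from_clip("speaker_sample.wav")
print(len(embedding))  # embedding size, e.g. 256 for the default LSTM encoder
```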
/TTS/encoder/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/douhaohaode/xtts_v2/782de23d4ee58d25d1b1208c27336d6f665b6872/TTS/encoder/__init__.py
--------------------------------------------------------------------------------
/TTS/encoder/configs/base_encoder_config.py:
--------------------------------------------------------------------------------
1 | from dataclasses import asdict, dataclass, field
2 | from typing import Dict, List
3 |
4 | from coqpit import MISSING
5 |
6 | from TTS.config.shared_configs import BaseAudioConfig, BaseDatasetConfig, BaseTrainingConfig
7 |
8 |
9 | @dataclass
10 | class BaseEncoderConfig(BaseTrainingConfig):
11 | """Defines parameters for a Generic Encoder model."""
12 |
13 | model: str = None
14 | audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
15 | datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()])
16 | # model params
17 | model_params: Dict = field(
18 | default_factory=lambda: {
19 | "model_name": "lstm",
20 | "input_dim": 80,
21 | "proj_dim": 256,
22 | "lstm_dim": 768,
23 | "num_lstm_layers": 3,
24 | "use_lstm_with_projection": True,
25 | }
26 | )
27 |
28 | audio_augmentation: Dict = field(default_factory=lambda: {})
29 |
30 | # training params
31 | epochs: int = 10000
32 | loss: str = "angleproto"
33 | grad_clip: float = 3.0
34 | lr: float = 0.0001
35 | optimizer: str = "radam"
36 | optimizer_params: Dict = field(default_factory=lambda: {"betas": [0.9, 0.999], "weight_decay": 0})
37 | lr_decay: bool = False
38 | warmup_steps: int = 4000
39 |
40 | # logging params
41 | tb_model_param_stats: bool = False
42 | steps_plot_stats: int = 10
43 | save_step: int = 1000
44 | print_step: int = 20
45 | run_eval: bool = False
46 |
47 | # data loader
48 | num_classes_in_batch: int = MISSING
49 | num_utter_per_class: int = MISSING
50 | eval_num_classes_in_batch: int = None
51 | eval_num_utter_per_class: int = None
52 |
53 | num_loader_workers: int = MISSING
54 | voice_len: float = 1.6
55 |
56 | def check_values(self):
57 | super().check_values()
58 | c = asdict(self)
59 | assert (
60 | c["model_params"]["input_dim"] == self.audio.num_mels
61 | ), " [!] model input dimendion must be equal to melspectrogram dimension."
62 |
--------------------------------------------------------------------------------
/TTS/encoder/configs/emotion_encoder_config.py:
--------------------------------------------------------------------------------
1 | from dataclasses import asdict, dataclass
2 |
3 | from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig
4 |
5 |
6 | @dataclass
7 | class EmotionEncoderConfig(BaseEncoderConfig):
8 | """Defines parameters for Emotion Encoder model."""
9 |
10 | model: str = "emotion_encoder"
11 | map_classid_to_classname: dict = None
12 | class_name_key: str = "emotion_name"
13 |
--------------------------------------------------------------------------------
/TTS/encoder/configs/speaker_encoder_config.py:
--------------------------------------------------------------------------------
1 | from dataclasses import asdict, dataclass
2 |
3 | from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig
4 |
5 |
6 | @dataclass
7 | class SpeakerEncoderConfig(BaseEncoderConfig):
8 | """Defines parameters for Speaker Encoder model."""
9 |
10 | model: str = "speaker_encoder"
11 | class_name_key: str = "speaker_name"
12 |
--------------------------------------------------------------------------------
/TTS/encoder/models/lstm.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 |
4 | from TTS.encoder.models.base_encoder import BaseEncoder
5 |
6 |
7 | class LSTMWithProjection(nn.Module):
8 | def __init__(self, input_size, hidden_size, proj_size):
9 | super().__init__()
10 | self.input_size = input_size
11 | self.hidden_size = hidden_size
12 | self.proj_size = proj_size
13 | self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
14 | self.linear = nn.Linear(hidden_size, proj_size, bias=False)
15 |
16 | def forward(self, x):
17 | self.lstm.flatten_parameters()
18 | o, (_, _) = self.lstm(x)
19 | return self.linear(o)
20 |
21 |
22 | class LSTMWithoutProjection(nn.Module):
23 | def __init__(self, input_dim, lstm_dim, proj_dim, num_lstm_layers):
24 | super().__init__()
25 | self.lstm = nn.LSTM(input_size=input_dim, hidden_size=lstm_dim, num_layers=num_lstm_layers, batch_first=True)
26 | self.linear = nn.Linear(lstm_dim, proj_dim, bias=True)
27 | self.relu = nn.ReLU()
28 |
29 | def forward(self, x):
30 | _, (hidden, _) = self.lstm(x)
31 | return self.relu(self.linear(hidden[-1]))
32 |
33 |
34 | class LSTMSpeakerEncoder(BaseEncoder):
35 | def __init__(
36 | self,
37 | input_dim,
38 | proj_dim=256,
39 | lstm_dim=768,
40 | num_lstm_layers=3,
41 | use_lstm_with_projection=True,
42 | use_torch_spec=False,
43 | audio_config=None,
44 | ):
45 | super().__init__()
46 | self.use_lstm_with_projection = use_lstm_with_projection
47 | self.use_torch_spec = use_torch_spec
48 | self.audio_config = audio_config
49 | self.proj_dim = proj_dim
50 |
51 | layers = []
52 | # choose the LSTM layer type
53 | if use_lstm_with_projection:
54 | layers.append(LSTMWithProjection(input_dim, lstm_dim, proj_dim))
55 | for _ in range(num_lstm_layers - 1):
56 | layers.append(LSTMWithProjection(proj_dim, lstm_dim, proj_dim))
57 | self.layers = nn.Sequential(*layers)
58 | else:
59 | self.layers = LSTMWithoutProjection(input_dim, lstm_dim, proj_dim, num_lstm_layers)
60 |
61 | self.instancenorm = nn.InstanceNorm1d(input_dim)
62 |
63 | if self.use_torch_spec:
64 | self.torch_spec = self.get_torch_mel_spectrogram_class(audio_config)
65 | else:
66 | self.torch_spec = None
67 |
68 | self._init_layers()
69 |
70 | def _init_layers(self):
71 | for name, param in self.layers.named_parameters():
72 | if "bias" in name:
73 | nn.init.constant_(param, 0.0)
74 | elif "weight" in name:
75 | nn.init.xavier_normal_(param)
76 |
77 | def forward(self, x, l2_norm=True):
78 | """Forward pass of the model.
79 |
80 | Args:
81 | x (Tensor): Raw waveform signal or spectrogram frames. If input is a waveform, `torch_spec` must be `True`
82 | to compute the spectrogram on-the-fly.
83 | l2_norm (bool): Whether to L2-normalize the outputs.
84 |
85 | Shapes:
86 | - x: :math:`(N, 1, T_{in})` or :math:`(N, D_{spec}, T_{in})`
87 | """
88 | with torch.no_grad():
89 | with torch.cuda.amp.autocast(enabled=False):
90 | if self.use_torch_spec:
91 | x.squeeze_(1)
92 | x = self.torch_spec(x)
93 | x = self.instancenorm(x).transpose(1, 2)
94 | d = self.layers(x)
95 | if self.use_lstm_with_projection:
96 | d = d[:, -1]
97 | if l2_norm:
98 | d = torch.nn.functional.normalize(d, p=2, dim=1)
99 | return d
100 |
--------------------------------------------------------------------------------
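As a quick shape check for the encoder above, a small sketch of running `LSTMSpeakerEncoder` on a batch of pre-computed mel spectrograms (i.e. with `use_torch_spec=False`, the default):

```python
# Sketch: expected tensor shapes for LSTMSpeakerEncoder with spectrogram input.
import torch

from TTS.encoder.models.lstm import LSTMSpeakerEncoder

model = LSTMSpeakerEncoder(input_dim=80, proj_dim=256, lstm_dim=768, num_lstm_layers=3)
mel = torch.rand(4, 80, 120)  # (batch, num_mels, frames)
d_vectors = model(mel, l2_norm=True)
print(d_vectors.shape)  # torch.Size([4, 256]), one L2-normalized d-vector per utterance
```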
/TTS/encoder/requirements.txt:
--------------------------------------------------------------------------------
1 | umap-learn
2 | numpy>=1.17.0
3 |
--------------------------------------------------------------------------------
/TTS/encoder/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/douhaohaode/xtts_v2/782de23d4ee58d25d1b1208c27336d6f665b6872/TTS/encoder/utils/__init__.py
--------------------------------------------------------------------------------
/TTS/encoder/utils/visual.py:
--------------------------------------------------------------------------------
1 | import matplotlib
2 | import matplotlib.pyplot as plt
3 | import numpy as np
4 | import umap
5 |
6 | matplotlib.use("Agg")
7 |
8 |
9 | colormap = (
10 | np.array(
11 | [
12 | [76, 255, 0],
13 | [0, 127, 70],
14 | [255, 0, 0],
15 | [255, 217, 38],
16 | [0, 135, 255],
17 | [165, 0, 165],
18 | [255, 167, 255],
19 | [0, 255, 255],
20 | [255, 96, 38],
21 | [142, 76, 0],
22 | [33, 0, 127],
23 | [0, 0, 0],
24 | [183, 183, 183],
25 | ],
26 | dtype=float,
27 | )
28 | / 255
29 | )
30 |
31 |
32 | def plot_embeddings(embeddings, num_classes_in_batch):
33 | num_utter_per_class = embeddings.shape[0] // num_classes_in_batch
34 |
35 | # if necessary get just the first 10 classes
36 | if num_classes_in_batch > 10:
37 | num_classes_in_batch = 10
38 | embeddings = embeddings[: num_classes_in_batch * num_utter_per_class]
39 |
40 | model = umap.UMAP()
41 | projection = model.fit_transform(embeddings)
42 | ground_truth = np.repeat(np.arange(num_classes_in_batch), num_utter_per_class)
43 | colors = [colormap[i] for i in ground_truth]
44 | fig, ax = plt.subplots(figsize=(16, 10))
45 | _ = ax.scatter(projection[:, 0], projection[:, 1], c=colors)
46 | plt.gca().set_aspect("equal", "datalim")
47 | plt.title("UMAP projection")
48 | plt.tight_layout()
49 | plt.savefig("umap")
50 | return fig
51 |
--------------------------------------------------------------------------------
/TTS/model.py:
--------------------------------------------------------------------------------
1 | from abc import abstractmethod
2 | from typing import Dict
3 |
4 | import torch
5 | from coqpit import Coqpit
6 | from trainer import TrainerModel
7 |
8 | # pylint: skip-file
9 |
10 |
11 | class BaseTrainerModel(TrainerModel):
12 | """BaseTrainerModel model expanding TrainerModel with required functions by 🐸TTS.
13 |
14 | Every new 🐸TTS model must inherit it.
15 | """
16 |
17 | @staticmethod
18 | @abstractmethod
19 | def init_from_config(config: Coqpit):
20 | """Init the model and all its attributes from the given config.
21 |
22 | Override this depending on your model.
23 | """
24 | ...
25 |
26 | @abstractmethod
27 | def inference(self, input: torch.Tensor, aux_input={}) -> Dict:
28 | """Forward pass for inference.
29 |
30 | It must return a dictionary with the main model output and all the auxiliary outputs. The key ```model_outputs```
31 | is considered to be the main output and you can add any other auxiliary outputs as you want.
32 |
33 | We don't use `*kwargs` since it is problematic with the TorchScript API.
34 |
35 | Args:
36 | input (torch.Tensor): [description]
37 | aux_input (Dict): Auxiliary inputs like speaker embeddings, durations etc.
38 |
39 | Returns:
40 | Dict: [description]
41 | """
42 | outputs_dict = {"model_outputs": None}
43 | ...
44 | return outputs_dict
45 |
46 | @abstractmethod
47 | def load_checkpoint(
48 | self, config: Coqpit, checkpoint_path: str, eval: bool = False, strict: bool = True, cache=False
49 | ) -> None:
50 | """Load a model checkpoint gile and get ready for training or inference.
51 |
52 | Args:
53 | config (Coqpit): Model configuration.
54 | checkpoint_path (str): Path to the model checkpoint file.
55 | eval (bool, optional): If true, init model for inference else for training. Defaults to False.
56 | strict (bool, optional): Match all checkpoint keys to model's keys. Defaults to True.
57 | cache (bool, optional): If True, cache the file locally for subsequent calls. It is cached under `get_user_data_dir()/tts_cache`. Defaults to False.
58 | """
59 | ...
60 |
--------------------------------------------------------------------------------
/TTS/server/README.md:
--------------------------------------------------------------------------------
1 | # :frog: TTS demo server
2 | Before you use the server, make sure you [install](https://github.com/coqui-ai/TTS/tree/dev#install-tts) :frog: TTS properly. Then, you can follow the steps below.
3 |
4 | **Note:** If you install :frog:TTS using ```pip```, you can also use the ```tts-server``` end point on the terminal.
5 |
6 | Example runs:
7 |
8 | List officially released models.
9 | ```python TTS/server/server.py --list_models ```
10 |
11 | Run the server with the official models.
12 | ```python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/multiband-melgan```
13 |
14 | Run the server with the official models on a GPU.
15 | ```CUDA_VISIBLE_DEVICES="0" python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/multiband-melgan --use_cuda True```
16 |
17 | Run the server with custom models.
18 | ```python TTS/server/server.py --tts_checkpoint /path/to/tts/model.pth --tts_config /path/to/tts/config.json --vocoder_checkpoint /path/to/vocoder/model.pth --vocoder_config /path/to/vocoder/config.json```
19 |
--------------------------------------------------------------------------------
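Once the server is running, clients can request synthesis over plain HTTP. A rough sketch, assuming the default `/api/tts` endpoint and the port configured in `conf.json` (5002); check `server.py` for the exact parameters your setup exposes:

```python
# Sketch: querying the demo server over HTTP (endpoint and params assumed from the
# default server setup; adjust to match server.py if they differ).
import requests

resp = requests.get(
    "http://localhost:5002/api/tts",
    params={"text": "Hello from the demo server."},
)
resp.raise_for_status()
with open("server_output.wav", "wb") as f:
    f.write(resp.content)  # the response body is the synthesized wav
```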
/TTS/server/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/douhaohaode/xtts_v2/782de23d4ee58d25d1b1208c27336d6f665b6872/TTS/server/__init__.py
--------------------------------------------------------------------------------
/TTS/server/conf.json:
--------------------------------------------------------------------------------
1 | {
2 | "tts_path":"/media/erogol/data_ssd/Models/libri_tts/5049/", // tts model root folder
3 | "tts_file":"best_model.pth", // tts checkpoint file
4 | "tts_config":"config.json", // tts config.json file
5 | "tts_speakers": null, // json file listing speaker ids. null if no speaker embedding.
6 | "vocoder_config":null,
7 | "vocoder_file": null,
8 | "is_wavernn_batched":true,
9 | "port": 5002,
10 | "use_cuda": true,
11 | "debug": true
12 | }
13 |
--------------------------------------------------------------------------------
/TTS/server/static/coqui-log-green-TTS.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/douhaohaode/xtts_v2/782de23d4ee58d25d1b1208c27336d6f665b6872/TTS/server/static/coqui-log-green-TTS.png
--------------------------------------------------------------------------------
/TTS/server/templates/details.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 | TTS engine
12 |
13 |
14 |
17 |
18 |
19 |
30 |
31 |
32 |
33 |
35 |
36 | {% if show_details == true %}
37 |
38 |
39 | Model details
40 |
41 |
42 |
43 |
44 | CLI arguments:
45 |
46 |
47 | CLI key |
48 | Value |
49 |
50 |
51 | {% for key, value in args.items() %}
52 |
53 |
54 | {{ key }} |
55 | {{ value }} |
56 |
57 |
58 | {% endfor %}
59 |
60 |
61 |
62 |
63 |
64 |
65 | {% if model_config != None %}
66 |
67 |
68 | Model config:
69 |
70 |
71 |
72 | Key |
73 | Value |
74 |
75 |
76 |
77 | {% for key, value in model_config.items() %}
78 |
79 |
80 | {{ key }} |
81 | {{ value }} |
82 |
83 |
84 | {% endfor %}
85 |
86 |
87 |
88 |
89 | {% endif %}
90 |
91 |
92 |
93 |
94 |
95 |
96 | {% if vocoder_config != None %}
97 |
98 | Vocoder model config:
99 |
100 |
101 |
102 | Key |
103 | Value |
104 |
105 |
106 |
107 | {% for key, value in vocoder_config.items() %}
108 |
109 |
110 | {{ key }} |
111 | {{ value }} |
112 |
113 |
114 | {% endfor %}
115 |
116 |
117 |
118 |
119 | {% endif %}
120 |
121 |
122 | {% else %}
123 |
124 | Please start server with --show_details=true to see details.
125 |
126 |
127 | {% endif %}
128 |
129 |
130 |
131 |
--------------------------------------------------------------------------------
/TTS/tts/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/douhaohaode/xtts_v2/782de23d4ee58d25d1b1208c27336d6f665b6872/TTS/tts/__init__.py
--------------------------------------------------------------------------------
/TTS/tts/configs/__init__.py:
--------------------------------------------------------------------------------
1 | import importlib
2 | import os
3 | from inspect import isclass
4 |
5 | # import all files under configs/
6 | # configs_dir = os.path.dirname(__file__)
7 | # for file in os.listdir(configs_dir):
8 | # path = os.path.join(configs_dir, file)
9 | # if not file.startswith("_") and not file.startswith(".") and (file.endswith(".py") or os.path.isdir(path)):
10 | # config_name = file[: file.find(".py")] if file.endswith(".py") else file
11 | # module = importlib.import_module("TTS.tts.configs." + config_name)
12 | # for attribute_name in dir(module):
13 | # attribute = getattr(module, attribute_name)
14 |
15 | # if isclass(attribute):
16 | # # Add the class to this package's variables
17 | # globals()[attribute_name] = attribute
18 |
--------------------------------------------------------------------------------
/TTS/tts/configs/tacotron2_config.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 |
3 | from TTS.tts.configs.tacotron_config import TacotronConfig
4 |
5 |
6 | @dataclass
7 | class Tacotron2Config(TacotronConfig):
8 | """Defines parameters for Tacotron2 based models.
9 |
10 | Example:
11 |
12 | >>> from TTS.tts.configs.tacotron2_config import Tacotron2Config
13 | >>> config = Tacotron2Config()
14 |
15 | Check `TacotronConfig` for argument descriptions.
16 | """
17 |
18 | model: str = "tacotron2"
19 | out_channels: int = 80
20 | encoder_in_features: int = 512
21 | decoder_in_features: int = 512
22 |
--------------------------------------------------------------------------------
/TTS/tts/configs/tortoise_config.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass, field
2 |
3 | from TTS.tts.configs.shared_configs import BaseTTSConfig
4 | from TTS.tts.models.tortoise import TortoiseArgs, TortoiseAudioConfig
5 |
6 |
7 | @dataclass
8 | class TortoiseConfig(BaseTTSConfig):
9 | """Defines parameters for Tortoise TTS model.
10 |
11 | Args:
12 | model (str):
13 | Model name. Do not change unless you know what you are doing.
14 |
15 | model_args (TortoiseArgs):
16 | Model architecture arguments. Defaults to `TortoiseArgs()`.
17 |
18 | audio (TortoiseAudioConfig):
19 | Audio processing configuration. Defaults to `TortoiseAudioConfig()`.
20 |
21 | model_dir (str):
22 | Path to the folder that has all the Tortoise models. Defaults to None.
23 |
24 | temperature (float):
25 | Temperature for the autoregressive model inference. Larger values make predictions more creative, sacrificing stability. Defaults to `0.2`.
26 |
27 | length_penalty (float):
28 | Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to the sequence length,
29 | which in turn is used to divide the score of the sequence. Since the score is the log likelihood of the sequence (i.e. negative),
30 | length_penalty > 0.0 promotes longer sequences, while length_penalty < 0.0 encourages shorter sequences.
31 |
32 | repetition_penalty (float):
33 | The parameter for repetition penalty. 1.0 means no penalty. Defaults to `2.0`.
34 |
35 | top_p (float):
36 | If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
37 | Defaults to `0.8`.
38 |
39 | cond_free_k (float):
40 | Knob that determines how to balance the conditioning free signal with the conditioning-present signal. [0,inf].
41 | As cond_free_k increases, the output becomes dominated by the conditioning-free signal.
42 | Formula is: output=cond_present_output*(cond_free_k+1)-cond_absent_output*cond_free_k. Defaults to `2.0`.
43 |
44 | diffusion_temperature (float):
45 | Controls the variance of the noise fed into the diffusion model. [0,1]. Values at 0
46 | are the "mean" prediction of the diffusion network and will sound bland and smeared.
47 | Defaults to `1.0`.
48 |
49 | num_autoregressive_samples (int):
50 | Number of samples taken from the autoregressive model, all of which are filtered using CLVP.
51 | As Tortoise is a probabilistic model, more samples means a higher probability of creating something "great".
52 | Defaults to `16`.
53 |
54 | diffusion_iterations (int):
55 | Number of diffusion steps to perform. [0,4000]. More steps means the network has more chances to iteratively refine
56 | the output, which should theoretically mean a higher quality output. Generally a value above 250 is not noticeably better,
57 | however. Defaults to `30`.
58 |
59 | sampler (str):
60 | Diffusion sampler to be used. `ddim` or `dpm++2m`. Defaults to `ddim`.
61 | Note:
62 | Check :class:`TTS.tts.configs.shared_configs.BaseTTSConfig` for the inherited parameters.
63 |
64 | Example:
65 |
66 | >>> from TTS.tts.configs.tortoise_config import TortoiseConfig
67 | >>> config = TortoiseConfig()
68 | """
69 |
70 | model: str = "tortoise"
71 | # model specific params
72 | model_args: TortoiseArgs = field(default_factory=TortoiseArgs)
73 | audio: TortoiseAudioConfig = field(default_factory=TortoiseAudioConfig)
74 | model_dir: str = None
75 |
76 | # settings
77 | temperature: float = 0.2
78 | length_penalty: float = 1.0
79 | repetition_penalty: float = 2.0
80 | top_p: float = 0.8
81 | cond_free_k: float = 2.0
82 | diffusion_temperature: float = 1.0
83 |
84 | # inference params
85 | num_autoregressive_samples: int = 16
86 | diffusion_iterations: int = 30
87 | sampler: str = "ddim"
88 |
--------------------------------------------------------------------------------
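The docstring's example only constructs the default config; as a small follow-up sketch, the documented inference knobs can be overridden the same way (the values below are illustrative, not recommended settings):

```python
# Sketch: tweaking the documented Tortoise inference knobs on the config.
from TTS.tts.configs.tortoise_config import TortoiseConfig

config = TortoiseConfig()
config.num_autoregressive_samples = 32  # more candidates for CLVP to filter (slower)
config.diffusion_iterations = 100       # more refinement steps in the diffusion decoder
config.sampler = "dpm++2m"              # alternative to the default "ddim" sampler
```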
/TTS/tts/layers/__init__.py:
--------------------------------------------------------------------------------
1 | from TTS.tts.layers.losses import *
2 |
--------------------------------------------------------------------------------
/TTS/tts/layers/align_tts/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/douhaohaode/xtts_v2/782de23d4ee58d25d1b1208c27336d6f665b6872/TTS/tts/layers/align_tts/__init__.py
--------------------------------------------------------------------------------
/TTS/tts/layers/align_tts/duration_predictor.py:
--------------------------------------------------------------------------------
1 | from torch import nn
2 |
3 | from TTS.tts.layers.generic.pos_encoding import PositionalEncoding
4 | from TTS.tts.layers.generic.transformer import FFTransformerBlock
5 |
6 |
7 | class DurationPredictor(nn.Module):
8 | def __init__(self, num_chars, hidden_channels, hidden_channels_ffn, num_heads):
9 | super().__init__()
10 | self.embed = nn.Embedding(num_chars, hidden_channels)
11 | self.pos_enc = PositionalEncoding(hidden_channels, dropout_p=0.1)
12 | self.FFT = FFTransformerBlock(hidden_channels, num_heads, hidden_channels_ffn, 2, 0.1)
13 | self.out_layer = nn.Conv1d(hidden_channels, 1, 1)
14 |
15 | def forward(self, text, text_lengths):
16 | # B, L -> B, L
17 | emb = self.embed(text)
18 | emb = self.pos_enc(emb.transpose(1, 2))
19 | x = self.FFT(emb, text_lengths)
20 | x = self.out_layer(x).squeeze(-1)
21 | return x
22 |
--------------------------------------------------------------------------------
/TTS/tts/layers/align_tts/mdn.py:
--------------------------------------------------------------------------------
1 | from torch import nn
2 |
3 |
4 | class MDNBlock(nn.Module):
5 | """Mixture of Density Network implementation
6 | https://arxiv.org/pdf/2003.01950.pdf
7 | """
8 |
9 | def __init__(self, in_channels, out_channels):
10 | super().__init__()
11 | self.out_channels = out_channels
12 | self.conv1 = nn.Conv1d(in_channels, in_channels, 1)
13 | self.norm = nn.LayerNorm(in_channels)
14 | self.relu = nn.ReLU()
15 | self.dropout = nn.Dropout(0.1)
16 | self.conv2 = nn.Conv1d(in_channels, out_channels, 1)
17 |
18 | def forward(self, x):
19 | o = self.conv1(x)
20 | o = o.transpose(1, 2)
21 | o = self.norm(o)
22 | o = o.transpose(1, 2)
23 | o = self.relu(o)
24 | o = self.dropout(o)
25 | mu_sigma = self.conv2(o)
26 | # TODO: check this sigmoid
27 | # mu = torch.sigmoid(mu_sigma[:, :self.out_channels//2, :])
28 | mu = mu_sigma[:, : self.out_channels // 2, :]
29 | log_sigma = mu_sigma[:, self.out_channels // 2 :, :]
30 | return mu, log_sigma
31 |
--------------------------------------------------------------------------------
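A quick shape sketch for the block above: the 1x1 output convolution produces `out_channels` channels that are split in half into predicted means and log-scales, so `out_channels` should be twice the target feature dimension (80 mel bins below is an illustrative choice):

```python
# Sketch: MDNBlock splits its 1x1-conv output into mean and log-sigma halves.
import torch

from TTS.tts.layers.align_tts.mdn import MDNBlock

mdn = MDNBlock(in_channels=256, out_channels=2 * 80)
hidden = torch.rand(4, 256, 120)  # (batch, channels, frames)
mu, log_sigma = mdn(hidden)
print(mu.shape, log_sigma.shape)  # torch.Size([4, 80, 120]) for each half
```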
/TTS/tts/layers/bark/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/douhaohaode/xtts_v2/782de23d4ee58d25d1b1208c27336d6f665b6872/TTS/tts/layers/bark/__init__.py
--------------------------------------------------------------------------------
/TTS/tts/layers/bark/hubert/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/douhaohaode/xtts_v2/782de23d4ee58d25d1b1208c27336d6f665b6872/TTS/tts/layers/bark/hubert/__init__.py
--------------------------------------------------------------------------------
/TTS/tts/layers/bark/hubert/hubert_manager.py:
--------------------------------------------------------------------------------
1 | # From https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer
2 |
3 | import os.path
4 | import shutil
5 | import urllib.request
6 |
7 | import huggingface_hub
8 |
9 |
10 | class HubertManager:
11 | @staticmethod
12 | def make_sure_hubert_installed(
13 | download_url: str = "https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt", model_path: str = ""
14 | ):
15 | if not os.path.isfile(model_path):
16 | print("Downloading HuBERT base model")
17 | urllib.request.urlretrieve(download_url, model_path)
18 | print("Downloaded HuBERT")
19 | return model_path
20 | return None
21 |
22 | @staticmethod
23 | def make_sure_tokenizer_installed(
24 | model: str = "quantifier_hubert_base_ls960_14.pth",
25 | repo: str = "GitMylo/bark-voice-cloning",
26 | model_path: str = "",
27 | ):
28 | model_dir = os.path.dirname(model_path)
29 | if not os.path.isfile(model_path):
30 | print("Downloading HuBERT custom tokenizer")
31 | huggingface_hub.hf_hub_download(repo, model, local_dir=model_dir, local_dir_use_symlinks=False)
32 | shutil.move(os.path.join(model_dir, model), model_path)
33 | print("Downloaded tokenizer")
34 | return model_path
35 | return None
36 |
--------------------------------------------------------------------------------
/TTS/tts/layers/bark/hubert/kmeans_hubert.py:
--------------------------------------------------------------------------------
1 | """
2 | Modified HuBERT model without kmeans.
3 | Original author: https://github.com/lucidrains/
4 | Modified by: https://www.github.com/gitmylo/
5 | License: MIT
6 | """
7 |
8 | # Modified code from https://github.com/lucidrains/audiolm-pytorch/blob/main/audiolm_pytorch/hubert_kmeans.py
9 |
10 | import logging
11 | from pathlib import Path
12 |
13 | import torch
14 | from einops import pack, unpack
15 | from torch import nn
16 | from torchaudio.functional import resample
17 | from transformers import HubertModel
18 |
19 |
20 | def round_down_nearest_multiple(num, divisor):
21 | return num // divisor * divisor
22 |
23 |
24 | def curtail_to_multiple(t, mult, from_left=False):
25 | data_len = t.shape[-1]
26 | rounded_seq_len = round_down_nearest_multiple(data_len, mult)
27 | seq_slice = slice(None, rounded_seq_len) if not from_left else slice(-rounded_seq_len, None)
28 | return t[..., seq_slice]
29 |
30 |
31 | def exists(val):
32 | return val is not None
33 |
34 |
35 | def default(val, d):
36 | return val if exists(val) else d
37 |
38 |
39 | class CustomHubert(nn.Module):
40 | """
41 | checkpoint and kmeans can be downloaded at https://github.com/facebookresearch/fairseq/tree/main/examples/hubert
42 | or you can train your own
43 | """
44 |
45 | def __init__(self, checkpoint_path, target_sample_hz=16000, seq_len_multiple_of=None, output_layer=9, device=None):
46 | super().__init__()
47 | self.target_sample_hz = target_sample_hz
48 | self.seq_len_multiple_of = seq_len_multiple_of
49 | self.output_layer = output_layer
50 | if device is not None:
51 | self.to(device)
52 | self.model = HubertModel.from_pretrained("facebook/hubert-base-ls960")
53 | if device is not None:
54 | self.model.to(device)
55 | self.model.eval()
56 |
57 | @property
58 | def groups(self):
59 | return 1
60 |
61 | @torch.no_grad()
62 | def forward(self, wav_input, flatten=True, input_sample_hz=None):
63 | device = wav_input.device
64 |
65 | if exists(input_sample_hz):
66 | wav_input = resample(wav_input, input_sample_hz, self.target_sample_hz)
67 |
68 | if exists(self.seq_len_multiple_of):
69 | wav_input = curtail_to_multiple(wav_input, self.seq_len_multiple_of)
70 |
71 | outputs = self.model.forward(
72 | wav_input,
73 | output_hidden_states=True,
74 | )
75 | embed = outputs["hidden_states"][self.output_layer]
76 | embed, packed_shape = pack([embed], "* d")
77 | codebook_indices = torch.from_numpy(embed.cpu().detach().numpy()).to(device)
78 | if flatten:
79 | return codebook_indices
80 |
81 | (codebook_indices,) = unpack(codebook_indices, packed_shape, "*")
82 | return codebook_indices
83 |
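A hedged usage sketch (not part of the file): it pulls "facebook/hubert-base-ls960" from the Hugging Face Hub on first use, so network access and the transformers dependency are assumed.

import torch
from TTS.tts.layers.bark.hubert.kmeans_hubert import CustomHubert

hubert = CustomHubert(checkpoint_path=None)  # checkpoint_path is accepted but unused by this class
wav = torch.randn(1, 16000)                  # [B, samples] waveform already at 16 kHz
features = hubert(wav)                       # flattened [B * T_frames, 768] hidden states (no k-means applied here)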
--------------------------------------------------------------------------------
/TTS/tts/layers/delightful_tts/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/douhaohaode/xtts_v2/782de23d4ee58d25d1b1208c27336d6f665b6872/TTS/tts/layers/delightful_tts/__init__.py
--------------------------------------------------------------------------------
/TTS/tts/layers/delightful_tts/energy_adaptor.py:
--------------------------------------------------------------------------------
1 | from typing import Callable, Tuple
2 |
3 | import torch
4 | import torch.nn as nn # pylint: disable=consider-using-from-import
5 |
6 | from TTS.tts.layers.delightful_tts.variance_predictor import VariancePredictor
7 | from TTS.tts.utils.helpers import average_over_durations
8 |
9 |
10 | class EnergyAdaptor(nn.Module): # pylint: disable=abstract-method
11 | """Variance Adaptor with an added 1D conv layer. Used to
12 | get energy embeddings.
13 |
14 | Args:
15 | channels_in (int): Number of in channels for conv layers.
16 | channels_out (int): Number of out channels.
17 |         kernel_size (int): Size of the kernel for the conv layers.
18 | dropout (float): Probability of dropout.
19 | lrelu_slope (float): Slope for the leaky relu.
20 |         emb_kernel_size (int): Size of the kernel for the energy embedding.
21 |
22 | Inputs: inputs, mask
23 | - **inputs** (batch, time1, dim): Tensor containing input vector
24 | - **target** (batch, 1, time2): Tensor containing the energy target
25 | - **dr** (batch, time1): Tensor containing aligner durations vector
26 | - **mask** (batch, time1): Tensor containing indices to be masked
27 | Returns:
28 | - **energy prediction** (batch, 1, time1): Tensor produced by energy predictor
29 |         - **energy embedding** (batch, channels, time1): Tensor produced by the energy adaptor
30 |         - **average energy target (train only)** (batch, 1, time1): Tensor produced after averaging over durations
31 |
32 | """
33 |
34 | def __init__(
35 | self,
36 | channels_in: int,
37 | channels_hidden: int,
38 | channels_out: int,
39 | kernel_size: int,
40 | dropout: float,
41 | lrelu_slope: float,
42 | emb_kernel_size: int,
43 | ):
44 | super().__init__()
45 | self.energy_predictor = VariancePredictor(
46 | channels_in=channels_in,
47 | channels=channels_hidden,
48 | channels_out=channels_out,
49 | kernel_size=kernel_size,
50 | p_dropout=dropout,
51 | lrelu_slope=lrelu_slope,
52 | )
53 | self.energy_emb = nn.Conv1d(
54 | 1,
55 | channels_hidden,
56 | kernel_size=emb_kernel_size,
57 | padding=int((emb_kernel_size - 1) / 2),
58 | )
59 |
60 | def get_energy_embedding_train(
61 | self, x: torch.Tensor, target: torch.Tensor, dr: torch.IntTensor, mask: torch.Tensor
62 | ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
63 | """
64 | Shapes:
65 | x: :math: `[B, T_src, C]`
66 | target: :math: `[B, 1, T_max2]`
67 | dr: :math: `[B, T_src]`
68 | mask: :math: `[B, T_src]`
69 | """
70 | energy_pred = self.energy_predictor(x, mask)
71 | energy_pred.unsqueeze_(1)
72 | avg_energy_target = average_over_durations(target, dr)
73 | energy_emb = self.energy_emb(avg_energy_target)
74 | return energy_pred, avg_energy_target, energy_emb
75 |
76 | def get_energy_embedding(self, x: torch.Tensor, mask: torch.Tensor, energy_transform: Callable) -> torch.Tensor:
77 | energy_pred = self.energy_predictor(x, mask)
78 | energy_pred.unsqueeze_(1)
79 | if energy_transform is not None:
80 | energy_pred = energy_transform(energy_pred, (~mask).sum(dim=(1, 2)), self.pitch_mean, self.pitch_std)
81 | energy_emb_pred = self.energy_emb(energy_pred)
82 | return energy_emb_pred, energy_pred
83 |
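A training-path sketch with hypothetical shapes (the durations must sum to the number of target frames, and a True mask entry marks a padded position):

import torch
from TTS.tts.layers.delightful_tts.energy_adaptor import EnergyAdaptor

adaptor = EnergyAdaptor(channels_in=128, channels_hidden=256, channels_out=1,
                        kernel_size=3, dropout=0.1, lrelu_slope=0.3, emb_kernel_size=3)
B, T_src, T_mel = 2, 6, 24
x = torch.randn(B, T_src, 128)                    # encoder outputs [B, T_src, C]
target = torch.rand(B, 1, T_mel)                  # frame-level energy [B, 1, T_mel]
dr = torch.full((B, T_src), 4, dtype=torch.long)  # durations summing to T_mel
mask = torch.zeros(B, T_src, dtype=torch.bool)    # True marks padded positions
energy_pred, avg_energy_target, energy_emb = adaptor.get_energy_embedding_train(x, target, dr, mask)
# energy_pred: [B, 1, T_src], avg_energy_target: [B, 1, T_src], energy_emb: [B, 256, T_src]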
--------------------------------------------------------------------------------
/TTS/tts/layers/delightful_tts/phoneme_prosody_predictor.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn # pylint: disable=consider-using-from-import
3 |
4 | from TTS.tts.layers.delightful_tts.conv_layers import ConvTransposed
5 |
6 |
7 | class PhonemeProsodyPredictor(nn.Module):
8 | """Non-parallel Prosody Predictor inspired by: https://arxiv.org/pdf/2102.00851.pdf
9 |     It consists of 2 layers of 1D convolutions, each followed by a leaky ReLU activation, layer norm,
10 |     and dropout, followed finally by a linear layer.
11 |
12 | Args:
13 | hidden_size (int): Size of hidden channels.
14 | kernel_size (int): Kernel size for the conv layers.
15 |         dropout (float): Probability of dropout.
16 | bottleneck_size (int): bottleneck size for last linear layer.
17 | lrelu_slope (float): Slope of the leaky relu.
18 | """
19 |
20 | def __init__(
21 | self,
22 | hidden_size: int,
23 | kernel_size: int,
24 | dropout: float,
25 | bottleneck_size: int,
26 | lrelu_slope: float,
27 | ):
28 | super().__init__()
29 | self.d_model = hidden_size
30 | self.layers = nn.ModuleList(
31 | [
32 | ConvTransposed(
33 | self.d_model,
34 | self.d_model,
35 | kernel_size=kernel_size,
36 | padding=(kernel_size - 1) // 2,
37 | ),
38 | nn.LeakyReLU(lrelu_slope),
39 | nn.LayerNorm(self.d_model),
40 | nn.Dropout(dropout),
41 | ConvTransposed(
42 | self.d_model,
43 | self.d_model,
44 | kernel_size=kernel_size,
45 | padding=(kernel_size - 1) // 2,
46 | ),
47 | nn.LeakyReLU(lrelu_slope),
48 | nn.LayerNorm(self.d_model),
49 | nn.Dropout(dropout),
50 | ]
51 | )
52 | self.predictor_bottleneck = nn.Linear(self.d_model, bottleneck_size)
53 |
54 | def forward(self, x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
55 | """
56 | Shapes:
57 | x: :math: `[B, T, D]`
58 | mask: :math: `[B, T]`
59 | """
60 | mask = mask.unsqueeze(2)
61 | for layer in self.layers:
62 | x = layer(x)
63 | x = x.masked_fill(mask, 0.0)
64 | x = self.predictor_bottleneck(x)
65 | return x
66 |
--------------------------------------------------------------------------------
/TTS/tts/layers/delightful_tts/pitch_adaptor.py:
--------------------------------------------------------------------------------
1 | from typing import Callable, Tuple
2 |
3 | import torch
4 | import torch.nn as nn # pylint: disable=consider-using-from-import
5 |
6 | from TTS.tts.layers.delightful_tts.variance_predictor import VariancePredictor
7 | from TTS.tts.utils.helpers import average_over_durations
8 |
9 |
10 | class PitchAdaptor(nn.Module): # pylint: disable=abstract-method
11 | """Module to get pitch embeddings via pitch predictor
12 |
13 | Args:
14 | n_input (int): Number of pitch predictor input channels.
15 | n_hidden (int): Number of pitch predictor hidden channels.
16 | n_out (int): Number of pitch predictor out channels.
17 |         kernel_size (int): Size of the kernel for conv layers.
18 |         emb_kernel_size (int): Size of the kernel for the pitch embedding.
19 | p_dropout (float): Probability of dropout.
20 | lrelu_slope (float): Slope for the leaky relu.
21 |
22 | Inputs: inputs, mask
23 | - **inputs** (batch, time1, dim): Tensor containing input vector
24 | - **target** (batch, 1, time2): Tensor containing the pitch target
25 | - **dr** (batch, time1): Tensor containing aligner durations vector
26 | - **mask** (batch, time1): Tensor containing indices to be masked
27 | Returns:
28 | - **pitch prediction** (batch, 1, time1): Tensor produced by pitch predictor
29 |         - **pitch embedding** (batch, channels, time1): Tensor produced by the pitch adaptor
30 |         - **average pitch target (train only)** (batch, 1, time1): Tensor produced after averaging over durations
31 | """
32 |
33 | def __init__(
34 | self,
35 | n_input: int,
36 | n_hidden: int,
37 | n_out: int,
38 | kernel_size: int,
39 | emb_kernel_size: int,
40 | p_dropout: float,
41 | lrelu_slope: float,
42 | ):
43 | super().__init__()
44 | self.pitch_predictor = VariancePredictor(
45 | channels_in=n_input,
46 | channels=n_hidden,
47 | channels_out=n_out,
48 | kernel_size=kernel_size,
49 | p_dropout=p_dropout,
50 | lrelu_slope=lrelu_slope,
51 | )
52 | self.pitch_emb = nn.Conv1d(
53 | 1,
54 | n_input,
55 | kernel_size=emb_kernel_size,
56 | padding=int((emb_kernel_size - 1) / 2),
57 | )
58 |
59 | def get_pitch_embedding_train(
60 | self, x: torch.Tensor, target: torch.Tensor, dr: torch.IntTensor, mask: torch.Tensor
61 | ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
62 | """
63 | Shapes:
64 | x: :math: `[B, T_src, C]`
65 | target: :math: `[B, 1, T_max2]`
66 | dr: :math: `[B, T_src]`
67 | mask: :math: `[B, T_src]`
68 | """
69 | pitch_pred = self.pitch_predictor(x, mask) # [B, T_src, C_hidden], [B, T_src] --> [B, T_src]
70 | pitch_pred.unsqueeze_(1) # --> [B, 1, T_src]
71 | avg_pitch_target = average_over_durations(target, dr) # [B, 1, T_mel], [B, T_src] --> [B, 1, T_src]
72 | pitch_emb = self.pitch_emb(avg_pitch_target) # [B, 1, T_src] --> [B, C_hidden, T_src]
73 | return pitch_pred, avg_pitch_target, pitch_emb
74 |
75 | def get_pitch_embedding(
76 | self,
77 | x: torch.Tensor,
78 | mask: torch.Tensor,
79 | pitch_transform: Callable,
80 | pitch_mean: torch.Tensor,
81 | pitch_std: torch.Tensor,
82 | ) -> torch.Tensor:
83 | pitch_pred = self.pitch_predictor(x, mask)
84 | if pitch_transform is not None:
85 | pitch_pred = pitch_transform(pitch_pred, (~mask).sum(), pitch_mean, pitch_std)
86 | pitch_pred.unsqueeze_(1)
87 | pitch_emb_pred = self.pitch_emb(pitch_pred)
88 | return pitch_emb_pred, pitch_pred
89 |
--------------------------------------------------------------------------------
/TTS/tts/layers/delightful_tts/variance_predictor.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn # pylint: disable=consider-using-from-import
3 |
4 | from TTS.tts.layers.delightful_tts.conv_layers import ConvTransposed
5 |
6 |
7 | class VariancePredictor(nn.Module):
8 | """
9 |     The network consists of two 1D convolutional layers, each with a leaky ReLU activation
10 |     followed by layer normalization and a dropout layer, and finally an
11 |     extra linear layer that projects the hidden states to the output sequence.
12 |
13 | Args:
14 | channels_in (int): Number of in channels for conv layers.
15 | channels_out (int): Number of out channels for the last linear layer.
16 |         kernel_size (int): Size of the kernel for the conv layers.
17 | p_dropout (float): Probability of dropout.
18 | lrelu_slope (float): Slope for the leaky relu.
19 |
20 | Inputs: inputs, mask
21 | - **inputs** (batch, time, dim): Tensor containing input vector
22 | - **mask** (batch, time): Tensor containing indices to be masked
23 | Returns:
24 | - **outputs** (batch, time): Tensor produced by last linear layer.
25 | """
26 |
27 | def __init__(
28 | self, channels_in: int, channels: int, channels_out: int, kernel_size: int, p_dropout: float, lrelu_slope: float
29 | ):
30 | super().__init__()
31 |
32 | self.layers = nn.ModuleList(
33 | [
34 | ConvTransposed(
35 | channels_in,
36 | channels,
37 | kernel_size=kernel_size,
38 | padding=(kernel_size - 1) // 2,
39 | ),
40 | nn.LeakyReLU(lrelu_slope),
41 | nn.LayerNorm(channels),
42 | nn.Dropout(p_dropout),
43 | ConvTransposed(
44 | channels,
45 | channels,
46 | kernel_size=kernel_size,
47 | padding=(kernel_size - 1) // 2,
48 | ),
49 | nn.LeakyReLU(lrelu_slope),
50 | nn.LayerNorm(channels),
51 | nn.Dropout(p_dropout),
52 | ]
53 | )
54 |
55 | self.linear_layer = nn.Linear(channels, channels_out)
56 |
57 | def forward(self, x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
58 | """
59 | Shapes:
60 | x: :math: `[B, T_src, C]`
61 | mask: :math: `[B, T_src]`
62 | """
63 | for layer in self.layers:
64 | x = layer(x)
65 | x = self.linear_layer(x)
66 | x = x.squeeze(-1)
67 | x = x.masked_fill(mask, 0.0)
68 | return x
69 |
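A short sketch with made-up sizes showing the expected [B, T, C] input and [B, T] output:

import torch
from TTS.tts.layers.delightful_tts.variance_predictor import VariancePredictor

predictor = VariancePredictor(channels_in=128, channels=256, channels_out=1,
                              kernel_size=3, p_dropout=0.1, lrelu_slope=0.3)
x = torch.randn(2, 37, 128)                  # [B, T, C] inputs
mask = torch.zeros(2, 37, dtype=torch.bool)  # True marks padded positions
out = predictor(x, mask)                     # [B, T] per-step predictions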
--------------------------------------------------------------------------------
/TTS/tts/layers/feed_forward/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/douhaohaode/xtts_v2/782de23d4ee58d25d1b1208c27336d6f665b6872/TTS/tts/layers/feed_forward/__init__.py
--------------------------------------------------------------------------------
/TTS/tts/layers/feed_forward/duration_predictor.py:
--------------------------------------------------------------------------------
1 | from torch import nn
2 |
3 | from TTS.tts.layers.generic.res_conv_bn import Conv1dBN
4 |
5 |
6 | class DurationPredictor(nn.Module):
7 | """Speedy Speech duration predictor model.
8 | Predicts phoneme durations from encoder outputs.
9 |
10 | Note:
11 |         Outputs are interpreted as log(durations).
12 |         To get actual durations, apply the exp transformation.
13 |
14 | conv_BN_4x1 -> conv_BN_3x1 -> conv_BN_1x1 -> conv_1x1
15 |
16 | Args:
17 | hidden_channels (int): number of channels in the inner layers.
18 | """
19 |
20 | def __init__(self, hidden_channels):
21 | super().__init__()
22 |
23 | self.layers = nn.ModuleList(
24 | [
25 | Conv1dBN(hidden_channels, hidden_channels, 4, 1),
26 | Conv1dBN(hidden_channels, hidden_channels, 3, 1),
27 | Conv1dBN(hidden_channels, hidden_channels, 1, 1),
28 | nn.Conv1d(hidden_channels, 1, 1),
29 | ]
30 | )
31 |
32 | def forward(self, x, x_mask):
33 | """
34 | Shapes:
35 | x: [B, C, T]
36 | x_mask: [B, 1, T]
37 | """
38 | o = x
39 | for layer in self.layers:
40 | o = layer(o) * x_mask
41 | return o
42 |
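An illustrative sketch (hypothetical sizes) that also applies the exp transformation mentioned in the Note above:

import torch
from TTS.tts.layers.feed_forward.duration_predictor import DurationPredictor

predictor = DurationPredictor(hidden_channels=128)
x = torch.randn(2, 128, 37)                    # [B, C, T] encoder outputs
x_mask = torch.ones(2, 1, 37)                  # [B, 1, T] valid-frame mask
log_durations = predictor(x, x_mask)           # [B, 1, T] log durations
durations = torch.exp(log_durations) * x_mask  # actual durations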
--------------------------------------------------------------------------------
/TTS/tts/layers/generic/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/douhaohaode/xtts_v2/782de23d4ee58d25d1b1208c27336d6f665b6872/TTS/tts/layers/generic/__init__.py
--------------------------------------------------------------------------------
/TTS/tts/layers/generic/aligner.py:
--------------------------------------------------------------------------------
1 | from typing import Tuple
2 |
3 | import torch
4 | from torch import nn
5 |
6 |
7 | class AlignmentNetwork(torch.nn.Module):
8 | """Aligner Network for learning alignment between the input text and the model output with Gaussian Attention.
9 |
10 | ::
11 |
12 | query -> conv1d -> relu -> conv1d -> relu -> conv1d -> L2_dist -> softmax -> alignment
13 | key -> conv1d -> relu -> conv1d -----------------------^
14 |
15 | Args:
16 | in_query_channels (int): Number of channels in the query network. Defaults to 80.
17 | in_key_channels (int): Number of channels in the key network. Defaults to 512.
18 | attn_channels (int): Number of inner channels in the attention layers. Defaults to 80.
19 | temperature (float): Temperature for the softmax. Defaults to 0.0005.
20 | """
21 |
22 | def __init__(
23 | self,
24 | in_query_channels=80,
25 | in_key_channels=512,
26 | attn_channels=80,
27 | temperature=0.0005,
28 | ):
29 | super().__init__()
30 | self.temperature = temperature
31 | self.softmax = torch.nn.Softmax(dim=3)
32 | self.log_softmax = torch.nn.LogSoftmax(dim=3)
33 |
34 | self.key_layer = nn.Sequential(
35 | nn.Conv1d(
36 | in_key_channels,
37 | in_key_channels * 2,
38 | kernel_size=3,
39 | padding=1,
40 | bias=True,
41 | ),
42 | torch.nn.ReLU(),
43 | nn.Conv1d(in_key_channels * 2, attn_channels, kernel_size=1, padding=0, bias=True),
44 | )
45 |
46 | self.query_layer = nn.Sequential(
47 | nn.Conv1d(
48 | in_query_channels,
49 | in_query_channels * 2,
50 | kernel_size=3,
51 | padding=1,
52 | bias=True,
53 | ),
54 | torch.nn.ReLU(),
55 | nn.Conv1d(in_query_channels * 2, in_query_channels, kernel_size=1, padding=0, bias=True),
56 | torch.nn.ReLU(),
57 | nn.Conv1d(in_query_channels, attn_channels, kernel_size=1, padding=0, bias=True),
58 | )
59 |
60 | self.init_layers()
61 |
62 | def init_layers(self):
63 | torch.nn.init.xavier_uniform_(self.key_layer[0].weight, gain=torch.nn.init.calculate_gain("relu"))
64 | torch.nn.init.xavier_uniform_(self.key_layer[2].weight, gain=torch.nn.init.calculate_gain("linear"))
65 | torch.nn.init.xavier_uniform_(self.query_layer[0].weight, gain=torch.nn.init.calculate_gain("relu"))
66 | torch.nn.init.xavier_uniform_(self.query_layer[2].weight, gain=torch.nn.init.calculate_gain("linear"))
67 | torch.nn.init.xavier_uniform_(self.query_layer[4].weight, gain=torch.nn.init.calculate_gain("linear"))
68 |
69 | def forward(
70 | self, queries: torch.tensor, keys: torch.tensor, mask: torch.tensor = None, attn_prior: torch.tensor = None
71 | ) -> Tuple[torch.tensor, torch.tensor]:
72 | """Forward pass of the aligner encoder.
73 | Shapes:
74 | - queries: :math:`[B, C, T_de]`
75 | - keys: :math:`[B, C_emb, T_en]`
76 | - mask: :math:`[B, T_de]`
77 | Output:
78 | attn (torch.tensor): :math:`[B, 1, T_en, T_de]` soft attention mask.
79 |             attn_logp (torch.tensor): :math:`[B, 1, T_en, T_de]` log probabilities.
80 | """
81 | key_out = self.key_layer(keys)
82 | query_out = self.query_layer(queries)
83 | attn_factor = (query_out[:, :, :, None] - key_out[:, :, None]) ** 2
84 | attn_logp = -self.temperature * attn_factor.sum(1, keepdim=True)
85 | if attn_prior is not None:
86 | attn_logp = self.log_softmax(attn_logp) + torch.log(attn_prior[:, None] + 1e-8)
87 |
88 | if mask is not None:
89 | attn_logp.data.masked_fill_(~mask.bool().unsqueeze(2), -float("inf"))
90 |
91 | attn = self.softmax(attn_logp)
92 | return attn, attn_logp
93 |
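A minimal sketch with hypothetical shapes, matching the default channel sizes (queries are e.g. mel frames and keys are e.g. text embeddings; the mask and prior are omitted):

import torch
from TTS.tts.layers.generic.aligner import AlignmentNetwork

aligner = AlignmentNetwork(in_query_channels=80, in_key_channels=512, attn_channels=80)
queries = torch.randn(2, 80, 120)         # [B, C, T_de], e.g. mel frames
keys = torch.randn(2, 512, 37)            # [B, C_emb, T_en], e.g. text embeddings
attn, attn_logp = aligner(queries, keys)  # soft alignment and log probabilities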
--------------------------------------------------------------------------------
/TTS/tts/layers/generic/gated_conv.py:
--------------------------------------------------------------------------------
1 | from torch import nn
2 |
3 | from .normalization import LayerNorm
4 |
5 |
6 | class GatedConvBlock(nn.Module):
7 | """Gated convolutional block as in https://arxiv.org/pdf/1612.08083.pdf
8 | Args:
9 | in_out_channels (int): number of input/output channels.
10 | kernel_size (int): convolution kernel size.
11 | dropout_p (float): dropout rate.
12 | """
13 |
14 | def __init__(self, in_out_channels, kernel_size, dropout_p, num_layers):
15 | super().__init__()
16 | # class arguments
17 | self.dropout_p = dropout_p
18 | self.num_layers = num_layers
19 | # define layers
20 | self.conv_layers = nn.ModuleList()
21 | self.norm_layers = nn.ModuleList()
22 | self.layers = nn.ModuleList()
23 | for _ in range(num_layers):
24 | self.conv_layers += [nn.Conv1d(in_out_channels, 2 * in_out_channels, kernel_size, padding=kernel_size // 2)]
25 | self.norm_layers += [LayerNorm(2 * in_out_channels)]
26 |
27 | def forward(self, x, x_mask):
28 | o = x
29 | res = x
30 | for idx in range(self.num_layers):
31 | o = nn.functional.dropout(o, p=self.dropout_p, training=self.training)
32 | o = self.conv_layers[idx](o * x_mask)
33 | o = self.norm_layers[idx](o)
34 | o = nn.functional.glu(o, dim=1)
35 | o = res + o
36 | res = o
37 | return o
38 |
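A quick sketch with hypothetical sizes (an odd kernel size keeps the time dimension unchanged):

import torch
from TTS.tts.layers.generic.gated_conv import GatedConvBlock

block = GatedConvBlock(in_out_channels=128, kernel_size=5, dropout_p=0.1, num_layers=3)
x = torch.randn(2, 128, 50)    # [B, C, T]
x_mask = torch.ones(2, 1, 50)  # [B, 1, T]
y = block(x, x_mask)           # [B, C, T] gated residual output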
--------------------------------------------------------------------------------
/TTS/tts/layers/generic/pos_encoding.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 | import torch
4 | from torch import nn
5 |
6 |
7 | class PositionalEncoding(nn.Module):
8 | """Sinusoidal positional encoding for non-recurrent neural networks.
9 | Implementation based on "Attention Is All You Need"
10 |
11 | Args:
12 | channels (int): embedding size
13 | dropout_p (float): dropout rate applied to the output.
14 | max_len (int): maximum sequence length.
15 | use_scale (bool): whether to use a learnable scaling coefficient.
16 | """
17 |
18 | def __init__(self, channels, dropout_p=0.0, max_len=5000, use_scale=False):
19 | super().__init__()
20 | if channels % 2 != 0:
21 | raise ValueError(
22 | "Cannot use sin/cos positional encoding with " "odd channels (got channels={:d})".format(channels)
23 | )
24 | self.use_scale = use_scale
25 | if use_scale:
26 | self.scale = torch.nn.Parameter(torch.ones(1))
27 | pe = torch.zeros(max_len, channels)
28 | position = torch.arange(0, max_len).unsqueeze(1)
29 | div_term = torch.pow(10000, torch.arange(0, channels, 2).float() / channels)
30 | pe[:, 0::2] = torch.sin(position.float() * div_term)
31 | pe[:, 1::2] = torch.cos(position.float() * div_term)
32 | pe = pe.unsqueeze(0).transpose(1, 2)
33 | self.register_buffer("pe", pe)
34 | if dropout_p > 0:
35 | self.dropout = nn.Dropout(p=dropout_p)
36 | self.channels = channels
37 |
38 | def forward(self, x, mask=None, first_idx=None, last_idx=None):
39 | """
40 | Shapes:
41 | x: [B, C, T]
42 | mask: [B, 1, T]
43 | first_idx: int
44 | last_idx: int
45 | """
46 |
47 | x = x * math.sqrt(self.channels)
48 | if first_idx is None:
49 | if self.pe.size(2) < x.size(2):
50 | raise RuntimeError(
51 | f"Sequence is {x.size(2)} but PositionalEncoding is"
52 | f" limited to {self.pe.size(2)}. See max_len argument."
53 | )
54 | if mask is not None:
55 | pos_enc = self.pe[:, :, : x.size(2)] * mask
56 | else:
57 | pos_enc = self.pe[:, :, : x.size(2)]
58 | if self.use_scale:
59 | x = x + self.scale * pos_enc
60 | else:
61 | x = x + pos_enc
62 | else:
63 | if self.use_scale:
64 | x = x + self.scale * self.pe[:, :, first_idx:last_idx]
65 | else:
66 | x = x + self.pe[:, :, first_idx:last_idx]
67 | if hasattr(self, "dropout"):
68 | x = self.dropout(x)
69 | return x
70 |
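A minimal sketch (hypothetical sizes): the module scales the input by sqrt(channels) and adds the precomputed sinusoidal table.

import torch
from TTS.tts.layers.generic.pos_encoding import PositionalEncoding

pos_enc = PositionalEncoding(channels=128, dropout_p=0.1)
x = torch.randn(2, 128, 50)  # [B, C, T]
y = pos_enc(x)               # [B, C, T] with positional information added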
--------------------------------------------------------------------------------
/TTS/tts/layers/generic/time_depth_sep_conv.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 |
4 |
5 | class TimeDepthSeparableConv(nn.Module):
6 | """Time depth separable convolution as in https://arxiv.org/pdf/1904.02619.pdf
7 |     It shows competitive results with less computation and a smaller memory footprint."""
8 |
9 | def __init__(self, in_channels, hid_channels, out_channels, kernel_size, bias=True):
10 | super().__init__()
11 |
12 | self.in_channels = in_channels
13 | self.out_channels = out_channels
14 | self.hid_channels = hid_channels
15 | self.kernel_size = kernel_size
16 |
17 | self.time_conv = nn.Conv1d(
18 | in_channels,
19 | 2 * hid_channels,
20 | kernel_size=1,
21 | stride=1,
22 | padding=0,
23 | bias=bias,
24 | )
25 | self.norm1 = nn.BatchNorm1d(2 * hid_channels)
26 | self.depth_conv = nn.Conv1d(
27 | hid_channels,
28 | hid_channels,
29 | kernel_size,
30 | stride=1,
31 | padding=(kernel_size - 1) // 2,
32 | groups=hid_channels,
33 | bias=bias,
34 | )
35 | self.norm2 = nn.BatchNorm1d(hid_channels)
36 | self.time_conv2 = nn.Conv1d(
37 | hid_channels,
38 | out_channels,
39 | kernel_size=1,
40 | stride=1,
41 | padding=0,
42 | bias=bias,
43 | )
44 | self.norm3 = nn.BatchNorm1d(out_channels)
45 |
46 | def forward(self, x):
47 | x_res = x
48 | x = self.time_conv(x)
49 | x = self.norm1(x)
50 | x = nn.functional.glu(x, dim=1)
51 | x = self.depth_conv(x)
52 | x = self.norm2(x)
53 | x = x * torch.sigmoid(x)
54 | x = self.time_conv2(x)
55 | x = self.norm3(x)
56 | x = x_res + x
57 | return x
58 |
59 |
60 | class TimeDepthSeparableConvBlock(nn.Module):
61 | def __init__(self, in_channels, hid_channels, out_channels, num_layers, kernel_size, bias=True):
62 | super().__init__()
63 | assert (kernel_size - 1) % 2 == 0
64 | assert num_layers > 1
65 |
66 | self.layers = nn.ModuleList()
67 | layer = TimeDepthSeparableConv(
68 | in_channels, hid_channels, out_channels if num_layers == 1 else hid_channels, kernel_size, bias
69 | )
70 | self.layers.append(layer)
71 | for idx in range(num_layers - 1):
72 | layer = TimeDepthSeparableConv(
73 | hid_channels,
74 | hid_channels,
75 | out_channels if (idx + 1) == (num_layers - 1) else hid_channels,
76 | kernel_size,
77 | bias,
78 | )
79 | self.layers.append(layer)
80 |
81 | def forward(self, x, mask):
82 | for layer in self.layers:
83 | x = layer(x * mask)
84 | return x
85 |
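A usage sketch with hypothetical sizes; the residual additions inside each layer effectively require the in, hidden and out channel counts to match, so equal values are used here:

import torch
from TTS.tts.layers.generic.time_depth_sep_conv import TimeDepthSeparableConvBlock

block = TimeDepthSeparableConvBlock(in_channels=128, hid_channels=128, out_channels=128,
                                    num_layers=3, kernel_size=5)
x = torch.randn(2, 128, 50)  # [B, C, T]
mask = torch.ones(2, 1, 50)  # [B, 1, T]
y = block(x, mask)           # [B, C, T]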
--------------------------------------------------------------------------------
/TTS/tts/layers/generic/transformer.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 | from torch import nn
4 |
5 |
6 | class FFTransformer(nn.Module):
7 | def __init__(self, in_out_channels, num_heads, hidden_channels_ffn=1024, kernel_size_fft=3, dropout_p=0.1):
8 | super().__init__()
9 | self.self_attn = nn.MultiheadAttention(in_out_channels, num_heads, dropout=dropout_p)
10 |
11 | padding = (kernel_size_fft - 1) // 2
12 | self.conv1 = nn.Conv1d(in_out_channels, hidden_channels_ffn, kernel_size=kernel_size_fft, padding=padding)
13 | self.conv2 = nn.Conv1d(hidden_channels_ffn, in_out_channels, kernel_size=kernel_size_fft, padding=padding)
14 |
15 | self.norm1 = nn.LayerNorm(in_out_channels)
16 | self.norm2 = nn.LayerNorm(in_out_channels)
17 |
18 | self.dropout1 = nn.Dropout(dropout_p)
19 | self.dropout2 = nn.Dropout(dropout_p)
20 |
21 | def forward(self, src, src_mask=None, src_key_padding_mask=None):
22 | """😦 ugly looking with all the transposing"""
23 | src = src.permute(2, 0, 1)
24 | src2, enc_align = self.self_attn(src, src, src, attn_mask=src_mask, key_padding_mask=src_key_padding_mask)
25 | src = src + self.dropout1(src2)
26 | src = self.norm1(src + src2)
27 | # T x B x D -> B x D x T
28 | src = src.permute(1, 2, 0)
29 | src2 = self.conv2(F.relu(self.conv1(src)))
30 | src2 = self.dropout2(src2)
31 | src = src + src2
32 | src = src.transpose(1, 2)
33 | src = self.norm2(src)
34 | src = src.transpose(1, 2)
35 | return src, enc_align
36 |
37 |
38 | class FFTransformerBlock(nn.Module):
39 | def __init__(self, in_out_channels, num_heads, hidden_channels_ffn, num_layers, dropout_p):
40 | super().__init__()
41 | self.fft_layers = nn.ModuleList(
42 | [
43 | FFTransformer(
44 | in_out_channels=in_out_channels,
45 | num_heads=num_heads,
46 | hidden_channels_ffn=hidden_channels_ffn,
47 | dropout_p=dropout_p,
48 | )
49 | for _ in range(num_layers)
50 | ]
51 | )
52 |
53 | def forward(self, x, mask=None, g=None): # pylint: disable=unused-argument
54 | """
55 | TODO: handle multi-speaker
56 | Shapes:
57 | - x: :math:`[B, C, T]`
58 | - mask: :math:`[B, 1, T] or [B, T]`
59 | """
60 | if mask is not None and mask.ndim == 3:
61 | mask = mask.squeeze(1)
62 |             # mask is negated, torch uses 1s and 0s reversely.
63 |             mask = ~mask.bool()
64 | alignments = []
65 | for layer in self.fft_layers:
66 | x, align = layer(x, src_key_padding_mask=mask)
67 | alignments.append(align.unsqueeze(1))
68 | alignments = torch.cat(alignments, 1)
69 | return x
70 |
71 |
72 | class FFTDurationPredictor:
73 | def __init__(
74 | self, in_channels, hidden_channels, num_heads, num_layers, dropout_p=0.1, cond_channels=None
75 | ): # pylint: disable=unused-argument
76 | self.fft = FFTransformerBlock(in_channels, num_heads, hidden_channels, num_layers, dropout_p)
77 | self.proj = nn.Linear(in_channels, 1)
78 |
79 | def forward(self, x, mask=None, g=None): # pylint: disable=unused-argument
80 | """
81 | Shapes:
82 | - x: :math:`[B, C, T]`
83 | - mask: :math:`[B, 1, T]`
84 |
85 | TODO: Handle the cond input
86 | """
87 | x = self.fft(x, mask=mask)
88 | x = self.proj(x)
89 | return x
90 |
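A minimal sketch with hypothetical sizes; the mask is a [B, 1, T] tensor of 1s for valid frames, which the block negates into a key-padding mask internally:

import torch
from TTS.tts.layers.generic.transformer import FFTransformerBlock

block = FFTransformerBlock(in_out_channels=80, num_heads=2, hidden_channels_ffn=256,
                           num_layers=2, dropout_p=0.1)
x = torch.randn(2, 80, 50)   # [B, C, T]
mask = torch.ones(2, 1, 50)  # [B, 1, T] valid-frame mask
y = block(x, mask=mask)      # [B, C, T]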
--------------------------------------------------------------------------------
/TTS/tts/layers/glow_tts/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/douhaohaode/xtts_v2/782de23d4ee58d25d1b1208c27336d6f665b6872/TTS/tts/layers/glow_tts/__init__.py
--------------------------------------------------------------------------------
/TTS/tts/layers/glow_tts/duration_predictor.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 |
4 | from ..generic.normalization import LayerNorm
5 |
6 |
7 | class DurationPredictor(nn.Module):
8 | """Glow-TTS duration prediction model.
9 |
10 | ::
11 |
12 | [2 x (conv1d_kxk -> relu -> layer_norm -> dropout)] -> conv1d_1x1 -> durs
13 |
14 | Args:
15 | in_channels (int): Number of channels of the input tensor.
16 | hidden_channels (int): Number of hidden channels of the network.
17 | kernel_size (int): Kernel size for the conv layers.
18 | dropout_p (float): Dropout rate used after each conv layer.
19 | """
20 |
21 | def __init__(self, in_channels, hidden_channels, kernel_size, dropout_p, cond_channels=None, language_emb_dim=None):
22 | super().__init__()
23 |
24 | # add language embedding dim in the input
25 | if language_emb_dim:
26 | in_channels += language_emb_dim
27 |
28 | # class arguments
29 | self.in_channels = in_channels
30 | self.filter_channels = hidden_channels
31 | self.kernel_size = kernel_size
32 | self.dropout_p = dropout_p
33 | # layers
34 | self.drop = nn.Dropout(dropout_p)
35 | self.conv_1 = nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size // 2)
36 | self.norm_1 = LayerNorm(hidden_channels)
37 | self.conv_2 = nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size // 2)
38 | self.norm_2 = LayerNorm(hidden_channels)
39 | # output layer
40 | self.proj = nn.Conv1d(hidden_channels, 1, 1)
41 | if cond_channels is not None and cond_channels != 0:
42 | self.cond = nn.Conv1d(cond_channels, in_channels, 1)
43 |
44 | if language_emb_dim != 0 and language_emb_dim is not None:
45 | self.cond_lang = nn.Conv1d(language_emb_dim, in_channels, 1)
46 |
47 | def forward(self, x, x_mask, g=None, lang_emb=None):
48 | """
49 | Shapes:
50 | - x: :math:`[B, C, T]`
51 | - x_mask: :math:`[B, 1, T]`
52 | - g: :math:`[B, C, 1]`
53 | """
54 | if g is not None:
55 | x = x + self.cond(g)
56 |
57 | if lang_emb is not None:
58 | x = x + self.cond_lang(lang_emb)
59 |
60 | x = self.conv_1(x * x_mask)
61 | x = torch.relu(x)
62 | x = self.norm_1(x)
63 | x = self.drop(x)
64 | x = self.conv_2(x * x_mask)
65 | x = torch.relu(x)
66 | x = self.norm_2(x)
67 | x = self.drop(x)
68 | x = self.proj(x * x_mask)
69 | return x * x_mask
70 |
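A minimal sketch (hypothetical sizes, no speaker or language conditioning):

import torch
from TTS.tts.layers.glow_tts.duration_predictor import DurationPredictor

predictor = DurationPredictor(in_channels=192, hidden_channels=256, kernel_size=3, dropout_p=0.1)
x = torch.randn(2, 192, 37)           # [B, C, T] encoder outputs
x_mask = torch.ones(2, 1, 37)         # [B, 1, T]
log_durations = predictor(x, x_mask)  # [B, 1, T]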
--------------------------------------------------------------------------------
/TTS/tts/layers/overflow/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/douhaohaode/xtts_v2/782de23d4ee58d25d1b1208c27336d6f665b6872/TTS/tts/layers/overflow/__init__.py
--------------------------------------------------------------------------------
/TTS/tts/layers/overflow/decoder.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 |
4 | from TTS.tts.layers.glow_tts.decoder import Decoder as GlowDecoder
5 | from TTS.tts.utils.helpers import sequence_mask
6 |
7 |
8 | class Decoder(nn.Module):
9 | """Uses glow decoder with some modifications.
10 | ::
11 |
12 | Squeeze -> ActNorm -> InvertibleConv1x1 -> AffineCoupling -> Unsqueeze
13 |
14 | Args:
15 | in_channels (int): channels of input tensor.
16 | hidden_channels (int): hidden decoder channels.
17 | kernel_size (int): Coupling block kernel size. (Wavenet filter kernel size.)
18 | dilation_rate (int): rate to increase dilation by each layer in a decoder block.
19 | num_flow_blocks (int): number of decoder blocks.
20 |         num_coupling_layers (int): number of coupling layers. (number of wavenet layers.)
21 | dropout_p (float): wavenet dropout rate.
22 | sigmoid_scale (bool): enable/disable sigmoid scaling in coupling layer.
23 | """
24 |
25 | def __init__(
26 | self,
27 | in_channels,
28 | hidden_channels,
29 | kernel_size,
30 | dilation_rate,
31 | num_flow_blocks,
32 | num_coupling_layers,
33 | dropout_p=0.0,
34 | num_splits=4,
35 | num_squeeze=2,
36 | sigmoid_scale=False,
37 | c_in_channels=0,
38 | ):
39 | super().__init__()
40 |
41 | self.glow_decoder = GlowDecoder(
42 | in_channels,
43 | hidden_channels,
44 | kernel_size,
45 | dilation_rate,
46 | num_flow_blocks,
47 | num_coupling_layers,
48 | dropout_p,
49 | num_splits,
50 | num_squeeze,
51 | sigmoid_scale,
52 | c_in_channels,
53 | )
54 | self.n_sqz = num_squeeze
55 |
56 | def forward(self, x, x_len, g=None, reverse=False):
57 | """
58 | Input shapes:
59 | - x: :math:`[B, C, T]`
60 | - x_len :math:`[B]`
61 | - g: :math:`[B, C]`
62 |
63 | Output shapes:
64 | - x: :math:`[B, C, T]`
65 | - x_len :math:`[B]`
66 |             - logdet_tot :math:`[B]`
67 | """
68 | x, x_len, x_max_len = self.preprocess(x, x_len, x_len.max())
69 | x_mask = torch.unsqueeze(sequence_mask(x_len, x_max_len), 1).to(x.dtype)
70 | x, logdet_tot = self.glow_decoder(x, x_mask, g, reverse)
71 | return x, x_len, logdet_tot
72 |
73 | def preprocess(self, y, y_lengths, y_max_length):
74 | if y_max_length is not None:
75 | y_max_length = torch.div(y_max_length, self.n_sqz, rounding_mode="floor") * self.n_sqz
76 | y = y[:, :, :y_max_length]
77 | y_lengths = torch.div(y_lengths, self.n_sqz, rounding_mode="floor") * self.n_sqz
78 | return y, y_lengths, y_max_length
79 |
80 | def store_inverse(self):
81 | self.glow_decoder.store_inverse()
82 |
--------------------------------------------------------------------------------
/TTS/tts/layers/overflow/plotting_utils.py:
--------------------------------------------------------------------------------
1 | from typing import Any
2 |
3 | import matplotlib.pyplot as plt
4 | import numpy as np
5 | import torch
6 |
7 |
8 | def validate_numpy_array(value: Any):
9 | r"""
10 |     Validates the input and makes sure it returns a numpy array (i.e., on CPU)
11 |
12 | Args:
13 | value (Any): the input value
14 |
15 | Raises:
16 |         TypeError: if the value is not a numpy array, a torch tensor or a list
17 |
18 | Returns:
19 | np.ndarray: numpy array of the value
20 | """
21 | if isinstance(value, np.ndarray):
22 | pass
23 | elif isinstance(value, list):
24 | value = np.array(value)
25 | elif torch.is_tensor(value):
26 | value = value.cpu().numpy()
27 | else:
28 | raise TypeError("Value must be a numpy array, a torch tensor or a list")
29 |
30 | return value
31 |
32 |
33 | def get_spec_from_most_probable_state(log_alpha_scaled, means, decoder=None):
34 | """Get the most probable state means from the log_alpha_scaled.
35 |
36 | Args:
37 | log_alpha_scaled (torch.Tensor): Log alpha scaled values.
38 | - Shape: :math:`(T, N)`
39 | means (torch.Tensor): Means of the states.
40 | - Shape: :math:`(N, T, D_out)`
41 | decoder (torch.nn.Module): Decoder module to decode the latent to melspectrogram. Defaults to None.
42 | """
43 | max_state_numbers = torch.max(log_alpha_scaled, dim=1)[1]
44 | max_len = means.shape[0]
45 | n_mel_channels = means.shape[2]
46 | max_state_numbers = max_state_numbers.unsqueeze(1).unsqueeze(1).expand(max_len, 1, n_mel_channels)
47 | means = torch.gather(means, 1, max_state_numbers).squeeze(1).to(log_alpha_scaled.dtype)
48 | if decoder is not None:
49 | mel = (
50 | decoder(means.T.unsqueeze(0), torch.tensor([means.shape[0]], device=means.device), reverse=True)[0]
51 | .squeeze(0)
52 | .T
53 | )
54 | else:
55 | mel = means
56 | return mel
57 |
58 |
59 | def plot_transition_probabilities_to_numpy(states, transition_probabilities, output_fig=False):
60 |     """Generates a transition probabilities plot for the states and the probability of transition.
61 |
62 | Args:
63 | states (torch.IntTensor): the states
64 | transition_probabilities (torch.FloatTensor): the transition probabilities
65 | """
66 | states = validate_numpy_array(states)
67 | transition_probabilities = validate_numpy_array(transition_probabilities)
68 |
69 | fig, ax = plt.subplots(figsize=(30, 3))
70 | ax.plot(transition_probabilities, "o")
71 | ax.set_title("Transition probability of state")
72 | ax.set_xlabel("hidden state")
73 | ax.set_ylabel("probability")
74 | ax.set_xticks([i for i in range(len(transition_probabilities))]) # pylint: disable=unnecessary-comprehension
75 | ax.set_xticklabels([int(x) for x in states], rotation=90)
76 | plt.tight_layout()
77 | if not output_fig:
78 | plt.close()
79 | return fig
80 |
--------------------------------------------------------------------------------
/TTS/tts/layers/tacotron/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/douhaohaode/xtts_v2/782de23d4ee58d25d1b1208c27336d6f665b6872/TTS/tts/layers/tacotron/__init__.py
--------------------------------------------------------------------------------
/TTS/tts/layers/tortoise/random_latent_generator.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 | import torch
4 | import torch.nn as nn
5 | import torch.nn.functional as F
6 |
7 |
8 | def fused_leaky_relu(input, bias=None, negative_slope=0.2, scale=2**0.5):
9 | if bias is not None:
10 | rest_dim = [1] * (input.ndim - bias.ndim - 1)
11 | return (
12 | F.leaky_relu(
13 | input + bias.view(1, bias.shape[0], *rest_dim),
14 | negative_slope=negative_slope,
15 | )
16 | * scale
17 | )
18 | else:
19 | return F.leaky_relu(input, negative_slope=0.2) * scale
20 |
21 |
22 | class EqualLinear(nn.Module):
23 | def __init__(self, in_dim, out_dim, bias=True, bias_init=0, lr_mul=1):
24 | super().__init__()
25 | self.weight = nn.Parameter(torch.randn(out_dim, in_dim).div_(lr_mul))
26 | if bias:
27 | self.bias = nn.Parameter(torch.zeros(out_dim).fill_(bias_init))
28 | else:
29 | self.bias = None
30 | self.scale = (1 / math.sqrt(in_dim)) * lr_mul
31 | self.lr_mul = lr_mul
32 |
33 | def forward(self, input):
34 | out = F.linear(input, self.weight * self.scale)
35 | out = fused_leaky_relu(out, self.bias * self.lr_mul)
36 | return out
37 |
38 |
39 | class RandomLatentConverter(nn.Module):
40 | def __init__(self, channels):
41 | super().__init__()
42 | self.layers = nn.Sequential(
43 | *[EqualLinear(channels, channels, lr_mul=0.1) for _ in range(5)], nn.Linear(channels, channels)
44 | )
45 | self.channels = channels
46 |
47 | def forward(self, ref):
48 | r = torch.randn(ref.shape[0], self.channels, device=ref.device)
49 | y = self.layers(r)
50 | return y
51 |
52 |
53 | if __name__ == "__main__":
54 | model = RandomLatentConverter(512)
55 | model(torch.randn(5, 512))
56 |
--------------------------------------------------------------------------------
/TTS/tts/layers/tortoise/tokenizer.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import torch
4 | from tokenizers import Tokenizer
5 |
6 | from TTS.tts.utils.text.cleaners import english_cleaners
7 |
8 | DEFAULT_VOCAB_FILE = os.path.join(
9 | os.path.dirname(os.path.realpath(__file__)), "../../utils/assets/tortoise/tokenizer.json"
10 | )
11 |
12 |
13 | class VoiceBpeTokenizer:
14 | def __init__(self, vocab_file=DEFAULT_VOCAB_FILE, vocab_str=None):
15 | self.tokenizer = None
16 | if vocab_file is not None:
17 | self.tokenizer = Tokenizer.from_file(vocab_file)
18 | if vocab_str is not None:
19 | self.tokenizer = Tokenizer.from_str(vocab_str)
20 |
21 | def preprocess_text(self, txt):
22 | txt = english_cleaners(txt)
23 | return txt
24 |
25 | def encode(self, txt):
26 | txt = self.preprocess_text(txt)
27 | txt = txt.replace(" ", "[SPACE]")
28 | return self.tokenizer.encode(txt).ids
29 |
30 | def decode(self, seq):
31 | if isinstance(seq, torch.Tensor):
32 | seq = seq.cpu().numpy()
33 | txt = self.tokenizer.decode(seq, skip_special_tokens=False).replace(" ", "")
34 | txt = txt.replace("[SPACE]", " ")
35 | txt = txt.replace("[STOP]", "")
36 | txt = txt.replace("[UNK]", "")
37 | return txt
38 |
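A hedged round-trip sketch; it assumes the bundled tokenizer.json asset referenced by DEFAULT_VOCAB_FILE is present in the installation:

from TTS.tts.layers.tortoise.tokenizer import VoiceBpeTokenizer

tokenizer = VoiceBpeTokenizer()         # loads DEFAULT_VOCAB_FILE
ids = tokenizer.encode("Hello world!")  # cleaned, space-marked, BPE-encoded ids
text = tokenizer.decode(ids)            # back to cleaned text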
--------------------------------------------------------------------------------
/TTS/tts/layers/tortoise/utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | from urllib import request
3 |
4 | from tqdm import tqdm
5 |
6 | DEFAULT_MODELS_DIR = os.path.join(os.path.expanduser("~"), ".cache", "tortoise", "models")
7 | MODELS_DIR = os.environ.get("TORTOISE_MODELS_DIR", DEFAULT_MODELS_DIR)
8 | MODELS_DIR = "/data/speech_synth/models/"
9 | MODELS = {
10 | "autoregressive.pth": "https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/autoregressive.pth",
11 | "classifier.pth": "https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/classifier.pth",
12 | "clvp2.pth": "https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/clvp2.pth",
13 | "diffusion_decoder.pth": "https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/diffusion_decoder.pth",
14 | "vocoder.pth": "https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/vocoder.pth",
15 | "rlg_auto.pth": "https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/rlg_auto.pth",
16 | "rlg_diffuser.pth": "https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/rlg_diffuser.pth",
17 | }
18 |
19 |
20 | def download_models(specific_models=None):
21 | """
22 | Call to download all the models that Tortoise uses.
23 | """
24 | os.makedirs(MODELS_DIR, exist_ok=True)
25 | for model_name, url in MODELS.items():
26 | if specific_models is not None and model_name not in specific_models:
27 | continue
28 | model_path = os.path.join(MODELS_DIR, model_name)
29 | if os.path.exists(model_path):
30 | continue
31 | print(f"Downloading {model_name} from {url}...")
32 | with tqdm(unit="B", unit_scale=True, unit_divisor=1024, miniters=1) as t:
33 | request.urlretrieve(url, model_path, lambda nb, bs, fs, t=t: t.update(nb * bs - t.n))
34 | print("Done.")
35 |
36 |
37 | def get_model_path(model_name, models_dir=MODELS_DIR):
38 | """
39 | Get path to given model, download it if it doesn't exist.
40 | """
41 | if model_name not in MODELS:
42 | raise ValueError(f"Model {model_name} not found in available models.")
43 | model_path = os.path.join(models_dir, model_name)
44 | if not os.path.exists(model_path) and models_dir == MODELS_DIR:
45 | download_models([model_name])
46 | return model_path
47 |
--------------------------------------------------------------------------------
/TTS/tts/layers/vits/discriminator.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 | from torch.nn.modules.conv import Conv1d
4 |
5 | from TTS.vocoder.models.hifigan_discriminator import DiscriminatorP, MultiPeriodDiscriminator
6 |
7 |
8 | class DiscriminatorS(torch.nn.Module):
9 | """HiFiGAN Scale Discriminator. Channel sizes are different from the original HiFiGAN.
10 |
11 | Args:
12 |         use_spectral_norm (bool): if `True` switch to spectral norm instead of weight norm.
13 | """
14 |
15 | def __init__(self, use_spectral_norm=False):
16 | super().__init__()
17 | norm_f = nn.utils.spectral_norm if use_spectral_norm else nn.utils.parametrizations.weight_norm
18 | self.convs = nn.ModuleList(
19 | [
20 | norm_f(Conv1d(1, 16, 15, 1, padding=7)),
21 | norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
22 | norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
23 | norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
24 | norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
25 | norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
26 | ]
27 | )
28 | self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
29 |
30 | def forward(self, x):
31 | """
32 | Args:
33 | x (Tensor): input waveform.
34 |
35 | Returns:
36 | Tensor: discriminator scores.
37 |             List[Tensor]: list of features from the convolutional layers.
38 | """
39 | feat = []
40 | for l in self.convs:
41 | x = l(x)
42 | x = torch.nn.functional.leaky_relu(x, 0.1)
43 | feat.append(x)
44 | x = self.conv_post(x)
45 | feat.append(x)
46 | x = torch.flatten(x, 1, -1)
47 | return x, feat
48 |
49 |
50 | class VitsDiscriminator(nn.Module):
51 |     """VITS discriminator wrapping one Scale Discriminator and a stack of Period Discriminators.
52 |
53 | ::
54 | waveform -> ScaleDiscriminator() -> scores_sd, feats_sd --> append() -> scores, feats
55 | |--> MultiPeriodDiscriminator() -> scores_mpd, feats_mpd ^
56 |
57 | Args:
58 |         use_spectral_norm (bool): if `True` switch to spectral norm instead of weight norm.
59 | """
60 |
61 | def __init__(self, periods=(2, 3, 5, 7, 11), use_spectral_norm=False):
62 | super().__init__()
63 | self.nets = nn.ModuleList()
64 | self.nets.append(DiscriminatorS(use_spectral_norm=use_spectral_norm))
65 | self.nets.extend([DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods])
66 |
67 | def forward(self, x, x_hat=None):
68 | """
69 | Args:
70 | x (Tensor): ground truth waveform.
71 | x_hat (Tensor): predicted waveform.
72 |
73 | Returns:
74 | List[Tensor]: discriminator scores.
75 |             List[List[Tensor]]: list of lists of features from each layer of each discriminator.
76 | """
77 | x_scores = []
78 | x_hat_scores = [] if x_hat is not None else None
79 | x_feats = []
80 | x_hat_feats = [] if x_hat is not None else None
81 | for net in self.nets:
82 | x_score, x_feat = net(x)
83 | x_scores.append(x_score)
84 | x_feats.append(x_feat)
85 | if x_hat is not None:
86 | x_hat_score, x_hat_feat = net(x_hat)
87 | x_hat_scores.append(x_hat_score)
88 | x_hat_feats.append(x_hat_feat)
89 | return x_scores, x_feats, x_hat_scores, x_hat_feats
90 |
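A minimal sketch with a hypothetical waveform length; one score/feature entry is returned per sub-discriminator (one scale plus five periods):

import torch
from TTS.tts.layers.vits.discriminator import VitsDiscriminator

disc = VitsDiscriminator()
y = torch.randn(2, 1, 8192)      # ground-truth waveform [B, 1, T]
y_hat = torch.randn(2, 1, 8192)  # generated waveform [B, 1, T]
scores, feats, scores_hat, feats_hat = disc(y, y_hat)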
--------------------------------------------------------------------------------
/TTS/tts/models/__init__.py:
--------------------------------------------------------------------------------
1 | from typing import Dict, List, Union
2 |
3 | from TTS.utils.generic_utils import find_module
4 |
5 |
6 | def setup_model(config: "Coqpit", samples: Union[List[List], List[Dict]] = None) -> "BaseTTS":
7 | print(" > Using model: {}".format(config.model))
8 | # fetch the right model implementation.
9 | if "base_model" in config and config["base_model"] is not None:
10 | MyModel = find_module("TTS.tts.models", config.base_model.lower())
11 | else:
12 | MyModel = find_module("TTS.tts.models", config.model.lower())
13 | model = MyModel.init_from_config(config=config, samples=samples)
14 | return model
15 |
--------------------------------------------------------------------------------
/TTS/tts/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/douhaohaode/xtts_v2/782de23d4ee58d25d1b1208c27336d6f665b6872/TTS/tts/utils/__init__.py
--------------------------------------------------------------------------------
/TTS/tts/utils/data.py:
--------------------------------------------------------------------------------
1 | import bisect
2 |
3 | import numpy as np
4 | import torch
5 |
6 |
7 | def _pad_data(x, length):
8 | _pad = 0
9 | assert x.ndim == 1
10 | return np.pad(x, (0, length - x.shape[0]), mode="constant", constant_values=_pad)
11 |
12 |
13 | def prepare_data(inputs):
14 | max_len = max((len(x) for x in inputs))
15 | return np.stack([_pad_data(x, max_len) for x in inputs])
16 |
17 |
18 | def _pad_tensor(x, length):
19 | _pad = 0.0
20 | assert x.ndim == 2
21 | x = np.pad(x, [[0, 0], [0, length - x.shape[1]]], mode="constant", constant_values=_pad)
22 | return x
23 |
24 |
25 | def prepare_tensor(inputs, out_steps):
26 | max_len = max((x.shape[1] for x in inputs))
27 | remainder = max_len % out_steps
28 | pad_len = max_len + (out_steps - remainder) if remainder > 0 else max_len
29 | return np.stack([_pad_tensor(x, pad_len) for x in inputs])
30 |
31 |
32 | def _pad_stop_target(x: np.ndarray, length: int, pad_val=1) -> np.ndarray:
33 | """Pad stop target array.
34 |
35 | Args:
36 | x (np.ndarray): Stop target array.
37 | length (int): Length after padding.
38 | pad_val (int, optional): Padding value. Defaults to 1.
39 |
40 | Returns:
41 | np.ndarray: Padded stop target array.
42 | """
43 | assert x.ndim == 1
44 | return np.pad(x, (0, length - x.shape[0]), mode="constant", constant_values=pad_val)
45 |
46 |
47 | def prepare_stop_target(inputs, out_steps):
48 | """Pad row vectors with 1."""
49 | max_len = max((x.shape[0] for x in inputs))
50 | remainder = max_len % out_steps
51 | pad_len = max_len + (out_steps - remainder) if remainder > 0 else max_len
52 | return np.stack([_pad_stop_target(x, pad_len) for x in inputs])
53 |
54 |
55 | def pad_per_step(inputs, pad_len):
56 | return np.pad(inputs, [[0, 0], [0, 0], [0, pad_len]], mode="constant", constant_values=0.0)
57 |
58 |
59 | def get_length_balancer_weights(items: list, num_buckets=10):
60 | # get all durations
61 | audio_lengths = np.array([item["audio_length"] for item in items])
62 |     # create the $num_buckets bucket classes based on the dataset max and min length
63 | max_length = int(max(audio_lengths))
64 | min_length = int(min(audio_lengths))
65 | step = int((max_length - min_length) / num_buckets) + 1
66 | buckets_classes = [i + step for i in range(min_length, (max_length - step) + num_buckets + 1, step)]
67 | # add each sample in their respective length bucket
68 | buckets_names = np.array(
69 | [buckets_classes[bisect.bisect_left(buckets_classes, item["audio_length"])] for item in items]
70 | )
71 | # count and compute the weights_bucket for each sample
72 | unique_buckets_names = np.unique(buckets_names).tolist()
73 | bucket_ids = [unique_buckets_names.index(l) for l in buckets_names]
74 | bucket_count = np.array([len(np.where(buckets_names == l)[0]) for l in unique_buckets_names])
75 | weight_bucket = 1.0 / bucket_count
76 | dataset_samples_weight = np.array([weight_bucket[l] for l in bucket_ids])
77 | # normalize
78 | dataset_samples_weight = dataset_samples_weight / np.linalg.norm(dataset_samples_weight)
79 | return torch.from_numpy(dataset_samples_weight).float()
80 |
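Small illustrative calls (made-up shapes) for the padding helpers above:

import numpy as np
from TTS.tts.utils.data import prepare_data, prepare_tensor, prepare_stop_target

texts = [np.array([1, 2, 3]), np.array([4, 5])]
padded_texts = prepare_data(texts)                      # [2, 3], zero-padded

specs = [np.random.rand(80, 11), np.random.rand(80, 7)]
padded_specs = prepare_tensor(specs, out_steps=4)       # [2, 80, 12], padded to a multiple of 4

stops = [np.zeros(11), np.zeros(7)]
padded_stops = prepare_stop_target(stops, out_steps=4)  # [2, 12], padded with 1s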
--------------------------------------------------------------------------------
/TTS/tts/utils/fairseq.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 |
4 | def rehash_fairseq_vits_checkpoint(checkpoint_file):
5 | chk = torch.load(checkpoint_file, map_location=torch.device("cpu"))["model"]
6 | new_chk = {}
7 | for k, v in chk.items():
8 | if "enc_p." in k:
9 | new_chk[k.replace("enc_p.", "text_encoder.")] = v
10 | elif "dec." in k:
11 | new_chk[k.replace("dec.", "waveform_decoder.")] = v
12 | elif "enc_q." in k:
13 | new_chk[k.replace("enc_q.", "posterior_encoder.")] = v
14 | elif "flow.flows.2." in k:
15 | new_chk[k.replace("flow.flows.2.", "flow.flows.1.")] = v
16 | elif "flow.flows.4." in k:
17 | new_chk[k.replace("flow.flows.4.", "flow.flows.2.")] = v
18 | elif "flow.flows.6." in k:
19 | new_chk[k.replace("flow.flows.6.", "flow.flows.3.")] = v
20 | elif "dp.flows.0.m" in k:
21 | new_chk[k.replace("dp.flows.0.m", "duration_predictor.flows.0.translation")] = v
22 | elif "dp.flows.0.logs" in k:
23 | new_chk[k.replace("dp.flows.0.logs", "duration_predictor.flows.0.log_scale")] = v
24 | elif "dp.flows.1" in k:
25 | new_chk[k.replace("dp.flows.1", "duration_predictor.flows.1")] = v
26 | elif "dp.flows.3" in k:
27 | new_chk[k.replace("dp.flows.3", "duration_predictor.flows.2")] = v
28 | elif "dp.flows.5" in k:
29 | new_chk[k.replace("dp.flows.5", "duration_predictor.flows.3")] = v
30 | elif "dp.flows.7" in k:
31 | new_chk[k.replace("dp.flows.7", "duration_predictor.flows.4")] = v
32 | elif "dp.post_flows.0.m" in k:
33 | new_chk[k.replace("dp.post_flows.0.m", "duration_predictor.post_flows.0.translation")] = v
34 | elif "dp.post_flows.0.logs" in k:
35 | new_chk[k.replace("dp.post_flows.0.logs", "duration_predictor.post_flows.0.log_scale")] = v
36 | elif "dp.post_flows.1" in k:
37 | new_chk[k.replace("dp.post_flows.1", "duration_predictor.post_flows.1")] = v
38 | elif "dp.post_flows.3" in k:
39 | new_chk[k.replace("dp.post_flows.3", "duration_predictor.post_flows.2")] = v
40 | elif "dp.post_flows.5" in k:
41 | new_chk[k.replace("dp.post_flows.5", "duration_predictor.post_flows.3")] = v
42 | elif "dp.post_flows.7" in k:
43 | new_chk[k.replace("dp.post_flows.7", "duration_predictor.post_flows.4")] = v
44 | elif "dp." in k:
45 | new_chk[k.replace("dp.", "duration_predictor.")] = v
46 | else:
47 | new_chk[k] = v
48 | return new_chk
49 |
--------------------------------------------------------------------------------
/TTS/tts/utils/measures.py:
--------------------------------------------------------------------------------
1 | def alignment_diagonal_score(alignments, binary=False):
2 | """
3 |     Compute how diagonal alignment predictions are. It is useful
4 |     for measuring the alignment consistency of a model.
5 | Args:
6 | alignments (torch.Tensor): batch of alignments.
7 | binary (bool): if True, ignore scores and consider attention
8 | as a binary mask.
9 | Shape:
10 | - alignments : :math:`[B, T_de, T_en]`
11 | """
12 | maxs = alignments.max(dim=1)[0]
13 | if binary:
14 | maxs[maxs > 0] = 1
15 | return maxs.mean(dim=1).mean(dim=0).item()
16 |
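A tiny sanity-check sketch: a perfectly diagonal (identity) alignment scores 1.0.

import torch
from TTS.tts.utils.measures import alignment_diagonal_score

alignments = torch.eye(5).unsqueeze(0)        # [B, T_de, T_en]
score = alignment_diagonal_score(alignments)  # 1.0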
--------------------------------------------------------------------------------
/TTS/tts/utils/monotonic_align/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/douhaohaode/xtts_v2/782de23d4ee58d25d1b1208c27336d6f665b6872/TTS/tts/utils/monotonic_align/__init__.py
--------------------------------------------------------------------------------
/TTS/tts/utils/monotonic_align/core.pyx:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | cimport cython
4 | cimport numpy as np
5 |
6 | from cython.parallel import prange
7 |
8 |
9 | @cython.boundscheck(False)
10 | @cython.wraparound(False)
11 | cdef void maximum_path_each(int[:,::1] path, float[:,::1] value, int t_x, int t_y, float max_neg_val) nogil:
12 | cdef int x
13 | cdef int y
14 | cdef float v_prev
15 | cdef float v_cur
16 | cdef float tmp
17 | cdef int index = t_x - 1
18 |
19 | for y in range(t_y):
20 | for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)):
21 | if x == y:
22 | v_cur = max_neg_val
23 | else:
24 | v_cur = value[x, y-1]
25 | if x == 0:
26 | if y == 0:
27 | v_prev = 0.
28 | else:
29 | v_prev = max_neg_val
30 | else:
31 | v_prev = value[x-1, y-1]
32 | value[x, y] = max(v_cur, v_prev) + value[x, y]
33 |
34 | for y in range(t_y - 1, -1, -1):
35 | path[index, y] = 1
36 | if index != 0 and (index == y or value[index, y-1] < value[index-1, y-1]):
37 | index = index - 1
38 |
39 |
40 | @cython.boundscheck(False)
41 | @cython.wraparound(False)
42 | cpdef void maximum_path_c(int[:,:,::1] paths, float[:,:,::1] values, int[::1] t_xs, int[::1] t_ys, float max_neg_val=-1e9) nogil:
43 | cdef int b = values.shape[0]
44 |
45 | cdef int i
46 | for i in prange(b, nogil=True):
47 | maximum_path_each(paths[i], values[i], t_xs[i], t_ys[i], max_neg_val)
48 |
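A pure-NumPy transcription of the per-item dynamic program above, for illustration only; the package itself uses the compiled Cython extension, and the toy score matrix below is made up.

import numpy as np

def maximum_path_numpy(value, t_x, t_y, max_neg_val=-1e9):
    """Pure-Python equivalent of `maximum_path_each` for a single [t_x, t_y] score matrix."""
    value = value.copy()
    path = np.zeros((t_x, t_y), dtype=np.int32)

    # Forward pass: accumulate the best monotonic score ending at (x, y).
    for y in range(t_y):
        for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)):
            v_cur = max_neg_val if x == y else value[x, y - 1]
            if x == 0:
                v_prev = 0.0 if y == 0 else max_neg_val
            else:
                v_prev = value[x - 1, y - 1]
            value[x, y] = max(v_cur, v_prev) + value[x, y]

    # Backtracking: walk back from the last text position, marking the chosen path.
    index = t_x - 1
    for y in range(t_y - 1, -1, -1):
        path[index, y] = 1
        if index != 0 and (index == y or value[index, y - 1] < value[index - 1, y - 1]):
            index -= 1
    return path

# Toy [t_x=4, t_y=7] log-likelihood matrix.
scores = np.log(np.random.uniform(0.1, 1.0, size=(4, 7))).astype(np.float32)
print(maximum_path_numpy(scores, 4, 7))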
--------------------------------------------------------------------------------
/TTS/tts/utils/monotonic_align/setup.py:
--------------------------------------------------------------------------------
1 | # from distutils.core import setup
2 | # from Cython.Build import cythonize
3 | # import numpy
4 |
5 | # setup(name='monotonic_align',
6 | # ext_modules=cythonize("core.pyx"),
7 | # include_dirs=[numpy.get_include()])
8 |
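The build script above is shipped fully commented out. If the extension ever needs to be rebuilt locally, an uncommented equivalent would look like the sketch below, invoked with `python setup.py build_ext --inplace`; this only mirrors the disabled script and is not the project's current build path.

from distutils.core import setup

import numpy
from Cython.Build import cythonize

setup(
    name="monotonic_align",
    ext_modules=cythonize("core.pyx"),
    include_dirs=[numpy.get_include()],
)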
--------------------------------------------------------------------------------
/TTS/tts/utils/text/__init__.py:
--------------------------------------------------------------------------------
1 | from TTS.tts.utils.text.tokenizer import TTSTokenizer
2 |
--------------------------------------------------------------------------------
/TTS/tts/utils/text/bangla/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/douhaohaode/xtts_v2/782de23d4ee58d25d1b1208c27336d6f665b6872/TTS/tts/utils/text/bangla/__init__.py
--------------------------------------------------------------------------------
/TTS/tts/utils/text/bangla/phonemizer.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | import bangla
4 | from bnnumerizer import numerize
5 | from bnunicodenormalizer import Normalizer
6 |
7 | # initialize
8 | bnorm = Normalizer()
9 |
10 |
11 | attribution_dict = {
12 | "সাঃ": "সাল্লাল্লাহু আলাইহি ওয়া সাল্লাম",
13 | "আঃ": "আলাইহিস সালাম",
14 | "রাঃ": "রাদিআল্লাহু আনহু",
15 | "রহঃ": "রহমাতুল্লাহি আলাইহি",
16 | "রহিঃ": "রহিমাহুল্লাহ",
17 | "হাফিঃ": "হাফিযাহুল্লাহ",
18 | "বায়ান": "বাইআন",
19 | "দাঃবাঃ": "দামাত বারাকাতুহুম,দামাত বারাকাতুল্লাহ",
20 | # "আয়াত" : "আইআত",#আইআত
21 | # "ওয়া" : "ওআ",
22 | # "ওয়াসাল্লাম" : "ওআসাল্লাম",
23 | # "কেন" : "কেনো",
24 | # "কোন" : "কোনো",
25 | # "বল" : "বলো",
26 | # "চল" : "চলো",
27 | # "কর" : "করো",
28 | # "রাখ" : "রাখো",
29 | "’": "",
30 | "‘": "",
31 | # "য়" : "অ",
32 | # "সম্প্রদায়" : "সম্প্রদাই",
33 | # "রয়েছে" : "রইছে",
34 | # "রয়েছ" : "রইছ",
35 | "/": " বাই ",
36 | }
37 |
38 |
39 | def tag_text(text: str):
40 | # remove multiple spaces
41 | text = re.sub(" +", " ", text)
42 | # create start and end
43 | text = "start" + text + "end"
44 | # tag text
45 | parts = re.split("[\u0600-\u06FF]+", text)
46 | # remove non chars
47 | parts = [p for p in parts if p.strip()]
48 | # unique parts
49 | parts = set(parts)
50 | # tag the text
51 | for m in parts:
52 | if len(m.strip()) > 1:
53 | text = text.replace(m, f"{m}")
54 | # clean-tags
55 | text = text.replace("start", "")
56 | text = text.replace("end", "")
57 | return text
58 |
59 |
60 | def normalize(sen):
61 | global bnorm # pylint: disable=global-statement
62 | _words = [bnorm(word)["normalized"] for word in sen.split()]
63 | return " ".join([word for word in _words if word is not None])
64 |
65 |
66 | def expand_full_attribution(text):
67 | for word, attr in attribution_dict.items():
68 | if word in text:
69 | text = text.replace(word, normalize(attr))
70 | return text
71 |
72 |
73 | def collapse_whitespace(text):
74 | # Regular expression matching whitespace:
75 | _whitespace_re = re.compile(r"\s+")
76 | return re.sub(_whitespace_re, " ", text)
77 |
78 |
79 | def bangla_text_to_phonemes(text: str) -> str:
80 | # english numbers to bangla conversion
81 | res = re.search("[0-9]", text)
82 | if res is not None:
83 | text = bangla.convert_english_digit_to_bangla_digit(text)
84 |
85 | # replace ':' in between two bangla numbers with ' এর '
86 | pattern = r"[০, ১, ২, ৩, ৪, ৫, ৬, ৭, ৮, ৯]:[০, ১, ২, ৩, ৪, ৫, ৬, ৭, ৮, ৯]"
87 | matches = re.findall(pattern, text)
88 | for m in matches:
89 | r = m.replace(":", " এর ")
90 | text = text.replace(m, r)
91 |
92 | # numerize text
93 | text = numerize(text)
94 |
95 | # tag sections
96 | text = tag_text(text)
97 |
98 | # text blocks
99 | # blocks = text.split("")
100 | # blocks = [b for b in blocks if b.strip()]
101 |
102 | # create tuple of (lang,text)
103 | if "" in text:
104 | text = text.replace("", "").replace("", "")
105 | # Split based on sentence ending Characters
106 | bn_text = text.strip()
107 |
108 | sentenceEnders = re.compile("[।!?]")
109 | sentences = sentenceEnders.split(str(bn_text))
110 |
111 | data = ""
112 | for sent in sentences:
113 | res = re.sub("\n", "", sent)
114 | res = normalize(res)
115 | # expand attributes
116 | res = expand_full_attribution(res)
117 |
118 | res = collapse_whitespace(res)
119 | res += "।"
120 | data += res
121 | return data
122 |
--------------------------------------------------------------------------------
/TTS/tts/utils/text/belarusian/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/douhaohaode/xtts_v2/782de23d4ee58d25d1b1208c27336d6f665b6872/TTS/tts/utils/text/belarusian/__init__.py
--------------------------------------------------------------------------------
/TTS/tts/utils/text/belarusian/phonemizer.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | finder = None
4 |
5 |
6 | def init():
7 | try:
8 | import jpype
9 | import jpype.imports
10 | except ModuleNotFoundError:
11 | raise ModuleNotFoundError(
12 | "Belarusian phonemizer requires to install module 'jpype1' manually. Try `pip install jpype1`."
13 | )
14 |
15 | try:
16 | jar_path = os.environ["BEL_FANETYKA_JAR"]
17 | except KeyError:
18 | raise KeyError("You need to define the 'BEL_FANETYKA_JAR' environment variable as the path to the fanetyka.jar file")
19 |
20 | jpype.startJVM(classpath=[jar_path])
21 |
22 | # import the Java modules
23 | from org.alex73.korpus.base import GrammarDB2, GrammarFinder
24 |
25 | grammar_db = GrammarDB2.initializeFromJar()
26 | global finder
27 | finder = GrammarFinder(grammar_db)
28 |
29 |
30 | def belarusian_text_to_phonemes(text: str) -> str:
31 | # Initialize only on first run
32 | if finder is None:
33 | init()
34 |
35 | from org.alex73.fanetyka.impl import FanetykaText
36 |
37 | return str(FanetykaText(finder, text).ipa)
38 |
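A usage sketch for the JVM-backed phonemizer above. It assumes `jpype1` is installed and a local copy of fanetyka.jar exists; the jar path below is hypothetical.

import os

from TTS.tts.utils.text.belarusian.phonemizer import belarusian_text_to_phonemes

# Hypothetical path to the fanetyka.jar grammar database; adjust to your installation.
os.environ["BEL_FANETYKA_JAR"] = "/opt/fanetyka/fanetyka.jar"

# The first call starts the JVM and loads the grammar DB; later calls reuse it.
print(belarusian_text_to_phonemes("тэст"))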
--------------------------------------------------------------------------------
/TTS/tts/utils/text/chinese_mandarin/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/douhaohaode/xtts_v2/782de23d4ee58d25d1b1208c27336d6f665b6872/TTS/tts/utils/text/chinese_mandarin/__init__.py
--------------------------------------------------------------------------------
/TTS/tts/utils/text/chinese_mandarin/phonemizer.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 |
3 | import jieba
4 | import pypinyin
5 |
6 | from .pinyinToPhonemes import PINYIN_DICT
7 |
8 |
9 | def _chinese_character_to_pinyin(text: str) -> List[str]:
10 | pinyins = pypinyin.pinyin(text, style=pypinyin.Style.TONE3, heteronym=False, neutral_tone_with_five=True)
11 | pinyins_flat_list = [item for sublist in pinyins for item in sublist]
12 | return pinyins_flat_list
13 |
14 |
15 | def _chinese_pinyin_to_phoneme(pinyin: str) -> str:
16 | segment = pinyin[:-1]
17 | tone = pinyin[-1]
18 | phoneme = PINYIN_DICT.get(segment, [""])[0]
19 | return phoneme + tone
20 |
21 |
22 | def chinese_text_to_phonemes(text: str, separator: str = "|") -> str:
23 | tokenized_text = jieba.cut(text, HMM=False)
24 | tokenized_text = " ".join(tokenized_text)
25 | pinyined_text: List[str] = _chinese_character_to_pinyin(tokenized_text)
26 |
27 | results: List[str] = []
28 |
29 | for token in pinyined_text:
30 | if token[-1] in "12345": # TODO transform to is_pinyin()
31 | pinyin_phonemes = _chinese_pinyin_to_phoneme(token)
32 |
33 | results += list(pinyin_phonemes)
34 | else: # is punctuation or other
35 | results += list(token)
36 |
37 | return separator.join(results)
38 |
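A small usage sketch; it needs the `jieba` and `pypinyin` packages, and the exact phoneme string depends on `PINYIN_DICT`, so the output noted in the comment is only indicative.

from TTS.tts.utils.text.chinese_mandarin.phonemizer import chinese_text_to_phonemes

# Tokenize with jieba, convert to numbered pinyin, then map each pinyin to phonemes.
phonemes = chinese_text_to_phonemes("这是中文", "|")
print(phonemes)  # pipe-separated phoneme/tone symbols, roughly "d|ʒ|ø|4|..."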
--------------------------------------------------------------------------------
/TTS/tts/utils/text/cmudict.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import re
4 |
5 | VALID_SYMBOLS = [
6 | "AA",
7 | "AA0",
8 | "AA1",
9 | "AA2",
10 | "AE",
11 | "AE0",
12 | "AE1",
13 | "AE2",
14 | "AH",
15 | "AH0",
16 | "AH1",
17 | "AH2",
18 | "AO",
19 | "AO0",
20 | "AO1",
21 | "AO2",
22 | "AW",
23 | "AW0",
24 | "AW1",
25 | "AW2",
26 | "AY",
27 | "AY0",
28 | "AY1",
29 | "AY2",
30 | "B",
31 | "CH",
32 | "D",
33 | "DH",
34 | "EH",
35 | "EH0",
36 | "EH1",
37 | "EH2",
38 | "ER",
39 | "ER0",
40 | "ER1",
41 | "ER2",
42 | "EY",
43 | "EY0",
44 | "EY1",
45 | "EY2",
46 | "F",
47 | "G",
48 | "HH",
49 | "IH",
50 | "IH0",
51 | "IH1",
52 | "IH2",
53 | "IY",
54 | "IY0",
55 | "IY1",
56 | "IY2",
57 | "JH",
58 | "K",
59 | "L",
60 | "M",
61 | "N",
62 | "NG",
63 | "OW",
64 | "OW0",
65 | "OW1",
66 | "OW2",
67 | "OY",
68 | "OY0",
69 | "OY1",
70 | "OY2",
71 | "P",
72 | "R",
73 | "S",
74 | "SH",
75 | "T",
76 | "TH",
77 | "UH",
78 | "UH0",
79 | "UH1",
80 | "UH2",
81 | "UW",
82 | "UW0",
83 | "UW1",
84 | "UW2",
85 | "V",
86 | "W",
87 | "Y",
88 | "Z",
89 | "ZH",
90 | ]
91 |
92 |
93 | class CMUDict:
94 | """Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict"""
95 |
96 | def __init__(self, file_or_path, keep_ambiguous=True):
97 | if isinstance(file_or_path, str):
98 | with open(file_or_path, encoding="latin-1") as f:
99 | entries = _parse_cmudict(f)
100 | else:
101 | entries = _parse_cmudict(file_or_path)
102 | if not keep_ambiguous:
103 | entries = {word: pron for word, pron in entries.items() if len(pron) == 1}
104 | self._entries = entries
105 |
106 | def __len__(self):
107 | return len(self._entries)
108 |
109 | def lookup(self, word):
110 | """Returns list of ARPAbet pronunciations of the given word."""
111 | return self._entries.get(word.upper())
112 |
113 | @staticmethod
114 | def get_arpabet(word, cmudict, punctuation_symbols):
115 | first_symbol, last_symbol = "", ""
116 | if word and word[0] in punctuation_symbols:
117 | first_symbol = word[0]
118 | word = word[1:]
119 | if word and word[-1] in punctuation_symbols:
120 | last_symbol = word[-1]
121 | word = word[:-1]
122 | arpabet = cmudict.lookup(word)
123 | if arpabet is not None:
124 | return first_symbol + "{%s}" % arpabet[0] + last_symbol
125 | return first_symbol + word + last_symbol
126 |
127 |
128 | _alt_re = re.compile(r"\([0-9]+\)")
129 |
130 |
131 | def _parse_cmudict(file):
132 | cmudict = {}
133 | for line in file:
134 | if line and (line[0] >= "A" and line[0] <= "Z" or line[0] == "'"):
135 | parts = line.split(" ")
136 | word = re.sub(_alt_re, "", parts[0])
137 | pronunciation = _get_pronunciation(parts[1])
138 | if pronunciation:
139 | if word in cmudict:
140 | cmudict[word].append(pronunciation)
141 | else:
142 | cmudict[word] = [pronunciation]
143 | return cmudict
144 |
145 |
146 | def _get_pronunciation(s):
147 | parts = s.strip().split(" ")
148 | for part in parts:
149 | if part not in VALID_SYMBOLS:
150 | return None
151 | return " ".join(parts)
152 |
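A usage sketch for the wrapper above. The dictionary path is hypothetical; a copy of the CMU pronouncing dictionary has to be downloaded separately, and the lookup results in the comments are only indicative.

from TTS.tts.utils.text.cmudict import CMUDict

# Hypothetical path to a local copy of the CMU pronouncing dictionary.
cmu = CMUDict("/data/cmudict-0.7b", keep_ambiguous=True)

print(len(cmu))             # number of parsed entries
print(cmu.lookup("hello"))  # e.g. ['HH AH0 L OW1', 'HH EH0 L OW1']

# Punctuation attached to the word is preserved around the ARPAbet block.
print(CMUDict.get_arpabet("hello,", cmu, punctuation_symbols=",.!?"))  # e.g. '{HH AH0 L OW1},'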
--------------------------------------------------------------------------------
/TTS/tts/utils/text/english/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/douhaohaode/xtts_v2/782de23d4ee58d25d1b1208c27336d6f665b6872/TTS/tts/utils/text/english/__init__.py
--------------------------------------------------------------------------------
/TTS/tts/utils/text/english/abbreviations.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | # List of (regular expression, replacement) pairs for abbreviations in english:
4 | abbreviations_en = [
5 | (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
6 | for x in [
7 | ("mrs", "misess"),
8 | ("mr", "mister"),
9 | ("dr", "doctor"),
10 | ("st", "saint"),
11 | ("co", "company"),
12 | ("jr", "junior"),
13 | ("maj", "major"),
14 | ("gen", "general"),
15 | ("drs", "doctors"),
16 | ("rev", "reverend"),
17 | ("lt", "lieutenant"),
18 | ("hon", "honorable"),
19 | ("sgt", "sergeant"),
20 | ("capt", "captain"),
21 | ("esq", "esquire"),
22 | ("ltd", "limited"),
23 | ("col", "colonel"),
24 | ("ft", "fort"),
25 | ]
26 | ]
27 |
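A tiny sketch showing how these (pattern, replacement) pairs are meant to be applied; the `expand_abbreviations` helper below is local to this example (the project's text cleaners apply the pairs in a similar loop).

import re

from TTS.tts.utils.text.english.abbreviations import abbreviations_en

def expand_abbreviations(text: str) -> str:
    # Apply each (compiled pattern, replacement) pair in turn.
    for regex, replacement in abbreviations_en:
        text = re.sub(regex, replacement, text)
    return text

print(expand_abbreviations("Dr. Smith met Mr. Jones at the St. Mary ward."))
# -> "doctor Smith met mister Jones at the saint Mary ward."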
--------------------------------------------------------------------------------
/TTS/tts/utils/text/english/number_norm.py:
--------------------------------------------------------------------------------
1 | """ from https://github.com/keithito/tacotron """
2 |
3 | import re
4 | from typing import Dict
5 |
6 | import inflect
7 |
8 | _inflect = inflect.engine()
9 | _comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")
10 | _decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)")
11 | _currency_re = re.compile(r"(£|\$|¥)([0-9\,\.]*[0-9]+)")
12 | _ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")
13 | _number_re = re.compile(r"-?[0-9]+")
14 |
15 |
16 | def _remove_commas(m):
17 | return m.group(1).replace(",", "")
18 |
19 |
20 | def _expand_decimal_point(m):
21 | return m.group(1).replace(".", " point ")
22 |
23 |
24 | def __expand_currency(value: str, inflection: Dict[float, str]) -> str:
25 | parts = value.replace(",", "").split(".")
26 | if len(parts) > 2:
27 | return f"{value} {inflection[2]}" # Unexpected format
28 | text = []
29 | integer = int(parts[0]) if parts[0] else 0
30 | if integer > 0:
31 | integer_unit = inflection.get(integer, inflection[2])
32 | text.append(f"{integer} {integer_unit}")
33 | fraction = int(parts[1]) if len(parts) > 1 and parts[1] else 0
34 | if fraction > 0:
35 | fraction_unit = inflection.get(fraction / 100, inflection[0.02])
36 | text.append(f"{fraction} {fraction_unit}")
37 | if len(text) == 0:
38 | return f"zero {inflection[2]}"
39 | return " ".join(text)
40 |
41 |
42 | def _expand_currency(m: "re.Match") -> str:
43 | currencies = {
44 | "$": {
45 | 0.01: "cent",
46 | 0.02: "cents",
47 | 1: "dollar",
48 | 2: "dollars",
49 | },
50 | "€": {
51 | 0.01: "cent",
52 | 0.02: "cents",
53 | 1: "euro",
54 | 2: "euros",
55 | },
56 | "£": {
57 | 0.01: "penny",
58 | 0.02: "pence",
59 | 1: "pound sterling",
60 | 2: "pounds sterling",
61 | },
62 | "¥": {
63 | # TODO rin
64 | 0.02: "sen",
65 | 2: "yen",
66 | },
67 | }
68 | unit = m.group(1)
69 | currency = currencies[unit]
70 | value = m.group(2)
71 | return __expand_currency(value, currency)
72 |
73 |
74 | def _expand_ordinal(m):
75 | return _inflect.number_to_words(m.group(0))
76 |
77 |
78 | def _expand_number(m):
79 | num = int(m.group(0))
80 | if 1000 < num < 3000:
81 | if num == 2000:
82 | return "two thousand"
83 | if 2000 < num < 2010:
84 | return "two thousand " + _inflect.number_to_words(num % 100)
85 | if num % 100 == 0:
86 | return _inflect.number_to_words(num // 100) + " hundred"
87 | return _inflect.number_to_words(num, andword="", zero="oh", group=2).replace(", ", " ")
88 | return _inflect.number_to_words(num, andword="")
89 |
90 |
91 | def normalize_numbers(text):
92 | text = re.sub(_comma_number_re, _remove_commas, text)
93 | text = re.sub(_currency_re, _expand_currency, text)
94 | text = re.sub(_decimal_number_re, _expand_decimal_point, text)
95 | text = re.sub(_ordinal_re, _expand_ordinal, text)
96 | text = re.sub(_number_re, _expand_number, text)
97 | return text
98 |
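A short sketch of the number normalizer on mixed numeric text; the expected strings in the comments are indicative and follow how `inflect` spells the numbers.

from TTS.tts.utils.text.english.number_norm import normalize_numbers

print(normalize_numbers("I paid $3.50 on the 3rd of May."))
# -> "I paid three dollars fifty cents on the third of May."

print(normalize_numbers("The year 1984 had 1,000,000 problems."))
# -> "The year nineteen eighty-four had one million problems."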
--------------------------------------------------------------------------------
/TTS/tts/utils/text/english/time_norm.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | import inflect
4 |
5 | _inflect = inflect.engine()
6 |
7 | _time_re = re.compile(
8 | r"""\b
9 | ((0?[0-9])|(1[0-1])|(1[2-9])|(2[0-3])) # hours
10 | :
11 | ([0-5][0-9]) # minutes
12 | \s*(a\.m\.|am|pm|p\.m\.|a\.m|p\.m)? # am/pm
13 | \b""",
14 | re.IGNORECASE | re.X,
15 | )
16 |
17 |
18 | def _expand_num(n: int) -> str:
19 | return _inflect.number_to_words(n)
20 |
21 |
22 | def _expand_time_english(match: "re.Match") -> str:
23 | hour = int(match.group(1))
24 | past_noon = hour >= 12
25 | time = []
26 | if hour > 12:
27 | hour -= 12
28 | elif hour == 0:
29 | hour = 12
30 | past_noon = True
31 | time.append(_expand_num(hour))
32 |
33 | minute = int(match.group(6))
34 | if minute > 0:
35 | if minute < 10:
36 | time.append("oh")
37 | time.append(_expand_num(minute))
38 | am_pm = match.group(7)
39 | if am_pm is None:
40 | time.append("p m" if past_noon else "a m")
41 | else:
42 | time.extend(list(am_pm.replace(".", "")))
43 | return " ".join(time)
44 |
45 |
46 | def expand_time_english(text: str) -> str:
47 | return re.sub(_time_re, _expand_time_english, text)
48 |
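A small sketch of the time expander above; the expected outputs in the comments are indicative of how `inflect` spells the numbers.

from TTS.tts.utils.text.english.time_norm import expand_time_english

print(expand_time_english("The call is at 10:30 am."))
# -> "The call is at ten thirty a m."

print(expand_time_english("Dinner at 19:05."))
# -> "Dinner at seven oh five p m."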
--------------------------------------------------------------------------------
/TTS/tts/utils/text/french/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/douhaohaode/xtts_v2/782de23d4ee58d25d1b1208c27336d6f665b6872/TTS/tts/utils/text/french/__init__.py
--------------------------------------------------------------------------------
/TTS/tts/utils/text/french/abbreviations.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | # List of (regular expression, replacement) pairs for abbreviations in french:
4 | abbreviations_fr = [
5 | (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
6 | for x in [
7 | ("M", "monsieur"),
8 | ("Mlle", "mademoiselle"),
9 | ("Mlles", "mesdemoiselles"),
10 | ("Mme", "Madame"),
11 | ("Mmes", "Mesdames"),
12 | ("N.B", "nota bene"),
13 | ("M", "monsieur"),
14 | ("p.c.q", "parce que"),
15 | ("Pr", "professeur"),
16 | ("qqch", "quelque chose"),
17 | ("rdv", "rendez-vous"),
18 | ("max", "maximum"),
19 | ("min", "minimum"),
20 | ("no", "numéro"),
21 | ("adr", "adresse"),
22 | ("dr", "docteur"),
23 | ("st", "saint"),
24 | ("co", "companie"),
25 | ("jr", "junior"),
26 | ("sgt", "sergent"),
27 | ("capt", "capitain"),
28 | ("col", "colonel"),
29 | ("av", "avenue"),
30 | ("av. J.-C", "avant Jésus-Christ"),
31 | ("apr. J.-C", "après Jésus-Christ"),
32 | ("art", "article"),
33 | ("boul", "boulevard"),
34 | ("c.-à-d", "c’est-à-dire"),
35 | ("etc", "et cetera"),
36 | ("ex", "exemple"),
37 | ("excl", "exclusivement"),
38 | ("boul", "boulevard"),
39 | ]
40 | ] + [
41 | (re.compile("\\b%s" % x[0]), x[1])
42 | for x in [
43 | ("Mlle", "mademoiselle"),
44 | ("Mlles", "mesdemoiselles"),
45 | ("Mme", "Madame"),
46 | ("Mmes", "Mesdames"),
47 | ]
48 | ]
49 |
--------------------------------------------------------------------------------
/TTS/tts/utils/text/japanese/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/douhaohaode/xtts_v2/782de23d4ee58d25d1b1208c27336d6f665b6872/TTS/tts/utils/text/japanese/__init__.py
--------------------------------------------------------------------------------
/TTS/tts/utils/text/korean/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/douhaohaode/xtts_v2/782de23d4ee58d25d1b1208c27336d6f665b6872/TTS/tts/utils/text/korean/__init__.py
--------------------------------------------------------------------------------
/TTS/tts/utils/text/korean/ko_dictionary.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # Add the word you want to the dictionary.
3 | etc_dictionary = {"1+1": "원플러스원", "2+1": "투플러스원"}
4 |
5 |
6 | english_dictionary = {
7 | "KOREA": "코리아",
8 | "IDOL": "아이돌",
9 | "IT": "아이티",
10 | "IQ": "아이큐",
11 | "UP": "업",
12 | "DOWN": "다운",
13 | "PC": "피씨",
14 | "CCTV": "씨씨티비",
15 | "SNS": "에스엔에스",
16 | "AI": "에이아이",
17 | "CEO": "씨이오",
18 | "A": "에이",
19 | "B": "비",
20 | "C": "씨",
21 | "D": "디",
22 | "E": "이",
23 | "F": "에프",
24 | "G": "지",
25 | "H": "에이치",
26 | "I": "아이",
27 | "J": "제이",
28 | "K": "케이",
29 | "L": "엘",
30 | "M": "엠",
31 | "N": "엔",
32 | "O": "오",
33 | "P": "피",
34 | "Q": "큐",
35 | "R": "알",
36 | "S": "에스",
37 | "T": "티",
38 | "U": "유",
39 | "V": "브이",
40 | "W": "더블유",
41 | "X": "엑스",
42 | "Y": "와이",
43 | "Z": "제트",
44 | }
45 |
--------------------------------------------------------------------------------
/TTS/tts/utils/text/korean/korean.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # Code based on https://github.com/carpedm20/multi-speaker-tacotron-tensorflow/blob/master/text/korean.py
3 | import re
4 |
5 | from TTS.tts.utils.text.korean.ko_dictionary import english_dictionary, etc_dictionary
6 |
7 |
8 | def normalize(text):
9 | text = text.strip()
10 | text = re.sub("[⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]", "", text)
11 | text = normalize_with_dictionary(text, etc_dictionary)
12 | text = normalize_english(text)
13 | text = text.lower()
14 | return text
15 |
16 |
17 | def normalize_with_dictionary(text, dic):
18 | if any(key in text for key in dic.keys()):
19 | pattern = re.compile("|".join(re.escape(key) for key in dic.keys()))
20 | return pattern.sub(lambda x: dic[x.group()], text)
21 | return text
22 |
23 |
24 | def normalize_english(text):
25 | def fn(m):
26 | word = m.group()
27 | if word in english_dictionary:
28 | return english_dictionary.get(word)
29 | return word
30 |
31 | text = re.sub("([A-Za-z]+)", fn, text)
32 | return text
33 |
--------------------------------------------------------------------------------
/TTS/tts/utils/text/korean/phonemizer.py:
--------------------------------------------------------------------------------
1 | from jamo import hangul_to_jamo
2 |
3 | from TTS.tts.utils.text.korean.korean import normalize
4 |
5 | g2p = None
6 |
7 |
8 | def korean_text_to_phonemes(text, character: str = "hangeul") -> str:
9 | """
10 |
11 | The input and output values look the same, but they are different in Unicode.
12 |
13 | example :
14 |
15 | input = '하늘' (Unicode : \ud558\ub298), (하 + 늘)
16 | output = '하늘' (Unicode :\u1112\u1161\u1102\u1173\u11af), (ᄒ + ᅡ + ᄂ + ᅳ + ᆯ)
17 |
18 | """
19 | global g2p # pylint: disable=global-statement
20 | if g2p is None:
21 | from g2pkk import G2p
22 |
23 | g2p = G2p()
24 |
25 | if character == "english":
26 | from anyascii import anyascii
27 |
28 | text = normalize(text)
29 | text = g2p(text)
30 | text = anyascii(text)
31 | return text
32 |
33 | text = normalize(text)
34 | text = g2p(text)
35 | text = list(hangul_to_jamo(text)) # '하늘' --> ['ᄒ', 'ᅡ', 'ᄂ', 'ᅳ', 'ᆯ']
36 | return "".join(text)
37 |
--------------------------------------------------------------------------------
/TTS/tts/utils/text/phonemizers/__init__.py:
--------------------------------------------------------------------------------
1 | from TTS.tts.utils.text.phonemizers.bangla_phonemizer import BN_Phonemizer
2 | from TTS.tts.utils.text.phonemizers.base import BasePhonemizer
3 | from TTS.tts.utils.text.phonemizers.belarusian_phonemizer import BEL_Phonemizer
4 | from TTS.tts.utils.text.phonemizers.espeak_wrapper import ESpeak
5 | from TTS.tts.utils.text.phonemizers.gruut_wrapper import Gruut
6 | from TTS.tts.utils.text.phonemizers.ko_kr_phonemizer import KO_KR_Phonemizer
7 | from TTS.tts.utils.text.phonemizers.zh_cn_phonemizer import ZH_CN_Phonemizer
8 |
9 | try:
10 | from TTS.tts.utils.text.phonemizers.ja_jp_phonemizer import JA_JP_Phonemizer
11 | except ImportError:
12 | JA_JP_Phonemizer = None
13 | pass
14 |
15 | PHONEMIZERS = {b.name(): b for b in (ESpeak, Gruut, KO_KR_Phonemizer, BN_Phonemizer)}
16 |
17 |
18 | ESPEAK_LANGS = list(ESpeak.supported_languages().keys())
19 | GRUUT_LANGS = list(Gruut.supported_languages())
20 |
21 |
22 | # Dict setting default phonemizers for each language
23 | # Add Gruut languages
24 | _ = [Gruut.name()] * len(GRUUT_LANGS)
25 | DEF_LANG_TO_PHONEMIZER = dict(list(zip(GRUUT_LANGS, _)))
26 |
27 |
28 | # Add ESpeak languages and override any existing ones
29 | _ = [ESpeak.name()] * len(ESPEAK_LANGS)
30 | _new_dict = dict(list(zip(list(ESPEAK_LANGS), _)))
31 | DEF_LANG_TO_PHONEMIZER.update(_new_dict)
32 |
33 |
34 | # Force default for some languages
35 | DEF_LANG_TO_PHONEMIZER["en"] = DEF_LANG_TO_PHONEMIZER["en-us"]
36 | DEF_LANG_TO_PHONEMIZER["zh-cn"] = ZH_CN_Phonemizer.name()
37 | DEF_LANG_TO_PHONEMIZER["ko-kr"] = KO_KR_Phonemizer.name()
38 | DEF_LANG_TO_PHONEMIZER["bn"] = BN_Phonemizer.name()
39 | DEF_LANG_TO_PHONEMIZER["be"] = BEL_Phonemizer.name()
40 |
41 |
42 | # The JA phonemizer has deal-breaking dependencies like MeCab on some systems,
43 | # so we only register it when its import succeeds.
44 | if JA_JP_Phonemizer is not None:
45 | PHONEMIZERS[JA_JP_Phonemizer.name()] = JA_JP_Phonemizer
46 | DEF_LANG_TO_PHONEMIZER["ja-jp"] = JA_JP_Phonemizer.name()
47 |
48 |
49 | def get_phonemizer_by_name(name: str, **kwargs) -> BasePhonemizer:
50 | """Initiate a phonemizer by name
51 |
52 | Args:
53 | name (str):
54 | Name of the phonemizer that should match `phonemizer.name()`.
55 |
56 | kwargs (dict):
57 | Extra keyword arguments that should be passed to the phonemizer.
58 | """
59 | if name == "espeak":
60 | return ESpeak(**kwargs)
61 | if name == "gruut":
62 | return Gruut(**kwargs)
63 | if name == "zh_cn_phonemizer":
64 | return ZH_CN_Phonemizer(**kwargs)
65 | if name == "ja_jp_phonemizer":
66 | if JA_JP_Phonemizer is None:
67 | raise ValueError(" ❗ You need to install JA phonemizer dependencies. Try `pip install TTS[ja]`.")
68 | return JA_JP_Phonemizer(**kwargs)
69 | if name == "ko_kr_phonemizer":
70 | return KO_KR_Phonemizer(**kwargs)
71 | if name == "bn_phonemizer":
72 | return BN_Phonemizer(**kwargs)
73 | if name == "be_phonemizer":
74 | return BEL_Phonemizer(**kwargs)
75 | raise ValueError(f"Phonemizer {name} not found")
76 |
77 |
78 | if __name__ == "__main__":
79 | print(DEF_LANG_TO_PHONEMIZER)
80 |
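A usage sketch for the registry above. The resolved backend depends on what is installed locally (espeak-ng or gruut), so the backend name in the comment is only indicative.

from TTS.tts.utils.text.phonemizers import DEF_LANG_TO_PHONEMIZER, get_phonemizer_by_name

# Pick the default backend for US English and phonemize a sentence.
backend_name = DEF_LANG_TO_PHONEMIZER["en-us"]  # usually "espeak" when espeak-ng is available
phonemizer = get_phonemizer_by_name(backend_name, language="en-us")
print(phonemizer.phonemize("Hello world.", separator="|"))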
--------------------------------------------------------------------------------
/TTS/tts/utils/text/phonemizers/bangla_phonemizer.py:
--------------------------------------------------------------------------------
1 | from typing import Dict
2 |
3 | from TTS.tts.utils.text.bangla.phonemizer import bangla_text_to_phonemes
4 | from TTS.tts.utils.text.phonemizers.base import BasePhonemizer
5 |
6 | _DEF_ZH_PUNCS = "、.,[]()?!〽~『』「」【】"
7 |
8 |
9 | class BN_Phonemizer(BasePhonemizer):
10 | """🐸TTS bn phonemizer using functions in `TTS.tts.utils.text.bangla.phonemizer`
11 |
12 | Args:
13 | punctuations (str):
14 | Set of characters to be treated as punctuation. Defaults to `_DEF_ZH_PUNCS`.
15 |
16 | keep_puncs (bool):
17 | If True, keep the punctuations after phonemization. Defaults to False.
18 |
19 | Example ::
20 |
21 | "这是,样本中文。" -> `d|ʒ|ø|4| |ʂ|ʏ|4| |,| |i|ɑ|ŋ|4|b|œ|n|3| |d|ʒ|o|ŋ|1|w|œ|n|2| |。`
22 |
23 | TODO: someone with Bangla knowledge should check this implementation
24 | """
25 |
26 | language = "bn"
27 |
28 | def __init__(self, punctuations=_DEF_ZH_PUNCS, keep_puncs=False, **kwargs): # pylint: disable=unused-argument
29 | super().__init__(self.language, punctuations=punctuations, keep_puncs=keep_puncs)
30 |
31 | @staticmethod
32 | def name():
33 | return "bn_phonemizer"
34 |
35 | @staticmethod
36 | def phonemize_bn(text: str, separator: str = "|") -> str: # pylint: disable=unused-argument
37 | ph = bangla_text_to_phonemes(text)
38 | return ph
39 |
40 | def _phonemize(self, text, separator):
41 | return self.phonemize_bn(text, separator)
42 |
43 | @staticmethod
44 | def supported_languages() -> Dict:
45 | return {"bn": "Bangla"}
46 |
47 | def version(self) -> str:
48 | return "0.0.1"
49 |
50 | def is_available(self) -> bool:
51 | return True
52 |
53 |
54 | if __name__ == "__main__":
55 | txt = "রাসূলুল্লাহ সাল্লাল্লাহু আলাইহি ওয়া সাল্লাম শিক্ষা দিয়েছেন যে, কেউ যদি কোন খারাপ কিছুর সম্মুখীন হয়, তখনও যেন বলে."
56 | e = BN_Phonemizer()
57 | print(e.supported_languages())
58 | print(e.version())
59 | print(e.language)
60 | print(e.name())
61 | print(e.is_available())
62 | print("`" + e.phonemize(txt) + "`")
63 |
--------------------------------------------------------------------------------
/TTS/tts/utils/text/phonemizers/belarusian_phonemizer.py:
--------------------------------------------------------------------------------
1 | from typing import Dict
2 |
3 | from TTS.tts.utils.text.belarusian.phonemizer import belarusian_text_to_phonemes
4 | from TTS.tts.utils.text.phonemizers.base import BasePhonemizer
5 |
6 | _DEF_BE_PUNCS = ",!." # TODO
7 |
8 |
9 | class BEL_Phonemizer(BasePhonemizer):
10 | """🐸TTS be phonemizer using functions in `TTS.tts.utils.text.belarusian.phonemizer`
11 |
12 | Args:
13 | punctuations (str):
14 | Set of characters to be treated as punctuation. Defaults to `_DEF_BE_PUNCS`.
15 |
16 | keep_puncs (bool):
17 | If True, keep the punctuations after phonemization. Defaults to False.
18 | """
19 |
20 | language = "be"
21 |
22 | def __init__(self, punctuations=_DEF_BE_PUNCS, keep_puncs=True, **kwargs): # pylint: disable=unused-argument
23 | super().__init__(self.language, punctuations=punctuations, keep_puncs=keep_puncs)
24 |
25 | @staticmethod
26 | def name():
27 | return "be_phonemizer"
28 |
29 | @staticmethod
30 | def phonemize_be(text: str, separator: str = "|") -> str: # pylint: disable=unused-argument
31 | return belarusian_text_to_phonemes(text)
32 |
33 | def _phonemize(self, text, separator):
34 | return self.phonemize_be(text, separator)
35 |
36 | @staticmethod
37 | def supported_languages() -> Dict:
38 | return {"be": "Belarusian"}
39 |
40 | def version(self) -> str:
41 | return "0.0.1"
42 |
43 | def is_available(self) -> bool:
44 | return True
45 |
46 |
47 | if __name__ == "__main__":
48 | txt = "тэст"
49 | e = BEL_Phonemizer()
50 | print(e.supported_languages())
51 | print(e.version())
52 | print(e.language)
53 | print(e.name())
54 | print(e.is_available())
55 | print("`" + e.phonemize(txt) + "`")
56 |
--------------------------------------------------------------------------------
/TTS/tts/utils/text/phonemizers/ja_jp_phonemizer.py:
--------------------------------------------------------------------------------
1 | from typing import Dict
2 |
3 | from TTS.tts.utils.text.japanese.phonemizer import japanese_text_to_phonemes
4 | from TTS.tts.utils.text.phonemizers.base import BasePhonemizer
5 |
6 | _DEF_JA_PUNCS = "、.,[]()?!〽~『』「」【】"
7 |
8 | _TRANS_TABLE = {"、": ","}
9 |
10 |
11 | def trans(text):
12 | for i, j in _TRANS_TABLE.items():
13 | text = text.replace(i, j)
14 | return text
15 |
16 |
17 | class JA_JP_Phonemizer(BasePhonemizer):
18 | """🐸TTS Ja-Jp phonemizer using functions in `TTS.tts.utils.text.japanese.phonemizer`
19 |
20 | TODO: someone with JA knowledge should check this implementation
21 |
22 | Example:
23 |
24 | >>> from TTS.tts.utils.text.phonemizers import JA_JP_Phonemizer
25 | >>> phonemizer = JA_JP_Phonemizer()
26 | >>> phonemizer.phonemize("どちらに行きますか?", separator="|")
27 | 'd|o|c|h|i|r|a|n|i|i|k|i|m|a|s|u|k|a|?'
28 |
29 | """
30 |
31 | language = "ja-jp"
32 |
33 | def __init__(self, punctuations=_DEF_JA_PUNCS, keep_puncs=True, **kwargs): # pylint: disable=unused-argument
34 | super().__init__(self.language, punctuations=punctuations, keep_puncs=keep_puncs)
35 |
36 | @staticmethod
37 | def name():
38 | return "ja_jp_phonemizer"
39 |
40 | def _phonemize(self, text: str, separator: str = "|") -> str:
41 | ph = japanese_text_to_phonemes(text)
42 | if separator is not None and separator != "":
43 | return separator.join(ph)
44 | return ph
45 |
46 | def phonemize(self, text: str, separator="|", language=None) -> str:
47 | """Custom phonemize for JP_JA
48 |
49 | Skip pre-post processing steps used by the other phonemizers.
50 | """
51 | return self._phonemize(text, separator)
52 |
53 | @staticmethod
54 | def supported_languages() -> Dict:
55 | return {"ja-jp": "Japanese (Japan)"}
56 |
57 | def version(self) -> str:
58 | return "0.0.1"
59 |
60 | def is_available(self) -> bool:
61 | return True
62 |
63 |
64 | # if __name__ == "__main__":
65 | # text = "これは、電話をかけるための私の日本語の例のテキストです。"
66 | # e = JA_JP_Phonemizer()
67 | # print(e.supported_languages())
68 | # print(e.version())
69 | # print(e.language)
70 | # print(e.name())
71 | # print(e.is_available())
72 | # print("`" + e.phonemize(text) + "`")
73 |
--------------------------------------------------------------------------------
/TTS/tts/utils/text/phonemizers/ko_kr_phonemizer.py:
--------------------------------------------------------------------------------
1 | from typing import Dict
2 |
3 | from TTS.tts.utils.text.korean.phonemizer import korean_text_to_phonemes
4 | from TTS.tts.utils.text.phonemizers.base import BasePhonemizer
5 |
6 | _DEF_KO_PUNCS = "、.,[]()?!〽~『』「」【】"
7 |
8 |
9 | class KO_KR_Phonemizer(BasePhonemizer):
10 | """🐸TTS ko_kr_phonemizer using functions in `TTS.tts.utils.text.korean.phonemizer`
11 |
12 | TODO: Add Korean to character (ᄀᄁᄂᄃᄄᄅᄆᄇᄈᄉᄊᄋᄌᄍᄎᄏᄐᄑ하ᅢᅣᅤᅥᅦᅧᅨᅩᅪᅫᅬᅭᅮᅯᅰᅱᅲᅳᅴᅵᆨᆩᆪᆫᆬᆭᆮᆯᆰᆱᆲᆳᆴᆵᆶᆷᆸᆹᆺᆻᆼᆽᆾᆿᇀᇁᇂ)
13 |
14 | Example:
15 |
16 | >>> from TTS.tts.utils.text.phonemizers import KO_KR_Phonemizer
17 | >>> phonemizer = KO_KR_Phonemizer()
18 | >>> phonemizer.phonemize("이 문장은 음성합성 테스트를 위한 문장입니다.", separator="|")
19 | 'ᄋ|ᅵ| |ᄆ|ᅮ|ᆫ|ᄌ|ᅡ|ᆼ|ᄋ|ᅳ| |ᄂ|ᅳ|ᆷ|ᄉ|ᅥ|ᆼ|ᄒ|ᅡ|ᆸ|ᄊ|ᅥ|ᆼ| |ᄐ|ᅦ|ᄉ|ᅳ|ᄐ|ᅳ|ᄅ|ᅳ| |ᄅ|ᅱ|ᄒ|ᅡ|ᆫ| |ᄆ|ᅮ|ᆫ|ᄌ|ᅡ|ᆼ|ᄋ|ᅵ|ᆷ|ᄂ|ᅵ|ᄃ|ᅡ|.'
20 |
21 | >>> from TTS.tts.utils.text.phonemizers import KO_KR_Phonemizer
22 | >>> phonemizer = KO_KR_Phonemizer()
23 | >>> phonemizer.phonemize("이 문장은 음성합성 테스트를 위한 문장입니다.", separator="|", character='english')
24 | 'I| |M|u|n|J|a|n|g|E|u| |N|e|u|m|S|e|o|n|g|H|a|b|S|s|e|o|n|g| |T|e|S|e|u|T|e|u|L|e|u| |L|w|i|H|a|n| |M|u|n|J|a|n|g|I|m|N|i|D|a|.'
25 |
26 | """
27 |
28 | language = "ko-kr"
29 |
30 | def __init__(self, punctuations=_DEF_KO_PUNCS, keep_puncs=True, **kwargs): # pylint: disable=unused-argument
31 | super().__init__(self.language, punctuations=punctuations, keep_puncs=keep_puncs)
32 |
33 | @staticmethod
34 | def name():
35 | return "ko_kr_phonemizer"
36 |
37 | def _phonemize(self, text: str, separator: str = "", character: str = "hangeul") -> str:
38 | ph = korean_text_to_phonemes(text, character=character)
39 | if separator is not None and separator != "":
40 | return separator.join(ph)
41 | return ph
42 |
43 | def phonemize(self, text: str, separator: str = "", character: str = "hangeul", language=None) -> str:
44 | return self._phonemize(text, separator, character)
45 |
46 | @staticmethod
47 | def supported_languages() -> Dict:
48 | return {"ko-kr": "hangeul(korean)"}
49 |
50 | def version(self) -> str:
51 | return "0.0.2"
52 |
53 | def is_available(self) -> bool:
54 | return True
55 |
56 |
57 | if __name__ == "__main__":
58 | texts = "이 문장은 음성합성 테스트를 위한 문장입니다."
59 | e = KO_KR_Phonemizer()
60 | print(e.supported_languages())
61 | print(e.version())
62 | print(e.language)
63 | print(e.name())
64 | print(e.is_available())
65 | print(e.phonemize(texts))
66 |
--------------------------------------------------------------------------------
/TTS/tts/utils/text/phonemizers/multi_phonemizer.py:
--------------------------------------------------------------------------------
1 | from typing import Dict, List
2 |
3 | from TTS.tts.utils.text.phonemizers import DEF_LANG_TO_PHONEMIZER, get_phonemizer_by_name
4 |
5 |
6 | class MultiPhonemizer:
7 | """🐸TTS multi-phonemizer that operates phonemizers for multiple langugages
8 |
9 | Args:
10 | lang_to_phonemizer_name (Dict):
11 | Custom phonemizer mapping if you want to change the defaults. In the format of
12 | `{"lang_code": "phonemizer_name"}`. When a language's value is an empty string, `DEF_LANG_TO_PHONEMIZER` is used for it. Defaults to `{}`.
13 |
14 | TODO: find a way to pass custom kwargs to the phonemizers
15 | """
16 |
17 | lang_to_phonemizer = {}
18 |
19 | def __init__(self, lang_to_phonemizer_name: Dict = {}) -> None: # pylint: disable=dangerous-default-value
20 | for k, v in lang_to_phonemizer_name.items():
21 | if v == "" and k in DEF_LANG_TO_PHONEMIZER.keys():
22 | lang_to_phonemizer_name[k] = DEF_LANG_TO_PHONEMIZER[k]
23 | elif v == "":
24 | raise ValueError(f"Phonemizer wasn't set for language {k} and doesn't have a default.")
25 | self.lang_to_phonemizer_name = lang_to_phonemizer_name
26 | self.lang_to_phonemizer = self.init_phonemizers(self.lang_to_phonemizer_name)
27 |
28 | @staticmethod
29 | def init_phonemizers(lang_to_phonemizer_name: Dict) -> Dict:
30 | lang_to_phonemizer = {}
31 | for k, v in lang_to_phonemizer_name.items():
32 | lang_to_phonemizer[k] = get_phonemizer_by_name(v, language=k)
33 | return lang_to_phonemizer
34 |
35 | @staticmethod
36 | def name():
37 | return "multi-phonemizer"
38 |
39 | def phonemize(self, text, separator="|", language=""):
40 | if language == "":
41 | raise ValueError("Language must be set for multi-phonemizer to phonemize.")
42 | return self.lang_to_phonemizer[language].phonemize(text, separator)
43 |
44 | def supported_languages(self) -> List:
45 | return list(self.lang_to_phonemizer.keys())
46 |
47 | def print_logs(self, level: int = 0):
48 | indent = "\t" * level
49 | print(f"{indent}| > phoneme language: {self.supported_languages()}")
50 | print(f"{indent}| > phoneme backend: {self.name()}")
51 |
52 |
53 | # if __name__ == "__main__":
54 | # texts = {
55 | # "tr": "Merhaba, bu Türkçe bit örnek!",
56 | # "en-us": "Hello, this is English example!",
57 | # "de": "Hallo, das ist ein Deutches Beipiel!",
58 | # "zh-cn": "这是中国的例子",
59 | # }
60 | # phonemes = {}
61 | # ph = MultiPhonemizer({"tr": "espeak", "en-us": "", "de": "gruut", "zh-cn": ""})
62 | # for lang, text in texts.items():
63 | # phoneme = ph.phonemize(text, language=lang)
64 | # phonemes[lang] = phoneme
65 | # print(phonemes)
66 |
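A usage sketch that mirrors the commented-out example above; note that `language` must be passed as a keyword, since the second positional argument of `phonemize` is the separator. The espeak/gruut backends are assumptions and must be installed for the resolved phonemizers to work.

from TTS.tts.utils.text.phonemizers.multi_phonemizer import MultiPhonemizer

# Empty strings fall back to DEF_LANG_TO_PHONEMIZER for that language.
mp = MultiPhonemizer({"en-us": "", "de": "gruut"})

print(mp.supported_languages())                        # ['en-us', 'de']
print(mp.phonemize("Hello there!", language="en-us"))
print(mp.phonemize("Hallo zusammen!", language="de"))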
--------------------------------------------------------------------------------
/TTS/tts/utils/text/phonemizers/zh_cn_phonemizer.py:
--------------------------------------------------------------------------------
1 | from typing import Dict
2 |
3 | from TTS.tts.utils.text.chinese_mandarin.phonemizer import chinese_text_to_phonemes
4 | from TTS.tts.utils.text.phonemizers.base import BasePhonemizer
5 |
6 | _DEF_ZH_PUNCS = "、.,[]()?!〽~『』「」【】"
7 |
8 |
9 | class ZH_CN_Phonemizer(BasePhonemizer):
10 | """🐸TTS Zh-Cn phonemizer using functions in `TTS.tts.utils.text.chinese_mandarin.phonemizer`
11 |
12 | Args:
13 | punctuations (str):
14 | Set of characters to be treated as punctuation. Defaults to `_DEF_ZH_PUNCS`.
15 |
16 | keep_puncs (bool):
17 | If True, keep the punctuations after phonemization. Defaults to False.
18 |
19 | Example ::
20 |
21 | "这是,样本中文。" -> `d|ʒ|ø|4| |ʂ|ʏ|4| |,| |i|ɑ|ŋ|4|b|œ|n|3| |d|ʒ|o|ŋ|1|w|œ|n|2| |。`
22 |
23 | TODO: someone with Mandarin knowledge should check this implementation
24 | """
25 |
26 | language = "zh-cn"
27 |
28 | def __init__(self, punctuations=_DEF_ZH_PUNCS, keep_puncs=False, **kwargs): # pylint: disable=unused-argument
29 | super().__init__(self.language, punctuations=punctuations, keep_puncs=keep_puncs)
30 |
31 | @staticmethod
32 | def name():
33 | return "zh_cn_phonemizer"
34 |
35 | @staticmethod
36 | def phonemize_zh_cn(text: str, separator: str = "|") -> str:
37 | ph = chinese_text_to_phonemes(text, separator)
38 | return ph
39 |
40 | def _phonemize(self, text, separator):
41 | return self.phonemize_zh_cn(text, separator)
42 |
43 | @staticmethod
44 | def supported_languages() -> Dict:
45 | return {"zh-cn": "Chinese (China)"}
46 |
47 | def version(self) -> str:
48 | return "0.0.1"
49 |
50 | def is_available(self) -> bool:
51 | return True
52 |
53 |
54 | # if __name__ == "__main__":
55 | # text = "这是,样本中文。"
56 | # e = ZH_CN_Phonemizer()
57 | # print(e.supported_languages())
58 | # print(e.version())
59 | # print(e.language)
60 | # print(e.name())
61 | # print(e.is_available())
62 | # print("`" + e.phonemize(text) + "`")
63 |
--------------------------------------------------------------------------------
/TTS/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/douhaohaode/xtts_v2/782de23d4ee58d25d1b1208c27336d6f665b6872/TTS/utils/__init__.py
--------------------------------------------------------------------------------
/TTS/utils/audio/__init__.py:
--------------------------------------------------------------------------------
1 | from TTS.utils.audio.processor import AudioProcessor
2 |
--------------------------------------------------------------------------------
/TTS/utils/capacitron_optimizer.py:
--------------------------------------------------------------------------------
1 | from typing import Generator
2 |
3 | from trainer.trainer_utils import get_optimizer
4 |
5 |
6 | class CapacitronOptimizer:
7 | """Double optimizer class for the Capacitron model."""
8 |
9 | def __init__(self, config: dict, model_params: Generator) -> None:
10 | self.primary_params, self.secondary_params = self.split_model_parameters(model_params)
11 |
12 | optimizer_names = list(config.optimizer_params.keys())
13 | optimizer_parameters = list(config.optimizer_params.values())
14 |
15 | self.primary_optimizer = get_optimizer(
16 | optimizer_names[0],
17 | optimizer_parameters[0],
18 | config.lr,
19 | parameters=self.primary_params,
20 | )
21 |
22 | self.secondary_optimizer = get_optimizer(
23 | optimizer_names[1],
24 | self.extract_optimizer_parameters(optimizer_parameters[1]),
25 | optimizer_parameters[1]["lr"],
26 | parameters=self.secondary_params,
27 | )
28 |
29 | self.param_groups = self.primary_optimizer.param_groups
30 |
31 | def first_step(self):
32 | self.secondary_optimizer.step()
33 | self.secondary_optimizer.zero_grad()
34 | self.primary_optimizer.zero_grad()
35 |
36 | def step(self):
37 | # Update param groups to display the correct learning rate
38 | self.param_groups = self.primary_optimizer.param_groups
39 | self.primary_optimizer.step()
40 |
41 | def zero_grad(self, set_to_none=False):
42 | self.primary_optimizer.zero_grad(set_to_none)
43 | self.secondary_optimizer.zero_grad(set_to_none)
44 |
45 | def load_state_dict(self, state_dict):
46 | self.primary_optimizer.load_state_dict(state_dict[0])
47 | self.secondary_optimizer.load_state_dict(state_dict[1])
48 |
49 | def state_dict(self):
50 | return [self.primary_optimizer.state_dict(), self.secondary_optimizer.state_dict()]
51 |
52 | @staticmethod
53 | def split_model_parameters(model_params: Generator) -> list:
54 | primary_params = []
55 | secondary_params = []
56 | for name, param in model_params:
57 | if param.requires_grad:
58 | if name == "capacitron_vae_layer.beta":
59 | secondary_params.append(param)
60 | else:
61 | primary_params.append(param)
62 | return [iter(primary_params), iter(secondary_params)]
63 |
64 | @staticmethod
65 | def extract_optimizer_parameters(params: dict) -> dict:
66 | """Extract parameters that are not the learning rate"""
67 | return {k: v for k, v in params.items() if k != "lr"}
68 |
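A minimal sketch of what this optimizer expects from the config and the model: `optimizer_params` holds two optimizer entries (the second one carrying its own `lr`), and any parameter named `capacitron_vae_layer.beta` is routed to the secondary optimizer. The toy module and the hyperparameter values below are hypothetical.

import torch
from types import SimpleNamespace

from TTS.utils.capacitron_optimizer import CapacitronOptimizer

class ToyCapacitron(torch.nn.Module):
    """Stand-in model: only the specially named beta parameter goes to the secondary optimizer."""

    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(4, 4)
        self.capacitron_vae_layer = torch.nn.Module()
        self.capacitron_vae_layer.beta = torch.nn.Parameter(torch.tensor(1.0))

model = ToyCapacitron()

# Hypothetical two-optimizer config: the first entry drives the main model at config.lr,
# the second entry drives the VAE beta with its own learning rate.
config = SimpleNamespace(
    lr=1e-3,
    optimizer_params={
        "Adam": {"betas": [0.9, 0.998], "weight_decay": 1e-6},
        "SGD": {"lr": 1e-5, "momentum": 0.9},
    },
)

opt = CapacitronOptimizer(config, model.named_parameters())
print(len(opt.state_dict()))  # -> 2: [primary state dict, secondary state dict]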
--------------------------------------------------------------------------------
/TTS/utils/distribute.py:
--------------------------------------------------------------------------------
1 | # edited from https://github.com/fastai/imagenet-fast/blob/master/imagenet_nv/distributed.py
2 | import torch
3 | import torch.distributed as dist
4 |
5 |
6 | def reduce_tensor(tensor, num_gpus):
7 | rt = tensor.clone()
8 | dist.all_reduce(rt, op=dist.reduce_op.SUM)
9 | rt /= num_gpus
10 | return rt
11 |
12 |
13 | def init_distributed(rank, num_gpus, group_name, dist_backend, dist_url):
14 | assert torch.cuda.is_available(), "Distributed mode requires CUDA."
15 |
16 | # Set cuda device so everything is done on the right GPU.
17 | torch.cuda.set_device(rank % torch.cuda.device_count())
18 |
19 | # Initialize distributed communication
20 | dist.init_process_group(dist_backend, init_method=dist_url, world_size=num_gpus, rank=rank, group_name=group_name)
21 |
--------------------------------------------------------------------------------
/TTS/utils/io.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pickle as pickle_tts
3 | from typing import Any, Callable, Dict, Union
4 |
5 | import fsspec
6 | import torch
7 |
8 | from TTS.utils.generic_utils import get_user_data_dir
9 |
10 |
11 | class RenamingUnpickler(pickle_tts.Unpickler):
12 | """Overload default pickler to solve module renaming problem"""
13 |
14 | def find_class(self, module, name):
15 | return super().find_class(module.replace("mozilla_voice_tts", "TTS"), name)
16 |
17 |
18 | class AttrDict(dict):
19 | """A custom dict which converts dict keys
20 | to class attributes"""
21 |
22 | def __init__(self, *args, **kwargs):
23 | super().__init__(*args, **kwargs)
24 | self.__dict__ = self
25 |
26 |
27 | def load_fsspec(
28 | path: str,
29 | map_location: Union[str, Callable, torch.device, Dict[Union[str, torch.device], Union[str, torch.device]]] = None,
30 | cache: bool = True,
31 | **kwargs,
32 | ) -> Any:
33 | """Like torch.load but can load from other locations (e.g. s3:// , gs://).
34 |
35 | Args:
36 | path: Any path or url supported by fsspec.
37 | map_location: torch.device or str.
38 | cache: If True, cache a remote file locally for subsequent calls. It is cached under `get_user_data_dir()/tts_cache`. Defaults to True.
39 | **kwargs: Keyword arguments forwarded to torch.load.
40 |
41 | Returns:
42 | Object stored in path.
43 | """
44 | is_local = os.path.isdir(path) or os.path.isfile(path)
45 | if cache and not is_local:
46 | with fsspec.open(
47 | f"filecache::{path}",
48 | filecache={"cache_storage": str(get_user_data_dir("tts_cache"))},
49 | mode="rb",
50 | ) as f:
51 | return torch.load(f, map_location=map_location, **kwargs)
52 | else:
53 | with fsspec.open(path, "rb") as f:
54 | return torch.load(f, map_location=map_location, **kwargs)
55 |
56 |
57 | def load_checkpoint(
58 | model, checkpoint_path, use_cuda=False, eval=False, cache=False
59 | ): # pylint: disable=redefined-builtin
60 | try:
61 | state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
62 | except ModuleNotFoundError:
63 | pickle_tts.Unpickler = RenamingUnpickler
64 | state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), pickle_module=pickle_tts, cache=cache)
65 | model.load_state_dict(state["model"])
66 | if use_cuda:
67 | model.cuda()
68 | if eval:
69 | model.eval()
70 | return model, state
71 |
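A short sketch of the fsspec-backed loader above; both paths are hypothetical, and a matching fsspec backend (e.g. `s3fs` for s3:// URLs) must be installed for remote loading. Local paths bypass the cache.

import torch

from TTS.utils.io import load_fsspec

# Local file: loaded directly via torch.load under the hood.
state = load_fsspec("/tmp/model.pth", map_location=torch.device("cpu"))

# Hypothetical remote checkpoint: downloaded once, cached under
# get_user_data_dir()/tts_cache, and reused on subsequent calls.
state = load_fsspec(
    "s3://my-bucket/checkpoints/best_model.pth",  # hypothetical URL
    map_location=torch.device("cpu"),
    cache=True,
)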
--------------------------------------------------------------------------------
/TTS/utils/training.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 |
4 |
5 | def check_update(model, grad_clip, ignore_stopnet=False, amp_opt_params=None):
6 | r"""Check model gradient against unexpected jumps and failures"""
7 | skip_flag = False
8 | if ignore_stopnet:
9 | if not amp_opt_params:
10 | grad_norm = torch.nn.utils.clip_grad_norm_(
11 | [param for name, param in model.named_parameters() if "stopnet" not in name], grad_clip
12 | )
13 | else:
14 | grad_norm = torch.nn.utils.clip_grad_norm_(amp_opt_params, grad_clip)
15 | else:
16 | if not amp_opt_params:
17 | grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
18 | else:
19 | grad_norm = torch.nn.utils.clip_grad_norm_(amp_opt_params, grad_clip)
20 |
21 | # compatibility with different torch versions
22 | if isinstance(grad_norm, float):
23 | if np.isinf(grad_norm):
24 | print(" | > Gradient is INF !!")
25 | skip_flag = True
26 | else:
27 | if torch.isinf(grad_norm):
28 | print(" | > Gradient is INF !!")
29 | skip_flag = True
30 | return grad_norm, skip_flag
31 |
32 |
33 | def gradual_training_scheduler(global_step, config):
34 | """Setup the gradual training schedule wrt number
35 | of active GPUs"""
36 | num_gpus = torch.cuda.device_count()
37 | if num_gpus == 0:
38 | num_gpus = 1
39 | new_values = None
40 | # we set the scheduling wrt num_gpus
41 | for values in config.gradual_training:
42 | if global_step * num_gpus >= values[0]:
43 | new_values = values
44 | return new_values[1], new_values[2]
45 |
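A sketch of the schedule format consumed by `gradual_training_scheduler`: each entry is `[start_step, reduction_factor r, batch_size]`, and the last entry whose start step does not exceed `global_step * num_gpus` wins. The values below are hypothetical.

from types import SimpleNamespace

from TTS.utils.training import gradual_training_scheduler

# Hypothetical Tacotron-style schedule.
config = SimpleNamespace(
    gradual_training=[
        [0, 7, 64],
        [10_000, 5, 64],
        [50_000, 3, 32],
        [100_000, 2, 32],
    ]
)

r, batch_size = gradual_training_scheduler(global_step=20_000, config=config)
print(r, batch_size)  # -> 5 64 on a single-GPU (or CPU) machine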
--------------------------------------------------------------------------------
/TTS/utils/vad.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torchaudio
3 |
4 |
5 | def read_audio(path):
6 | wav, sr = torchaudio.load(path)
7 |
8 | if wav.size(0) > 1:
9 | wav = wav.mean(dim=0, keepdim=True)
10 |
11 | return wav.squeeze(0), sr
12 |
13 |
14 | def resample_wav(wav, sr, new_sr):
15 | wav = wav.unsqueeze(0)
16 | transform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=new_sr)
17 | wav = transform(wav)
18 | return wav.squeeze(0)
19 |
20 |
21 | def map_timestamps_to_new_sr(vad_sr, new_sr, timestamps, just_beginning_and_end=False):
22 | factor = new_sr / vad_sr
23 | new_timestamps = []
24 | if just_beginning_and_end and timestamps:
25 | # get just the start and end timestamps
26 | new_dict = {"start": int(timestamps[0]["start"] * factor), "end": int(timestamps[-1]["end"] * factor)}
27 | new_timestamps.append(new_dict)
28 | else:
29 | for ts in timestamps:
30 | # map to the new SR
31 | new_dict = {"start": int(ts["start"] * factor), "end": int(ts["end"] * factor)}
32 | new_timestamps.append(new_dict)
33 |
34 | return new_timestamps
35 |
36 |
37 | def get_vad_model_and_utils(use_cuda=False, use_onnx=False):
38 | model, utils = torch.hub.load(
39 | repo_or_dir="snakers4/silero-vad", model="silero_vad", force_reload=True, onnx=use_onnx, force_onnx_cpu=True
40 | )
41 | if use_cuda:
42 | model = model.cuda()
43 |
44 | get_speech_timestamps, save_audio, _, _, collect_chunks = utils
45 | return model, get_speech_timestamps, save_audio, collect_chunks
46 |
47 |
48 | def remove_silence(
49 | model_and_utils, audio_path, out_path, vad_sample_rate=8000, trim_just_beginning_and_end=True, use_cuda=False
50 | ):
51 | # get the VAD model and utils functions
52 | model, get_speech_timestamps, _, collect_chunks = model_and_utils
53 |
54 | # read ground truth wav and resample the audio for the VAD
55 | try:
56 | wav, gt_sample_rate = read_audio(audio_path)
57 | except:
58 | print(f"> ❗ Failed to read {audio_path}")
59 | return None, False
60 |
61 | # if needed, resample the audio for the VAD model
62 | if gt_sample_rate != vad_sample_rate:
63 | wav_vad = resample_wav(wav, gt_sample_rate, vad_sample_rate)
64 | else:
65 | wav_vad = wav
66 |
67 | if use_cuda:
68 | wav_vad = wav_vad.cuda()
69 |
70 | # get speech timestamps from full audio file
71 | speech_timestamps = get_speech_timestamps(wav_vad, model, sampling_rate=vad_sample_rate, window_size_samples=768)
72 |
73 | # map the current speech_timestamps to the sample rate of the ground truth audio
74 | new_speech_timestamps = map_timestamps_to_new_sr(
75 | vad_sample_rate, gt_sample_rate, speech_timestamps, trim_just_beginning_and_end
76 | )
77 |
78 | # if there are speech timestamps, keep only the speech chunks; otherwise save the original wav
79 | if new_speech_timestamps:
80 | wav = collect_chunks(new_speech_timestamps, wav)
81 | is_speech = True
82 | else:
83 | print(f"> The file {audio_path} probably does not have speech please check it !!")
84 | is_speech = False
85 |
86 | # save
87 | torchaudio.save(out_path, wav[None, :], gt_sample_rate)
88 | return out_path, is_speech
89 |
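A usage sketch for the Silero-VAD based trimming above; the audio paths are hypothetical, and the first call downloads the VAD model from torch.hub.

from TTS.utils.vad import get_vad_model_and_utils, remove_silence

# Downloads (or force-reloads) the Silero VAD model from torch.hub.
model_and_utils = get_vad_model_and_utils(use_cuda=False, use_onnx=False)

# Hypothetical input/output paths; trims leading and trailing silence only.
out_path, is_speech = remove_silence(
    model_and_utils,
    audio_path="/data/wavs/sample.wav",
    out_path="/data/wavs_trimmed/sample.wav",
    vad_sample_rate=8000,
    trim_just_beginning_and_end=True,
)
print(out_path, is_speech)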
--------------------------------------------------------------------------------
/TTS/vc/configs/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/douhaohaode/xtts_v2/782de23d4ee58d25d1b1208c27336d6f665b6872/TTS/vc/configs/__init__.py
--------------------------------------------------------------------------------
/TTS/vc/models/__init__.py:
--------------------------------------------------------------------------------
1 | import importlib
2 | import re
3 | from typing import Dict, List, Union
4 |
5 |
6 | def to_camel(text):
7 | text = text.capitalize()
8 | return re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text)
9 |
10 |
11 | def setup_model(config: "Coqpit", samples: Union[List[List], List[Dict]] = None) -> "BaseVC":
12 | print(" > Using model: {}".format(config.model))
13 | # fetch the right model implementation.
14 | if "model" in config and config["model"].lower() == "freevc":
15 | MyModel = importlib.import_module("TTS.vc.models.freevc").FreeVC
16 | model = MyModel.init_from_config(config, samples)
17 | return model
18 |
--------------------------------------------------------------------------------
/TTS/vc/modules/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/douhaohaode/xtts_v2/782de23d4ee58d25d1b1208c27336d6f665b6872/TTS/vc/modules/__init__.py
--------------------------------------------------------------------------------
/TTS/vc/modules/freevc/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/douhaohaode/xtts_v2/782de23d4ee58d25d1b1208c27336d6f665b6872/TTS/vc/modules/freevc/__init__.py
--------------------------------------------------------------------------------
/TTS/vc/modules/freevc/speaker_encoder/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/douhaohaode/xtts_v2/782de23d4ee58d25d1b1208c27336d6f665b6872/TTS/vc/modules/freevc/speaker_encoder/__init__.py
--------------------------------------------------------------------------------
/TTS/vc/modules/freevc/speaker_encoder/audio.py:
--------------------------------------------------------------------------------
1 | import struct
2 | from pathlib import Path
3 | from typing import Optional, Union
4 |
5 | # import webrtcvad
6 | import librosa
7 | import numpy as np
8 | from scipy.ndimage.morphology import binary_dilation
9 |
10 | from TTS.vc.modules.freevc.speaker_encoder.hparams import *
11 |
12 | int16_max = (2**15) - 1
13 |
14 |
15 | def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray], source_sr: Optional[int] = None):
16 | """
17 | Applies the preprocessing operations used in training the Speaker Encoder to a waveform
18 | either on disk or in memory. The waveform will be resampled to match the data hyperparameters.
19 |
20 | :param fpath_or_wav: either a filepath to an audio file (many extensions are supported, not
21 | just .wav), or the waveform itself as a numpy array of floats.
22 | :param source_sr: if passing an audio waveform, the sampling rate of the waveform before
23 | preprocessing. After preprocessing, the waveform's sampling rate will match the data
24 | hyperparameters. If passing a filepath, the sampling rate will be automatically detected and
25 | this argument will be ignored.
26 | """
27 | # Load the wav from disk if needed
28 | if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path):
29 | wav, source_sr = librosa.load(fpath_or_wav, sr=None)
30 | else:
31 | wav = fpath_or_wav
32 |
33 | # Resample the wav if needed
34 | if source_sr is not None and source_sr != sampling_rate:
35 | wav = librosa.resample(wav, source_sr, sampling_rate)
36 |
37 | # Apply the preprocessing: normalize volume and shorten long silences
38 | wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True)
39 | wav = trim_long_silences(wav)
40 |
41 | return wav
42 |
43 |
44 | def wav_to_mel_spectrogram(wav):
45 | """
46 | Derives a mel spectrogram ready to be used by the encoder from a preprocessed audio waveform.
47 | Note: this is not a log-mel spectrogram.
48 | """
49 | frames = librosa.feature.melspectrogram(
50 | y=wav,
51 | sr=sampling_rate,
52 | n_fft=int(sampling_rate * mel_window_length / 1000),
53 | hop_length=int(sampling_rate * mel_window_step / 1000),
54 | n_mels=mel_n_channels,
55 | )
56 | return frames.astype(np.float32).T
57 |
58 |
59 | def normalize_volume(wav, target_dBFS, increase_only=False, decrease_only=False):
60 | if increase_only and decrease_only:
61 | raise ValueError("Both increase only and decrease only are set")
62 | dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav**2))
63 | if (dBFS_change < 0 and increase_only) or (dBFS_change > 0 and decrease_only):
64 | return wav
65 | return wav * (10 ** (dBFS_change / 20))
66 |
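A minimal usage sketch (not part of the repository) of the two helpers above, assuming a reference clip at a hypothetical path:

```python
# Illustrative only: run the speaker-encoder preprocessing end to end.
from TTS.vc.modules.freevc.speaker_encoder.audio import preprocess_wav, wav_to_mel_spectrogram

wav = preprocess_wav("reference.wav")   # load, resample to 16 kHz, normalize volume
mel = wav_to_mel_spectrogram(wav)       # (num_frames, mel_n_channels=40), float32, linear (not log) mel
print(wav.shape, mel.shape)
```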
--------------------------------------------------------------------------------
/TTS/vc/modules/freevc/speaker_encoder/hparams.py:
--------------------------------------------------------------------------------
1 | ## Mel-filterbank
2 | mel_window_length = 25 # In milliseconds
3 | mel_window_step = 10 # In milliseconds
4 | mel_n_channels = 40
5 |
6 |
7 | ## Audio
8 | sampling_rate = 16000
9 | # Number of spectrogram frames in a partial utterance
10 | partials_n_frames = 160 # 1600 ms
11 |
12 |
13 | ## Voice Activation Detection
14 | # Window size of the VAD. Must be either 10, 20 or 30 milliseconds.
15 | # This sets the granularity of the VAD. Should not need to be changed.
16 | vad_window_length = 30 # In milliseconds
17 | # Number of frames to average together when performing the moving average smoothing.
18 | # The larger this value, the larger the VAD variations must be to not get smoothed out.
19 | vad_moving_average_width = 8
20 | # Maximum number of consecutive silent frames a segment can have.
21 | vad_max_silence_length = 6
22 |
23 |
24 | ## Audio volume normalization
25 | audio_norm_target_dBFS = -30
26 |
27 |
28 | ## Model parameters
29 | model_hidden_size = 256
30 | model_embedding_size = 256
31 | model_num_layers = 3
32 |
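For orientation, a small throwaway calculation (not part of the repository) showing how the millisecond-based values above translate into sample and frame counts at the 16 kHz sampling rate:

```python
sampling_rate = 16000
mel_window_length, mel_window_step, partials_n_frames = 25, 10, 160

n_fft = int(sampling_rate * mel_window_length / 1000)         # 400 samples per analysis window
hop_length = int(sampling_rate * mel_window_step / 1000)      # 160 samples between frames
partial_seconds = partials_n_frames * mel_window_step / 1000  # 1.6 s per partial utterance
vad_window_samples = int(sampling_rate * 30 / 1000)           # 480 samples per 30 ms VAD window
print(n_fft, hop_length, partial_seconds, vad_window_samples)  # 400 160 1.6 480
```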
--------------------------------------------------------------------------------
/TTS/vc/modules/freevc/wavlm/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | import urllib.request
3 |
4 | import torch
5 |
6 | from TTS.utils.generic_utils import get_user_data_dir
7 | from TTS.vc.modules.freevc.wavlm.wavlm import WavLM, WavLMConfig
8 |
9 | model_uri = "https://github.com/coqui-ai/TTS/releases/download/v0.13.0_models/WavLM-Large.pt"
10 |
11 |
12 | def get_wavlm(device="cpu"):
13 | """Download the model and return the model object."""
14 |
15 | output_path = get_user_data_dir("tts")
16 |
17 | output_path = os.path.join(output_path, "wavlm")
18 | if not os.path.exists(output_path):
19 | os.makedirs(output_path)
20 |
21 | output_path = os.path.join(output_path, "WavLM-Large.pt")
22 | if not os.path.exists(output_path):
23 | print(f" > Downloading WavLM model to {output_path} ...")
24 | urllib.request.urlretrieve(model_uri, output_path)
25 |
26 | checkpoint = torch.load(output_path, map_location=torch.device(device))
27 | cfg = WavLMConfig(checkpoint["cfg"])
28 | wavlm = WavLM(cfg).to(device)
29 | wavlm.load_state_dict(checkpoint["model"])
30 | wavlm.eval()
31 | return wavlm
32 |
33 |
34 | if __name__ == "__main__":
35 | wavlm = get_wavlm()
36 |
--------------------------------------------------------------------------------
/TTS/vc/modules/freevc/wavlm/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "_name_or_path": "./wavlm-large/",
3 | "activation_dropout": 0.0,
4 | "adapter_kernel_size": 3,
5 | "adapter_stride": 2,
6 | "add_adapter": false,
7 | "apply_spec_augment": true,
8 | "architectures": [
9 | "WavLMModel"
10 | ],
11 | "attention_dropout": 0.1,
12 | "bos_token_id": 1,
13 | "classifier_proj_size": 256,
14 | "codevector_dim": 768,
15 | "contrastive_logits_temperature": 0.1,
16 | "conv_bias": false,
17 | "conv_dim": [
18 | 512,
19 | 512,
20 | 512,
21 | 512,
22 | 512,
23 | 512,
24 | 512
25 | ],
26 | "conv_kernel": [
27 | 10,
28 | 3,
29 | 3,
30 | 3,
31 | 3,
32 | 2,
33 | 2
34 | ],
35 | "conv_stride": [
36 | 5,
37 | 2,
38 | 2,
39 | 2,
40 | 2,
41 | 2,
42 | 2
43 | ],
44 | "ctc_loss_reduction": "sum",
45 | "ctc_zero_infinity": false,
46 | "diversity_loss_weight": 0.1,
47 | "do_stable_layer_norm": true,
48 | "eos_token_id": 2,
49 | "feat_extract_activation": "gelu",
50 | "feat_extract_dropout": 0.0,
51 | "feat_extract_norm": "layer",
52 | "feat_proj_dropout": 0.1,
53 | "feat_quantizer_dropout": 0.0,
54 | "final_dropout": 0.0,
55 | "gradient_checkpointing": false,
56 | "hidden_act": "gelu",
57 | "hidden_dropout": 0.1,
58 | "hidden_size": 1024,
59 | "initializer_range": 0.02,
60 | "intermediate_size": 4096,
61 | "layer_norm_eps": 1e-05,
62 | "layerdrop": 0.1,
63 | "mask_channel_length": 10,
64 | "mask_channel_min_space": 1,
65 | "mask_channel_other": 0.0,
66 | "mask_channel_prob": 0.0,
67 | "mask_channel_selection": "static",
68 | "mask_feature_length": 10,
69 | "mask_feature_min_masks": 0,
70 | "mask_feature_prob": 0.0,
71 | "mask_time_length": 10,
72 | "mask_time_min_masks": 2,
73 | "mask_time_min_space": 1,
74 | "mask_time_other": 0.0,
75 | "mask_time_prob": 0.075,
76 | "mask_time_selection": "static",
77 | "max_bucket_distance": 800,
78 | "model_type": "wavlm",
79 | "num_adapter_layers": 3,
80 | "num_attention_heads": 16,
81 | "num_buckets": 320,
82 | "num_codevector_groups": 2,
83 | "num_codevectors_per_group": 320,
84 | "num_conv_pos_embedding_groups": 16,
85 | "num_conv_pos_embeddings": 128,
86 | "num_ctc_classes": 80,
87 | "num_feat_extract_layers": 7,
88 | "num_hidden_layers": 24,
89 | "num_negatives": 100,
90 | "output_hidden_size": 1024,
91 | "pad_token_id": 0,
92 | "proj_codevector_dim": 768,
93 | "replace_prob": 0.5,
94 | "tokenizer_class": "Wav2Vec2CTCTokenizer",
95 | "torch_dtype": "float32",
96 | "transformers_version": "4.15.0.dev0",
97 | "use_weighted_layer_sum": false,
98 | "vocab_size": 32
99 | }
--------------------------------------------------------------------------------
/TTS/vocoder/README.md:
--------------------------------------------------------------------------------
1 | # Mozilla TTS Vocoders (Experimental)
2 |
3 | Here are vocoder model implementations that can be combined with the other TTS models.
4 |
5 | Currently, the following models are implemented:
6 |
7 | - Melgan
8 | - MultiBand-Melgan
9 | - ParallelWaveGAN
10 | - GAN-TTS (Discriminator Only)
11 |
12 | It is also very easy to adapt different vocoder models as we provide a flexible and modular (but not too modular) framework.
13 |
14 | ## Training a model
15 |
16 | You can find an example [Colab Notebook]() (coming soon) that trains MelGAN with the LJSpeech dataset.
17 |
18 | In order to train a new model, you need to gather all the wav files into a folder and point `data_path` in `config.json` to that folder.
19 |
20 | You need to define the other relevant parameters in your `config.json` and then start training with the following command.
21 |
22 | ```CUDA_VISIBLE_DEVICES='0' python tts/bin/train_vocoder.py --config_path path/to/config.json```
23 |
24 | Example config files can be found under the `tts/vocoder/configs/` folder.
25 |
26 | You can continue a previous training run with the following command.
27 |
28 | ```CUDA_VISIBLE_DEVICES='0' python tts/bin/train_vocoder.py --continue_path path/to/your/model/folder```
29 |
30 | You can fine-tune a pre-trained model with the following command.
31 |
32 | ```CUDA_VISIBLE_DEVICES='0' python tts/bin/train_vocoder.py --restore_path path/to/your/model.pth```
33 |
34 | Restoring a model starts a new training run in a different folder; it only restores the model weights from the given checkpoint file. Continuing a training run, on the other hand, resumes from the same directory where the previous run left off.
35 |
36 | You can also follow your training runs on Tensorboard as you do with our TTS models.
37 |
38 | ## Acknowledgement
39 | Thanks to @kan-bayashi, whose [repository](https://github.com/kan-bayashi/ParallelWaveGAN) was the starting point of our work.
40 |
--------------------------------------------------------------------------------
/TTS/vocoder/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/douhaohaode/xtts_v2/782de23d4ee58d25d1b1208c27336d6f665b6872/TTS/vocoder/__init__.py
--------------------------------------------------------------------------------
/TTS/vocoder/configs/__init__.py:
--------------------------------------------------------------------------------
1 | import importlib
2 | import os
3 | from inspect import isclass
4 |
5 | # import all files under configs/
6 | configs_dir = os.path.dirname(__file__)
7 | for file in os.listdir(configs_dir):
8 | path = os.path.join(configs_dir, file)
9 | if not file.startswith("_") and not file.startswith(".") and (file.endswith(".py") or os.path.isdir(path)):
10 | config_name = file[: file.find(".py")] if file.endswith(".py") else file
11 | module = importlib.import_module("TTS.vocoder.configs." + config_name)
12 | for attribute_name in dir(module):
13 | attribute = getattr(module, attribute_name)
14 |
15 | if isclass(attribute):
16 | # Add the class to this package's variables
17 | globals()[attribute_name] = attribute
18 |
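The loop above re-exports every class found in the config modules at package level. A short illustrative consequence (assuming the usual upstream config modules such as `hifigan_config.py` are present in this folder):

```python
# These classes live in individual modules, but the dynamic import above
# makes them importable straight from the package.
from TTS.vocoder.configs import HifiganConfig, MultibandMelganConfig

config = HifiganConfig()
print(type(config).__name__, config.model)  # "hifigan" in upstream Coqui TTS
```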
--------------------------------------------------------------------------------
/TTS/vocoder/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 |
3 | from coqpit import Coqpit
4 | from torch.utils.data import Dataset
5 |
6 | from TTS.utils.audio import AudioProcessor
7 | from TTS.vocoder.datasets.gan_dataset import GANDataset
8 | from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data
9 | from TTS.vocoder.datasets.wavegrad_dataset import WaveGradDataset
10 | from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset
11 |
12 |
13 | def setup_dataset(config: Coqpit, ap: AudioProcessor, is_eval: bool, data_items: List, verbose: bool) -> Dataset:
14 | if config.model.lower() in "gan":
15 | dataset = GANDataset(
16 | ap=ap,
17 | items=data_items,
18 | seq_len=config.seq_len,
19 | hop_len=ap.hop_length,
20 | pad_short=config.pad_short,
21 | conv_pad=config.conv_pad,
22 | return_pairs=config.diff_samples_for_G_and_D if "diff_samples_for_G_and_D" in config else False,
23 | is_training=not is_eval,
24 | return_segments=not is_eval,
25 | use_noise_augment=config.use_noise_augment,
26 | use_cache=config.use_cache,
27 | verbose=verbose,
28 | )
29 | dataset.shuffle_mapping()
30 | elif config.model.lower() == "wavegrad":
31 | dataset = WaveGradDataset(
32 | ap=ap,
33 | items=data_items,
34 | seq_len=config.seq_len,
35 | hop_len=ap.hop_length,
36 | pad_short=config.pad_short,
37 | conv_pad=config.conv_pad,
38 | is_training=not is_eval,
39 | return_segments=True,
40 | use_noise_augment=False,
41 | use_cache=config.use_cache,
42 | verbose=verbose,
43 | )
44 | elif config.model.lower() == "wavernn":
45 | dataset = WaveRNNDataset(
46 | ap=ap,
47 | items=data_items,
48 | seq_len=config.seq_len,
49 | hop_len=ap.hop_length,
50 | pad=config.model_params.pad,
51 | mode=config.model_params.mode,
52 | mulaw=config.model_params.mulaw,
53 | is_training=not is_eval,
54 | verbose=verbose,
55 | )
56 | else:
57 | raise ValueError(f" [!] Dataset for model {config.model.lower()} cannot be found.")
58 | return dataset
59 |
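A hedged sketch of how `setup_dataset` is typically driven by the vocoder config. The dataset path is hypothetical, and it assumes `WavegradConfig` from this package provides the dataset fields referenced above (`seq_len`, `pad_short`, `conv_pad`, `use_cache`):

```python
from TTS.utils.audio import AudioProcessor
from TTS.vocoder.configs import WavegradConfig
from TTS.vocoder.datasets import setup_dataset
from TTS.vocoder.datasets.preprocess import load_wav_data

config = WavegradConfig()
ap = AudioProcessor(**config.audio.to_dict())
eval_items, train_items = load_wav_data("/data/my_dataset/wavs", eval_split_size=16)

# config.model == "wavegrad" selects the WaveGradDataset branch above
train_set = setup_dataset(config, ap, is_eval=False, data_items=train_items, verbose=True)
eval_set = setup_dataset(config, ap, is_eval=True, data_items=eval_items, verbose=True)
```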
--------------------------------------------------------------------------------
/TTS/vocoder/datasets/preprocess.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import os
3 | from pathlib import Path
4 |
5 | import numpy as np
6 | from coqpit import Coqpit
7 | from tqdm import tqdm
8 |
9 | from TTS.utils.audio import AudioProcessor
10 | from TTS.utils.audio.numpy_transforms import mulaw_encode, quantize
11 |
12 |
13 | def preprocess_wav_files(out_path: str, config: Coqpit, ap: AudioProcessor):
14 | """Process wav and compute mel and quantized wave signal.
15 | It is mainly used by WaveRNN dataloader.
16 |
17 | Args:
18 | out_path (str): Parent folder path to save the files.
19 | config (Coqpit): Model config.
20 | ap (AudioProcessor): Audio processor.
21 | """
22 | os.makedirs(os.path.join(out_path, "quant"), exist_ok=True)
23 | os.makedirs(os.path.join(out_path, "mel"), exist_ok=True)
24 | wav_files = find_wav_files(config.data_path)
25 | for path in tqdm(wav_files):
26 | wav_name = Path(path).stem
27 | quant_path = os.path.join(out_path, "quant", wav_name + ".npy")
28 | mel_path = os.path.join(out_path, "mel", wav_name + ".npy")
29 | y = ap.load_wav(path)
30 | mel = ap.melspectrogram(y)
31 | np.save(mel_path, mel)
32 | if isinstance(config.mode, int):
33 | quant = (
34 | mulaw_encode(wav=y, mulaw_qc=config.mode)
35 | if config.model_args.mulaw
36 | else quantize(x=y, quantize_bits=config.mode)
37 | )
38 | np.save(quant_path, quant)
39 |
40 |
41 | def find_wav_files(data_path, file_ext="wav"):
42 | wav_paths = glob.glob(os.path.join(data_path, "**", f"*.{file_ext}"), recursive=True)
43 | return wav_paths
44 |
45 |
46 | def find_feat_files(data_path):
47 | feat_paths = glob.glob(os.path.join(data_path, "**", "*.npy"), recursive=True)
48 | return feat_paths
49 |
50 |
51 | def load_wav_data(data_path, eval_split_size, file_ext="wav"):
52 | wav_paths = find_wav_files(data_path, file_ext=file_ext)
53 | assert len(wav_paths) > 0, f" [!] {data_path} is empty."
54 | np.random.seed(0)
55 | np.random.shuffle(wav_paths)
56 | return wav_paths[:eval_split_size], wav_paths[eval_split_size:]
57 |
58 |
59 | def load_wav_feat_data(data_path, feat_path, eval_split_size):
60 | wav_paths = find_wav_files(data_path)
61 | feat_paths = find_feat_files(feat_path)
62 |
63 | wav_paths.sort(key=lambda x: Path(x).stem)
64 | feat_paths.sort(key=lambda x: Path(x).stem)
65 |
66 |     assert len(wav_paths) == len(feat_paths), f" [!] {len(wav_paths)} vs {len(feat_paths)}"
67 | for wav, feat in zip(wav_paths, feat_paths):
68 | wav_name = Path(wav).stem
69 | feat_name = Path(feat).stem
70 | assert wav_name == feat_name
71 |
72 | items = list(zip(wav_paths, feat_paths))
73 | np.random.seed(0)
74 | np.random.shuffle(items)
75 | return items[:eval_split_size], items[eval_split_size:]
76 |
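Worth noting when calling these helpers: the evaluation subset is returned first and the training subset second. A tiny illustrative example with a hypothetical dataset path:

```python
from TTS.vocoder.datasets.preprocess import load_wav_data

eval_wavs, train_wavs = load_wav_data("/data/LJSpeech-1.1/wavs", eval_split_size=10)
print(len(eval_wavs), len(train_wavs))  # 10, then the remaining files
```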
--------------------------------------------------------------------------------
/TTS/vocoder/layers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/douhaohaode/xtts_v2/782de23d4ee58d25d1b1208c27336d6f665b6872/TTS/vocoder/layers/__init__.py
--------------------------------------------------------------------------------
/TTS/vocoder/layers/hifigan.py:
--------------------------------------------------------------------------------
1 | from torch import nn
2 | from torch.nn.utils.parametrize import remove_parametrizations
3 |
4 |
5 | # pylint: disable=dangerous-default-value
6 | class ResStack(nn.Module):
7 | def __init__(self, kernel, channel, padding, dilations=[1, 3, 5]):
8 | super().__init__()
9 | resstack = []
10 | for dilation in dilations:
11 | resstack += [
12 | nn.LeakyReLU(0.2),
13 | nn.ReflectionPad1d(dilation),
14 | nn.utils.parametrizations.weight_norm(
15 | nn.Conv1d(channel, channel, kernel_size=kernel, dilation=dilation)
16 | ),
17 | nn.LeakyReLU(0.2),
18 | nn.ReflectionPad1d(padding),
19 | nn.utils.parametrizations.weight_norm(nn.Conv1d(channel, channel, kernel_size=1)),
20 | ]
21 | self.resstack = nn.Sequential(*resstack)
22 |
23 | self.shortcut = nn.utils.parametrizations.weight_norm(nn.Conv1d(channel, channel, kernel_size=1))
24 |
25 | def forward(self, x):
26 | x1 = self.shortcut(x)
27 | x2 = self.resstack(x)
28 | return x1 + x2
29 |
30 | def remove_weight_norm(self):
31 | remove_parametrizations(self.shortcut, "weight")
32 | remove_parametrizations(self.resstack[2], "weight")
33 | remove_parametrizations(self.resstack[5], "weight")
34 | remove_parametrizations(self.resstack[8], "weight")
35 | remove_parametrizations(self.resstack[11], "weight")
36 | remove_parametrizations(self.resstack[14], "weight")
37 | remove_parametrizations(self.resstack[17], "weight")
38 |
39 |
40 | class MRF(nn.Module):
41 | def __init__(self, kernels, channel, dilations=[1, 3, 5]): # # pylint: disable=dangerous-default-value
42 | super().__init__()
43 | self.resblock1 = ResStack(kernels[0], channel, 0, dilations)
44 | self.resblock2 = ResStack(kernels[1], channel, 6, dilations)
45 | self.resblock3 = ResStack(kernels[2], channel, 12, dilations)
46 |
47 | def forward(self, x):
48 | x1 = self.resblock1(x)
49 | x2 = self.resblock2(x)
50 | x3 = self.resblock3(x)
51 | return x1 + x2 + x3
52 |
53 | def remove_weight_norm(self):
54 | self.resblock1.remove_weight_norm()
55 | self.resblock2.remove_weight_norm()
56 | self.resblock3.remove_weight_norm()
57 |
--------------------------------------------------------------------------------
/TTS/vocoder/layers/melgan.py:
--------------------------------------------------------------------------------
1 | from torch import nn
2 | from torch.nn.utils.parametrizations import weight_norm
3 | from torch.nn.utils.parametrize import remove_parametrizations
4 |
5 |
6 | class ResidualStack(nn.Module):
7 | def __init__(self, channels, num_res_blocks, kernel_size):
8 | super().__init__()
9 |
10 | assert (kernel_size - 1) % 2 == 0, " [!] kernel_size has to be odd."
11 | base_padding = (kernel_size - 1) // 2
12 |
13 | self.blocks = nn.ModuleList()
14 | for idx in range(num_res_blocks):
15 | layer_kernel_size = kernel_size
16 | layer_dilation = layer_kernel_size**idx
17 | layer_padding = base_padding * layer_dilation
18 | self.blocks += [
19 | nn.Sequential(
20 | nn.LeakyReLU(0.2),
21 | nn.ReflectionPad1d(layer_padding),
22 | weight_norm(
23 | nn.Conv1d(channels, channels, kernel_size=kernel_size, dilation=layer_dilation, bias=True)
24 | ),
25 | nn.LeakyReLU(0.2),
26 | weight_norm(nn.Conv1d(channels, channels, kernel_size=1, bias=True)),
27 | )
28 | ]
29 |
30 | self.shortcuts = nn.ModuleList(
31 | [weight_norm(nn.Conv1d(channels, channels, kernel_size=1, bias=True)) for _ in range(num_res_blocks)]
32 | )
33 |
34 | def forward(self, x):
35 | for block, shortcut in zip(self.blocks, self.shortcuts):
36 | x = shortcut(x) + block(x)
37 | return x
38 |
39 | def remove_weight_norm(self):
40 | for block, shortcut in zip(self.blocks, self.shortcuts):
41 | remove_parametrizations(block[2], "weight")
42 | remove_parametrizations(block[4], "weight")
43 | remove_parametrizations(shortcut, "weight")
44 |
--------------------------------------------------------------------------------
/TTS/vocoder/layers/parallel_wavegan.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.nn import functional as F
3 |
4 |
5 | class ResidualBlock(torch.nn.Module):
6 | """Residual block module in WaveNet."""
7 |
8 | def __init__(
9 | self,
10 | kernel_size=3,
11 | res_channels=64,
12 | gate_channels=128,
13 | skip_channels=64,
14 | aux_channels=80,
15 | dropout=0.0,
16 | dilation=1,
17 | bias=True,
18 | use_causal_conv=False,
19 | ):
20 | super().__init__()
21 | self.dropout = dropout
22 | # no future time stamps available
23 | if use_causal_conv:
24 | padding = (kernel_size - 1) * dilation
25 | else:
26 | assert (kernel_size - 1) % 2 == 0, "Not support even number kernel size."
27 | padding = (kernel_size - 1) // 2 * dilation
28 | self.use_causal_conv = use_causal_conv
29 |
30 | # dilation conv
31 | self.conv = torch.nn.Conv1d(
32 | res_channels, gate_channels, kernel_size, padding=padding, dilation=dilation, bias=bias
33 | )
34 |
35 | # local conditioning
36 | if aux_channels > 0:
37 | self.conv1x1_aux = torch.nn.Conv1d(aux_channels, gate_channels, 1, bias=False)
38 | else:
39 | self.conv1x1_aux = None
40 |
41 | # conv output is split into two groups
42 | gate_out_channels = gate_channels // 2
43 | self.conv1x1_out = torch.nn.Conv1d(gate_out_channels, res_channels, 1, bias=bias)
44 | self.conv1x1_skip = torch.nn.Conv1d(gate_out_channels, skip_channels, 1, bias=bias)
45 |
46 | def forward(self, x, c):
47 | """
48 | x: B x D_res x T
49 | c: B x D_aux x T
50 | """
51 | residual = x
52 | x = F.dropout(x, p=self.dropout, training=self.training)
53 | x = self.conv(x)
54 |
55 | # remove future time steps if use_causal_conv conv
56 | x = x[:, :, : residual.size(-1)] if self.use_causal_conv else x
57 |
58 | # split into two part for gated activation
59 | splitdim = 1
60 | xa, xb = x.split(x.size(splitdim) // 2, dim=splitdim)
61 |
62 | # local conditioning
63 | if c is not None:
64 | assert self.conv1x1_aux is not None
65 | c = self.conv1x1_aux(c)
66 | ca, cb = c.split(c.size(splitdim) // 2, dim=splitdim)
67 | xa, xb = xa + ca, xb + cb
68 |
69 | x = torch.tanh(xa) * torch.sigmoid(xb)
70 |
71 | # for skip connection
72 | s = self.conv1x1_skip(x)
73 |
74 | # for residual connection
75 | x = (self.conv1x1_out(x) + residual) * (0.5**2)
76 |
77 | return x, s
78 |
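A quick shape check of the block's forward pass with the default channel sizes (illustrative only; random tensors stand in for real features):

```python
import torch
from TTS.vocoder.layers.parallel_wavegan import ResidualBlock

block = ResidualBlock()        # res=64, gate=128, skip=64, aux=80 channels
x = torch.randn(2, 64, 100)    # B x D_res x T
c = torch.randn(2, 80, 100)    # B x D_aux x T
out, skip = block(x, c)
print(out.shape, skip.shape)   # torch.Size([2, 64, 100]) torch.Size([2, 64, 100])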
--------------------------------------------------------------------------------
/TTS/vocoder/layers/pqmf.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | import torch.nn.functional as F
4 | from scipy import signal as sig
5 |
6 |
7 | # adapted from
8 | # https://github.com/kan-bayashi/ParallelWaveGAN/tree/master/parallel_wavegan
9 | class PQMF(torch.nn.Module):
10 | def __init__(self, N=4, taps=62, cutoff=0.15, beta=9.0):
11 | super().__init__()
12 |
13 | self.N = N
14 | self.taps = taps
15 | self.cutoff = cutoff
16 | self.beta = beta
17 |
18 | QMF = sig.firwin(taps + 1, cutoff, window=("kaiser", beta))
19 | H = np.zeros((N, len(QMF)))
20 | G = np.zeros((N, len(QMF)))
21 | for k in range(N):
22 | constant_factor = (
23 | (2 * k + 1) * (np.pi / (2 * N)) * (np.arange(taps + 1) - ((taps - 1) / 2))
24 | ) # TODO: (taps - 1) -> taps
25 | phase = (-1) ** k * np.pi / 4
26 | H[k] = 2 * QMF * np.cos(constant_factor + phase)
27 |
28 | G[k] = 2 * QMF * np.cos(constant_factor - phase)
29 |
30 | H = torch.from_numpy(H[:, None, :]).float()
31 | G = torch.from_numpy(G[None, :, :]).float()
32 |
33 | self.register_buffer("H", H)
34 | self.register_buffer("G", G)
35 |
36 | updown_filter = torch.zeros((N, N, N)).float()
37 | for k in range(N):
38 | updown_filter[k, k, 0] = 1.0
39 | self.register_buffer("updown_filter", updown_filter)
40 | self.N = N
41 |
42 | self.pad_fn = torch.nn.ConstantPad1d(taps // 2, 0.0)
43 |
44 | def forward(self, x):
45 | return self.analysis(x)
46 |
47 | def analysis(self, x):
48 | return F.conv1d(x, self.H, padding=self.taps // 2, stride=self.N)
49 |
50 | def synthesis(self, x):
51 | x = F.conv_transpose1d(x, self.updown_filter * self.N, stride=self.N)
52 | x = F.conv1d(x, self.G, padding=self.taps // 2)
53 | return x
54 |
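An illustrative analysis/synthesis round trip with the default 4-band filter bank (not part of the repository):

```python
import torch
from TTS.vocoder.layers.pqmf import PQMF

pqmf = PQMF(N=4)
x = torch.randn(1, 1, 16000)     # B x 1 x T waveform
bands = pqmf.analysis(x)         # B x 4 x T/4 sub-band signals
x_hat = pqmf.synthesis(bands)    # B x 1 x T reconstruction
print(bands.shape, x_hat.shape)  # torch.Size([1, 4, 4000]) torch.Size([1, 1, 16000])
```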
--------------------------------------------------------------------------------
/TTS/vocoder/models/base_vocoder.py:
--------------------------------------------------------------------------------
1 | from coqpit import Coqpit
2 |
3 | from TTS.model import BaseTrainerModel
4 |
5 | # pylint: skip-file
6 |
7 |
8 | class BaseVocoder(BaseTrainerModel):
9 | """Base `vocoder` class. Every new `vocoder` model must inherit this.
10 |
11 | It defines `vocoder` specific functions on top of `Model`.
12 |
13 | Notes on input/output tensor shapes:
14 | Any input or output tensor of the model must be shaped as
15 |
16 | - 3D tensors `batch x time x channels`
17 | - 2D tensors `batch x channels`
18 | - 1D tensors `batch x 1`
19 | """
20 |
21 | MODEL_TYPE = "vocoder"
22 |
23 | def __init__(self, config):
24 | super().__init__()
25 | self._set_model_args(config)
26 |
27 | def _set_model_args(self, config: Coqpit):
28 | """Setup model args based on the config type.
29 |
30 |         If the config is for training with a name like "*Config", then the model args are embedded in the
31 | config.model_args
32 |
33 |         If the config is for the model with a name like "*Args", then we assign it directly.
34 | """
35 |         # don't use isinstance() to avoid recursive imports
36 | if "Config" in config.__class__.__name__:
37 | if "characters" in config:
38 | _, self.config, num_chars = self.get_characters(config)
39 | self.config.num_chars = num_chars
40 | if hasattr(self.config, "model_args"):
41 | config.model_args.num_chars = num_chars
42 | if "model_args" in config:
43 | self.args = self.config.model_args
44 | # This is for backward compatibility
45 | if "model_params" in config:
46 | self.args = self.config.model_params
47 | else:
48 | self.config = config
49 | if "model_args" in config:
50 | self.args = self.config.model_args
51 | # This is for backward compatibility
52 | if "model_params" in config:
53 | self.args = self.config.model_params
54 | else:
55 | raise ValueError("config must be either a *Config or *Args")
56 |
--------------------------------------------------------------------------------
/TTS/vocoder/models/fullband_melgan_generator.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | from TTS.vocoder.models.melgan_generator import MelganGenerator
4 |
5 |
6 | class FullbandMelganGenerator(MelganGenerator):
7 | def __init__(
8 | self,
9 | in_channels=80,
10 | out_channels=1,
11 | proj_kernel=7,
12 | base_channels=512,
13 | upsample_factors=(2, 8, 2, 2),
14 | res_kernel=3,
15 | num_res_blocks=4,
16 | ):
17 | super().__init__(
18 | in_channels=in_channels,
19 | out_channels=out_channels,
20 | proj_kernel=proj_kernel,
21 | base_channels=base_channels,
22 | upsample_factors=upsample_factors,
23 | res_kernel=res_kernel,
24 | num_res_blocks=num_res_blocks,
25 | )
26 |
27 | @torch.no_grad()
28 | def inference(self, cond_features):
29 | cond_features = cond_features.to(self.layers[1].weight.device)
30 | cond_features = torch.nn.functional.pad(
31 | cond_features, (self.inference_padding, self.inference_padding), "replicate"
32 | )
33 | return self.layers(cond_features)
34 |
--------------------------------------------------------------------------------
/TTS/vocoder/models/melgan_discriminator.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from torch import nn
3 | from torch.nn.utils.parametrizations import weight_norm
4 |
5 |
6 | class MelganDiscriminator(nn.Module):
7 | def __init__(
8 | self,
9 | in_channels=1,
10 | out_channels=1,
11 | kernel_sizes=(5, 3),
12 | base_channels=16,
13 | max_channels=1024,
14 | downsample_factors=(4, 4, 4, 4),
15 | groups_denominator=4,
16 | ):
17 | super().__init__()
18 | self.layers = nn.ModuleList()
19 |
20 | layer_kernel_size = np.prod(kernel_sizes)
21 | layer_padding = (layer_kernel_size - 1) // 2
22 |
23 | # initial layer
24 | self.layers += [
25 | nn.Sequential(
26 | nn.ReflectionPad1d(layer_padding),
27 | weight_norm(nn.Conv1d(in_channels, base_channels, layer_kernel_size, stride=1)),
28 | nn.LeakyReLU(0.2, inplace=True),
29 | )
30 | ]
31 |
32 | # downsampling layers
33 | layer_in_channels = base_channels
34 | for downsample_factor in downsample_factors:
35 | layer_out_channels = min(layer_in_channels * downsample_factor, max_channels)
36 | layer_kernel_size = downsample_factor * 10 + 1
37 | layer_padding = (layer_kernel_size - 1) // 2
38 | layer_groups = layer_in_channels // groups_denominator
39 | self.layers += [
40 | nn.Sequential(
41 | weight_norm(
42 | nn.Conv1d(
43 | layer_in_channels,
44 | layer_out_channels,
45 | kernel_size=layer_kernel_size,
46 | stride=downsample_factor,
47 | padding=layer_padding,
48 | groups=layer_groups,
49 | )
50 | ),
51 | nn.LeakyReLU(0.2, inplace=True),
52 | )
53 | ]
54 | layer_in_channels = layer_out_channels
55 |
56 | # last 2 layers
57 | layer_padding1 = (kernel_sizes[0] - 1) // 2
58 | layer_padding2 = (kernel_sizes[1] - 1) // 2
59 | self.layers += [
60 | nn.Sequential(
61 | weight_norm(
62 | nn.Conv1d(
63 | layer_out_channels,
64 | layer_out_channels,
65 | kernel_size=kernel_sizes[0],
66 | stride=1,
67 | padding=layer_padding1,
68 | )
69 | ),
70 | nn.LeakyReLU(0.2, inplace=True),
71 | ),
72 | weight_norm(
73 | nn.Conv1d(
74 | layer_out_channels, out_channels, kernel_size=kernel_sizes[1], stride=1, padding=layer_padding2
75 | )
76 | ),
77 | ]
78 |
79 | def forward(self, x):
80 | feats = []
81 | for layer in self.layers:
82 | x = layer(x)
83 | feats.append(x)
84 | return x, feats
85 |
--------------------------------------------------------------------------------
/TTS/vocoder/models/melgan_generator.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 | from torch.nn.utils.parametrizations import weight_norm
4 |
5 | from TTS.utils.io import load_fsspec
6 | from TTS.vocoder.layers.melgan import ResidualStack
7 |
8 |
9 | class MelganGenerator(nn.Module):
10 | def __init__(
11 | self,
12 | in_channels=80,
13 | out_channels=1,
14 | proj_kernel=7,
15 | base_channels=512,
16 | upsample_factors=(8, 8, 2, 2),
17 | res_kernel=3,
18 | num_res_blocks=3,
19 | ):
20 | super().__init__()
21 |
22 | # assert model parameters
23 | assert (proj_kernel - 1) % 2 == 0, " [!] proj_kernel should be an odd number."
24 |
25 | # setup additional model parameters
26 | base_padding = (proj_kernel - 1) // 2
27 | act_slope = 0.2
28 | self.inference_padding = 2
29 |
30 | # initial layer
31 | layers = []
32 | layers += [
33 | nn.ReflectionPad1d(base_padding),
34 | weight_norm(nn.Conv1d(in_channels, base_channels, kernel_size=proj_kernel, stride=1, bias=True)),
35 | ]
36 |
37 | # upsampling layers and residual stacks
38 | for idx, upsample_factor in enumerate(upsample_factors):
39 | layer_in_channels = base_channels // (2**idx)
40 | layer_out_channels = base_channels // (2 ** (idx + 1))
41 | layer_filter_size = upsample_factor * 2
42 | layer_stride = upsample_factor
43 | layer_output_padding = upsample_factor % 2
44 | layer_padding = upsample_factor // 2 + layer_output_padding
45 | layers += [
46 | nn.LeakyReLU(act_slope),
47 | weight_norm(
48 | nn.ConvTranspose1d(
49 | layer_in_channels,
50 | layer_out_channels,
51 | layer_filter_size,
52 | stride=layer_stride,
53 | padding=layer_padding,
54 | output_padding=layer_output_padding,
55 | bias=True,
56 | )
57 | ),
58 | ResidualStack(channels=layer_out_channels, num_res_blocks=num_res_blocks, kernel_size=res_kernel),
59 | ]
60 |
61 | layers += [nn.LeakyReLU(act_slope)]
62 |
63 | # final layer
64 | layers += [
65 | nn.ReflectionPad1d(base_padding),
66 | weight_norm(nn.Conv1d(layer_out_channels, out_channels, proj_kernel, stride=1, bias=True)),
67 | nn.Tanh(),
68 | ]
69 | self.layers = nn.Sequential(*layers)
70 |
71 | def forward(self, c):
72 | return self.layers(c)
73 |
74 | def inference(self, c):
75 | c = c.to(self.layers[1].weight.device)
76 | c = torch.nn.functional.pad(c, (self.inference_padding, self.inference_padding), "replicate")
77 | return self.layers(c)
78 |
79 | def remove_weight_norm(self):
80 | for _, layer in enumerate(self.layers):
81 | if len(layer.state_dict()) != 0:
82 | try:
83 | nn.utils.parametrize.remove_parametrizations(layer, "weight")
84 | except ValueError:
85 | layer.remove_weight_norm()
86 |
87 | def load_checkpoint(
88 | self, config, checkpoint_path, eval=False, cache=False
89 | ): # pylint: disable=unused-argument, redefined-builtin
90 | state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
91 | self.load_state_dict(state["model"])
92 | if eval:
93 | self.eval()
94 | assert not self.training
95 | self.remove_weight_norm()
96 |
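A short shape sketch for the generator with its default settings, where the upsampling factors (8, 8, 2, 2) multiply to 256 output samples per mel frame (illustrative only):

```python
import torch
from TTS.vocoder.models.melgan_generator import MelganGenerator

model = MelganGenerator()     # default: 80 mel channels in, 1 waveform channel out
mel = torch.randn(1, 80, 50)  # B x n_mels x frames
wav = model(mel)              # B x 1 x (frames * 8 * 8 * 2 * 2)
print(wav.shape)              # torch.Size([1, 1, 12800])
```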
--------------------------------------------------------------------------------
/TTS/vocoder/models/melgan_multiscale_discriminator.py:
--------------------------------------------------------------------------------
1 | from torch import nn
2 |
3 | from TTS.vocoder.models.melgan_discriminator import MelganDiscriminator
4 |
5 |
6 | class MelganMultiscaleDiscriminator(nn.Module):
7 | def __init__(
8 | self,
9 | in_channels=1,
10 | out_channels=1,
11 | num_scales=3,
12 | kernel_sizes=(5, 3),
13 | base_channels=16,
14 | max_channels=1024,
15 | downsample_factors=(4, 4, 4),
16 | pooling_kernel_size=4,
17 | pooling_stride=2,
18 | pooling_padding=2,
19 | groups_denominator=4,
20 | ):
21 | super().__init__()
22 |
23 | self.discriminators = nn.ModuleList(
24 | [
25 | MelganDiscriminator(
26 | in_channels=in_channels,
27 | out_channels=out_channels,
28 | kernel_sizes=kernel_sizes,
29 | base_channels=base_channels,
30 | max_channels=max_channels,
31 | downsample_factors=downsample_factors,
32 | groups_denominator=groups_denominator,
33 | )
34 | for _ in range(num_scales)
35 | ]
36 | )
37 |
38 | self.pooling = nn.AvgPool1d(
39 | kernel_size=pooling_kernel_size, stride=pooling_stride, padding=pooling_padding, count_include_pad=False
40 | )
41 |
42 | def forward(self, x):
43 | scores = []
44 | feats = []
45 | for disc in self.discriminators:
46 | score, feat = disc(x)
47 | scores.append(score)
48 | feats.append(feat)
49 | x = self.pooling(x)
50 | return scores, feats
51 |
--------------------------------------------------------------------------------
/TTS/vocoder/models/multiband_melgan_generator.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | from TTS.vocoder.layers.pqmf import PQMF
4 | from TTS.vocoder.models.melgan_generator import MelganGenerator
5 |
6 |
7 | class MultibandMelganGenerator(MelganGenerator):
8 | def __init__(
9 | self,
10 | in_channels=80,
11 | out_channels=4,
12 | proj_kernel=7,
13 | base_channels=384,
14 | upsample_factors=(2, 8, 2, 2),
15 | res_kernel=3,
16 | num_res_blocks=3,
17 | ):
18 | super().__init__(
19 | in_channels=in_channels,
20 | out_channels=out_channels,
21 | proj_kernel=proj_kernel,
22 | base_channels=base_channels,
23 | upsample_factors=upsample_factors,
24 | res_kernel=res_kernel,
25 | num_res_blocks=num_res_blocks,
26 | )
27 | self.pqmf_layer = PQMF(N=4, taps=62, cutoff=0.15, beta=9.0)
28 |
29 | def pqmf_analysis(self, x):
30 | return self.pqmf_layer.analysis(x)
31 |
32 | def pqmf_synthesis(self, x):
33 | return self.pqmf_layer.synthesis(x)
34 |
35 | @torch.no_grad()
36 | def inference(self, cond_features):
37 | cond_features = cond_features.to(self.layers[1].weight.device)
38 | cond_features = torch.nn.functional.pad(
39 | cond_features, (self.inference_padding, self.inference_padding), "replicate"
40 | )
41 | return self.pqmf_synthesis(self.layers(cond_features))
42 |
--------------------------------------------------------------------------------
/TTS/vocoder/models/univnet_discriminator.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 | from torch import nn
4 | from torch.nn.utils import spectral_norm
5 | from torch.nn.utils.parametrizations import weight_norm
6 |
7 | from TTS.utils.audio.torch_transforms import TorchSTFT
8 | from TTS.vocoder.models.hifigan_discriminator import MultiPeriodDiscriminator
9 |
10 | LRELU_SLOPE = 0.1
11 |
12 |
13 | class SpecDiscriminator(nn.Module):
14 |     """Spectrogram discriminator operating on STFT magnitudes, used by the multi-resolution discriminator below."""
15 |
16 | def __init__(self, fft_size=1024, hop_length=120, win_length=600, use_spectral_norm=False):
17 | super().__init__()
18 | norm_f = weight_norm if use_spectral_norm is False else spectral_norm
19 | self.fft_size = fft_size
20 | self.hop_length = hop_length
21 | self.win_length = win_length
22 | self.stft = TorchSTFT(fft_size, hop_length, win_length)
23 | self.discriminators = nn.ModuleList(
24 | [
25 | norm_f(nn.Conv2d(1, 32, kernel_size=(3, 9), padding=(1, 4))),
26 | norm_f(nn.Conv2d(32, 32, kernel_size=(3, 9), stride=(1, 2), padding=(1, 4))),
27 | norm_f(nn.Conv2d(32, 32, kernel_size=(3, 9), stride=(1, 2), padding=(1, 4))),
28 | norm_f(nn.Conv2d(32, 32, kernel_size=(3, 9), stride=(1, 2), padding=(1, 4))),
29 | norm_f(nn.Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))),
30 | ]
31 | )
32 |
33 | self.out = norm_f(nn.Conv2d(32, 1, 3, 1, 1))
34 |
35 | def forward(self, y):
36 | fmap = []
37 | with torch.no_grad():
38 | y = y.squeeze(1)
39 | y = self.stft(y)
40 | y = y.unsqueeze(1)
41 | for _, d in enumerate(self.discriminators):
42 | y = d(y)
43 | y = F.leaky_relu(y, LRELU_SLOPE)
44 | fmap.append(y)
45 |
46 | y = self.out(y)
47 | fmap.append(y)
48 |
49 | return torch.flatten(y, 1, -1), fmap
50 |
51 |
52 | class MultiResSpecDiscriminator(torch.nn.Module):
53 | def __init__( # pylint: disable=dangerous-default-value
54 | self, fft_sizes=[1024, 2048, 512], hop_sizes=[120, 240, 50], win_lengths=[600, 1200, 240], window="hann_window"
55 | ):
56 | super().__init__()
57 | self.discriminators = nn.ModuleList(
58 | [
59 | SpecDiscriminator(fft_sizes[0], hop_sizes[0], win_lengths[0], window),
60 | SpecDiscriminator(fft_sizes[1], hop_sizes[1], win_lengths[1], window),
61 | SpecDiscriminator(fft_sizes[2], hop_sizes[2], win_lengths[2], window),
62 | ]
63 | )
64 |
65 | def forward(self, x):
66 | scores = []
67 | feats = []
68 | for d in self.discriminators:
69 | score, feat = d(x)
70 | scores.append(score)
71 | feats.append(feat)
72 |
73 | return scores, feats
74 |
75 |
76 | class UnivnetDiscriminator(nn.Module):
77 | """Univnet discriminator wrapping MPD and MSD."""
78 |
79 | def __init__(self):
80 | super().__init__()
81 | self.mpd = MultiPeriodDiscriminator()
82 | self.msd = MultiResSpecDiscriminator()
83 |
84 | def forward(self, x):
85 | """
86 | Args:
87 | x (Tensor): input waveform.
88 |
89 | Returns:
90 | List[Tensor]: discriminator scores.
91 | List[List[Tensor]]: list of list of features from each layers of each discriminator.
92 | """
93 | scores, feats = self.mpd(x)
94 | scores_, feats_ = self.msd(x)
95 | return scores + scores_, feats + feats_
96 |
--------------------------------------------------------------------------------
/TTS/vocoder/pqmf_output.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/douhaohaode/xtts_v2/782de23d4ee58d25d1b1208c27336d6f665b6872/TTS/vocoder/pqmf_output.wav
--------------------------------------------------------------------------------
/TTS/vocoder/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/douhaohaode/xtts_v2/782de23d4ee58d25d1b1208c27336d6f665b6872/TTS/vocoder/utils/__init__.py
--------------------------------------------------------------------------------
/TTS/vocoder/utils/generic_utils.py:
--------------------------------------------------------------------------------
1 | from typing import Dict
2 |
3 | import numpy as np
4 | import torch
5 | from matplotlib import pyplot as plt
6 |
7 | from TTS.tts.utils.visual import plot_spectrogram
8 | from TTS.utils.audio import AudioProcessor
9 |
10 |
11 | def interpolate_vocoder_input(scale_factor, spec):
12 | """Interpolate spectrogram by the scale factor.
13 | It is mainly used to match the sampling rates of
14 | the tts and vocoder models.
15 |
16 | Args:
17 | scale_factor (float): scale factor to interpolate the spectrogram
18 | spec (np.array): spectrogram to be interpolated
19 |
20 | Returns:
21 | torch.tensor: interpolated spectrogram.
22 | """
23 | print(" > before interpolation :", spec.shape)
24 | spec = torch.tensor(spec).unsqueeze(0).unsqueeze(0) # pylint: disable=not-callable
25 | spec = torch.nn.functional.interpolate(
26 | spec, scale_factor=scale_factor, recompute_scale_factor=True, mode="bilinear", align_corners=False
27 | ).squeeze(0)
28 | print(" > after interpolation :", spec.shape)
29 | return spec
30 |
31 |
32 | def plot_results(y_hat: torch.tensor, y: torch.tensor, ap: AudioProcessor, name_prefix: str = None) -> Dict:
33 | """Plot the predicted and the real waveform and their spectrograms.
34 |
35 | Args:
36 | y_hat (torch.tensor): Predicted waveform.
37 | y (torch.tensor): Real waveform.
38 | ap (AudioProcessor): Audio processor used to process the waveform.
39 | name_prefix (str, optional): Name prefix used to name the figures. Defaults to None.
40 |
41 | Returns:
42 | Dict: output figures keyed by the name of the figures.
43 |     """
44 | if name_prefix is None:
45 | name_prefix = ""
46 |
47 | # select an instance from batch
48 | y_hat = y_hat[0].squeeze().detach().cpu().numpy()
49 | y = y[0].squeeze().detach().cpu().numpy()
50 |
51 | spec_fake = ap.melspectrogram(y_hat).T
52 | spec_real = ap.melspectrogram(y).T
53 | spec_diff = np.abs(spec_fake - spec_real)
54 |
55 | # plot figure and save it
56 | fig_wave = plt.figure()
57 | plt.subplot(2, 1, 1)
58 | plt.plot(y)
59 | plt.title("groundtruth speech")
60 | plt.subplot(2, 1, 2)
61 | plt.plot(y_hat)
62 | plt.title("generated speech")
63 | plt.tight_layout()
64 | plt.close()
65 |
66 | figures = {
67 | name_prefix + "spectrogram/fake": plot_spectrogram(spec_fake),
68 | name_prefix + "spectrogram/real": plot_spectrogram(spec_real),
69 | name_prefix + "spectrogram/diff": plot_spectrogram(spec_diff),
70 | name_prefix + "speech_comparison": fig_wave,
71 | }
72 | return figures
73 |
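A hedged sketch of the rate-matching use case the docstring describes: stretching the time axis of a TTS mel spectrogram so a vocoder trained at a different sampling rate can consume it. The sample rates and the random array here are made up:

```python
import numpy as np
from TTS.vocoder.utils.generic_utils import interpolate_vocoder_input

tts_sr, vocoder_sr = 22050, 24000
mel = np.random.rand(80, 120).astype(np.float32)  # n_mels x frames from the TTS model
scale_factor = [1, vocoder_sr / tts_sr]           # keep the mel axis, stretch the time axis
vocoder_input = interpolate_vocoder_input(scale_factor, mel)
print(vocoder_input.shape)                        # torch.Size([1, 80, 130])
```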
--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
1 | import gradio as gr
2 | import tts_v2
3 | import tts_v2_api
4 |
5 | css = """
6 | #warning {background-color: #FFCCCB}
7 | .gradio-container {background-color: black}
8 | .feedback textarea {font-size: 24px !important}
9 | """
10 |
11 | language_list = ['zh-cn', 'en', 'fr', 'de', 'it', 'pt', 'pl', 'tr', 'ru', 'nl', 'cs', 'ar', 'es', 'hu', 'ko', 'ja']
12 |
13 | emotion_list = ["Neutral", "Happy", "Sad", "Angry", "Dull"]
14 |
15 | with gr.Blocks(css=css) as demo:
16 | gr.Markdown(f"### [coqui-ai-webui](https://github.com/douhaohaode/xtts)")
17 | with gr.Tab("文字转语音-本地"): # TTS
18 | with gr.Column():
19 | input_text = gr.Textbox(label="输入文本", lines=4, placeholder="在此输入文字。")
20 | language = gr.Dropdown(language_list, label="语言", value=language_list[0])
21 |
22 | with gr.Column():
23 | with gr.Row():
24 | temperature = gr.Slider(0.0, 1.0, value=0.75, step=0.05,
25 | label="temperature 值越大越有創意 , 犧牲穩定性")
26 | length_penalty = gr.Slider(0, 2.0, value=1.0, step=0.1, label="length_penalty")
27 | repetition_penalty = gr.Slider(1.0, 100.0, value=10.0, step=1, label="repetition_penalty")
28 |
29 | with gr.Row():
30 | top_k = gr.Slider(1.0, 2000.0, value=50.0, step=1.0, label="top_k")
31 | top_p = gr.Slider(0.1, 0.99, value=0.85, step=0.05, label="top_p")
32 | num_gpt_outputs = gr.Slider(1.0, 50.0, value=1.0, step=1.0,
33 | label="num_gpt_outputs 值越大创建伟大事物的概率更高")
34 |
35 | with gr.Row():
36 | gpt_cond_len = gr.Slider(1.0, 600.0, value=30.0, step=1, label="gpt_cond_len")
37 | gpt_cond_chunk_len = gr.Slider(1.0, 600.0, value=4.0, step=1, label="gpt_cond_chunk_len")
38 | max_ref_len = gr.Slider(1.0, 60.0, value=10.0, step=1, label="max_ref_len")
39 |
40 | with gr.Row():
41 | sound_norm_refs = gr.CheckboxGroup(["调节"], label="是否规范调节音频")
42 | gpt_batch_size = gr.Slider(1.0, 10000.0, value=1.0, step=1.0, label="gpt_batch_size")
43 | num_chars = gr.Slider(1.0, 1024.0, value=255.0, step=1, label="num_chars")
44 |
45 | with gr.Row():
46 | audio_filename = gr.Audio(label="Input audio.wav", type='filepath')
47 | output_audio = gr.Audio(label="生成的音频1", type='filepath')
48 |
49 | with gr.Row():
50 | clone_voice_button = gr.Button("创建音频文件")
51 | clone_voice_button.click(tts_v2.generate,
52 | inputs=[audio_filename, input_text, language, temperature, length_penalty,
53 | repetition_penalty, top_k, top_p,
54 | num_gpt_outputs, gpt_cond_len, gpt_cond_chunk_len, max_ref_len,
55 | sound_norm_refs, gpt_batch_size, num_chars],
56 | outputs=output_audio)
57 | with gr.Tab("文字转语音-API"): # TTS
58 | with gr.Column():
59 | input_text_api = gr.Textbox(label="输入文本", lines=4, placeholder="在此输入文字") # Input Text
60 | language_api = gr.Dropdown(language_list, label="语言", value=language_list[0], )
61 | with gr.Row():
62 | emotion = gr.Radio(emotion_list, label="emotion", value=emotion_list[0], info="模型的情感")
63 | speed = gr.Slider(0.0, 2.0, value=0, step=0.1, label="speed", info="速度系数")
64 |
65 | with gr.Row():
66 | audio_filename_api = gr.Audio(label="Input audio.wav", type='filepath')
67 | output_audio_api = gr.Audio(label="生成的音频", type="filepath")
68 | with gr.Row():
69 | clone_voice_button_api = gr.Button("创建音频文件")
70 | clone_voice_button_api.click(tts_v2_api.generate_api_custom,
71 | inputs=[audio_filename_api, input_text_api, language_api, emotion, speed],
72 | outputs=output_audio_api)
73 |
74 | demo.launch()
75 |
--------------------------------------------------------------------------------
/config.txt:
--------------------------------------------------------------------------------
1 | model (str) – Model name. Do not change this unless you know what you are doing.
2 | model_args (XttsArgs) – Model architecture arguments. Defaults to XttsArgs().
3 | audio (XttsAudioConfig) – Audio processing configuration. Defaults to XttsAudioConfig().
4 | model_dir (str) – Path to the folder that contains all the XTTS model files. Defaults to None.
5 | temperature (float) – Temperature for autoregressive model inference. Larger values make predictions more creative at the cost of stability. Defaults to 0.2.
6 | length_penalty (float) – Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log likelihood of the sequence (i.e. negative), length_penalty > 0.0 promotes longer sequences, while length_penalty < 0.0 encourages shorter sequences.
7 | repetition_penalty (float) – Parameter for repetition penalty. 1.0 means no penalty. Defaults to 2.0.
8 | top_p (float) – If set to a float < 1, only the smallest set of most probable tokens whose probabilities add up to top_p or higher are kept for generation. Defaults to 0.8.
9 | num_gpt_outputs (int) – Number of samples taken from the autoregressive model, all of which are filtered using CLVP. Since XTTS is a probabilistic model,
10 | more samples mean a higher probability of creating something "great". Defaults to 16.
11 | 
12 | gpt_cond_len (int) – Seconds of audio used as conditioning for the autoregressive model. Defaults to 3.
13 | max_ref_len (int) – Maximum number of seconds of audio used for decoder conditioning. Defaults to 10.
14 | sound_norm_refs (bool) – Whether to normalize the conditioning audio. Defaults to False.
15 | 
16 | 
17 | 
18 | gpt_batch_size (int) – Size of the autoregressive batch.
19 | enable_redaction (bool, optional) – Whether to enable redaction. Defaults to True.
20 | kv_cache (bool, optional) – Whether to use the kv_cache. Defaults to True.
21 | gpt_checkpoint (str, optional) – Checkpoint for the autoregressive model. Defaults to None.
22 | clvp_checkpoint (str, optional) – Checkpoint for the ConditionalLatentVariablePerseq model. Defaults to None.
23 | decoder_checkpoint (str, optional) – Checkpoint for the DiffTTS model. Defaults to None.
24 | num_chars (int, optional) – Maximum number of characters to generate. Defaults to 255.
25 | For the GPT model –
26 | 
27 | gpt_max_audio_tokens (int, optional) – Maximum number of mel tokens for the autoregressive model. Defaults to 604.
28 | gpt_max_text_tokens (int, optional) – Maximum number of text tokens for the autoregressive model. Defaults to 402.
29 | gpt_max_prompt_tokens (int, optional) – Maximum number of prompt tokens for the autoregressive model. Defaults to 70.
30 | gpt_layers (int, optional) – Number of layers in the autoregressive model. Defaults to 30.
31 | gpt_n_model_channels (int, optional) – Model dimension of the autoregressive model. Defaults to 1024.
32 | gpt_n_heads (int, optional) – Number of attention heads in the autoregressive model. Defaults to 16.
33 | gpt_number_text_tokens (int, optional) – Number of text tokens for the autoregressive model. Defaults to 255.
34 | gpt_start_text_token (int, optional) – Start text token for the autoregressive model. Defaults to 255.
35 | gpt_checkpointing (bool, optional) – Whether to use checkpointing for the autoregressive model. Defaults to False.
36 | gpt_train_solo_embeddings (bool, optional) – Whether to train the embeddings of the autoregressive model. Defaults to False.
37 | gpt_code_stride_len (int, optional) – hop_size of the dvae and hop_size of the gpt output. Defaults to 1024.
38 | gpt_use_masking_gt_prompt_approach (bool, optional) – If True, the ground truth is used as the prompt and the loss is masked to avoid repetition. Defaults to True.
39 | gpt_use_perceiver_resampler (bool, optional) – If True, the perceiver resampler from the Flamingo paper is used – https://arxiv.org/abs/2204.14198. Defaults to False.
--------------------------------------------------------------------------------
/rename_tool.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 |
4 |
5 | def path(path_name, t):
6 |     # build an output file path like ./output/<path_name>/<timestamp>.<t>, resolved relative to this file
7 | directory = os.path.dirname(os.path.abspath(__file__))
8 | timestamp = int(time.time())
9 | file_path = f"./output/{path_name}/{timestamp}.{t}"
10 | file_name = os.path.join(directory, file_path)
11 | return file_name
12 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | # core deps
2 | numpy==1.22.0;python_version<="3.10"
3 | numpy==1.24.3;python_version>"3.10"
4 | cython==0.29.30
5 | scipy>=1.11.2
6 | torch>=2.1
7 | torchaudio
8 | soundfile==0.12.*
9 | librosa==0.10.*
10 | scikit-learn==1.3.0
11 | numba==0.55.1;python_version<"3.9"
12 | numba==0.57.0;python_version>="3.9"
13 | inflect==5.6.*
14 | tqdm==4.64.*
15 | anyascii==0.3.*
16 | pyyaml==6.*
17 | fsspec==2023.6.0 # <= 2023.9.1 makes aux tests fail
18 | aiohttp==3.8.*
19 | packaging==23.1
20 | # deps for examples
21 | flask==2.*
22 | # deps for inference
23 | pysbd==0.3.4
24 | # deps for notebooks
25 | umap-learn==0.5.*
26 | pandas>=1.4,<2.0
27 | # deps for training
28 | matplotlib==3.7.*
29 | # coqui stack
30 | trainer>=0.0.32
31 | # config management
32 | coqpit>=0.0.16
33 | # chinese g2p deps
34 | jieba
35 | pypinyin
36 | # korean
37 | hangul_romanize
38 | # gruut+supported langs
39 | gruut[de,es,fr]==2.2.3
40 | # deps for korean
41 | jamo
42 | nltk
43 | g2pkk>=0.1.1
44 | # deps for bangla
45 | bangla
46 | bnnumerizer
47 | bnunicodenormalizer
48 | #deps for tortoise
49 | k_diffusion
50 | einops==0.6.*
51 | transformers==4.33.*
52 | #deps for bark
53 | encodec==0.1.*
54 | # deps for XTTS
55 | unidecode==1.3.*
56 | num2words
57 | gradio==4.1.2
58 | ffmpeg
59 | spacy
60 |
--------------------------------------------------------------------------------
/source/asset/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/douhaohaode/xtts_v2/782de23d4ee58d25d1b1208c27336d6f665b6872/source/asset/1.png
--------------------------------------------------------------------------------
/source/model_v2/hash.md5:
--------------------------------------------------------------------------------
1 | 10f92b55c512af7a8d39d650547a15a7
--------------------------------------------------------------------------------
/tts_v2.py:
--------------------------------------------------------------------------------
1 | import rename_tool
2 | import torch
3 | import torchaudio
4 | from TTS.tts.configs.xtts_config import XttsConfig
5 | from TTS.tts.models.xtts import Xtts
6 | import os
7 |
8 | current_dir = os.getcwd()
9 | config_path = os.path.join(current_dir, "source", "model_v2", "config.json")
10 | checkpoint_dir = os.path.join(current_dir, "source", "model_v2")
11 |
12 | config = XttsConfig()
13 | config.load_json(config_path)
14 | model = Xtts.init_from_config(config)
15 | model.load_checkpoint(config, checkpoint_dir=checkpoint_dir, eval=True)
16 | model.cuda()
17 |
18 |
19 | def generate(clone_audio_path, text, language, temperature, length_penalty, repetition_penalty, top_k, top_p, num_gpt_outputs, gpt_cond_len, gpt_cond_chunk_len, max_ref_len, sound_norm_refs, gpt_batch_size, num_chars):
20 |
21 | config.temperature = temperature
22 | config.length_penalty = float(length_penalty)
23 | config.repetition_penalty = float(repetition_penalty)
24 | config.top_k = top_k
25 | config.top_p = top_p
26 | config.num_gpt_outputs = num_gpt_outputs
27 | config.gpt_cond_len = gpt_cond_len
28 | config.gpt_cond_chunk_len = gpt_cond_chunk_len
29 |
30 | config.max_ref_len = max_ref_len
31 | repair = False
32 | if len(sound_norm_refs) > 0:
33 | repair = True
34 | config.sound_norm_refs = repair
35 |
36 | config.model_args.gpt_batch_size = gpt_batch_size
37 | config.model_args.num_chars = num_chars
38 | print(config)
39 |
40 | outputs = model.synthesize(
41 | text,
42 | config,
43 | speaker_wav=clone_audio_path,
44 | language=language,
45 | )
46 |
47 |     # write the generated waveform to a timestamped file under ./output/audio/
48 | output_audio = rename_tool.path("audio", "wav")
49 | torchaudio.save(output_audio, torch.tensor(outputs["wav"]).unsqueeze(0), 24000)
50 | return output_audio
51 |
--------------------------------------------------------------------------------
/tts_v2_api.py:
--------------------------------------------------------------------------------
1 | from TTS.api import TTS
2 | import torch
3 | import rename_tool
4 |
5 | tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)
6 |
7 | device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
8 | tts = tts.to(device)
9 |
10 | # generate speech by cloning a voice using default settings
11 | # def generate_api(clone_audio_path, text, language):
12 | # output_path = rename_tool.path("audio", "wav")
13 | # tts.tts_to_file(text=text,
14 | # file_path=output_path,
15 | # speaker_wav=clone_audio_path,
16 | # language=language)
17 | # return output_path
18 |
19 |
20 | # generate speech by cloning a voice using custom settings
21 | def generate_api_custom(clone_audio_path, text, language, emotion, speed):
22 | output_path = rename_tool.path("audio", "wav")
23 | tts.tts_to_file(text=text,
24 | file_path=output_path,
25 | speaker_wav=clone_audio_path,
26 | language=language,
27 | emotion=emotion,
28 | speed=speed,)
29 | return output_path
30 |
--------------------------------------------------------------------------------