├── .github
└── FUNDING.yml
├── .gitignore
├── .streamlit
└── config.toml
├── LICENSE
├── README.md
├── TTS
├── .models.json
├── VERSION
├── __init__.py
├── api.py
├── bin
│ ├── __init__.py
│ ├── collect_env_info.py
│ ├── compute_attention_masks.py
│ ├── compute_embeddings.py
│ ├── compute_statistics.py
│ ├── eval_encoder.py
│ ├── extract_tts_spectrograms.py
│ ├── find_unique_chars.py
│ ├── find_unique_phonemes.py
│ ├── remove_silence_using_vad.py
│ ├── resample.py
│ ├── synthesize.py
│ ├── train_encoder.py
│ ├── train_tts.py
│ ├── train_vocoder.py
│ └── tune_wavegrad.py
├── config
│ ├── __init__.py
│ └── shared_configs.py
├── encoder
│ ├── README.md
│ ├── __init__.py
│ ├── configs
│ │ ├── base_encoder_config.py
│ │ ├── emotion_encoder_config.py
│ │ └── speaker_encoder_config.py
│ ├── dataset.py
│ ├── losses.py
│ ├── models
│ │ ├── base_encoder.py
│ │ ├── lstm.py
│ │ └── resnet.py
│ ├── requirements.txt
│ └── utils
│ │ ├── __init__.py
│ │ ├── generic_utils.py
│ │ ├── io.py
│ │ ├── prepare_voxceleb.py
│ │ ├── training.py
│ │ └── visual.py
├── model.py
├── server
│ ├── README.md
│ ├── __init__.py
│ ├── conf.json
│ ├── server.py
│ ├── static
│ │ └── coqui-log-green-TTS.png
│ └── templates
│ │ ├── details.html
│ │ └── index.html
├── tts
│ ├── __init__.py
│ ├── configs
│ │ ├── __init__.py
│ │ ├── align_tts_config.py
│ │ ├── fast_pitch_config.py
│ │ ├── fast_speech_config.py
│ │ ├── fastspeech2_config.py
│ │ ├── glow_tts_config.py
│ │ ├── neuralhmm_tts_config.py
│ │ ├── overflow_config.py
│ │ ├── shared_configs.py
│ │ ├── speedy_speech_config.py
│ │ ├── tacotron2_config.py
│ │ ├── tacotron_config.py
│ │ └── vits_config.py
│ ├── datasets
│ │ ├── __init__.py
│ │ ├── dataset.py
│ │ └── formatters.py
│ ├── layers
│ │ ├── __init__.py
│ │ ├── align_tts
│ │ │ ├── __init__.py
│ │ │ ├── duration_predictor.py
│ │ │ └── mdn.py
│ │ ├── feed_forward
│ │ │ ├── __init__.py
│ │ │ ├── decoder.py
│ │ │ ├── duration_predictor.py
│ │ │ └── encoder.py
│ │ ├── generic
│ │ │ ├── __init__.py
│ │ │ ├── aligner.py
│ │ │ ├── gated_conv.py
│ │ │ ├── normalization.py
│ │ │ ├── pos_encoding.py
│ │ │ ├── res_conv_bn.py
│ │ │ ├── time_depth_sep_conv.py
│ │ │ ├── transformer.py
│ │ │ └── wavenet.py
│ │ ├── glow_tts
│ │ │ ├── __init__.py
│ │ │ ├── decoder.py
│ │ │ ├── duration_predictor.py
│ │ │ ├── encoder.py
│ │ │ ├── glow.py
│ │ │ └── transformer.py
│ │ ├── losses.py
│ │ ├── overflow
│ │ │ ├── __init__.py
│ │ │ ├── common_layers.py
│ │ │ ├── decoder.py
│ │ │ ├── neural_hmm.py
│ │ │ └── plotting_utils.py
│ │ ├── tacotron
│ │ │ ├── __init__.py
│ │ │ ├── attentions.py
│ │ │ ├── capacitron_layers.py
│ │ │ ├── common_layers.py
│ │ │ ├── gst_layers.py
│ │ │ ├── tacotron.py
│ │ │ └── tacotron2.py
│ │ └── vits
│ │ │ ├── discriminator.py
│ │ │ ├── networks.py
│ │ │ ├── stochastic_duration_predictor.py
│ │ │ └── transforms.py
│ ├── models
│ │ ├── __init__.py
│ │ ├── align_tts.py
│ │ ├── base_tacotron.py
│ │ ├── base_tts.py
│ │ ├── forward_tts.py
│ │ ├── glow_tts.py
│ │ ├── neuralhmm_tts.py
│ │ ├── overflow.py
│ │ ├── tacotron.py
│ │ ├── tacotron2.py
│ │ └── vits.py
│ └── utils
│ │ ├── __init__.py
│ │ ├── data.py
│ │ ├── helpers.py
│ │ ├── languages.py
│ │ ├── managers.py
│ │ ├── measures.py
│ │ ├── monotonic_align
│ │ ├── __init__.py
│ │ ├── core.pyx
│ │ └── setup.py
│ │ ├── speakers.py
│ │ ├── ssim.py
│ │ ├── synthesis.py
│ │ ├── text
│ │ ├── __init__.py
│ │ ├── characters.py
│ │ ├── chinese_mandarin
│ │ │ ├── __init__.py
│ │ │ ├── numbers.py
│ │ │ ├── phonemizer.py
│ │ │ └── pinyinToPhonemes.py
│ │ ├── cleaners.py
│ │ ├── cmudict.py
│ │ ├── english
│ │ │ ├── __init__.py
│ │ │ ├── abbreviations.py
│ │ │ ├── number_norm.py
│ │ │ └── time_norm.py
│ │ ├── french
│ │ │ ├── __init__.py
│ │ │ └── abbreviations.py
│ │ ├── japanese
│ │ │ ├── __init__.py
│ │ │ └── phonemizer.py
│ │ ├── korean
│ │ │ ├── __init__.py
│ │ │ ├── ko_dictionary.py
│ │ │ ├── korean.py
│ │ │ └── phonemizer.py
│ │ ├── phonemizers
│ │ │ ├── __init__.py
│ │ │ ├── base.py
│ │ │ ├── espeak_wrapper.py
│ │ │ ├── gruut_wrapper.py
│ │ │ ├── ja_jp_phonemizer.py
│ │ │ ├── ko_kr_phonemizer.py
│ │ │ ├── multi_phonemizer.py
│ │ │ └── zh_cn_phonemizer.py
│ │ ├── punctuation.py
│ │ └── tokenizer.py
│ │ └── visual.py
├── utils
│ ├── __init__.py
│ ├── audio
│ │ ├── __init__.py
│ │ ├── numpy_transforms.py
│ │ ├── processor.py
│ │ └── torch_transforms.py
│ ├── callbacks.py
│ ├── capacitron_optimizer.py
│ ├── distribute.py
│ ├── download.py
│ ├── downloaders.py
│ ├── generic_utils.py
│ ├── io.py
│ ├── manage.py
│ ├── radam.py
│ ├── samplers.py
│ ├── synthesizer.py
│ ├── training.py
│ └── vad.py
└── vocoder
│ ├── README.md
│ ├── __init__.py
│ ├── configs
│ ├── __init__.py
│ ├── fullband_melgan_config.py
│ ├── hifigan_config.py
│ ├── melgan_config.py
│ ├── multiband_melgan_config.py
│ ├── parallel_wavegan_config.py
│ ├── shared_configs.py
│ ├── univnet_config.py
│ ├── wavegrad_config.py
│ └── wavernn_config.py
│ ├── datasets
│ ├── __init__.py
│ ├── gan_dataset.py
│ ├── preprocess.py
│ ├── wavegrad_dataset.py
│ └── wavernn_dataset.py
│ ├── layers
│ ├── __init__.py
│ ├── hifigan.py
│ ├── losses.py
│ ├── lvc_block.py
│ ├── melgan.py
│ ├── parallel_wavegan.py
│ ├── pqmf.py
│ ├── qmf.dat
│ ├── upsample.py
│ └── wavegrad.py
│ ├── models
│ ├── __init__.py
│ ├── base_vocoder.py
│ ├── fullband_melgan_generator.py
│ ├── gan.py
│ ├── hifigan_discriminator.py
│ ├── hifigan_generator.py
│ ├── melgan_discriminator.py
│ ├── melgan_generator.py
│ ├── melgan_multiscale_discriminator.py
│ ├── multiband_melgan_generator.py
│ ├── parallel_wavegan_discriminator.py
│ ├── parallel_wavegan_generator.py
│ ├── random_window_discriminator.py
│ ├── univnet_discriminator.py
│ ├── univnet_generator.py
│ ├── wavegrad.py
│ └── wavernn.py
│ ├── pqmf_output.wav
│ └── utils
│ ├── __init__.py
│ ├── distribution.py
│ └── generic_utils.py
├── app.py
├── clonner_output
└── sample.txt
├── helpers.py
├── language_model
└── sample.txt
├── model_list.json
├── output
└── sample.ttx
├── packages.txt
├── requirements.txt
└── setup.py
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | # These are supported funding model platforms
2 |
3 | github: everydaycodings
4 | patreon: # Replace with a single Patreon username
5 | open_collective: # Replace with a single Open Collective username
6 | ko_fi: # Replace with a single Ko-fi username
7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
9 | liberapay: # Replace with a single Liberapay username
10 | issuehunt: # Replace with a single IssueHunt username
11 | otechie: # Replace with a single Otechie username
12 | lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry
13 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
14 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | docs/
2 | images/
3 | notebooks/
4 | language_model/*/
5 | output/*.wav
6 | recipes/
7 | tests/
8 | test.ipynb
9 | extras/
10 | .ipynb_checkpoints/
11 | test_sound/
12 | language_model_test/
13 | downloader.ipynb
14 | __pycache__
15 | temp.zip
16 | clonner_output/.wav
--------------------------------------------------------------------------------
/.streamlit/config.toml:
--------------------------------------------------------------------------------
1 | [theme]
2 | base="dark"
3 |
4 | [server]
5 | maxUploadSize = 1028
6 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 Kumar Saksham
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # MimicMania
2 |
3 | MimicMania is a web application that provides text-to-speech (TTS) and voice cloning capabilities. The application uses deep learning models to generate high-quality speech in multiple languages, with options to customize the pitch, speed, and volume of the generated audio. MimicMania's TTS engine is built on top of the Tacotron 2 and WaveGlow models, while its voice cloning capabilities are based on the FastSpeech and MelGAN models.
4 |
5 | ## Features
6 |
7 | - Easy-to-use interface: MimicMania has a user-friendly interface that allows users to quickly generate speech in their desired language and voice.
8 | - Multiple languages: MimicMania supports multiple languages, including English, Spanish, French, and more.
9 | - Multiple voices: MimicMania provides a range of voices for each language, giving users a wide variety of options to choose from.
10 | - Customizable parameters: Users can adjust the speed, pitch, and volume of the generated speech to fit their specific needs.
11 | - Voice cloning: MimicMania's voice cloning technology allows users to clone their own voice, making it easier than ever to create personalized voiceovers.
12 |
13 |
14 | ## WebApp Demo
15 |
16 | Click the Mega link to see the demo: [Link For The Video](https://mega.nz/file/5ShTiARB#aO4ecf518xnBnj1HKx98y4vw4ozQwSRFgKLifxFJO-E)
17 |
18 |
19 | ## Things to Be Downloaded
20 |
21 | Before you can install and use MimicMania, you will need to download and install the following:
22 |
23 | - **ffmpeg:** MimicMania requires the ffmpeg library for audio encoding and decoding. To install ffmpeg, execute the command `apt-get install ffmpeg` in your terminal.
24 | - **espeak-ng:** MimicMania uses the espeak-ng text-to-speech engine for generating speech in various languages. To install espeak-ng, execute the command `sudo apt-get install espeak-ng` in your terminal.
25 | - **espeak:** MimicMania also uses the espeak text-to-speech engine for generating speech in various languages. To install espeak, execute the command `sudo apt-get install espeak` in your terminal.
26 |
27 | Additionally, you will also need to download the required module and Python dependencies as described in the next section.
28 |
29 | ## Installation
30 |
31 | To install and run MimicMania, follow these steps:
32 |
33 | 1. Clone this repository to your local machine using `git clone https://github.com/everydaycodings/MimicMania.git`
34 | 2. Navigate to the project directory using `cd MimicMania`
35 | 3. Run the command `python setup.py` to download the required modules. This process may take some time, as the required module is around 6 GB.
36 | 4. Run the command `pip install -r requirements.txt` to download all the Python dependencies.
37 | 5. Type the command `streamlit run app.py` to start the web application.
38 |
39 |
40 |
41 | ## Contributing
42 |
43 | If you'd like to contribute to MimicMania, please fork the repository and create a pull request. We welcome contributions of all kinds, including bug fixes, new features, and documentation improvements.
44 |
45 | ## Credits
46 |
47 | MimicMania was developed by **Kumar Saksham (everydaycodings)** with the help of various open source resources.
48 |
49 | We would like to extend a special thank you to [coqui-ai/TTS](https://github.com/coqui-ai/TTS) for providing their text-to-speech model as a resource for our project.
50 |
51 | MimicMania is licensed under the [MIT License](https://opensource.org/licenses/MIT).
52 |
53 | ## Support
54 |
55 | If you have any questions or issues with MimicMania, please contact us at [everydaycodings@gmail.com](mailto:everydaycodings@gmail.com) or reach out to us on Twitter [@everydaycodings](https://twitter.com/everydaycodings) or Medium [@everydaycodings](https://medium.com/@everydaycodings).
56 |
57 | We're always happy to help!
58 |
--------------------------------------------------------------------------------
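A minimal usage sketch (not a file from this repository) of how the bundled TTS package can be driven for plain synthesis and voice cloning; the model names, file paths, and keyword arguments below are assumptions and may differ from what app.py actually does:

# Hypothetical example -- model names, paths, and keyword arguments are assumptions.
from TTS.api import TTS

# Plain text-to-speech with a single-speaker English model.
tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False, gpu=False)
tts.tts_to_file(text="Hello from MimicMania.", file_path="output/hello.wav")

# Voice cloning with a multilingual model, conditioned on a reference clip.
cloner = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=False)
cloner.tts_to_file(
    text="This should sound like the reference speaker.",
    speaker_wav="clonner_output/reference.wav",  # hypothetical reference recording
    language="en",
    file_path="clonner_output/cloned.wav",
)

--------------------------------------------------------------------------------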
/TTS/VERSION:
--------------------------------------------------------------------------------
1 | 0.11.1
--------------------------------------------------------------------------------
/TTS/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | with open(os.path.join(os.path.dirname(__file__), "VERSION"), "r", encoding="utf-8") as f:
4 | version = f.read().strip()
5 |
6 | __version__ = version
7 |
--------------------------------------------------------------------------------
/TTS/bin/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/everydaycodings/MimicMania/eecbaae2c430785145afe48bc92a7634ab9fe660/TTS/bin/__init__.py
--------------------------------------------------------------------------------
/TTS/bin/collect_env_info.py:
--------------------------------------------------------------------------------
1 | """Get detailed info about the working environment."""
2 | import os
3 | import platform
4 | import sys
5 |
6 | import numpy
7 | import torch
8 |
9 | sys.path += [os.path.abspath(".."), os.path.abspath(".")]
10 | import json
11 |
12 | import TTS
13 |
14 |
15 | def system_info():
16 | return {
17 | "OS": platform.system(),
18 | "architecture": platform.architecture(),
19 | "version": platform.version(),
20 | "processor": platform.processor(),
21 | "python": platform.python_version(),
22 | }
23 |
24 |
25 | def cuda_info():
26 | return {
27 | "GPU": [torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())],
28 | "available": torch.cuda.is_available(),
29 | "version": torch.version.cuda,
30 | }
31 |
32 |
33 | def package_info():
34 | return {
35 | "numpy": numpy.__version__,
36 | "PyTorch_version": torch.__version__,
37 | "PyTorch_debug": torch.version.debug,
38 | "TTS": TTS.__version__,
39 | }
40 |
41 |
42 | def main():
43 | details = {"System": system_info(), "CUDA": cuda_info(), "Packages": package_info()}
44 | print(json.dumps(details, indent=4, sort_keys=True))
45 |
46 |
47 | if __name__ == "__main__":
48 | main()
49 |
--------------------------------------------------------------------------------
/TTS/bin/compute_statistics.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import argparse
5 | import glob
6 | import os
7 |
8 | import numpy as np
9 | from tqdm import tqdm
10 |
11 | # from TTS.utils.io import load_config
12 | from TTS.config import load_config
13 | from TTS.tts.datasets import load_tts_samples
14 | from TTS.utils.audio import AudioProcessor
15 |
16 |
17 | def main():
18 | """Run preprocessing process."""
19 |     parser = argparse.ArgumentParser(description="Compute mean and variance of spectrogram features.")
20 |     parser.add_argument("config_path", type=str, help="TTS config file path to define audio processing parameters.")
21 | parser.add_argument("out_path", type=str, help="save path (directory and filename).")
22 | parser.add_argument(
23 | "--data_path",
24 | type=str,
25 | required=False,
26 | help="folder including the target set of wavs overriding dataset config.",
27 | )
28 | args, overrides = parser.parse_known_args()
29 |
30 | CONFIG = load_config(args.config_path)
31 | CONFIG.parse_known_args(overrides, relaxed_parser=True)
32 |
33 | # load config
34 | CONFIG.audio.signal_norm = False # do not apply earlier normalization
35 | CONFIG.audio.stats_path = None # discard pre-defined stats
36 |
37 | # load audio processor
38 | ap = AudioProcessor(**CONFIG.audio.to_dict())
39 |
40 | # load the meta data of target dataset
41 | if args.data_path:
42 | dataset_items = glob.glob(os.path.join(args.data_path, "**", "*.wav"), recursive=True)
43 | else:
44 | dataset_items = load_tts_samples(CONFIG.datasets)[0] # take only train data
45 | print(f" > There are {len(dataset_items)} files.")
46 |
47 | mel_sum = 0
48 | mel_square_sum = 0
49 | linear_sum = 0
50 | linear_square_sum = 0
51 | N = 0
52 | for item in tqdm(dataset_items):
53 | # compute features
54 | wav = ap.load_wav(item if isinstance(item, str) else item["audio_file"])
55 | linear = ap.spectrogram(wav)
56 | mel = ap.melspectrogram(wav)
57 |
58 | # compute stats
59 | N += mel.shape[1]
60 | mel_sum += mel.sum(1)
61 | linear_sum += linear.sum(1)
62 | mel_square_sum += (mel**2).sum(axis=1)
63 | linear_square_sum += (linear**2).sum(axis=1)
64 |
65 | mel_mean = mel_sum / N
66 | mel_scale = np.sqrt(mel_square_sum / N - mel_mean**2)
67 | linear_mean = linear_sum / N
68 | linear_scale = np.sqrt(linear_square_sum / N - linear_mean**2)
69 |
70 | output_file_path = args.out_path
71 | stats = {}
72 | stats["mel_mean"] = mel_mean
73 | stats["mel_std"] = mel_scale
74 | stats["linear_mean"] = linear_mean
75 | stats["linear_std"] = linear_scale
76 |
77 | print(f" > Avg mel spec mean: {mel_mean.mean()}")
78 | print(f" > Avg mel spec scale: {mel_scale.mean()}")
79 | print(f" > Avg linear spec mean: {linear_mean.mean()}")
80 | print(f" > Avg linear spec scale: {linear_scale.mean()}")
81 |
82 | # set default config values for mean-var scaling
83 | CONFIG.audio.stats_path = output_file_path
84 | CONFIG.audio.signal_norm = True
85 | # remove redundant values
86 | del CONFIG.audio.max_norm
87 | del CONFIG.audio.min_level_db
88 | del CONFIG.audio.symmetric_norm
89 | del CONFIG.audio.clip_norm
90 | stats["audio_config"] = CONFIG.audio.to_dict()
91 | np.save(output_file_path, stats, allow_pickle=True)
92 | print(f" > stats saved to {output_file_path}")
93 |
94 |
95 | if __name__ == "__main__":
96 | main()
97 |
--------------------------------------------------------------------------------
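A self-contained sketch of the running mean/std accumulation used above, assuming random arrays in place of real mel spectrograms (shape: n_mels x n_frames):

import numpy as np

rng = np.random.default_rng(0)
mels = [rng.random((80, int(rng.integers(50, 200)))) for _ in range(10)]

mel_sum = 0
mel_square_sum = 0
N = 0
for mel in mels:
    N += mel.shape[1]                        # total number of frames seen
    mel_sum += mel.sum(axis=1)               # per-band running sum
    mel_square_sum += (mel**2).sum(axis=1)   # per-band running sum of squares

mel_mean = mel_sum / N
mel_std = np.sqrt(mel_square_sum / N - mel_mean**2)  # sqrt(E[x^2] - E[x]^2)
print(mel_mean.shape, mel_std.shape)  # (80,) (80,)

--------------------------------------------------------------------------------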
/TTS/bin/eval_encoder.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from argparse import RawTextHelpFormatter
3 |
4 | import torch
5 | from tqdm import tqdm
6 |
7 | from TTS.config import load_config
8 | from TTS.tts.datasets import load_tts_samples
9 | from TTS.tts.utils.speakers import SpeakerManager
10 |
11 |
12 | def compute_encoder_accuracy(dataset_items, encoder_manager):
13 | class_name_key = encoder_manager.encoder_config.class_name_key
14 | map_classid_to_classname = getattr(encoder_manager.encoder_config, "map_classid_to_classname", None)
15 |
16 | class_acc_dict = {}
17 |
18 | # compute embeddings for all wav_files
19 | for item in tqdm(dataset_items):
20 | class_name = item[class_name_key]
21 | wav_file = item["audio_file"]
22 |
23 | # extract the embedding
24 | embedd = encoder_manager.compute_embedding_from_clip(wav_file)
25 | if encoder_manager.encoder_criterion is not None and map_classid_to_classname is not None:
26 | embedding = torch.FloatTensor(embedd).unsqueeze(0)
27 | if encoder_manager.use_cuda:
28 | embedding = embedding.cuda()
29 |
30 | class_id = encoder_manager.encoder_criterion.softmax.inference(embedding).item()
31 | predicted_label = map_classid_to_classname[str(class_id)]
32 | else:
33 | predicted_label = None
34 |
35 | if class_name is not None and predicted_label is not None:
36 | is_equal = int(class_name == predicted_label)
37 | if class_name not in class_acc_dict:
38 | class_acc_dict[class_name] = [is_equal]
39 | else:
40 | class_acc_dict[class_name].append(is_equal)
41 | else:
42 | raise RuntimeError("Error: class_name or/and predicted_label are None")
43 |
44 | acc_avg = 0
45 | for key, values in class_acc_dict.items():
46 | acc = sum(values) / len(values)
47 | print("Class", key, "Accuracy:", acc)
48 | acc_avg += acc
49 |
50 | print("Average Accuracy:", acc_avg / len(class_acc_dict))
51 |
52 |
53 | if __name__ == "__main__":
54 | parser = argparse.ArgumentParser(
55 | description="""Compute the accuracy of the encoder.\n\n"""
56 | """
57 | Example runs:
58 | python TTS/bin/eval_encoder.py emotion_encoder_model.pth emotion_encoder_config.json dataset_config.json
59 | """,
60 | formatter_class=RawTextHelpFormatter,
61 | )
62 | parser.add_argument("model_path", type=str, help="Path to model checkpoint file.")
63 | parser.add_argument(
64 | "config_path",
65 | type=str,
66 | help="Path to model config file.",
67 | )
68 |
69 | parser.add_argument(
70 | "config_dataset_path",
71 | type=str,
72 | help="Path to dataset config file.",
73 | )
74 | parser.add_argument("--use_cuda", type=bool, help="flag to set cuda.", default=True)
75 | parser.add_argument("--eval", type=bool, help="compute eval.", default=True)
76 |
77 | args = parser.parse_args()
78 |
79 | c_dataset = load_config(args.config_dataset_path)
80 |
81 | meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=args.eval)
82 | items = meta_data_train + meta_data_eval
83 |
84 | enc_manager = SpeakerManager(
85 | encoder_model_path=args.model_path, encoder_config_path=args.config_path, use_cuda=args.use_cuda
86 | )
87 |
88 | compute_encoder_accuracy(items, enc_manager)
89 |
--------------------------------------------------------------------------------
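A toy illustration of the per-class accuracy bookkeeping in compute_encoder_accuracy(), with made-up labels standing in for real encoder predictions:

# Hypothetical (class_name, predicted_label) pairs instead of encoder output.
class_acc_dict = {}
pairs = [("angry", "angry"), ("angry", "sad"), ("sad", "sad"), ("happy", "happy")]
for class_name, predicted_label in pairs:
    is_equal = int(class_name == predicted_label)
    class_acc_dict.setdefault(class_name, []).append(is_equal)

acc_avg = 0
for key, values in class_acc_dict.items():
    acc = sum(values) / len(values)
    print("Class", key, "Accuracy:", acc)   # angry 0.5, sad 1.0, happy 1.0
    acc_avg += acc

print("Average Accuracy:", acc_avg / len(class_acc_dict))  # ~0.833

--------------------------------------------------------------------------------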
/TTS/bin/find_unique_chars.py:
--------------------------------------------------------------------------------
1 | """Find all the unique characters in a dataset"""
2 | import argparse
3 | from argparse import RawTextHelpFormatter
4 |
5 | from TTS.config import load_config
6 | from TTS.tts.datasets import load_tts_samples
7 |
8 |
9 | def main():
10 | # pylint: disable=bad-option-value
11 | parser = argparse.ArgumentParser(
12 | description="""Find all the unique characters or phonemes in a dataset.\n\n"""
13 | """
14 | Example runs:
15 |
16 | python TTS/bin/find_unique_chars.py --config_path config.json
17 | """,
18 | formatter_class=RawTextHelpFormatter,
19 | )
20 | parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True)
21 | args = parser.parse_args()
22 |
23 | c = load_config(args.config_path)
24 |
25 | # load all datasets
26 | train_items, eval_items = load_tts_samples(
27 | c.datasets, eval_split=True, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size
28 | )
29 |
30 | items = train_items + eval_items
31 |
32 | texts = "".join(item["text"] for item in items)
33 | chars = set(texts)
34 | lower_chars = filter(lambda c: c.islower(), chars)
35 | chars_force_lower = [c.lower() for c in chars]
36 | chars_force_lower = set(chars_force_lower)
37 |
38 | print(f" > Number of unique characters: {len(chars)}")
39 | print(f" > Unique characters: {''.join(sorted(chars))}")
40 | print(f" > Unique lower characters: {''.join(sorted(lower_chars))}")
41 | print(f" > Unique all forced to lower characters: {''.join(sorted(chars_force_lower))}")
42 |
43 |
44 | if __name__ == "__main__":
45 | main()
46 |
--------------------------------------------------------------------------------
/TTS/bin/find_unique_phonemes.py:
--------------------------------------------------------------------------------
1 | """Find all the unique phonemes in a dataset."""
2 | import argparse
3 | import multiprocessing
4 | from argparse import RawTextHelpFormatter
5 |
6 | from tqdm.contrib.concurrent import process_map
7 |
8 | from TTS.config import load_config
9 | from TTS.tts.datasets import load_tts_samples
10 | from TTS.tts.utils.text.phonemizers import Gruut
11 |
12 |
13 | def compute_phonemes(item):
14 | text = item["text"]
15 | ph = phonemizer.phonemize(text).replace("|", "")
16 | return set(list(ph))
17 |
18 |
19 | def main():
20 | # pylint: disable=W0601
21 | global c, phonemizer
22 | # pylint: disable=bad-option-value
23 | parser = argparse.ArgumentParser(
24 | description="""Find all the unique characters or phonemes in a dataset.\n\n"""
25 | """
26 | Example runs:
27 |
28 | python TTS/bin/find_unique_phonemes.py --config_path config.json
29 | """,
30 | formatter_class=RawTextHelpFormatter,
31 | )
32 | parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True)
33 | args = parser.parse_args()
34 |
35 | c = load_config(args.config_path)
36 |
37 | # load all datasets
38 | train_items, eval_items = load_tts_samples(
39 | c.datasets, eval_split=True, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size
40 | )
41 | items = train_items + eval_items
42 | print("Num items:", len(items))
43 |
44 | language_list = [item["language"] for item in items]
45 | is_lang_def = all(language_list)
46 |
47 | if not c.phoneme_language or not is_lang_def:
48 | raise ValueError("Phoneme language must be defined in config.")
49 |
50 | if not language_list.count(language_list[0]) == len(language_list):
51 | raise ValueError(
52 | "Currently, just one phoneme language per config file is supported !! Please split the dataset config into different configs and run it individually for each language !!"
53 | )
54 |
55 | phonemizer = Gruut(language=language_list[0], keep_puncs=True)
56 |
57 | phonemes = process_map(compute_phonemes, items, max_workers=multiprocessing.cpu_count(), chunksize=15)
58 | phones = []
59 | for ph in phonemes:
60 | phones.extend(ph)
61 |
62 | phones = set(phones)
63 | lower_phones = filter(lambda c: c.islower(), phones)
64 | phones_force_lower = [c.lower() for c in phones]
65 | phones_force_lower = set(phones_force_lower)
66 |
67 | print(f" > Number of unique phonemes: {len(phones)}")
68 | print(f" > Unique phonemes: {''.join(sorted(phones))}")
69 | print(f" > Unique lower phonemes: {''.join(sorted(lower_phones))}")
70 | print(f" > Unique all forced to lower phonemes: {''.join(sorted(phones_force_lower))}")
71 |
72 |
73 | if __name__ == "__main__":
74 | main()
75 |
--------------------------------------------------------------------------------
/TTS/bin/remove_silence_using_vad.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import glob
3 | import os
4 | import pathlib
5 |
6 | from tqdm import tqdm
7 |
8 | from TTS.utils.vad import get_vad_model_and_utils, remove_silence
9 |
10 |
11 | def adjust_path_and_remove_silence(audio_path):
12 | output_path = audio_path.replace(os.path.join(args.input_dir, ""), os.path.join(args.output_dir, ""))
13 | # ignore if the file exists
14 | if os.path.exists(output_path) and not args.force:
15 | return output_path
16 |
17 | # create all directory structure
18 | pathlib.Path(output_path).parent.mkdir(parents=True, exist_ok=True)
19 | # remove the silence and save the audio
20 | output_path, is_speech = remove_silence(
21 | model_and_utils,
22 | audio_path,
23 | output_path,
24 | trim_just_beginning_and_end=args.trim_just_beginning_and_end,
25 | use_cuda=args.use_cuda,
26 | )
27 |
28 | return output_path, is_speech
29 |
30 |
31 | def preprocess_audios():
32 | files = sorted(glob.glob(os.path.join(args.input_dir, args.glob), recursive=True))
33 | print("> Number of files: ", len(files))
34 | if not args.force:
35 |         print("> Ignoring files that already exist in the output directory.")
36 |
37 | if args.trim_just_beginning_and_end:
38 | print("> Trimming just the beginning and the end with nonspeech parts.")
39 | else:
40 | print("> Trimming all nonspeech parts.")
41 |
42 | filtered_files = []
43 | if files:
44 | # create threads
45 | # num_threads = multiprocessing.cpu_count()
46 | # process_map(adjust_path_and_remove_silence, files, max_workers=num_threads, chunksize=15)
47 | for f in tqdm(files):
48 | output_path, is_speech = adjust_path_and_remove_silence(f)
49 | if not is_speech:
50 | filtered_files.append(output_path)
51 |
52 | # write files that do not have speech
53 | with open(os.path.join(args.output_dir, "filtered_files.txt"), "w", encoding="utf-8") as f:
54 | for file in filtered_files:
55 | f.write(file + "\n")
56 | else:
57 | print("> No files Found !")
58 |
59 |
60 | if __name__ == "__main__":
61 | parser = argparse.ArgumentParser(
62 | description="python TTS/bin/remove_silence_using_vad.py -i=VCTK-Corpus/ -o=VCTK-Corpus-removed-silence/ -g=wav48_silence_trimmed/*/*_mic1.flac --trim_just_beginning_and_end True"
63 | )
64 | parser.add_argument("-i", "--input_dir", type=str, default="../VCTK-Corpus", help="Dataset root dir")
65 | parser.add_argument(
66 | "-o", "--output_dir", type=str, default="../VCTK-Corpus-removed-silence", help="Output Dataset dir"
67 | )
68 |     parser.add_argument("-f", "--force", default=False, action="store_true", help="Force overwriting existing files")
69 | parser.add_argument(
70 | "-g",
71 | "--glob",
72 | type=str,
73 | default="**/*.wav",
74 |         help="Path in glob format to access wavs from input_dir, e.g. wav48/*/*.wav",
75 | )
76 | parser.add_argument(
77 | "-t",
78 | "--trim_just_beginning_and_end",
79 | type=bool,
80 | default=True,
81 |         help="If True, this script will trim just the beginning and end nonspeech parts. If False, all nonspeech parts will be trimmed. Default True",
82 | )
83 | parser.add_argument(
84 | "-c",
85 | "--use_cuda",
86 | type=bool,
87 | default=False,
88 | help="If True use cuda",
89 | )
90 | args = parser.parse_args()
91 | # load the model and utils
92 | model_and_utils = get_vad_model_and_utils(use_cuda=args.use_cuda)
93 | preprocess_audios()
94 |
--------------------------------------------------------------------------------
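A single-file sketch of the VAD helpers the script builds on, mirroring what adjust_path_and_remove_silence() does above; the input and output paths are placeholders:

from TTS.utils.vad import get_vad_model_and_utils, remove_silence

# Load the VAD model once, then trim one clip (paths are hypothetical).
model_and_utils = get_vad_model_and_utils(use_cuda=False)
output_path, is_speech = remove_silence(
    model_and_utils,
    "dataset/p225/p225_001.wav",            # input clip
    "dataset_trimmed/p225/p225_001.wav",    # where the trimmed clip is written
    trim_just_beginning_and_end=True,
    use_cuda=False,
)
if not is_speech:
    print("No speech detected in:", output_path)

--------------------------------------------------------------------------------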
/TTS/bin/resample.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import glob
3 | import os
4 | from argparse import RawTextHelpFormatter
5 | from multiprocessing import Pool
6 | from shutil import copytree
7 |
8 | import librosa
9 | import soundfile as sf
10 | from tqdm import tqdm
11 |
12 |
13 | def resample_file(func_args):
14 | filename, output_sr = func_args
15 | y, sr = librosa.load(filename, sr=output_sr)
16 | sf.write(filename, y, sr)
17 |
18 |
19 | def resample_files(input_dir, output_sr, output_dir=None, file_ext="wav", n_jobs=10):
20 | if output_dir:
21 | print("Recursively copying the input folder...")
22 | copytree(input_dir, output_dir)
23 | input_dir = output_dir
24 |
25 | print("Resampling the audio files...")
26 | audio_files = glob.glob(os.path.join(input_dir, f"**/*.{file_ext}"), recursive=True)
27 | print(f"Found {len(audio_files)} files...")
28 | audio_files = list(zip(audio_files, len(audio_files) * [output_sr]))
29 | with Pool(processes=n_jobs) as p:
30 | with tqdm(total=len(audio_files)) as pbar:
31 | for _, _ in enumerate(p.imap_unordered(resample_file, audio_files)):
32 | pbar.update()
33 |
34 | print("Done !")
35 |
36 |
37 | if __name__ == "__main__":
38 | parser = argparse.ArgumentParser(
39 |         description="""Resample a folder recursively with librosa
40 | Can be used in place or create a copy of the folder as an output.\n\n
41 | Example run:
42 | python TTS/bin/resample.py
43 | --input_dir /root/LJSpeech-1.1/
44 | --output_sr 22050
45 | --output_dir /root/resampled_LJSpeech-1.1/
46 | --file_ext wav
47 | --n_jobs 24
48 | """,
49 | formatter_class=RawTextHelpFormatter,
50 | )
51 |
52 | parser.add_argument(
53 | "--input_dir",
54 | type=str,
55 | default=None,
56 | required=True,
57 | help="Path of the folder containing the audio files to resample",
58 | )
59 |
60 | parser.add_argument(
61 | "--output_sr",
62 | type=int,
63 | default=22050,
64 | required=False,
65 |         help="Sample rate to which the audio files should be resampled",
66 | )
67 |
68 | parser.add_argument(
69 | "--output_dir",
70 | type=str,
71 | default=None,
72 | required=False,
73 | help="Path of the destination folder. If not defined, the operation is done in place",
74 | )
75 |
76 | parser.add_argument(
77 | "--file_ext",
78 | type=str,
79 | default="wav",
80 | required=False,
81 | help="Extension of the audio files to resample",
82 | )
83 |
84 | parser.add_argument(
85 | "--n_jobs", type=int, default=None, help="Number of threads to use, by default it uses all cores"
86 | )
87 |
88 | args = parser.parse_args()
89 |
90 | resample_files(args.input_dir, args.output_sr, args.output_dir, args.file_ext, args.n_jobs)
91 |
--------------------------------------------------------------------------------
/TTS/bin/train_tts.py:
--------------------------------------------------------------------------------
1 | import os
2 | from dataclasses import dataclass, field
3 |
4 | from trainer import Trainer, TrainerArgs
5 |
6 | from TTS.config import load_config, register_config
7 | from TTS.tts.datasets import load_tts_samples
8 | from TTS.tts.models import setup_model
9 |
10 |
11 | @dataclass
12 | class TrainTTSArgs(TrainerArgs):
13 | config_path: str = field(default=None, metadata={"help": "Path to the config file."})
14 |
15 |
16 | def main():
17 |     """Run `tts` model training directly from a `config.json` file."""
18 | # init trainer args
19 | train_args = TrainTTSArgs()
20 | parser = train_args.init_argparse(arg_prefix="")
21 |
22 |     # override trainer args from command-line args
23 | args, config_overrides = parser.parse_known_args()
24 | train_args.parse_args(args)
25 |
26 | # load config.json and register
27 | if args.config_path or args.continue_path:
28 | if args.config_path:
29 | # init from a file
30 | config = load_config(args.config_path)
31 | if len(config_overrides) > 0:
32 | config.parse_known_args(config_overrides, relaxed_parser=True)
33 | elif args.continue_path:
34 | # continue from a prev experiment
35 | config = load_config(os.path.join(args.continue_path, "config.json"))
36 | if len(config_overrides) > 0:
37 | config.parse_known_args(config_overrides, relaxed_parser=True)
38 | else:
39 | # init from console args
40 | from TTS.config.shared_configs import BaseTrainingConfig # pylint: disable=import-outside-toplevel
41 |
42 | config_base = BaseTrainingConfig()
43 | config_base.parse_known_args(config_overrides)
44 | config = register_config(config_base.model)()
45 |
46 | # load training samples
47 | train_samples, eval_samples = load_tts_samples(
48 | config.datasets,
49 | eval_split=True,
50 | eval_split_max_size=config.eval_split_max_size,
51 | eval_split_size=config.eval_split_size,
52 | )
53 |
54 | # init the model from config
55 | model = setup_model(config, train_samples + eval_samples)
56 |
57 | # init the trainer and 🚀
58 | trainer = Trainer(
59 | train_args,
60 | model.config,
61 | config.output_path,
62 | model=model,
63 | train_samples=train_samples,
64 | eval_samples=eval_samples,
65 | parse_command_line_args=False,
66 | )
67 | trainer.fit()
68 |
69 |
70 | if __name__ == "__main__":
71 | main()
72 |
--------------------------------------------------------------------------------
/TTS/bin/train_vocoder.py:
--------------------------------------------------------------------------------
1 | import os
2 | from dataclasses import dataclass, field
3 |
4 | from trainer import Trainer, TrainerArgs
5 |
6 | from TTS.config import load_config, register_config
7 | from TTS.utils.audio import AudioProcessor
8 | from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data
9 | from TTS.vocoder.models import setup_model
10 |
11 |
12 | @dataclass
13 | class TrainVocoderArgs(TrainerArgs):
14 | config_path: str = field(default=None, metadata={"help": "Path to the config file."})
15 |
16 |
17 | def main():
18 |     """Run `vocoder` model training directly from a `config.json` file."""
19 | # init trainer args
20 | train_args = TrainVocoderArgs()
21 | parser = train_args.init_argparse(arg_prefix="")
22 |
23 |     # override trainer args from command-line args
24 | args, config_overrides = parser.parse_known_args()
25 | train_args.parse_args(args)
26 |
27 | # load config.json and register
28 | if args.config_path or args.continue_path:
29 | if args.config_path:
30 | # init from a file
31 | config = load_config(args.config_path)
32 | if len(config_overrides) > 0:
33 | config.parse_known_args(config_overrides, relaxed_parser=True)
34 | elif args.continue_path:
35 | # continue from a prev experiment
36 | config = load_config(os.path.join(args.continue_path, "config.json"))
37 | if len(config_overrides) > 0:
38 | config.parse_known_args(config_overrides, relaxed_parser=True)
39 | else:
40 | # init from console args
41 | from TTS.config.shared_configs import BaseTrainingConfig # pylint: disable=import-outside-toplevel
42 |
43 | config_base = BaseTrainingConfig()
44 | config_base.parse_known_args(config_overrides)
45 | config = register_config(config_base.model)()
46 |
47 | # load training samples
48 | if "feature_path" in config and config.feature_path:
49 | # load pre-computed features
50 | print(f" > Loading features from: {config.feature_path}")
51 | eval_samples, train_samples = load_wav_feat_data(config.data_path, config.feature_path, config.eval_split_size)
52 | else:
53 | # load data raw wav files
54 | eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)
55 |
56 | # setup audio processor
57 | ap = AudioProcessor(**config.audio)
58 |
59 | # init the model from config
60 | model = setup_model(config)
61 |
62 | # init the trainer and 🚀
63 | trainer = Trainer(
64 | train_args,
65 | config,
66 | config.output_path,
67 | model=model,
68 | train_samples=train_samples,
69 | eval_samples=eval_samples,
70 | training_assets={"audio_processor": ap},
71 | parse_command_line_args=False,
72 | )
73 | trainer.fit()
74 |
75 |
76 | if __name__ == "__main__":
77 | main()
78 |
--------------------------------------------------------------------------------
/TTS/bin/tune_wavegrad.py:
--------------------------------------------------------------------------------
1 | """Search for a good WaveGrad noise schedule for a given number of inference iterations."""
2 | import argparse
3 | from itertools import product as cartesian_product
4 |
5 | import numpy as np
6 | import torch
7 | from torch.utils.data import DataLoader
8 | from tqdm import tqdm
9 |
10 | from TTS.config import load_config
11 | from TTS.utils.audio import AudioProcessor
12 | from TTS.vocoder.datasets.preprocess import load_wav_data
13 | from TTS.vocoder.datasets.wavegrad_dataset import WaveGradDataset
14 | from TTS.vocoder.models import setup_model
15 |
16 | if __name__ == "__main__":
17 | parser = argparse.ArgumentParser()
18 | parser.add_argument("--model_path", type=str, help="Path to model checkpoint.")
19 | parser.add_argument("--config_path", type=str, help="Path to model config file.")
20 | parser.add_argument("--data_path", type=str, help="Path to data directory.")
21 | parser.add_argument("--output_path", type=str, help="path for output file including file name and extension.")
22 | parser.add_argument(
23 | "--num_iter",
24 | type=int,
25 |         help="Number of model inference iterations to optimize the noise schedule for.",
26 | )
27 | parser.add_argument("--use_cuda", action="store_true", help="enable CUDA.")
28 | parser.add_argument("--num_samples", type=int, default=1, help="Number of datasamples used for inference.")
29 | parser.add_argument(
30 | "--search_depth",
31 | type=int,
32 | default=3,
33 | help="Search granularity. Increasing this increases the run-time exponentially.",
34 | )
35 |
36 | # load config
37 | args = parser.parse_args()
38 | config = load_config(args.config_path)
39 |
40 | # setup audio processor
41 | ap = AudioProcessor(**config.audio)
42 |
43 | # load dataset
44 | _, train_data = load_wav_data(args.data_path, 0)
45 | train_data = train_data[: args.num_samples]
46 | dataset = WaveGradDataset(
47 | ap=ap,
48 | items=train_data,
49 | seq_len=-1,
50 | hop_len=ap.hop_length,
51 | pad_short=config.pad_short,
52 | conv_pad=config.conv_pad,
53 | is_training=True,
54 | return_segments=False,
55 | use_noise_augment=False,
56 | use_cache=False,
57 | verbose=True,
58 | )
59 | loader = DataLoader(
60 | dataset,
61 | batch_size=1,
62 | shuffle=False,
63 | collate_fn=dataset.collate_full_clips,
64 | drop_last=False,
65 | num_workers=config.num_loader_workers,
66 | pin_memory=False,
67 | )
68 |
69 | # setup the model
70 | model = setup_model(config)
71 | if args.use_cuda:
72 | model.cuda()
73 |
74 | # setup optimization parameters
75 | base_values = sorted(10 * np.random.uniform(size=args.search_depth))
76 | print(f" > base values: {base_values}")
77 | exponents = 10 ** np.linspace(-6, -1, num=args.num_iter)
78 | best_error = float("inf")
79 | best_schedule = None # pylint: disable=C0103
80 | total_search_iter = len(base_values) ** args.num_iter
81 | for base in tqdm(cartesian_product(base_values, repeat=args.num_iter), total=total_search_iter):
82 | beta = exponents * base
83 | model.compute_noise_level(beta)
84 | for data in loader:
85 | mel, audio = data
86 | y_hat = model.inference(mel.cuda() if args.use_cuda else mel)
87 |
88 | if args.use_cuda:
89 | y_hat = y_hat.cpu()
90 | y_hat = y_hat.numpy()
91 |
92 | mel_hat = []
93 | for i in range(y_hat.shape[0]):
94 | m = ap.melspectrogram(y_hat[i, 0])[:, :-1]
95 | mel_hat.append(torch.from_numpy(m))
96 |
97 | mel_hat = torch.stack(mel_hat)
98 | mse = torch.sum((mel - mel_hat) ** 2).mean()
99 | if mse.item() < best_error:
100 | best_error = mse.item()
101 | best_schedule = {"beta": beta}
102 | print(f" > Found a better schedule. - MSE: {mse.item()}")
103 | np.save(args.output_path, best_schedule)
104 |
--------------------------------------------------------------------------------
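A small sketch of how the candidate noise schedules above are enumerated: one fixed log-spaced exponent curve is scaled element-wise by every combination of base values, giving search_depth ** num_iter candidate beta schedules (num_iter and search_depth below are made-up small values):

import numpy as np
from itertools import product as cartesian_product

num_iter, search_depth = 5, 3
base_values = sorted(10 * np.random.uniform(size=search_depth))
exponents = 10 ** np.linspace(-6, -1, num=num_iter)

# Each candidate schedule is the exponent curve scaled by one base value per step.
schedules = [exponents * np.array(base) for base in cartesian_product(base_values, repeat=num_iter)]
print(len(schedules), schedules[0].shape)  # 243 candidate betas, each of length 5

--------------------------------------------------------------------------------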
/TTS/config/__init__.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import re
4 | from typing import Dict
5 |
6 | import fsspec
7 | import yaml
8 | from coqpit import Coqpit
9 |
10 | from TTS.config.shared_configs import *
11 | from TTS.utils.generic_utils import find_module
12 |
13 |
14 | def read_json_with_comments(json_path):
15 | """for backward compat."""
16 | # fallback to json
17 | with fsspec.open(json_path, "r", encoding="utf-8") as f:
18 | input_str = f.read()
19 | # handle comments
20 | input_str = re.sub(r"\\\n", "", input_str)
21 | input_str = re.sub(r"//.*\n", "\n", input_str)
22 | data = json.loads(input_str)
23 | return data
24 |
25 |
26 | def register_config(model_name: str) -> Coqpit:
27 | """Find the right config for the given model name.
28 |
29 | Args:
30 | model_name (str): Model name.
31 |
32 | Raises:
33 | ModuleNotFoundError: No matching config for the model name.
34 |
35 | Returns:
36 | Coqpit: config class.
37 | """
38 | config_class = None
39 | config_name = model_name + "_config"
40 | paths = ["TTS.tts.configs", "TTS.vocoder.configs", "TTS.encoder.configs"]
41 | for path in paths:
42 | try:
43 | config_class = find_module(path, config_name)
44 | except ModuleNotFoundError:
45 | pass
46 | if config_class is None:
47 | raise ModuleNotFoundError(f" [!] Config for {model_name} cannot be found.")
48 | return config_class
49 |
50 |
51 | def _process_model_name(config_dict: Dict) -> str:
52 | """Format the model name as expected. It is a band-aid for the old `vocoder` model names.
53 |
54 | Args:
55 | config_dict (Dict): A dictionary including the config fields.
56 |
57 | Returns:
58 | str: Formatted modelname.
59 | """
60 | model_name = config_dict["model"] if "model" in config_dict else config_dict["generator_model"]
61 | model_name = model_name.replace("_generator", "").replace("_discriminator", "")
62 | return model_name
63 |
64 |
65 | def load_config(config_path: str) -> Coqpit:
66 | """Import `json` or `yaml` files as TTS configs. First, load the input file as a `dict` and check the model name
67 | to find the corresponding Config class. Then initialize the Config.
68 |
69 | Args:
70 | config_path (str): path to the config file.
71 |
72 | Raises:
73 | TypeError: given config file has an unknown type.
74 |
75 | Returns:
76 | Coqpit: TTS config object.
77 | """
78 | config_dict = {}
79 | ext = os.path.splitext(config_path)[1]
80 | if ext in (".yml", ".yaml"):
81 | with fsspec.open(config_path, "r", encoding="utf-8") as f:
82 | data = yaml.safe_load(f)
83 | elif ext == ".json":
84 | try:
85 | with fsspec.open(config_path, "r", encoding="utf-8") as f:
86 | data = json.load(f)
87 | except json.decoder.JSONDecodeError:
88 | # backwards compat.
89 | data = read_json_with_comments(config_path)
90 | else:
91 | raise TypeError(f" [!] Unknown config file type {ext}")
92 | config_dict.update(data)
93 | model_name = _process_model_name(config_dict)
94 | config_class = register_config(model_name.lower())
95 | config = config_class()
96 | config.from_dict(config_dict)
97 | return config
98 |
99 |
100 | def check_config_and_model_args(config, arg_name, value):
101 |     """Check the given argument in `config.model_args`, if it exists, or in `config` for
102 | the given value.
103 |
104 | Return False if the argument does not exist in `config.model_args` or `config`.
105 | This is to patch up the compatibility between models with and without `model_args`.
106 |
107 | TODO: Remove this in the future with a unified approach.
108 | """
109 | if hasattr(config, "model_args"):
110 | if arg_name in config.model_args:
111 | return config.model_args[arg_name] == value
112 | if hasattr(config, arg_name):
113 | return config[arg_name] == value
114 | return False
115 |
116 |
117 | def get_from_config_or_model_args(config, arg_name):
118 |     """Get the given argument from `config.model_args` if it exists, otherwise from `config`."""
119 | if hasattr(config, "model_args"):
120 | if arg_name in config.model_args:
121 | return config.model_args[arg_name]
122 | return config[arg_name]
123 |
124 |
125 | def get_from_config_or_model_args_with_default(config, arg_name, def_val):
126 |     """Get the given argument from `config.model_args` if it exists, otherwise from `config`, falling back to a default value."""
127 | if hasattr(config, "model_args"):
128 | if arg_name in config.model_args:
129 | return config.model_args[arg_name]
130 | if hasattr(config, arg_name):
131 | return config[arg_name]
132 | return def_val
133 |
--------------------------------------------------------------------------------
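A brief usage sketch of the helpers above; "config.json" is a placeholder path, and the concrete class returned by load_config() depends on the config's "model" field:

from TTS.config import load_config, register_config

config = load_config("config.json")             # hypothetical path to a model config
print(type(config).__name__, config.model)

Tacotron2Config = register_config("tacotron2")  # resolves TTS.tts.configs.tacotron2_config
config2 = Tacotron2Config()                     # config object with default values

--------------------------------------------------------------------------------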
/TTS/encoder/README.md:
--------------------------------------------------------------------------------
1 | ### Speaker Encoder
2 |
3 | This is an implementation of https://arxiv.org/abs/1710.10467. This model can be used for voice and speaker embedding.
4 |
5 | With the code here you can generate d-vectors for both multi-speaker and single-speaker TTS datasets, then visualise and explore them along with the associated audio files in an interactive chart.
6 |
7 | Below is an example showing embedding results of various speakers. You can generate the same plot with the provided notebook as demonstrated in [this video](https://youtu.be/KW3oO7JVa7Q).
8 |
9 | 
10 |
11 | Download a pretrained model from [Released Models](https://github.com/mozilla/TTS/wiki/Released-Models) page.
12 |
13 | To run the code, you need to follow the same flow as in TTS.
14 |
15 | - Define 'config.json' for your needs. Note that audio parameters should match your TTS model.
16 | - Example training call ```python speaker_encoder/train.py --config_path speaker_encoder/config.json --data_path ~/Data/Libri-TTS/train-clean-360```
17 | - Generate embedding vectors ```python speaker_encoder/compute_embeddings.py --use_cuda true /model/path/best_model.pth model/config/path/config.json dataset/path/ output_path``` . This code parses all .wav files at the given dataset path and generates the same folder structure under the output path with the generated embedding files.
18 | - Watch training on Tensorboard as in TTS
19 |
--------------------------------------------------------------------------------
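A minimal sketch of computing a d-vector for one clip with a trained encoder, mirroring how TTS/bin/eval_encoder.py uses SpeakerManager; the checkpoint, config, and wav paths are placeholders:

from TTS.tts.utils.speakers import SpeakerManager

manager = SpeakerManager(
    encoder_model_path="best_model.pth",   # hypothetical encoder checkpoint
    encoder_config_path="config.json",     # hypothetical encoder config
    use_cuda=False,
)
embedding = manager.compute_embedding_from_clip("sample.wav")  # d-vector, e.g. 256 values
print(len(embedding))

--------------------------------------------------------------------------------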
/TTS/encoder/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/everydaycodings/MimicMania/eecbaae2c430785145afe48bc92a7634ab9fe660/TTS/encoder/__init__.py
--------------------------------------------------------------------------------
/TTS/encoder/configs/base_encoder_config.py:
--------------------------------------------------------------------------------
1 | from dataclasses import asdict, dataclass, field
2 | from typing import Dict, List
3 |
4 | from coqpit import MISSING
5 |
6 | from TTS.config.shared_configs import BaseAudioConfig, BaseDatasetConfig, BaseTrainingConfig
7 |
8 |
9 | @dataclass
10 | class BaseEncoderConfig(BaseTrainingConfig):
11 | """Defines parameters for a Generic Encoder model."""
12 |
13 | model: str = None
14 | audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
15 | datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()])
16 | # model params
17 | model_params: Dict = field(
18 | default_factory=lambda: {
19 | "model_name": "lstm",
20 | "input_dim": 80,
21 | "proj_dim": 256,
22 | "lstm_dim": 768,
23 | "num_lstm_layers": 3,
24 | "use_lstm_with_projection": True,
25 | }
26 | )
27 |
28 | audio_augmentation: Dict = field(default_factory=lambda: {})
29 |
30 | # training params
31 | epochs: int = 10000
32 | loss: str = "angleproto"
33 | grad_clip: float = 3.0
34 | lr: float = 0.0001
35 | optimizer: str = "radam"
36 | optimizer_params: Dict = field(default_factory=lambda: {"betas": [0.9, 0.999], "weight_decay": 0})
37 | lr_decay: bool = False
38 | warmup_steps: int = 4000
39 |
40 | # logging params
41 | tb_model_param_stats: bool = False
42 | steps_plot_stats: int = 10
43 | save_step: int = 1000
44 | print_step: int = 20
45 | run_eval: bool = False
46 |
47 | # data loader
48 | num_classes_in_batch: int = MISSING
49 | num_utter_per_class: int = MISSING
50 | eval_num_classes_in_batch: int = None
51 | eval_num_utter_per_class: int = None
52 |
53 | num_loader_workers: int = MISSING
54 | voice_len: float = 1.6
55 |
56 | def check_values(self):
57 | super().check_values()
58 | c = asdict(self)
59 | assert (
60 | c["model_params"]["input_dim"] == self.audio.num_mels
61 |         ), " [!] model input dimension must be equal to melspectrogram dimension."
62 |
--------------------------------------------------------------------------------
/TTS/encoder/configs/emotion_encoder_config.py:
--------------------------------------------------------------------------------
1 | from dataclasses import asdict, dataclass
2 |
3 | from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig
4 |
5 |
6 | @dataclass
7 | class EmotionEncoderConfig(BaseEncoderConfig):
8 | """Defines parameters for Emotion Encoder model."""
9 |
10 | model: str = "emotion_encoder"
11 | map_classid_to_classname: dict = None
12 | class_name_key: str = "emotion_name"
13 |
--------------------------------------------------------------------------------
/TTS/encoder/configs/speaker_encoder_config.py:
--------------------------------------------------------------------------------
1 | from dataclasses import asdict, dataclass
2 |
3 | from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig
4 |
5 |
6 | @dataclass
7 | class SpeakerEncoderConfig(BaseEncoderConfig):
8 | """Defines parameters for Speaker Encoder model."""
9 |
10 | model: str = "speaker_encoder"
11 | class_name_key: str = "speaker_name"
12 |
--------------------------------------------------------------------------------
/TTS/encoder/models/lstm.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 |
4 | from TTS.encoder.models.base_encoder import BaseEncoder
5 |
6 |
7 | class LSTMWithProjection(nn.Module):
8 | def __init__(self, input_size, hidden_size, proj_size):
9 | super().__init__()
10 | self.input_size = input_size
11 | self.hidden_size = hidden_size
12 | self.proj_size = proj_size
13 | self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
14 | self.linear = nn.Linear(hidden_size, proj_size, bias=False)
15 |
16 | def forward(self, x):
17 | self.lstm.flatten_parameters()
18 | o, (_, _) = self.lstm(x)
19 | return self.linear(o)
20 |
21 |
22 | class LSTMWithoutProjection(nn.Module):
23 | def __init__(self, input_dim, lstm_dim, proj_dim, num_lstm_layers):
24 | super().__init__()
25 | self.lstm = nn.LSTM(input_size=input_dim, hidden_size=lstm_dim, num_layers=num_lstm_layers, batch_first=True)
26 | self.linear = nn.Linear(lstm_dim, proj_dim, bias=True)
27 | self.relu = nn.ReLU()
28 |
29 | def forward(self, x):
30 | _, (hidden, _) = self.lstm(x)
31 | return self.relu(self.linear(hidden[-1]))
32 |
33 |
34 | class LSTMSpeakerEncoder(BaseEncoder):
35 | def __init__(
36 | self,
37 | input_dim,
38 | proj_dim=256,
39 | lstm_dim=768,
40 | num_lstm_layers=3,
41 | use_lstm_with_projection=True,
42 | use_torch_spec=False,
43 | audio_config=None,
44 | ):
45 | super().__init__()
46 | self.use_lstm_with_projection = use_lstm_with_projection
47 | self.use_torch_spec = use_torch_spec
48 | self.audio_config = audio_config
49 | self.proj_dim = proj_dim
50 |
51 | layers = []
52 |         # choose the LSTM layer type
53 | if use_lstm_with_projection:
54 | layers.append(LSTMWithProjection(input_dim, lstm_dim, proj_dim))
55 | for _ in range(num_lstm_layers - 1):
56 | layers.append(LSTMWithProjection(proj_dim, lstm_dim, proj_dim))
57 | self.layers = nn.Sequential(*layers)
58 | else:
59 | self.layers = LSTMWithoutProjection(input_dim, lstm_dim, proj_dim, num_lstm_layers)
60 |
61 | self.instancenorm = nn.InstanceNorm1d(input_dim)
62 |
63 | if self.use_torch_spec:
64 | self.torch_spec = self.get_torch_mel_spectrogram_class(audio_config)
65 | else:
66 | self.torch_spec = None
67 |
68 | self._init_layers()
69 |
70 | def _init_layers(self):
71 | for name, param in self.layers.named_parameters():
72 | if "bias" in name:
73 | nn.init.constant_(param, 0.0)
74 | elif "weight" in name:
75 | nn.init.xavier_normal_(param)
76 |
77 | def forward(self, x, l2_norm=True):
78 | """Forward pass of the model.
79 |
80 | Args:
81 | x (Tensor): Raw waveform signal or spectrogram frames. If input is a waveform, `torch_spec` must be `True`
82 | to compute the spectrogram on-the-fly.
83 | l2_norm (bool): Whether to L2-normalize the outputs.
84 |
85 | Shapes:
86 | - x: :math:`(N, 1, T_{in})` or :math:`(N, D_{spec}, T_{in})`
87 | """
88 | with torch.no_grad():
89 | with torch.cuda.amp.autocast(enabled=False):
90 | if self.use_torch_spec:
91 | x.squeeze_(1)
92 | x = self.torch_spec(x)
93 | x = self.instancenorm(x).transpose(1, 2)
94 | d = self.layers(x)
95 | if self.use_lstm_with_projection:
96 | d = d[:, -1]
97 | if l2_norm:
98 | d = torch.nn.functional.normalize(d, p=2, dim=1)
99 | return d
100 |
--------------------------------------------------------------------------------
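A quick shape check for LSTMSpeakerEncoder with pre-computed spectrogram input (use_torch_spec=False), assuming a batch of 4 clips with 80 mel bands and 100 frames each:

import torch

from TTS.encoder.models.lstm import LSTMSpeakerEncoder

model = LSTMSpeakerEncoder(input_dim=80, proj_dim=256, lstm_dim=768, num_lstm_layers=3)
x = torch.randn(4, 80, 100)          # (N, D_spec, T_in)
d = model.forward(x, l2_norm=True)
print(d.shape)                       # torch.Size([4, 256])
print(torch.linalg.norm(d, dim=1))   # ~1.0 per row after L2 normalization

--------------------------------------------------------------------------------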
/TTS/encoder/requirements.txt:
--------------------------------------------------------------------------------
1 | umap-learn
2 | numpy>=1.17.0
3 |
--------------------------------------------------------------------------------
/TTS/encoder/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/everydaycodings/MimicMania/eecbaae2c430785145afe48bc92a7634ab9fe660/TTS/encoder/utils/__init__.py
--------------------------------------------------------------------------------
/TTS/encoder/utils/io.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import os
3 |
4 | from TTS.utils.io import save_fsspec
5 |
6 |
7 | def save_checkpoint(model, optimizer, model_loss, out_path, current_step):
8 | checkpoint_path = "checkpoint_{}.pth".format(current_step)
9 | checkpoint_path = os.path.join(out_path, checkpoint_path)
10 | print(" | | > Checkpoint saving : {}".format(checkpoint_path))
11 |
12 | new_state_dict = model.state_dict()
13 | state = {
14 | "model": new_state_dict,
15 | "optimizer": optimizer.state_dict() if optimizer is not None else None,
16 | "step": current_step,
17 | "loss": model_loss,
18 | "date": datetime.date.today().strftime("%B %d, %Y"),
19 | }
20 | save_fsspec(state, checkpoint_path)
21 |
22 |
23 | def save_best_model(model, optimizer, model_loss, best_loss, out_path, current_step):
24 | if model_loss < best_loss:
25 | new_state_dict = model.state_dict()
26 | state = {
27 | "model": new_state_dict,
28 | "optimizer": optimizer.state_dict(),
29 | "step": current_step,
30 | "loss": model_loss,
31 | "date": datetime.date.today().strftime("%B %d, %Y"),
32 | }
33 | best_loss = model_loss
34 | bestmodel_path = "best_model.pth"
35 | bestmodel_path = os.path.join(out_path, bestmodel_path)
36 | print("\n > BEST MODEL ({0:.5f}) : {1:}".format(model_loss, bestmodel_path))
37 | save_fsspec(state, bestmodel_path)
38 | return best_loss
39 |
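A minimal sketch of how these helpers could be called from a training loop (not part of the source file; the model, optimizer, and loss below are stand-ins):

```python
import os

import torch
from torch import nn, optim

from TTS.encoder.utils.io import save_best_model, save_checkpoint

out_path = "/tmp/encoder_run"
os.makedirs(out_path, exist_ok=True)

model = nn.Linear(10, 10)            # stand-in for a real encoder model
optimizer = optim.Adam(model.parameters())
best_loss = float("inf")

for step in range(1, 301):
    loss = float(torch.rand(1))      # stand-in for a real training loss
    if step % 100 == 0:
        save_checkpoint(model, optimizer, loss, out_path, step)
        best_loss = save_best_model(model, optimizer, loss, best_loss, out_path, step)
```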
--------------------------------------------------------------------------------
/TTS/encoder/utils/training.py:
--------------------------------------------------------------------------------
1 | import os
2 | from dataclasses import dataclass, field
3 |
4 | from coqpit import Coqpit
5 | from trainer import TrainerArgs, get_last_checkpoint
6 | from trainer.logging import logger_factory
7 | from trainer.logging.console_logger import ConsoleLogger
8 |
9 | from TTS.config import load_config, register_config
10 | from TTS.tts.utils.text.characters import parse_symbols
11 | from TTS.utils.generic_utils import get_experiment_folder_path, get_git_branch
12 | from TTS.utils.io import copy_model_files
13 |
14 |
15 | @dataclass
16 | class TrainArgs(TrainerArgs):
17 | config_path: str = field(default=None, metadata={"help": "Path to the config file."})
18 |
19 |
20 | def getarguments():
21 | train_config = TrainArgs()
22 | parser = train_config.init_argparse(arg_prefix="")
23 | return parser
24 |
25 |
26 | def process_args(args, config=None):
27 | """Process parsed comand line arguments and initialize the config if not provided.
28 | Args:
29 | args (argparse.Namespace or dict like): Parsed input arguments.
30 | config (Coqpit): Model config. If none, it is generated from `args`. Defaults to None.
31 | Returns:
32 | c (TTS.utils.io.AttrDict): Config parameters.
33 | out_path (str): Path to save models and logging.
34 | audio_path (str): Path to save generated test audios.
35 | c_logger (TTS.utils.console_logger.ConsoleLogger): Class that does
36 | logging to the console.
37 | dashboard_logger (WandbLogger or TensorboardLogger): Class that does the dashboard Logging
38 | TODO:
39 | - Interactive config definition.
40 | """
41 | if isinstance(args, tuple):
42 | args, coqpit_overrides = args
43 | if args.continue_path:
44 | # continue a previous training from its output folder
45 | experiment_path = args.continue_path
46 | args.config_path = os.path.join(args.continue_path, "config.json")
47 | args.restore_path, best_model = get_last_checkpoint(args.continue_path)
48 | if not args.best_path:
49 | args.best_path = best_model
50 | # init config if not already defined
51 | if config is None:
52 | if args.config_path:
53 | # init from a file
54 | config = load_config(args.config_path)
55 | else:
56 | # init from console args
57 | from TTS.config.shared_configs import BaseTrainingConfig # pylint: disable=import-outside-toplevel
58 |
59 | config_base = BaseTrainingConfig()
60 | config_base.parse_known_args(coqpit_overrides)
61 | config = register_config(config_base.model)()
62 | # override values from command-line args
63 | config.parse_known_args(coqpit_overrides, relaxed_parser=True)
64 | experiment_path = args.continue_path
65 | if not experiment_path:
66 | experiment_path = get_experiment_folder_path(config.output_path, config.run_name)
67 | audio_path = os.path.join(experiment_path, "test_audios")
68 | config.output_log_path = experiment_path
69 | # setup rank 0 process in distributed training
70 | dashboard_logger = None
71 | if args.rank == 0:
72 | new_fields = {}
73 | if args.restore_path:
74 | new_fields["restore_path"] = args.restore_path
75 | new_fields["github_branch"] = get_git_branch()
76 | # if model characters are not set in the config file
77 | # save the default set to the config file for future
78 | # compatibility.
79 | if config.has("characters") and config.characters is None:
80 | used_characters = parse_symbols()
81 | new_fields["characters"] = used_characters
82 | copy_model_files(config, experiment_path, new_fields)
83 | dashboard_logger = logger_factory(config, experiment_path)
84 | c_logger = ConsoleLogger()
85 | return config, experiment_path, audio_path, c_logger, dashboard_logger
86 |
87 |
88 | def init_arguments():
89 | train_config = TrainArgs()
90 | parser = train_config.init_argparse(arg_prefix="")
91 | return parser
92 |
93 |
94 | def init_training(config: Coqpit = None):
95 | """Initialization of a training run."""
96 | parser = init_arguments()
97 | args = parser.parse_known_args()
98 | config, OUT_PATH, AUDIO_PATH, c_logger, dashboard_logger = process_args(args, config)
99 | return args[0], config, OUT_PATH, AUDIO_PATH, c_logger, dashboard_logger
100 |
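A hedged sketch of a training entry point built on these helpers, mirroring the pattern used by the `TTS/bin` training scripts (the config path is a placeholder and would be passed as `--config_path` on the command line):

```python
# run as: python train_encoder_sketch.py --config_path /path/to/config.json
from TTS.encoder.utils.training import init_training

if __name__ == "__main__":
    args, config, OUT_PATH, AUDIO_PATH, c_logger, dashboard_logger = init_training()
    print(f" > Experiment folder: {OUT_PATH}")
    print(f" > Test audios will be saved to: {AUDIO_PATH}")
    # ... build the dataset/model here and start the Trainer ...
```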
--------------------------------------------------------------------------------
/TTS/encoder/utils/visual.py:
--------------------------------------------------------------------------------
1 | import matplotlib
2 | import matplotlib.pyplot as plt
3 | import numpy as np
4 | import umap
5 |
6 | matplotlib.use("Agg")
7 |
8 |
9 | colormap = (
10 | np.array(
11 | [
12 | [76, 255, 0],
13 | [0, 127, 70],
14 | [255, 0, 0],
15 | [255, 217, 38],
16 | [0, 135, 255],
17 | [165, 0, 165],
18 | [255, 167, 255],
19 | [0, 255, 255],
20 | [255, 96, 38],
21 | [142, 76, 0],
22 | [33, 0, 127],
23 | [0, 0, 0],
24 | [183, 183, 183],
25 | ],
26 | dtype=np.float64,  # `np.float` was removed in NumPy 1.24; use an explicit dtype
27 | )
28 | / 255
29 | )
30 |
31 |
32 | def plot_embeddings(embeddings, num_classes_in_batch):
33 | num_utter_per_class = embeddings.shape[0] // num_classes_in_batch
34 |
35 | # if necessary get just the first 10 classes
36 | if num_classes_in_batch > 10:
37 | num_classes_in_batch = 10
38 | embeddings = embeddings[: num_classes_in_batch * num_utter_per_class]
39 |
40 | model = umap.UMAP()
41 | projection = model.fit_transform(embeddings)
42 | ground_truth = np.repeat(np.arange(num_classes_in_batch), num_utter_per_class)
43 | colors = [colormap[i] for i in ground_truth]
44 | fig, ax = plt.subplots(figsize=(16, 10))
45 | _ = ax.scatter(projection[:, 0], projection[:, 1], c=colors)
46 | plt.gca().set_aspect("equal", "datalim")
47 | plt.title("UMAP projection")
48 | plt.tight_layout()
49 | plt.savefig("umap")
50 | return fig
51 |
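A minimal sketch calling `plot_embeddings` with random data (not part of the source file; requires `umap-learn` and `matplotlib` from the requirements listed above):

```python
import numpy as np

from TTS.encoder.utils.visual import plot_embeddings

# 4 classes x 8 utterances, 256-dim embeddings (random stand-ins)
dummy_embeddings = np.random.rand(4 * 8, 256).astype(np.float32)
fig = plot_embeddings(dummy_embeddings, num_classes_in_batch=4)
fig.savefig("umap_demo.png")  # the function itself also writes a "umap" file
```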
--------------------------------------------------------------------------------
/TTS/model.py:
--------------------------------------------------------------------------------
1 | from abc import abstractmethod
2 | from typing import Dict
3 |
4 | import torch
5 | from coqpit import Coqpit
6 | from trainer import TrainerModel
7 |
8 | # pylint: skip-file
9 |
10 |
11 | class BaseTrainerModel(TrainerModel):
12 | """BaseTrainerModel model expanding TrainerModel with required functions by 🐸TTS.
13 |
14 | Every new 🐸TTS model must inherit it.
15 | """
16 |
17 | @staticmethod
18 | @abstractmethod
19 | def init_from_config(config: Coqpit):
20 | """Init the model and all its attributes from the given config.
21 |
22 | Override this depending on your model.
23 | """
24 | ...
25 |
26 | @abstractmethod
27 | def inference(self, input: torch.Tensor, aux_input={}) -> Dict:
28 | """Forward pass for inference.
29 |
30 | It must return a dictionary with the main model output and all the auxiliary outputs. The key ```model_outputs```
31 | is considered to be the main output and you can add any other auxiliary outputs as you want.
32 |
33 | We don't use `**kwargs` since it is problematic with the TorchScript API.
34 |
35 | Args:
36 | input (torch.Tensor): [description]
37 | aux_input (Dict): Auxiliary inputs like speaker embeddings, durations etc.
38 |
39 | Returns:
40 | Dict: [description]
41 | """
42 | outputs_dict = {"model_outputs": None}
43 | ...
44 | return outputs_dict
45 |
46 | @abstractmethod
47 | def load_checkpoint(
48 | self, config: Coqpit, checkpoint_path: str, eval: bool = False, strict: bool = True, cache=False
49 | ) -> None:
50 | """Load a model checkpoint gile and get ready for training or inference.
51 |
52 | Args:
53 | config (Coqpit): Model configuration.
54 | checkpoint_path (str): Path to the model checkpoint file.
55 | eval (bool, optional): If true, init model for inference else for training. Defaults to False.
56 | strict (bool, optional): Match all checkpoint keys to model's keys. Defaults to True.
57 | cache (bool, optional): If True, cache the file locally for subsequent calls. It is cached under `get_user_data_dir()/tts_cache`. Defaults to False.
58 | """
59 | ...
60 |
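A hedged sketch of the subclassing contract described above. This is a toy model, not an actual 🐸TTS model; the remaining `TrainerModel` requirements (e.g. `forward`, `train_step`, data loaders) are omitted for brevity:

```python
from typing import Dict

import torch
from coqpit import Coqpit

from TTS.model import BaseTrainerModel


class ToyModel(BaseTrainerModel):
    @staticmethod
    def init_from_config(config: Coqpit):
        return ToyModel()

    def inference(self, input: torch.Tensor, aux_input={}) -> Dict:
        # the main output must live under the `model_outputs` key
        return {"model_outputs": input * 2.0}

    def load_checkpoint(self, config, checkpoint_path, eval=False, strict=True, cache=False):
        state = torch.load(checkpoint_path, map_location="cpu")
        self.load_state_dict(state["model"], strict=strict)
        if eval:
            self.eval()
```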
--------------------------------------------------------------------------------
/TTS/server/README.md:
--------------------------------------------------------------------------------
1 | # :frog: TTS demo server
2 | Before you use the server, make sure you [install](https://github.com/coqui-ai/TTS/tree/dev#install-tts) :frog: TTS properly. Then, you can follow the steps below.
3 |
4 | **Note:** If you install :frog:TTS using ```pip```, you can also use the ```tts-server``` entry point from the terminal.
5 |
6 | Example runs:
7 |
8 | List officially released models.
9 | ```python TTS/server/server.py --list_models ```
10 |
11 | Run the server with the official models.
12 | ```python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/multiband-melgan```
13 |
14 | Run the server with the official models on a GPU.
15 | ```CUDA_VISIBLE_DEVICES="0" python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/multiband-melgan --use_cuda True```
16 |
17 | Run the server with custom models.
18 | ```python TTS/server/server.py --tts_checkpoint /path/to/tts/model.pth --tts_config /path/to/tts/config.json --vocoder_checkpoint /path/to/vocoder/model.pth --vocoder_config /path/to/vocoder/config.json```
19 |
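Once the server is running you can use the web UI at `http://localhost:5002`, or query it programmatically. A minimal sketch, assuming the default port from `conf.json` and the `/api/tts` endpoint exposed by `server.py` (double-check against your installed version):

```python
import requests

resp = requests.get("http://localhost:5002/api/tts", params={"text": "Hello from the demo server!"})
resp.raise_for_status()
with open("output.wav", "wb") as f:
    f.write(resp.content)
```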
--------------------------------------------------------------------------------
/TTS/server/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/everydaycodings/MimicMania/eecbaae2c430785145afe48bc92a7634ab9fe660/TTS/server/__init__.py
--------------------------------------------------------------------------------
/TTS/server/conf.json:
--------------------------------------------------------------------------------
1 | {
2 | "tts_path":"/media/erogol/data_ssd/Models/libri_tts/5049/", // tts model root folder
3 | "tts_file":"best_model.pth", // tts checkpoint file
4 | "tts_config":"config.json", // tts config.json file
5 | "tts_speakers": null, // json file listing speaker ids. null if no speaker embedding.
6 | "vocoder_config":null,
7 | "vocoder_file": null,
8 | "is_wavernn_batched":true,
9 | "port": 5002,
10 | "use_cuda": true,
11 | "debug": true
12 | }
13 |
--------------------------------------------------------------------------------
/TTS/server/static/coqui-log-green-TTS.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/everydaycodings/MimicMania/eecbaae2c430785145afe48bc92a7634ab9fe660/TTS/server/static/coqui-log-green-TTS.png
--------------------------------------------------------------------------------
/TTS/server/templates/details.html:
--------------------------------------------------------------------------------
[Markup lost during extraction. The original `details.html` is a Jinja2/HTML template for a "TTS engine" details page: when the server is started with `--show_details=true` it renders key/value tables for the CLI arguments (`args`), the TTS model config (`model_config`), and the vocoder config (`vocoder_config`); otherwise it shows "Please start server with --show_details=true to see details."]
--------------------------------------------------------------------------------
/TTS/tts/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/everydaycodings/MimicMania/eecbaae2c430785145afe48bc92a7634ab9fe660/TTS/tts/__init__.py
--------------------------------------------------------------------------------
/TTS/tts/configs/__init__.py:
--------------------------------------------------------------------------------
1 | import importlib
2 | import os
3 | from inspect import isclass
4 |
5 | # import all files under configs/
6 | # configs_dir = os.path.dirname(__file__)
7 | # for file in os.listdir(configs_dir):
8 | # path = os.path.join(configs_dir, file)
9 | # if not file.startswith("_") and not file.startswith(".") and (file.endswith(".py") or os.path.isdir(path)):
10 | # config_name = file[: file.find(".py")] if file.endswith(".py") else file
11 | # module = importlib.import_module("TTS.tts.configs." + config_name)
12 | # for attribute_name in dir(module):
13 | # attribute = getattr(module, attribute_name)
14 |
15 | # if isclass(attribute):
16 | # # Add the class to this package's variables
17 | # globals()[attribute_name] = attribute
18 |
--------------------------------------------------------------------------------
/TTS/tts/configs/tacotron2_config.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 |
3 | from TTS.tts.configs.tacotron_config import TacotronConfig
4 |
5 |
6 | @dataclass
7 | class Tacotron2Config(TacotronConfig):
8 | """Defines parameters for Tacotron2 based models.
9 |
10 | Example:
11 |
12 | >>> from TTS.tts.configs.tacotron2_config import Tacotron2Config
13 | >>> config = Tacotron2Config()
14 |
15 | Check `TacotronConfig` for argument descriptions.
16 | """
17 |
18 | model: str = "tacotron2"
19 | out_channels: int = 80
20 | encoder_in_features: int = 512
21 | decoder_in_features: int = 512
22 |
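Since the config is a Coqpit dataclass, fields can be overridden at construction time and the result serialized for a training run; a small sketch (the file name is arbitrary and `batch_size` is inherited from the base training config):

```python
from TTS.tts.configs.tacotron2_config import Tacotron2Config

config = Tacotron2Config(out_channels=80, decoder_in_features=512, batch_size=32)
config.save_json("tacotron2_run_config.json")  # Coqpit provides JSON (de)serialization
```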
--------------------------------------------------------------------------------
/TTS/tts/layers/__init__.py:
--------------------------------------------------------------------------------
1 | from TTS.tts.layers.losses import *
2 |
--------------------------------------------------------------------------------
/TTS/tts/layers/align_tts/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/everydaycodings/MimicMania/eecbaae2c430785145afe48bc92a7634ab9fe660/TTS/tts/layers/align_tts/__init__.py
--------------------------------------------------------------------------------
/TTS/tts/layers/align_tts/duration_predictor.py:
--------------------------------------------------------------------------------
1 | from torch import nn
2 |
3 | from TTS.tts.layers.generic.pos_encoding import PositionalEncoding
4 | from TTS.tts.layers.generic.transformer import FFTransformerBlock
5 |
6 |
7 | class DurationPredictor(nn.Module):
8 | def __init__(self, num_chars, hidden_channels, hidden_channels_ffn, num_heads):
9 | super().__init__()
10 | self.embed = nn.Embedding(num_chars, hidden_channels)
11 | self.pos_enc = PositionalEncoding(hidden_channels, dropout_p=0.1)
12 | self.FFT = FFTransformerBlock(hidden_channels, num_heads, hidden_channels_ffn, 2, 0.1)
13 | self.out_layer = nn.Conv1d(hidden_channels, 1, 1)
14 |
15 | def forward(self, text, text_lengths):
16 | # [B, T] -> [B, T, C]
17 | emb = self.embed(text)
18 | emb = self.pos_enc(emb.transpose(1, 2))
19 | x = self.FFT(emb, text_lengths)
20 | x = self.out_layer(x).squeeze(-1)
21 | return x
22 |
--------------------------------------------------------------------------------
/TTS/tts/layers/align_tts/mdn.py:
--------------------------------------------------------------------------------
1 | from torch import nn
2 |
3 |
4 | class MDNBlock(nn.Module):
5 | """Mixture of Density Network implementation
6 | https://arxiv.org/pdf/2003.01950.pdf
7 | """
8 |
9 | def __init__(self, in_channels, out_channels):
10 | super().__init__()
11 | self.out_channels = out_channels
12 | self.conv1 = nn.Conv1d(in_channels, in_channels, 1)
13 | self.norm = nn.LayerNorm(in_channels)
14 | self.relu = nn.ReLU()
15 | self.dropout = nn.Dropout(0.1)
16 | self.conv2 = nn.Conv1d(in_channels, out_channels, 1)
17 |
18 | def forward(self, x):
19 | o = self.conv1(x)
20 | o = o.transpose(1, 2)
21 | o = self.norm(o)
22 | o = o.transpose(1, 2)
23 | o = self.relu(o)
24 | o = self.dropout(o)
25 | mu_sigma = self.conv2(o)
26 | # TODO: check this sigmoid
27 | # mu = torch.sigmoid(mu_sigma[:, :self.out_channels//2, :])
28 | mu = mu_sigma[:, : self.out_channels // 2, :]
29 | log_sigma = mu_sigma[:, self.out_channels // 2 :, :]
30 | return mu, log_sigma
31 |
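A minimal shape sketch (not part of the source file): the block maps encoder features to per-step Gaussian parameters, each with `out_channels // 2` channels:

```python
import torch

from TTS.tts.layers.align_tts.mdn import MDNBlock

mdn = MDNBlock(in_channels=256, out_channels=2 * 80)
x = torch.randn(2, 256, 40)  # [B, C_in, T]
mu, log_sigma = mdn(x)       # each [2, 80, 40]
```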
--------------------------------------------------------------------------------
/TTS/tts/layers/feed_forward/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/everydaycodings/MimicMania/eecbaae2c430785145afe48bc92a7634ab9fe660/TTS/tts/layers/feed_forward/__init__.py
--------------------------------------------------------------------------------
/TTS/tts/layers/feed_forward/duration_predictor.py:
--------------------------------------------------------------------------------
1 | from torch import nn
2 |
3 | from TTS.tts.layers.generic.res_conv_bn import Conv1dBN
4 |
5 |
6 | class DurationPredictor(nn.Module):
7 | """Speedy Speech duration predictor model.
8 | Predicts phoneme durations from encoder outputs.
9 |
10 | Note:
11 | Outputs interpreted as log(durations)
12 | To get actual durations, do exp transformation
13 |
14 | conv_BN_4x1 -> conv_BN_3x1 -> conv_BN_1x1 -> conv_1x1
15 |
16 | Args:
17 | hidden_channels (int): number of channels in the inner layers.
18 | """
19 |
20 | def __init__(self, hidden_channels):
21 | super().__init__()
22 |
23 | self.layers = nn.ModuleList(
24 | [
25 | Conv1dBN(hidden_channels, hidden_channels, 4, 1),
26 | Conv1dBN(hidden_channels, hidden_channels, 3, 1),
27 | Conv1dBN(hidden_channels, hidden_channels, 1, 1),
28 | nn.Conv1d(hidden_channels, 1, 1),
29 | ]
30 | )
31 |
32 | def forward(self, x, x_mask):
33 | """
34 | Shapes:
35 | x: [B, C, T]
36 | x_mask: [B, 1, T]
37 | """
38 | o = x
39 | for layer in self.layers:
40 | o = layer(o) * x_mask
41 | return o
42 |
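A minimal sketch (not part of the source file): the outputs are log durations, so exponentiate to get frame counts:

```python
import torch

from TTS.tts.layers.feed_forward.duration_predictor import DurationPredictor

dp = DurationPredictor(hidden_channels=128)
x = torch.randn(2, 128, 60)    # [B, C, T] encoder outputs
x_mask = torch.ones(2, 1, 60)  # [B, 1, T]
log_durs = dp(x, x_mask)       # [B, 1, T]
durs = torch.exp(log_durs) * x_mask
```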
--------------------------------------------------------------------------------
/TTS/tts/layers/generic/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/everydaycodings/MimicMania/eecbaae2c430785145afe48bc92a7634ab9fe660/TTS/tts/layers/generic/__init__.py
--------------------------------------------------------------------------------
/TTS/tts/layers/generic/aligner.py:
--------------------------------------------------------------------------------
1 | from typing import Tuple
2 |
3 | import torch
4 | from torch import nn
5 |
6 |
7 | class AlignmentNetwork(torch.nn.Module):
8 | """Aligner Network for learning alignment between the input text and the model output with Gaussian Attention.
9 |
10 | ::
11 |
12 | query -> conv1d -> relu -> conv1d -> relu -> conv1d -> L2_dist -> softmax -> alignment
13 | key -> conv1d -> relu -> conv1d -----------------------^
14 |
15 | Args:
16 | in_query_channels (int): Number of channels in the query network. Defaults to 80.
17 | in_key_channels (int): Number of channels in the key network. Defaults to 512.
18 | attn_channels (int): Number of inner channels in the attention layers. Defaults to 80.
19 | temperature (float): Temperature for the softmax. Defaults to 0.0005.
20 | """
21 |
22 | def __init__(
23 | self,
24 | in_query_channels=80,
25 | in_key_channels=512,
26 | attn_channels=80,
27 | temperature=0.0005,
28 | ):
29 | super().__init__()
30 | self.temperature = temperature
31 | self.softmax = torch.nn.Softmax(dim=3)
32 | self.log_softmax = torch.nn.LogSoftmax(dim=3)
33 |
34 | self.key_layer = nn.Sequential(
35 | nn.Conv1d(
36 | in_key_channels,
37 | in_key_channels * 2,
38 | kernel_size=3,
39 | padding=1,
40 | bias=True,
41 | ),
42 | torch.nn.ReLU(),
43 | nn.Conv1d(in_key_channels * 2, attn_channels, kernel_size=1, padding=0, bias=True),
44 | )
45 |
46 | self.query_layer = nn.Sequential(
47 | nn.Conv1d(
48 | in_query_channels,
49 | in_query_channels * 2,
50 | kernel_size=3,
51 | padding=1,
52 | bias=True,
53 | ),
54 | torch.nn.ReLU(),
55 | nn.Conv1d(in_query_channels * 2, in_query_channels, kernel_size=1, padding=0, bias=True),
56 | torch.nn.ReLU(),
57 | nn.Conv1d(in_query_channels, attn_channels, kernel_size=1, padding=0, bias=True),
58 | )
59 |
60 | def forward(
61 | self, queries: torch.tensor, keys: torch.tensor, mask: torch.tensor = None, attn_prior: torch.tensor = None
62 | ) -> Tuple[torch.tensor, torch.tensor]:
63 | """Forward pass of the aligner encoder.
64 | Shapes:
65 | - queries: :math:`[B, C, T_de]`
66 | - keys: :math:`[B, C_emb, T_en]`
67 | - mask: :math:`[B, T_de]`
68 | Output:
69 | attn (torch.tensor): :math:`[B, 1, T_en, T_de]` soft attention mask.
70 | attn_logp (torch.tensor): :math:`[B, 1, T_en, T_de]` log probabilities.
71 | """
72 | key_out = self.key_layer(keys)
73 | query_out = self.query_layer(queries)
74 | attn_factor = (query_out[:, :, :, None] - key_out[:, :, None]) ** 2
75 | attn_logp = -self.temperature * attn_factor.sum(1, keepdim=True)
76 | if attn_prior is not None:
77 | attn_logp = self.log_softmax(attn_logp) + torch.log(attn_prior[:, None] + 1e-8)
78 | if mask is not None:
79 | attn_logp.data.masked_fill_(~mask.bool().unsqueeze(2), -float("inf"))
80 | attn = self.softmax(attn_logp)
81 | return attn, attn_logp
82 |
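A minimal shape sketch with random tensors (not part of the source file; the mask and attention prior are skipped for brevity):

```python
import torch

from TTS.tts.layers.generic.aligner import AlignmentNetwork

aligner = AlignmentNetwork(in_query_channels=80, in_key_channels=512, attn_channels=80)
queries = torch.randn(2, 80, 120)  # [B, C, T_de], e.g. mel frames
keys = torch.randn(2, 512, 40)     # [B, C_emb, T_en], e.g. text encodings
attn, attn_logp = aligner(queries, keys)
print(attn.shape)                  # torch.Size([2, 1, 120, 40]); softmax is over the last axis
```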
--------------------------------------------------------------------------------
/TTS/tts/layers/generic/gated_conv.py:
--------------------------------------------------------------------------------
1 | from torch import nn
2 |
3 | from .normalization import LayerNorm
4 |
5 |
6 | class GatedConvBlock(nn.Module):
7 | """Gated convolutional block as in https://arxiv.org/pdf/1612.08083.pdf
8 | Args:
9 | in_out_channels (int): number of input/output channels.
10 | kernel_size (int): convolution kernel size.
11 | dropout_p (float): dropout rate.
12 | """
13 |
14 | def __init__(self, in_out_channels, kernel_size, dropout_p, num_layers):
15 | super().__init__()
16 | # class arguments
17 | self.dropout_p = dropout_p
18 | self.num_layers = num_layers
19 | # define layers
20 | self.conv_layers = nn.ModuleList()
21 | self.norm_layers = nn.ModuleList()
22 | self.layers = nn.ModuleList()
23 | for _ in range(num_layers):
24 | self.conv_layers += [nn.Conv1d(in_out_channels, 2 * in_out_channels, kernel_size, padding=kernel_size // 2)]
25 | self.norm_layers += [LayerNorm(2 * in_out_channels)]
26 |
27 | def forward(self, x, x_mask):
28 | o = x
29 | res = x
30 | for idx in range(self.num_layers):
31 | o = nn.functional.dropout(o, p=self.dropout_p, training=self.training)
32 | o = self.conv_layers[idx](o * x_mask)
33 | o = self.norm_layers[idx](o)
34 | o = nn.functional.glu(o, dim=1)
35 | o = res + o
36 | res = o
37 | return o
38 |
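A minimal shape sketch (not part of the source file); the GLU halves the doubled channels back to `in_out_channels`, so the block preserves the input shape:

```python
import torch

from TTS.tts.layers.generic.gated_conv import GatedConvBlock

block = GatedConvBlock(in_out_channels=64, kernel_size=5, dropout_p=0.1, num_layers=3)
x = torch.randn(2, 64, 80)     # [B, C, T]
x_mask = torch.ones(2, 1, 80)  # [B, 1, T]
y = block(x, x_mask)           # [2, 64, 80]
```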
--------------------------------------------------------------------------------
/TTS/tts/layers/generic/normalization.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 |
4 |
5 | class LayerNorm(nn.Module):
6 | def __init__(self, channels, eps=1e-4):
7 | """Layer norm for the 2nd dimension of the input.
8 | Args:
9 | channels (int): number of channels (2nd dimension) of the input.
10 | eps (float): to prevent 0 division
11 |
12 | Shapes:
13 | - input: (B, C, T)
14 | - output: (B, C, T)
15 | """
16 | super().__init__()
17 | self.channels = channels
18 | self.eps = eps
19 |
20 | self.gamma = nn.Parameter(torch.ones(1, channels, 1) * 0.1)
21 | self.beta = nn.Parameter(torch.zeros(1, channels, 1))
22 |
23 | def forward(self, x):
24 | mean = torch.mean(x, 1, keepdim=True)
25 | variance = torch.mean((x - mean) ** 2, 1, keepdim=True)
26 | x = (x - mean) * torch.rsqrt(variance + self.eps)
27 | x = x * self.gamma + self.beta
28 | return x
29 |
30 |
31 | class LayerNorm2(nn.Module):
32 | """Layer norm for the 2nd dimension of the input using torch primitive.
33 | Args:
34 | channels (int): number of channels (2nd dimension) of the input.
35 | eps (float): to prevent 0 division
36 |
37 | Shapes:
38 | - input: (B, C, T)
39 | - output: (B, C, T)
40 | """
41 |
42 | def __init__(self, channels, eps=1e-5):
43 | super().__init__()
44 | self.channels = channels
45 | self.eps = eps
46 |
47 | self.gamma = nn.Parameter(torch.ones(channels))
48 | self.beta = nn.Parameter(torch.zeros(channels))
49 |
50 | def forward(self, x):
51 | x = x.transpose(1, -1)
52 | x = torch.nn.functional.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
53 | return x.transpose(1, -1)
54 |
55 |
56 | class TemporalBatchNorm1d(nn.BatchNorm1d):
57 | """Normalize each channel separately over time and batch."""
58 |
59 | def __init__(self, channels, affine=True, track_running_stats=True, momentum=0.1):
60 | super().__init__(channels, affine=affine, track_running_stats=track_running_stats, momentum=momentum)
61 |
62 | def forward(self, x):
63 | return super().forward(x.transpose(2, 1)).transpose(2, 1)
64 |
65 |
66 | class ActNorm(nn.Module):
67 | """Activation Normalization bijector as an alternative to Batch Norm. It computes
68 | mean and std from a sample data in advance and it uses these values
69 | for normalization at training.
70 |
71 | Args:
72 | channels (int): input channels.
73 | ddi (False): data depended initialization flag.
74 |
75 | Shapes:
76 | - inputs: (B, C, T)
77 | - outputs: (B, C, T)
78 | """
79 |
80 | def __init__(self, channels, ddi=False, **kwargs): # pylint: disable=unused-argument
81 | super().__init__()
82 | self.channels = channels
83 | self.initialized = not ddi
84 |
85 | self.logs = nn.Parameter(torch.zeros(1, channels, 1))
86 | self.bias = nn.Parameter(torch.zeros(1, channels, 1))
87 |
88 | def forward(self, x, x_mask=None, reverse=False, **kwargs): # pylint: disable=unused-argument
89 | if x_mask is None:
90 | x_mask = torch.ones(x.size(0), 1, x.size(2)).to(device=x.device, dtype=x.dtype)
91 | x_len = torch.sum(x_mask, [1, 2])
92 | if not self.initialized:
93 | self.initialize(x, x_mask)
94 | self.initialized = True
95 |
96 | if reverse:
97 | z = (x - self.bias) * torch.exp(-self.logs) * x_mask
98 | logdet = None
99 | else:
100 | z = (self.bias + torch.exp(self.logs) * x) * x_mask
101 | logdet = torch.sum(self.logs) * x_len # [b]
102 |
103 | return z, logdet
104 |
105 | def store_inverse(self):
106 | pass
107 |
108 | def set_ddi(self, ddi):
109 | self.initialized = not ddi
110 |
111 | def initialize(self, x, x_mask):
112 | with torch.no_grad():
113 | denom = torch.sum(x_mask, [0, 2])
114 | m = torch.sum(x * x_mask, [0, 2]) / denom
115 | m_sq = torch.sum(x * x * x_mask, [0, 2]) / denom
116 | v = m_sq - (m**2)
117 | logs = 0.5 * torch.log(torch.clamp_min(v, 1e-6))
118 |
119 | bias_init = (-m * torch.exp(-logs)).view(*self.bias.shape).to(dtype=self.bias.dtype)
120 | logs_init = (-logs).view(*self.logs.shape).to(dtype=self.logs.dtype)
121 |
122 | self.bias.data.copy_(bias_init)
123 | self.logs.data.copy_(logs_init)
124 |
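A minimal sketch of the data-dependent initialization and invertibility (not part of the source file): with `ddi=True` the first forward pass sets the bias/log-scale from the batch statistics, and the reverse pass recovers the input:

```python
import torch

from TTS.tts.layers.generic.normalization import ActNorm

actnorm = ActNorm(channels=8, ddi=True)
x = torch.randn(3, 8, 50)                   # (B, C, T)
z, logdet = actnorm(x)                      # first call triggers initialize()
x_rec, _ = actnorm(z, reverse=True)
print(torch.allclose(x, x_rec, atol=1e-5))  # True: the bijector is invertible
```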
--------------------------------------------------------------------------------
/TTS/tts/layers/generic/pos_encoding.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 | import torch
4 | from torch import nn
5 |
6 |
7 | class PositionalEncoding(nn.Module):
8 | """Sinusoidal positional encoding for non-recurrent neural networks.
9 | Implementation based on "Attention Is All You Need"
10 |
11 | Args:
12 | channels (int): embedding size
13 | dropout_p (float): dropout rate applied to the output.
14 | max_len (int): maximum sequence length.
15 | use_scale (bool): whether to use a learnable scaling coefficient.
16 | """
17 |
18 | def __init__(self, channels, dropout_p=0.0, max_len=5000, use_scale=False):
19 | super().__init__()
20 | if channels % 2 != 0:
21 | raise ValueError(
22 | "Cannot use sin/cos positional encoding with " "odd channels (got channels={:d})".format(channels)
23 | )
24 | self.use_scale = use_scale
25 | if use_scale:
26 | self.scale = torch.nn.Parameter(torch.ones(1))
27 | pe = torch.zeros(max_len, channels)
28 | position = torch.arange(0, max_len).unsqueeze(1)
29 | div_term = torch.pow(10000, torch.arange(0, channels, 2).float() / channels)
30 | pe[:, 0::2] = torch.sin(position.float() * div_term)
31 | pe[:, 1::2] = torch.cos(position.float() * div_term)
32 | pe = pe.unsqueeze(0).transpose(1, 2)
33 | self.register_buffer("pe", pe)
34 | if dropout_p > 0:
35 | self.dropout = nn.Dropout(p=dropout_p)
36 | self.channels = channels
37 |
38 | def forward(self, x, mask=None, first_idx=None, last_idx=None):
39 | """
40 | Shapes:
41 | x: [B, C, T]
42 | mask: [B, 1, T]
43 | first_idx: int
44 | last_idx: int
45 | """
46 |
47 | x = x * math.sqrt(self.channels)
48 | if first_idx is None:
49 | if self.pe.size(2) < x.size(2):
50 | raise RuntimeError(
51 | f"Sequence is {x.size(2)} but PositionalEncoding is"
52 | f" limited to {self.pe.size(2)}. See max_len argument."
53 | )
54 | if mask is not None:
55 | pos_enc = self.pe[:, :, : x.size(2)] * mask
56 | else:
57 | pos_enc = self.pe[:, :, : x.size(2)]
58 | if self.use_scale:
59 | x = x + self.scale * pos_enc
60 | else:
61 | x = x + pos_enc
62 | else:
63 | if self.use_scale:
64 | x = x + self.scale * self.pe[:, :, first_idx:last_idx]
65 | else:
66 | x = x + self.pe[:, :, first_idx:last_idx]
67 | if hasattr(self, "dropout"):
68 | x = self.dropout(x)
69 | return x
70 |
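A minimal shape sketch (not part of the source file): the channel count must be even and the input is channel-first:

```python
import torch

from TTS.tts.layers.generic.pos_encoding import PositionalEncoding

pe = PositionalEncoding(channels=128, dropout_p=0.1, max_len=1000)
x = torch.zeros(2, 128, 300)  # [B, C, T]
y = pe(x)                     # adds sinusoidal positions for the first 300 steps
print(y.shape)                # torch.Size([2, 128, 300])
```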
--------------------------------------------------------------------------------
/TTS/tts/layers/generic/time_depth_sep_conv.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 |
4 |
5 | class TimeDepthSeparableConv(nn.Module):
6 | """Time depth separable convolution as in https://arxiv.org/pdf/1904.02619.pdf
7 | It shows competitive results with less computation and a smaller memory footprint."""
8 |
9 | def __init__(self, in_channels, hid_channels, out_channels, kernel_size, bias=True):
10 | super().__init__()
11 |
12 | self.in_channels = in_channels
13 | self.out_channels = out_channels
14 | self.hid_channels = hid_channels
15 | self.kernel_size = kernel_size
16 |
17 | self.time_conv = nn.Conv1d(
18 | in_channels,
19 | 2 * hid_channels,
20 | kernel_size=1,
21 | stride=1,
22 | padding=0,
23 | bias=bias,
24 | )
25 | self.norm1 = nn.BatchNorm1d(2 * hid_channels)
26 | self.depth_conv = nn.Conv1d(
27 | hid_channels,
28 | hid_channels,
29 | kernel_size,
30 | stride=1,
31 | padding=(kernel_size - 1) // 2,
32 | groups=hid_channels,
33 | bias=bias,
34 | )
35 | self.norm2 = nn.BatchNorm1d(hid_channels)
36 | self.time_conv2 = nn.Conv1d(
37 | hid_channels,
38 | out_channels,
39 | kernel_size=1,
40 | stride=1,
41 | padding=0,
42 | bias=bias,
43 | )
44 | self.norm3 = nn.BatchNorm1d(out_channels)
45 |
46 | def forward(self, x):
47 | x_res = x
48 | x = self.time_conv(x)
49 | x = self.norm1(x)
50 | x = nn.functional.glu(x, dim=1)
51 | x = self.depth_conv(x)
52 | x = self.norm2(x)
53 | x = x * torch.sigmoid(x)
54 | x = self.time_conv2(x)
55 | x = self.norm3(x)
56 | x = x_res + x
57 | return x
58 |
59 |
60 | class TimeDepthSeparableConvBlock(nn.Module):
61 | def __init__(self, in_channels, hid_channels, out_channels, num_layers, kernel_size, bias=True):
62 | super().__init__()
63 | assert (kernel_size - 1) % 2 == 0
64 | assert num_layers > 1
65 |
66 | self.layers = nn.ModuleList()
67 | layer = TimeDepthSeparableConv(
68 | in_channels, hid_channels, out_channels if num_layers == 1 else hid_channels, kernel_size, bias
69 | )
70 | self.layers.append(layer)
71 | for idx in range(num_layers - 1):
72 | layer = TimeDepthSeparableConv(
73 | hid_channels,
74 | hid_channels,
75 | out_channels if (idx + 1) == (num_layers - 1) else hid_channels,
76 | kernel_size,
77 | bias,
78 | )
79 | self.layers.append(layer)
80 |
81 | def forward(self, x, mask):
82 | for layer in self.layers:
83 | x = layer(x * mask)
84 | return x
85 |
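A minimal shape sketch (not part of the source file). Each layer adds a residual of its input, so in/hidden/out channels are kept equal here to keep the residual shapes compatible:

```python
import torch

from TTS.tts.layers.generic.time_depth_sep_conv import TimeDepthSeparableConvBlock

block = TimeDepthSeparableConvBlock(in_channels=64, hid_channels=64, out_channels=64, num_layers=2, kernel_size=5)
x = torch.randn(2, 64, 100)   # [B, C, T]
mask = torch.ones(2, 1, 100)  # [B, 1, T]
y = block(x, mask)            # [2, 64, 100]
```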
--------------------------------------------------------------------------------
/TTS/tts/layers/generic/transformer.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 | from torch import nn
4 |
5 |
6 | class FFTransformer(nn.Module):
7 | def __init__(self, in_out_channels, num_heads, hidden_channels_ffn=1024, kernel_size_fft=3, dropout_p=0.1):
8 | super().__init__()
9 | self.self_attn = nn.MultiheadAttention(in_out_channels, num_heads, dropout=dropout_p)
10 |
11 | padding = (kernel_size_fft - 1) // 2
12 | self.conv1 = nn.Conv1d(in_out_channels, hidden_channels_ffn, kernel_size=kernel_size_fft, padding=padding)
13 | self.conv2 = nn.Conv1d(hidden_channels_ffn, in_out_channels, kernel_size=kernel_size_fft, padding=padding)
14 |
15 | self.norm1 = nn.LayerNorm(in_out_channels)
16 | self.norm2 = nn.LayerNorm(in_out_channels)
17 |
18 | self.dropout1 = nn.Dropout(dropout_p)
19 | self.dropout2 = nn.Dropout(dropout_p)
20 |
21 | def forward(self, src, src_mask=None, src_key_padding_mask=None):
22 | """😦 ugly looking with all the transposing"""
23 | src = src.permute(2, 0, 1)
24 | src2, enc_align = self.self_attn(src, src, src, attn_mask=src_mask, key_padding_mask=src_key_padding_mask)
25 | src = src + self.dropout1(src2)
26 | src = self.norm1(src + src2)
27 | # T x B x D -> B x D x T
28 | src = src.permute(1, 2, 0)
29 | src2 = self.conv2(F.relu(self.conv1(src)))
30 | src2 = self.dropout2(src2)
31 | src = src + src2
32 | src = src.transpose(1, 2)
33 | src = self.norm2(src)
34 | src = src.transpose(1, 2)
35 | return src, enc_align
36 |
37 |
38 | class FFTransformerBlock(nn.Module):
39 | def __init__(self, in_out_channels, num_heads, hidden_channels_ffn, num_layers, dropout_p):
40 | super().__init__()
41 | self.fft_layers = nn.ModuleList(
42 | [
43 | FFTransformer(
44 | in_out_channels=in_out_channels,
45 | num_heads=num_heads,
46 | hidden_channels_ffn=hidden_channels_ffn,
47 | dropout_p=dropout_p,
48 | )
49 | for _ in range(num_layers)
50 | ]
51 | )
52 |
53 | def forward(self, x, mask=None, g=None): # pylint: disable=unused-argument
54 | """
55 | TODO: handle multi-speaker
56 | Shapes:
57 | - x: :math:`[B, C, T]`
58 | - mask: :math:`[B, 1, T] or [B, T]`
59 | """
60 | if mask is not None and mask.ndim == 3:
61 | mask = mask.squeeze(1)
62 | # mask is negated, torch uses 1s and 0s reversely.
63 | mask = ~mask.bool()
64 | alignments = []
65 | for layer in self.fft_layers:
66 | x, align = layer(x, src_key_padding_mask=mask)
67 | alignments.append(align.unsqueeze(1))
68 | alignments = torch.cat(alignments, 1)
69 | return x
70 |
71 |
72 | class FFTDurationPredictor:
73 | def __init__(
74 | self, in_channels, hidden_channels, num_heads, num_layers, dropout_p=0.1, cond_channels=None
75 | ): # pylint: disable=unused-argument
76 | self.fft = FFTransformerBlock(in_channels, num_heads, hidden_channels, num_layers, dropout_p)
77 | self.proj = nn.Linear(in_channels, 1)
78 |
79 | def forward(self, x, mask=None, g=None): # pylint: disable=unused-argument
80 | """
81 | Shapes:
82 | - x: :math:`[B, C, T]`
83 | - mask: :math:`[B, 1, T]`
84 |
85 | TODO: Handle the cond input
86 | """
87 | x = self.fft(x, mask=mask)
88 | x = self.proj(x)
89 | return x
90 |
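A minimal shape sketch for the block (not part of the source file); the mask marks valid positions with 1s and is negated internally for `nn.MultiheadAttention`:

```python
import torch

from TTS.tts.layers.generic.transformer import FFTransformerBlock

block = FFTransformerBlock(in_out_channels=256, num_heads=2, hidden_channels_ffn=1024, num_layers=2, dropout_p=0.1)
x = torch.randn(2, 256, 50)  # [B, C, T]
mask = torch.ones(2, 1, 50)  # [B, 1, T], all positions valid
y = block(x, mask=mask)      # [2, 256, 50]
```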
--------------------------------------------------------------------------------
/TTS/tts/layers/glow_tts/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/everydaycodings/MimicMania/eecbaae2c430785145afe48bc92a7634ab9fe660/TTS/tts/layers/glow_tts/__init__.py
--------------------------------------------------------------------------------
/TTS/tts/layers/glow_tts/duration_predictor.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 |
4 | from ..generic.normalization import LayerNorm
5 |
6 |
7 | class DurationPredictor(nn.Module):
8 | """Glow-TTS duration prediction model.
9 |
10 | ::
11 |
12 | [2 x (conv1d_kxk -> relu -> layer_norm -> dropout)] -> conv1d_1x1 -> durs
13 |
14 | Args:
15 | in_channels (int): Number of channels of the input tensor.
16 | hidden_channels (int): Number of hidden channels of the network.
17 | kernel_size (int): Kernel size for the conv layers.
18 | dropout_p (float): Dropout rate used after each conv layer.
19 | """
20 |
21 | def __init__(self, in_channels, hidden_channels, kernel_size, dropout_p, cond_channels=None, language_emb_dim=None):
22 | super().__init__()
23 |
24 | # add language embedding dim in the input
25 | if language_emb_dim:
26 | in_channels += language_emb_dim
27 |
28 | # class arguments
29 | self.in_channels = in_channels
30 | self.filter_channels = hidden_channels
31 | self.kernel_size = kernel_size
32 | self.dropout_p = dropout_p
33 | # layers
34 | self.drop = nn.Dropout(dropout_p)
35 | self.conv_1 = nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size // 2)
36 | self.norm_1 = LayerNorm(hidden_channels)
37 | self.conv_2 = nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size // 2)
38 | self.norm_2 = LayerNorm(hidden_channels)
39 | # output layer
40 | self.proj = nn.Conv1d(hidden_channels, 1, 1)
41 | if cond_channels is not None and cond_channels != 0:
42 | self.cond = nn.Conv1d(cond_channels, in_channels, 1)
43 |
44 | if language_emb_dim != 0 and language_emb_dim is not None:
45 | self.cond_lang = nn.Conv1d(language_emb_dim, in_channels, 1)
46 |
47 | def forward(self, x, x_mask, g=None, lang_emb=None):
48 | """
49 | Shapes:
50 | - x: :math:`[B, C, T]`
51 | - x_mask: :math:`[B, 1, T]`
52 | - g: :math:`[B, C, 1]`
53 | """
54 | if g is not None:
55 | x = x + self.cond(g)
56 |
57 | if lang_emb is not None:
58 | x = x + self.cond_lang(lang_emb)
59 |
60 | x = self.conv_1(x * x_mask)
61 | x = torch.relu(x)
62 | x = self.norm_1(x)
63 | x = self.drop(x)
64 | x = self.conv_2(x * x_mask)
65 | x = torch.relu(x)
66 | x = self.norm_2(x)
67 | x = self.drop(x)
68 | x = self.proj(x * x_mask)
69 | return x * x_mask
70 |
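A minimal sketch without speaker or language conditioning (not part of the source file):

```python
import torch

from TTS.tts.layers.glow_tts.duration_predictor import DurationPredictor

dp = DurationPredictor(in_channels=192, hidden_channels=256, kernel_size=3, dropout_p=0.1)
x = torch.randn(2, 192, 50)    # [B, C, T] encoder outputs
x_mask = torch.ones(2, 1, 50)  # [B, 1, T]
log_durs = dp(x, x_mask)       # [B, 1, T] (log-scale durations)
```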
--------------------------------------------------------------------------------
/TTS/tts/layers/overflow/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/everydaycodings/MimicMania/eecbaae2c430785145afe48bc92a7634ab9fe660/TTS/tts/layers/overflow/__init__.py
--------------------------------------------------------------------------------
/TTS/tts/layers/overflow/decoder.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 |
4 | from TTS.tts.layers.glow_tts.decoder import Decoder as GlowDecoder
5 | from TTS.tts.utils.helpers import sequence_mask
6 |
7 |
8 | class Decoder(nn.Module):
9 | """Uses glow decoder with some modifications.
10 | ::
11 |
12 | Squeeze -> ActNorm -> InvertibleConv1x1 -> AffineCoupling -> Unsqueeze
13 |
14 | Args:
15 | in_channels (int): channels of input tensor.
16 | hidden_channels (int): hidden decoder channels.
17 | kernel_size (int): Coupling block kernel size. (Wavenet filter kernel size.)
18 | dilation_rate (int): rate to increase dilation by each layer in a decoder block.
19 | num_flow_blocks (int): number of decoder blocks.
20 | num_coupling_layers (int): number of coupling layers. (number of wavenet layers.)
21 | dropout_p (float): wavenet dropout rate.
22 | sigmoid_scale (bool): enable/disable sigmoid scaling in coupling layer.
23 | """
24 |
25 | def __init__(
26 | self,
27 | in_channels,
28 | hidden_channels,
29 | kernel_size,
30 | dilation_rate,
31 | num_flow_blocks,
32 | num_coupling_layers,
33 | dropout_p=0.0,
34 | num_splits=4,
35 | num_squeeze=2,
36 | sigmoid_scale=False,
37 | c_in_channels=0,
38 | ):
39 | super().__init__()
40 |
41 | self.glow_decoder = GlowDecoder(
42 | in_channels,
43 | hidden_channels,
44 | kernel_size,
45 | dilation_rate,
46 | num_flow_blocks,
47 | num_coupling_layers,
48 | dropout_p,
49 | num_splits,
50 | num_squeeze,
51 | sigmoid_scale,
52 | c_in_channels,
53 | )
54 | self.n_sqz = num_squeeze
55 |
56 | def forward(self, x, x_len, g=None, reverse=False):
57 | """
58 | Input shapes:
59 | - x: :math:`[B, C, T]`
60 | - x_len :math:`[B]`
61 | - g: :math:`[B, C]`
62 |
63 | Output shapes:
64 | - x: :math:`[B, C, T]`
65 | - x_len :math:`[B]`
66 | - logdet_tot :math:`[B]`
67 | """
68 | x, x_len, x_max_len = self.preprocess(x, x_len, x_len.max())
69 | x_mask = torch.unsqueeze(sequence_mask(x_len, x_max_len), 1).to(x.dtype)
70 | x, logdet_tot = self.glow_decoder(x, x_mask, g, reverse)
71 | return x, x_len, logdet_tot
72 |
73 | def preprocess(self, y, y_lengths, y_max_length):
74 | if y_max_length is not None:
75 | y_max_length = torch.div(y_max_length, self.n_sqz, rounding_mode="floor") * self.n_sqz
76 | y = y[:, :, :y_max_length]
77 | y_lengths = torch.div(y_lengths, self.n_sqz, rounding_mode="floor") * self.n_sqz
78 | return y, y_lengths, y_max_length
79 |
80 | def store_inverse(self):
81 | self.glow_decoder.store_inverse()
82 |
--------------------------------------------------------------------------------
/TTS/tts/layers/overflow/plotting_utils.py:
--------------------------------------------------------------------------------
1 | from typing import Any
2 |
3 | import matplotlib.pyplot as plt
4 | import numpy as np
5 | import torch
6 |
7 |
8 | def validate_numpy_array(value: Any):
9 | r"""
10 | Validates the input and makes sure it returns a numpy array (i.e., on the CPU)
11 |
12 | Args:
13 | value (Any): the input value
14 |
15 | Raises:
16 | TypeError: if the value is not a numpy array or torch tensor
17 |
18 | Returns:
19 | np.ndarray: numpy array of the value
20 | """
21 | if isinstance(value, np.ndarray):
22 | pass
23 | elif isinstance(value, list):
24 | value = np.array(value)
25 | elif torch.is_tensor(value):
26 | value = value.cpu().numpy()
27 | else:
28 | raise TypeError("Value must be a numpy array, a torch tensor or a list")
29 |
30 | return value
31 |
32 |
33 | def get_spec_from_most_probable_state(log_alpha_scaled, means, decoder=None):
34 | """Get the most probable state means from the log_alpha_scaled.
35 |
36 | Args:
37 | log_alpha_scaled (torch.Tensor): Log alpha scaled values.
38 | - Shape: :math:`(T, N)`
39 | means (torch.Tensor): Means of the states.
40 | - Shape: :math:`(N, T, D_out)`
41 | decoder (torch.nn.Module): Decoder module to decode the latent to melspectrogram. Defaults to None.
42 | """
43 | max_state_numbers = torch.max(log_alpha_scaled, dim=1)[1]
44 | max_len = means.shape[0]
45 | n_mel_channels = means.shape[2]
46 | max_state_numbers = max_state_numbers.unsqueeze(1).unsqueeze(1).expand(max_len, 1, n_mel_channels)
47 | means = torch.gather(means, 1, max_state_numbers).squeeze(1).to(log_alpha_scaled.dtype)
48 | if decoder is not None:
49 | mel = (
50 | decoder(means.T.unsqueeze(0), torch.tensor([means.shape[0]], device=means.device), reverse=True)[0]
51 | .squeeze(0)
52 | .T
53 | )
54 | else:
55 | mel = means
56 | return mel
57 |
58 |
59 | def plot_transition_probabilities_to_numpy(states, transition_probabilities, output_fig=False):
60 | """Generates trainsition probabilities plot for the states and the probability of transition.
61 |
62 | Args:
63 | states (torch.IntTensor): the states
64 | transition_probabilities (torch.FloatTensor): the transition probabilities
65 | """
66 | states = validate_numpy_array(states)
67 | transition_probabilities = validate_numpy_array(transition_probabilities)
68 |
69 | fig, ax = plt.subplots(figsize=(30, 3))
70 | ax.plot(transition_probabilities, "o")
71 | ax.set_title("Transition probability of state")
72 | ax.set_xlabel("hidden state")
73 | ax.set_ylabel("probability")
74 | ax.set_xticks([i for i in range(len(transition_probabilities))]) # pylint: disable=unnecessary-comprehension
75 | ax.set_xticklabels([int(x) for x in states], rotation=90)
76 | plt.tight_layout()
77 | if not output_fig:
78 | plt.close()
79 | return fig
80 |
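A minimal sketch (not part of the source file) plotting random transition probabilities for ten states:

```python
import torch

from TTS.tts.layers.overflow.plotting_utils import plot_transition_probabilities_to_numpy

states = torch.arange(10)
transition_probabilities = torch.rand(10)
fig = plot_transition_probabilities_to_numpy(states, transition_probabilities, output_fig=True)
fig.savefig("transition_probs.png")
```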
--------------------------------------------------------------------------------
/TTS/tts/layers/tacotron/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/everydaycodings/MimicMania/eecbaae2c430785145afe48bc92a7634ab9fe660/TTS/tts/layers/tacotron/__init__.py
--------------------------------------------------------------------------------
/TTS/tts/layers/vits/discriminator.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 | from torch.nn.modules.conv import Conv1d
4 |
5 | from TTS.vocoder.models.hifigan_discriminator import DiscriminatorP, MultiPeriodDiscriminator
6 |
7 |
8 | class DiscriminatorS(torch.nn.Module):
9 | """HiFiGAN Scale Discriminator. Channel sizes are different from the original HiFiGAN.
10 |
11 | Args:
12 | use_spectral_norm (bool): if `True`, switch to spectral norm instead of weight norm.
13 | """
14 |
15 | def __init__(self, use_spectral_norm=False):
16 | super().__init__()
17 | norm_f = nn.utils.spectral_norm if use_spectral_norm else nn.utils.weight_norm
18 | self.convs = nn.ModuleList(
19 | [
20 | norm_f(Conv1d(1, 16, 15, 1, padding=7)),
21 | norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
22 | norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
23 | norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
24 | norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
25 | norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
26 | ]
27 | )
28 | self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
29 |
30 | def forward(self, x):
31 | """
32 | Args:
33 | x (Tensor): input waveform.
34 |
35 | Returns:
36 | Tensor: discriminator scores.
37 | List[Tensor]: list of features from the convolutional layers.
38 | """
39 | feat = []
40 | for l in self.convs:
41 | x = l(x)
42 | x = torch.nn.functional.leaky_relu(x, 0.1)
43 | feat.append(x)
44 | x = self.conv_post(x)
45 | feat.append(x)
46 | x = torch.flatten(x, 1, -1)
47 | return x, feat
48 |
49 |
50 | class VitsDiscriminator(nn.Module):
51 | """VITS discriminator wrapping one Scale Discriminator and a stack of Period Discriminator.
52 |
53 | ::
54 | waveform -> ScaleDiscriminator() -> scores_sd, feats_sd --> append() -> scores, feats
55 | |--> MultiPeriodDiscriminator() -> scores_mpd, feats_mpd ^
56 |
57 | Args:
58 | use_spectral_norm (bool): if `True`, switch to spectral norm instead of weight norm.
59 | """
60 |
61 | def __init__(self, periods=(2, 3, 5, 7, 11), use_spectral_norm=False):
62 | super().__init__()
63 | self.nets = nn.ModuleList()
64 | self.nets.append(DiscriminatorS(use_spectral_norm=use_spectral_norm))
65 | self.nets.extend([DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods])
66 |
67 | def forward(self, x, x_hat=None):
68 | """
69 | Args:
70 | x (Tensor): ground truth waveform.
71 | x_hat (Tensor): predicted waveform.
72 |
73 | Returns:
74 | List[Tensor]: discriminator scores.
75 | List[List[Tensor]]: list of list of features from each layers of each discriminator.
76 | """
77 | x_scores = []
78 | x_hat_scores = [] if x_hat is not None else None
79 | x_feats = []
80 | x_hat_feats = [] if x_hat is not None else None
81 | for net in self.nets:
82 | x_score, x_feat = net(x)
83 | x_scores.append(x_score)
84 | x_feats.append(x_feat)
85 | if x_hat is not None:
86 | x_hat_score, x_hat_feat = net(x_hat)
87 | x_hat_scores.append(x_hat_score)
88 | x_hat_feats.append(x_hat_feat)
89 | return x_scores, x_feats, x_hat_scores, x_hat_feats
90 |
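A minimal sketch scoring a real and a generated waveform (not part of the source file; random tensors stand in for audio):

```python
import torch

from TTS.tts.layers.vits.discriminator import VitsDiscriminator

disc = VitsDiscriminator()
wav_real = torch.randn(1, 1, 8192)  # [B, 1, T] ground-truth waveform (stand-in)
wav_fake = torch.randn(1, 1, 8192)  # [B, 1, T] generated waveform (stand-in)
real_scores, real_feats, fake_scores, fake_feats = disc(wav_real, wav_fake)
print(len(real_scores))             # 6: one scale + five period discriminators
```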
--------------------------------------------------------------------------------
/TTS/tts/models/__init__.py:
--------------------------------------------------------------------------------
1 | from typing import Dict, List, Union
2 |
3 | from TTS.utils.generic_utils import find_module
4 |
5 |
6 | def setup_model(config: "Coqpit", samples: Union[List[List], List[Dict]] = None) -> "BaseTTS":
7 | print(" > Using model: {}".format(config.model))
8 | # fetch the right model implementation.
9 | if "base_model" in config and config["base_model"] is not None:
10 | MyModel = find_module("TTS.tts.models", config.base_model.lower())
11 | else:
12 | MyModel = find_module("TTS.tts.models", config.model.lower())
13 | model = MyModel.init_from_config(config, samples)
14 | return model
15 |
--------------------------------------------------------------------------------
/TTS/tts/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/everydaycodings/MimicMania/eecbaae2c430785145afe48bc92a7634ab9fe660/TTS/tts/utils/__init__.py
--------------------------------------------------------------------------------
/TTS/tts/utils/data.py:
--------------------------------------------------------------------------------
1 | import bisect
2 |
3 | import numpy as np
4 | import torch
5 |
6 |
7 | def _pad_data(x, length):
8 | _pad = 0
9 | assert x.ndim == 1
10 | return np.pad(x, (0, length - x.shape[0]), mode="constant", constant_values=_pad)
11 |
12 |
13 | def prepare_data(inputs):
14 | max_len = max((len(x) for x in inputs))
15 | return np.stack([_pad_data(x, max_len) for x in inputs])
16 |
17 |
18 | def _pad_tensor(x, length):
19 | _pad = 0.0
20 | assert x.ndim == 2
21 | x = np.pad(x, [[0, 0], [0, length - x.shape[1]]], mode="constant", constant_values=_pad)
22 | return x
23 |
24 |
25 | def prepare_tensor(inputs, out_steps):
26 | max_len = max((x.shape[1] for x in inputs))
27 | remainder = max_len % out_steps
28 | pad_len = max_len + (out_steps - remainder) if remainder > 0 else max_len
29 | return np.stack([_pad_tensor(x, pad_len) for x in inputs])
30 |
31 |
32 | def _pad_stop_target(x: np.ndarray, length: int, pad_val=1) -> np.ndarray:
33 | """Pad stop target array.
34 |
35 | Args:
36 | x (np.ndarray): Stop target array.
37 | length (int): Length after padding.
38 | pad_val (int, optional): Padding value. Defaults to 1.
39 |
40 | Returns:
41 | np.ndarray: Padded stop target array.
42 | """
43 | assert x.ndim == 1
44 | return np.pad(x, (0, length - x.shape[0]), mode="constant", constant_values=pad_val)
45 |
46 |
47 | def prepare_stop_target(inputs, out_steps):
48 | """Pad row vectors with 1."""
49 | max_len = max((x.shape[0] for x in inputs))
50 | remainder = max_len % out_steps
51 | pad_len = max_len + (out_steps - remainder) if remainder > 0 else max_len
52 | return np.stack([_pad_stop_target(x, pad_len) for x in inputs])
53 |
54 |
55 | def pad_per_step(inputs, pad_len):
56 | return np.pad(inputs, [[0, 0], [0, 0], [0, pad_len]], mode="constant", constant_values=0.0)
57 |
58 |
59 | def get_length_balancer_weights(items: list, num_buckets=10):
60 | # get all durations
61 | audio_lengths = np.array([item["audio_length"] for item in items])
62 | # create the $num_buckets bucket classes based on the dataset max and min length
63 | max_length = int(max(audio_lengths))
64 | min_length = int(min(audio_lengths))
65 | step = int((max_length - min_length) / num_buckets) + 1
66 | buckets_classes = [i + step for i in range(min_length, (max_length - step) + num_buckets + 1, step)]
67 | # add each sample in their respective length bucket
68 | buckets_names = np.array(
69 | [buckets_classes[bisect.bisect_left(buckets_classes, item["audio_length"])] for item in items]
70 | )
71 | # count and compute the weights_bucket for each sample
72 | unique_buckets_names = np.unique(buckets_names).tolist()
73 | bucket_ids = [unique_buckets_names.index(l) for l in buckets_names]
74 | bucket_count = np.array([len(np.where(buckets_names == l)[0]) for l in unique_buckets_names])
75 | weight_bucket = 1.0 / bucket_count
76 | dataset_samples_weight = np.array([weight_bucket[l] for l in bucket_ids])
77 | # normalize
78 | dataset_samples_weight = dataset_samples_weight / np.linalg.norm(dataset_samples_weight)
79 | return torch.from_numpy(dataset_samples_weight).float()
80 |
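A minimal sketch of the padding helpers (not part of the source file):

```python
import numpy as np

from TTS.tts.utils.data import prepare_data, prepare_tensor

seqs = [np.array([1, 2, 3]), np.array([4, 5])]
print(prepare_data(seqs))                        # (2, 3) matrix, second row zero-padded

specs = [np.random.rand(80, 7), np.random.rand(80, 5)]
print(prepare_tensor(specs, out_steps=4).shape)  # (2, 80, 8): padded up to a multiple of 4
```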
--------------------------------------------------------------------------------
/TTS/tts/utils/languages.py:
--------------------------------------------------------------------------------
1 | import os
2 | from typing import Any, Dict, List
3 |
4 | import fsspec
5 | import numpy as np
6 | import torch
7 | from coqpit import Coqpit
8 |
9 | from TTS.config import check_config_and_model_args
10 | from TTS.tts.utils.managers import BaseIDManager
11 |
12 |
13 | class LanguageManager(BaseIDManager):
14 | """Manage the languages for multi-lingual 🐸TTS models. Load a datafile and parse the information
15 | in a way that can be queried by language.
16 |
17 | Args:
18 | language_ids_file_path (str, optional): Path to the metafile that maps language names to ids used by
19 | TTS models. Defaults to "".
20 | config (Coqpit, optional): Coqpit config that contains the language information in the datasets field.
21 | Defaults to None.
22 |
23 | Examples:
24 | >>> manager = LanguageManager(language_ids_file_path=language_ids_file_path)
25 | >>> language_id_mapper = manager.language_ids
26 | """
27 |
28 | def __init__(
29 | self,
30 | language_ids_file_path: str = "",
31 | config: Coqpit = None,
32 | ):
33 | super().__init__(id_file_path=language_ids_file_path)
34 |
35 | if config:
36 | self.set_language_ids_from_config(config)
37 |
38 | @property
39 | def num_languages(self) -> int:
40 | return len(list(self.name_to_id.keys()))
41 |
42 | @property
43 | def language_names(self) -> List:
44 | return list(self.name_to_id.keys())
45 |
46 | @staticmethod
47 | def parse_language_ids_from_config(c: Coqpit) -> Dict:
48 | """Set language id from config.
49 |
50 | Args:
51 | c (Coqpit): Config
52 |
53 | Returns:
54 | Tuple[Dict, int]: Language ID mapping and the number of languages.
55 | """
56 | languages = set({})
57 | for dataset in c.datasets:
58 | if "language" in dataset:
59 | languages.add(dataset["language"])
60 | else:
61 | raise ValueError(f"Dataset {dataset['name']} has no language specified.")
62 | return {name: i for i, name in enumerate(sorted(list(languages)))}
63 |
64 | def set_language_ids_from_config(self, c: Coqpit) -> None:
65 | """Set language IDs from config samples.
66 |
67 | Args:
68 | c (Coqpit): Config.
69 | """
70 | self.name_to_id = self.parse_language_ids_from_config(c)
71 |
72 | @staticmethod
73 | def parse_ids_from_data(items: List, parse_key: str) -> Any:
74 | raise NotImplementedError
75 |
76 | def set_ids_from_data(self, items: List, parse_key: str) -> Any:
77 | raise NotImplementedError
78 |
79 | def save_ids_to_file(self, file_path: str) -> None:
80 | """Save language IDs to a json file.
81 |
82 | Args:
83 | file_path (str): Path to the output file.
84 | """
85 | self._save_json(file_path, self.name_to_id)
86 |
87 | @staticmethod
88 | def init_from_config(config: Coqpit) -> "LanguageManager":
89 | """Initialize the language manager from a Coqpit config.
90 |
91 | Args:
92 | config (Coqpit): Coqpit config.
93 | """
94 | language_manager = None
95 | if check_config_and_model_args(config, "use_language_embedding", True):
96 | if config.get("language_ids_file", None):
97 | language_manager = LanguageManager(language_ids_file_path=config.language_ids_file)
98 | language_manager = LanguageManager(config=config)
99 | return language_manager
100 |
101 |
102 | def _set_file_path(path):
103 | """Find the language_ids.json under the given path or the above it.
104 | Intended to band aid the different paths returned in restored and continued training."""
105 | path_restore = os.path.join(os.path.dirname(path), "language_ids.json")
106 | path_continue = os.path.join(path, "language_ids.json")
107 | fs = fsspec.get_mapper(path).fs
108 | if fs.exists(path_restore):
109 | return path_restore
110 | if fs.exists(path_continue):
111 | return path_continue
112 | return None
113 |
114 |
115 | def get_language_balancer_weights(items: list):
116 | language_names = np.array([item["language"] for item in items])
117 | unique_language_names = np.unique(language_names).tolist()
118 | language_ids = [unique_language_names.index(l) for l in language_names]
119 | language_count = np.array([len(np.where(language_names == l)[0]) for l in unique_language_names])
120 | weight_language = 1.0 / language_count
121 | # get weight for each sample
122 | dataset_samples_weight = np.array([weight_language[l] for l in language_ids])
123 | # normalize
124 | dataset_samples_weight = dataset_samples_weight / np.linalg.norm(dataset_samples_weight)
125 | return torch.from_numpy(dataset_samples_weight).float()
126 |
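A minimal sketch of the balancer weights (not part of the source file): under-represented languages receive proportionally larger sampling weights:

```python
from TTS.tts.utils.languages import get_language_balancer_weights

items = [{"language": "en"}, {"language": "en"}, {"language": "en"}, {"language": "pt"}]
weights = get_language_balancer_weights(items)
print(weights)  # the single "pt" sample gets ~3x the weight of each "en" sample
```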
--------------------------------------------------------------------------------
/TTS/tts/utils/measures.py:
--------------------------------------------------------------------------------
1 | def alignment_diagonal_score(alignments, binary=False):
2 | """
3 | Compute how diagonal the alignment predictions are. It is useful
4 | for measuring the alignment consistency of a model.
5 | Args:
6 | alignments (torch.Tensor): batch of alignments.
7 | binary (bool): if True, ignore scores and consider attention
8 | as a binary mask.
9 | Shape:
10 | - alignments : :math:`[B, T_de, T_en]`
11 | """
12 | maxs = alignments.max(dim=1)[0]
13 | if binary:
14 | maxs[maxs > 0] = 1
15 | return maxs.mean(dim=1).mean(dim=0).item()
16 |
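A quick usage sketch with a random attention batch, just to illustrate the expected tensor shape (values here are illustrative, not from a trained model):

import torch

from TTS.tts.utils.measures import alignment_diagonal_score

alignments = torch.rand(4, 120, 50)           # [B, T_de, T_en] attention maps
score = alignment_diagonal_score(alignments)  # mean of per-encoder-step maxima; higher means sharper attention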
--------------------------------------------------------------------------------
/TTS/tts/utils/monotonic_align/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/everydaycodings/MimicMania/eecbaae2c430785145afe48bc92a7634ab9fe660/TTS/tts/utils/monotonic_align/__init__.py
--------------------------------------------------------------------------------
/TTS/tts/utils/monotonic_align/core.pyx:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | cimport cython
4 | cimport numpy as np
5 |
6 | from cython.parallel import prange
7 |
8 |
9 | @cython.boundscheck(False)
10 | @cython.wraparound(False)
11 | cdef void maximum_path_each(int[:,::1] path, float[:,::1] value, int t_x, int t_y, float max_neg_val) nogil:
12 | cdef int x
13 | cdef int y
14 | cdef float v_prev
15 | cdef float v_cur
16 | cdef float tmp
17 | cdef int index = t_x - 1
18 |
19 | for y in range(t_y):
20 | for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)):
21 | if x == y:
22 | v_cur = max_neg_val
23 | else:
24 | v_cur = value[x, y-1]
25 | if x == 0:
26 | if y == 0:
27 | v_prev = 0.
28 | else:
29 | v_prev = max_neg_val
30 | else:
31 | v_prev = value[x-1, y-1]
32 | value[x, y] = max(v_cur, v_prev) + value[x, y]
33 |
34 | for y in range(t_y - 1, -1, -1):
35 | path[index, y] = 1
36 | if index != 0 and (index == y or value[index, y-1] < value[index-1, y-1]):
37 | index = index - 1
38 |
39 |
40 | @cython.boundscheck(False)
41 | @cython.wraparound(False)
42 | cpdef void maximum_path_c(int[:,:,::1] paths, float[:,:,::1] values, int[::1] t_xs, int[::1] t_ys, float max_neg_val=-1e9) nogil:
43 | cdef int b = values.shape[0]
44 |
45 | cdef int i
46 | for i in prange(b, nogil=True):
47 | maximum_path_each(paths[i], values[i], t_xs[i], t_ys[i], max_neg_val)
48 |
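For reference, the same dynamic programming can be written in plain NumPy. This is only an illustrative, unvectorized transcription of `maximum_path_each` (not part of the library, and far slower than the Cython version):

import numpy as np

def maximum_path_numpy(value, t_x, t_y, max_neg_val=-1e9):
    """Slow reference of maximum_path_each: monotonic alignment search over a [t_x, t_y] score grid."""
    value = value.astype(np.float32).copy()
    path = np.zeros_like(value, dtype=np.int32)
    # forward pass: accumulate the best monotonic score ending at (x, y)
    for y in range(t_y):
        for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)):
            v_cur = max_neg_val if x == y else value[x, y - 1]
            if x == 0:
                v_prev = 0.0 if y == 0 else max_neg_val
            else:
                v_prev = value[x - 1, y - 1]
            value[x, y] = max(v_cur, v_prev) + value[x, y]
    # backward pass: trace back the chosen path
    index = t_x - 1
    for y in range(t_y - 1, -1, -1):
        path[index, y] = 1
        if index != 0 and (index == y or value[index, y - 1] < value[index - 1, y - 1]):
            index -= 1
    return path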
--------------------------------------------------------------------------------
/TTS/tts/utils/monotonic_align/setup.py:
--------------------------------------------------------------------------------
1 | # from distutils.core import setup
2 | # from Cython.Build import cythonize
3 | # import numpy
4 |
5 | # setup(name='monotonic_align',
6 | # ext_modules=cythonize("core.pyx"),
7 | # include_dirs=[numpy.get_include()])
8 |
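The build recipe above is kept commented out in this fork; if the extension ever had to be rebuilt by hand, the usual Cython invocation (run from this directory after uncommenting `setup.py`, with Cython and numpy installed) would be:

python setup.py build_ext --inplace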
--------------------------------------------------------------------------------
/TTS/tts/utils/text/__init__.py:
--------------------------------------------------------------------------------
1 | from TTS.tts.utils.text.tokenizer import TTSTokenizer
2 |
--------------------------------------------------------------------------------
/TTS/tts/utils/text/chinese_mandarin/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/everydaycodings/MimicMania/eecbaae2c430785145afe48bc92a7634ab9fe660/TTS/tts/utils/text/chinese_mandarin/__init__.py
--------------------------------------------------------------------------------
/TTS/tts/utils/text/chinese_mandarin/numbers.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | # Licensed under WTFPL or the Unlicense or CC0.
5 | # This uses Python 3, but it's easy to port to Python 2 by changing
6 | # strings to u'xx'.
7 |
8 | import itertools
9 | import re
10 |
11 |
12 | def _num2chinese(num: str, big=False, simp=True, o=False, twoalt=False) -> str:
13 | """Convert Arabic numerals (0 -> 9) to Chinese hanzi numerals (〇 -> 九).
14 |
15 | Args:
16 | num (str): arabic number to convert
17 | big (bool, optional): use financial characters. Defaults to False.
18 | simp (bool, optional): use simplified characters instead of traditional characters. Defaults to True.
19 | o (bool, optional): use 〇 for 'zero'. Defaults to False.
20 | twoalt (bool, optional): use 两/兩 for 'two' when appropriate. Defaults to False.
21 |
22 | Raises:
23 | ValueError: if the number is 1e48 or larger
24 | ValueError: if the number contains an 'e' exponent (scientific notation)
25 |
26 | Returns:
27 | str: converted number as hanzi characters
28 | """
29 |
30 | # check num first
31 | nd = str(num)
32 | if abs(float(nd)) >= 1e48:
33 | raise ValueError("number out of range")
34 | if "e" in nd:
35 | raise ValueError("scientific notation is not supported")
36 | c_symbol = "正负点" if simp else "正負點"
37 | if o: # formal
38 | twoalt = False
39 | if big:
40 | c_basic = "零壹贰叁肆伍陆柒捌玖" if simp else "零壹貳參肆伍陸柒捌玖"
41 | c_unit1 = "拾佰仟"
42 | c_twoalt = "贰" if simp else "貳"
43 | else:
44 | c_basic = "〇一二三四五六七八九" if o else "零一二三四五六七八九"
45 | c_unit1 = "十百千"
46 | if twoalt:
47 | c_twoalt = "两" if simp else "兩"
48 | else:
49 | c_twoalt = "二"
50 | c_unit2 = "万亿兆京垓秭穰沟涧正载" if simp else "萬億兆京垓秭穰溝澗正載"
51 | revuniq = lambda l: "".join(k for k, g in itertools.groupby(reversed(l)))
52 | nd = str(num)
53 | result = []
54 | if nd[0] == "+":
55 | result.append(c_symbol[0])
56 | elif nd[0] == "-":
57 | result.append(c_symbol[1])
58 | if "." in nd:
59 | integer, remainder = nd.lstrip("+-").split(".")
60 | else:
61 | integer, remainder = nd.lstrip("+-"), None
62 | if int(integer):
63 | splitted = [integer[max(i - 4, 0) : i] for i in range(len(integer), 0, -4)]
64 | intresult = []
65 | for nu, unit in enumerate(splitted):
66 | # special cases
67 | if int(unit) == 0: # 0000
68 | intresult.append(c_basic[0])
69 | continue
70 | if nu > 0 and int(unit) == 2: # 0002
71 | intresult.append(c_twoalt + c_unit2[nu - 1])
72 | continue
73 | ulist = []
74 | unit = unit.zfill(4)
75 | for nc, ch in enumerate(reversed(unit)):
76 | if ch == "0":
77 | if ulist: # ???0
78 | ulist.append(c_basic[0])
79 | elif nc == 0:
80 | ulist.append(c_basic[int(ch)])
81 | elif nc == 1 and ch == "1" and unit[1] == "0":
82 | # special case for tens
83 | # edit the 'elif' if you don't like
84 | # 十四, 三千零十四, 三千三百一十四
85 | ulist.append(c_unit1[0])
86 | elif nc > 1 and ch == "2":
87 | ulist.append(c_twoalt + c_unit1[nc - 1])
88 | else:
89 | ulist.append(c_basic[int(ch)] + c_unit1[nc - 1])
90 | ustr = revuniq(ulist)
91 | if nu == 0:
92 | intresult.append(ustr)
93 | else:
94 | intresult.append(ustr + c_unit2[nu - 1])
95 | result.append(revuniq(intresult).strip(c_basic[0]))
96 | else:
97 | result.append(c_basic[0])
98 | if remainder:
99 | result.append(c_symbol[2])
100 | result.append("".join(c_basic[int(ch)] for ch in remainder))
101 | return "".join(result)
102 |
103 |
104 | def _number_replace(match) -> str:
105 | """Regex replacement callback that converts the matched number into Chinese characters.
106 |
107 | Args:
108 | match (re.Match): numbers regex matches
109 |
110 | Returns:
111 | str: replaced characters for the numbers
112 | """
113 | match_str: str = match.group()
114 | return _num2chinese(match_str)
115 |
116 |
117 | def replace_numbers_to_characters_in_text(text: str) -> str:
118 | """Replace all Arabic numerals in a text with their simplified Chinese character equivalents.
119 |
120 | Args:
121 | text (str): input text to transform
122 |
123 | Returns:
124 | str: output text
125 | """
126 | text = re.sub(r"[0-9]+", _number_replace, text)
127 | return text
128 |
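A short usage sketch; with the default options a year such as 2021 should be rendered roughly as follows:

from TTS.tts.utils.text.chinese_mandarin.numbers import replace_numbers_to_characters_in_text

print(replace_numbers_to_characters_in_text("今年是2021年"))
# expected: 今年是二千零二十一年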
--------------------------------------------------------------------------------
/TTS/tts/utils/text/chinese_mandarin/phonemizer.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 |
3 | import jieba
4 | import pypinyin
5 |
6 | from .pinyinToPhonemes import PINYIN_DICT
7 |
8 |
9 | def _chinese_character_to_pinyin(text: str) -> List[str]:
10 | pinyins = pypinyin.pinyin(text, style=pypinyin.Style.TONE3, heteronym=False, neutral_tone_with_five=True)
11 | pinyins_flat_list = [item for sublist in pinyins for item in sublist]
12 | return pinyins_flat_list
13 |
14 |
15 | def _chinese_pinyin_to_phoneme(pinyin: str) -> str:
16 | segment = pinyin[:-1]
17 | tone = pinyin[-1]
18 | phoneme = PINYIN_DICT.get(segment, [""])[0]
19 | return phoneme + tone
20 |
21 |
22 | def chinese_text_to_phonemes(text: str, seperator: str = "|") -> str:
23 | tokenized_text = jieba.cut(text, HMM=False)
24 | tokenized_text = " ".join(tokenized_text)
25 | pinyined_text: List[str] = _chinese_character_to_pinyin(tokenized_text)
26 |
27 | results: List[str] = []
28 |
29 | for token in pinyined_text:
30 | if token[-1] in "12345": # TODO transform to is_pinyin()
31 | pinyin_phonemes = _chinese_pinyin_to_phoneme(token)
32 |
33 | results += list(pinyin_phonemes)
34 | else: # is punctuation or other
35 | results += list(token)
36 |
37 | return seperator.join(results)
38 |
--------------------------------------------------------------------------------
/TTS/tts/utils/text/cmudict.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import re
4 |
5 | VALID_SYMBOLS = [
6 | "AA",
7 | "AA0",
8 | "AA1",
9 | "AA2",
10 | "AE",
11 | "AE0",
12 | "AE1",
13 | "AE2",
14 | "AH",
15 | "AH0",
16 | "AH1",
17 | "AH2",
18 | "AO",
19 | "AO0",
20 | "AO1",
21 | "AO2",
22 | "AW",
23 | "AW0",
24 | "AW1",
25 | "AW2",
26 | "AY",
27 | "AY0",
28 | "AY1",
29 | "AY2",
30 | "B",
31 | "CH",
32 | "D",
33 | "DH",
34 | "EH",
35 | "EH0",
36 | "EH1",
37 | "EH2",
38 | "ER",
39 | "ER0",
40 | "ER1",
41 | "ER2",
42 | "EY",
43 | "EY0",
44 | "EY1",
45 | "EY2",
46 | "F",
47 | "G",
48 | "HH",
49 | "IH",
50 | "IH0",
51 | "IH1",
52 | "IH2",
53 | "IY",
54 | "IY0",
55 | "IY1",
56 | "IY2",
57 | "JH",
58 | "K",
59 | "L",
60 | "M",
61 | "N",
62 | "NG",
63 | "OW",
64 | "OW0",
65 | "OW1",
66 | "OW2",
67 | "OY",
68 | "OY0",
69 | "OY1",
70 | "OY2",
71 | "P",
72 | "R",
73 | "S",
74 | "SH",
75 | "T",
76 | "TH",
77 | "UH",
78 | "UH0",
79 | "UH1",
80 | "UH2",
81 | "UW",
82 | "UW0",
83 | "UW1",
84 | "UW2",
85 | "V",
86 | "W",
87 | "Y",
88 | "Z",
89 | "ZH",
90 | ]
91 |
92 |
93 | class CMUDict:
94 | """Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict"""
95 |
96 | def __init__(self, file_or_path, keep_ambiguous=True):
97 | if isinstance(file_or_path, str):
98 | with open(file_or_path, encoding="latin-1") as f:
99 | entries = _parse_cmudict(f)
100 | else:
101 | entries = _parse_cmudict(file_or_path)
102 | if not keep_ambiguous:
103 | entries = {word: pron for word, pron in entries.items() if len(pron) == 1}
104 | self._entries = entries
105 |
106 | def __len__(self):
107 | return len(self._entries)
108 |
109 | def lookup(self, word):
110 | """Returns list of ARPAbet pronunciations of the given word."""
111 | return self._entries.get(word.upper())
112 |
113 | @staticmethod
114 | def get_arpabet(word, cmudict, punctuation_symbols):
115 | first_symbol, last_symbol = "", ""
116 | if word and word[0] in punctuation_symbols:
117 | first_symbol = word[0]
118 | word = word[1:]
119 | if word and word[-1] in punctuation_symbols:
120 | last_symbol = word[-1]
121 | word = word[:-1]
122 | arpabet = cmudict.lookup(word)
123 | if arpabet is not None:
124 | return first_symbol + "{%s}" % arpabet[0] + last_symbol
125 | return first_symbol + word + last_symbol
126 |
127 |
128 | _alt_re = re.compile(r"\([0-9]+\)")
129 |
130 |
131 | def _parse_cmudict(file):
132 | cmudict = {}
133 | for line in file:
134 | if line and (line[0] >= "A" and line[0] <= "Z" or line[0] == "'"):
135 | parts = line.split(" ")
136 | word = re.sub(_alt_re, "", parts[0])
137 | pronunciation = _get_pronunciation(parts[1])
138 | if pronunciation:
139 | if word in cmudict:
140 | cmudict[word].append(pronunciation)
141 | else:
142 | cmudict[word] = [pronunciation]
143 | return cmudict
144 |
145 |
146 | def _get_pronunciation(s):
147 | parts = s.strip().split(" ")
148 | for part in parts:
149 | if part not in VALID_SYMBOLS:
150 | return None
151 | return " ".join(parts)
152 |
--------------------------------------------------------------------------------
/TTS/tts/utils/text/english/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/everydaycodings/MimicMania/eecbaae2c430785145afe48bc92a7634ab9fe660/TTS/tts/utils/text/english/__init__.py
--------------------------------------------------------------------------------
/TTS/tts/utils/text/english/abbreviations.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | # List of (regular expression, replacement) pairs for abbreviations in english:
4 | abbreviations_en = [
5 | (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
6 | for x in [
7 | ("mrs", "misess"),
8 | ("mr", "mister"),
9 | ("dr", "doctor"),
10 | ("st", "saint"),
11 | ("co", "company"),
12 | ("jr", "junior"),
13 | ("maj", "major"),
14 | ("gen", "general"),
15 | ("drs", "doctors"),
16 | ("rev", "reverend"),
17 | ("lt", "lieutenant"),
18 | ("hon", "honorable"),
19 | ("sgt", "sergeant"),
20 | ("capt", "captain"),
21 | ("esq", "esquire"),
22 | ("ltd", "limited"),
23 | ("col", "colonel"),
24 | ("ft", "fort"),
25 | ]
26 | ]
27 |
--------------------------------------------------------------------------------
/TTS/tts/utils/text/english/number_norm.py:
--------------------------------------------------------------------------------
1 | """ from https://github.com/keithito/tacotron """
2 |
3 | import re
4 | from typing import Dict
5 |
6 | import inflect
7 |
8 | _inflect = inflect.engine()
9 | _comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")
10 | _decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)")
11 | _currency_re = re.compile(r"(£|\$|¥)([0-9\,\.]*[0-9]+)")
12 | _ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")
13 | _number_re = re.compile(r"-?[0-9]+")
14 |
15 |
16 | def _remove_commas(m):
17 | return m.group(1).replace(",", "")
18 |
19 |
20 | def _expand_decimal_point(m):
21 | return m.group(1).replace(".", " point ")
22 |
23 |
24 | def __expand_currency(value: str, inflection: Dict[float, str]) -> str:
25 | parts = value.replace(",", "").split(".")
26 | if len(parts) > 2:
27 | return f"{value} {inflection[2]}" # Unexpected format
28 | text = []
29 | integer = int(parts[0]) if parts[0] else 0
30 | if integer > 0:
31 | integer_unit = inflection.get(integer, inflection[2])
32 | text.append(f"{integer} {integer_unit}")
33 | fraction = int(parts[1]) if len(parts) > 1 and parts[1] else 0
34 | if fraction > 0:
35 | fraction_unit = inflection.get(fraction / 100, inflection[0.02])
36 | text.append(f"{fraction} {fraction_unit}")
37 | if len(text) == 0:
38 | return f"zero {inflection[2]}"
39 | return " ".join(text)
40 |
41 |
42 | def _expand_currency(m: "re.Match") -> str:
43 | currencies = {
44 | "$": {
45 | 0.01: "cent",
46 | 0.02: "cents",
47 | 1: "dollar",
48 | 2: "dollars",
49 | },
50 | "€": {
51 | 0.01: "cent",
52 | 0.02: "cents",
53 | 1: "euro",
54 | 2: "euros",
55 | },
56 | "£": {
57 | 0.01: "penny",
58 | 0.02: "pence",
59 | 1: "pound sterling",
60 | 2: "pounds sterling",
61 | },
62 | "¥": {
63 | # TODO rin
64 | 0.02: "sen",
65 | 2: "yen",
66 | },
67 | }
68 | unit = m.group(1)
69 | currency = currencies[unit]
70 | value = m.group(2)
71 | return __expand_currency(value, currency)
72 |
73 |
74 | def _expand_ordinal(m):
75 | return _inflect.number_to_words(m.group(0))
76 |
77 |
78 | def _expand_number(m):
79 | num = int(m.group(0))
80 | if 1000 < num < 3000:
81 | if num == 2000:
82 | return "two thousand"
83 | if 2000 < num < 2010:
84 | return "two thousand " + _inflect.number_to_words(num % 100)
85 | if num % 100 == 0:
86 | return _inflect.number_to_words(num // 100) + " hundred"
87 | return _inflect.number_to_words(num, andword="", zero="oh", group=2).replace(", ", " ")
88 | return _inflect.number_to_words(num, andword="")
89 |
90 |
91 | def normalize_numbers(text):
92 | text = re.sub(_comma_number_re, _remove_commas, text)
93 | text = re.sub(_currency_re, _expand_currency, text)
94 | text = re.sub(_decimal_number_re, _expand_decimal_point, text)
95 | text = re.sub(_ordinal_re, _expand_ordinal, text)
96 | text = re.sub(_number_re, _expand_number, text)
97 | return text
98 |
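A short usage sketch; the exact wording depends on the installed `inflect` version, so treat the expected output as indicative:

from TTS.tts.utils.text.english.number_norm import normalize_numbers

print(normalize_numbers("The 3rd ticket costs $2.50."))
# expected: "The third ticket costs two dollars fifty cents."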
--------------------------------------------------------------------------------
/TTS/tts/utils/text/english/time_norm.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | import inflect
4 |
5 | _inflect = inflect.engine()
6 |
7 | _time_re = re.compile(
8 | r"""\b
9 | ((0?[0-9])|(1[0-1])|(1[2-9])|(2[0-3])) # hours
10 | :
11 | ([0-5][0-9]) # minutes
12 | \s*(a\.m\.|am|pm|p\.m\.|a\.m|p\.m)? # am/pm
13 | \b""",
14 | re.IGNORECASE | re.X,
15 | )
16 |
17 |
18 | def _expand_num(n: int) -> str:
19 | return _inflect.number_to_words(n)
20 |
21 |
22 | def _expand_time_english(match: "re.Match") -> str:
23 | hour = int(match.group(1))
24 | past_noon = hour >= 12
25 | time = []
26 | if hour > 12:
27 | hour -= 12
28 | elif hour == 0:
29 | hour = 12
30 | past_noon = True
31 | time.append(_expand_num(hour))
32 |
33 | minute = int(match.group(6))
34 | if minute > 0:
35 | if minute < 10:
36 | time.append("oh")
37 | time.append(_expand_num(minute))
38 | am_pm = match.group(7)
39 | if am_pm is None:
40 | time.append("p m" if past_noon else "a m")
41 | else:
42 | time.extend(list(am_pm.replace(".", "")))
43 | return " ".join(time)
44 |
45 |
46 | def expand_time_english(text: str) -> str:
47 | return re.sub(_time_re, _expand_time_english, text)
48 |
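Usage sketch (again, the exact number words come from `inflect`, so the expected string is indicative):

from TTS.tts.utils.text.english.time_norm import expand_time_english

print(expand_time_english("The call is at 3:05 pm."))
# expected: "The call is at three oh five p m."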
--------------------------------------------------------------------------------
/TTS/tts/utils/text/french/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/everydaycodings/MimicMania/eecbaae2c430785145afe48bc92a7634ab9fe660/TTS/tts/utils/text/french/__init__.py
--------------------------------------------------------------------------------
/TTS/tts/utils/text/french/abbreviations.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | # List of (regular expression, replacement) pairs for abbreviations in french:
4 | abbreviations_fr = [
5 | (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
6 | for x in [
7 | ("M", "monsieur"),
8 | ("Mlle", "mademoiselle"),
9 | ("Mlles", "mesdemoiselles"),
10 | ("Mme", "Madame"),
11 | ("Mmes", "Mesdames"),
12 | ("N.B", "nota bene"),
13 | ("M", "monsieur"),
14 | ("p.c.q", "parce que"),
15 | ("Pr", "professeur"),
16 | ("qqch", "quelque chose"),
17 | ("rdv", "rendez-vous"),
18 | ("max", "maximum"),
19 | ("min", "minimum"),
20 | ("no", "numéro"),
21 | ("adr", "adresse"),
22 | ("dr", "docteur"),
23 | ("st", "saint"),
24 | ("co", "compagnie"),
25 | ("jr", "junior"),
26 | ("sgt", "sergent"),
27 | ("capt", "capitaine"),
28 | ("col", "colonel"),
29 | ("av", "avenue"),
30 | ("av. J.-C", "avant Jésus-Christ"),
31 | ("apr. J.-C", "après Jésus-Christ"),
32 | ("art", "article"),
33 | ("boul", "boulevard"),
34 | ("c.-à-d", "c’est-à-dire"),
35 | ("etc", "et cetera"),
36 | ("ex", "exemple"),
37 | ("excl", "exclusivement"),
38 | ("boul", "boulevard"),
39 | ]
40 | ] + [
41 | (re.compile("\\b%s" % x[0]), x[1])
42 | for x in [
43 | ("Mlle", "mademoiselle"),
44 | ("Mlles", "mesdemoiselles"),
45 | ("Mme", "Madame"),
46 | ("Mmes", "Mesdames"),
47 | ]
48 | ]
49 |
--------------------------------------------------------------------------------
/TTS/tts/utils/text/japanese/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/everydaycodings/MimicMania/eecbaae2c430785145afe48bc92a7634ab9fe660/TTS/tts/utils/text/japanese/__init__.py
--------------------------------------------------------------------------------
/TTS/tts/utils/text/korean/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/everydaycodings/MimicMania/eecbaae2c430785145afe48bc92a7634ab9fe660/TTS/tts/utils/text/korean/__init__.py
--------------------------------------------------------------------------------
/TTS/tts/utils/text/korean/ko_dictionary.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # Add the word you want to the dictionary.
3 | etc_dictionary = {"1+1": "원플러스원", "2+1": "투플러스원"}
4 |
5 |
6 | english_dictionary = {
7 | "KOREA": "코리아",
8 | "IDOL": "아이돌",
9 | "IT": "아이티",
10 | "IQ": "아이큐",
11 | "UP": "업",
12 | "DOWN": "다운",
13 | "PC": "피씨",
14 | "CCTV": "씨씨티비",
15 | "SNS": "에스엔에스",
16 | "AI": "에이아이",
17 | "CEO": "씨이오",
18 | "A": "에이",
19 | "B": "비",
20 | "C": "씨",
21 | "D": "디",
22 | "E": "이",
23 | "F": "에프",
24 | "G": "지",
25 | "H": "에이치",
26 | "I": "아이",
27 | "J": "제이",
28 | "K": "케이",
29 | "L": "엘",
30 | "M": "엠",
31 | "N": "엔",
32 | "O": "오",
33 | "P": "피",
34 | "Q": "큐",
35 | "R": "알",
36 | "S": "에스",
37 | "T": "티",
38 | "U": "유",
39 | "V": "브이",
40 | "W": "더블유",
41 | "X": "엑스",
42 | "Y": "와이",
43 | "Z": "제트",
44 | }
45 |
--------------------------------------------------------------------------------
/TTS/tts/utils/text/korean/korean.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # Code based on https://github.com/carpedm20/multi-speaker-tacotron-tensorflow/blob/master/text/korean.py
3 | import re
4 |
5 | from TTS.tts.utils.text.korean.ko_dictionary import english_dictionary, etc_dictionary
6 |
7 |
8 | def normalize(text):
9 | text = text.strip()
10 | text = re.sub("[⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]", "", text)
11 | text = normalize_with_dictionary(text, etc_dictionary)
12 | text = normalize_english(text)
13 | text = text.lower()
14 | return text
15 |
16 |
17 | def normalize_with_dictionary(text, dic):
18 | if any(key in text for key in dic.keys()):
19 | pattern = re.compile("|".join(re.escape(key) for key in dic.keys()))
20 | return pattern.sub(lambda x: dic[x.group()], text)
21 | return text
22 |
23 |
24 | def normalize_english(text):
25 | def fn(m):
26 | word = m.group()
27 | if word in english_dictionary:
28 | return english_dictionary.get(word)
29 | return word
30 |
31 | text = re.sub("([A-Za-z]+)", fn, text)
32 | return text
33 |
--------------------------------------------------------------------------------
/TTS/tts/utils/text/korean/phonemizer.py:
--------------------------------------------------------------------------------
1 | from jamo import hangul_to_jamo
2 |
3 | from TTS.tts.utils.text.korean.korean import normalize
4 |
5 | g2p = None
6 |
7 |
8 | def korean_text_to_phonemes(text, character: str = "hangeul") -> str:
9 | """
10 |
11 | The input and output values look the same, but they are different in Unicode.
12 |
13 | example :
14 |
15 | input = '하늘' (Unicode : \ud558\ub298), (하 + 늘)
16 | output = '하늘' (Unicode :\u1112\u1161\u1102\u1173\u11af), (ᄒ + ᅡ + ᄂ + ᅳ + ᆯ)
17 |
18 | """
19 | global g2p # pylint: disable=global-statement
20 | if g2p is None:
21 | from g2pkk import G2p
22 |
23 | g2p = G2p()
24 |
25 | if character == "english":
26 | from anyascii import anyascii
27 |
28 | text = normalize(text)
29 | text = g2p(text)
30 | text = anyascii(text)
31 | return text
32 |
33 | text = normalize(text)
34 | text = g2p(text)
35 | text = list(hangul_to_jamo(text)) # '하늘' --> ['ᄒ', 'ᅡ', 'ᄂ', 'ᅳ', 'ᆯ']
36 | return "".join(text)
37 |
--------------------------------------------------------------------------------
/TTS/tts/utils/text/phonemizers/__init__.py:
--------------------------------------------------------------------------------
1 | from TTS.tts.utils.text.phonemizers.base import BasePhonemizer
2 | from TTS.tts.utils.text.phonemizers.espeak_wrapper import ESpeak
3 | from TTS.tts.utils.text.phonemizers.gruut_wrapper import Gruut
4 | from TTS.tts.utils.text.phonemizers.ja_jp_phonemizer import JA_JP_Phonemizer
5 | from TTS.tts.utils.text.phonemizers.ko_kr_phonemizer import KO_KR_Phonemizer
6 | from TTS.tts.utils.text.phonemizers.zh_cn_phonemizer import ZH_CN_Phonemizer
7 |
8 | PHONEMIZERS = {b.name(): b for b in (ESpeak, Gruut, JA_JP_Phonemizer)}
9 |
10 |
11 | ESPEAK_LANGS = list(ESpeak.supported_languages().keys())
12 | GRUUT_LANGS = list(Gruut.supported_languages())
13 |
14 |
15 | # Dict setting default phonemizers for each language
16 | # Add Gruut languages
17 | _ = [Gruut.name()] * len(GRUUT_LANGS)
18 | DEF_LANG_TO_PHONEMIZER = dict(list(zip(GRUUT_LANGS, _)))
19 |
20 |
21 | # Add ESpeak languages and override any existing ones
22 | _ = [ESpeak.name()] * len(ESPEAK_LANGS)
23 | _new_dict = dict(list(zip(list(ESPEAK_LANGS), _)))
24 | DEF_LANG_TO_PHONEMIZER.update(_new_dict)
25 |
26 | # Force default for some languages
27 | DEF_LANG_TO_PHONEMIZER["en"] = DEF_LANG_TO_PHONEMIZER["en-us"]
28 | DEF_LANG_TO_PHONEMIZER["ja-jp"] = JA_JP_Phonemizer.name()
29 | DEF_LANG_TO_PHONEMIZER["zh-cn"] = ZH_CN_Phonemizer.name()
30 | DEF_LANG_TO_PHONEMIZER["ko-kr"] = KO_KR_Phonemizer.name()
31 |
32 |
33 | def get_phonemizer_by_name(name: str, **kwargs) -> BasePhonemizer:
34 | """Instantiate a phonemizer by name
35 |
36 | Args:
37 | name (str):
38 | Name of the phonemizer that should match `phonemizer.name()`.
39 |
40 | kwargs (dict):
41 | Extra keyword arguments that should be passed to the phonemizer.
42 | """
43 | if name == "espeak":
44 | return ESpeak(**kwargs)
45 | if name == "gruut":
46 | return Gruut(**kwargs)
47 | if name == "zh_cn_phonemizer":
48 | return ZH_CN_Phonemizer(**kwargs)
49 | if name == "ja_jp_phonemizer":
50 | return JA_JP_Phonemizer(**kwargs)
51 | if name == "ko_kr_phonemizer":
52 | return KO_KR_Phonemizer(**kwargs)
53 | raise ValueError(f"Phonemizer {name} not found")
54 |
55 |
56 | if __name__ == "__main__":
57 | print(DEF_LANG_TO_PHONEMIZER)
58 |
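A usage sketch mirroring how `MultiPhonemizer` initializes its backends; it assumes the selected backend (espeak or gruut, depending on `DEF_LANG_TO_PHONEMIZER`) is installed:

from TTS.tts.utils.text.phonemizers import DEF_LANG_TO_PHONEMIZER, get_phonemizer_by_name

lang = "en-us"
phonemizer = get_phonemizer_by_name(DEF_LANG_TO_PHONEMIZER[lang], language=lang)
print(phonemizer.phonemize("hello world", separator="|"))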
--------------------------------------------------------------------------------
/TTS/tts/utils/text/phonemizers/ja_jp_phonemizer.py:
--------------------------------------------------------------------------------
1 | from typing import Dict
2 |
3 | from TTS.tts.utils.text.japanese.phonemizer import japanese_text_to_phonemes
4 | from TTS.tts.utils.text.phonemizers.base import BasePhonemizer
5 |
6 | _DEF_JA_PUNCS = "、.,[]()?!〽~『』「」【】"
7 |
8 | _TRANS_TABLE = {"、": ","}
9 |
10 |
11 | def trans(text):
12 | for i, j in _TRANS_TABLE.items():
13 | text = text.replace(i, j)
14 | return text
15 |
16 |
17 | class JA_JP_Phonemizer(BasePhonemizer):
18 | """🐸TTS Ja-Jp phonemizer using functions in `TTS.tts.utils.text.japanese.phonemizer`
19 |
20 | TODO: someone with JA knowledge should check this implementation
21 |
22 | Example:
23 |
24 | >>> from TTS.tts.utils.text.phonemizers import JA_JP_Phonemizer
25 | >>> phonemizer = JA_JP_Phonemizer()
26 | >>> phonemizer.phonemize("どちらに行きますか?", separator="|")
27 | 'd|o|c|h|i|r|a|n|i|i|k|i|m|a|s|u|k|a|?'
28 |
29 | """
30 |
31 | language = "ja-jp"
32 |
33 | def __init__(self, punctuations=_DEF_JA_PUNCS, keep_puncs=True, **kwargs): # pylint: disable=unused-argument
34 | super().__init__(self.language, punctuations=punctuations, keep_puncs=keep_puncs)
35 |
36 | @staticmethod
37 | def name():
38 | return "ja_jp_phonemizer"
39 |
40 | def _phonemize(self, text: str, separator: str = "|") -> str:
41 | ph = japanese_text_to_phonemes(text)
42 | if separator is not None and separator != "":
43 | return separator.join(ph)
44 | return ph
45 |
46 | def phonemize(self, text: str, separator="|", language=None) -> str:
47 | """Custom phonemize for JA_JP
48 |
49 | Skip pre-post processing steps used by the other phonemizers.
50 | """
51 | return self._phonemize(text, separator)
52 |
53 | @staticmethod
54 | def supported_languages() -> Dict:
55 | return {"ja-jp": "Japanese (Japan)"}
56 |
57 | def version(self) -> str:
58 | return "0.0.1"
59 |
60 | def is_available(self) -> bool:
61 | return True
62 |
63 |
64 | # if __name__ == "__main__":
65 | # text = "これは、電話をかけるための私の日本語の例のテキストです。"
66 | # e = JA_JP_Phonemizer()
67 | # print(e.supported_languages())
68 | # print(e.version())
69 | # print(e.language)
70 | # print(e.name())
71 | # print(e.is_available())
72 | # print("`" + e.phonemize(text) + "`")
73 |
--------------------------------------------------------------------------------
/TTS/tts/utils/text/phonemizers/ko_kr_phonemizer.py:
--------------------------------------------------------------------------------
1 | from typing import Dict
2 |
3 | from TTS.tts.utils.text.korean.phonemizer import korean_text_to_phonemes
4 | from TTS.tts.utils.text.phonemizers.base import BasePhonemizer
5 |
6 | _DEF_KO_PUNCS = "、.,[]()?!〽~『』「」【】"
7 |
8 |
9 | class KO_KR_Phonemizer(BasePhonemizer):
10 | """🐸TTS ko_kr_phonemizer using functions in `TTS.tts.utils.text.korean.phonemizer`
11 |
12 | TODO: Add Korean to character (ᄀᄁᄂᄃᄄᄅᄆᄇᄈᄉᄊᄋᄌᄍᄎᄏᄐᄑ하ᅢᅣᅤᅥᅦᅧᅨᅩᅪᅫᅬᅭᅮᅯᅰᅱᅲᅳᅴᅵᆨᆩᆪᆫᆬᆭᆮᆯᆰᆱᆲᆳᆴᆵᆶᆷᆸᆹᆺᆻᆼᆽᆾᆿᇀᇁᇂ)
13 |
14 | Example:
15 |
16 | >>> from TTS.tts.utils.text.phonemizers import KO_KR_Phonemizer
17 | >>> phonemizer = KO_KR_Phonemizer()
18 | >>> phonemizer.phonemize("이 문장은 음성합성 테스트를 위한 문장입니다.", separator="|")
19 | 'ᄋ|ᅵ| |ᄆ|ᅮ|ᆫ|ᄌ|ᅡ|ᆼ|ᄋ|ᅳ| |ᄂ|ᅳ|ᆷ|ᄉ|ᅥ|ᆼ|ᄒ|ᅡ|ᆸ|ᄊ|ᅥ|ᆼ| |ᄐ|ᅦ|ᄉ|ᅳ|ᄐ|ᅳ|ᄅ|ᅳ| |ᄅ|ᅱ|ᄒ|ᅡ|ᆫ| |ᄆ|ᅮ|ᆫ|ᄌ|ᅡ|ᆼ|ᄋ|ᅵ|ᆷ|ᄂ|ᅵ|ᄃ|ᅡ|.'
20 |
21 | >>> from TTS.tts.utils.text.phonemizers import KO_KR_Phonemizer
22 | >>> phonemizer = KO_KR_Phonemizer()
23 | >>> phonemizer.phonemize("이 문장은 음성합성 테스트를 위한 문장입니다.", separator="|", character='english')
24 | 'I| |M|u|n|J|a|n|g|E|u| |N|e|u|m|S|e|o|n|g|H|a|b|S|s|e|o|n|g| |T|e|S|e|u|T|e|u|L|e|u| |L|w|i|H|a|n| |M|u|n|J|a|n|g|I|m|N|i|D|a|.'
25 |
26 | """
27 |
28 | language = "ko-kr"
29 |
30 | def __init__(self, punctuations=_DEF_KO_PUNCS, keep_puncs=True, **kwargs): # pylint: disable=unused-argument
31 | super().__init__(self.language, punctuations=punctuations, keep_puncs=keep_puncs)
32 |
33 | @staticmethod
34 | def name():
35 | return "ko_kr_phonemizer"
36 |
37 | def _phonemize(self, text: str, separator: str = "", character: str = "hangeul") -> str:
38 | ph = korean_text_to_phonemes(text, character=character)
39 | if separator is not None and separator != "":
40 | return separator.join(ph)
41 | return ph
42 |
43 | def phonemize(self, text: str, separator: str = "", character: str = "hangeul", language=None) -> str:
44 | return self._phonemize(text, separator, character)
45 |
46 | @staticmethod
47 | def supported_languages() -> Dict:
48 | return {"ko-kr": "hangeul(korean)"}
49 |
50 | def version(self) -> str:
51 | return "0.0.2"
52 |
53 | def is_available(self) -> bool:
54 | return True
55 |
56 |
57 | if __name__ == "__main__":
58 | texts = "이 문장은 음성합성 테스트를 위한 문장입니다."
59 | e = KO_KR_Phonemizer()
60 | print(e.supported_languages())
61 | print(e.version())
62 | print(e.language)
63 | print(e.name())
64 | print(e.is_available())
65 | print(e.phonemize(texts))
66 |
--------------------------------------------------------------------------------
/TTS/tts/utils/text/phonemizers/multi_phonemizer.py:
--------------------------------------------------------------------------------
1 | from typing import Dict, List
2 |
3 | from TTS.tts.utils.text.phonemizers import DEF_LANG_TO_PHONEMIZER, get_phonemizer_by_name
4 |
5 |
6 | class MultiPhonemizer:
7 | """🐸TTS multi-phonemizer that manages phonemizers for multiple languages
8 |
9 | Args:
10 | custom_lang_to_phonemizer (Dict):
11 | Custom phonemizer mapping if you want to change the defaults. In the format of
12 | `{"lang_code": "phonemizer_name"}`. When it is None, `DEF_LANG_TO_PHONEMIZER` is used. Defaults to `{}`.
13 |
14 | TODO: find a way to pass custom kwargs to the phonemizers
15 | """
16 |
17 | lang_to_phonemizer = {}
18 |
19 | def __init__(self, lang_to_phonemizer_name: Dict = {}) -> None: # pylint: disable=dangerous-default-value
20 | for k, v in lang_to_phonemizer_name.items():
21 | if v == "" and k in DEF_LANG_TO_PHONEMIZER.keys():
22 | lang_to_phonemizer_name[k] = DEF_LANG_TO_PHONEMIZER[k]
23 | elif v == "":
24 | raise ValueError(f"Phonemizer wasn't set for language {k} and doesn't have a default.")
25 | self.lang_to_phonemizer_name = lang_to_phonemizer_name
26 | self.lang_to_phonemizer = self.init_phonemizers(self.lang_to_phonemizer_name)
27 |
28 | @staticmethod
29 | def init_phonemizers(lang_to_phonemizer_name: Dict) -> Dict:
30 | lang_to_phonemizer = {}
31 | for k, v in lang_to_phonemizer_name.items():
32 | lang_to_phonemizer[k] = get_phonemizer_by_name(v, language=k)
33 | return lang_to_phonemizer
34 |
35 | @staticmethod
36 | def name():
37 | return "multi-phonemizer"
38 |
39 | def phonemize(self, text, separator="|", language=""):
40 | if language == "":
41 | raise ValueError("Language must be set for multi-phonemizer to phonemize.")
42 | return self.lang_to_phonemizer[language].phonemize(text, separator)
43 |
44 | def supported_languages(self) -> List:
45 | return list(self.lang_to_phonemizer.keys())
46 |
47 | def print_logs(self, level: int = 0):
48 | indent = "\t" * level
49 | print(f"{indent}| > phoneme language: {self.supported_languages()}")
50 | print(f"{indent}| > phoneme backend: {self.name()}")
51 |
52 |
53 | # if __name__ == "__main__":
54 | # texts = {
55 | # "tr": "Merhaba, bu Türkçe bir örnek!",
56 | # "en-us": "Hello, this is an English example!",
57 | # "de": "Hallo, das ist ein deutsches Beispiel!",
58 | # "zh-cn": "这是中国的例子",
59 | # }
60 | # phonemes = {}
61 | # ph = MultiPhonemizer({"tr": "espeak", "en-us": "", "de": "gruut", "zh-cn": ""})
62 | # for lang, text in texts.items():
63 | # phoneme = ph.phonemize(text, lang)
64 | # phonemes[lang] = phoneme
65 | # print(phonemes)
66 |
--------------------------------------------------------------------------------
/TTS/tts/utils/text/phonemizers/zh_cn_phonemizer.py:
--------------------------------------------------------------------------------
1 | from typing import Dict
2 |
3 | from TTS.tts.utils.text.chinese_mandarin.phonemizer import chinese_text_to_phonemes
4 | from TTS.tts.utils.text.phonemizers.base import BasePhonemizer
5 |
6 | _DEF_ZH_PUNCS = "、.,[]()?!〽~『』「」【】"
7 |
8 |
9 | class ZH_CN_Phonemizer(BasePhonemizer):
10 | """🐸TTS Zh-Cn phonemizer using functions in `TTS.tts.utils.text.chinese_mandarin.phonemizer`
11 |
12 | Args:
13 | punctuations (str):
14 | Set of characters to be treated as punctuation. Defaults to `_DEF_ZH_PUNCS`.
15 |
16 | keep_puncs (bool):
17 | If True, keep the punctuations after phonemization. Defaults to False.
18 |
19 | Example ::
20 |
21 | "这是,样本中文。" -> `d|ʒ|ø|4| |ʂ|ʏ|4| |,| |i|ɑ|ŋ|4|b|œ|n|3| |d|ʒ|o|ŋ|1|w|œ|n|2| |。`
22 |
23 | TODO: someone with Mandarin knowledge should check this implementation
24 | """
25 |
26 | language = "zh-cn"
27 |
28 | def __init__(self, punctuations=_DEF_ZH_PUNCS, keep_puncs=False, **kwargs): # pylint: disable=unused-argument
29 | super().__init__(self.language, punctuations=punctuations, keep_puncs=keep_puncs)
30 |
31 | @staticmethod
32 | def name():
33 | return "zh_cn_phonemizer"
34 |
35 | @staticmethod
36 | def phonemize_zh_cn(text: str, separator: str = "|") -> str:
37 | ph = chinese_text_to_phonemes(text, separator)
38 | return ph
39 |
40 | def _phonemize(self, text, separator):
41 | return self.phonemize_zh_cn(text, separator)
42 |
43 | @staticmethod
44 | def supported_languages() -> Dict:
45 | return {"zh-cn": "Chinese (China)"}
46 |
47 | def version(self) -> str:
48 | return "0.0.1"
49 |
50 | def is_available(self) -> bool:
51 | return True
52 |
53 |
54 | # if __name__ == "__main__":
55 | # text = "这是,样本中文。"
56 | # e = ZH_CN_Phonemizer()
57 | # print(e.supported_languages())
58 | # print(e.version())
59 | # print(e.language)
60 | # print(e.name())
61 | # print(e.is_available())
62 | # print("`" + e.phonemize(text) + "`")
63 |
--------------------------------------------------------------------------------
/TTS/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/everydaycodings/MimicMania/eecbaae2c430785145afe48bc92a7634ab9fe660/TTS/utils/__init__.py
--------------------------------------------------------------------------------
/TTS/utils/audio/__init__.py:
--------------------------------------------------------------------------------
1 | from TTS.utils.audio.processor import AudioProcessor
2 |
--------------------------------------------------------------------------------
/TTS/utils/callbacks.py:
--------------------------------------------------------------------------------
1 | class TrainerCallback:
2 | @staticmethod
3 | def on_init_start(trainer) -> None:
4 | if hasattr(trainer.model, "module"):
5 | if hasattr(trainer.model.module, "on_init_start"):
6 | trainer.model.module.on_init_start(trainer)
7 | else:
8 | if hasattr(trainer.model, "on_init_start"):
9 | trainer.model.on_init_start(trainer)
10 |
11 | if hasattr(trainer.criterion, "on_init_start"):
12 | trainer.criterion.on_init_start(trainer)
13 |
14 | if hasattr(trainer.optimizer, "on_init_start"):
15 | trainer.optimizer.on_init_start(trainer)
16 |
17 | @staticmethod
18 | def on_init_end(trainer) -> None:
19 | if hasattr(trainer.model, "module"):
20 | if hasattr(trainer.model.module, "on_init_end"):
21 | trainer.model.module.on_init_end(trainer)
22 | else:
23 | if hasattr(trainer.model, "on_init_end"):
24 | trainer.model.on_init_end(trainer)
25 |
26 | if hasattr(trainer.criterion, "on_init_end"):
27 | trainer.criterion.on_init_end(trainer)
28 |
29 | if hasattr(trainer.optimizer, "on_init_end"):
30 | trainer.optimizer.on_init_end(trainer)
31 |
32 | @staticmethod
33 | def on_epoch_start(trainer) -> None:
34 | if hasattr(trainer.model, "module"):
35 | if hasattr(trainer.model.module, "on_epoch_start"):
36 | trainer.model.module.on_epoch_start(trainer)
37 | else:
38 | if hasattr(trainer.model, "on_epoch_start"):
39 | trainer.model.on_epoch_start(trainer)
40 |
41 | if hasattr(trainer.criterion, "on_epoch_start"):
42 | trainer.criterion.on_epoch_start(trainer)
43 |
44 | if hasattr(trainer.optimizer, "on_epoch_start"):
45 | trainer.optimizer.on_epoch_start(trainer)
46 |
47 | @staticmethod
48 | def on_epoch_end(trainer) -> None:
49 | if hasattr(trainer.model, "module"):
50 | if hasattr(trainer.model.module, "on_epoch_end"):
51 | trainer.model.module.on_epoch_end(trainer)
52 | else:
53 | if hasattr(trainer.model, "on_epoch_end"):
54 | trainer.model.on_epoch_end(trainer)
55 |
56 | if hasattr(trainer.criterion, "on_epoch_end"):
57 | trainer.criterion.on_epoch_end(trainer)
58 |
59 | if hasattr(trainer.optimizer, "on_epoch_end"):
60 | trainer.optimizer.on_epoch_end(trainer)
61 |
62 | @staticmethod
63 | def on_train_step_start(trainer) -> None:
64 | if hasattr(trainer.model, "module"):
65 | if hasattr(trainer.model.module, "on_train_step_start"):
66 | trainer.model.module.on_train_step_start(trainer)
67 | else:
68 | if hasattr(trainer.model, "on_train_step_start"):
69 | trainer.model.on_train_step_start(trainer)
70 |
71 | if hasattr(trainer.criterion, "on_train_step_start"):
72 | trainer.criterion.on_train_step_start(trainer)
73 |
74 | if hasattr(trainer.optimizer, "on_train_step_start"):
75 | trainer.optimizer.on_train_step_start(trainer)
76 |
77 | @staticmethod
78 | def on_train_step_end(trainer) -> None:
79 | if hasattr(trainer.model, "module"):
80 | if hasattr(trainer.model.module, "on_train_step_end"):
81 | trainer.model.module.on_train_step_end(trainer)
82 | else:
83 | if hasattr(trainer.model, "on_train_step_end"):
84 | trainer.model.on_train_step_end(trainer)
85 |
86 | if hasattr(trainer.criterion, "on_train_step_end"):
87 | trainer.criterion.on_train_step_end(trainer)
88 |
89 | if hasattr(trainer.optimizer, "on_train_step_end"):
90 | trainer.optimizer.on_train_step_end(trainer)
91 |
92 | @staticmethod
93 | def on_keyboard_interrupt(trainer) -> None:
94 | if hasattr(trainer.model, "module"):
95 | if hasattr(trainer.model.module, "on_keyboard_interrupt"):
96 | trainer.model.module.on_keyboard_interrupt(trainer)
97 | else:
98 | if hasattr(trainer.model, "on_keyboard_interrupt"):
99 | trainer.model.on_keyboard_interrupt(trainer)
100 |
101 | if hasattr(trainer.criterion, "on_keyboard_interrupt"):
102 | trainer.criterion.on_keyboard_interrupt(trainer)
103 |
104 | if hasattr(trainer.optimizer, "on_keyboard_interrupt"):
105 | trainer.optimizer.on_keyboard_interrupt(trainer)
106 |
--------------------------------------------------------------------------------
/TTS/utils/capacitron_optimizer.py:
--------------------------------------------------------------------------------
1 | from typing import Generator
2 |
3 | from trainer.trainer_utils import get_optimizer
4 |
5 |
6 | class CapacitronOptimizer:
7 | """Double optimizer class for the Capacitron model."""
8 |
9 | def __init__(self, config: dict, model_params: Generator) -> None:
10 | self.primary_params, self.secondary_params = self.split_model_parameters(model_params)
11 |
12 | optimizer_names = list(config.optimizer_params.keys())
13 | optimizer_parameters = list(config.optimizer_params.values())
14 |
15 | self.primary_optimizer = get_optimizer(
16 | optimizer_names[0],
17 | optimizer_parameters[0],
18 | config.lr,
19 | parameters=self.primary_params,
20 | )
21 |
22 | self.secondary_optimizer = get_optimizer(
23 | optimizer_names[1],
24 | self.extract_optimizer_parameters(optimizer_parameters[1]),
25 | optimizer_parameters[1]["lr"],
26 | parameters=self.secondary_params,
27 | )
28 |
29 | self.param_groups = self.primary_optimizer.param_groups
30 |
31 | def first_step(self):
32 | self.secondary_optimizer.step()
33 | self.secondary_optimizer.zero_grad()
34 | self.primary_optimizer.zero_grad()
35 |
36 | def step(self):
37 | # Update param groups to display the correct learning rate
38 | self.param_groups = self.primary_optimizer.param_groups
39 | self.primary_optimizer.step()
40 |
41 | def zero_grad(self, set_to_none=False):
42 | self.primary_optimizer.zero_grad(set_to_none)
43 | self.secondary_optimizer.zero_grad(set_to_none)
44 |
45 | def load_state_dict(self, state_dict):
46 | self.primary_optimizer.load_state_dict(state_dict[0])
47 | self.secondary_optimizer.load_state_dict(state_dict[1])
48 |
49 | def state_dict(self):
50 | return [self.primary_optimizer.state_dict(), self.secondary_optimizer.state_dict()]
51 |
52 | @staticmethod
53 | def split_model_parameters(model_params: Generator) -> list:
54 | primary_params = []
55 | secondary_params = []
56 | for name, param in model_params:
57 | if param.requires_grad:
58 | if name == "capacitron_vae_layer.beta":
59 | secondary_params.append(param)
60 | else:
61 | primary_params.append(param)
62 | return [iter(primary_params), iter(secondary_params)]
63 |
64 | @staticmethod
65 | def extract_optimizer_parameters(params: dict) -> dict:
66 | """Extract parameters that are not the learning rate"""
67 | return {k: v for k, v in params.items() if k != "lr"}
68 |
--------------------------------------------------------------------------------
/TTS/utils/distribute.py:
--------------------------------------------------------------------------------
1 | # edited from https://github.com/fastai/imagenet-fast/blob/master/imagenet_nv/distributed.py
2 | import torch
3 | import torch.distributed as dist
4 |
5 |
6 | def reduce_tensor(tensor, num_gpus):
7 | rt = tensor.clone()
8 | dist.all_reduce(rt, op=dist.ReduceOp.SUM)
9 | rt /= num_gpus
10 | return rt
11 |
12 |
13 | def init_distributed(rank, num_gpus, group_name, dist_backend, dist_url):
14 | assert torch.cuda.is_available(), "Distributed mode requires CUDA."
15 |
16 | # Set cuda device so everything is done on the right GPU.
17 | torch.cuda.set_device(rank % torch.cuda.device_count())
18 |
19 | # Initialize distributed communication
20 | dist.init_process_group(dist_backend, init_method=dist_url, world_size=num_gpus, rank=rank, group_name=group_name)
21 |
--------------------------------------------------------------------------------
/TTS/utils/training.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 |
4 |
5 | def check_update(model, grad_clip, ignore_stopnet=False, amp_opt_params=None):
6 | r"""Check model gradient against unexpected jumps and failures"""
7 | skip_flag = False
8 | if ignore_stopnet:
9 | if not amp_opt_params:
10 | grad_norm = torch.nn.utils.clip_grad_norm_(
11 | [param for name, param in model.named_parameters() if "stopnet" not in name], grad_clip
12 | )
13 | else:
14 | grad_norm = torch.nn.utils.clip_grad_norm_(amp_opt_params, grad_clip)
15 | else:
16 | if not amp_opt_params:
17 | grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
18 | else:
19 | grad_norm = torch.nn.utils.clip_grad_norm_(amp_opt_params, grad_clip)
20 |
21 | # compatibility with different torch versions
22 | if isinstance(grad_norm, float):
23 | if np.isinf(grad_norm):
24 | print(" | > Gradient is INF !!")
25 | skip_flag = True
26 | else:
27 | if torch.isinf(grad_norm):
28 | print(" | > Gradient is INF !!")
29 | skip_flag = True
30 | return grad_norm, skip_flag
31 |
32 |
33 | def gradual_training_scheduler(global_step, config):
34 | """Setup the gradual training schedule wrt number
35 | of active GPUs"""
36 | num_gpus = torch.cuda.device_count()
37 | if num_gpus == 0:
38 | num_gpus = 1
39 | new_values = None
40 | # we set the scheduling wrt num_gpus
41 | for values in config.gradual_training:
42 | if global_step * num_gpus >= values[0]:
43 | new_values = values
44 | return new_values[1], new_values[2]
45 |
--------------------------------------------------------------------------------
/TTS/utils/vad.py:
--------------------------------------------------------------------------------
1 | import soundfile as sf
2 | import torch
3 | import torchaudio
4 |
5 |
6 | def read_audio(path):
7 | wav, sr = torchaudio.load(path)
8 |
9 | if wav.size(0) > 1:
10 | wav = wav.mean(dim=0, keepdim=True)
11 |
12 | return wav.squeeze(0), sr
13 |
14 |
15 | def resample_wav(wav, sr, new_sr):
16 | wav = wav.unsqueeze(0)
17 | transform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=new_sr)
18 | wav = transform(wav)
19 | return wav.squeeze(0)
20 |
21 |
22 | def map_timestamps_to_new_sr(vad_sr, new_sr, timestamps, just_begging_end=False):
23 | factor = new_sr / vad_sr
24 | new_timestamps = []
25 | if just_begging_end and timestamps:
26 | # get just the start and end timestamps
27 | new_dict = {"start": int(timestamps[0]["start"] * factor), "end": int(timestamps[-1]["end"] * factor)}
28 | new_timestamps.append(new_dict)
29 | else:
30 | for ts in timestamps:
31 | # map to the new SR
32 | new_dict = {"start": int(ts["start"] * factor), "end": int(ts["end"] * factor)}
33 | new_timestamps.append(new_dict)
34 |
35 | return new_timestamps
36 |
37 |
38 | def get_vad_model_and_utils(use_cuda=False):
39 | model, utils = torch.hub.load(repo_or_dir="snakers4/silero-vad", model="silero_vad", force_reload=True, onnx=False)
40 | if use_cuda:
41 | model = model.cuda()
42 |
43 | get_speech_timestamps, save_audio, _, _, collect_chunks = utils
44 | return model, get_speech_timestamps, save_audio, collect_chunks
45 |
46 |
47 | def remove_silence(
48 | model_and_utils, audio_path, out_path, vad_sample_rate=8000, trim_just_beginning_and_end=True, use_cuda=False
49 | ):
50 | # get the VAD model and utils functions
51 | model, get_speech_timestamps, _, collect_chunks = model_and_utils
52 |
53 | # read ground truth wav and resample the audio for the VAD
54 | wav, gt_sample_rate = read_audio(audio_path)
55 |
56 | # if needed, resample the audio for the VAD model
57 | if gt_sample_rate != vad_sample_rate:
58 | wav_vad = resample_wav(wav, gt_sample_rate, vad_sample_rate)
59 | else:
60 | wav_vad = wav
61 |
62 | if use_cuda:
63 | wav_vad = wav_vad.cuda()
64 |
65 | # get speech timestamps from full audio file
66 | speech_timestamps = get_speech_timestamps(wav_vad, model, sampling_rate=vad_sample_rate, window_size_samples=768)
67 |
68 | # map the current speech_timestamps to the sample rate of the ground truth audio
69 | new_speech_timestamps = map_timestamps_to_new_sr(
70 | vad_sample_rate, gt_sample_rate, speech_timestamps, trim_just_beginning_and_end
71 | )
72 |
73 | # if there are speech timestamps, trim the wav; otherwise keep it as-is
74 | if new_speech_timestamps:
75 | wav = collect_chunks(new_speech_timestamps, wav)
76 | is_speech = True
77 | else:
78 | print(f"> The file {audio_path} probably does not contain speech, please check it!")
79 | is_speech = False
80 |
81 | # save audio
82 | sf.write(out_path, wav, gt_sample_rate, subtype="PCM_16")
83 | return out_path, is_speech
84 |
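An end-to-end usage sketch with hypothetical file paths; the first call downloads the Silero VAD model from torch.hub:

from TTS.utils.vad import get_vad_model_and_utils, remove_silence

model_and_utils = get_vad_model_and_utils(use_cuda=False)
trimmed_path, has_speech = remove_silence(
    model_and_utils,
    audio_path="sample_in.wav",       # hypothetical input file
    out_path="sample_trimmed.wav",    # hypothetical output file
    trim_just_beginning_and_end=True,
)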
--------------------------------------------------------------------------------
/TTS/vocoder/README.md:
--------------------------------------------------------------------------------
1 | # Mozilla TTS Vocoders (Experimental)
2 |
3 | Here there are vocoder model implementations which can be combined with the other TTS models.
4 |
5 | Currently, the following models are implemented:
6 |
7 | - Melgan
8 | - MultiBand-Melgan
9 | - ParallelWaveGAN
10 | - GAN-TTS (Discriminator Only)
11 |
12 | It is also very easy to adapt different vocoder models as we provide a flexible and modular (but not too modular) framework.
13 |
14 | ## Training a model
15 |
16 | You can see here an example [Colab Notebook]() (coming soon) training MelGAN with the LJSpeech dataset.
17 |
18 | In order to train a new model, you need to gather all wav files into a folder and point `data_path` in `config.json` to this folder.
19 |
20 | You need to define the other relevant parameters in your `config.json` and then start training with the following command.
21 |
22 | ```CUDA_VISIBLE_DEVICES='0' python tts/bin/train_vocoder.py --config_path path/to/config.json```
23 |
24 | Example config files can be found under the `tts/vocoder/configs/` folder.
25 |
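A minimal sketch of such a `config.json` (field names taken from the vocoder config classes; the full set of fields depends on the model you train):

```json
{
    "model": "wavegrad",
    "data_path": "/path/to/wavs/",
    "batch_size": 96,
    "epochs": 10000,
    "mixed_precision": true
}
```
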
26 | You can continue a previous training run by the following command.
27 |
28 | ```CUDA_VISIBLE_DEVICES='0' python tts/bin/train_vocoder.py --continue_path path/to/your/model/folder```
29 |
30 | You can fine-tune a pre-trained model by the following command.
31 |
32 | ```CUDA_VISIBLE_DEVICES='0' python tts/bin/train_vocoder.py --restore_path path/to/your/model.pth```
33 |
34 | Restoring a model starts a new training run in a different folder; it only loads the model weights from the given checkpoint file. Continuing a training run, on the other hand, picks up in the same directory where the previous run left off.
35 |
36 | You can also follow your training runs on Tensorboard as you do with our TTS models.
37 |
38 | ## Acknowledgement
39 | Thanks to @kan-bayashi for his [repository](https://github.com/kan-bayashi/ParallelWaveGAN) being the start point of our work.
40 |
--------------------------------------------------------------------------------
/TTS/vocoder/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/everydaycodings/MimicMania/eecbaae2c430785145afe48bc92a7634ab9fe660/TTS/vocoder/__init__.py
--------------------------------------------------------------------------------
/TTS/vocoder/configs/__init__.py:
--------------------------------------------------------------------------------
1 | import importlib
2 | import os
3 | from inspect import isclass
4 |
5 | # import all files under configs/
6 | configs_dir = os.path.dirname(__file__)
7 | for file in os.listdir(configs_dir):
8 | path = os.path.join(configs_dir, file)
9 | if not file.startswith("_") and not file.startswith(".") and (file.endswith(".py") or os.path.isdir(path)):
10 | config_name = file[: file.find(".py")] if file.endswith(".py") else file
11 | module = importlib.import_module("TTS.vocoder.configs." + config_name)
12 | for attribute_name in dir(module):
13 | attribute = getattr(module, attribute_name)
14 |
15 | if isclass(attribute):
16 | # Add the class to this package's variables
17 | globals()[attribute_name] = attribute
18 |
--------------------------------------------------------------------------------
/TTS/vocoder/configs/wavegrad_config.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass, field
2 |
3 | from TTS.vocoder.configs.shared_configs import BaseVocoderConfig
4 | from TTS.vocoder.models.wavegrad import WavegradArgs
5 |
6 |
7 | @dataclass
8 | class WavegradConfig(BaseVocoderConfig):
9 | """Defines parameters for WaveGrad vocoder.
10 | Example:
11 |
12 | >>> from TTS.vocoder.configs import WavegradConfig
13 | >>> config = WavegradConfig()
14 |
15 | Args:
16 | model (str):
17 | Model name used for selecting the right model at initialization. Defaults to `wavegrad`.
18 | generator_model (str): One of the generators from `TTS.vocoder.models.*`. Every other non-GAN vocoder model is
19 | considered a generator too. Defaults to `wavegrad`.
20 | model_params (WavegradArgs): Model parameters. Check `WavegradArgs` for default values.
21 | target_loss (str):
22 | Target loss name that defines the quality of the model. Defaults to `avg_wavegrad_loss`.
23 | epochs (int):
24 | Number of epochs to train the model. Defaults to 10000.
25 | batch_size (int):
26 | Batch size used at training. Larger values use more memory. Defaults to 96.
27 | seq_len (int):
28 | Audio segment length used at training. Larger values use more memory. Defaults to 6144.
29 | use_cache (bool):
30 | enable / disable in memory caching of the computed features. It can cause OOM error if the system RAM is
31 | not large enough. Defaults to True.
32 | mixed_precision (bool):
33 | enable / disable mixed precision training. Default is True.
34 | eval_split_size (int):
35 | Number of samples used for evaluation. Defaults to 50.
36 | train_noise_schedule (dict):
37 | Training noise schedule. Defaults to
38 | `{"min_val": 1e-6, "max_val": 1e-2, "num_steps": 1000}`
39 | test_noise_schedule (dict):
40 | Inference noise schedule. For a better performance, you may need to use `bin/tune_wavegrad.py` to find a
41 | better schedule. Defaults to
42 | `
43 | {
44 | "min_val": 1e-6,
45 | "max_val": 1e-2,
46 | "num_steps": 50,
47 | }
48 | `
49 | grad_clip (float):
50 | Gradient clipping threshold. If <= 0.0, no clipping is applied. Defaults to 1.0
51 | lr (float):
52 | Initial learning rate. Defaults to 1e-4.
53 | lr_scheduler (str):
54 | One of the learning rate schedulers from `torch.optim.scheduler.*`. Defaults to `MultiStepLR`.
55 | lr_scheduler_params (dict):
56 | kwargs for the scheduler. Defaults to `{"gamma": 0.5, "milestones": [100000, 200000, 300000, 400000, 500000, 600000]}`
57 | """
58 |
59 | model: str = "wavegrad"
60 | # Model specific params
61 | generator_model: str = "wavegrad"
62 | model_params: WavegradArgs = field(default_factory=WavegradArgs)
63 | target_loss: str = "loss" # loss value to pick the best model to save after each epoch
64 |
65 | # Training - overrides
66 | epochs: int = 10000
67 | batch_size: int = 96
68 | seq_len: int = 6144
69 | use_cache: bool = True
70 | mixed_precision: bool = True
71 | eval_split_size: int = 50
72 |
73 | # NOISE SCHEDULE PARAMS
74 | train_noise_schedule: dict = field(default_factory=lambda: {"min_val": 1e-6, "max_val": 1e-2, "num_steps": 1000})
75 |
76 | test_noise_schedule: dict = field(
77 | default_factory=lambda: { # inference noise schedule. Try TTS/bin/tune_wavegrad.py to find the optimal values.
78 | "min_val": 1e-6,
79 | "max_val": 1e-2,
80 | "num_steps": 50,
81 | }
82 | )
83 |
84 | # optimizer overrides
85 | grad_clip: float = 1.0
86 | lr: float = 1e-4 # Initial learning rate.
87 | lr_scheduler: str = "MultiStepLR" # one of the schedulers from https://pytorch.org/docs/stable/optim.html
88 | lr_scheduler_params: dict = field(
89 | default_factory=lambda: {"gamma": 0.5, "milestones": [100000, 200000, 300000, 400000, 500000, 600000]}
90 | )
91 |
--------------------------------------------------------------------------------
/TTS/vocoder/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 |
3 | from coqpit import Coqpit
4 | from torch.utils.data import Dataset
5 |
6 | from TTS.utils.audio import AudioProcessor
7 | from TTS.vocoder.datasets.gan_dataset import GANDataset
8 | from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data
9 | from TTS.vocoder.datasets.wavegrad_dataset import WaveGradDataset
10 | from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset
11 |
12 |
13 | def setup_dataset(config: Coqpit, ap: AudioProcessor, is_eval: bool, data_items: List, verbose: bool) -> Dataset:
14 | if config.model.lower() == "gan":
15 | dataset = GANDataset(
16 | ap=ap,
17 | items=data_items,
18 | seq_len=config.seq_len,
19 | hop_len=ap.hop_length,
20 | pad_short=config.pad_short,
21 | conv_pad=config.conv_pad,
22 | return_pairs=config.diff_samples_for_G_and_D if "diff_samples_for_G_and_D" in config else False,
23 | is_training=not is_eval,
24 | return_segments=not is_eval,
25 | use_noise_augment=config.use_noise_augment,
26 | use_cache=config.use_cache,
27 | verbose=verbose,
28 | )
29 | dataset.shuffle_mapping()
30 | elif config.model.lower() == "wavegrad":
31 | dataset = WaveGradDataset(
32 | ap=ap,
33 | items=data_items,
34 | seq_len=config.seq_len,
35 | hop_len=ap.hop_length,
36 | pad_short=config.pad_short,
37 | conv_pad=config.conv_pad,
38 | is_training=not is_eval,
39 | return_segments=True,
40 | use_noise_augment=False,
41 | use_cache=config.use_cache,
42 | verbose=verbose,
43 | )
44 | elif config.model.lower() == "wavernn":
45 | dataset = WaveRNNDataset(
46 | ap=ap,
47 | items=data_items,
48 | seq_len=config.seq_len,
49 | hop_len=ap.hop_length,
50 | pad=config.model_params.pad,
51 | mode=config.model_params.mode,
52 | mulaw=config.model_params.mulaw,
53 | is_training=not is_eval,
54 | verbose=verbose,
55 | )
56 | else:
57 | raise ValueError(f" [!] Dataset for model {config.model.lower()} cannot be found.")
58 | return dataset
59 |
--------------------------------------------------------------------------------
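Note: `setup_dataset` above is a small factory that picks the dataset class from `config.model`. A hedged usage sketch follows, assuming (as in upstream Coqui TTS) that `WavegradConfig` carries the `data_path`, `eval_split_size`, `pad_short`, `conv_pad`, and `audio` fields this function reads; the data path is a placeholder.

    # Sketch: build a training dataset for a WaveGrad config.
    from TTS.utils.audio import AudioProcessor
    from TTS.vocoder.configs.wavegrad_config import WavegradConfig
    from TTS.vocoder.datasets import setup_dataset
    from TTS.vocoder.datasets.preprocess import load_wav_data

    config = WavegradConfig(data_path="/path/to/wavs", eval_split_size=50)
    ap = AudioProcessor(**config.audio.to_dict())
    eval_items, train_items = load_wav_data(config.data_path, config.eval_split_size)
    train_set = setup_dataset(config, ap, is_eval=False, data_items=train_items, verbose=True)
    print(len(train_set))
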
/TTS/vocoder/datasets/preprocess.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import os
3 | from pathlib import Path
4 |
5 | import numpy as np
6 | from coqpit import Coqpit
7 | from tqdm import tqdm
8 |
9 | from TTS.utils.audio import AudioProcessor
10 |
11 |
12 | def preprocess_wav_files(out_path: str, config: Coqpit, ap: AudioProcessor):
13 | """Process wav and compute mel and quantized wave signal.
14 | It is mainly used by WaveRNN dataloader.
15 |
16 | Args:
17 | out_path (str): Parent folder path to save the files.
18 | config (Coqpit): Model config.
19 | ap (AudioProcessor): Audio processor.
20 | """
21 | os.makedirs(os.path.join(out_path, "quant"), exist_ok=True)
22 | os.makedirs(os.path.join(out_path, "mel"), exist_ok=True)
23 | wav_files = find_wav_files(config.data_path)
24 | for path in tqdm(wav_files):
25 | wav_name = Path(path).stem
26 | quant_path = os.path.join(out_path, "quant", wav_name + ".npy")
27 | mel_path = os.path.join(out_path, "mel", wav_name + ".npy")
28 | y = ap.load_wav(path)
29 | mel = ap.melspectrogram(y)
30 | np.save(mel_path, mel)
31 | if isinstance(config.mode, int):
32 | quant = ap.mulaw_encode(y, qc=config.mode) if config.model_args.mulaw else ap.quantize(y, bits=config.mode)
33 | np.save(quant_path, quant)
34 |
35 |
36 | def find_wav_files(data_path, file_ext="wav"):
37 | wav_paths = glob.glob(os.path.join(data_path, "**", f"*.{file_ext}"), recursive=True)
38 | return wav_paths
39 |
40 |
41 | def find_feat_files(data_path):
42 | feat_paths = glob.glob(os.path.join(data_path, "**", "*.npy"), recursive=True)
43 | return feat_paths
44 |
45 |
46 | def load_wav_data(data_path, eval_split_size, file_ext="wav"):
47 | wav_paths = find_wav_files(data_path, file_ext=file_ext)
48 | assert len(wav_paths) > 0, f" [!] {data_path} is empty."
49 | np.random.seed(0)
50 | np.random.shuffle(wav_paths)
51 | return wav_paths[:eval_split_size], wav_paths[eval_split_size:]
52 |
53 |
54 | def load_wav_feat_data(data_path, feat_path, eval_split_size):
55 | wav_paths = find_wav_files(data_path)
56 | feat_paths = find_feat_files(feat_path)
57 |
58 | wav_paths.sort(key=lambda x: Path(x).stem)
59 | feat_paths.sort(key=lambda x: Path(x).stem)
60 |
61 | assert len(wav_paths) == len(feat_paths), f" [!] {len(wav_paths)} vs {len(feat_paths)}"
62 | for wav, feat in zip(wav_paths, feat_paths):
63 | wav_name = Path(wav).stem
64 | feat_name = Path(feat).stem
65 | assert wav_name == feat_name
66 |
67 | items = list(zip(wav_paths, feat_paths))
68 | np.random.seed(0)
69 | np.random.shuffle(items)
70 | return items[:eval_split_size], items[eval_split_size:]
71 |
--------------------------------------------------------------------------------
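Note: both split helpers above shuffle with a fixed seed (`np.random.seed(0)`) and return `(eval, train)` in that order. A short usage sketch with placeholder paths:

    # Sketch: reproducible eval/train splits from raw wavs or precomputed features.
    from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data

    eval_wavs, train_wavs = load_wav_data("/data/wavs", eval_split_size=50)
    print(f"{len(train_wavs)} training / {len(eval_wavs)} eval wav files")

    # with precomputed features, items become (wav_path, feat_path) pairs
    eval_items, train_items = load_wav_feat_data("/data/wavs", "/data/feats", eval_split_size=50)
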
/TTS/vocoder/datasets/wavernn_dataset.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | from torch.utils.data import Dataset
4 |
5 |
6 | class WaveRNNDataset(Dataset):
7 | """
8 | WaveRNN Dataset searches for all the wav files under the root path
9 | and converts them to acoustic features on the fly.
10 | """
11 |
12 | def __init__(
13 | self, ap, items, seq_len, hop_len, pad, mode, mulaw, is_training=True, verbose=False, return_segments=True
14 | ):
15 | super().__init__()
16 | self.ap = ap
17 | self.compute_feat = not isinstance(items[0], (tuple, list))
18 | self.item_list = items
19 | self.seq_len = seq_len
20 | self.hop_len = hop_len
21 | self.mel_len = seq_len // hop_len
22 | self.pad = pad
23 | self.mode = mode
24 | self.mulaw = mulaw
25 | self.is_training = is_training
26 | self.verbose = verbose
27 | self.return_segments = return_segments
28 |
29 | assert self.seq_len % self.hop_len == 0
30 |
31 | def __len__(self):
32 | return len(self.item_list)
33 |
34 | def __getitem__(self, index):
35 | item = self.load_item(index)
36 | return item
37 |
38 | def load_test_samples(self, num_samples):
39 | samples = []
40 | return_segments = self.return_segments
41 | self.return_segments = False
42 | for idx in range(num_samples):
43 | mel, audio, _ = self.load_item(idx)
44 | samples.append([mel, audio])
45 | self.return_segments = return_segments
46 | return samples
47 |
48 | def load_item(self, index):
49 | """
50 | Load an (audio, feat) pair if a feature path is given,
51 | otherwise compute the features on the fly.
52 | """
53 | if self.compute_feat:
54 | wavpath = self.item_list[index]
55 | audio = self.ap.load_wav(wavpath)
56 | if self.return_segments:
57 | min_audio_len = 2 * self.seq_len + (2 * self.pad * self.hop_len)
58 | else:
59 | min_audio_len = audio.shape[0] + (2 * self.pad * self.hop_len)
60 | if audio.shape[0] < min_audio_len:
61 | print(" [!] Instance is too short! : {}".format(wavpath))
62 | audio = np.pad(audio, [0, min_audio_len - audio.shape[0] + self.hop_len])
63 | mel = self.ap.melspectrogram(audio)
64 |
65 | if self.mode in ["gauss", "mold"]:
66 | x_input = audio
67 | elif isinstance(self.mode, int):
68 | x_input = (
69 | self.ap.mulaw_encode(audio, qc=self.mode) if self.mulaw else self.ap.quantize(audio, bits=self.mode)
70 | )
71 | else:
72 | raise RuntimeError("Unknown dataset mode - ", self.mode)
73 |
74 | else:
75 | wavpath, feat_path = self.item_list[index]
76 | mel = np.load(feat_path.replace("/quant/", "/mel/"))
77 |
78 | if mel.shape[-1] < self.mel_len + 2 * self.pad:
79 | print(" [!] Instance is too short! : {}".format(wavpath))
80 | self.item_list[index] = self.item_list[index + 1]
81 | feat_path = self.item_list[index]
82 | mel = np.load(feat_path.replace("/quant/", "/mel/"))
83 | if self.mode in ["gauss", "mold"]:
84 | x_input = self.ap.load_wav(wavpath)
85 | elif isinstance(self.mode, int):
86 | x_input = np.load(feat_path.replace("/mel/", "/quant/"))
87 | else:
88 | raise RuntimeError("Unknown dataset mode - ", self.mode)
89 |
90 | return mel, x_input, wavpath
91 |
92 | def collate(self, batch):
93 | mel_win = self.seq_len // self.hop_len + 2 * self.pad
94 | max_offsets = [x[0].shape[-1] - (mel_win + 2 * self.pad) for x in batch]
95 |
96 | mel_offsets = [np.random.randint(0, offset) for offset in max_offsets]
97 | sig_offsets = [(offset + self.pad) * self.hop_len for offset in mel_offsets]
98 |
99 | mels = [x[0][:, mel_offsets[i] : mel_offsets[i] + mel_win] for i, x in enumerate(batch)]
100 |
101 | coarse = [x[1][sig_offsets[i] : sig_offsets[i] + self.seq_len + 1] for i, x in enumerate(batch)]
102 |
103 | mels = np.stack(mels).astype(np.float32)
104 | if self.mode in ["gauss", "mold"]:
105 | coarse = np.stack(coarse).astype(np.float32)
106 | coarse = torch.FloatTensor(coarse)
107 | x_input = coarse[:, : self.seq_len]
108 | elif isinstance(self.mode, int):
109 | coarse = np.stack(coarse).astype(np.int64)
110 | coarse = torch.LongTensor(coarse)
111 | x_input = 2 * coarse[:, : self.seq_len].float() / (2**self.mode - 1.0) - 1.0
112 | y_coarse = coarse[:, 1:]
113 | mels = torch.FloatTensor(mels)
114 | return x_input, mels, y_coarse
115 |
--------------------------------------------------------------------------------
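Note: `WaveRNNDataset.collate` does the random mel/audio windowing, so it must be passed to the `DataLoader` explicitly. A hedged sketch with illustrative hyper-parameters (`mode=10` means 10-bit quantized output; the data path is a placeholder):

    # Sketch: batch a WaveRNNDataset with its own collate function.
    from torch.utils.data import DataLoader

    from TTS.utils.audio import AudioProcessor
    from TTS.vocoder.datasets.preprocess import load_wav_data
    from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset

    ap = AudioProcessor(sample_rate=22050, fft_size=1024, hop_length=256, win_length=1024, num_mels=80)
    _, train_wavs = load_wav_data("/data/wavs", eval_split_size=50)
    dataset = WaveRNNDataset(ap=ap, items=train_wavs, seq_len=1280, hop_len=256, pad=2, mode=10, mulaw=True)
    loader = DataLoader(dataset, batch_size=32, collate_fn=dataset.collate, shuffle=True)
    x_input, mels, y_coarse = next(iter(loader))  # (B, seq_len), (B, num_mels, mel_win), (B, seq_len)
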
/TTS/vocoder/layers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/everydaycodings/MimicMania/eecbaae2c430785145afe48bc92a7634ab9fe660/TTS/vocoder/layers/__init__.py
--------------------------------------------------------------------------------
/TTS/vocoder/layers/hifigan.py:
--------------------------------------------------------------------------------
1 | from torch import nn
2 |
3 |
4 | # pylint: disable=dangerous-default-value
5 | class ResStack(nn.Module):
6 | def __init__(self, kernel, channel, padding, dilations=[1, 3, 5]):
7 | super().__init__()
8 | resstack = []
9 | for dilation in dilations:
10 | resstack += [
11 | nn.LeakyReLU(0.2),
12 | nn.ReflectionPad1d(dilation),
13 | nn.utils.weight_norm(nn.Conv1d(channel, channel, kernel_size=kernel, dilation=dilation)),
14 | nn.LeakyReLU(0.2),
15 | nn.ReflectionPad1d(padding),
16 | nn.utils.weight_norm(nn.Conv1d(channel, channel, kernel_size=1)),
17 | ]
18 | self.resstack = nn.Sequential(*resstack)
19 |
20 | self.shortcut = nn.utils.weight_norm(nn.Conv1d(channel, channel, kernel_size=1))
21 |
22 | def forward(self, x):
23 | x1 = self.shortcut(x)
24 | x2 = self.resstack(x)
25 | return x1 + x2
26 |
27 | def remove_weight_norm(self):
28 | nn.utils.remove_weight_norm(self.shortcut)
29 | nn.utils.remove_weight_norm(self.resstack[2])
30 | nn.utils.remove_weight_norm(self.resstack[5])
31 | nn.utils.remove_weight_norm(self.resstack[8])
32 | nn.utils.remove_weight_norm(self.resstack[11])
33 | nn.utils.remove_weight_norm(self.resstack[14])
34 | nn.utils.remove_weight_norm(self.resstack[17])
35 |
36 |
37 | class MRF(nn.Module):
38 | def __init__(self, kernels, channel, dilations=[1, 3, 5]): # # pylint: disable=dangerous-default-value
39 | super().__init__()
40 | self.resblock1 = ResStack(kernels[0], channel, 0, dilations)
41 | self.resblock2 = ResStack(kernels[1], channel, 6, dilations)
42 | self.resblock3 = ResStack(kernels[2], channel, 12, dilations)
43 |
44 | def forward(self, x):
45 | x1 = self.resblock1(x)
46 | x2 = self.resblock2(x)
47 | x3 = self.resblock3(x)
48 | return x1 + x2 + x3
49 |
50 | def remove_weight_norm(self):
51 | self.resblock1.remove_weight_norm()
52 | self.resblock2.remove_weight_norm()
53 | self.resblock3.remove_weight_norm()
54 |
--------------------------------------------------------------------------------
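Note: the `MRF` (multi-receptive-field) block above sums three `ResStack`s with different kernel sizes over the same input, so the time and channel dimensions are preserved. A quick shape-check sketch with arbitrary sizes:

    # Sketch: run a random feature map through the MRF block.
    import torch

    from TTS.vocoder.layers.hifigan import MRF

    mrf = MRF(kernels=(3, 7, 11), channel=64)
    x = torch.randn(1, 64, 100)   # (batch, channels, time)
    y = mrf(x)
    print(y.shape)                # torch.Size([1, 64, 100])
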
/TTS/vocoder/layers/melgan.py:
--------------------------------------------------------------------------------
1 | from torch import nn
2 | from torch.nn.utils import weight_norm
3 |
4 |
5 | class ResidualStack(nn.Module):
6 | def __init__(self, channels, num_res_blocks, kernel_size):
7 | super().__init__()
8 |
9 | assert (kernel_size - 1) % 2 == 0, " [!] kernel_size has to be odd."
10 | base_padding = (kernel_size - 1) // 2
11 |
12 | self.blocks = nn.ModuleList()
13 | for idx in range(num_res_blocks):
14 | layer_kernel_size = kernel_size
15 | layer_dilation = layer_kernel_size**idx
16 | layer_padding = base_padding * layer_dilation
17 | self.blocks += [
18 | nn.Sequential(
19 | nn.LeakyReLU(0.2),
20 | nn.ReflectionPad1d(layer_padding),
21 | weight_norm(
22 | nn.Conv1d(channels, channels, kernel_size=kernel_size, dilation=layer_dilation, bias=True)
23 | ),
24 | nn.LeakyReLU(0.2),
25 | weight_norm(nn.Conv1d(channels, channels, kernel_size=1, bias=True)),
26 | )
27 | ]
28 |
29 | self.shortcuts = nn.ModuleList(
30 | [weight_norm(nn.Conv1d(channels, channels, kernel_size=1, bias=True)) for i in range(num_res_blocks)]
31 | )
32 |
33 | def forward(self, x):
34 | for block, shortcut in zip(self.blocks, self.shortcuts):
35 | x = shortcut(x) + block(x)
36 | return x
37 |
38 | def remove_weight_norm(self):
39 | for block, shortcut in zip(self.blocks, self.shortcuts):
40 | nn.utils.remove_weight_norm(block[2])
41 | nn.utils.remove_weight_norm(block[4])
42 | nn.utils.remove_weight_norm(shortcut)
43 |
--------------------------------------------------------------------------------
/TTS/vocoder/layers/parallel_wavegan.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.nn import functional as F
3 |
4 |
5 | class ResidualBlock(torch.nn.Module):
6 | """Residual block module in WaveNet."""
7 |
8 | def __init__(
9 | self,
10 | kernel_size=3,
11 | res_channels=64,
12 | gate_channels=128,
13 | skip_channels=64,
14 | aux_channels=80,
15 | dropout=0.0,
16 | dilation=1,
17 | bias=True,
18 | use_causal_conv=False,
19 | ):
20 | super().__init__()
21 | self.dropout = dropout
22 | # no future time stamps available
23 | if use_causal_conv:
24 | padding = (kernel_size - 1) * dilation
25 | else:
26 | assert (kernel_size - 1) % 2 == 0, "Even kernel sizes are not supported."
27 | padding = (kernel_size - 1) // 2 * dilation
28 | self.use_causal_conv = use_causal_conv
29 |
30 | # dilation conv
31 | self.conv = torch.nn.Conv1d(
32 | res_channels, gate_channels, kernel_size, padding=padding, dilation=dilation, bias=bias
33 | )
34 |
35 | # local conditioning
36 | if aux_channels > 0:
37 | self.conv1x1_aux = torch.nn.Conv1d(aux_channels, gate_channels, 1, bias=False)
38 | else:
39 | self.conv1x1_aux = None
40 |
41 | # conv output is split into two groups
42 | gate_out_channels = gate_channels // 2
43 | self.conv1x1_out = torch.nn.Conv1d(gate_out_channels, res_channels, 1, bias=bias)
44 | self.conv1x1_skip = torch.nn.Conv1d(gate_out_channels, skip_channels, 1, bias=bias)
45 |
46 | def forward(self, x, c):
47 | """
48 | x: B x D_res x T
49 | c: B x D_aux x T
50 | """
51 | residual = x
52 | x = F.dropout(x, p=self.dropout, training=self.training)
53 | x = self.conv(x)
54 |
55 | # remove future time steps if using causal convolutions
56 | x = x[:, :, : residual.size(-1)] if self.use_causal_conv else x
57 |
58 | # split into two parts for the gated activation
59 | splitdim = 1
60 | xa, xb = x.split(x.size(splitdim) // 2, dim=splitdim)
61 |
62 | # local conditioning
63 | if c is not None:
64 | assert self.conv1x1_aux is not None
65 | c = self.conv1x1_aux(c)
66 | ca, cb = c.split(c.size(splitdim) // 2, dim=splitdim)
67 | xa, xb = xa + ca, xb + cb
68 |
69 | x = torch.tanh(xa) * torch.sigmoid(xb)
70 |
71 | # for skip connection
72 | s = self.conv1x1_skip(x)
73 |
74 | # for residual connection
75 | x = (self.conv1x1_out(x) + residual) * (0.5**2)
76 |
77 | return x, s
78 |
--------------------------------------------------------------------------------
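Note: a shape sketch for the gated residual block above. `x` is the residual stream and `c` the local-conditioning (e.g. mel) features on the same time axis; the block returns the updated residual and a skip contribution. Sizes are illustrative.

    # Sketch: one WaveNet-style residual block with local conditioning.
    import torch

    from TTS.vocoder.layers.parallel_wavegan import ResidualBlock

    block = ResidualBlock(res_channels=64, gate_channels=128, skip_channels=64, aux_channels=80)
    x = torch.randn(2, 64, 200)     # (B, res_channels, T)
    c = torch.randn(2, 80, 200)     # (B, aux_channels, T)
    x_out, skip = block(x, c)
    print(x_out.shape, skip.shape)  # (2, 64, 200) (2, 64, 200)
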
/TTS/vocoder/layers/pqmf.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | import torch.nn.functional as F
4 | from scipy import signal as sig
5 |
6 |
7 | # adapted from
8 | # https://github.com/kan-bayashi/ParallelWaveGAN/tree/master/parallel_wavegan
9 | class PQMF(torch.nn.Module):
10 | def __init__(self, N=4, taps=62, cutoff=0.15, beta=9.0):
11 | super().__init__()
12 |
13 | self.N = N
14 | self.taps = taps
15 | self.cutoff = cutoff
16 | self.beta = beta
17 |
18 | QMF = sig.firwin(taps + 1, cutoff, window=("kaiser", beta))
19 | H = np.zeros((N, len(QMF)))
20 | G = np.zeros((N, len(QMF)))
21 | for k in range(N):
22 | constant_factor = (
23 | (2 * k + 1) * (np.pi / (2 * N)) * (np.arange(taps + 1) - ((taps - 1) / 2))
24 | ) # TODO: (taps - 1) -> taps
25 | phase = (-1) ** k * np.pi / 4
26 | H[k] = 2 * QMF * np.cos(constant_factor + phase)
27 |
28 | G[k] = 2 * QMF * np.cos(constant_factor - phase)
29 |
30 | H = torch.from_numpy(H[:, None, :]).float()
31 | G = torch.from_numpy(G[None, :, :]).float()
32 |
33 | self.register_buffer("H", H)
34 | self.register_buffer("G", G)
35 |
36 | updown_filter = torch.zeros((N, N, N)).float()
37 | for k in range(N):
38 | updown_filter[k, k, 0] = 1.0
39 | self.register_buffer("updown_filter", updown_filter)
40 | self.N = N
41 |
42 | self.pad_fn = torch.nn.ConstantPad1d(taps // 2, 0.0)
43 |
44 | def forward(self, x):
45 | return self.analysis(x)
46 |
47 | def analysis(self, x):
48 | return F.conv1d(x, self.H, padding=self.taps // 2, stride=self.N)
49 |
50 | def synthesis(self, x):
51 | x = F.conv_transpose1d(x, self.updown_filter * self.N, stride=self.N)
52 | x = F.conv1d(x, self.G, padding=self.taps // 2)
53 | return x
54 |
--------------------------------------------------------------------------------
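Note: a round-trip sketch for the PQMF bank above. `analysis` splits a waveform into `N` sub-bands at `1/N` of the rate and `synthesis` recombines them; reconstruction is near-perfect rather than exact. Values are the defaults, the input is random.

    # Sketch: PQMF analysis/synthesis round trip.
    import torch

    from TTS.vocoder.layers.pqmf import PQMF

    pqmf = PQMF(N=4, taps=62, cutoff=0.15, beta=9.0)
    wav = torch.randn(1, 1, 4000)   # (B, 1, T)
    bands = pqmf.analysis(wav)      # (B, 4, T // 4)
    recon = pqmf.synthesis(bands)   # (B, 1, T)
    print(bands.shape, recon.shape)
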
/TTS/vocoder/layers/upsample.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.nn import functional as F
3 |
4 |
5 | class Stretch2d(torch.nn.Module):
6 | def __init__(self, x_scale, y_scale, mode="nearest"):
7 | super().__init__()
8 | self.x_scale = x_scale
9 | self.y_scale = y_scale
10 | self.mode = mode
11 |
12 | def forward(self, x):
13 | """
14 | x (Tensor): Input tensor (B, C, F, T).
15 | Tensor: Interpolated tensor (B, C, F * y_scale, T * x_scale),
16 | """
17 | return F.interpolate(x, scale_factor=(self.y_scale, self.x_scale), mode=self.mode)
18 |
19 |
20 | class UpsampleNetwork(torch.nn.Module):
21 | # pylint: disable=dangerous-default-value
22 | def __init__(
23 | self,
24 | upsample_factors,
25 | nonlinear_activation=None,
26 | nonlinear_activation_params={},
27 | interpolate_mode="nearest",
28 | freq_axis_kernel_size=1,
29 | use_causal_conv=False,
30 | ):
31 | super().__init__()
32 | self.use_causal_conv = use_causal_conv
33 | self.up_layers = torch.nn.ModuleList()
34 | for scale in upsample_factors:
35 | # interpolation layer
36 | stretch = Stretch2d(scale, 1, interpolate_mode)
37 | self.up_layers += [stretch]
38 |
39 | # conv layer
40 | assert (freq_axis_kernel_size - 1) % 2 == 0, "Even freq axis kernel sizes are not supported."
41 | freq_axis_padding = (freq_axis_kernel_size - 1) // 2
42 | kernel_size = (freq_axis_kernel_size, scale * 2 + 1)
43 | if use_causal_conv:
44 | padding = (freq_axis_padding, scale * 2)
45 | else:
46 | padding = (freq_axis_padding, scale)
47 | conv = torch.nn.Conv2d(1, 1, kernel_size=kernel_size, padding=padding, bias=False)
48 | self.up_layers += [conv]
49 |
50 | # nonlinear
51 | if nonlinear_activation is not None:
52 | nonlinear = getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params)
53 | self.up_layers += [nonlinear]
54 |
55 | def forward(self, c):
56 | """
57 | c : (B, C, T_in).
58 | Tensor: (B, C, T_upsample)
59 | """
60 | c = c.unsqueeze(1) # (B, 1, C, T)
61 | for f in self.up_layers:
62 | c = f(c)
63 | return c.squeeze(1) # (B, C, T')
64 |
65 |
66 | class ConvUpsample(torch.nn.Module):
67 | # pylint: disable=dangerous-default-value
68 | def __init__(
69 | self,
70 | upsample_factors,
71 | nonlinear_activation=None,
72 | nonlinear_activation_params={},
73 | interpolate_mode="nearest",
74 | freq_axis_kernel_size=1,
75 | aux_channels=80,
76 | aux_context_window=0,
77 | use_causal_conv=False,
78 | ):
79 | super().__init__()
80 | self.aux_context_window = aux_context_window
81 | self.use_causal_conv = use_causal_conv and aux_context_window > 0
82 | # To capture wide-context information in conditional features
83 | kernel_size = aux_context_window + 1 if use_causal_conv else 2 * aux_context_window + 1
84 | # NOTE(kan-bayashi): Here do not use padding because the input is already padded
85 | self.conv_in = torch.nn.Conv1d(aux_channels, aux_channels, kernel_size=kernel_size, bias=False)
86 | self.upsample = UpsampleNetwork(
87 | upsample_factors=upsample_factors,
88 | nonlinear_activation=nonlinear_activation,
89 | nonlinear_activation_params=nonlinear_activation_params,
90 | interpolate_mode=interpolate_mode,
91 | freq_axis_kernel_size=freq_axis_kernel_size,
92 | use_causal_conv=use_causal_conv,
93 | )
94 |
95 | def forward(self, c):
96 | """
97 | c : (B, C, T_in).
98 | Tensor: (B, C, T_upsampled),
99 | """
100 | c_ = self.conv_in(c)
101 | c = c_[:, :, : -self.aux_context_window] if self.use_causal_conv else c_
102 | return self.upsample(c)
103 |
--------------------------------------------------------------------------------
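Note: `ConvUpsample` stretches conditioning features along time by the product of `upsample_factors`, which is normally chosen to equal the hop length. A hedged sketch (4 * 4 * 4 * 4 = 256 here; the numbers are illustrative):

    # Sketch: upsample frame-rate features to sample rate.
    import torch

    from TTS.vocoder.layers.upsample import ConvUpsample

    upsampler = ConvUpsample(upsample_factors=[4, 4, 4, 4], aux_channels=80, aux_context_window=0)
    c = torch.randn(1, 80, 50)   # (B, aux_channels, T_frames)
    c_up = upsampler(c)
    print(c_up.shape)            # (1, 80, 50 * 256)
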
/TTS/vocoder/models/base_vocoder.py:
--------------------------------------------------------------------------------
1 | from coqpit import Coqpit
2 |
3 | from TTS.model import BaseTrainerModel
4 |
5 | # pylint: skip-file
6 |
7 |
8 | class BaseVocoder(BaseTrainerModel):
9 | """Base `vocoder` class. Every new `vocoder` model must inherit this.
10 |
11 | It defines `vocoder` specific functions on top of `Model`.
12 |
13 | Notes on input/output tensor shapes:
14 | Any input or output tensor of the model must be shaped as
15 |
16 | - 3D tensors `batch x time x channels`
17 | - 2D tensors `batch x channels`
18 | - 1D tensors `batch x 1`
19 | """
20 |
21 | def __init__(self, config):
22 | super().__init__()
23 | self._set_model_args(config)
24 |
25 | def _set_model_args(self, config: Coqpit):
26 | """Setup model args based on the config type.
27 |
28 | If the config is for training, with a name like "*Config", then the model args are embedded in
29 | `config.model_args`.
30 |
31 | If the config is for the model itself, with a name like "*Args", then it is assigned directly.
32 | """
33 | # don't use isinstance to avoid recursive imports
34 | if "Config" in config.__class__.__name__:
35 | if "characters" in config:
36 | _, self.config, num_chars = self.get_characters(config)
37 | self.config.num_chars = num_chars
38 | if hasattr(self.config, "model_args"):
39 | config.model_args.num_chars = num_chars
40 | if "model_args" in config:
41 | self.args = self.config.model_args
42 | # This is for backward compatibility
43 | if "model_params" in config:
44 | self.args = self.config.model_params
45 | else:
46 | self.config = config
47 | if "model_args" in config:
48 | self.args = self.config.model_args
49 | # This is for backward compatibility
50 | if "model_params" in config:
51 | self.args = self.config.model_params
52 | else:
53 | raise ValueError("config must be either a *Config or *Args")
54 |
--------------------------------------------------------------------------------
/TTS/vocoder/models/fullband_melgan_generator.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | from TTS.vocoder.models.melgan_generator import MelganGenerator
4 |
5 |
6 | class FullbandMelganGenerator(MelganGenerator):
7 | def __init__(
8 | self,
9 | in_channels=80,
10 | out_channels=1,
11 | proj_kernel=7,
12 | base_channels=512,
13 | upsample_factors=(2, 8, 2, 2),
14 | res_kernel=3,
15 | num_res_blocks=4,
16 | ):
17 | super().__init__(
18 | in_channels=in_channels,
19 | out_channels=out_channels,
20 | proj_kernel=proj_kernel,
21 | base_channels=base_channels,
22 | upsample_factors=upsample_factors,
23 | res_kernel=res_kernel,
24 | num_res_blocks=num_res_blocks,
25 | )
26 |
27 | @torch.no_grad()
28 | def inference(self, cond_features):
29 | cond_features = cond_features.to(self.layers[1].weight.device)
30 | cond_features = torch.nn.functional.pad(
31 | cond_features, (self.inference_padding, self.inference_padding), "replicate"
32 | )
33 | return self.layers(cond_features)
34 |
--------------------------------------------------------------------------------
/TTS/vocoder/models/melgan_discriminator.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from torch import nn
3 | from torch.nn.utils import weight_norm
4 |
5 |
6 | class MelganDiscriminator(nn.Module):
7 | def __init__(
8 | self,
9 | in_channels=1,
10 | out_channels=1,
11 | kernel_sizes=(5, 3),
12 | base_channels=16,
13 | max_channels=1024,
14 | downsample_factors=(4, 4, 4, 4),
15 | groups_denominator=4,
16 | ):
17 | super().__init__()
18 | self.layers = nn.ModuleList()
19 |
20 | layer_kernel_size = np.prod(kernel_sizes)
21 | layer_padding = (layer_kernel_size - 1) // 2
22 |
23 | # initial layer
24 | self.layers += [
25 | nn.Sequential(
26 | nn.ReflectionPad1d(layer_padding),
27 | weight_norm(nn.Conv1d(in_channels, base_channels, layer_kernel_size, stride=1)),
28 | nn.LeakyReLU(0.2, inplace=True),
29 | )
30 | ]
31 |
32 | # downsampling layers
33 | layer_in_channels = base_channels
34 | for downsample_factor in downsample_factors:
35 | layer_out_channels = min(layer_in_channels * downsample_factor, max_channels)
36 | layer_kernel_size = downsample_factor * 10 + 1
37 | layer_padding = (layer_kernel_size - 1) // 2
38 | layer_groups = layer_in_channels // groups_denominator
39 | self.layers += [
40 | nn.Sequential(
41 | weight_norm(
42 | nn.Conv1d(
43 | layer_in_channels,
44 | layer_out_channels,
45 | kernel_size=layer_kernel_size,
46 | stride=downsample_factor,
47 | padding=layer_padding,
48 | groups=layer_groups,
49 | )
50 | ),
51 | nn.LeakyReLU(0.2, inplace=True),
52 | )
53 | ]
54 | layer_in_channels = layer_out_channels
55 |
56 | # last 2 layers
57 | layer_padding1 = (kernel_sizes[0] - 1) // 2
58 | layer_padding2 = (kernel_sizes[1] - 1) // 2
59 | self.layers += [
60 | nn.Sequential(
61 | weight_norm(
62 | nn.Conv1d(
63 | layer_out_channels,
64 | layer_out_channels,
65 | kernel_size=kernel_sizes[0],
66 | stride=1,
67 | padding=layer_padding1,
68 | )
69 | ),
70 | nn.LeakyReLU(0.2, inplace=True),
71 | ),
72 | weight_norm(
73 | nn.Conv1d(
74 | layer_out_channels, out_channels, kernel_size=kernel_sizes[1], stride=1, padding=layer_padding2
75 | )
76 | ),
77 | ]
78 |
79 | def forward(self, x):
80 | feats = []
81 | for layer in self.layers:
82 | x = layer(x)
83 | feats.append(x)
84 | return x, feats
85 |
--------------------------------------------------------------------------------
/TTS/vocoder/models/melgan_generator.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 | from torch.nn.utils import weight_norm
4 |
5 | from TTS.utils.io import load_fsspec
6 | from TTS.vocoder.layers.melgan import ResidualStack
7 |
8 |
9 | class MelganGenerator(nn.Module):
10 | def __init__(
11 | self,
12 | in_channels=80,
13 | out_channels=1,
14 | proj_kernel=7,
15 | base_channels=512,
16 | upsample_factors=(8, 8, 2, 2),
17 | res_kernel=3,
18 | num_res_blocks=3,
19 | ):
20 | super().__init__()
21 |
22 | # assert model parameters
23 | assert (proj_kernel - 1) % 2 == 0, " [!] proj_kernel should be an odd number."
24 |
25 | # setup additional model parameters
26 | base_padding = (proj_kernel - 1) // 2
27 | act_slope = 0.2
28 | self.inference_padding = 2
29 |
30 | # initial layer
31 | layers = []
32 | layers += [
33 | nn.ReflectionPad1d(base_padding),
34 | weight_norm(nn.Conv1d(in_channels, base_channels, kernel_size=proj_kernel, stride=1, bias=True)),
35 | ]
36 |
37 | # upsampling layers and residual stacks
38 | for idx, upsample_factor in enumerate(upsample_factors):
39 | layer_in_channels = base_channels // (2**idx)
40 | layer_out_channels = base_channels // (2 ** (idx + 1))
41 | layer_filter_size = upsample_factor * 2
42 | layer_stride = upsample_factor
43 | layer_output_padding = upsample_factor % 2
44 | layer_padding = upsample_factor // 2 + layer_output_padding
45 | layers += [
46 | nn.LeakyReLU(act_slope),
47 | weight_norm(
48 | nn.ConvTranspose1d(
49 | layer_in_channels,
50 | layer_out_channels,
51 | layer_filter_size,
52 | stride=layer_stride,
53 | padding=layer_padding,
54 | output_padding=layer_output_padding,
55 | bias=True,
56 | )
57 | ),
58 | ResidualStack(channels=layer_out_channels, num_res_blocks=num_res_blocks, kernel_size=res_kernel),
59 | ]
60 |
61 | layers += [nn.LeakyReLU(act_slope)]
62 |
63 | # final layer
64 | layers += [
65 | nn.ReflectionPad1d(base_padding),
66 | weight_norm(nn.Conv1d(layer_out_channels, out_channels, proj_kernel, stride=1, bias=True)),
67 | nn.Tanh(),
68 | ]
69 | self.layers = nn.Sequential(*layers)
70 |
71 | def forward(self, c):
72 | return self.layers(c)
73 |
74 | def inference(self, c):
75 | c = c.to(self.layers[1].weight.device)
76 | c = torch.nn.functional.pad(c, (self.inference_padding, self.inference_padding), "replicate")
77 | return self.layers(c)
78 |
79 | def remove_weight_norm(self):
80 | for _, layer in enumerate(self.layers):
81 | if len(layer.state_dict()) != 0:
82 | try:
83 | nn.utils.remove_weight_norm(layer)
84 | except ValueError:
85 | layer.remove_weight_norm()
86 |
87 | def load_checkpoint(
88 | self, config, checkpoint_path, eval=False, cache=False
89 | ): # pylint: disable=unused-argument, redefined-builtin
90 | state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
91 | self.load_state_dict(state["model"])
92 | if eval:
93 | self.eval()
94 | assert not self.training
95 | self.remove_weight_norm()
96 |
--------------------------------------------------------------------------------
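Note: an end-to-end shape sketch for the generator above: an 80-band mel spectrogram at frame rate goes in, a waveform upsampled by the product of `upsample_factors` (8 * 8 * 2 * 2 = 256 with the defaults) comes out. Input sizes are arbitrary.

    # Sketch: mel spectrogram in, waveform out.
    import torch

    from TTS.vocoder.models.melgan_generator import MelganGenerator

    model = MelganGenerator(in_channels=80, upsample_factors=(8, 8, 2, 2))
    mel = torch.randn(1, 80, 120)   # (B, n_mels, T_frames)
    wav = model(mel)
    print(wav.shape)                # (1, 1, 120 * 256)
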
/TTS/vocoder/models/melgan_multiscale_discriminator.py:
--------------------------------------------------------------------------------
1 | from torch import nn
2 |
3 | from TTS.vocoder.models.melgan_discriminator import MelganDiscriminator
4 |
5 |
6 | class MelganMultiscaleDiscriminator(nn.Module):
7 | def __init__(
8 | self,
9 | in_channels=1,
10 | out_channels=1,
11 | num_scales=3,
12 | kernel_sizes=(5, 3),
13 | base_channels=16,
14 | max_channels=1024,
15 | downsample_factors=(4, 4, 4),
16 | pooling_kernel_size=4,
17 | pooling_stride=2,
18 | pooling_padding=2,
19 | groups_denominator=4,
20 | ):
21 | super().__init__()
22 |
23 | self.discriminators = nn.ModuleList(
24 | [
25 | MelganDiscriminator(
26 | in_channels=in_channels,
27 | out_channels=out_channels,
28 | kernel_sizes=kernel_sizes,
29 | base_channels=base_channels,
30 | max_channels=max_channels,
31 | downsample_factors=downsample_factors,
32 | groups_denominator=groups_denominator,
33 | )
34 | for _ in range(num_scales)
35 | ]
36 | )
37 |
38 | self.pooling = nn.AvgPool1d(
39 | kernel_size=pooling_kernel_size, stride=pooling_stride, padding=pooling_padding, count_include_pad=False
40 | )
41 |
42 | def forward(self, x):
43 | scores = []
44 | feats = []
45 | for disc in self.discriminators:
46 | score, feat = disc(x)
47 | scores.append(score)
48 | feats.append(feat)
49 | x = self.pooling(x)
50 | return scores, feats
51 |
--------------------------------------------------------------------------------
/TTS/vocoder/models/multiband_melgan_generator.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | from TTS.vocoder.layers.pqmf import PQMF
4 | from TTS.vocoder.models.melgan_generator import MelganGenerator
5 |
6 |
7 | class MultibandMelganGenerator(MelganGenerator):
8 | def __init__(
9 | self,
10 | in_channels=80,
11 | out_channels=4,
12 | proj_kernel=7,
13 | base_channels=384,
14 | upsample_factors=(2, 8, 2, 2),
15 | res_kernel=3,
16 | num_res_blocks=3,
17 | ):
18 | super().__init__(
19 | in_channels=in_channels,
20 | out_channels=out_channels,
21 | proj_kernel=proj_kernel,
22 | base_channels=base_channels,
23 | upsample_factors=upsample_factors,
24 | res_kernel=res_kernel,
25 | num_res_blocks=num_res_blocks,
26 | )
27 | self.pqmf_layer = PQMF(N=4, taps=62, cutoff=0.15, beta=9.0)
28 |
29 | def pqmf_analysis(self, x):
30 | return self.pqmf_layer.analysis(x)
31 |
32 | def pqmf_synthesis(self, x):
33 | return self.pqmf_layer.synthesis(x)
34 |
35 | @torch.no_grad()
36 | def inference(self, cond_features):
37 | cond_features = cond_features.to(self.layers[1].weight.device)
38 | cond_features = torch.nn.functional.pad(
39 | cond_features, (self.inference_padding, self.inference_padding), "replicate"
40 | )
41 | return self.pqmf_synthesis(self.layers(cond_features))
42 |
--------------------------------------------------------------------------------
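Note: the multiband variant predicts 4 PQMF sub-band signals (total upsampling 2 * 8 * 2 * 2 = 64 per band), and `pqmf_synthesis` recombines them into the full-band waveform. A hedged shape sketch with the default hyper-parameters:

    # Sketch: sub-band prediction followed by PQMF synthesis.
    import torch

    from TTS.vocoder.models.multiband_melgan_generator import MultibandMelganGenerator

    model = MultibandMelganGenerator()
    mel = torch.randn(1, 80, 50)        # (B, n_mels, T_frames)
    bands = model(mel)                  # (1, 4, 50 * 64) sub-band signals
    wav = model.pqmf_synthesis(bands)   # (1, 1, 50 * 256) full-band waveform
    print(bands.shape, wav.shape)
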
/TTS/vocoder/models/univnet_discriminator.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 | from torch import nn
4 | from torch.nn.utils import spectral_norm, weight_norm
5 |
6 | from TTS.utils.audio.torch_transforms import TorchSTFT
7 | from TTS.vocoder.models.hifigan_discriminator import MultiPeriodDiscriminator
8 |
9 | LRELU_SLOPE = 0.1
10 |
11 |
12 | class SpecDiscriminator(nn.Module):
13 | """docstring for Discriminator."""
14 |
15 | def __init__(self, fft_size=1024, hop_length=120, win_length=600, use_spectral_norm=False):
16 | super().__init__()
17 | norm_f = weight_norm if use_spectral_norm is False else spectral_norm
18 | self.fft_size = fft_size
19 | self.hop_length = hop_length
20 | self.win_length = win_length
21 | self.stft = TorchSTFT(fft_size, hop_length, win_length)
22 | self.discriminators = nn.ModuleList(
23 | [
24 | norm_f(nn.Conv2d(1, 32, kernel_size=(3, 9), padding=(1, 4))),
25 | norm_f(nn.Conv2d(32, 32, kernel_size=(3, 9), stride=(1, 2), padding=(1, 4))),
26 | norm_f(nn.Conv2d(32, 32, kernel_size=(3, 9), stride=(1, 2), padding=(1, 4))),
27 | norm_f(nn.Conv2d(32, 32, kernel_size=(3, 9), stride=(1, 2), padding=(1, 4))),
28 | norm_f(nn.Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))),
29 | ]
30 | )
31 |
32 | self.out = norm_f(nn.Conv2d(32, 1, 3, 1, 1))
33 |
34 | def forward(self, y):
35 | fmap = []
36 | with torch.no_grad():
37 | y = y.squeeze(1)
38 | y = self.stft(y)
39 | y = y.unsqueeze(1)
40 | for _, d in enumerate(self.discriminators):
41 | y = d(y)
42 | y = F.leaky_relu(y, LRELU_SLOPE)
43 | fmap.append(y)
44 |
45 | y = self.out(y)
46 | fmap.append(y)
47 |
48 | return torch.flatten(y, 1, -1), fmap
49 |
50 |
51 | class MultiResSpecDiscriminator(torch.nn.Module):
52 | def __init__( # pylint: disable=dangerous-default-value
53 | self, fft_sizes=[1024, 2048, 512], hop_sizes=[120, 240, 50], win_lengths=[600, 1200, 240], window="hann_window"
54 | ):
55 | super().__init__()
56 | self.discriminators = nn.ModuleList(
57 | [
58 | SpecDiscriminator(fft_sizes[0], hop_sizes[0], win_lengths[0], window),
59 | SpecDiscriminator(fft_sizes[1], hop_sizes[1], win_lengths[1], window),
60 | SpecDiscriminator(fft_sizes[2], hop_sizes[2], win_lengths[2], window),
61 | ]
62 | )
63 |
64 | def forward(self, x):
65 | scores = []
66 | feats = []
67 | for d in self.discriminators:
68 | score, feat = d(x)
69 | scores.append(score)
70 | feats.append(feat)
71 |
72 | return scores, feats
73 |
74 |
75 | class UnivnetDiscriminator(nn.Module):
76 | """Univnet discriminator wrapping MPD and MSD."""
77 |
78 | def __init__(self):
79 | super().__init__()
80 | self.mpd = MultiPeriodDiscriminator()
81 | self.msd = MultiResSpecDiscriminator()
82 |
83 | def forward(self, x):
84 | """
85 | Args:
86 | x (Tensor): input waveform.
87 |
88 | Returns:
89 | List[Tensor]: discriminator scores.
90 | List[List[Tensor]]: lists of features from each layer of each discriminator.
91 | """
92 | scores, feats = self.mpd(x)
93 | scores_, feats_ = self.msd(x)
94 | return scores + scores_, feats + feats_
95 |
--------------------------------------------------------------------------------
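Note: a hedged shape-check sketch for the combined discriminator above; it simply concatenates the scores and feature maps of the multi-period and multi-resolution spectrogram branches. Batch size and waveform length are arbitrary.

    # Sketch: score a batch of raw waveforms.
    import torch

    from TTS.vocoder.models.univnet_discriminator import UnivnetDiscriminator

    disc = UnivnetDiscriminator()
    wav = torch.randn(2, 1, 16384)   # (B, 1, T)
    scores, feats = disc(wav)
    print(len(scores), len(feats))   # one entry per sub-discriminator
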
/TTS/vocoder/pqmf_output.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/everydaycodings/MimicMania/eecbaae2c430785145afe48bc92a7634ab9fe660/TTS/vocoder/pqmf_output.wav
--------------------------------------------------------------------------------
/TTS/vocoder/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/everydaycodings/MimicMania/eecbaae2c430785145afe48bc92a7634ab9fe660/TTS/vocoder/utils/__init__.py
--------------------------------------------------------------------------------
/TTS/vocoder/utils/generic_utils.py:
--------------------------------------------------------------------------------
1 | from typing import Dict
2 |
3 | import numpy as np
4 | import torch
5 | from matplotlib import pyplot as plt
6 |
7 | from TTS.tts.utils.visual import plot_spectrogram
8 | from TTS.utils.audio import AudioProcessor
9 |
10 |
11 | def interpolate_vocoder_input(scale_factor, spec):
12 | """Interpolate spectrogram by the scale factor.
13 | It is mainly used to match the sampling rates of
14 | the tts and vocoder models.
15 |
16 | Args:
17 | scale_factor (float): scale factor to interpolate the spectrogram
18 | spec (np.array): spectrogram to be interpolated
19 |
20 | Returns:
21 | torch.tensor: interpolated spectrogram.
22 | """
23 | print(" > before interpolation :", spec.shape)
24 | spec = torch.tensor(spec).unsqueeze(0).unsqueeze(0) # pylint: disable=not-callable
25 | spec = torch.nn.functional.interpolate(
26 | spec, scale_factor=scale_factor, recompute_scale_factor=True, mode="bilinear", align_corners=False
27 | ).squeeze(0)
28 | print(" > after interpolation :", spec.shape)
29 | return spec
30 |
31 |
32 | def plot_results(y_hat: torch.tensor, y: torch.tensor, ap: AudioProcessor, name_prefix: str = None) -> Dict:
33 | """Plot the predicted and the real waveform and their spectrograms.
34 |
35 | Args:
36 | y_hat (torch.tensor): Predicted waveform.
37 | y (torch.tensor): Real waveform.
38 | ap (AudioProcessor): Audio processor used to process the waveform.
39 | name_prefix (str, optional): Name prefix used to name the figures. Defaults to None.
40 |
41 | Returns:
42 | Dict: output figures keyed by the name of the figures.
43 | """ """Plot vocoder model results"""
44 | if name_prefix is None:
45 | name_prefix = ""
46 |
47 | # select an instance from batch
48 | y_hat = y_hat[0].squeeze().detach().cpu().numpy()
49 | y = y[0].squeeze().detach().cpu().numpy()
50 |
51 | spec_fake = ap.melspectrogram(y_hat).T
52 | spec_real = ap.melspectrogram(y).T
53 | spec_diff = np.abs(spec_fake - spec_real)
54 |
55 | # plot figure and save it
56 | fig_wave = plt.figure()
57 | plt.subplot(2, 1, 1)
58 | plt.plot(y)
59 | plt.title("groundtruth speech")
60 | plt.subplot(2, 1, 2)
61 | plt.plot(y_hat)
62 | plt.title("generated speech")
63 | plt.tight_layout()
64 | plt.close()
65 |
66 | figures = {
67 | name_prefix + "spectrogram/fake": plot_spectrogram(spec_fake),
68 | name_prefix + "spectrogram/real": plot_spectrogram(spec_real),
69 | name_prefix + "spectrogram/diff": plot_spectrogram(spec_diff),
70 | name_prefix + "speech_comparison": fig_wave,
71 | }
72 | return figures
73 |
--------------------------------------------------------------------------------
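Note: `interpolate_vocoder_input` is meant for the case where the TTS model and the vocoder were trained at different sampling rates; the mel frames are stretched along time by the rate ratio before vocoding. A sketch with assumed example rates:

    # Sketch: stretch a 16 kHz-model spectrogram for a 22.05 kHz vocoder.
    import numpy as np

    from TTS.vocoder.utils.generic_utils import interpolate_vocoder_input

    scale_factor = [1.0, 22050 / 16000]                 # keep mel bins, stretch time
    spec = np.random.rand(80, 120).astype(np.float32)   # (n_mels, T_frames)
    spec_interp = interpolate_vocoder_input(scale_factor, spec)
    print(spec_interp.shape)                            # torch.Size([1, 80, 165])
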
/app.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | from helpers import ProcessModelList, ConvertTextToSpeech, AudioClonning, download_audio_file
3 | import io
4 |
5 |
6 | st.sidebar.title("Welcome to :blue[MimicMania]")
7 | sidebar_options = ["Text To Speech", "Voice Cloning"]
8 | choice = st.sidebar.selectbox(label="Select Your Usecase: ", options=sidebar_options)
9 |
10 |
11 | if choice == sidebar_options[0]:
12 | st.title("Convert your Text to Speech")
13 | model_list = ProcessModelList()
14 |
15 | selected_language = st.selectbox("Select The Language: ", options=model_list.get_langauge_labels())
16 | selected_model = st.selectbox("Select The Model: ", options=model_list.get_model_name(selected_language=selected_language))
17 | selected_model_path = model_list.get_model_path(selected_language=selected_language, selected_model=selected_model)
18 | multi_speaker, multi_speaker_list = model_list.get_multi_speaker_model(model_path=selected_model_path)
19 |
20 | if selected_language == "Multi Language":
21 | speakers, languages = model_list.multi_language_selected(model_path=selected_model_path)
22 | selected_speaker = st.selectbox("Select the voice: ", options=speakers)
23 | selected_speaker_language = st.selectbox("Select the Language: ", options=languages)
24 |
25 |
26 | elif multi_speaker:
27 | selected_speaker = st.selectbox("Select the voice: ", options=multi_speaker_list)
28 |
29 | else:
30 | selected_speaker = None
31 | selected_speaker_language = None
32 |
33 | text = st.text_area("Enter the text you want to convert to audio.")
34 |
35 | if st.button("Convert"):
36 |
37 | text_to_speech = ConvertTextToSpeech(model_name=selected_model, model_path=selected_model_path, text=text)
38 |
39 | if selected_language == "Multi Language" or selected_language == "Popular Person":
40 | text_to_speech.convert_text_to_speech_multi_langauge(speaker=selected_speaker, language=selected_speaker_language, model_name=selected_model, selected_langauge=selected_language)
41 | else:
42 | text_to_speech.convert_text_to_speech(speaker_id=selected_speaker)
43 |
44 | read_audio = text_to_speech.read_audio_file()
45 | st.audio(read_audio, format='audio/wav')
46 | download_audio_file(audio=read_audio, file_name="TTS")
47 |
48 |
49 |
50 | elif choice == sidebar_options[1]:
51 | st.title("Clone Anyone's Voice")
52 | st.subheader("The better the quality and duration of the audio sample, the more realistic the cloned voice will be.")
53 | uploaded_music = st.file_uploader(label="Upload Your Audio File: ", type=["mp3", "wav"])
54 | text = st.text_area(label="Enter The text you want to convert: ")
55 | emotion = "Neutral"#st.selectbox(label="Select What will the voice emotion: ", options=["Neutral", "Happy", "Sad", "Angry", "Surprise", "Dull"])
56 |
57 | if st.button("Start Cloning"):
58 | if uploaded_music is not None:
59 |
60 | audio_filename = uploaded_music.name
61 | audio = io.BytesIO(uploaded_music.read())
62 |
63 | audio_clonning = AudioClonning(audio=audio, audio_filename=audio_filename, text=text, emotion=emotion)
64 |
65 | cloned_voice = audio_clonning.convert_text_to_speech()
66 | #cloned_voice = audio_clonning.emotion_modification()
67 |
68 | st.audio(cloned_voice, format="audio/wav")
69 | download_audio_file(audio=cloned_voice, file_name="Voice-Cloned")
70 |
--------------------------------------------------------------------------------
/clonner_output/sample.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/everydaycodings/MimicMania/eecbaae2c430785145afe48bc92a7634ab9fe660/clonner_output/sample.txt
--------------------------------------------------------------------------------
/language_model/sample.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/everydaycodings/MimicMania/eecbaae2c430785145afe48bc92a7634ab9fe660/language_model/sample.txt
--------------------------------------------------------------------------------
/model_list.json:
--------------------------------------------------------------------------------
1 | {
2 | "labels": ["Multi Language", "American English", "United Kingdom", "Japanese", "Italian"],
3 | "custom_voice": ["Chales Hoskinson"],
4 | "data": {
5 | "Multi Language": [
6 | {
7 | "name": "Multi Language",
8 | "model_path": "tts_models--multilingual--multi-dataset--your_tts"
9 | }
10 | ],
11 |
12 | "American English": [
13 | {
14 | "name": "c50",
15 | "model_path": "tts_models--en--blizzard2013--capacitron-t2-c50"
16 | },
17 | {
18 | "name": "c150",
19 | "model_path": "tts_models--en--blizzard2013--capacitron-t2-c150_v2"
20 | },
21 | {
22 | "name": "tacotron2",
23 | "model_path": "tts_models--en--ek1--tacotron2"
24 | },
25 | {
26 | "name": "fast_pitch",
27 | "model_path": "tts_models--en--ljspeech--fast_pitch"
28 | },
29 | {
30 | "name": "glow_tts",
31 | "model_path": "tts_models--en--ljspeech--glow-tts"
32 | },
33 | {
34 | "name": "neutral-hmm",
35 | "model_path": "tts_models--en--ljspeech--neural_hmm"
36 | },
37 | {
38 | "name": "overflow",
39 | "model_path": "tts_models--en--ljspeech--overflow"
40 | },
41 | {
42 | "name": "speedy-speech",
43 | "model_path": "tts_models--en--ljspeech--speedy-speech"
44 | },
45 | {
46 | "name": "DCA",
47 | "model_path": "tts_models--en--ljspeech--tacotron2-DCA"
48 | },
49 | {
50 | "name": "DDC",
51 | "model_path": "tts_models--en--ljspeech--tacotron2-DDC"
52 | },
53 | {
54 | "name": "DDS-PH",
55 | "model_path": "tts_models--en--ljspeech--tacotron2-DDC_ph"
56 | },
57 | {
58 | "name": "Vits",
59 | "model_path": "tts_models--en--ljspeech--vits"
60 | },
61 | {
62 | "name": "DDCv2",
63 | "model_path": "tts_models--en--sam--tacotron-DDC"
64 | },
65 | {
66 | "name": "fast_pitchv2",
67 | "model_path": "tts_models--en--vctk--fast_pitch"
68 | },
69 | {
70 | "name": "Vitsv2",
71 | "model_path": "tts_models--en--vctk--vits"
72 | }
73 | ],
74 |
75 | "United Kingdom": [
76 | {
77 | "name": "Mai-Glow",
78 | "model_path": "tts_models--uk--mai--glow-tts"
79 | }
80 | ],
81 | "Japanese": [
82 | {
83 | "name": "DDC",
84 | "model_path": "tts_models--ja--kokoro--tacotron2-DDC"
85 | }
86 | ],
87 | "Italian": [
88 | {
89 | "name": "Male TTS",
90 | "model_path": "tts_models--it--mai_male--glow-tts"
91 | },
92 | {
93 | "name": "Female TTS",
94 | "model_path": "tts_models--it--mai_female--glow-tts"
95 | }
96 | ],
97 | "Popular Person": [
98 | {
99 | "name": "Chales Hoskinson",
100 | "model_path": "popular_person/popular_person_model",
101 | "model_voice": "popular_person/popular_person_voice/chales_hoskinson.wav"
102 | }
103 | ]
104 | }
105 | }
106 |
--------------------------------------------------------------------------------
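Note: a small sketch of how the catalogue above can be consumed, e.g. to list the models registered for one language. The loop below is hypothetical glue code, not part of `helpers.py`; field names match the JSON.

    # Sketch (hypothetical): read model_list.json and print the entries per language.
    import json

    with open("model_list.json", encoding="utf-8") as f:
        catalogue = json.load(f)

    for entry in catalogue["data"]["American English"]:
        print(f'{entry["name"]:15s} -> {entry["model_path"]}')
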
/output/sample.ttx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/everydaycodings/MimicMania/eecbaae2c430785145afe48bc92a7634ab9fe660/output/sample.ttx
--------------------------------------------------------------------------------
/packages.txt:
--------------------------------------------------------------------------------
1 | espeak-ng
2 | espeak
3 | ffmpeg
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | # core deps
2 | numpy==1.21.6;python_version<"3.10"
3 | numpy;python_version=="3.10"
4 | cython==0.29.28
5 | scipy>=1.4.0
6 | torch>=1.7
7 | torchaudio
8 | soundfile
9 | librosa==0.8.0
10 | numba==0.55.1;python_version<"3.10"
11 | numba==0.55.2;python_version=="3.10"
12 | inflect==5.6.0
13 | tqdm
14 | anyascii
15 | pyyaml
16 | fsspec>=2021.04.0
17 | packaging
18 | streamlit
19 | # deps for examples
20 | flask
21 | # deps for inference
22 | pysbd
23 | # deps for notebooks
24 | umap-learn==0.5.1
25 | pandas
26 | # deps for training
27 | matplotlib
28 | # coqui stack
29 | trainer==0.0.20
30 | # config management
31 | coqpit>=0.0.16
32 | # chinese g2p deps
33 | jieba
34 | pypinyin
35 | # japanese g2p deps
36 | mecab-python3==1.0.5
37 | unidic-lite==1.0.8
38 | # gruut+supported langs
39 | gruut[de]==2.2.3
40 | # deps for korean
41 | jamo
42 | nltk
43 | g2pkk>=0.1.1
44 | pydub
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from helpers import DownloadModels
2 |
3 | DownloadModels().download_models()
--------------------------------------------------------------------------------