├── TTS
├── VERSION
├── bin
│ ├── __init__.py
│ ├── collect_env_info.py
│ ├── find_unique_chars.py
│ ├── find_unique_phonemes.py
│ └── train_tts.py
├── encoder
│ ├── __init__.py
│ ├── utils
│ │ ├── __init__.py
│ │ ├── io.py
│ │ └── visual.py
│ ├── requirements.txt
│ ├── configs
│ │ ├── speaker_encoder_config.py
│ │ ├── emotion_encoder_config.py
│ │ └── base_encoder_config.py
│ └── README.md
├── server
│ ├── __init__.py
│ ├── static
│ │ └── coqui-log-green-TTS.png
│ ├── conf.json
│ └── README.md
├── tts
│ ├── __init__.py
│ ├── utils
│ │ ├── __init__.py
│ │ ├── text
│ │ │ ├── english
│ │ │ │ ├── __init__.py
│ │ │ │ ├── abbreviations.py
│ │ │ │ └── time_norm.py
│ │ │ ├── french
│ │ │ │ ├── __init__.py
│ │ │ │ └── abbreviations.py
│ │ │ ├── nepali
│ │ │ │ └── __init__.py
│ │ │ ├── japanese
│ │ │ │ └── __init__.py
│ │ │ ├── chinese_mandarin
│ │ │ │ ├── __init__.py
│ │ │ │ └── phonemizer.py
│ │ │ ├── __init__.py
│ │ │ └── phonemizers
│ │ │ │ ├── __init__.py
│ │ │ │ ├── zh_cn_phonemizer.py
│ │ │ │ ├── multi_phonemizer.py
│ │ │ │ └── ja_jp_phonemizer.py
│ │ ├── monotonic_align
│ │ │ ├── __init__.py
│ │ │ ├── setup.py
│ │ │ └── core.pyx
│ │ └── measures.py
│ ├── layers
│ │ ├── align_tts
│ │ │ ├── __init__.py
│ │ │ ├── duration_predictor.py
│ │ │ └── mdn.py
│ │ ├── generic
│ │ │ ├── __init__.py
│ │ │ └── gated_conv.py
│ │ ├── glow_tts
│ │ │ ├── __init__.py
│ │ │ └── duration_predictor.py
│ │ ├── tacotron
│ │ │ └── __init__.py
│ │ ├── feed_forward
│ │ │ ├── __init__.py
│ │ │ └── duration_predictor.py
│ │ └── __init__.py
│ ├── configs
│ │ ├── tacotron2_config.py
│ │ └── __init__.py
│ └── models
│ │ └── __init__.py
├── utils
│ ├── __init__.py
│ ├── distribute.py
│ ├── training.py
│ └── capacitron_optimizer.py
├── vocoder
│ ├── __init__.py
│ ├── layers
│ │ ├── __init__.py
│ │ ├── melgan.py
│ │ ├── pqmf.py
│ │ └── hifigan.py
│ ├── utils
│ │ └── __init__.py
│ ├── pqmf_output.wav
│ ├── configs
│ │ └── __init__.py
│ ├── models
│ │ ├── fullband_melgan_generator.py
│ │ ├── multiband_melgan_generator.py
│ │ ├── melgan_multiscale_discriminator.py
│ │ └── base_vocoder.py
│ ├── README.md
│ └── datasets
│ │ └── __init__.py
├── __init__.py
└── model.py
├── docs
├── README.md
├── source
│ ├── contributing.md
│ ├── _static
│ │ └── logo.png
│ ├── main_classes
│ │ ├── trainer_api.md
│ │ ├── speaker_manager.md
│ │ ├── gan.md
│ │ ├── dataset.md
│ │ ├── model_api.md
│ │ └── audio_processor.md
│ ├── _templates
│ │ └── page.html
│ ├── installation.md
│ ├── models
│ │ ├── glow_tts.md
│ │ ├── vits.md
│ │ └── forward_tts.md
│ ├── make.bat
│ ├── tts_datasets.md
│ └── index.md
├── requirements.txt
└── Makefile
├── tests
├── aux_tests
│ ├── __init__.py
│ └── test_stft_torch.py
├── data_tests
│ ├── __init__.py
│ └── test_dataset_formatters.py
├── text_tests
│ ├── __init__.py
│ ├── test_text_cleaners.py
│ ├── test_japanese_phonemizer.py
│ └── test_punctuation.py
├── tts_tests
│ ├── __init__.py
│ ├── test_vits_d-vectors_train.py
│ └── test_tacotron_train.py
├── zoo_tests
│ └── __init__.py
├── inference_tests
│ ├── __init__.py
│ └── test_synthesize.py
├── vocoder_tests
│ ├── __init__.py
│ ├── test_vocoder_melgan_generator.py
│ ├── test_vocoder_rwd.py
│ ├── test_vocoder_pqmf.py
│ ├── test_vocoder_parallel_wavegan_generator.py
│ ├── test_vocoder_melgan_discriminator.py
│ ├── test_hifigan_train.py
│ ├── test_parallel_wavegan_train.py
│ ├── test_vocoder_parallel_wavegan_discriminator.py
│ ├── test_wavegrad_train.py
│ ├── test_melgan_train.py
│ ├── test_wavernn_train.py
│ ├── test_fullband_melgan_train.py
│ ├── test_multiband_melgan_train.py
│ ├── test_vocoder_wavernn.py
│ └── test_wavegrad.py
├── inputs
│ ├── language_ids.json
│ ├── example_1.wav
│ ├── scale_stats.npy
│ ├── server_config.json
│ ├── common_voice.tsv
│ ├── test_vocoder_audio_config.json
│ └── test_config.json
├── data
│ ├── dummy_speakers.pth
│ └── ljspeech
│ │ ├── wavs
│ │ ├── LJ001-0001.npy
│ │ ├── LJ001-0001.wav
│ │ ├── LJ001-0002.npy
│ │ ├── LJ001-0002.wav
│ │ ├── LJ001-0003.npy
│ │ ├── LJ001-0003.wav
│ │ ├── LJ001-0004.npy
│ │ ├── LJ001-0004.wav
│ │ ├── LJ001-0005.npy
│ │ ├── LJ001-0005.wav
│ │ ├── LJ001-0006.npy
│ │ ├── LJ001-0006.wav
│ │ ├── LJ001-0007.npy
│ │ ├── LJ001-0007.wav
│ │ ├── LJ001-0008.npy
│ │ ├── LJ001-0008.wav
│ │ ├── LJ001-0009.npy
│ │ ├── LJ001-0009.wav
│ │ ├── LJ001-0010.npy
│ │ ├── LJ001-0010.wav
│ │ ├── LJ001-0011.npy
│ │ ├── LJ001-0011.wav
│ │ ├── LJ001-0012.npy
│ │ ├── LJ001-0012.wav
│ │ ├── LJ001-0013.npy
│ │ ├── LJ001-0013.wav
│ │ ├── LJ001-0014.npy
│ │ ├── LJ001-0014.wav
│ │ ├── LJ001-0015.npy
│ │ ├── LJ001-0015.wav
│ │ ├── LJ001-0016.npy
│ │ ├── LJ001-0016.wav
│ │ ├── LJ001-0017.npy
│ │ ├── LJ001-0017.wav
│ │ ├── LJ001-0018.npy
│ │ ├── LJ001-0018.wav
│ │ ├── LJ001-0019.npy
│ │ ├── LJ001-0019.wav
│ │ ├── LJ001-0020.npy
│ │ ├── LJ001-0020.wav
│ │ ├── LJ001-0021.npy
│ │ ├── LJ001-0021.wav
│ │ ├── LJ001-0022.npy
│ │ ├── LJ001-0022.wav
│ │ ├── LJ001-0023.npy
│ │ ├── LJ001-0023.wav
│ │ ├── LJ001-0024.npy
│ │ ├── LJ001-0024.wav
│ │ ├── LJ001-0025.npy
│ │ ├── LJ001-0025.wav
│ │ ├── LJ001-0026.npy
│ │ ├── LJ001-0026.wav
│ │ ├── LJ001-0027.npy
│ │ ├── LJ001-0027.wav
│ │ ├── LJ001-0028.npy
│ │ ├── LJ001-0028.wav
│ │ ├── LJ001-0029.npy
│ │ ├── LJ001-0029.wav
│ │ ├── LJ001-0030.npy
│ │ ├── LJ001-0030.wav
│ │ ├── LJ001-0031.npy
│ │ ├── LJ001-0031.wav
│ │ ├── LJ001-0032.npy
│ │ └── LJ001-0032.wav
│ │ ├── f0_cache
│ │ └── pitch_stats.npy
│ │ ├── phoneme_cache
│ │ ├── LJ001-0001_phoneme.npy
│ │ ├── LJ001-0002_phoneme.npy
│ │ ├── LJ001-0003_phoneme.npy
│ │ ├── LJ001-0004_phoneme.npy
│ │ ├── LJ001-0005_phoneme.npy
│ │ ├── LJ001-0006_phoneme.npy
│ │ ├── LJ001-0007_phoneme.npy
│ │ ├── LJ001-0008_phoneme.npy
│ │ ├── LJ001-0009_phoneme.npy
│ │ ├── LJ001-0010_phoneme.npy
│ │ ├── LJ001-0011_phoneme.npy
│ │ ├── LJ001-0012_phoneme.npy
│ │ ├── LJ001-0013_phoneme.npy
│ │ ├── LJ001-0014_phoneme.npy
│ │ ├── LJ001-0015_phoneme.npy
│ │ ├── LJ001-0016_phoneme.npy
│ │ ├── LJ001-0017_phoneme.npy
│ │ ├── LJ001-0018_phoneme.npy
│ │ ├── LJ001-0019_phoneme.npy
│ │ ├── LJ001-0020_phoneme.npy
│ │ ├── LJ001-0021_phoneme.npy
│ │ ├── LJ001-0022_phoneme.npy
│ │ ├── LJ001-0023_phoneme.npy
│ │ ├── LJ001-0024_phoneme.npy
│ │ ├── LJ001-0025_phoneme.npy
│ │ ├── LJ001-0026_phoneme.npy
│ │ ├── LJ001-0027_phoneme.npy
│ │ ├── LJ001-0028_phoneme.npy
│ │ ├── LJ001-0029_phoneme.npy
│ │ ├── LJ001-0030_phoneme.npy
│ │ ├── LJ001-0031_phoneme.npy
│ │ └── LJ001-0032_phoneme.npy
│ │ └── metadata.csv
├── bash_tests
│ ├── test_compute_statistics.sh
│ └── test_demo_server.sh
└── __init__.py
├── .dockerignore
├── requirements.notebooks.txt
├── requirements.dev.txt
├── images
├── model.png
├── tts_cli.gif
├── demo_server.gif
├── TTS-performance.png
├── tts_performance.png
├── coqui-log-green-TTS.png
└── example_model_output.png
├── .cardboardlint.yml
├── setup.cfg
├── run_bash_tests.sh
├── .github
├── ISSUE_TEMPLATE
│ ├── config.yml
│ └── feature_request.md
├── stale.yml
├── PR_TEMPLATE.md
└── workflows
│ ├── style_check.yml
│ ├── vocoder_tests.yml
│ ├── text_tests.yml
│ ├── aux_tests.yml
│ ├── data_tests.yml
│ ├── inference_tests.yml
│ ├── zoo_tests.yml
│ ├── tts_tests.yml
│ └── docker.yaml
├── notebooks
└── dataset_analysis
│ └── README.md
├── MANIFEST.in
├── recipes
├── vctk
│ └── download_vctk.sh
├── ljspeech
│ ├── download_ljspeech.sh
│ ├── README.md
│ ├── univnet
│ │ └── train.py
│ ├── hifigan
│ │ └── train_hifigan.py
│ ├── multiband_melgan
│ │ └── train_multiband_melgan.py
│ ├── wavegrad
│ │ └── train_wavegrad.py
│ ├── wavernn
│ │ └── train_wavernn.py
│ └── align_tts
│ │ └── train_aligntts.py
├── thorsten_DE
│ ├── download_thorsten_DE.sh
│ ├── README.md
│ ├── univnet
│ │ └── train_univnet.py
│ ├── hifigan
│ │ └── train_hifigan.py
│ ├── multiband_melgan
│ │ └── train_multiband_melgan.py
│ ├── wavegrad
│ │ └── train_wavegrad.py
│ └── wavernn
│ │ └── train_wavernn.py
├── README.md
├── blizzard2013
│ └── README.md
└── kokoro
│ └── tacotron2-DDC
│ └── run.sh
├── .readthedocs.yml
├── CITATION.cff
├── pyproject.toml
├── .pre-commit-config.yaml
├── Dockerfile
├── requirements.txt
├── hubconf.py
└── Makefile
/TTS/VERSION:
--------------------------------------------------------------------------------
0.7.1
--------------------------------------------------------------------------------
/docs/README.md:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/TTS/bin/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/TTS/encoder/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/TTS/server/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/TTS/tts/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/TTS/utils/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/TTS/vocoder/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/TTS/tts/utils/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/tests/aux_tests/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/tests/data_tests/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/tests/text_tests/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/tests/tts_tests/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/tests/zoo_tests/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/TTS/encoder/utils/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/TTS/vocoder/layers/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/TTS/vocoder/utils/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/tests/inference_tests/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/tests/vocoder_tests/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
.git/
Dockerfile
--------------------------------------------------------------------------------
/TTS/tts/layers/align_tts/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/TTS/tts/layers/generic/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/TTS/tts/layers/glow_tts/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/TTS/tts/layers/tacotron/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/TTS/tts/utils/text/english/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/TTS/tts/utils/text/french/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/TTS/tts/utils/text/nepali/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/requirements.notebooks.txt:
--------------------------------------------------------------------------------
bokeh==1.4.0
--------------------------------------------------------------------------------
/tests/aux_tests/test_stft_torch.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/TTS/tts/layers/feed_forward/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/TTS/tts/utils/monotonic_align/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/TTS/tts/utils/text/japanese/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/TTS/tts/utils/text/chinese_mandarin/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/TTS/encoder/requirements.txt:
--------------------------------------------------------------------------------
umap-learn
numpy>=1.17.0
--------------------------------------------------------------------------------
/TTS/tts/layers/__init__.py:
--------------------------------------------------------------------------------
from TTS.tts.layers.losses import *
--------------------------------------------------------------------------------
/requirements.dev.txt:
--------------------------------------------------------------------------------
black
coverage
isort
nose2
pylint==2.10.2
--------------------------------------------------------------------------------
/TTS/tts/utils/text/__init__.py:
--------------------------------------------------------------------------------
from TTS.tts.utils.text.tokenizer import TTSTokenizer
--------------------------------------------------------------------------------
/images/model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/images/model.png
--------------------------------------------------------------------------------
/tests/inputs/language_ids.json:
--------------------------------------------------------------------------------
{
    "en": 0,
    "fr-fr": 1,
    "pt-br": 2
}
--------------------------------------------------------------------------------
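The JSON above is a small config mapping language names to integer IDs for the multilingual test fixtures. A minimal sketch of consuming it; the comment about the IDs indexing a language embedding table is our assumption, not something the file states:

```python
import json

# Load the language-name -> integer ID mapping used by the multilingual tests.
with open("tests/inputs/language_ids.json", "r", encoding="utf-8") as f:
    language_ids = json.load(f)

# Assumption (not stated in the file): these IDs index a language embedding table.
assert language_ids["en"] == 0
print(sorted(language_ids, key=language_ids.get))  # ['en', 'fr-fr', 'pt-br']
```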
/docs/source/contributing.md:
--------------------------------------------------------------------------------
```{include} ../../CONTRIBUTING.md
:relative-images:
```
--------------------------------------------------------------------------------
/images/tts_cli.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/images/tts_cli.gif
--------------------------------------------------------------------------------
/images/demo_server.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/images/demo_server.gif
--------------------------------------------------------------------------------
/images/TTS-performance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/images/TTS-performance.png
--------------------------------------------------------------------------------
/images/tts_performance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/images/tts_performance.png
--------------------------------------------------------------------------------
/tests/inputs/example_1.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/inputs/example_1.wav
--------------------------------------------------------------------------------
/TTS/vocoder/pqmf_output.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/TTS/vocoder/pqmf_output.wav
--------------------------------------------------------------------------------
/docs/source/_static/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/docs/source/_static/logo.png
--------------------------------------------------------------------------------
/tests/data/dummy_speakers.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/dummy_speakers.pth
--------------------------------------------------------------------------------
/tests/inputs/scale_stats.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/inputs/scale_stats.npy
--------------------------------------------------------------------------------
/images/coqui-log-green-TTS.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/images/coqui-log-green-TTS.png
--------------------------------------------------------------------------------
/images/example_model_output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/images/example_model_output.png
--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
furo
myst-parser == 0.15.1
sphinx == 4.0.2
sphinx_inline_tabs
sphinx_copybutton
linkify-it-py
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0001.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0001.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0001.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0001.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0002.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0002.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0002.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0002.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0003.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0003.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0003.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0003.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0004.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0004.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0004.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0004.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0005.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0005.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0005.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0005.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0006.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0006.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0006.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0006.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0007.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0007.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0007.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0007.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0008.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0008.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0008.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0008.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0009.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0009.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0009.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0009.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0010.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0010.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0010.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0010.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0011.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0011.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0011.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0011.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0012.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0012.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0012.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0012.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0013.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0013.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0013.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0013.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0014.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0014.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0014.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0014.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0015.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0015.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0015.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0015.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0016.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0016.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0016.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0016.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0017.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0017.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0017.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0017.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0018.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0018.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0018.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0018.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0019.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0019.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0019.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0019.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0020.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0020.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0020.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0020.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0021.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0021.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0021.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0021.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0022.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0022.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0022.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0022.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0023.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0023.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0023.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0023.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0024.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0024.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0024.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0024.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0025.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0025.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0025.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0025.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0026.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0026.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0026.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0026.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0027.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0027.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0027.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0027.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0028.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0028.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0028.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0028.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0029.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0029.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0029.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0029.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0030.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0030.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0030.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0030.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0031.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0031.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0031.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0031.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0032.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0032.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0032.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0032.wav
--------------------------------------------------------------------------------
/TTS/server/static/coqui-log-green-TTS.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/TTS/server/static/coqui-log-green-TTS.png
--------------------------------------------------------------------------------
/docs/source/main_classes/trainer_api.md:
--------------------------------------------------------------------------------
# Trainer API

We made the trainer a separate project: https://github.com/coqui-ai/Trainer
--------------------------------------------------------------------------------
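Since the trainer now lives in an external package, here is a hedged sketch of the usual training entry point, built only from pieces shown verbatim elsewhere in this dump (`Tacotron2Config`, `setup_model`). The `Trainer` call is commented out because its exact signature belongs to the separate coqui-ai/Trainer package and follows the pattern of the `recipes/` scripts, so check it against that repo:

```python
# Hedged sketch of a training entry point; not the repo's own script.
from TTS.tts.configs.tacotron2_config import Tacotron2Config
from TTS.tts.models import setup_model

config = Tacotron2Config()   # config.model == "tacotron2"
model = setup_model(config)  # resolves config.model to a model class via find_module()

# from trainer import Trainer, TrainerArgs  # external package, see trainer_api.md
# trainer = Trainer(TrainerArgs(), config, output_path="output/", model=model)
# trainer.fit()
```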
/.cardboardlint.yml:
--------------------------------------------------------------------------------
linters:
  - pylint:
      # pylintrc: pylintrc
      filefilter: ['- test_*.py', '+ *.py', '- *.npy']
      # exclude:
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
[build_py]
build-lib=temp_build

[bdist_wheel]
bdist-dir=temp_build

[install_lib]
build-dir=temp_build
--------------------------------------------------------------------------------
/tests/data/ljspeech/f0_cache/pitch_stats.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/f0_cache/pitch_stats.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0001_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0001_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0002_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0002_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0003_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0003_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0004_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0004_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0005_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0005_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0006_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0006_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0007_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0007_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0008_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0008_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0009_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0009_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0010_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0010_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0011_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0011_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0012_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0012_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0013_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0013_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0014_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0014_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0015_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0015_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0016_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0016_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0017_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0017_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0018_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0018_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0019_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0019_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0020_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0020_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0021_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0021_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0022_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0022_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0023_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0023_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0024_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0024_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0025_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0025_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0026_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0026_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0027_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0027_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0028_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0028_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0029_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0029_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0030_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0030_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0031_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0031_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0032_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0032_phoneme.npy
--------------------------------------------------------------------------------
/TTS/__init__.py:
--------------------------------------------------------------------------------
import os

with open(os.path.join(os.path.dirname(__file__), "VERSION"), "r", encoding="utf-8") as f:
    version = f.read().strip()

__version__ = version
--------------------------------------------------------------------------------
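`__version__` is populated from the `TTS/VERSION` file at import time, so (assuming the package is importable) the value can be checked directly:

```python
# Minimal check that the package version was picked up from the VERSION file.
import TTS

print(TTS.__version__)  # "0.7.1", matching /TTS/VERSION above
```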
/run_bash_tests.sh:
--------------------------------------------------------------------------------
set -e
TF_CPP_MIN_LOG_LEVEL=3

# runtime bash-based tests
# TODO: move these to python
./tests/bash_tests/test_demo_server.sh && \
./tests/bash_tests/test_compute_statistics.sh
--------------------------------------------------------------------------------
/TTS/tts/utils/monotonic_align/setup.py:
--------------------------------------------------------------------------------
# from distutils.core import setup
# from Cython.Build import cythonize
# import numpy

# setup(name='monotonic_align',
#       ext_modules=cythonize("core.pyx"),
#       include_dirs=[numpy.get_include()])
--------------------------------------------------------------------------------
/tests/bash_tests/test_compute_statistics.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
set -xe
BASEDIR=$(dirname "$0")
echo "$BASEDIR"
# compute the normalization statistics for the test config
CUDA_VISIBLE_DEVICES="" python TTS/bin/compute_statistics.py --config_path $BASEDIR/../inputs/test_glow_tts.json --out_path $BASEDIR/../outputs/scale_stats.npy
--------------------------------------------------------------------------------
/docs/source/main_classes/speaker_manager.md:
--------------------------------------------------------------------------------
# Speaker Manager API

The {class}`TTS.tts.utils.speakers.SpeakerManager` organizes speaker-related data and information for 🐸TTS models.
It is especially useful for multi-speaker models.


## Speaker Manager
```{eval-rst}
.. automodule:: TTS.tts.utils.speakers
    :members:
```
--------------------------------------------------------------------------------
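A hypothetical usage sketch for the class documented above; the constructor keyword and the attributes used here are assumptions based on typical `SpeakerManager` usage and may differ across TTS versions, so verify against `TTS/tts/utils/speakers.py`:

```python
# Hypothetical sketch: argument and attribute names are assumptions.
from TTS.tts.utils.speakers import SpeakerManager

manager = SpeakerManager(speaker_id_file_path="speakers.json")  # assumed kwarg
print(manager.num_speakers)   # assumed attribute
print(manager.speaker_names)  # assumed attribute
```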
/.github/ISSUE_TEMPLATE/config.yml:
--------------------------------------------------------------------------------
blank_issues_enabled: false
contact_links:
  - name: CoquiTTS GitHub Discussions
    url: https://github.com/coqui-ai/TTS/discussions
    about: Please ask and answer questions here.
  - name: Coqui Security issue disclosure
    url: mailto:info@coqui.ai
    about: Please report security vulnerabilities here.
--------------------------------------------------------------------------------
/TTS/encoder/configs/speaker_encoder_config.py:
--------------------------------------------------------------------------------
from dataclasses import asdict, dataclass

from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig


@dataclass
class SpeakerEncoderConfig(BaseEncoderConfig):
    """Defines parameters for Speaker Encoder model."""

    model: str = "speaker_encoder"
    class_name_key: str = "speaker_name"
--------------------------------------------------------------------------------
/docs/source/main_classes/gan.md:
--------------------------------------------------------------------------------
# GAN API

The {class}`TTS.vocoder.models.gan.GAN` provides an easy way to implement new GAN-based models. You just need
to define the model architectures for the generator and the discriminator networks and give them to the `GAN` class
to do its ✨️.


## GAN
```{eval-rst}
.. autoclass:: TTS.vocoder.models.gan.GAN
    :members:
```
--------------------------------------------------------------------------------
/notebooks/dataset_analysis/README.md:
--------------------------------------------------------------------------------
## Simple Notebook to Analyze a Dataset

With this notebook, you can easily analyze a brand-new dataset, find exceptional cases, and define your training set.

What we are looking for here is a reasonable distribution of instances in terms of sequence length, audio length, and word coverage.

This notebook is inspired by https://github.com/MycroftAI/mimic2
--------------------------------------------------------------------------------
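A minimal sketch of the kind of distributions the notebook above inspects, assuming an LJSpeech-style pipe-delimited `metadata.csv` next to a `wavs/` folder; the `DATA_DIR` path is illustrative only:

```python
import os
import wave
from collections import Counter

# Illustrative path; any LJSpeech-formatted dataset works.
DATA_DIR = "tests/data/ljspeech"

seq_lens, audio_lens = [], []
word_counts = Counter()
with open(os.path.join(DATA_DIR, "metadata.csv"), encoding="utf-8") as f:
    for line in f:
        cols = line.strip().split("|")
        wav_id, text = cols[0], cols[-1]       # id | raw text | normalized text
        seq_lens.append(len(text))
        word_counts.update(text.lower().split())
        with wave.open(os.path.join(DATA_DIR, "wavs", wav_id + ".wav")) as w:
            audio_lens.append(w.getnframes() / w.getframerate())

print(f"{len(seq_lens)} clips, {sum(audio_lens) / 3600:.2f} hours")
print(f"longest text: {max(seq_lens)} chars, longest clip: {max(audio_lens):.2f} s")
print(f"vocabulary size: {len(word_counts)} words")
```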
3 | RUN_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" 4 | echo $RUN_DIR 5 | # download VCTK dataset 6 | wget https://datashare.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip -O VCTK-Corpus-0.92.zip 7 | # extract 8 | mkdir VCTK 9 | unzip VCTK-Corpus-0.92 -d VCTK 10 | # move the dataset under the recipes folder 11 | mv VCTK $RUN_DIR/recipes/vctk/ 12 | rm VCTK-Corpus-0.92.zip 13 | -------------------------------------------------------------------------------- /TTS/server/conf.json: -------------------------------------------------------------------------------- 1 | { 2 | "tts_path":"/media/erogol/data_ssd/Models/libri_tts/5049/", // tts model root folder 3 | "tts_file":"best_model.pth", // tts checkpoint file 4 | "tts_config":"config.json", // tts config.json file 5 | "tts_speakers": null, // json file listing speaker ids. null if no speaker embedding. 6 | "vocoder_config":null, 7 | "vocoder_file": null, 8 | "is_wavernn_batched":true, 9 | "port": 5002, 10 | "use_cuda": true, 11 | "debug": true 12 | } 13 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Build documentation in the docs/ directory with Sphinx 9 | sphinx: 10 | builder: html 11 | configuration: docs/source/conf.py 12 | 13 | # Optionally set the version of Python and requirements required to build your docs 14 | python: 15 | version: 3.7 16 | install: 17 | - requirements: docs/requirements.txt 18 | - requirements: requirements.txt -------------------------------------------------------------------------------- /docs/source/main_classes/dataset.md: -------------------------------------------------------------------------------- 1 | # Datasets 2 | 3 | ## TTS Dataset 4 | 5 | ```{eval-rst} 6 | .. autoclass:: TTS.tts.datasets.TTSDataset 7 | :members: 8 | ``` 9 | 10 | ## Vocoder Dataset 11 | 12 | ```{eval-rst} 13 | .. autoclass:: TTS.vocoder.datasets.gan_dataset.GANDataset 14 | :members: 15 | ``` 16 | 17 | ```{eval-rst} 18 | .. autoclass:: TTS.vocoder.datasets.wavegrad_dataset.WaveGradDataset 19 | :members: 20 | ``` 21 | 22 | ```{eval-rst} 23 | .. autoclass:: TTS.vocoder.datasets.wavernn_dataset.WaveRNNDataset 24 | :members: 25 | ``` -------------------------------------------------------------------------------- /docs/source/main_classes/model_api.md: -------------------------------------------------------------------------------- 1 | # Model API 2 | Model API provides you with a set of functions that easily make your model compatible with the `Trainer`, 3 | `Synthesizer` and `ModelZoo`. 4 | 5 | ## Base TTS Model 6 | 7 | ```{eval-rst} 8 | .. autoclass:: TTS.model.BaseModel 9 | :members: 10 | ``` 11 | 12 | ## Base `tts` Model 13 | 14 | ```{eval-rst} 15 | .. autoclass:: TTS.tts.models.base_tts.BaseTTS 16 | :members: 17 | ``` 18 | 19 | ## Base `vocoder` Model 20 | 21 | ```{eval-rst} 22 | .. autoclass:: TTS.vocoder.models.base_vocoder.BaseVocoder 23 | :members: 24 | ``` -------------------------------------------------------------------------------- /TTS/tts/utils/measures.py: -------------------------------------------------------------------------------- 1 | def alignment_diagonal_score(alignments, binary=False): 2 | """ 3 | Compute how diagonal alignment predictions are. 
It is useful 4 | to measure the alignment consistency of a model 5 | Args: 6 | alignments (torch.Tensor): batch of alignments. 7 | binary (bool): if True, ignore scores and consider attention 8 | as a binary mask. 9 | Shape: 10 | - alignments : :math:`[B, T_de, T_en]` 11 | """ 12 | maxs = alignments.max(dim=1)[0] 13 | if binary: 14 | maxs[maxs > 0] = 1 15 | return maxs.mean(dim=1).mean(dim=0).item() 16 | -------------------------------------------------------------------------------- /TTS/tts/configs/tacotron2_config.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | from TTS.tts.configs.tacotron_config import TacotronConfig 4 | 5 | 6 | @dataclass 7 | class Tacotron2Config(TacotronConfig): 8 | """Defines parameters for Tacotron2 based models. 9 | 10 | Example: 11 | 12 | >>> from TTS.tts.configs.tacotron2_config import Tacotron2Config 13 | >>> config = Tacotron2Config() 14 | 15 | Check `TacotronConfig` for argument descriptions. 16 | """ 17 | 18 | model: str = "tacotron2" 19 | out_channels: int = 80 20 | encoder_in_features: int = 512 21 | decoder_in_features: int = 512 22 | -------------------------------------------------------------------------------- /TTS/tts/models/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, Union 2 | 3 | from TTS.utils.generic_utils import find_module 4 | 5 | 6 | def setup_model(config: "Coqpit", samples: Union[List[List], List[Dict]] = None) -> "BaseTTS": 7 | print(" > Using model: {}".format(config.model)) 8 | # fetch the right model implementation. 9 | if "base_model" in config and config["base_model"] is not None: 10 | MyModel = find_module("TTS.tts.models", config.base_model.lower()) 11 | else: 12 | MyModel = find_module("TTS.tts.models", config.model.lower()) 13 | model = MyModel.init_from_config(config, samples) 14 | return model 15 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you want to cite 🐸💬, feel free to use this (but only if you loved it 😊)" 3 | title: "Coqui TTS" 4 | abstract: "A deep learning toolkit for Text-to-Speech, battle-tested in research and production" 5 | date-released: 2021-01-01 6 | authors: 7 | - family-names: "Eren" 8 | given-names: "Gölge" 9 | - name: "The Coqui TTS Team" 10 | version: 1.4 11 | doi: 10.5281/zenodo.6334862 12 | license: "MPL-2.0" 13 | url: "https://www.coqui.ai" 14 | repository-code: "https://github.com/coqui-ai/TTS" 15 | keywords: 16 | - machine learning 17 | - deep learning 18 | - artificial intelligence 19 | - text to speech 20 | - TTS -------------------------------------------------------------------------------- /tests/vocoder_tests/test_vocoder_rwd.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | from TTS.vocoder.models.random_window_discriminator import RandomWindowDiscriminator 5 | 6 | 7 | def test_rwd(): 8 | layer = RandomWindowDiscriminator( 9 | cond_channels=80, 10 | window_sizes=(512, 1024, 2048, 4096, 8192), 11 | cond_disc_downsample_factors=[(8, 4, 2, 2, 2), (8, 4, 2, 2), (8, 4, 2), (8, 4), (4, 2, 2)], 12 | hop_length=256, 13 | ) 14 | x = torch.rand([4, 1, 22050]) 15 | c = torch.rand([4, 80, 22050 // 256]) 16 | 17 | scores, _ = layer(x, c) 18 | assert len(scores) == 10 19 | assert 
np.all(scores[0].shape == (4, 1, 1)) 20 | -------------------------------------------------------------------------------- /recipes/ljspeech/download_ljspeech.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # take the script's parent directory to prefix all the output paths. 3 | RUN_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" 4 | echo $RUN_DIR 5 | # download LJSpeech dataset 6 | wget http://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2 7 | # extract 8 | tar -xjf LJSpeech-1.1.tar.bz2 9 | # create train-val splits 10 | shuf LJSpeech-1.1/metadata.csv > LJSpeech-1.1/metadata_shuf.csv 11 | head -n 12000 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_train.csv 12 | tail -n 1100 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_val.csv 13 | mv LJSpeech-1.1 $RUN_DIR/recipes/ljspeech/ 14 | rm LJSpeech-1.1.tar.bz2 -------------------------------------------------------------------------------- /recipes/thorsten_DE/download_thorsten_DE.sh: -------------------------------------------------------------------------------- 1 | # create venv 2 | python3 -m venv env 3 | source env/bin/activate 4 | pip install pip --upgrade 5 | 6 | # download Thorsten_DE dataset 7 | pip install gdown 8 | gdown --id 1yKJM1LAOQpRVojKunD9r8WN_p5KzBxjc -O dataset.tgz 9 | tar -xzf dataset.tgz 10 | 11 | # create train-val splits 12 | shuf LJSpeech-1.1/metadata.csv > LJSpeech-1.1/metadata_shuf.csv 13 | head -n 20668 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_train.csv 14 | tail -n 2000 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_val.csv 15 | 16 | # rename dataset and remove archive 17 | mv LJSpeech-1.1 thorsten-de 18 | rm dataset.tgz 19 | 20 | # destroy venv 21 | rm -rf env 22 | -------------------------------------------------------------------------------- /tests/inputs/server_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "tts_checkpoint":"checkpoint_10.pth", // tts checkpoint file 3 | "tts_config":"dummy_model_config.json", // tts config.json file 4 | "tts_speakers": null, // json file listing speaker ids. null if no speaker embedding. 5 | "wavernn_lib_path": null, // Rootpath to wavernn project folder to be imported. If this is null, model uses GL for speech synthesis. 6 | "wavernn_file": null, // wavernn checkpoint file name 7 | "wavernn_config": null, // wavernn config file 8 | "vocoder_config":null, 9 | "vocoder_checkpoint": null, 10 | "is_wavernn_batched":true, 11 | "port": 5002, 12 | "use_cuda": false, 13 | "debug": true 14 | } 15 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= -j auto -WT --keep-going 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "wheel", "cython==0.29.28", "numpy==1.21.6"] 3 | 4 | [flake8] 5 | max-line-length=120 6 | 7 | [tool.black] 8 | line-length = 120 9 | target-version = ['py39'] 10 | exclude = ''' 11 | 12 | ( 13 | /( 14 | \.eggs # exclude a few common directories in the 15 | | \.git # root of the project 16 | | \.hg 17 | | \.mypy_cache 18 | | \.tox 19 | | \.venv 20 | | _build 21 | | buck-out 22 | | build 23 | | dist 24 | )/ 25 | | foo.py # also separately exclude a file named foo.py in 26 | # the root of the project 27 | ) 28 | ''' 29 | 30 | [tool.isort] 31 | line_length = 120 32 | profile = "black" 33 | multi_line_output = 3 -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: 'https://github.com/pre-commit/pre-commit-hooks' 3 | rev: v2.3.0 4 | hooks: 5 | - id: check-yaml 6 | - id: end-of-file-fixer 7 | - id: trailing-whitespace 8 | - repo: 'https://github.com/psf/black' 9 | rev: 20.8b1 10 | hooks: 11 | - id: black 12 | language_version: python3 13 | - repo: https://github.com/pycqa/isort 14 | rev: 5.8.0 15 | hooks: 16 | - id: isort 17 | name: isort (python) 18 | - id: isort 19 | name: isort (cython) 20 | types: [cython] 21 | - id: isort 22 | name: isort (pyi) 23 | types: [pyi] 24 | - repo: https://github.com/pycqa/pylint 25 | rev: v2.8.2 26 | hooks: 27 | - id: pylint 28 | -------------------------------------------------------------------------------- /.github/stale.yml: -------------------------------------------------------------------------------- 1 | # Number of days of inactivity before an issue becomes stale 2 | daysUntilStale: 30 3 | # Number of days of inactivity before a stale issue is closed 4 | daysUntilClose: 7 5 | # Issues with these labels will never be considered stale 6 | exemptLabels: 7 | - pinned 8 | - security 9 | # Label to use when marking an issue as stale 10 | staleLabel: wontfix 11 | # Comment to post when marking an issue as stale. Set to `false` to disable 12 | markComment: > 13 | This issue has been automatically marked as stale because it has not had 14 | recent activity. It will be closed if no further activity occurs. Thank you 15 | for your contributions. You might also check out our discussion channels. 16 | # Comment to post when closing a stale issue. Set to `false` to disable 17 | closeComment: false 18 | 19 | -------------------------------------------------------------------------------- /TTS/utils/distribute.py: -------------------------------------------------------------------------------- 1 | # edited from https://github.com/fastai/imagenet-fast/blob/master/imagenet_nv/distributed.py 2 | import torch 3 | import torch.distributed as dist 4 | 5 | 6 | def reduce_tensor(tensor, num_gpus): 7 | rt = tensor.clone() 8 | dist.all_reduce(rt, op=dist.reduce_op.SUM) 9 | rt /= num_gpus 10 | return rt 11 | 12 | 13 | def init_distributed(rank, num_gpus, group_name, dist_backend, dist_url): 14 | assert torch.cuda.is_available(), "Distributed mode requires CUDA." 15 | 16 | # Set cuda device so everything is done on the right GPU. 
17 | torch.cuda.set_device(rank % torch.cuda.device_count()) 18 | 19 | # Initialize distributed communication 20 | dist.init_process_group(dist_backend, init_method=dist_url, world_size=num_gpus, rank=rank, group_name=group_name) 21 | -------------------------------------------------------------------------------- /docs/source/main_classes/audio_processor.md: -------------------------------------------------------------------------------- 1 | # AudioProcessor API 2 | 3 | `TTS.utils.audio.AudioProcessor` is the core class for all the audio processing routines. It provides an API for 4 | 5 | - Feature extraction. 6 | - Sound normalization. 7 | - Reading and writing audio files. 8 | - Sampling audio signals. 9 | - Normalizing and denormalizing audio signals. 10 | - Griffin-Lim vocoder. 11 | 12 | The `AudioProcessor` needs to be initialized with `TTS.config.shared_configs.BaseAudioConfig`. Any model config 13 | must also inherit from or instantiate `BaseAudioConfig`. 14 | 15 | ## AudioProcessor 16 | ```{eval-rst} 17 | .. autoclass:: TTS.utils.audio.AudioProcessor 18 | :members: 19 | ``` 20 | 21 | ## BaseAudioConfig 22 | ```{eval-rst} 23 | .. autoclass:: TTS.config.shared_configs.BaseAudioConfig 24 | :members: 25 | ``` -------------------------------------------------------------------------------- /TTS/vocoder/configs/__init__.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import os 3 | from inspect import isclass 4 | 5 | # import all files under configs/ 6 | configs_dir = os.path.dirname(__file__) 7 | for file in os.listdir(configs_dir): 8 | path = os.path.join(configs_dir, file) 9 | if not file.startswith("_") and not file.startswith(".") and (file.endswith(".py") or os.path.isdir(path)): 10 | config_name = file[: file.find(".py")] if file.endswith(".py") else file 11 | module = importlib.import_module("TTS.vocoder.configs." 
+ config_name) 12 | for attribute_name in dir(module): 13 | attribute = getattr(module, attribute_name) 14 | 15 | if isclass(attribute): 16 | # Add the class to this package's variables 17 | globals()[attribute_name] = attribute 18 | -------------------------------------------------------------------------------- /tests/vocoder_tests/test_vocoder_pqmf.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import soundfile as sf 4 | import torch 5 | from librosa.core import load 6 | 7 | from tests import get_tests_input_path, get_tests_output_path, get_tests_path 8 | from TTS.vocoder.layers.pqmf import PQMF 9 | 10 | TESTS_PATH = get_tests_path() 11 | WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") 12 | 13 | 14 | def test_pqmf(): 15 | w, sr = load(WAV_FILE) 16 | 17 | layer = PQMF(N=4, taps=62, cutoff=0.15, beta=9.0) 18 | w2 = torch.from_numpy(w[None, None, :]) 19 | b2 = layer.analysis(w2) 20 | w2_ = layer.synthesis(b2) 21 | 22 | print(w2_.max()) 23 | print(w2_.min()) 24 | print(w2_.mean()) 25 | sf.write(os.path.join(get_tests_output_path(), "pqmf_output.wav"), w2_.flatten().detach(), sr) 26 | -------------------------------------------------------------------------------- /TTS/tts/configs/__init__.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import os 3 | from inspect import isclass 4 | 5 | # import all files under configs/ 6 | # configs_dir = os.path.dirname(__file__) 7 | # for file in os.listdir(configs_dir): 8 | # path = os.path.join(configs_dir, file) 9 | # if not file.startswith("_") and not file.startswith(".") and (file.endswith(".py") or os.path.isdir(path)): 10 | # config_name = file[: file.find(".py")] if file.endswith(".py") else file 11 | # module = importlib.import_module("TTS.tts.configs." + config_name) 12 | # for attribute_name in dir(module): 13 | # attribute = getattr(module, attribute_name) 14 | 15 | # if isclass(attribute): 16 | # # Add the class to this package's variables 17 | # globals()[attribute_name] = attribute 18 | -------------------------------------------------------------------------------- /TTS/tts/utils/text/english/abbreviations.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | # List of (regular expression, replacement) pairs for abbreviations in English: 4 | abbreviations_en = [ 5 | (re.compile("\\b%s\\." 
% x[0], re.IGNORECASE), x[1]) 6 | for x in [ 7 | ("mrs", "misess"), 8 | ("mr", "mister"), 9 | ("dr", "doctor"), 10 | ("st", "saint"), 11 | ("co", "company"), 12 | ("jr", "junior"), 13 | ("maj", "major"), 14 | ("gen", "general"), 15 | ("drs", "doctors"), 16 | ("rev", "reverend"), 17 | ("lt", "lieutenant"), 18 | ("hon", "honorable"), 19 | ("sgt", "sergeant"), 20 | ("capt", "captain"), 21 | ("esq", "esquire"), 22 | ("ltd", "limited"), 23 | ("col", "colonel"), 24 | ("ft", "fort"), 25 | ] 26 | ] 27 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | ARG BASE=nvcr.io/nvidia/pytorch:22.03-py3 2 | FROM ${BASE} 3 | RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ make python3 python3-dev python3-pip python3-venv python3-wheel espeak espeak-ng libsndfile1-dev && rm -rf /var/lib/apt/lists/* 4 | RUN pip install llvmlite --ignore-installed 5 | 6 | # Create and activate virtual env 7 | ENV VIRTUAL_ENV=/venv 8 | RUN python3 -m venv $VIRTUAL_ENV 9 | ENV PATH="$VIRTUAL_ENV/bin:$PATH" 10 | RUN pip install -U pip setuptools wheel 11 | 12 | WORKDIR /root 13 | COPY requirements.txt /root 14 | COPY requirements.dev.txt /root 15 | COPY requirements.notebooks.txt /root 16 | RUN ["/bin/bash", "-c", "pip install -r <(cat requirements.txt requirements.dev.txt requirements.notebooks.txt)"] 17 | COPY . /root 18 | RUN make install 19 | ENTRYPOINT ["tts"] 20 | CMD ["--help"] 21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: 🚀 Feature request 3 | about: Suggest a feature or an idea for this project 4 | title: '[Feature request] ' 5 | labels: feature request 6 | assignees: '' 7 | 8 | --- 9 | 11 | **🚀 Feature Description** 12 | 13 | 14 | 15 | **Solution** 16 | 17 | 18 | 19 | **Alternative Solutions** 20 | 21 | 22 | 23 | **Additional context** 24 | 25 | 26 | -------------------------------------------------------------------------------- /docs/source/_templates/page.html: -------------------------------------------------------------------------------- 1 | {% extends "!page.html" %} 2 | {% block scripts %} 3 | {{ super() }} 4 | 5 | 6 | 7 | 13 | 21 | 22 | 23 | {% endblock %} 24 | -------------------------------------------------------------------------------- /recipes/ljspeech/README.md: -------------------------------------------------------------------------------- 1 | # 🐸💬 TTS LJSpeech Recipes 2 | 3 | For running the recipes: 4 | 5 | 1. Download the LJSpeech dataset, either manually from [its official website](https://keithito.com/LJ-Speech-Dataset/) or using ```download_ljspeech.sh```. 6 | 2. Go to your desired model folder and run the training. 7 | 8 | Running Python files. (Choose the desired GPU ID for your run and set ```CUDA_VISIBLE_DEVICES```) 9 | ```terminal 10 | CUDA_VISIBLE_DEVICES="0" python train_modelX.py 11 | ``` 12 | 13 | Running bash scripts. 14 | ```terminal 15 | bash run.sh 16 | ``` 17 | 18 | 💡 Note that these runs are just templates to help you start training your first model. They are not optimized for the best 19 | result. Double-check the configurations and feel free to share your experiments to find better parameters together 💪. 
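For orientation, the recipe scripts all follow the same pattern as the vocoder recipes in this repository (build a config, load samples, build the model, hand everything to the `Trainer`). Below is a minimal, hypothetical `train_glow_tts.py` sketch; `setup_model` is taken from `TTS/tts/models/__init__.py`, while the `BaseDatasetConfig` fields and `load_tts_samples` call are assumptions for this TTS version, so check an actual recipe script for exact signatures.

```python
import os

from trainer import Trainer, TrainerArgs

from TTS.config.shared_configs import BaseDatasetConfig  # assumed location of the dataset config
from TTS.tts.configs.glow_tts_config import GlowTTSConfig
from TTS.tts.datasets import load_tts_samples  # assumed helper for reading formatter output
from TTS.tts.models import setup_model

output_path = os.path.dirname(os.path.abspath(__file__))

# point the "ljspeech" formatter at the folder created by download_ljspeech.sh
dataset_config = BaseDatasetConfig(
    name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "LJSpeech-1.1/")
)
config = GlowTTSConfig(
    batch_size=32,
    eval_batch_size=16,
    run_eval=True,
    epochs=1000,
    print_step=25,
    output_path=output_path,
    datasets=[dataset_config],
)

# load training samples and let setup_model pick the implementation named in the config
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
model = setup_model(config, samples=train_samples + eval_samples)

# init the trainer and 🚀
trainer = Trainer(
    TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
)
trainer.fit()
```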
20 | -------------------------------------------------------------------------------- /tests/text_tests/test_text_cleaners.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from TTS.tts.utils.text.cleaners import english_cleaners, phoneme_cleaners 4 | 5 | 6 | def test_time() -> None: 7 | assert english_cleaners("It's 11:00") == "it's eleven a m" 8 | assert english_cleaners("It's 9:01") == "it's nine oh one a m" 9 | assert english_cleaners("It's 16:00") == "it's four p m" 10 | assert english_cleaners("It's 00:00 am") == "it's twelve a m" 11 | 12 | 13 | def test_currency() -> None: 14 | assert phoneme_cleaners("It's $10.50") == "It's ten dollars fifty cents" 15 | assert phoneme_cleaners("£1.1") == "one pound sterling one penny" 16 | assert phoneme_cleaners("¥1") == "one yen" 17 | 18 | 19 | def test_expand_numbers() -> None: 20 | assert phoneme_cleaners("-1") == "minus one" 21 | assert phoneme_cleaners("1") == "one" 22 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # core deps 2 | numpy==1.21.6;python_version<"3.10" 3 | numpy==1.22.4;python_version=="3.10" 4 | cython==0.29.28 5 | scipy>=1.4.0 6 | torch>=1.7 7 | torchaudio 8 | soundfile 9 | librosa==0.8.0 10 | numba==0.55.1;python_version<"3.10" 11 | numba==0.55.2;python_version=="3.10" 12 | inflect 13 | tqdm 14 | anyascii 15 | pyyaml 16 | fsspec>=2021.04.0 17 | # deps for examples 18 | flask 19 | # deps for inference 20 | pysbd 21 | # deps for notebooks 22 | umap-learn==0.5.1 23 | pandas 24 | # deps for training 25 | matplotlib 26 | pyworld==0.2.10 # > 0.2.10 is not p3.10.x compatible 27 | # coqui stack 28 | trainer 29 | # config management 30 | coqpit>=0.0.16 31 | # chinese g2p deps 32 | jieba 33 | pypinyin 34 | # japanese g2p deps 35 | mecab-python3==1.0.5 36 | unidic-lite==1.0.8 37 | # gruut+supported langs 38 | gruut[cs,de,es,fr,it,nl,pt,ru,sv]==2.2.3 39 | -------------------------------------------------------------------------------- /tests/text_tests/test_japanese_phonemizer.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from TTS.tts.utils.text.japanese.phonemizer import japanese_text_to_phonemes 4 | 5 | _TEST_CASES = """ 6 | どちらに行きますか?/dochiraniikimasuka? 7 | 今日は温泉に、行きます。/kyo:waoNseNni,ikimasu. 8 | 「A」から「Z」までです。/e:karazeqtomadedesu. 9 | そうですね!/so:desune! 10 | クジラは哺乳類です。/kujirawahonyu:ruidesu. 11 | ヴィディオを見ます。/bidioomimasu. 12 | 今日は8月22日です/kyo:wahachigatsuniju:ninichidesu 13 | xyzとαβγ/eqkusuwaizeqtotoarufabe:tagaNma 14 | 値段は$12.34です/nedaNwaju:niteNsaNyoNdorudesu 15 | """ 16 | 17 | 18 | class TestText(unittest.TestCase): 19 | def test_japanese_text_to_phonemes(self): 20 | for line in _TEST_CASES.strip().split("\n"): 21 | text, phone = line.split("/") 22 | self.assertEqual(japanese_text_to_phonemes(text), phone) 23 | 24 | 25 | if __name__ == "__main__": 26 | unittest.main() 27 | -------------------------------------------------------------------------------- /docs/source/installation.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | 🐸TTS supports Python >=3.7, <3.11.0 and is tested on Ubuntu 18.10, 19.10 and 20.10. 4 | 5 | ## Using `pip` 6 | 7 | `pip` is recommended if you want to use 🐸TTS only for inference. 
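Whichever of the methods below you use, a quick sanity check after installing is to import the package; `TTS.__version__` is the same attribute reported by `TTS/bin/collect_env_info.py`:

```python
import TTS

# a successful import plus a version string means the package is installed correctly
print(TTS.__version__)
```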
8 | 9 | You can install from PyPI as follows: 10 | 11 | ```bash 12 | pip install TTS # from PyPI 13 | ``` 14 | 15 | Or install from GitHub: 16 | 17 | ```bash 18 | pip install git+https://github.com/coqui-ai/TTS # from GitHub 19 | ``` 20 | 21 | ## Installing From Source 22 | 23 | This is recommended for development and more control over 🐸TTS. 24 | 25 | ```bash 26 | git clone https://github.com/coqui-ai/TTS/ 27 | cd TTS 28 | make system-deps # only on Linux systems. 29 | make install 30 | ``` 31 | 32 | ## On Windows 33 | If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](https://stackoverflow.com/questions/66726331/ -------------------------------------------------------------------------------- /docs/source/models/glow_tts.md: -------------------------------------------------------------------------------- 1 | # Glow TTS 2 | 3 | Glow TTS is a normalizing flow model for text-to-speech. It is built on the generic Glow model that was previously 4 | used in computer vision and vocoder models. It uses "monotonic alignment search" (MAS) to find the text-to-speech alignment 5 | and uses the output to train a separate duration predictor network for faster inference run-time. 6 | 7 | ## Important resources & papers 8 | - GlowTTS: https://arxiv.org/abs/2005.11129 9 | - Glow (Generative Flow with invertible 1x1 Convolutions): https://arxiv.org/abs/1807.03039 10 | - Normalizing Flows: https://blog.evjang.com/2018/01/nf1.html 11 | 12 | ## GlowTTS Config 13 | ```{eval-rst} 14 | .. autoclass:: TTS.tts.configs.glow_tts_config.GlowTTSConfig 15 | :members: 16 | ``` 17 | 18 | ## GlowTTS Model 19 | ```{eval-rst} 20 | .. autoclass:: TTS.tts.models.glow_tts.GlowTTS 21 | :members: 22 | ``` 23 | -------------------------------------------------------------------------------- /.github/PR_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | # Pull request guidelines 2 | 3 | Welcome to the 🐸TTS project! We are excited to see your interest, and appreciate your support! 4 | 5 | This repository is governed by the Contributor Covenant Code of Conduct. For more details, see the [CODE_OF_CONDUCT.md](CODE_OF_CONDUCT.md) file. 6 | 7 | In order to make a good pull request, please see our [CONTRIBUTING.md](CONTRIBUTING.md) file. 8 | 9 | Before accepting your pull request, you will be asked to sign a [Contributor License Agreement](https://cla-assistant.io/coqui-ai/TTS). 10 | 11 | This [Contributor License Agreement](https://cla-assistant.io/coqui-ai/TTS): 12 | 13 | - Protects you, Coqui, and the users of the code. 14 | - Does not change your rights to use your contributions for any purpose. 15 | - Does not change the license of the 🐸TTS project. It just makes the terms of your contribution clearer and lets us know you are OK to contribute. 16 | -------------------------------------------------------------------------------- /docs/source/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. 
Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /tests/data_tests/test_dataset_formatters.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | from tests import get_tests_input_path 5 | from TTS.tts.datasets.formatters import common_voice 6 | 7 | 8 | class TestTTSFormatters(unittest.TestCase): 9 | def test_common_voice_preprocessor(self):  # pylint: disable=no-self-use 10 | root_path = get_tests_input_path() 11 | meta_file = "common_voice.tsv" 12 | items = common_voice(root_path, meta_file) 13 | assert items[0]["text"] == "The applicants are invited for coffee and visa is given immediately." 14 | assert items[0]["audio_file"] == os.path.join(get_tests_input_path(), "clips", "common_voice_en_20005954.wav") 15 | 16 | assert items[-1]["text"] == "Competition for limited resources has also resulted in some local conflicts." 17 | assert items[-1]["audio_file"] == os.path.join(get_tests_input_path(), "clips", "common_voice_en_19737074.wav") 18 | -------------------------------------------------------------------------------- /recipes/thorsten_DE/README.md: -------------------------------------------------------------------------------- 1 | # 🐸💬 TTS Thorsten Recipes 2 | 3 | For running the recipes you need the [Thorsten-Voice](https://github.com/thorstenMueller/Thorsten-Voice) dataset. 4 | 5 | You can download it manually from [the official website](https://www.thorsten-voice.de/) or use ```download_thorsten_DE.sh```. Alternatively, running any of the **train_modelX.py** scripts will download the dataset if it is not already present. 6 | 7 | Then, go to your desired model folder and run the training. 8 | 9 | Running Python files. (Choose the desired GPU ID for your run and set ```CUDA_VISIBLE_DEVICES```) 10 | ```terminal 11 | CUDA_VISIBLE_DEVICES="0" python train_modelX.py 12 | ``` 13 | 14 | 💡 Note that these runs are just templates to help you start training your first model. They are not optimized for the best 15 | result. Double-check the configurations and feel free to share your experiments to find better parameters together 💪. 
16 | -------------------------------------------------------------------------------- /tests/vocoder_tests/test_vocoder_parallel_wavegan_generator.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | from TTS.vocoder.models.parallel_wavegan_generator import ParallelWaveganGenerator 5 | 6 | 7 | def test_pwgan_generator(): 8 | model = ParallelWaveganGenerator( 9 | in_channels=1, 10 | out_channels=1, 11 | kernel_size=3, 12 | num_res_blocks=30, 13 | stacks=3, 14 | res_channels=64, 15 | gate_channels=128, 16 | skip_channels=64, 17 | aux_channels=80, 18 | dropout=0.0, 19 | bias=True, 20 | use_weight_norm=True, 21 | upsample_factors=[4, 4, 4, 4], 22 | ) 23 | dummy_c = torch.rand((2, 80, 5)) 24 | output = model(dummy_c) 25 | assert np.all(output.shape == (2, 1, 5 * 256)), output.shape 26 | model.remove_weight_norm() 27 | output = model.inference(dummy_c) 28 | assert np.all(output.shape == (2, 1, (5 + 4) * 256)) 29 | -------------------------------------------------------------------------------- /TTS/tts/layers/align_tts/duration_predictor.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | from TTS.tts.layers.generic.pos_encoding import PositionalEncoding 4 | from TTS.tts.layers.generic.transformer import FFTransformerBlock 5 | 6 | 7 | class DurationPredictor(nn.Module): 8 | def __init__(self, num_chars, hidden_channels, hidden_channels_ffn, num_heads): 9 | super().__init__() 10 | self.embed = nn.Embedding(num_chars, hidden_channels) 11 | self.pos_enc = PositionalEncoding(hidden_channels, dropout_p=0.1) 12 | self.FFT = FFTransformerBlock(hidden_channels, num_heads, hidden_channels_ffn, 2, 0.1) 13 | self.out_layer = nn.Conv1d(hidden_channels, 1, 1) 14 | 15 | def forward(self, text, text_lengths): 16 | # [B, L] -> [B, L, C] 17 | emb = self.embed(text) 18 | emb = self.pos_enc(emb.transpose(1, 2)) 19 | x = self.FFT(emb, text_lengths) 20 | x = self.out_layer(x).squeeze(-1) 21 | return x 22 | -------------------------------------------------------------------------------- /docs/source/tts_datasets.md: -------------------------------------------------------------------------------- 1 | # TTS Datasets 2 | 3 | Some of the known public datasets to which we have successfully applied 🐸TTS: 4 | 5 | - [English - LJ Speech](https://keithito.com/LJ-Speech-Dataset/) 6 | - [English - Nancy](http://www.cstr.ed.ac.uk/projects/blizzard/2011/lessac_blizzard2011/) 7 | - [English - TWEB](https://www.kaggle.com/bryanpark/the-world-english-bible-speech-dataset) 8 | - [English - LibriTTS](https://openslr.org/60/) 9 | - [English - VCTK](https://datashare.ed.ac.uk/handle/10283/2950) 10 | - [Multilingual - M-AI-Labs](http://www.caito.de/2019/01/the-m-ailabs-speech-dataset/) 11 | - [Spanish](https://drive.google.com/file/d/1Sm_zyBo67XHkiFhcRSQ4YaHPYM0slO_e/view?usp=sharing) - thx! @carlfm01 12 | - [German - Thorsten OGVD](https://github.com/thorstenMueller/deep-learning-german-tts) 13 | - [Japanese - Kokoro](https://www.kaggle.com/kaiida/kokoro-speech-dataset-v11-small/version/1) 14 | - [Chinese](https://www.data-baker.com/data/index/source/) 15 | 16 | Let us know if you use 🐸TTS on a different dataset. 
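Each of these datasets is consumed through a formatter that turns its metadata file into training samples. A minimal sketch using the `common_voice` formatter, whose call signature is exercised by `tests/data_tests/test_dataset_formatters.py` (the root path and meta file name below are placeholders):

```python
from TTS.tts.datasets.formatters import common_voice

# each returned item is a dict carrying at least "text" and "audio_file" keys
items = common_voice("/path/to/common_voice", "validated.tsv")
print(items[0]["text"])
print(items[0]["audio_file"])
```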
-------------------------------------------------------------------------------- /recipes/README.md: -------------------------------------------------------------------------------- 1 | # 🐸💬 TTS Training Recipes 2 | 3 | TTS recipes are intended to host scripts running all the necessary steps to train a TTS model on a particular dataset. 4 | 5 | For each dataset, you need to download the dataset once. Then you run the training for the model you want. 6 | 7 | Run each script from the root TTS folder as follows. 8 | 9 | ```console 10 | $ sh ./recipes/<dataset>/download_<dataset>.sh 11 | $ python recipes/<dataset>/<model_name>/train.py 12 | ``` 13 | 14 | For some datasets you might need to resample the audio files. For example, the VCTK dataset can be resampled to 22050Hz as follows. 15 | 16 | ```console 17 | python TTS/bin/resample.py --input_dir recipes/vctk/VCTK/wav48_silence_trimmed --output_sr 22050 --output_dir recipes/vctk/VCTK/wav48_silence_trimmed --n_jobs 8 --file_ext flac 18 | ``` 19 | 20 | If you train a new model using TTS, feel free to share your training to expand the list of recipes. 21 | 22 | You can also open a new discussion and share your progress with the 🐸 community. -------------------------------------------------------------------------------- /tests/vocoder_tests/test_vocoder_melgan_discriminator.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | from TTS.vocoder.models.melgan_discriminator import MelganDiscriminator 5 | from TTS.vocoder.models.melgan_multiscale_discriminator import MelganMultiscaleDiscriminator 6 | 7 | 8 | def test_melgan_discriminator(): 9 | model = MelganDiscriminator() 10 | print(model) 11 | dummy_input = torch.rand((4, 1, 256 * 10)) 12 | output, _ = model(dummy_input) 13 | assert np.all(output.shape == (4, 1, 10)) 14 | 15 | 16 | def test_melgan_multi_scale_discriminator(): 17 | model = MelganMultiscaleDiscriminator() 18 | print(model) 19 | dummy_input = torch.rand((4, 1, 256 * 16)) 20 | scores, feats = model(dummy_input) 21 | assert len(scores) == 3 22 | assert len(scores) == len(feats) 23 | assert np.all(scores[0].shape == (4, 1, 64)) 24 | assert np.all(feats[0][0].shape == (4, 16, 4096)) 25 | assert np.all(feats[0][1].shape == (4, 64, 1024)) 26 | assert np.all(feats[0][2].shape == (4, 256, 256)) 27 | -------------------------------------------------------------------------------- /TTS/tts/layers/align_tts/mdn.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | 4 | class MDNBlock(nn.Module): 5 | """Mixture of Density Network implementation 6 | https://arxiv.org/pdf/2003.01950.pdf 7 | """ 8 | 9 | def __init__(self, in_channels, out_channels): 10 | super().__init__() 11 | self.out_channels = out_channels 12 | self.conv1 = nn.Conv1d(in_channels, in_channels, 1) 13 | self.norm = nn.LayerNorm(in_channels) 14 | self.relu = nn.ReLU() 15 | self.dropout = nn.Dropout(0.1) 16 | self.conv2 = nn.Conv1d(in_channels, out_channels, 1) 17 | 18 | def forward(self, x): 19 | o = self.conv1(x) 20 | o = o.transpose(1, 2) 21 | o = self.norm(o) 22 | o = o.transpose(1, 2) 23 | o = self.relu(o) 24 | o = self.dropout(o) 25 | mu_sigma = self.conv2(o) 26 | # TODO: check this sigmoid 27 | # mu = torch.sigmoid(mu_sigma[:, :self.out_channels//2, :]) 28 | mu = mu_sigma[:, : self.out_channels // 2, :] 29 | log_sigma = mu_sigma[:, self.out_channels // 2 :, :] 30 | return mu, log_sigma 31 | -------------------------------------------------------------------------------- 
/tests/inference_tests/test_synthesize.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from tests import get_tests_output_path, run_cli 4 | 5 | 6 | def test_synthesize(): 7 | """Test synthesize.py with different arguments.""" 8 | output_path = os.path.join(get_tests_output_path(), "output.wav") 9 | run_cli("tts --list_models") 10 | 11 | # single speaker model 12 | run_cli(f'tts --text "This is an example." --out_path "{output_path}"') 13 | run_cli( 14 | "tts --model_name tts_models/en/ljspeech/glow-tts " f'--text "This is an example." --out_path "{output_path}"' 15 | ) 16 | run_cli( 17 | "tts --model_name tts_models/en/ljspeech/glow-tts " 18 | "--vocoder_name vocoder_models/en/ljspeech/multiband-melgan " 19 | f'--text "This is an example." --out_path "{output_path}"' 20 | ) 21 | 22 | # multi-speaker SC-Glow model 23 | # run_cli("tts --model_name tts_models/en/vctk/sc-glow-tts --list_speaker_idxs") 24 | # run_cli( 25 | # f'tts --model_name tts_models/en/vctk/sc-glow-tts --speaker_idx "p304" ' 26 | # f'--text "This is an example." --out_path "{output_path}"' 27 | # ) 28 | -------------------------------------------------------------------------------- /TTS/server/README.md: -------------------------------------------------------------------------------- 1 | # :frog: TTS demo server 2 | Before you use the server, make sure you [install](https://github.com/coqui-ai/TTS/tree/dev#install-tts) :frog: TTS properly. Then, you can follow the steps below. 3 | 4 | **Note:** If you install :frog: TTS using ```pip```, you can also use the ```tts-server``` endpoint on the terminal. 5 | 6 | Example runs: 7 | 8 | List officially released models. 9 | ```python TTS/server/server.py --list_models ``` 10 | 11 | Run the server with the official models. 12 | ```python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/multiband-melgan``` 13 | 14 | Run the server with the official models on a GPU. 15 | ```CUDA_VISIBLE_DEVICES="0" python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/multiband-melgan --use_cuda True``` 16 | 17 | Run the server with custom models. 
18 | ```python TTS/server/server.py --tts_checkpoint /path/to/tts/model.pth --tts_config /path/to/tts/config.json --vocoder_checkpoint /path/to/vocoder/model.pth --vocoder_config /path/to/vocoder/config.json``` 19 | -------------------------------------------------------------------------------- /TTS/vocoder/models/fullband_melgan_generator.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from TTS.vocoder.models.melgan_generator import MelganGenerator 4 | 5 | 6 | class FullbandMelganGenerator(MelganGenerator): 7 | def __init__( 8 | self, 9 | in_channels=80, 10 | out_channels=1, 11 | proj_kernel=7, 12 | base_channels=512, 13 | upsample_factors=(2, 8, 2, 2), 14 | res_kernel=3, 15 | num_res_blocks=4, 16 | ): 17 | super().__init__( 18 | in_channels=in_channels, 19 | out_channels=out_channels, 20 | proj_kernel=proj_kernel, 21 | base_channels=base_channels, 22 | upsample_factors=upsample_factors, 23 | res_kernel=res_kernel, 24 | num_res_blocks=num_res_blocks, 25 | ) 26 | 27 | @torch.no_grad() 28 | def inference(self, cond_features): 29 | cond_features = cond_features.to(self.layers[1].weight.device) 30 | cond_features = torch.nn.functional.pad( 31 | cond_features, (self.inference_padding, self.inference_padding), "replicate" 32 | ) 33 | return self.layers(cond_features) 34 | -------------------------------------------------------------------------------- /recipes/blizzard2013/README.md: -------------------------------------------------------------------------------- 1 | # How to get the Blizzard 2013 Dataset 2 | 3 | The Capacitron model is a variational encoder extension of standard Tacotron based models to model prosody. 4 | 5 | To take full advantage of the model, it is advised to train the model with a dataset that contains a significant amount of prosodic information in the utterances. A tested candidate for such applications is the blizzard2013 dataset from the Blizzard Challenge, containing many hours of high quality audio book recordings. 6 | 7 | To get a license and download link for this dataset, you need to visit the [website](https://www.cstr.ed.ac.uk/projects/blizzard/2013/lessac_blizzard2013/license.html) of the Centre for Speech Technology Research of the University of Edinburgh. 8 | 9 | You get access to the raw dataset in a couple of days. There are a few preprocessing steps you need to do to be able to use the high fidelity dataset. 10 | 11 | 1. Get the forced time alignments for the blizzard dataset from [here](https://github.com/mueller91/tts_alignments). 12 | 2. Segment the high fidelity audio-book files based on the instructions [here](https://github.com/Tomiinek/Blizzard2013_Segmentation). -------------------------------------------------------------------------------- /docs/source/index.md: -------------------------------------------------------------------------------- 1 | 2 | ```{include} ../../README.md 3 | :relative-images: 4 | ``` 5 | ---- 6 | 7 | # Documentation Content 8 | ```{eval-rst} 9 | .. toctree:: 10 | :maxdepth: 2 11 | :caption: Get started 12 | 13 | tutorial_for_nervous_beginners 14 | installation 15 | faq 16 | contributing 17 | 18 | .. toctree:: 19 | :maxdepth: 2 20 | :caption: Using 🐸TTS 21 | 22 | inference 23 | implementing_a_new_model 24 | training_a_model 25 | finetuning 26 | configuration 27 | formatting_your_dataset 28 | what_makes_a_good_dataset 29 | tts_datasets 30 | 31 | .. 
toctree:: 32 | :maxdepth: 2 33 | :caption: Main Classes 34 | 35 | main_classes/trainer_api 36 | main_classes/audio_processor 37 | main_classes/model_api 38 | main_classes/dataset 39 | main_classes/gan 40 | main_classes/speaker_manager 41 | 42 | .. toctree:: 43 | :maxdepth: 2 44 | :caption: `tts` Models 45 | 46 | models/glow_tts.md 47 | models/vits.md 48 | models/forward_tts.md 49 | models/tacotron1-2.md 50 | 51 | .. toctree:: 52 | :maxdepth: 2 53 | :caption: `vocoder` Models 54 | 55 | ``` 56 | 57 | -------------------------------------------------------------------------------- /TTS/bin/collect_env_info.py: -------------------------------------------------------------------------------- 1 | """Get detailed info about the working environment.""" 2 | import os 3 | import platform 4 | import sys 5 | 6 | import numpy 7 | import torch 8 | 9 | sys.path += [os.path.abspath(".."), os.path.abspath(".")] 10 | import json 11 | 12 | import TTS 13 | 14 | 15 | def system_info(): 16 | return { 17 | "OS": platform.system(), 18 | "architecture": platform.architecture(), 19 | "version": platform.version(), 20 | "processor": platform.processor(), 21 | "python": platform.python_version(), 22 | } 23 | 24 | 25 | def cuda_info(): 26 | return { 27 | "GPU": [torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())], 28 | "available": torch.cuda.is_available(), 29 | "version": torch.version.cuda, 30 | } 31 | 32 | 33 | def package_info(): 34 | return { 35 | "numpy": numpy.__version__, 36 | "PyTorch_version": torch.__version__, 37 | "PyTorch_debug": torch.version.debug, 38 | "TTS": TTS.__version__, 39 | } 40 | 41 | 42 | def main(): 43 | details = {"System": system_info(), "CUDA": cuda_info(), "Packages": package_info()} 44 | print(json.dumps(details, indent=4, sort_keys=True)) 45 | 46 | 47 | if __name__ == "__main__": 48 | main() 49 | -------------------------------------------------------------------------------- /tests/inputs/common_voice.tsv: -------------------------------------------------------------------------------- 1 | client_id path sentence up_votes down_votes age gender accent locale segment 2 | 95324d489b122a800b840e0b0d068f7363a1a6c2cd2e7365672cc7033e38deaa794bd59edcf8196aa35c9791652b9085ac3839a98bb50ebab4a1e8538a94846b common_voice_en_20005954.mp3 The applicants are invited for coffee and visa is given immediately. 3 0 en 3 | 95324d489b122a800b840e0b0d068f7363a1a6c2cd2e7365672cc7033e38deaa794bd59edcf8196aa35c9791652b9085ac3839a98bb50ebab4a1e8538a94846b common_voice_en_20005955.mp3 Developmental robotics is related to, but differs from, evolutionary robotics. 2 0 en 4 | 95324d489b122a800b840e0b0d068f7363a1a6c2cd2e7365672cc7033e38deaa794bd59edcf8196aa35c9791652b9085ac3839a98bb50ebab4a1e8538a94846b common_voice_en_20005956.mp3 The musical was originally directed and choreographed by Alan Lund. 2 0 en 5 | 954a4181ae9fba89d1b1570f2ae148b3ee18ee2311de978e698f598db859f830d93d35574596d713518e8c96cdae01fce7a08c60c2e0a22bcf01e020924440a6 common_voice_en_19737073.mp3 He graduated from Columbia High School, in Brown County, South Dakota. 2 0 en 6 | 954a4181ae9fba89d1b1570f2ae148b3ee18ee2311de978e698f598db859f830d93d35574596d713518e8c96cdae01fce7a08c60c2e0a22bcf01e020924440a6 common_voice_en_19737074.mp3 Competition for limited resources has also resulted in some local conflicts. 
2 0 en 7 | -------------------------------------------------------------------------------- /TTS/tts/layers/feed_forward/duration_predictor.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | from TTS.tts.layers.generic.res_conv_bn import Conv1dBN 4 | 5 | 6 | class DurationPredictor(nn.Module): 7 | """Speedy Speech duration predictor model. 8 | Predicts phoneme durations from encoder outputs. 9 | 10 | Note: 11 | Outputs are interpreted as log(durations). 12 | To get actual durations, apply an exp transformation. 13 | 14 | conv_BN_4x1 -> conv_BN_3x1 -> conv_BN_1x1 -> conv_1x1 15 | 16 | Args: 17 | hidden_channels (int): number of channels in the inner layers. 18 | """ 19 | 20 | def __init__(self, hidden_channels): 21 | 22 | super().__init__() 23 | 24 | self.layers = nn.ModuleList( 25 | [ 26 | Conv1dBN(hidden_channels, hidden_channels, 4, 1), 27 | Conv1dBN(hidden_channels, hidden_channels, 3, 1), 28 | Conv1dBN(hidden_channels, hidden_channels, 1, 1), 29 | nn.Conv1d(hidden_channels, 1, 1), 30 | ] 31 | ) 32 | 33 | def forward(self, x, x_mask): 34 | """ 35 | Shapes: 36 | x: [B, C, T] 37 | x_mask: [B, 1, T] 38 | """ 39 | o = x 40 | for layer in self.layers: 41 | o = layer(o) * x_mask 42 | return o 43 | -------------------------------------------------------------------------------- /TTS/tts/utils/text/chinese_mandarin/phonemizer.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import jieba 4 | import pypinyin 5 | 6 | from .pinyinToPhonemes import PINYIN_DICT 7 | 8 | 9 | def _chinese_character_to_pinyin(text: str) -> List[str]: 10 | pinyins = pypinyin.pinyin(text, style=pypinyin.Style.TONE3, heteronym=False, neutral_tone_with_five=True) 11 | pinyins_flat_list = [item for sublist in pinyins for item in sublist] 12 | return pinyins_flat_list 13 | 14 | 15 | def _chinese_pinyin_to_phoneme(pinyin: str) -> str: 16 | segment = pinyin[:-1] 17 | tone = pinyin[-1] 18 | phoneme = PINYIN_DICT.get(segment, [""])[0] 19 | return phoneme + tone 20 | 21 | 22 | def chinese_text_to_phonemes(text: str, seperator: str = "|") -> str: 23 | tokenized_text = jieba.cut(text, HMM=False) 24 | tokenized_text = " ".join(tokenized_text) 25 | pinyined_text: List[str] = _chinese_character_to_pinyin(tokenized_text) 26 | 27 | results: List[str] = [] 28 | 29 | for token in pinyined_text: 30 | if token[-1] in "12345": # TODO transform to is_pinyin() 31 | pinyin_phonemes = _chinese_pinyin_to_phoneme(token) 32 | 33 | results += list(pinyin_phonemes) 34 | else: # is punctuation or other 35 | results += list(token) 36 | 37 | return seperator.join(results) 38 | -------------------------------------------------------------------------------- /recipes/ljspeech/univnet/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from trainer import Trainer, TrainerArgs 4 | 5 | from TTS.utils.audio import AudioProcessor 6 | from TTS.vocoder.configs import UnivnetConfig 7 | from TTS.vocoder.datasets.preprocess import load_wav_data 8 | from TTS.vocoder.models.gan import GAN 9 | 10 | output_path = os.path.dirname(os.path.abspath(__file__)) 11 | config = UnivnetConfig( 12 | batch_size=64, 13 | eval_batch_size=16, 14 | num_loader_workers=4, 15 | num_eval_loader_workers=4, 16 | run_eval=True, 17 | test_delay_epochs=-1, 18 | epochs=1000, 19 | seq_len=8192, 20 | pad_short=2000, 21 | use_noise_augment=True, 22 | eval_split_size=10, 23 | print_step=25, 24 | print_eval=False, 25 
| mixed_precision=False, 26 | lr_gen=1e-4, 27 | lr_disc=1e-4, 28 | data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"), 29 | output_path=output_path, 30 | ) 31 | 32 | # init audio processor 33 | ap = AudioProcessor(**config.audio.to_dict()) 34 | 35 | # load training samples 36 | eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size) 37 | 38 | # init model 39 | model = GAN(config, ap) 40 | 41 | # init the trainer and 🚀 42 | trainer = Trainer( 43 | TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples 44 | ) 45 | trainer.fit() 46 | -------------------------------------------------------------------------------- /recipes/ljspeech/hifigan/train_hifigan.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from trainer import Trainer, TrainerArgs 4 | 5 | from TTS.utils.audio import AudioProcessor 6 | from TTS.vocoder.configs import HifiganConfig 7 | from TTS.vocoder.datasets.preprocess import load_wav_data 8 | from TTS.vocoder.models.gan import GAN 9 | 10 | output_path = os.path.dirname(os.path.abspath(__file__)) 11 | 12 | config = HifiganConfig( 13 | batch_size=32, 14 | eval_batch_size=16, 15 | num_loader_workers=4, 16 | num_eval_loader_workers=4, 17 | run_eval=True, 18 | test_delay_epochs=5, 19 | epochs=1000, 20 | seq_len=8192, 21 | pad_short=2000, 22 | use_noise_augment=True, 23 | eval_split_size=10, 24 | print_step=25, 25 | print_eval=False, 26 | mixed_precision=False, 27 | lr_gen=1e-4, 28 | lr_disc=1e-4, 29 | data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"), 30 | output_path=output_path, 31 | ) 32 | 33 | # init audio processor 34 | ap = AudioProcessor(**config.audio.to_dict()) 35 | 36 | # load training samples 37 | eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size) 38 | 39 | # init model 40 | model = GAN(config, ap) 41 | 42 | # init the trainer and 🚀 43 | trainer = Trainer( 44 | TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples 45 | ) 46 | trainer.fit() 47 | -------------------------------------------------------------------------------- /TTS/encoder/README.md: -------------------------------------------------------------------------------- 1 | ### Speaker Encoder 2 | 3 | This is an implementation of https://arxiv.org/abs/1710.10467. This model can be used for voice and speaker embedding. 4 | 5 | With the code here you can generate d-vectors for both multi-speaker and single-speaker TTS datasets, then visualise and explore them along with the associated audio files in an interactive chart. 6 | 7 | Below is an example showing embedding results of various speakers. You can generate the same plot with the provided notebook as demonstrated in [this video](https://youtu.be/KW3oO7JVa7Q). 8 | 9 | ![](umap.png) 10 | 11 | Download a pretrained model from the [Released Models](https://github.com/mozilla/TTS/wiki/Released-Models) page. 12 | 13 | To run the code, you need to follow the same flow as in TTS. 14 | 15 | - Define 'config.json' for your needs. Note that audio parameters should match your TTS model. 16 | - Example training call ```python speaker_encoder/train.py --config_path speaker_encoder/config.json --data_path ~/Data/Libri-TTS/train-clean-360``` 17 | - Generate embedding vectors ```python speaker_encoder/compute_embeddings.py --use_cuda true /model/path/best_model.pth model/config/path/config.json dataset/path/ output_path``` . 
This code parses all .wav files at the given dataset path and generates the same folder structure under the output path with the generated embedding files. 18 | - Watch training on Tensorboard as in TTS 19 | -------------------------------------------------------------------------------- /TTS/tts/utils/text/english/time_norm.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import inflect 4 | 5 | _inflect = inflect.engine() 6 | 7 | _time_re = re.compile( 8 | r"""\b 9 | ((0?[0-9])|(1[0-1])|(1[2-9])|(2[0-3])) # hours 10 | : 11 | ([0-5][0-9]) # minutes 12 | \s*(a\.m\.|am|pm|p\.m\.|a\.m|p\.m)? # am/pm 13 | \b""", 14 | re.IGNORECASE | re.X, 15 | ) 16 | 17 | 18 | def _expand_num(n: int) -> str: 19 | return _inflect.number_to_words(n) 20 | 21 | 22 | def _expand_time_english(match: "re.Match") -> str: 23 | hour = int(match.group(1)) 24 | past_noon = hour >= 12 25 | time = [] 26 | if hour > 12: 27 | hour -= 12 28 | elif hour == 0: 29 | hour = 12 30 | past_noon = True 31 | time.append(_expand_num(hour)) 32 | 33 | minute = int(match.group(6)) 34 | if minute > 0: 35 | if minute < 10: 36 | time.append("oh") 37 | time.append(_expand_num(minute)) 38 | am_pm = match.group(7) 39 | if am_pm is None: 40 | time.append("p m" if past_noon else "a m") 41 | else: 42 | time.extend(list(am_pm.replace(".", ""))) 43 | return " ".join(time) 44 | 45 | 46 | def expand_time_english(text: str) -> str: 47 | return re.sub(_time_re, _expand_time_english, text) 48 | -------------------------------------------------------------------------------- /recipes/ljspeech/multiband_melgan/train_multiband_melgan.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from trainer import Trainer, TrainerArgs 4 | 5 | from TTS.utils.audio import AudioProcessor 6 | from TTS.vocoder.configs import MultibandMelganConfig 7 | from TTS.vocoder.datasets.preprocess import load_wav_data 8 | from TTS.vocoder.models.gan import GAN 9 | 10 | output_path = os.path.dirname(os.path.abspath(__file__)) 11 | 12 | config = MultibandMelganConfig( 13 | batch_size=32, 14 | eval_batch_size=16, 15 | num_loader_workers=4, 16 | num_eval_loader_workers=4, 17 | run_eval=True, 18 | test_delay_epochs=5, 19 | epochs=1000, 20 | seq_len=8192, 21 | pad_short=2000, 22 | use_noise_augment=True, 23 | eval_split_size=10, 24 | print_step=25, 25 | print_eval=False, 26 | mixed_precision=False, 27 | lr_gen=1e-4, 28 | lr_disc=1e-4, 29 | data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"), 30 | output_path=output_path, 31 | ) 32 | 33 | # init audio processor 34 | ap = AudioProcessor(**config.audio.to_dict()) 35 | 36 | # load training samples 37 | eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size) 38 | 39 | # init model 40 | model = GAN(config, ap) 41 | 42 | # init the trainer and 🚀 43 | trainer = Trainer( 44 | TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples 45 | ) 46 | trainer.fit() 47 | -------------------------------------------------------------------------------- /recipes/ljspeech/wavegrad/train_wavegrad.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from trainer import Trainer, TrainerArgs 4 | 5 | from TTS.utils.audio import AudioProcessor 6 | from TTS.vocoder.configs import WavegradConfig 7 | from TTS.vocoder.datasets.preprocess import load_wav_data 8 | from TTS.vocoder.models.wavegrad import Wavegrad 9 | 10 
| output_path = os.path.dirname(os.path.abspath(__file__)) 11 | config = WavegradConfig( 12 | batch_size=32, 13 | eval_batch_size=16, 14 | num_loader_workers=4, 15 | num_eval_loader_workers=4, 16 | run_eval=True, 17 | test_delay_epochs=-1, 18 | epochs=1000, 19 | seq_len=6144, 20 | pad_short=2000, 21 | use_noise_augment=True, 22 | eval_split_size=50, 23 | print_step=50, 24 | print_eval=True, 25 | mixed_precision=False, 26 | data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"), 27 | output_path=output_path, 28 | ) 29 | 30 | # init audio processor 31 | ap = AudioProcessor(**config.audio.to_dict()) 32 | 33 | # load training samples 34 | eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size) 35 | 36 | # init model 37 | model = Wavegrad(config) 38 | 39 | # init the trainer and 🚀 40 | trainer = Trainer( 41 | TrainerArgs(), 42 | config, 43 | output_path, 44 | model=model, 45 | train_samples=train_samples, 46 | eval_samples=eval_samples, 47 | training_assets={"audio_processor": ap}, 48 | ) 49 | trainer.fit() 50 | -------------------------------------------------------------------------------- /TTS/tts/utils/monotonic_align/core.pyx: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | cimport cython 4 | cimport numpy as np 5 | 6 | from cython.parallel import prange 7 | 8 | 9 | @cython.boundscheck(False) 10 | @cython.wraparound(False) 11 | cdef void maximum_path_each(int[:,::1] path, float[:,::1] value, int t_x, int t_y, float max_neg_val) nogil: 12 | cdef int x 13 | cdef int y 14 | cdef float v_prev 15 | cdef float v_cur 16 | cdef float tmp 17 | cdef int index = t_x - 1 18 | 19 | for y in range(t_y): 20 | for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)): 21 | if x == y: 22 | v_cur = max_neg_val 23 | else: 24 | v_cur = value[x, y-1] 25 | if x == 0: 26 | if y == 0: 27 | v_prev = 0. 
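# only the origin (x == 0, y == 0) is a valid path start; elsewhere on the first token there is no diagonal predecessor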
28 | else: 29 | v_prev = max_neg_val 30 | else: 31 | v_prev = value[x-1, y-1] 32 | value[x, y] = max(v_cur, v_prev) + value[x, y] 33 | 34 | for y in range(t_y - 1, -1, -1): 35 | path[index, y] = 1 36 | if index != 0 and (index == y or value[index, y-1] < value[index-1, y-1]): 37 | index = index - 1 38 | 39 | 40 | @cython.boundscheck(False) 41 | @cython.wraparound(False) 42 | cpdef void maximum_path_c(int[:,:,::1] paths, float[:,:,::1] values, int[::1] t_xs, int[::1] t_ys, float max_neg_val=-1e9) nogil: 43 | cdef int b = values.shape[0] 44 | 45 | cdef int i 46 | for i in prange(b, nogil=True): 47 | maximum_path_each(paths[i], values[i], t_xs[i], t_ys[i], max_neg_val) 48 | -------------------------------------------------------------------------------- /tests/vocoder_tests/test_hifigan_train.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import shutil 4 | 5 | from tests import get_device_id, get_tests_output_path, run_cli 6 | from TTS.vocoder.configs import HifiganConfig 7 | 8 | config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") 9 | output_path = os.path.join(get_tests_output_path(), "train_outputs") 10 | 11 | 12 | config = HifiganConfig( 13 | batch_size=8, 14 | eval_batch_size=8, 15 | num_loader_workers=0, 16 | num_eval_loader_workers=0, 17 | run_eval=True, 18 | test_delay_epochs=-1, 19 | epochs=1, 20 | seq_len=1024, 21 | eval_split_size=1, 22 | print_step=1, 23 | print_eval=True, 24 | data_path="tests/data/ljspeech", 25 | output_path=output_path, 26 | ) 27 | config.audio.do_trim_silence = True 28 | config.audio.trim_db = 60 29 | config.save_json(config_path) 30 | 31 | # train the model for one epoch 32 | command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " 33 | run_cli(command_train) 34 | 35 | # Find latest folder 36 | continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) 37 | 38 | # restore the model and continue training for one more epoch 39 | command_train = ( 40 | f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " 41 | ) 42 | run_cli(command_train) 43 | shutil.rmtree(continue_path) 44 | -------------------------------------------------------------------------------- /TTS/tts/layers/generic/gated_conv.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | from .normalization import LayerNorm 4 | 5 | 6 | class GatedConvBlock(nn.Module): 7 | """Gated convolutional block as in https://arxiv.org/pdf/1612.08083.pdf 8 | Args: 9 | in_out_channels (int): number of input/output channels. 10 | kernel_size (int): convolution kernel size. 11 | dropout_p (float): dropout rate. 
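num_layers (int): number of stacked gated convolutional blocks.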
12 | """ 13 | 14 | def __init__(self, in_out_channels, kernel_size, dropout_p, num_layers): 15 | super().__init__() 16 | # class arguments 17 | self.dropout_p = dropout_p 18 | self.num_layers = num_layers 19 | # define layers 20 | self.conv_layers = nn.ModuleList() 21 | self.norm_layers = nn.ModuleList() 22 | self.layers = nn.ModuleList() 23 | for _ in range(num_layers): 24 | self.conv_layers += [nn.Conv1d(in_out_channels, 2 * in_out_channels, kernel_size, padding=kernel_size // 2)] 25 | self.norm_layers += [LayerNorm(2 * in_out_channels)] 26 | 27 | def forward(self, x, x_mask): 28 | o = x 29 | res = x 30 | for idx in range(self.num_layers): 31 | o = nn.functional.dropout(o, p=self.dropout_p, training=self.training) 32 | o = self.conv_layers[idx](o * x_mask) 33 | o = self.norm_layers[idx](o) 34 | o = nn.functional.glu(o, dim=1) 35 | o = res + o 36 | res = o 37 | return o 38 | -------------------------------------------------------------------------------- /tests/vocoder_tests/test_parallel_wavegan_train.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import shutil 4 | 5 | from tests import get_device_id, get_tests_output_path, run_cli 6 | from TTS.vocoder.configs import ParallelWaveganConfig 7 | 8 | config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") 9 | output_path = os.path.join(get_tests_output_path(), "train_outputs") 10 | 11 | config = ParallelWaveganConfig( 12 | batch_size=4, 13 | eval_batch_size=4, 14 | num_loader_workers=0, 15 | num_eval_loader_workers=0, 16 | run_eval=True, 17 | test_delay_epochs=-1, 18 | epochs=1, 19 | seq_len=2048, 20 | eval_split_size=1, 21 | print_step=1, 22 | print_eval=True, 23 | data_path="tests/data/ljspeech", 24 | output_path=output_path, 25 | ) 26 | config.audio.do_trim_silence = True 27 | config.audio.trim_db = 60 28 | config.save_json(config_path) 29 | 30 | # train the model for one epoch 31 | command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " 32 | run_cli(command_train) 33 | 34 | # Find latest folder 35 | continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) 36 | 37 | # restore the model and continue training for one more epoch 38 | command_train = ( 39 | f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " 40 | ) 41 | run_cli(command_train) 42 | shutil.rmtree(continue_path) 43 | -------------------------------------------------------------------------------- /TTS/vocoder/models/multiband_melgan_generator.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from TTS.vocoder.layers.pqmf import PQMF 4 | from TTS.vocoder.models.melgan_generator import MelganGenerator 5 | 6 | 7 | class MultibandMelganGenerator(MelganGenerator): 8 | def __init__( 9 | self, 10 | in_channels=80, 11 | out_channels=4, 12 | proj_kernel=7, 13 | base_channels=384, 14 | upsample_factors=(2, 8, 2, 2), 15 | res_kernel=3, 16 | num_res_blocks=3, 17 | ): 18 | super().__init__( 19 | in_channels=in_channels, 20 | out_channels=out_channels, 21 | proj_kernel=proj_kernel, 22 | base_channels=base_channels, 23 | upsample_factors=upsample_factors, 24 | res_kernel=res_kernel, 25 | num_res_blocks=num_res_blocks, 26 | ) 27 | self.pqmf_layer = PQMF(N=4, taps=62, cutoff=0.15, beta=9.0) 28 | 29 | def pqmf_analysis(self, x): 30 | return self.pqmf_layer.analysis(x) 31 | 32 | def 
pqmf_synthesis(self, x): 33 | return self.pqmf_layer.synthesis(x) 34 | 35 | @torch.no_grad() 36 | def inference(self, cond_features): 37 | cond_features = cond_features.to(self.layers[1].weight.device) 38 | cond_features = torch.nn.functional.pad( 39 | cond_features, (self.inference_padding, self.inference_padding), "replicate" 40 | ) 41 | return self.pqmf_synthesis(self.layers(cond_features)) 42 | -------------------------------------------------------------------------------- /recipes/kokoro/tacotron2-DDC/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # take the script's parent directory to prefix all the output paths. 3 | RUN_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" 4 | CORPUS=kokoro-speech-v1_1-small 5 | echo $RUN_DIR 6 | if [ \! -d $RUN_DIR/$CORPUS ] ; then 7 | echo "$RUN_DIR/$CORPUS doesn't exist." 8 | echo "Follow the instructions at https://github.com/kaiidams/Kokoro-Speech-Dataset to build the corpus." 9 | exit 1 10 | fi 11 | # create train-val splits 12 | shuf $RUN_DIR/$CORPUS/metadata.csv > $RUN_DIR/$CORPUS/metadata_shuf.csv 13 | head -n 8000 $RUN_DIR/$CORPUS/metadata_shuf.csv > $RUN_DIR/$CORPUS/metadata_train.csv 14 | tail -n 812 $RUN_DIR/$CORPUS/metadata_shuf.csv > $RUN_DIR/$CORPUS/metadata_val.csv 15 | # compute dataset mean and variance for normalization 16 | python TTS/bin/compute_statistics.py $RUN_DIR/tacotron2-DDC.json $RUN_DIR/scale_stats.npy --data_path $RUN_DIR/$CORPUS/wavs/ 17 | # training .... 18 | # change the GPU id if needed 19 | CUDA_VISIBLE_DEVICES="0" python TTS/bin/train_tts.py --config_path $RUN_DIR/tacotron2-DDC.json \ 20 | --coqpit.output_path $RUN_DIR \ 21 | --coqpit.datasets.0.path $RUN_DIR/$CORPUS \ 22 | --coqpit.audio.stats_path $RUN_DIR/scale_stats.npy \ 23 | --coqpit.phoneme_cache_path $RUN_DIR/phoneme_cache \ -------------------------------------------------------------------------------- /recipes/ljspeech/wavernn/train_wavernn.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from trainer import Trainer, TrainerArgs 4 | 5 | from TTS.utils.audio import AudioProcessor 6 | from TTS.vocoder.configs import WavernnConfig 7 | from TTS.vocoder.datasets.preprocess import load_wav_data 8 | from TTS.vocoder.models.wavernn import Wavernn 9 | 10 | output_path = os.path.dirname(os.path.abspath(__file__)) 11 | config = WavernnConfig( 12 | batch_size=64, 13 | eval_batch_size=16, 14 | num_loader_workers=4, 15 | num_eval_loader_workers=4, 16 | run_eval=True, 17 | test_delay_epochs=-1, 18 | epochs=10000, 19 | seq_len=1280, 20 | pad_short=2000, 21 | use_noise_augment=False, 22 | eval_split_size=10, 23 | print_step=25, 24 | print_eval=True, 25 | mixed_precision=False, 26 | lr=1e-4, 27 | grad_clip=4, 28 | data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"), 29 | output_path=output_path, 30 | ) 31 | 32 | # init audio processor 33 | ap = AudioProcessor(**config.audio.to_dict()) 34 | 35 | # load training samples 36 | eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size) 37 | 38 | # init model 39 | model = Wavernn(config) 40 | 41 | # init the trainer and 🚀 42 | trainer = Trainer( 43 | TrainerArgs(), 44 | config, 45 | output_path, 46 | model=model, 47 | train_samples=train_samples, 48 | eval_samples=eval_samples, 49 | training_assets={"audio_processor": ap}, 50 | ) 51 | trainer.fit() 52 | --------------------------------------------------------------------------------
/.github/workflows/style_check.yml: -------------------------------------------------------------------------------- 1 | name: style-check 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | types: [opened, synchronize, reopened] 9 | jobs: 10 | check_skip: 11 | runs-on: ubuntu-latest 12 | if: "! contains(github.event.head_commit.message, '[ci skip]')" 13 | steps: 14 | - run: echo "${{ github.event.head_commit.message }}" 15 | 16 | test: 17 | runs-on: ubuntu-latest 18 | strategy: 19 | fail-fast: false 20 | matrix: 21 | python-version: [3.9] 22 | experimental: [false] 23 | steps: 24 | - uses: actions/checkout@v2 25 | - name: Set up Python ${{ matrix.python-version }} 26 | uses: coqui-ai/setup-python@pip-cache-key-py-ver 27 | with: 28 | python-version: ${{ matrix.python-version }} 29 | architecture: x64 30 | cache: 'pip' 31 | cache-dependency-path: 'requirements*' 32 | - name: check OS 33 | run: cat /etc/os-release 34 | - name: Install dependencies 35 | run: | 36 | sudo apt-get update 37 | sudo apt-get install -y git make gcc 38 | make system-deps 39 | - name: Install/upgrade Python setup deps 40 | run: python3 -m pip install --upgrade pip setuptools wheel 41 | - name: Install TTS 42 | run: | 43 | python3 -m pip install .[all] 44 | python3 setup.py egg_info 45 | - name: Lint check 46 | run: | 47 | make lint -------------------------------------------------------------------------------- /.github/workflows/vocoder_tests.yml: -------------------------------------------------------------------------------- 1 | name: vocoder-tests 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | types: [opened, synchronize, reopened] 9 | jobs: 10 | check_skip: 11 | runs-on: ubuntu-latest 12 | if: "! contains(github.event.head_commit.message, '[ci skip]')" 13 | steps: 14 | - run: echo "${{ github.event.head_commit.message }}" 15 | 16 | test: 17 | runs-on: ubuntu-latest 18 | strategy: 19 | fail-fast: false 20 | matrix: 21 | python-version: [3.7, 3.8, 3.9, "3.10"] 22 | experimental: [false] 23 | steps: 24 | - uses: actions/checkout@v2 25 | - name: Set up Python ${{ matrix.python-version }} 26 | uses: coqui-ai/setup-python@pip-cache-key-py-ver 27 | with: 28 | python-version: ${{ matrix.python-version }} 29 | architecture: x64 30 | cache: 'pip' 31 | cache-dependency-path: 'requirements*' 32 | - name: check OS 33 | run: cat /etc/os-release 34 | - name: Install dependencies 35 | run: | 36 | sudo apt-get update 37 | sudo apt-get install -y git make gcc 38 | make system-deps 39 | - name: Install/upgrade Python setup deps 40 | run: python3 -m pip install --upgrade pip setuptools wheel 41 | - name: Install TTS 42 | run: | 43 | python3 -m pip install .[all] 44 | python3 setup.py egg_info 45 | - name: Unit tests 46 | run: make test_vocoder 47 | -------------------------------------------------------------------------------- /tests/vocoder_tests/test_vocoder_parallel_wavegan_discriminator.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | from TTS.vocoder.models.parallel_wavegan_discriminator import ( 5 | ParallelWaveganDiscriminator, 6 | ResidualParallelWaveganDiscriminator, 7 | ) 8 | 9 | 10 | def test_pwgan_discriminator(): 11 | model = ParallelWaveganDiscriminator( 12 | in_channels=1, 13 | out_channels=1, 14 | kernel_size=3, 15 | num_layers=10, 16 | conv_channels=64, 17 | dilation_factor=1, 18 | nonlinear_activation="LeakyReLU", 19 | nonlinear_activation_params={"negative_slope": 0.2}, 20 | bias=True, 21 | )
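# a batch of 4 raw-audio clips of 64 * 256 samples each; the discriminator is fully convolutional, so the output keeps the input length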
22 | dummy_x = torch.rand((4, 1, 64 * 256)) 23 | output = model(dummy_x) 24 | assert np.all(output.shape == (4, 1, 64 * 256)) 25 | model.remove_weight_norm() 26 | 27 | 28 | def test_residual_pwgan_discriminator(): 29 | model = ResidualParallelWaveganDiscriminator( 30 | in_channels=1, 31 | out_channels=1, 32 | kernel_size=3, 33 | num_layers=30, 34 | stacks=3, 35 | res_channels=64, 36 | gate_channels=128, 37 | skip_channels=64, 38 | dropout=0.0, 39 | bias=True, 40 | nonlinear_activation="LeakyReLU", 41 | nonlinear_activation_params={"negative_slope": 0.2}, 42 | ) 43 | dummy_x = torch.rand((4, 1, 64 * 256)) 44 | output = model(dummy_x) 45 | assert np.all(output.shape == (4, 1, 64 * 256)) 46 | model.remove_weight_norm() 47 | -------------------------------------------------------------------------------- /TTS/encoder/utils/io.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import os 3 | 4 | from TTS.utils.io import save_fsspec 5 | 6 | 7 | def save_checkpoint(model, optimizer, model_loss, out_path, current_step): 8 | checkpoint_path = "checkpoint_{}.pth".format(current_step) 9 | checkpoint_path = os.path.join(out_path, checkpoint_path) 10 | print(" | | > Checkpoint saving : {}".format(checkpoint_path)) 11 | 12 | new_state_dict = model.state_dict() 13 | state = { 14 | "model": new_state_dict, 15 | "optimizer": optimizer.state_dict() if optimizer is not None else None, 16 | "step": current_step, 17 | "loss": model_loss, 18 | "date": datetime.date.today().strftime("%B %d, %Y"), 19 | } 20 | save_fsspec(state, checkpoint_path) 21 | 22 | 23 | def save_best_model(model, optimizer, model_loss, best_loss, out_path, current_step): 24 | if model_loss < best_loss: 25 | new_state_dict = model.state_dict() 26 | state = { 27 | "model": new_state_dict, 28 | "optimizer": optimizer.state_dict(), 29 | "step": current_step, 30 | "loss": model_loss, 31 | "date": datetime.date.today().strftime("%B %d, %Y"), 32 | } 33 | best_loss = model_loss 34 | bestmodel_path = "best_model.pth" 35 | bestmodel_path = os.path.join(out_path, bestmodel_path) 36 | print("\n > BEST MODEL ({0:.5f}) : {1:}".format(model_loss, bestmodel_path)) 37 | save_fsspec(state, bestmodel_path) 38 | return best_loss 39 | -------------------------------------------------------------------------------- /tests/vocoder_tests/test_wavegrad_train.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import shutil 4 | 5 | from tests import get_device_id, get_tests_output_path, run_cli 6 | from TTS.vocoder.configs import WavegradConfig 7 | 8 | config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") 9 | output_path = os.path.join(get_tests_output_path(), "train_outputs") 10 | 11 | config = WavegradConfig( 12 | batch_size=8, 13 | eval_batch_size=8, 14 | num_loader_workers=0, 15 | num_eval_loader_workers=0, 16 | run_eval=True, 17 | test_delay_epochs=-1, 18 | epochs=1, 19 | seq_len=8192, 20 | eval_split_size=1, 21 | print_step=1, 22 | print_eval=True, 23 | data_path="tests/data/ljspeech", 24 | output_path=output_path, 25 | test_noise_schedule={"min_val": 1e-6, "max_val": 1e-2, "num_steps": 2}, 26 | ) 27 | config.audio.do_trim_silence = True 28 | config.audio.trim_db = 60 29 | config.save_json(config_path) 30 | 31 | # train the model for one epoch 32 | command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " 33 | run_cli(command_train) 34 | 35 | # Find
latest folder 36 | continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) 37 | 38 | # restore the model and continue training for one more epoch 39 | command_train = ( 40 | f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " 41 | ) 42 | run_cli(command_train) 43 | shutil.rmtree(continue_path) 44 | -------------------------------------------------------------------------------- /TTS/encoder/utils/visual.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | import umap 5 | 6 | matplotlib.use("Agg") 7 | 8 | 9 | colormap = ( 10 | np.array( 11 | [ 12 | [76, 255, 0], 13 | [0, 127, 70], 14 | [255, 0, 0], 15 | [255, 217, 38], 16 | [0, 135, 255], 17 | [165, 0, 165], 18 | [255, 167, 255], 19 | [0, 255, 255], 20 | [255, 96, 38], 21 | [142, 76, 0], 22 | [33, 0, 127], 23 | [0, 0, 0], 24 | [183, 183, 183], 25 | ], 26 | dtype=float, # `np.float` was removed in recent NumPy; the builtin float is equivalent 27 | ) 28 | / 255 29 | ) 30 | 31 | 32 | def plot_embeddings(embeddings, num_classes_in_batch): 33 | num_utter_per_class = embeddings.shape[0] // num_classes_in_batch 34 | 35 | # if necessary get just the first 10 classes 36 | if num_classes_in_batch > 10: 37 | num_classes_in_batch = 10 38 | embeddings = embeddings[: num_classes_in_batch * num_utter_per_class] 39 | 40 | model = umap.UMAP() 41 | projection = model.fit_transform(embeddings) 42 | ground_truth = np.repeat(np.arange(num_classes_in_batch), num_utter_per_class) 43 | colors = [colormap[i] for i in ground_truth] 44 | fig, ax = plt.subplots(figsize=(16, 10)) 45 | _ = ax.scatter(projection[:, 0], projection[:, 1], c=colors) 46 | plt.gca().set_aspect("equal", "datalim") 47 | plt.title("UMAP projection") 48 | plt.tight_layout() 49 | plt.savefig("umap") 50 | return fig 51 | -------------------------------------------------------------------------------- /tests/text_tests/test_punctuation.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from TTS.tts.utils.text.punctuation import _DEF_PUNCS, Punctuation 4 | 5 | 6 | class PunctuationTest(unittest.TestCase): 7 | def setUp(self): 8 | self.punctuation = Punctuation() 9 | self.test_texts = [ 10 | ("This, is my text ... to be striped !! from text?", "This is my text to be striped from text"), 11 | ("This, is my text ... to be striped !! from text", "This is my text to be striped from text"), 12 | ("This, is my text ...
to be striped from text?", "This is my text to be striped from text"), 13 | ("This, is my text to be striped from text", "This is my text to be striped from text"), 14 | ] 15 | 16 | def test_get_set_puncs(self): 17 | self.punctuation.puncs = "-=" 18 | self.assertEqual(self.punctuation.puncs, "-=") 19 | 20 | self.punctuation.puncs = _DEF_PUNCS 21 | self.assertEqual(self.punctuation.puncs, _DEF_PUNCS) 22 | 23 | def test_strip_punc(self): 24 | for text, gt in self.test_texts: 25 | text_striped = self.punctuation.strip(text) 26 | self.assertEqual(text_striped, gt) 27 | 28 | def test_strip_restore(self): 29 | for text, gt in self.test_texts: 30 | text_striped, puncs_map = self.punctuation.strip_to_restore(text) 31 | text_restored = self.punctuation.restore(text_striped, puncs_map) 32 | self.assertEqual(" ".join(text_striped), gt) 33 | self.assertEqual(text_restored[0], text) 34 | -------------------------------------------------------------------------------- /tests/vocoder_tests/test_melgan_train.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import shutil 4 | 5 | from tests import get_device_id, get_tests_output_path, run_cli 6 | from TTS.vocoder.configs import MelganConfig 7 | 8 | config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") 9 | output_path = os.path.join(get_tests_output_path(), "train_outputs") 10 | 11 | config = MelganConfig( 12 | batch_size=4, 13 | eval_batch_size=4, 14 | num_loader_workers=0, 15 | num_eval_loader_workers=0, 16 | run_eval=True, 17 | test_delay_epochs=-1, 18 | epochs=1, 19 | seq_len=2048, 20 | eval_split_size=1, 21 | print_step=1, 22 | discriminator_model_params={"base_channels": 16, "max_channels": 64, "downsample_factors": [4, 4, 4]}, 23 | print_eval=True, 24 | data_path="tests/data/ljspeech", 25 | output_path=output_path, 26 | ) 27 | config.audio.do_trim_silence = True 28 | config.audio.trim_db = 60 29 | config.save_json(config_path) 30 | 31 | # train the model for one epoch 32 | command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " 33 | run_cli(command_train) 34 | 35 | # Find latest folder 36 | continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) 37 | 38 | # restore the model and continue training for one more epoch 39 | command_train = ( 40 | f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " 41 | ) 42 | run_cli(command_train) 43 | shutil.rmtree(continue_path) 44 | -------------------------------------------------------------------------------- /tests/vocoder_tests/test_wavernn_train.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import shutil 4 | 5 | from tests import get_device_id, get_tests_output_path, run_cli 6 | from TTS.vocoder.configs import WavernnConfig 7 | from TTS.vocoder.models.wavernn import WavernnArgs 8 | 9 | config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") 10 | output_path = os.path.join(get_tests_output_path(), "train_outputs") 11 | 12 | 13 | config = WavernnConfig( 14 | model_args=WavernnArgs(), 15 | batch_size=8, 16 | eval_batch_size=8, 17 | num_loader_workers=0, 18 | num_eval_loader_workers=0, 19 | run_eval=True, 20 | test_delay_epochs=-1, 21 | epochs=1, 22 | seq_len=256, # for shorter test time 23 | eval_split_size=1, 24 | print_step=1, 25 | print_eval=True, 26 | 
data_path="tests/data/ljspeech", 27 | output_path=output_path, 28 | ) 29 | config.audio.do_trim_silence = True 30 | config.audio.trim_db = 60 31 | config.save_json(config_path) 32 | 33 | # train the model for one epoch 34 | command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " 35 | run_cli(command_train) 36 | 37 | # Find latest folder 38 | continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) 39 | 40 | # restore the model and continue training for one more epoch 41 | command_train = ( 42 | f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " 43 | ) 44 | run_cli(command_train) 45 | shutil.rmtree(continue_path) 46 | -------------------------------------------------------------------------------- /tests/vocoder_tests/test_fullband_melgan_train.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import shutil 4 | 5 | from tests import get_device_id, get_tests_output_path, run_cli 6 | from TTS.vocoder.configs import FullbandMelganConfig 7 | 8 | config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") 9 | output_path = os.path.join(get_tests_output_path(), "train_outputs") 10 | 11 | config = FullbandMelganConfig( 12 | batch_size=8, 13 | eval_batch_size=8, 14 | num_loader_workers=0, 15 | num_eval_loader_workers=0, 16 | run_eval=True, 17 | test_delay_epochs=-1, 18 | epochs=1, 19 | seq_len=8192, 20 | eval_split_size=1, 21 | print_step=1, 22 | print_eval=True, 23 | data_path="tests/data/ljspeech", 24 | discriminator_model_params={"base_channels": 16, "max_channels": 64, "downsample_factors": [4, 4, 4]}, 25 | output_path=output_path, 26 | ) 27 | config.audio.do_trim_silence = True 28 | config.audio.trim_db = 60 29 | config.save_json(config_path) 30 | 31 | # train the model for one epoch 32 | command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " 33 | run_cli(command_train) 34 | 35 | # Find latest folder 36 | continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) 37 | 38 | # restore the model and continue training for one more epoch 39 | command_train = ( 40 | f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " 41 | ) 42 | run_cli(command_train) 43 | shutil.rmtree(continue_path) 44 | -------------------------------------------------------------------------------- /TTS/tts/utils/text/french/abbreviations.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | # List of (regular expression, replacement) pairs for abbreviations in french: 4 | abbreviations_fr = [ 5 | (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) 6 | for x in [ 7 | ("M", "monsieur"), 8 | ("Mlle", "mademoiselle"), 9 | ("Mlles", "mesdemoiselles"), 10 | ("Mme", "Madame"), 11 | ("Mmes", "Mesdames"), 12 | ("N.B", "nota bene"), 13 | ("M", "monsieur"), 14 | ("p.c.q", "parce que"), 15 | ("Pr", "professeur"), 16 | ("qqch", "quelque chose"), 17 | ("rdv", "rendez-vous"), 18 | ("max", "maximum"), 19 | ("min", "minimum"), 20 | ("no", "numéro"), 21 | ("adr", "adresse"), 22 | ("dr", "docteur"), 23 | ("st", "saint"), 24 | ("co", "companie"), 25 | ("jr", "junior"), 26 | ("sgt", "sergent"), 27 | ("capt", "capitain"), 28 | ("col", "colonel"), 29 | ("av", "avenue"), 30 | ("av. 
J.-C", "avant Jésus-Christ"), 31 | ("apr. J.-C", "après Jésus-Christ"), 32 | ("art", "article"), 33 | ("boul", "boulevard"), 34 | ("c.-à-d", "c’est-à-dire"), 35 | ("etc", "et cetera"), 36 | ("ex", "exemple"), 37 | ("excl", "exclusivement"), 38 | ("boul", "boulevard"), 39 | ] 40 | ] + [ 41 | (re.compile("\\b%s" % x[0]), x[1]) 42 | for x in [ 43 | ("Mlle", "mademoiselle"), 44 | ("Mlles", "mesdemoiselles"), 45 | ("Mme", "Madame"), 46 | ("Mmes", "Mesdames"), 47 | ] 48 | ] 49 | -------------------------------------------------------------------------------- /.github/workflows/text_tests.yml: -------------------------------------------------------------------------------- 1 | name: text-tests 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | types: [opened, synchronize, reopened] 9 | jobs: 10 | check_skip: 11 | runs-on: ubuntu-latest 12 | if: "! contains(github.event.head_commit.message, '[ci skip]')" 13 | steps: 14 | - run: echo "${{ github.event.head_commit.message }}" 15 | 16 | test: 17 | runs-on: ubuntu-latest 18 | strategy: 19 | fail-fast: false 20 | matrix: 21 | python-version: [3.7, 3.8, 3.9, "3.10"] 22 | experimental: [false] 23 | steps: 24 | - uses: actions/checkout@v2 25 | - name: Set up Python ${{ matrix.python-version }} 26 | uses: coqui-ai/setup-python@pip-cache-key-py-ver 27 | with: 28 | python-version: ${{ matrix.python-version }} 29 | architecture: x64 30 | cache: 'pip' 31 | cache-dependency-path: 'requirements*' 32 | - name: check OS 33 | run: cat /etc/os-release 34 | - name: Install dependencies 35 | run: | 36 | sudo apt-get update 37 | sudo apt-get install -y --no-install-recommends git make gcc 38 | sudo apt-get install espeak 39 | sudo apt-get install espeak-ng 40 | make system-deps 41 | - name: Install/upgrade Python setup deps 42 | run: python3 -m pip install --upgrade pip setuptools wheel 43 | - name: Install TTS 44 | run: | 45 | python3 -m pip install .[all] 46 | python3 setup.py egg_info 47 | - name: Unit tests 48 | run: make test_text 49 | -------------------------------------------------------------------------------- /tests/vocoder_tests/test_multiband_melgan_train.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import shutil 4 | 5 | from tests import get_device_id, get_tests_output_path, run_cli 6 | from TTS.vocoder.configs import MultibandMelganConfig 7 | 8 | config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") 9 | output_path = os.path.join(get_tests_output_path(), "train_outputs") 10 | 11 | config = MultibandMelganConfig( 12 | batch_size=8, 13 | eval_batch_size=8, 14 | num_loader_workers=0, 15 | num_eval_loader_workers=0, 16 | run_eval=True, 17 | test_delay_epochs=-1, 18 | epochs=1, 19 | seq_len=8192, 20 | eval_split_size=1, 21 | print_step=1, 22 | print_eval=True, 23 | steps_to_start_discriminator=1, 24 | data_path="tests/data/ljspeech", 25 | discriminator_model_params={"base_channels": 16, "max_channels": 64, "downsample_factors": [4, 4, 4]}, 26 | output_path=output_path, 27 | ) 28 | config.audio.do_trim_silence = True 29 | config.audio.trim_db = 60 30 | config.save_json(config_path) 31 | 32 | # train the model for one epoch 33 | command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " 34 | run_cli(command_train) 35 | 36 | # Find latest folder 37 | continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) 38 | 39 | # restore the model and continue training 
for one more epoch 40 | command_train = ( 41 | f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " 42 | ) 43 | run_cli(command_train) 44 | shutil.rmtree(continue_path) 45 | -------------------------------------------------------------------------------- /tests/vocoder_tests/test_vocoder_wavernn.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import numpy as np 4 | import torch 5 | 6 | from TTS.vocoder.configs import WavernnConfig 7 | from TTS.vocoder.models.wavernn import Wavernn, WavernnArgs 8 | 9 | 10 | def test_wavernn(): 11 | config = WavernnConfig() 12 | config.model_args = WavernnArgs( 13 | rnn_dims=512, 14 | fc_dims=512, 15 | mode="mold", 16 | mulaw=False, 17 | pad=2, 18 | use_aux_net=True, 19 | use_upsample_net=True, 20 | upsample_factors=[4, 8, 8], 21 | feat_dims=80, 22 | compute_dims=128, 23 | res_out_dims=128, 24 | num_res_blocks=10, 25 | ) 26 | config.audio.hop_length = 256 27 | config.audio.sample_rate = 2048 28 | 29 | dummy_x = torch.rand((2, 1280)) 30 | dummy_m = torch.rand((2, 80, 9)) 31 | y_size = random.randrange(20, 60) 32 | dummy_y = torch.rand((80, y_size)) 33 | 34 | # mode: mold 35 | model = Wavernn(config) 36 | output = model(dummy_x, dummy_m) 37 | assert np.all(output.shape == (2, 1280, 30)), output.shape 38 | 39 | # mode: gauss 40 | config.model_args.mode = "gauss" 41 | model = Wavernn(config) 42 | output = model(dummy_x, dummy_m) 43 | assert np.all(output.shape == (2, 1280, 2)), output.shape 44 | 45 | # mode: quantized 46 | config.model_args.mode = 4 47 | model = Wavernn(config) 48 | output = model(dummy_x, dummy_m) 49 | assert np.all(output.shape == (2, 1280, 2**4)), output.shape 50 | output = model.inference(dummy_y, True, 5500, 550) 51 | assert np.all(output.shape == (256 * (y_size - 1),)) 52 | -------------------------------------------------------------------------------- /tests/inputs/test_vocoder_audio_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "audio":{ 3 | "num_mels": 80, // size of the mel spec frame. 4 | "num_freq": 513, // number of stft frequency levels. Size of the linear spectrogram frame. 5 | "sample_rate": 22050, // wav sample-rate. If different from the original data, it is resampled. 6 | "frame_length_ms": null, // stft window length in ms. 7 | "frame_shift_ms": null, // stft window hop-length in ms. 8 | "hop_length": 256, 9 | "win_length": 1024, 10 | "preemphasis": 0.97, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no pre-emphasis. 11 | "min_level_db": -100, // normalization range 12 | "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. 13 | "power": 1.5, // value to sharpen wav signals after GL algorithm. 14 | "griffin_lim_iters": 30,// #griffin-lim iterations. 30-60 is a good range. The larger the value, the slower the generation. 15 | "signal_norm": true, // normalize the spec values in range [0, 1] 16 | "symmetric_norm": true, // move normalization to range [-1, 1] 17 | "clip_norm": true, // clip normalized values into the range. 18 | "max_norm": 4, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] 19 | "mel_fmin": 0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! 20 | "mel_fmax": 8000, // maximum freq level for mel-spec. Tune for dataset!!
21 | "do_trim_silence": false 22 | } 23 | } 24 | 25 | -------------------------------------------------------------------------------- /tests/data/ljspeech/metadata.csv: -------------------------------------------------------------------------------- 1 | LJ001-0001|Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition|Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition 2 | LJ001-0002|in being comparatively modern.|in being comparatively modern. 3 | LJ001-0003|For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process|For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process 4 | LJ001-0004|produced the block books, which were the immediate predecessors of the true printed book,|produced the block books, which were the immediate predecessors of the true printed book, 5 | LJ001-0005|the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.|the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing. 6 | LJ001-0006|And it is worth mention in passing that, as an example of fine typography,|And it is worth mention in passing that, as an example of fine typography, 7 | LJ001-0007|the earliest book printed with movable types, the Gutenberg, or "forty-two line Bible" of about 1455,|the earliest book printed with movable types, the Gutenberg, or "forty-two line Bible" of about fourteen fifty-five, 8 | LJ001-0008|has never been surpassed.|has never been surpassed. 
9 | -------------------------------------------------------------------------------- /TTS/bin/find_unique_chars.py: -------------------------------------------------------------------------------- 1 | """Find all the unique characters in a dataset""" 2 | import argparse 3 | from argparse import RawTextHelpFormatter 4 | 5 | from TTS.config import load_config 6 | from TTS.tts.datasets import load_tts_samples 7 | 8 | 9 | def main(): 10 | # pylint: disable=bad-option-value 11 | parser = argparse.ArgumentParser( 12 | description="""Find all the unique characters or phonemes in a dataset.\n\n""" 13 | """ 14 | Example runs: 15 | 16 | python TTS/bin/find_unique_chars.py --config_path config.json 17 | """, 18 | formatter_class=RawTextHelpFormatter, 19 | ) 20 | parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True) 21 | args = parser.parse_args() 22 | 23 | c = load_config(args.config_path) 24 | 25 | # load all datasets 26 | train_items, eval_items = load_tts_samples( 27 | c.datasets, eval_split=True, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size 28 | ) 29 | 30 | items = train_items + eval_items 31 | 32 | texts = "".join(item["text"] for item in items) 33 | chars = set(texts) 34 | lower_chars = filter(lambda c: c.islower(), chars) 35 | chars_force_lower = [c.lower() for c in chars] 36 | chars_force_lower = set(chars_force_lower) 37 | 38 | print(f" > Number of unique characters: {len(chars)}") 39 | print(f" > Unique characters: {''.join(sorted(chars))}") 40 | print(f" > Unique lower characters: {''.join(sorted(lower_chars))}") 41 | print(f" > Unique all forced to lower characters: {''.join(sorted(chars_force_lower))}") 42 | 43 | 44 | if __name__ == "__main__": 45 | main() 46 | -------------------------------------------------------------------------------- /.github/workflows/aux_tests.yml: -------------------------------------------------------------------------------- 1 | name: aux-tests 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | types: [opened, synchronize, reopened] 9 | jobs: 10 | check_skip: 11 | runs-on: ubuntu-latest 12 | if: "! 
contains(github.event.head_commit.message, '[ci skip]')" 13 | steps: 14 | - run: echo "${{ github.event.head_commit.message }}" 15 | 16 | test: 17 | runs-on: ubuntu-latest 18 | strategy: 19 | fail-fast: false 20 | matrix: 21 | python-version: [3.7, 3.8, 3.9, "3.10"] 22 | experimental: [false] 23 | steps: 24 | - uses: actions/checkout@v2 25 | - name: Set up Python ${{ matrix.python-version }} 26 | uses: coqui-ai/setup-python@pip-cache-key-py-ver 27 | with: 28 | python-version: ${{ matrix.python-version }} 29 | architecture: x64 30 | cache: 'pip' 31 | cache-dependency-path: 'requirements*' 32 | - name: check OS 33 | run: cat /etc/os-release 34 | - name: Install dependencies 35 | run: | 36 | sudo apt-get update 37 | sudo apt-get install -y git make gcc 38 | make system-deps 39 | - name: Install/upgrade Python setup deps 40 | run: python3 -m pip install --upgrade pip setuptools wheel 41 | - name: Replace scarf urls 42 | run: | 43 | sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json 44 | - name: Install TTS 45 | run: | 46 | python3 -m pip install .[all] 47 | python3 setup.py egg_info 48 | - name: Unit tests 49 | run: make test_aux 50 | -------------------------------------------------------------------------------- /recipes/thorsten_DE/univnet/train_univnet.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from trainer import Trainer, TrainerArgs 4 | 5 | from TTS.utils.audio import AudioProcessor 6 | from TTS.utils.downloaders import download_thorsten_de 7 | from TTS.vocoder.configs import UnivnetConfig 8 | from TTS.vocoder.datasets.preprocess import load_wav_data 9 | from TTS.vocoder.models.gan import GAN 10 | 11 | output_path = os.path.dirname(os.path.abspath(__file__)) 12 | config = UnivnetConfig( 13 | batch_size=64, 14 | eval_batch_size=16, 15 | num_loader_workers=4, 16 | num_eval_loader_workers=4, 17 | run_eval=True, 18 | test_delay_epochs=-1, 19 | epochs=1000, 20 | seq_len=8192, 21 | pad_short=2000, 22 | use_noise_augment=True, 23 | eval_split_size=10, 24 | print_step=25, 25 | print_eval=False, 26 | mixed_precision=False, 27 | lr_gen=1e-4, 28 | lr_disc=1e-4, 29 | data_path=os.path.join(output_path, "../thorsten-de/wavs/"), 30 | output_path=output_path, 31 | ) 32 | 33 | # download dataset if not already present 34 | if not os.path.exists(config.data_path): 35 | print("Downloading dataset") 36 | download_path = os.path.abspath(os.path.join(os.path.abspath(config.data_path), "../../")) 37 | download_thorsten_de(download_path) 38 | 39 | # init audio processor 40 | ap = AudioProcessor(**config.audio.to_dict()) 41 | 42 | # load training samples 43 | eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size) 44 | 45 | # init model 46 | model = GAN(config, ap) 47 | 48 | # init the trainer and 🚀 49 | trainer = Trainer( 50 | TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples 51 | ) 52 | trainer.fit() 53 | -------------------------------------------------------------------------------- /.github/workflows/data_tests.yml: -------------------------------------------------------------------------------- 1 | name: data-tests 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | types: [opened, synchronize, reopened] 9 | jobs: 10 | check_skip: 11 | runs-on: ubuntu-latest 12 | if: "! 
contains(github.event.head_commit.message, '[ci skip]')" 13 | steps: 14 | - run: echo "${{ github.event.head_commit.message }}" 15 | 16 | test: 17 | runs-on: ubuntu-latest 18 | strategy: 19 | fail-fast: false 20 | matrix: 21 | python-version: [3.7, 3.8, 3.9, "3.10"] 22 | experimental: [false] 23 | steps: 24 | - uses: actions/checkout@v2 25 | - name: Set up Python ${{ matrix.python-version }} 26 | uses: coqui-ai/setup-python@pip-cache-key-py-ver 27 | with: 28 | python-version: ${{ matrix.python-version }} 29 | architecture: x64 30 | cache: 'pip' 31 | cache-dependency-path: 'requirements*' 32 | - name: check OS 33 | run: cat /etc/os-release 34 | - name: Install dependencies 35 | run: | 36 | sudo apt-get update 37 | sudo apt-get install -y --no-install-recommends git make gcc 38 | make system-deps 39 | - name: Install/upgrade Python setup deps 40 | run: python3 -m pip install --upgrade pip setuptools wheel 41 | - name: Replace scarf urls 42 | run: | 43 | sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json 44 | - name: Install TTS 45 | run: | 46 | python3 -m pip install .[all] 47 | python3 setup.py egg_info 48 | - name: Unit tests 49 | run: make data_tests 50 | -------------------------------------------------------------------------------- /recipes/thorsten_DE/hifigan/train_hifigan.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from trainer import Trainer, TrainerArgs 4 | 5 | from TTS.utils.audio import AudioProcessor 6 | from TTS.utils.downloaders import download_thorsten_de 7 | from TTS.vocoder.configs import HifiganConfig 8 | from TTS.vocoder.datasets.preprocess import load_wav_data 9 | from TTS.vocoder.models.gan import GAN 10 | 11 | output_path = os.path.dirname(os.path.abspath(__file__)) 12 | 13 | config = HifiganConfig( 14 | batch_size=32, 15 | eval_batch_size=16, 16 | num_loader_workers=4, 17 | num_eval_loader_workers=4, 18 | run_eval=True, 19 | test_delay_epochs=5, 20 | epochs=1000, 21 | seq_len=8192, 22 | pad_short=2000, 23 | use_noise_augment=True, 24 | eval_split_size=10, 25 | print_step=25, 26 | print_eval=False, 27 | mixed_precision=False, 28 | lr_gen=1e-4, 29 | lr_disc=1e-4, 30 | data_path=os.path.join(output_path, "../thorsten-de/wavs/"), 31 | output_path=output_path, 32 | ) 33 | 34 | # download dataset if not already present 35 | if not os.path.exists(config.data_path): 36 | print("Downloading dataset") 37 | download_path = os.path.abspath(os.path.join(os.path.abspath(config.data_path), "../../")) 38 | download_thorsten_de(download_path) 39 | 40 | # init audio processor 41 | ap = AudioProcessor(**config.audio.to_dict()) 42 | 43 | # load training samples 44 | eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size) 45 | 46 | # init model 47 | model = GAN(config, ap) 48 | 49 | # init the trainer and 🚀 50 | trainer = Trainer( 51 | TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples 52 | ) 53 | trainer.fit() 54 | -------------------------------------------------------------------------------- /TTS/utils/training.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | 5 | def check_update(model, grad_clip, ignore_stopnet=False, amp_opt_params=None): 6 | r"""Check model gradient against unexpected jumps and failures""" 7 | skip_flag = False 8 | if ignore_stopnet: 9 | if not amp_opt_params: 10 | 
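# clip every parameter except the stopnet ones, so stopnet gradients are left untouched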
grad_norm = torch.nn.utils.clip_grad_norm_( 11 | [param for name, param in model.named_parameters() if "stopnet" not in name], grad_clip 12 | ) 13 | else: 14 | grad_norm = torch.nn.utils.clip_grad_norm_(amp_opt_params, grad_clip) 15 | else: 16 | if not amp_opt_params: 17 | grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip) 18 | else: 19 | grad_norm = torch.nn.utils.clip_grad_norm_(amp_opt_params, grad_clip) 20 | 21 | # compatibility with different torch versions 22 | if isinstance(grad_norm, float): 23 | if np.isinf(grad_norm): 24 | print(" | > Gradient is INF !!") 25 | skip_flag = True 26 | else: 27 | if torch.isinf(grad_norm): 28 | print(" | > Gradient is INF !!") 29 | skip_flag = True 30 | return grad_norm, skip_flag 31 | 32 | 33 | def gradual_training_scheduler(global_step, config): 34 | """Setup the gradual training schedule wrt number 35 | of active GPUs""" 36 | num_gpus = torch.cuda.device_count() 37 | if num_gpus == 0: 38 | num_gpus = 1 39 | new_values = None 40 | # we set the scheduling wrt num_gpus 41 | for values in config.gradual_training: 42 | if global_step * num_gpus >= values[0]: 43 | new_values = values 44 | return new_values[1], new_values[2] 45 | -------------------------------------------------------------------------------- /TTS/vocoder/models/melgan_multiscale_discriminator.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | from TTS.vocoder.models.melgan_discriminator import MelganDiscriminator 4 | 5 | 6 | class MelganMultiscaleDiscriminator(nn.Module): 7 | def __init__( 8 | self, 9 | in_channels=1, 10 | out_channels=1, 11 | num_scales=3, 12 | kernel_sizes=(5, 3), 13 | base_channels=16, 14 | max_channels=1024, 15 | downsample_factors=(4, 4, 4), 16 | pooling_kernel_size=4, 17 | pooling_stride=2, 18 | pooling_padding=2, 19 | groups_denominator=4, 20 | ): 21 | super().__init__() 22 | 23 | self.discriminators = nn.ModuleList( 24 | [ 25 | MelganDiscriminator( 26 | in_channels=in_channels, 27 | out_channels=out_channels, 28 | kernel_sizes=kernel_sizes, 29 | base_channels=base_channels, 30 | max_channels=max_channels, 31 | downsample_factors=downsample_factors, 32 | groups_denominator=groups_denominator, 33 | ) 34 | for _ in range(num_scales) 35 | ] 36 | ) 37 | 38 | self.pooling = nn.AvgPool1d( 39 | kernel_size=pooling_kernel_size, stride=pooling_stride, padding=pooling_padding, count_include_pad=False 40 | ) 41 | 42 | def forward(self, x): 43 | scores = [] 44 | feats = [] 45 | for disc in self.discriminators: 46 | score, feat = disc(x) 47 | scores.append(score) 48 | feats.append(feat) 49 | x = self.pooling(x) 50 | return scores, feats 51 | -------------------------------------------------------------------------------- /.github/workflows/inference_tests.yml: -------------------------------------------------------------------------------- 1 | name: inference_tests 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | types: [opened, synchronize, reopened] 9 | jobs: 10 | check_skip: 11 | runs-on: ubuntu-latest 12 | if: "! 
contains(github.event.head_commit.message, '[ci skip]')" 13 | steps: 14 | - run: echo "${{ github.event.head_commit.message }}" 15 | 16 | test: 17 | runs-on: ubuntu-latest 18 | strategy: 19 | fail-fast: false 20 | matrix: 21 | python-version: [3.7, 3.8, 3.9, "3.10"] 22 | experimental: [false] 23 | steps: 24 | - uses: actions/checkout@v2 25 | - name: Set up Python ${{ matrix.python-version }} 26 | uses: coqui-ai/setup-python@pip-cache-key-py-ver 27 | with: 28 | python-version: ${{ matrix.python-version }} 29 | architecture: x64 30 | cache: 'pip' 31 | cache-dependency-path: 'requirements*' 32 | - name: check OS 33 | run: cat /etc/os-release 34 | - name: Install dependencies 35 | run: | 36 | sudo apt-get update 37 | sudo apt-get install -y --no-install-recommends git make gcc 38 | make system-deps 39 | - name: Install/upgrade Python setup deps 40 | run: python3 -m pip install --upgrade pip setuptools wheel 41 | - name: Replace scarf urls 42 | run: | 43 | sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json 44 | - name: Install TTS 45 | run: | 46 | python3 -m pip install .[all] 47 | python3 setup.py egg_info 48 | - name: Unit tests 49 | run: make inference_tests 50 | -------------------------------------------------------------------------------- /docs/source/models/vits.md: -------------------------------------------------------------------------------- 1 | # VITS 2 | 3 | VITS (Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech) 4 | is an end-to-end TTS model (encoder and vocoder trained together) that takes advantage of SOTA DL techniques such as GANs, VAEs, and 5 | normalizing flows. It does not require external alignment annotations and learns the text-to-audio alignment 6 | using MAS (monotonic alignment search), as explained in the paper. The model architecture is a combination of the GlowTTS encoder and the HiFiGAN vocoder. 7 | It is a feed-forward model with a 67.12x real-time factor on a GPU. 8 | 9 | 🐸 YourTTS is a multi-speaker and multi-lingual TTS model that can perform voice conversion and zero-shot speaker adaptation. 10 | It can also learn a new language or voice with only about one minute of audio. This makes it a promising approach for training 11 | TTS models in low-resource languages. 🐸 YourTTS uses VITS as the backbone architecture, coupled with a speaker encoder model. 12 | 13 | ## Important resources & papers 14 | - 🐸 YourTTS: https://arxiv.org/abs/2112.02418 15 | - VITS: https://arxiv.org/pdf/2106.06103.pdf 16 | - Neural Spline Flows: https://arxiv.org/abs/1906.04032 17 | - Variational Autoencoder: https://arxiv.org/pdf/1312.6114.pdf 18 | - Generative Adversarial Networks: https://arxiv.org/abs/1406.2661 19 | - HiFiGAN: https://arxiv.org/abs/2010.05646 20 | - Normalizing Flows: https://blog.evjang.com/2018/01/nf1.html 21 | 22 | ## VitsConfig 23 | ```{eval-rst} 24 | .. autoclass:: TTS.tts.configs.vits_config.VitsConfig 25 | :members: 26 | ``` 27 | 28 | ## VitsArgs 29 | ```{eval-rst} 30 | .. autoclass:: TTS.tts.models.vits.VitsArgs 31 | :members: 32 | ``` 33 | 34 | ## Vits Model 35 | ```{eval-rst} 36 | ..
autoclass:: TTS.tts.models.vits.Vits 37 | :members: 38 | ``` 39 | -------------------------------------------------------------------------------- /.github/workflows/zoo_tests.yml: -------------------------------------------------------------------------------- 1 | name: zoo-tests 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | types: [opened, synchronize, reopened] 9 | jobs: 10 | check_skip: 11 | runs-on: ubuntu-latest 12 | if: "! contains(github.event.head_commit.message, '[ci skip]')" 13 | steps: 14 | - run: echo "${{ github.event.head_commit.message }}" 15 | 16 | test: 17 | runs-on: ubuntu-latest 18 | strategy: 19 | fail-fast: false 20 | matrix: 21 | python-version: [3.7, 3.8, 3.9, "3.10"] 22 | experimental: [false] 23 | steps: 24 | - uses: actions/checkout@v2 25 | - name: Set up Python ${{ matrix.python-version }} 26 | uses: coqui-ai/setup-python@pip-cache-key-py-ver 27 | with: 28 | python-version: ${{ matrix.python-version }} 29 | architecture: x64 30 | cache: 'pip' 31 | cache-dependency-path: 'requirements*' 32 | - name: check OS 33 | run: cat /etc/os-release 34 | - name: Install dependencies 35 | run: | 36 | sudo apt-get update 37 | sudo apt-get install -y git make gcc 38 | sudo apt-get install espeak espeak-ng 39 | make system-deps 40 | - name: Install/upgrade Python setup deps 41 | run: python3 -m pip install --upgrade pip setuptools wheel 42 | - name: Replace scarf urls 43 | run: | 44 | sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json 45 | - name: Install TTS 46 | run: | 47 | python3 -m pip install .[all] 48 | python3 setup.py egg_info 49 | - name: Unit tests 50 | run: make test_zoo 51 | -------------------------------------------------------------------------------- /recipes/thorsten_DE/multiband_melgan/train_multiband_melgan.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from trainer import Trainer, TrainerArgs 4 | 5 | from TTS.utils.audio import AudioProcessor 6 | from TTS.utils.downloaders import download_thorsten_de 7 | from TTS.vocoder.configs import MultibandMelganConfig 8 | from TTS.vocoder.datasets.preprocess import load_wav_data 9 | from TTS.vocoder.models.gan import GAN 10 | 11 | output_path = os.path.dirname(os.path.abspath(__file__)) 12 | 13 | config = MultibandMelganConfig( 14 | batch_size=32, 15 | eval_batch_size=16, 16 | num_loader_workers=4, 17 | num_eval_loader_workers=4, 18 | run_eval=True, 19 | test_delay_epochs=5, 20 | epochs=1000, 21 | seq_len=8192, 22 | pad_short=2000, 23 | use_noise_augment=True, 24 | eval_split_size=10, 25 | print_step=25, 26 | print_eval=False, 27 | mixed_precision=False, 28 | lr_gen=1e-4, 29 | lr_disc=1e-4, 30 | data_path=os.path.join(output_path, "../thorsten-de/wavs/"), 31 | output_path=output_path, 32 | ) 33 | 34 | # download dataset if not already present 35 | if not os.path.exists(config.data_path): 36 | print("Downloading dataset") 37 | download_path = os.path.abspath(os.path.join(os.path.abspath(config.data_path), "../../")) 38 | download_thorsten_de(download_path) 39 | 40 | # init audio processor 41 | ap = AudioProcessor(**config.audio.to_dict()) 42 | 43 | # load training samples 44 | eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size) 45 | 46 | # init model 47 | model = GAN(config, ap) 48 | 49 | # init the trainer and 🚀 50 | trainer = Trainer( 51 | TrainerArgs(), config, output_path, model=model, train_samples=train_samples, 
eval_samples=eval_samples 52 | ) 53 | trainer.fit() 54 | -------------------------------------------------------------------------------- /TTS/vocoder/layers/melgan.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from torch.nn.utils import weight_norm 3 | 4 | 5 | class ResidualStack(nn.Module): 6 | def __init__(self, channels, num_res_blocks, kernel_size): 7 | super().__init__() 8 | 9 | assert (kernel_size - 1) % 2 == 0, " [!] kernel_size has to be odd." 10 | base_padding = (kernel_size - 1) // 2 11 | 12 | self.blocks = nn.ModuleList() 13 | for idx in range(num_res_blocks): 14 | layer_kernel_size = kernel_size 15 | layer_dilation = layer_kernel_size**idx 16 | layer_padding = base_padding * layer_dilation 17 | self.blocks += [ 18 | nn.Sequential( 19 | nn.LeakyReLU(0.2), 20 | nn.ReflectionPad1d(layer_padding), 21 | weight_norm( 22 | nn.Conv1d(channels, channels, kernel_size=kernel_size, dilation=layer_dilation, bias=True) 23 | ), 24 | nn.LeakyReLU(0.2), 25 | weight_norm(nn.Conv1d(channels, channels, kernel_size=1, bias=True)), 26 | ) 27 | ] 28 | 29 | self.shortcuts = nn.ModuleList( 30 | [weight_norm(nn.Conv1d(channels, channels, kernel_size=1, bias=True)) for i in range(num_res_blocks)] 31 | ) 32 | 33 | def forward(self, x): 34 | for block, shortcut in zip(self.blocks, self.shortcuts): 35 | x = shortcut(x) + block(x) 36 | return x 37 | 38 | def remove_weight_norm(self): 39 | for block, shortcut in zip(self.blocks, self.shortcuts): 40 | nn.utils.remove_weight_norm(block[2]) 41 | nn.utils.remove_weight_norm(block[4]) 42 | nn.utils.remove_weight_norm(shortcut) 43 | -------------------------------------------------------------------------------- /recipes/thorsten_DE/wavegrad/train_wavegrad.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from trainer import Trainer, TrainerArgs 4 | 5 | from TTS.utils.audio import AudioProcessor 6 | from TTS.utils.downloaders import download_thorsten_de 7 | from TTS.vocoder.configs import WavegradConfig 8 | from TTS.vocoder.datasets.preprocess import load_wav_data 9 | from TTS.vocoder.models.wavegrad import Wavegrad 10 | 11 | output_path = os.path.dirname(os.path.abspath(__file__)) 12 | config = WavegradConfig( 13 | batch_size=32, 14 | eval_batch_size=16, 15 | num_loader_workers=4, 16 | num_eval_loader_workers=4, 17 | run_eval=True, 18 | test_delay_epochs=-1, 19 | epochs=1000, 20 | seq_len=6144, 21 | pad_short=2000, 22 | use_noise_augment=True, 23 | eval_split_size=50, 24 | print_step=50, 25 | print_eval=True, 26 | mixed_precision=False, 27 | data_path=os.path.join(output_path, "../thorsten-de/wavs/"), 28 | output_path=output_path, 29 | ) 30 | 31 | # download dataset if not already present 32 | if not os.path.exists(config.data_path): 33 | print("Downloading dataset") 34 | download_path = os.path.abspath(os.path.join(os.path.abspath(config.data_path), "../../")) 35 | download_thorsten_de(download_path) 36 | 37 | # init audio processor 38 | ap = AudioProcessor(**config.audio.to_dict()) 39 | 40 | # load training samples 41 | eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size) 42 | 43 | # init model 44 | model = Wavegrad(config) 45 | 46 | # init the trainer and 🚀 47 | trainer = Trainer( 48 | TrainerArgs(), 49 | config, 50 | output_path, 51 | model=model, 52 | train_samples=train_samples, 53 | eval_samples=eval_samples, 54 | training_assets={"audio_processor": ap}, 55 | ) 56 | trainer.fit() 57 | 
-------------------------------------------------------------------------------- /.github/workflows/tts_tests.yml: -------------------------------------------------------------------------------- 1 | name: tts-tests 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | types: [opened, synchronize, reopened] 9 | jobs: 10 | check_skip: 11 | runs-on: ubuntu-latest 12 | if: "! contains(github.event.head_commit.message, '[ci skip]')" 13 | steps: 14 | - run: echo "${{ github.event.head_commit.message }}" 15 | 16 | test: 17 | runs-on: ubuntu-latest 18 | strategy: 19 | fail-fast: false 20 | matrix: 21 | python-version: [3.7, 3.8, 3.9, "3.10"] 22 | experimental: [false] 23 | steps: 24 | - uses: actions/checkout@v2 25 | - name: Set up Python ${{ matrix.python-version }} 26 | uses: coqui-ai/setup-python@pip-cache-key-py-ver 27 | with: 28 | python-version: ${{ matrix.python-version }} 29 | architecture: x64 30 | cache: 'pip' 31 | cache-dependency-path: 'requirements*' 32 | - name: check OS 33 | run: cat /etc/os-release 34 | - name: Install dependencies 35 | run: | 36 | sudo apt-get update 37 | sudo apt-get install -y --no-install-recommends git make gcc 38 | sudo apt-get install espeak 39 | sudo apt-get install espeak-ng 40 | make system-deps 41 | - name: Install/upgrade Python setup deps 42 | run: python3 -m pip install --upgrade pip setuptools wheel 43 | - name: Replace scarf urls 44 | run: | 45 | sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json 46 | - name: Install TTS 47 | run: | 48 | python3 -m pip install .[all] 49 | python3 setup.py egg_info 50 | - name: Unit tests 51 | run: make test_tts 52 | -------------------------------------------------------------------------------- /recipes/thorsten_DE/wavernn/train_wavernn.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from trainer import Trainer, TrainerArgs 4 | 5 | from TTS.utils.audio import AudioProcessor 6 | from TTS.utils.downloaders import download_thorsten_de 7 | from TTS.vocoder.configs import WavernnConfig 8 | from TTS.vocoder.datasets.preprocess import load_wav_data 9 | from TTS.vocoder.models.wavernn import Wavernn 10 | 11 | output_path = os.path.dirname(os.path.abspath(__file__)) 12 | config = WavernnConfig( 13 | batch_size=64, 14 | eval_batch_size=16, 15 | num_loader_workers=4, 16 | num_eval_loader_workers=4, 17 | run_eval=True, 18 | test_delay_epochs=-1, 19 | epochs=10000, 20 | seq_len=1280, 21 | pad_short=2000, 22 | use_noise_augment=False, 23 | eval_split_size=10, 24 | print_step=25, 25 | print_eval=True, 26 | mixed_precision=False, 27 | lr=1e-4, 28 | grad_clip=4, 29 | data_path=os.path.join(output_path, "../thorsten-de/wavs/"), 30 | output_path=output_path, 31 | ) 32 | 33 | # download dataset if not already present 34 | if not os.path.exists(config.data_path): 35 | print("Downloading dataset") 36 | download_path = os.path.abspath(os.path.join(os.path.abspath(config.data_path), "../../")) 37 | download_thorsten_de(download_path) 38 | 39 | # init audio processor 40 | ap = AudioProcessor(**config.audio.to_dict()) 41 | 42 | # load training samples 43 | eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size) 44 | 45 | # init model 46 | model = Wavernn(config) 47 | 48 | # init the trainer and 🚀 49 | trainer = Trainer( 50 | TrainerArgs(), 51 | config, 52 | output_path, 53 | model=model, 54 | train_samples=train_samples, 55 | eval_samples=eval_samples, 56 | 
training_assets={"audio_processor": ap}, 57 | ) 58 | trainer.fit() 59 | -------------------------------------------------------------------------------- /TTS/vocoder/README.md: -------------------------------------------------------------------------------- 1 | # Mozilla TTS Vocoders (Experimental) 2 | 3 | Here there are vocoder model implementations which can be combined with the other TTS models. 4 | 5 | Currently, following models are implemented: 6 | 7 | - Melgan 8 | - MultiBand-Melgan 9 | - ParallelWaveGAN 10 | - GAN-TTS (Discriminator Only) 11 | 12 | It is also very easy to adapt different vocoder models as we provide a flexible and modular (but not too modular) framework. 13 | 14 | ## Training a model 15 | 16 | You can see here an example (Soon)[Colab Notebook]() training MelGAN with LJSpeech dataset. 17 | 18 | In order to train a new model, you need to gather all wav files into a folder and give this folder to `data_path` in '''config.json''' 19 | 20 | You need to define other relevant parameters in your ```config.json``` and then start traning with the following command. 21 | 22 | ```CUDA_VISIBLE_DEVICES='0' python tts/bin/train_vocoder.py --config_path path/to/config.json``` 23 | 24 | Example config files can be found under `tts/vocoder/configs/` folder. 25 | 26 | You can continue a previous training run by the following command. 27 | 28 | ```CUDA_VISIBLE_DEVICES='0' python tts/bin/train_vocoder.py --continue_path path/to/your/model/folder``` 29 | 30 | You can fine-tune a pre-trained model by the following command. 31 | 32 | ```CUDA_VISIBLE_DEVICES='0' python tts/bin/train_vocoder.py --restore_path path/to/your/model.pth``` 33 | 34 | Restoring a model starts a new training in a different folder. It only restores model weights with the given checkpoint file. However, continuing a training starts from the same directory where the previous training run left off. 35 | 36 | You can also follow your training runs on Tensorboard as you do with our TTS models. 37 | 38 | ## Acknowledgement 39 | Thanks to @kan-bayashi for his [repository](https://github.com/kan-bayashi/ParallelWaveGAN) being the start point of our work. 
40 | -------------------------------------------------------------------------------- /TTS/vocoder/layers/pqmf.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn.functional as F 4 | from scipy import signal as sig 5 | 6 | 7 | # adapted from 8 | # https://github.com/kan-bayashi/ParallelWaveGAN/tree/master/parallel_wavegan 9 | class PQMF(torch.nn.Module): 10 | def __init__(self, N=4, taps=62, cutoff=0.15, beta=9.0): 11 | super().__init__() 12 | 13 | self.N = N 14 | self.taps = taps 15 | self.cutoff = cutoff 16 | self.beta = beta 17 | 18 | QMF = sig.firwin(taps + 1, cutoff, window=("kaiser", beta))  # prototype low-pass filter 19 | H = np.zeros((N, len(QMF)))  # analysis filter bank 20 | G = np.zeros((N, len(QMF)))  # synthesis filter bank 21 | for k in range(N): 22 | constant_factor = ( 23 | (2 * k + 1) * (np.pi / (2 * N)) * (np.arange(taps + 1) - ((taps - 1) / 2)) 24 | ) # TODO: (taps - 1) -> taps 25 | phase = (-1) ** k * np.pi / 4 26 | H[k] = 2 * QMF * np.cos(constant_factor + phase)  # cosine-modulate the prototype for sub-band k 27 | 28 | G[k] = 2 * QMF * np.cos(constant_factor - phase) 29 | 30 | H = torch.from_numpy(H[:, None, :]).float() 31 | G = torch.from_numpy(G[None, :, :]).float() 32 | 33 | self.register_buffer("H", H) 34 | self.register_buffer("G", G) 35 | 36 | updown_filter = torch.zeros((N, N, N)).float()  # identity filter used to upsample by N in synthesis 37 | for k in range(N): 38 | updown_filter[k, k, 0] = 1.0 39 | self.register_buffer("updown_filter", updown_filter) 40 | self.N = N 41 | 42 | self.pad_fn = torch.nn.ConstantPad1d(taps // 2, 0.0) 43 | 44 | def forward(self, x): 45 | return self.analysis(x) 46 | 47 | def analysis(self, x): 48 | # split the full-band signal [B, 1, T] into N sub-bands [B, N, T // N] 49 | return F.conv1d(x, self.H, padding=self.taps // 2, stride=self.N) 50 | 51 | def synthesis(self, x): 52 | # upsample each sub-band by N, then apply the synthesis filters to rebuild the full-band signal 53 | x = F.conv_transpose1d(x, self.updown_filter * self.N, stride=self.N) 54 | x = F.conv1d(x, self.G, padding=self.taps // 2) 55 | return x 56 | 
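A quick shape check of the analysis/synthesis round trip (a usage sketch; the reconstruction is approximate, see the TODO above regarding the filter delay):

```python
import torch

pqmf = PQMF(N=4)
wav = torch.randn(1, 1, 16000)    # [batch, 1, samples]
subbands = pqmf.analysis(wav)     # [1, 4, 4000]: N sub-bands, each at 1/N of the sample rate
recon = pqmf.synthesis(subbands)  # [1, 1, 16000]: approximate reconstruction of the input
print(subbands.shape, recon.shape)
```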
", 32 | "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫʲ" 33 | }, 34 | 35 | "hidden_size": 128, 36 | "embedding_size": 256, 37 | "text_cleaner": "english_cleaners", 38 | 39 | "epochs": 2000, 40 | "lr": 0.003, 41 | "lr_patience": 5, 42 | "lr_decay": 0.5, 43 | "batch_size": 2, 44 | "r": 5, 45 | "mk": 1.0, 46 | "num_loader_workers": 0, 47 | "memory_size": 5, 48 | 49 | "save_step": 200, 50 | "data_path": "tests/data/ljspeech/", 51 | "output_path": "result", 52 | "min_seq_len": 0, 53 | "max_seq_len": 300, 54 | "log_dir": "tests/outputs/", 55 | 56 | 57 | "use_speaker_embedding": false, 58 | "use_gst": true, 59 | "gst": { 60 | "gst_style_input": null, 61 | 62 | 63 | 64 | "gst_use_speaker_embedding": true, 65 | "gst_embedding_dim": 512, 66 | "gst_num_heads": 4, 67 | "gst_num_style_tokens": 10 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from TTS.config import BaseDatasetConfig 4 | from TTS.utils.generic_utils import get_cuda 5 | 6 | 7 | def get_device_id(): 8 | use_cuda, _ = get_cuda() 9 | if use_cuda: 10 | if "CUDA_VISIBLE_DEVICES" in os.environ and os.environ["CUDA_VISIBLE_DEVICES"] != "": 11 | GPU_ID = os.environ["CUDA_VISIBLE_DEVICES"].split(",")[0] 12 | else: 13 | GPU_ID = "0" 14 | else: 15 | GPU_ID = "" 16 | return GPU_ID 17 | 18 | 19 | def get_tests_path(): 20 | """Returns the path to the test directory.""" 21 | return os.path.dirname(os.path.realpath(__file__)) 22 | 23 | 24 | def get_tests_input_path(): 25 | """Returns the path to the test data directory.""" 26 | return os.path.join(get_tests_path(), "inputs") 27 | 28 | 29 | def get_tests_data_path(): 30 | """Returns the path to the test data directory.""" 31 | return os.path.join(get_tests_path(), "data") 32 | 33 | 34 | def get_tests_output_path(): 35 | """Returns the path to the directory for test outputs.""" 36 | return os.path.join(get_tests_path(), "outputs") 37 | 38 | 39 | def run_cli(command): 40 | exit_status = os.system(command) 41 | assert exit_status == 0, f" [!] command `{command}` failed." 42 | 43 | 44 | def get_test_data_config(): 45 | return BaseDatasetConfig(name="ljspeech", path="tests/data/ljspeech/", meta_file_train="metadata.csv") 46 | 47 | 48 | def assertHasAttr(test_obj, obj, intendedAttr): 49 | # from https://stackoverflow.com/questions/48078636/pythons-unittest-lacks-an-asserthasattr-method-what-should-i-use-instead 50 | testBool = hasattr(obj, intendedAttr) 51 | test_obj.assertTrue(testBool, msg=f"obj lacking an attribute. obj: {obj}, intendedAttr: {intendedAttr}") 52 | 53 | 54 | def assertHasNotAttr(test_obj, obj, intendedAttr): 55 | testBool = hasattr(obj, intendedAttr) 56 | test_obj.assertFalse(testBool, msg=f"obj should not have an attribute. 
obj: {obj}, intendedAttr: {intendedAttr}") 57 | -------------------------------------------------------------------------------- /TTS/tts/utils/text/phonemizers/__init__.py: -------------------------------------------------------------------------------- 1 | from TTS.tts.utils.text.phonemizers.base import BasePhonemizer 2 | from TTS.tts.utils.text.phonemizers.espeak_wrapper import ESpeak 3 | from TTS.tts.utils.text.phonemizers.gruut_wrapper import Gruut 4 | from TTS.tts.utils.text.phonemizers.ja_jp_phonemizer import JA_JP_Phonemizer 5 | from TTS.tts.utils.text.phonemizers.zh_cn_phonemizer import ZH_CN_Phonemizer 6 | 7 | PHONEMIZERS = {b.name(): b for b in (ESpeak, Gruut, JA_JP_Phonemizer)} 8 | 9 | 10 | ESPEAK_LANGS = list(ESpeak.supported_languages().keys()) 11 | GRUUT_LANGS = list(Gruut.supported_languages()) 12 | 13 | 14 | # Dict setting default phonemizers for each language 15 | # Add Gruut languages 16 | _ = [Gruut.name()] * len(GRUUT_LANGS) 17 | DEF_LANG_TO_PHONEMIZER = dict(list(zip(GRUUT_LANGS, _))) 18 | 19 | 20 | # Add ESpeak languages and override any existing ones 21 | _ = [ESpeak.name()] * len(ESPEAK_LANGS) 22 | _new_dict = dict(list(zip(list(ESPEAK_LANGS), _))) 23 | DEF_LANG_TO_PHONEMIZER.update(_new_dict) 24 | 25 | # Force default for some languages 26 | DEF_LANG_TO_PHONEMIZER["en"] = DEF_LANG_TO_PHONEMIZER["en-us"] 27 | DEF_LANG_TO_PHONEMIZER["ja-jp"] = JA_JP_Phonemizer.name() 28 | DEF_LANG_TO_PHONEMIZER["zh-cn"] = ZH_CN_Phonemizer.name() 29 | 30 | 31 | def get_phonemizer_by_name(name: str, **kwargs) -> BasePhonemizer: 32 | """Initialize a phonemizer by name 33 | 34 | Args: 35 | name (str): 36 | Name of the phonemizer that should match `phonemizer.name()`. 37 | 38 | kwargs (dict): 39 | Extra keyword arguments that should be passed to the phonemizer. 40 | """ 41 | if name == "espeak": 42 | return ESpeak(**kwargs) 43 | if name == "gruut": 44 | return Gruut(**kwargs) 45 | if name == "zh_cn_phonemizer": 46 | return ZH_CN_Phonemizer(**kwargs) 47 | if name == "ja_jp_phonemizer": 48 | return JA_JP_Phonemizer(**kwargs) 49 | raise ValueError(f"Phonemizer {name} not found") 50 | 51 | 52 | if __name__ == "__main__": 53 | print(DEF_LANG_TO_PHONEMIZER) 54 | -------------------------------------------------------------------------------- /hubconf.py: -------------------------------------------------------------------------------- 1 | dependencies = [ 2 | 'torch', 'gdown', 'pysbd', 'gruut', 'anyascii', 'pypinyin', 'coqpit', 'mecab-python3', 'unidic-lite' 3 | ] 4 | import torch 5 | 6 | from TTS.utils.manage import ModelManager 7 | from TTS.utils.synthesizer import Synthesizer 8 | 9 | 10 | def tts(model_name='tts_models/en/ljspeech/tacotron2-DCA', 11 | vocoder_name=None, 12 | use_cuda=False): 13 | """TTS entry point for PyTorch Hub that provides a Synthesizer object to synthesize speech from a given text. 14 | 15 | Example: 16 | >>> synthesizer = torch.hub.load('coqui-ai/TTS', 'tts', source='github') 17 | >>> wavs = synthesizer.tts("This is a test! This is also a test!!") 18 | `wavs` is a list of the amplitude values of the synthesized speech. 19 | 20 | Args: 21 | model_name (str, optional): One of the model names from .models.json. Defaults to 'tts_models/en/ljspeech/tacotron2-DCA'. 22 | vocoder_name (str, optional): One of the model names from .models.json. Defaults to the `default_vocoder` of the chosen model. 23 | use_cuda (bool, optional): Run inference on CUDA. Defaults to False. 24 | 25 | Returns: 26 | TTS.utils.synthesizer.Synthesizer: Synthesizer object wrapping both vocoder and tts models.
27 | """ 28 | manager = ModelManager() 29 | 30 | model_path, config_path, model_item = manager.download_model(model_name) 31 | vocoder_name = model_item[ 32 | 'default_vocoder'] if vocoder_name is None else vocoder_name 33 | vocoder_path, vocoder_config_path, _ = manager.download_model(vocoder_name) 34 | 35 | # create synthesizer 36 | synt = Synthesizer(tts_checkpoint=model_path, 37 | tts_config_path=config_path, 38 | vocoder_checkpoint=vocoder_path, 39 | vocoder_config=vocoder_config_path, 40 | use_cuda=use_cuda) 41 | return synt 42 | 43 | 44 | if __name__ == '__main__': 45 | synthesizer = torch.hub.load('coqui-ai/TTS:dev', 'tts', source='github') 46 | synthesizer.tts("This is a test!") 47 | -------------------------------------------------------------------------------- /TTS/model.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | from typing import Dict 3 | 4 | import torch 5 | from coqpit import Coqpit 6 | from trainer import TrainerModel 7 | 8 | # pylint: skip-file 9 | 10 | 11 | class BaseTrainerModel(TrainerModel): 12 | """BaseTrainerModel model expanding TrainerModel with required functions by 🐸TTS. 13 | 14 | Every new 🐸TTS model must inherit it. 15 | """ 16 | 17 | @staticmethod 18 | @abstractmethod 19 | def init_from_config(config: Coqpit): 20 | """Init the model and all its attributes from the given config. 21 | 22 | Override this depending on your model. 23 | """ 24 | ... 25 | 26 | @abstractmethod 27 | def inference(self, input: torch.Tensor, aux_input={}) -> Dict: 28 | """Forward pass for inference. 29 | 30 | It must return a dictionary with the main model output and all the auxiliary outputs. The key ```model_outputs``` 31 | is considered to be the main output and you can add any other auxiliary outputs as you want. 32 | 33 | We don't use `*kwargs` since it is problematic with the TorchScript API. 34 | 35 | Args: 36 | input (torch.Tensor): [description] 37 | aux_input (Dict): Auxiliary inputs like speaker embeddings, durations etc. 38 | 39 | Returns: 40 | Dict: [description] 41 | """ 42 | outputs_dict = {"model_outputs": None} 43 | ... 44 | return outputs_dict 45 | 46 | @abstractmethod 47 | def load_checkpoint(self, config: Coqpit, checkpoint_path: str, eval: bool = False, strict: bool = True) -> None: 48 | """Load a model checkpoint gile and get ready for training or inference. 49 | 50 | Args: 51 | config (Coqpit): Model configuration. 52 | checkpoint_path (str): Path to the model checkpoint file. 53 | eval (bool, optional): If true, init model for inference else for training. Defaults to False. 54 | strcit (bool, optional): Match all checkpoint keys to model's keys. Defaults to True. 55 | """ 56 | ... 57 | -------------------------------------------------------------------------------- /TTS/tts/utils/text/phonemizers/zh_cn_phonemizer.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | from TTS.tts.utils.text.chinese_mandarin.phonemizer import chinese_text_to_phonemes 4 | from TTS.tts.utils.text.phonemizers.base import BasePhonemizer 5 | 6 | _DEF_ZH_PUNCS = "、.,[]()?!〽~『』「」【】" 7 | 8 | 9 | class ZH_CN_Phonemizer(BasePhonemizer): 10 | """🐸TTS Zh-Cn phonemizer using functions in `TTS.tts.utils.text.chinese_mandarin.phonemizer` 11 | 12 | Args: 13 | punctuations (str): 14 | Set of characters to be treated as punctuation. Defaults to `_DEF_ZH_PUNCS`. 15 | 16 | keep_puncs (bool): 17 | If True, keep the punctuations after phonemization. 
-------------------------------------------------------------------------------- /TTS/tts/utils/text/phonemizers/zh_cn_phonemizer.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | from TTS.tts.utils.text.chinese_mandarin.phonemizer import chinese_text_to_phonemes 4 | from TTS.tts.utils.text.phonemizers.base import BasePhonemizer 5 | 6 | _DEF_ZH_PUNCS = "、.,[]()?!〽~『』「」【】" 7 | 8 | 9 | class ZH_CN_Phonemizer(BasePhonemizer): 10 | """🐸TTS Zh-Cn phonemizer using functions in `TTS.tts.utils.text.chinese_mandarin.phonemizer` 11 | 12 | Args: 13 | punctuations (str): 14 | Set of characters to be treated as punctuation. Defaults to `_DEF_ZH_PUNCS`. 15 | 16 | keep_puncs (bool): 17 | If True, keep the punctuations after phonemization. 
Defaults to False. 18 | 19 | Example :: 20 | 21 | "这是,样本中文。" -> `d|ʒ|ø|4| |ʂ|ʏ|4| |,| |i|ɑ|ŋ|4|b|œ|n|3| |d|ʒ|o|ŋ|1|w|œ|n|2| |。` 22 | 23 | TODO: someone with Mandarin knowledge should check this implementation 24 | """ 25 | 26 | language = "zh-cn" 27 | 28 | def __init__(self, punctuations=_DEF_ZH_PUNCS, keep_puncs=False, **kwargs): # pylint: disable=unused-argument 29 | super().__init__(self.language, punctuations=punctuations, keep_puncs=keep_puncs) 30 | 31 | @staticmethod 32 | def name(): 33 | return "zh_cn_phonemizer" 34 | 35 | @staticmethod 36 | def phonemize_zh_cn(text: str, separator: str = "|") -> str: 37 | ph = chinese_text_to_phonemes(text, separator) 38 | return ph 39 | 40 | def _phonemize(self, text, separator): 41 | return self.phonemize_zh_cn(text, separator) 42 | 43 | @staticmethod 44 | def supported_languages() -> Dict: 45 | return {"zh-cn": "Chinese (China)"} 46 | 47 | def version(self) -> str: 48 | return "0.0.1" 49 | 50 | def is_available(self) -> bool: 51 | return True 52 | 53 | 54 | # if __name__ == "__main__": 55 | # text = "这是,样本中文。" 56 | # e = ZH_CN_Phonemizer() 57 | # print(e.supported_languages()) 58 | # print(e.version()) 59 | # print(e.language) 60 | # print(e.name()) 61 | # print(e.is_available()) 62 | # print("`" + e.phonemize(text) + "`") 63 | -------------------------------------------------------------------------------- /TTS/encoder/configs/base_encoder_config.py: -------------------------------------------------------------------------------- 1 | from dataclasses import asdict, dataclass, field 2 | from typing import Dict, List 3 | 4 | from coqpit import MISSING 5 | 6 | from TTS.config.shared_configs import BaseAudioConfig, BaseDatasetConfig, BaseTrainingConfig 7 | 8 | 9 | @dataclass 10 | class BaseEncoderConfig(BaseTrainingConfig): 11 | """Defines parameters for a Generic Encoder model.""" 12 | 13 | model: str = None 14 | audio: BaseAudioConfig = field(default_factory=BaseAudioConfig) 15 | datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()]) 16 | # model params 17 | model_params: Dict = field( 18 | default_factory=lambda: { 19 | "model_name": "lstm", 20 | "input_dim": 80, 21 | "proj_dim": 256, 22 | "lstm_dim": 768, 23 | "num_lstm_layers": 3, 24 | "use_lstm_with_projection": True, 25 | } 26 | ) 27 | 28 | audio_augmentation: Dict = field(default_factory=lambda: {}) 29 | 30 | # training params 31 | epochs: int = 10000 32 | loss: str = "angleproto" 33 | grad_clip: float = 3.0 34 | lr: float = 0.0001 35 | optimizer: str = "radam" 36 | optimizer_params: Dict = field(default_factory=lambda: {"betas": [0.9, 0.999], "weight_decay": 0}) 37 | lr_decay: bool = False 38 | warmup_steps: int = 4000 39 | 40 | # logging params 41 | tb_model_param_stats: bool = False 42 | steps_plot_stats: int = 10 43 | save_step: int = 1000 44 | print_step: int = 20 45 | run_eval: bool = False 46 | 47 | # data loader 48 | num_classes_in_batch: int = MISSING 49 | num_utter_per_class: int = MISSING 50 | eval_num_classes_in_batch: int = None 51 | eval_num_utter_per_class: int = None 52 | 53 | num_loader_workers: int = MISSING 54 | voice_len: float = 1.6 55 | 56 | def check_values(self): 57 | super().check_values() 58 | c = asdict(self) 59 | assert ( 60 | c["model_params"]["input_dim"] == self.audio.num_mels 61 | ), " [!] model input dimension must be equal to melspectrogram dimension."
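For orientation, a usable config needs the `MISSING` data-loader fields filled in; a minimal sketch with illustrative values only:

```python
config = BaseEncoderConfig(
    model="lstm",
    num_classes_in_batch=32,  # classes (e.g. speakers) sampled per batch
    num_utter_per_class=4,    # utterances sampled per class
    num_loader_workers=4,
)
config.check_values()  # asserts model_params["input_dim"] == audio.num_mels
```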
62 | -------------------------------------------------------------------------------- /docs/source/models/forward_tts.md: -------------------------------------------------------------------------------- 1 | # Forward TTS model(s) 2 | 3 | A general feed-forward TTS model implementation that can be configured into different architectures by setting different 4 | encoder and decoder networks. It can be trained with either pre-computed durations (from a pre-trained Tacotron) or 5 | an alignment network that learns the text to audio alignment from the input data. 6 | 7 | Currently we provide the following pre-configured architectures: 8 | 9 | - **FastSpeech:** 10 | 11 | A feed-forward TTS model that uses Feed Forward Transformer (FFT) modules as the encoder and decoder. 12 | 13 | - **FastPitch:** 14 | 15 | It uses the same FastSpeech architecture but is conditioned on fundamental frequency (f0) contours, with the 16 | promise of more expressive speech. 17 | 18 | - **SpeedySpeech:** 19 | 20 | It uses Residual Convolution layers instead of Transformers, which leads to a more compute-friendly model. 21 | 22 | - **FastSpeech2 (TODO):** 23 | 24 | Similar to FastPitch, but it additionally conditions on spectral energy values. 25 | 26 | ## Important resources & papers 27 | - FastPitch: https://arxiv.org/abs/2006.06873 28 | - SpeedySpeech: https://arxiv.org/abs/2008.03802 29 | - FastSpeech: https://arxiv.org/pdf/1905.09263 30 | - FastSpeech2: https://arxiv.org/abs/2006.04558 31 | - Aligner Network: https://arxiv.org/abs/2108.10447 32 | - What is Pitch: https://www.britannica.com/topic/pitch-speech 33 | 34 | 35 | ## ForwardTTSArgs 36 | ```{eval-rst} 37 | .. autoclass:: TTS.tts.models.forward_tts.ForwardTTSArgs 38 | :members: 39 | ``` 40 | 41 | ## ForwardTTS Model 42 | ```{eval-rst} 43 | .. autoclass:: TTS.tts.models.forward_tts.ForwardTTS 44 | :members: 45 | ``` 46 | 47 | ## FastPitchConfig 48 | ```{eval-rst} 49 | .. autoclass:: TTS.tts.configs.fast_pitch_config.FastPitchConfig 50 | :members: 51 | ``` 52 | 53 | ## SpeedySpeechConfig 54 | ```{eval-rst} 55 | .. autoclass:: TTS.tts.configs.speedy_speech_config.SpeedySpeechConfig 56 | :members: 57 | ``` 58 | 59 | ## FastSpeechConfig 60 | ```{eval-rst} 61 | .. autoclass:: TTS.tts.configs.fast_speech_config.FastSpeechConfig 62 | :members: 63 | ```
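As a quick starting point, a FastPitch setup can be sketched like this (all values are illustrative; see the config classes above for the full field list):

```python
from TTS.tts.configs.fast_pitch_config import FastPitchConfig

# FastPitch variant of ForwardTTS; compute_f0 enables the pitch conditioning
# described above (field values here are only examples)
config = FastPitchConfig(
    batch_size=32,
    use_phonemes=True,
    phoneme_language="en-us",
    compute_f0=True,
    f0_cache_path="f0_cache/",
)
```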
64 | 65 | 66 | -------------------------------------------------------------------------------- /TTS/vocoder/layers/hifigan.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | 4 | # pylint: disable=dangerous-default-value 5 | class ResStack(nn.Module): 6 | def __init__(self, kernel, channel, padding, dilations=[1, 3, 5]): 7 | super().__init__() 8 | resstack = [] 9 | for dilation in dilations: 10 | resstack += [ 11 | nn.LeakyReLU(0.2), 12 | nn.ReflectionPad1d(dilation), 13 | nn.utils.weight_norm(nn.Conv1d(channel, channel, kernel_size=kernel, dilation=dilation)), 14 | nn.LeakyReLU(0.2), 15 | nn.ReflectionPad1d(padding), 16 | nn.utils.weight_norm(nn.Conv1d(channel, channel, kernel_size=1)), 17 | ] 18 | self.resstack = nn.Sequential(*resstack) 19 | 20 | self.shortcut = nn.utils.weight_norm(nn.Conv1d(channel, channel, kernel_size=1)) 21 | 22 | def forward(self, x): 23 | x1 = self.shortcut(x) 24 | x2 = self.resstack(x) 25 | return x1 + x2 26 | 27 | def remove_weight_norm(self): 28 | nn.utils.remove_weight_norm(self.shortcut) 29 | nn.utils.remove_weight_norm(self.resstack[2]) 30 | nn.utils.remove_weight_norm(self.resstack[5]) 31 | nn.utils.remove_weight_norm(self.resstack[8]) 32 | nn.utils.remove_weight_norm(self.resstack[11]) 33 | nn.utils.remove_weight_norm(self.resstack[14]) 34 | nn.utils.remove_weight_norm(self.resstack[17]) 35 | 36 | 37 | class MRF(nn.Module): 38 | def __init__(self, kernels, channel, dilations=[1, 3, 5]): # pylint: disable=dangerous-default-value 39 | super().__init__() 40 | self.resblock1 = ResStack(kernels[0], channel, 0, dilations) 41 | self.resblock2 = ResStack(kernels[1], channel, 6, dilations) 42 | self.resblock3 = ResStack(kernels[2], channel, 12, dilations) 43 | 44 | def forward(self, x): 45 | x1 = self.resblock1(x) 46 | x2 = self.resblock2(x) 47 | x3 = self.resblock3(x) 48 | return x1 + x2 + x3 49 | 50 | def remove_weight_norm(self): 51 | self.resblock1.remove_weight_norm() 52 | self.resblock2.remove_weight_norm() 53 | self.resblock3.remove_weight_norm() 54 | 
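A quick sanity check of the multi-receptive-field fusion (a usage sketch; kernels `[3, 7, 11]` are the values HiFi-GAN typically uses, and the hard-coded paddings above are sized so that, for these kernels, the time dimension is preserved):

```python
import torch

mrf = MRF(kernels=[3, 7, 11], channel=64)
x = torch.randn(2, 64, 100)  # [batch, channels, time]
y = mrf(x)                   # sum of the three residual stacks
print(y.shape)               # torch.Size([2, 64, 100])
```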
-------------------------------------------------------------------------------- /TTS/tts/utils/text/phonemizers/multi_phonemizer.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List 2 | 3 | from TTS.tts.utils.text.phonemizers import DEF_LANG_TO_PHONEMIZER, get_phonemizer_by_name 4 | 5 | 6 | class MultiPhonemizer: 7 | """🐸TTS multi-phonemizer that manages phonemizers for multiple languages 8 | 9 | Args: 10 | custom_lang_to_phonemizer (Dict): 11 | Custom phonemizer mapping if you want to change the defaults. In the format of 12 | `{"lang_code": "phonemizer_name"}`. When it is None, `DEF_LANG_TO_PHONEMIZER` is used. Defaults to `{}`. 
13 | 14 | TODO: find a way to pass custom kwargs to the phonemizers 15 | """ 16 | 17 | lang_to_phonemizer_name = DEF_LANG_TO_PHONEMIZER 18 | language = "multi-lingual" 19 | 20 | def __init__(self, custom_lang_to_phonemizer: Dict = {}) -> None: # pylint: disable=dangerous-default-value 21 | self.lang_to_phonemizer_name.update(custom_lang_to_phonemizer) 22 | self.lang_to_phonemizer = self.init_phonemizers(self.lang_to_phonemizer_name) 23 | 24 | @staticmethod 25 | def init_phonemizers(lang_to_phonemizer_name: Dict) -> Dict: 26 | lang_to_phonemizer = {} 27 | for k, v in lang_to_phonemizer_name.items(): 28 | phonemizer = get_phonemizer_by_name(v, language=k) 29 | lang_to_phonemizer[k] = phonemizer 30 | return lang_to_phonemizer 31 | 32 | @staticmethod 33 | def name(): 34 | return "multi-phonemizer" 35 | 36 | def phonemize(self, text, language, separator="|"): 37 | return self.lang_to_phonemizer[language].phonemize(text, separator) 38 | 39 | def supported_languages(self) -> List: 40 | return list(self.lang_to_phonemizer_name.keys()) 41 | 42 | 43 | # if __name__ == "__main__": 44 | # texts = { 45 | # "tr": "Merhaba, bu Türkçe bir örnek!", 46 | # "en-us": "Hello, this is an English example!", 47 | # "de": "Hallo, das ist ein deutsches Beispiel!", 48 | # "zh-cn": "这是中国的例子", 49 | # } 50 | # phonemes = {} 51 | # ph = MultiPhonemizer() 52 | # for lang, text in texts.items(): 53 | # phoneme = ph.phonemize(text, lang) 54 | # phonemes[lang] = phoneme 55 | # print(phonemes) 56 | -------------------------------------------------------------------------------- /TTS/vocoder/models/base_vocoder.py: -------------------------------------------------------------------------------- 1 | from coqpit import Coqpit 2 | 3 | from TTS.model import BaseTrainerModel 4 | 5 | # pylint: skip-file 6 | 7 | 8 | class BaseVocoder(BaseTrainerModel): 9 | """Base `vocoder` class. Every new `vocoder` model must inherit this. 10 | 11 | It defines `vocoder` specific functions on top of `Model`. 12 | 13 | Notes on input/output tensor shapes: 14 | Any input or output tensor of the model must be shaped as 15 | 16 | - 3D tensors `batch x time x channels` 17 | - 2D tensors `batch x channels` 18 | - 1D tensors `batch x 1` 19 | """ 20 | 21 | def __init__(self, config): 22 | super().__init__() 23 | self._set_model_args(config) 24 | 25 | def _set_model_args(self, config: Coqpit): 26 | """Setup model args based on the config type. 27 | 28 | If the config is for training with a name like "*Config", then the model args are embedded in the 29 | config.model_args 30 | 31 | If the config is for the model with a name like "*Args", then we assign them directly.
32 | """ 33 | # don't use isintance not to import recursively 34 | if "Config" in config.__class__.__name__: 35 | if "characters" in config: 36 | _, self.config, num_chars = self.get_characters(config) 37 | self.config.num_chars = num_chars 38 | if hasattr(self.config, "model_args"): 39 | config.model_args.num_chars = num_chars 40 | if "model_args" in config: 41 | self.args = self.config.model_args 42 | # This is for backward compatibility 43 | if "model_params" in config: 44 | self.args = self.config.model_params 45 | else: 46 | self.config = config 47 | if "model_args" in config: 48 | self.args = self.config.model_args 49 | # This is for backward compatibility 50 | if "model_params" in config: 51 | self.args = self.config.model_params 52 | else: 53 | raise ValueError("config must be either a *Config or *Args") 54 | -------------------------------------------------------------------------------- /tests/tts_tests/test_vits_d-vectors_train.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import shutil 4 | 5 | from tests import get_device_id, get_tests_output_path, run_cli 6 | from TTS.tts.configs.vits_config import VitsConfig 7 | 8 | config_path = os.path.join(get_tests_output_path(), "test_model_config.json") 9 | output_path = os.path.join(get_tests_output_path(), "train_outputs") 10 | 11 | 12 | config = VitsConfig( 13 | batch_size=2, 14 | eval_batch_size=2, 15 | num_loader_workers=0, 16 | num_eval_loader_workers=0, 17 | text_cleaner="english_cleaners", 18 | use_phonemes=True, 19 | phoneme_language="en-us", 20 | phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", 21 | run_eval=True, 22 | test_delay_epochs=-1, 23 | epochs=1, 24 | print_step=1, 25 | print_eval=True, 26 | test_sentences=[ 27 | ["Be a voice, not an echo.", "ljspeech-0"], 28 | ], 29 | ) 30 | # set audio config 31 | config.audio.do_trim_silence = True 32 | config.audio.trim_db = 60 33 | 34 | # active multispeaker d-vec mode 35 | config.model_args.use_d_vector_file = True 36 | config.model_args.d_vector_file = "tests/data/ljspeech/speakers.json" 37 | config.model_args.d_vector_dim = 256 38 | 39 | 40 | config.save_json(config_path) 41 | 42 | # train the model for one epoch 43 | command_train = ( 44 | f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " 45 | f"--coqpit.output_path {output_path} " 46 | "--coqpit.datasets.0.name ljspeech " 47 | "--coqpit.datasets.0.meta_file_train metadata.csv " 48 | "--coqpit.datasets.0.meta_file_val metadata.csv " 49 | "--coqpit.datasets.0.path tests/data/ljspeech " 50 | "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " 51 | "--coqpit.test_delay_epochs 0" 52 | ) 53 | run_cli(command_train) 54 | 55 | # Find latest folder 56 | continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) 57 | 58 | # restore the model and continue training for one more epoch 59 | command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " 60 | run_cli(command_train) 61 | shutil.rmtree(continue_path) 62 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .DEFAULT_GOAL := help 2 | .PHONY: test system-deps dev-deps deps style lint install help docs 3 | 4 | help: 5 | @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; 
{printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' 6 | 7 | target_dirs := tests TTS notebooks recipes 8 | 9 | test_all: ## run tests and don't stop on an error. 10 | nose2 --with-coverage --coverage TTS tests 11 | ./run_bash_tests.sh 12 | 13 | test: ## run tests. 14 | nose2 -F -v -B --with-coverage --coverage TTS tests 15 | 16 | test_vocoder: ## run vocoder tests. 17 | nose2 -F -v -B --with-coverage --coverage TTS tests.vocoder_tests 18 | 19 | test_tts: ## run tts tests. 20 | nose2 -F -v -B --with-coverage --coverage TTS tests.tts_tests 21 | 22 | test_aux: ## run aux tests. 23 | nose2 -F -v -B --with-coverage --coverage TTS tests.aux_tests 24 | ./run_bash_tests.sh 25 | 26 | test_zoo: ## run zoo tests. 27 | nose2 -F -v -B --with-coverage --coverage TTS tests.zoo_tests 28 | 29 | inference_tests: ## run inference tests. 30 | nose2 -F -v -B --with-coverage --coverage TTS tests.inference_tests 31 | 32 | data_tests: ## run data tests. 33 | nose2 -F -v -B --with-coverage --coverage TTS tests.data_tests 34 | 35 | test_text: ## run text tests. 36 | nose2 -F -v -B --with-coverage --coverage TTS tests.text_tests 37 | 38 | test_failed: ## only run tests failed the last time. 39 | nose2 -F -v -B --with-coverage --coverage TTS tests 40 | 41 | style: ## update code style. 42 | black ${target_dirs} 43 | isort ${target_dirs} 44 | 45 | lint: ## run pylint linter. 46 | pylint ${target_dirs} 47 | black ${target_dirs} --check 48 | isort ${target_dirs} --check-only 49 | 50 | system-deps: ## install linux system deps 51 | sudo apt-get install -y libsndfile1-dev 52 | 53 | dev-deps: ## install development deps 54 | pip install -r requirements.dev.txt 55 | 56 | doc-deps: ## install docs dependencies 57 | pip install -r docs/requirements.txt 58 | 59 | build-docs: ## build the docs 60 | cd docs && make clean && make build 61 | 62 | hub-deps: ## install deps for torch hub use 63 | pip install -r requirements.hub.txt 64 | 65 | deps: ## install 🐸 requirements. 66 | pip install -r requirements.txt 67 | 68 | install: ## install 🐸 TTS for development. 69 | pip install -e .[all] 70 | 71 | docs: ## build the docs 72 | $(MAKE) -C docs clean && $(MAKE) -C docs html 73 | -------------------------------------------------------------------------------- /TTS/tts/utils/text/phonemizers/ja_jp_phonemizer.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | from TTS.tts.utils.text.japanese.phonemizer import japanese_text_to_phonemes 4 | from TTS.tts.utils.text.phonemizers.base import BasePhonemizer 5 | 6 | _DEF_JA_PUNCS = "、.,[]()?!〽~『』「」【】" 7 | 8 | _TRANS_TABLE = {"、": ","} 9 | 10 | 11 | def trans(text): 12 | for i, j in _TRANS_TABLE.items(): 13 | text = text.replace(i, j) 14 | return text 15 | 16 | 17 | class JA_JP_Phonemizer(BasePhonemizer): 18 | """🐸TTS Ja-Jp phonemizer using functions in `TTS.tts.utils.text.japanese.phonemizer` 19 | 20 | TODO: someone with JA knowledge should check this implementation 21 | 22 | Example: 23 | 24 | >>> from TTS.tts.utils.text.phonemizers import JA_JP_Phonemizer 25 | >>> phonemizer = JA_JP_Phonemizer() 26 | >>> phonemizer.phonemize("どちらに行きますか?", separator="|") 27 | 'd|o|c|h|i|r|a|n|i|i|k|i|m|a|s|u|k|a|?' 
28 | 29 | """ 30 | 31 | language = "ja-jp" 32 | 33 | def __init__(self, punctuations=_DEF_JA_PUNCS, keep_puncs=True, **kwargs): # pylint: disable=unused-argument 34 | super().__init__(self.language, punctuations=punctuations, keep_puncs=keep_puncs) 35 | 36 | @staticmethod 37 | def name(): 38 | return "ja_jp_phonemizer" 39 | 40 | def _phonemize(self, text: str, separator: str = "|") -> str: 41 | ph = japanese_text_to_phonemes(text) 42 | if separator is not None or separator != "": 43 | return separator.join(ph) 44 | return ph 45 | 46 | def phonemize(self, text: str, separator="|") -> str: 47 | """Custom phonemize for JP_JA 48 | 49 | Skip pre-post processing steps used by the other phonemizers. 50 | """ 51 | return self._phonemize(text, separator) 52 | 53 | @staticmethod 54 | def supported_languages() -> Dict: 55 | return {"ja-jp": "Japanese (Japan)"} 56 | 57 | def version(self) -> str: 58 | return "0.0.1" 59 | 60 | def is_available(self) -> bool: 61 | return True 62 | 63 | 64 | # if __name__ == "__main__": 65 | # text = "これは、電話をかけるための私の日本語の例のテキストです。" 66 | # e = JA_JP_Phonemizer() 67 | # print(e.supported_languages()) 68 | # print(e.version()) 69 | # print(e.language) 70 | # print(e.name()) 71 | # print(e.is_available()) 72 | # print("`" + e.phonemize(text) + "`") 73 | -------------------------------------------------------------------------------- /TTS/vocoder/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from coqpit import Coqpit 4 | from torch.utils.data import Dataset 5 | 6 | from TTS.utils.audio import AudioProcessor 7 | from TTS.vocoder.datasets.gan_dataset import GANDataset 8 | from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data 9 | from TTS.vocoder.datasets.wavegrad_dataset import WaveGradDataset 10 | from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset 11 | 12 | 13 | def setup_dataset(config: Coqpit, ap: AudioProcessor, is_eval: bool, data_items: List, verbose: bool) -> Dataset: 14 | if config.model.lower() in "gan": 15 | dataset = GANDataset( 16 | ap=ap, 17 | items=data_items, 18 | seq_len=config.seq_len, 19 | hop_len=ap.hop_length, 20 | pad_short=config.pad_short, 21 | conv_pad=config.conv_pad, 22 | return_pairs=config.diff_samples_for_G_and_D if "diff_samples_for_G_and_D" in config else False, 23 | is_training=not is_eval, 24 | return_segments=not is_eval, 25 | use_noise_augment=config.use_noise_augment, 26 | use_cache=config.use_cache, 27 | verbose=verbose, 28 | ) 29 | dataset.shuffle_mapping() 30 | elif config.model.lower() == "wavegrad": 31 | dataset = WaveGradDataset( 32 | ap=ap, 33 | items=data_items, 34 | seq_len=config.seq_len, 35 | hop_len=ap.hop_length, 36 | pad_short=config.pad_short, 37 | conv_pad=config.conv_pad, 38 | is_training=not is_eval, 39 | return_segments=True, 40 | use_noise_augment=False, 41 | use_cache=config.use_cache, 42 | verbose=verbose, 43 | ) 44 | elif config.model.lower() == "wavernn": 45 | dataset = WaveRNNDataset( 46 | ap=ap, 47 | items=data_items, 48 | seq_len=config.seq_len, 49 | hop_len=ap.hop_length, 50 | pad=config.model_params.pad, 51 | mode=config.model_params.mode, 52 | mulaw=config.model_params.mulaw, 53 | is_training=not is_eval, 54 | verbose=verbose, 55 | ) 56 | else: 57 | raise ValueError(f" [!] 
Dataset for model {config.model.lower()} cannot be found.") 58 | return dataset 59 | -------------------------------------------------------------------------------- /.github/workflows/docker.yaml: -------------------------------------------------------------------------------- 1 | name: "Docker build and push" 2 | on: 3 | pull_request: 4 | push: 5 | branches: 6 | - main 7 | - dev 8 | tags: 9 | - v* 10 | jobs: 11 | docker-build: 12 | name: "Build and push Docker image" 13 | runs-on: ubuntu-20.04 14 | strategy: 15 | matrix: 16 | arch: ["amd64"] 17 | base: 18 | - "nvcr.io/nvidia/pytorch:22.03-py3" # GPU enabled 19 | - "ubuntu:20.04" # CPU only 20 | steps: 21 | - uses: actions/checkout@v2 22 | - name: Log in to the Container registry 23 | uses: docker/login-action@v1 24 | with: 25 | registry: ghcr.io 26 | username: ${{ github.actor }} 27 | password: ${{ secrets.GITHUB_TOKEN }} 28 | - name: Compute Docker tags, check VERSION file matches tag 29 | id: compute-tag 30 | run: | 31 | set -ex 32 | base="ghcr.io/coqui-ai/tts" 33 | tags="" # PR build 34 | 35 | if [[ ${{ matrix.base }} = "ubuntu:20.04" ]]; then 36 | base="ghcr.io/coqui-ai/tts-cpu" 37 | fi 38 | 39 | if [[ "${{ startsWith(github.ref, 'refs/heads/') }}" = "true" ]]; then 40 | # Push to branch 41 | github_ref="${{ github.ref }}" 42 | branch=${github_ref#*refs/heads/} # strip prefix to get branch name 43 | tags="${base}:${branch},${base}:${{ github.sha }}," 44 | elif [[ "${{ startsWith(github.ref, 'refs/tags/') }}" = "true" ]]; then 45 | VERSION="v$(cat TTS/VERSION)" 46 | if [[ "${{ github.ref }}" != "refs/tags/${VERSION}" ]]; then 47 | echo "Pushed tag does not match VERSION file. Aborting push." 48 | exit 1 49 | fi 50 | tags="${base}:${VERSION},${base}:latest,${base}:${{ github.sha }}" 51 | fi 52 | echo "::set-output name=tags::${tags}" 53 | - name: Set up QEMU 54 | uses: docker/setup-qemu-action@v1 55 | - name: Set up Docker Buildx 56 | id: buildx 57 | uses: docker/setup-buildx-action@v1 58 | - name: Build and push 59 | uses: docker/build-push-action@v2 60 | with: 61 | context: . 
62 | platforms: linux/${{ matrix.arch }} 63 | push: ${{ github.event_name == 'push' }} 64 | build-args: "BASE=${{ matrix.base }}" 65 | tags: ${{ steps.compute-tag.outputs.tags }} 66 | -------------------------------------------------------------------------------- /tests/vocoder_tests/test_wavegrad.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy as np 4 | import torch 5 | from torch import optim 6 | 7 | from TTS.vocoder.configs import WavegradConfig 8 | from TTS.vocoder.models.wavegrad import Wavegrad, WavegradArgs 9 | 10 | # pylint: disable=unused-variable 11 | 12 | torch.manual_seed(1) 13 | use_cuda = torch.cuda.is_available() 14 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 15 | 16 | 17 | class WavegradTrainTest(unittest.TestCase): 18 | def test_train_step(self): # pylint: disable=no-self-use 19 | """Test if all layers are updated in a basic training cycle""" 20 | input_dummy = torch.rand(8, 1, 20 * 300).to(device) 21 | mel_spec = torch.rand(8, 80, 20).to(device) 22 | 23 | criterion = torch.nn.L1Loss().to(device) 24 | args = WavegradArgs( 25 | in_channels=80, 26 | out_channels=1, 27 | upsample_factors=[5, 5, 3, 2, 2], 28 | upsample_dilations=[[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]], 29 | ) 30 | config = WavegradConfig(model_params=args) 31 | model = Wavegrad(config) 32 | 33 | model_ref = Wavegrad(config) 34 | model.train() 35 | model.to(device) 36 | betas = np.linspace(1e-6, 1e-2, 1000) 37 | model.compute_noise_level(betas) 38 | model_ref.load_state_dict(model.state_dict()) 39 | model_ref.to(device) 40 | count = 0 41 | for param, param_ref in zip(model.parameters(), model_ref.parameters()): 42 | assert (param - param_ref).sum() == 0, param 43 | count += 1 44 | optimizer = optim.Adam(model.parameters(), lr=0.001) 45 | for i in range(5): 46 | y_hat = model.forward(input_dummy, mel_spec, torch.rand(8).to(device)) 47 | optimizer.zero_grad() 48 | loss = criterion(y_hat, input_dummy) 49 | loss.backward() 50 | optimizer.step() 51 | # check parameter changes 52 | count = 0 53 | for param, param_ref in zip(model.parameters(), model_ref.parameters()): 54 | # ignore the pre-highway layer since it is updated conditionally 55 | # if count not in [145, 59]: 56 | assert (param != param_ref).any(), "param {} with shape {} not updated!! 
\n{}\n{}".format( 57 | count, param.shape, param, param_ref 58 | ) 59 | count += 1 60 | -------------------------------------------------------------------------------- /tests/tts_tests/test_tacotron_train.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import shutil 4 | 5 | from trainer import get_last_checkpoint 6 | 7 | from tests import get_device_id, get_tests_output_path, run_cli 8 | from TTS.tts.configs.tacotron_config import TacotronConfig 9 | 10 | config_path = os.path.join(get_tests_output_path(), "test_model_config.json") 11 | output_path = os.path.join(get_tests_output_path(), "train_outputs") 12 | 13 | 14 | config = TacotronConfig( 15 | batch_size=8, 16 | eval_batch_size=8, 17 | num_loader_workers=0, 18 | num_eval_loader_workers=0, 19 | text_cleaner="english_cleaners", 20 | use_phonemes=False, 21 | phoneme_language="en-us", 22 | phoneme_cache_path=os.path.join(get_tests_output_path(), "train_outputs/phoneme_cache/"), 23 | run_eval=True, 24 | test_delay_epochs=-1, 25 | epochs=1, 26 | print_step=1, 27 | test_sentences=[ 28 | "Be a voice, not an echo.", 29 | ], 30 | print_eval=True, 31 | r=5, 32 | max_decoder_steps=50, 33 | ) 34 | config.audio.do_trim_silence = True 35 | config.audio.trim_db = 60 36 | config.save_json(config_path) 37 | 38 | # train the model for one epoch 39 | command_train = ( 40 | f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " 41 | f"--coqpit.output_path {output_path} " 42 | "--coqpit.datasets.0.name ljspeech " 43 | "--coqpit.datasets.0.meta_file_train metadata.csv " 44 | "--coqpit.datasets.0.meta_file_val metadata.csv " 45 | "--coqpit.datasets.0.path tests/data/ljspeech " 46 | "--coqpit.test_delay_epochs 0" 47 | ) 48 | run_cli(command_train) 49 | 50 | # Find latest folder 51 | continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) 52 | 53 | # Inference using TTS API 54 | continue_config_path = os.path.join(continue_path, "config.json") 55 | continue_restore_path, _ = get_last_checkpoint(continue_path) 56 | out_wav_path = os.path.join(get_tests_output_path(), "output.wav") 57 | 58 | inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" 59 | run_cli(inference_command) 60 | 61 | # restore the model and continue training for one more epoch 62 | command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " 63 | run_cli(command_train) 64 | shutil.rmtree(continue_path) 65 | -------------------------------------------------------------------------------- /TTS/bin/find_unique_phonemes.py: -------------------------------------------------------------------------------- 1 | """Find all the unique phonemes in a dataset.""" 2 | import argparse 3 | import multiprocessing 4 | from argparse import RawTextHelpFormatter 5 | 6 | from tqdm.contrib.concurrent import process_map 7 | 8 | from TTS.config import load_config 9 | from TTS.tts.datasets import load_tts_samples 10 | from TTS.tts.utils.text.phonemizers.gruut_wrapper import Gruut 11 | 12 | phonemizer = Gruut(language="en-us") 13 | 14 | 15 | def compute_phonemes(item): 16 | try: 17 | text = item[0] 18 | ph = phonemizer.phonemize(text).split("|") 19 | except Exception: 20 | return [] 21 | return list(set(ph)) 22 | 23 | 24 | def main(): 25 | # pylint: disable=W0601 26 | global c 27 | # pylint: disable=bad-option-value 28 | parser = argparse.ArgumentParser( 29 | description="""Find all the unique phonemes in a dataset.\n\n""" 30 | """ 31 | Example runs: 32 | 33 | python TTS/bin/find_unique_phonemes.py --config_path config.json 34 | """, 35 | formatter_class=RawTextHelpFormatter, 36 | ) 37 | parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True) 38 | args = parser.parse_args() 39 | 40 | c = load_config(args.config_path) 41 | 42 | # load all datasets 43 | train_items, eval_items = load_tts_samples( 44 | c.datasets, eval_split=True, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size 45 | ) 46 | items = train_items + eval_items 47 | print("Num items:", len(items)) 48 | 49 | is_lang_def = all(item["language"] for item in items) 50 | 51 | if not c.phoneme_language or not is_lang_def: 52 | raise ValueError("Phoneme language must be defined in config.") 53 | 54 | phonemes = process_map(compute_phonemes, items, max_workers=multiprocessing.cpu_count(), chunksize=15) 55 | phones = [] 56 | for ph in phonemes: 57 | phones.extend(ph) 58 | phones = set(phones) 59 | lower_phones = filter(lambda c: c.islower(), phones) 60 | phones_force_lower = [c.lower() for c in phones] 61 | phones_force_lower = set(phones_force_lower) 62 | 63 | print(f" > Number of unique phonemes: {len(phones)}") 64 | print(f" > Unique phonemes: {''.join(sorted(phones))}") 65 | print(f" > Unique lower phonemes: {''.join(sorted(lower_phones))}") 66 | print(f" > Unique all forced to lower phonemes: {''.join(sorted(phones_force_lower))}") 67 | 68 | 69 | if __name__ == "__main__": 70 | main() 71 | -------------------------------------------------------------------------------- /TTS/bin/train_tts.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dataclasses import dataclass, field 3 | 4 | from trainer import Trainer, TrainerArgs 5 | 6 | from TTS.config import load_config, register_config 7 | from TTS.tts.datasets import load_tts_samples 8 | from TTS.tts.models import setup_model 9 | 10 | 11 | @dataclass 12 | class TrainTTSArgs(TrainerArgs): 13 | config_path: str = field(default=None, metadata={"help": "Path to the config file."}) 14 | 15 | 16 | def 
main(): 17 | """Run `tts` model training directly from a `config.json` file.""" 18 | # init trainer args 19 | train_args = TrainTTSArgs() 20 | parser = train_args.init_argparse(arg_prefix="") 21 | 22 | # override trainer args from command-line args 23 | args, config_overrides = parser.parse_known_args() 24 | train_args.parse_args(args) 25 | 26 | # load config.json and register 27 | if args.config_path or args.continue_path: 28 | if args.config_path: 29 | # init from a file 30 | config = load_config(args.config_path) 31 | if len(config_overrides) > 0: 32 | config.parse_known_args(config_overrides, relaxed_parser=True) 33 | elif args.continue_path: 34 | # continue from a prev experiment 35 | config = load_config(os.path.join(args.continue_path, "config.json")) 36 | if len(config_overrides) > 0: 37 | config.parse_known_args(config_overrides, relaxed_parser=True) 38 | else: 39 | # init from console args 40 | from TTS.config.shared_configs import BaseTrainingConfig # pylint: disable=import-outside-toplevel 41 | 42 | config_base = BaseTrainingConfig() 43 | config_base.parse_known_args(config_overrides) 44 | config = register_config(config_base.model)() 45 | 46 | # load training samples 47 | train_samples, eval_samples = load_tts_samples( 48 | config.datasets, 49 | eval_split=True, 50 | eval_split_max_size=config.eval_split_max_size, 51 | eval_split_size=config.eval_split_size, 52 | ) 53 | 54 | # init the model from config 55 | model = setup_model(config, train_samples + eval_samples) 56 | 57 | # init the trainer and 🚀 58 | trainer = Trainer( 59 | train_args, 60 | model.config, 61 | config.output_path, 62 | model=model, 63 | train_samples=train_samples, 64 | eval_samples=eval_samples, 65 | parse_command_line_args=False, 66 | ) 67 | trainer.fit() 68 | 69 | 70 | if __name__ == "__main__": 71 | main() 72 | -------------------------------------------------------------------------------- /TTS/tts/layers/glow_tts/duration_predictor.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | from ..generic.normalization import LayerNorm 5 | 6 | 7 | class DurationPredictor(nn.Module): 8 | """Glow-TTS duration prediction model. 9 | 10 | :: 11 | 12 | [2 x (conv1d_kxk -> relu -> layer_norm -> dropout)] -> conv1d_1x1 -> durs 13 | 14 | Args: 15 | in_channels (int): Number of channels of the input tensor. 16 | hidden_channels (int): Number of hidden channels of the network. 17 | kernel_size (int): Kernel size for the conv layers. 18 | dropout_p (float): Dropout rate used after each conv layer. 
19 | """ 20 | 21 | def __init__(self, in_channels, hidden_channels, kernel_size, dropout_p, cond_channels=None, language_emb_dim=None): 22 | super().__init__() 23 | 24 | # add language embedding dim in the input 25 | if language_emb_dim: 26 | in_channels += language_emb_dim 27 | 28 | # class arguments 29 | self.in_channels = in_channels 30 | self.filter_channels = hidden_channels 31 | self.kernel_size = kernel_size 32 | self.dropout_p = dropout_p 33 | # layers 34 | self.drop = nn.Dropout(dropout_p) 35 | self.conv_1 = nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size // 2) 36 | self.norm_1 = LayerNorm(hidden_channels) 37 | self.conv_2 = nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size // 2) 38 | self.norm_2 = LayerNorm(hidden_channels) 39 | # output layer 40 | self.proj = nn.Conv1d(hidden_channels, 1, 1) 41 | if cond_channels is not None and cond_channels != 0: 42 | self.cond = nn.Conv1d(cond_channels, in_channels, 1) 43 | 44 | if language_emb_dim != 0 and language_emb_dim is not None: 45 | self.cond_lang = nn.Conv1d(language_emb_dim, in_channels, 1) 46 | 47 | def forward(self, x, x_mask, g=None, lang_emb=None): 48 | """ 49 | Shapes: 50 | - x: :math:`[B, C, T]` 51 | - x_mask: :math:`[B, 1, T]` 52 | - g: :math:`[B, C, 1]` 53 | """ 54 | if g is not None: 55 | x = x + self.cond(g) 56 | 57 | if lang_emb is not None: 58 | x = x + self.cond_lang(lang_emb) 59 | 60 | x = self.conv_1(x * x_mask) 61 | x = torch.relu(x) 62 | x = self.norm_1(x) 63 | x = self.drop(x) 64 | x = self.conv_2(x * x_mask) 65 | x = torch.relu(x) 66 | x = self.norm_2(x) 67 | x = self.drop(x) 68 | x = self.proj(x * x_mask) 69 | return x * x_mask 70 | -------------------------------------------------------------------------------- /TTS/utils/capacitron_optimizer.py: -------------------------------------------------------------------------------- 1 | from typing import Generator 2 | 3 | from trainer.trainer_utils import get_optimizer 4 | 5 | 6 | class CapacitronOptimizer: 7 | """Double optimizer class for the Capacitron model.""" 8 | 9 | def __init__(self, config: dict, model_params: Generator) -> None: 10 | self.primary_params, self.secondary_params = self.split_model_parameters(model_params) 11 | 12 | optimizer_names = list(config.optimizer_params.keys()) 13 | optimizer_parameters = list(config.optimizer_params.values()) 14 | 15 | self.primary_optimizer = get_optimizer( 16 | optimizer_names[0], 17 | optimizer_parameters[0], 18 | config.lr, 19 | parameters=self.primary_params, 20 | ) 21 | 22 | self.secondary_optimizer = get_optimizer( 23 | optimizer_names[1], 24 | self.extract_optimizer_parameters(optimizer_parameters[1]), 25 | optimizer_parameters[1]["lr"], 26 | parameters=self.secondary_params, 27 | ) 28 | 29 | self.param_groups = self.primary_optimizer.param_groups 30 | 31 | def first_step(self): 32 | self.secondary_optimizer.step() 33 | self.secondary_optimizer.zero_grad() 34 | self.primary_optimizer.zero_grad() 35 | 36 | def step(self): 37 | # Update param groups to display the correct learning rate 38 | self.param_groups = self.primary_optimizer.param_groups 39 | self.primary_optimizer.step() 40 | 41 | def zero_grad(self): 42 | self.primary_optimizer.zero_grad() 43 | self.secondary_optimizer.zero_grad() 44 | 45 | def load_state_dict(self, state_dict): 46 | self.primary_optimizer.load_state_dict(state_dict[0]) 47 | self.secondary_optimizer.load_state_dict(state_dict[1]) 48 | 49 | def state_dict(self): 50 | return [self.primary_optimizer.state_dict(), 
self.secondary_optimizer.state_dict()] 51 | 52 | @staticmethod 53 | def split_model_parameters(model_params: Generator) -> list: 54 | primary_params = [] 55 | secondary_params = [] 56 | for name, param in model_params: 57 | if param.requires_grad: 58 | if name == "capacitron_vae_layer.beta": 59 | secondary_params.append(param) 60 | else: 61 | primary_params.append(param) 62 | return [iter(primary_params), iter(secondary_params)] 63 | 64 | @staticmethod 65 | def extract_optimizer_parameters(params: dict) -> dict: 66 | """Extract parameters that are not the learning rate""" 67 | return {k: v for k, v in params.items() if k != "lr"} 68 | -------------------------------------------------------------------------------- /recipes/ljspeech/align_tts/train_aligntts.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from trainer import Trainer, TrainerArgs 4 | 5 | from TTS.tts.configs.align_tts_config import AlignTTSConfig 6 | from TTS.tts.configs.shared_configs import BaseDatasetConfig 7 | from TTS.tts.datasets import load_tts_samples 8 | from TTS.tts.models.align_tts import AlignTTS 9 | from TTS.tts.utils.text.tokenizer import TTSTokenizer 10 | from TTS.utils.audio import AudioProcessor 11 | 12 | output_path = os.path.dirname(os.path.abspath(__file__)) 13 | 14 | # init configs 15 | dataset_config = BaseDatasetConfig( 16 | name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/") 17 | ) 18 | config = AlignTTSConfig( 19 | batch_size=32, 20 | eval_batch_size=16, 21 | num_loader_workers=4, 22 | num_eval_loader_workers=4, 23 | run_eval=True, 24 | test_delay_epochs=-1, 25 | epochs=1000, 26 | text_cleaner="english_cleaners", 27 | use_phonemes=False, 28 | phoneme_language="en-us", 29 | phoneme_cache_path=os.path.join(output_path, "phoneme_cache"), 30 | print_step=25, 31 | print_eval=True, 32 | mixed_precision=False, 33 | output_path=output_path, 34 | datasets=[dataset_config], 35 | ) 36 | 37 | # INITIALIZE THE AUDIO PROCESSOR 38 | # Audio processor is used for feature extraction and audio I/O. 39 | # It is mainly used by the dataloader and the training loggers. 40 | ap = AudioProcessor.init_from_config(config) 41 | 42 | # INITIALIZE THE TOKENIZER 43 | # Tokenizer is used to convert text to sequences of token IDs. 44 | # If characters are not defined in the config, default characters are passed to the config. 45 | tokenizer, config = TTSTokenizer.init_from_config(config) 46 | 47 | # LOAD DATA SAMPLES 48 | # Each sample is a list of ```[text, audio_file_path, speaker_name]``` 49 | # You can define your custom sample loader returning the list of samples. 50 | # Or define your custom formatter and pass it to the `load_tts_samples`. 51 | # Check `TTS.tts.datasets.load_tts_samples` for more details. 52 | train_samples, eval_samples = load_tts_samples( 53 | dataset_config, 54 | eval_split=True, 55 | eval_split_max_size=config.eval_split_max_size, 56 | eval_split_size=config.eval_split_size, 57 | ) 58 | 59 | # init model 60 | model = AlignTTS(config, ap, tokenizer) 61 | 62 | # INITIALIZE THE TRAINER 63 | # Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training, 64 | # distributed training, etc. 65 | trainer = Trainer( 66 | TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples 67 | ) 68 | 69 | # AND... 3,2,1... 🚀 70 | trainer.fit() 71 | --------------------------------------------------------------------------------