├── TTS
├── VERSION
├── bin
│ ├── __init__.py
│ ├── collect_env_info.py
│ ├── find_unique_chars.py
│ ├── find_unique_phonemes.py
│ └── train_tts.py
├── encoder
│ ├── __init__.py
│ ├── utils
│ │ ├── __init__.py
│ │ ├── io.py
│ │ └── visual.py
│ ├── requirements.txt
│ ├── configs
│ │ ├── speaker_encoder_config.py
│ │ ├── emotion_encoder_config.py
│ │ └── base_encoder_config.py
│ └── README.md
├── server
│ ├── __init__.py
│ ├── static
│ │ └── coqui-log-green-TTS.png
│ ├── conf.json
│ └── README.md
├── tts
│ ├── __init__.py
│ ├── utils
│ │ ├── __init__.py
│ │ ├── text
│ │ │ ├── english
│ │ │ │ ├── __init__.py
│ │ │ │ ├── abbreviations.py
│ │ │ │ └── time_norm.py
│ │ │ ├── french
│ │ │ │ ├── __init__.py
│ │ │ │ └── abbreviations.py
│ │ │ ├── nepali
│ │ │ │ └── __init__.py
│ │ │ ├── japanese
│ │ │ │ └── __init__.py
│ │ │ ├── chinese_mandarin
│ │ │ │ ├── __init__.py
│ │ │ │ └── phonemizer.py
│ │ │ ├── __init__.py
│ │ │ └── phonemizers
│ │ │ │ ├── __init__.py
│ │ │ │ ├── zh_cn_phonemizer.py
│ │ │ │ ├── multi_phonemizer.py
│ │ │ │ └── ja_jp_phonemizer.py
│ │ ├── monotonic_align
│ │ │ ├── __init__.py
│ │ │ ├── setup.py
│ │ │ └── core.pyx
│ │ └── measures.py
│ ├── layers
│ │ ├── align_tts
│ │ │ ├── __init__.py
│ │ │ ├── duration_predictor.py
│ │ │ └── mdn.py
│ │ ├── generic
│ │ │ ├── __init__.py
│ │ │ └── gated_conv.py
│ │ ├── glow_tts
│ │ │ ├── __init__.py
│ │ │ └── duration_predictor.py
│ │ ├── tacotron
│ │ │ └── __init__.py
│ │ ├── feed_forward
│ │ │ ├── __init__.py
│ │ │ └── duration_predictor.py
│ │ └── __init__.py
│ ├── configs
│ │ ├── tacotron2_config.py
│ │ └── __init__.py
│ └── models
│ │ └── __init__.py
├── utils
│ ├── __init__.py
│ ├── distribute.py
│ ├── training.py
│ └── capacitron_optimizer.py
├── vocoder
│ ├── __init__.py
│ ├── layers
│ │ ├── __init__.py
│ │ ├── melgan.py
│ │ ├── pqmf.py
│ │ └── hifigan.py
│ ├── utils
│ │ └── __init__.py
│ ├── pqmf_output.wav
│ ├── configs
│ │ └── __init__.py
│ ├── models
│ │ ├── fullband_melgan_generator.py
│ │ ├── multiband_melgan_generator.py
│ │ ├── melgan_multiscale_discriminator.py
│ │ └── base_vocoder.py
│ ├── README.md
│ └── datasets
│ │ └── __init__.py
├── __init__.py
└── model.py
├── docs
├── README.md
├── source
│ ├── contributing.md
│ ├── _static
│ │ └── logo.png
│ ├── main_classes
│ │ ├── trainer_api.md
│ │ ├── speaker_manager.md
│ │ ├── gan.md
│ │ ├── dataset.md
│ │ ├── model_api.md
│ │ └── audio_processor.md
│ ├── _templates
│ │ └── page.html
│ ├── installation.md
│ ├── models
│ │ ├── glow_tts.md
│ │ ├── vits.md
│ │ └── forward_tts.md
│ ├── make.bat
│ ├── tts_datasets.md
│ └── index.md
├── requirements.txt
└── Makefile
├── tests
├── aux_tests
│ ├── __init__.py
│ └── test_stft_torch.py
├── data_tests
│ ├── __init__.py
│ └── test_dataset_formatters.py
├── text_tests
│ ├── __init__.py
│ ├── test_text_cleaners.py
│ ├── test_japanese_phonemizer.py
│ └── test_punctuation.py
├── tts_tests
│ ├── __init__.py
│ ├── test_vits_d-vectors_train.py
│ └── test_tacotron_train.py
├── zoo_tests
│ └── __init__.py
├── inference_tests
│ ├── __init__.py
│ └── test_synthesize.py
├── vocoder_tests
│ ├── __init__.py
│ ├── test_vocoder_melgan_generator.py
│ ├── test_vocoder_rwd.py
│ ├── test_vocoder_pqmf.py
│ ├── test_vocoder_parallel_wavegan_generator.py
│ ├── test_vocoder_melgan_discriminator.py
│ ├── test_hifigan_train.py
│ ├── test_parallel_wavegan_train.py
│ ├── test_vocoder_parallel_wavegan_discriminator.py
│ ├── test_wavegrad_train.py
│ ├── test_melgan_train.py
│ ├── test_wavernn_train.py
│ ├── test_fullband_melgan_train.py
│ ├── test_multiband_melgan_train.py
│ ├── test_vocoder_wavernn.py
│ └── test_wavegrad.py
├── inputs
│ ├── language_ids.json
│ ├── example_1.wav
│ ├── scale_stats.npy
│ ├── server_config.json
│ ├── common_voice.tsv
│ ├── test_vocoder_audio_config.json
│ └── test_config.json
├── data
│ ├── dummy_speakers.pth
│ └── ljspeech
│ │ ├── wavs
│ │ ├── LJ001-0001.npy
│ │ ├── LJ001-0001.wav
│ │ ├── LJ001-0002.npy
│ │ ├── LJ001-0002.wav
│ │ ├── LJ001-0003.npy
│ │ ├── LJ001-0003.wav
│ │ ├── LJ001-0004.npy
│ │ ├── LJ001-0004.wav
│ │ ├── LJ001-0005.npy
│ │ ├── LJ001-0005.wav
│ │ ├── LJ001-0006.npy
│ │ ├── LJ001-0006.wav
│ │ ├── LJ001-0007.npy
│ │ ├── LJ001-0007.wav
│ │ ├── LJ001-0008.npy
│ │ ├── LJ001-0008.wav
│ │ ├── LJ001-0009.npy
│ │ ├── LJ001-0009.wav
│ │ ├── LJ001-0010.npy
│ │ ├── LJ001-0010.wav
│ │ ├── LJ001-0011.npy
│ │ ├── LJ001-0011.wav
│ │ ├── LJ001-0012.npy
│ │ ├── LJ001-0012.wav
│ │ ├── LJ001-0013.npy
│ │ ├── LJ001-0013.wav
│ │ ├── LJ001-0014.npy
│ │ ├── LJ001-0014.wav
│ │ ├── LJ001-0015.npy
│ │ ├── LJ001-0015.wav
│ │ ├── LJ001-0016.npy
│ │ ├── LJ001-0016.wav
│ │ ├── LJ001-0017.npy
│ │ ├── LJ001-0017.wav
│ │ ├── LJ001-0018.npy
│ │ ├── LJ001-0018.wav
│ │ ├── LJ001-0019.npy
│ │ ├── LJ001-0019.wav
│ │ ├── LJ001-0020.npy
│ │ ├── LJ001-0020.wav
│ │ ├── LJ001-0021.npy
│ │ ├── LJ001-0021.wav
│ │ ├── LJ001-0022.npy
│ │ ├── LJ001-0022.wav
│ │ ├── LJ001-0023.npy
│ │ ├── LJ001-0023.wav
│ │ ├── LJ001-0024.npy
│ │ ├── LJ001-0024.wav
│ │ ├── LJ001-0025.npy
│ │ ├── LJ001-0025.wav
│ │ ├── LJ001-0026.npy
│ │ ├── LJ001-0026.wav
│ │ ├── LJ001-0027.npy
│ │ ├── LJ001-0027.wav
│ │ ├── LJ001-0028.npy
│ │ ├── LJ001-0028.wav
│ │ ├── LJ001-0029.npy
│ │ ├── LJ001-0029.wav
│ │ ├── LJ001-0030.npy
│ │ ├── LJ001-0030.wav
│ │ ├── LJ001-0031.npy
│ │ ├── LJ001-0031.wav
│ │ ├── LJ001-0032.npy
│ │ └── LJ001-0032.wav
│ │ ├── f0_cache
│ │ └── pitch_stats.npy
│ │ ├── phoneme_cache
│ │ ├── LJ001-0001_phoneme.npy
│ │ ├── LJ001-0002_phoneme.npy
│ │ ├── LJ001-0003_phoneme.npy
│ │ ├── LJ001-0004_phoneme.npy
│ │ ├── LJ001-0005_phoneme.npy
│ │ ├── LJ001-0006_phoneme.npy
│ │ ├── LJ001-0007_phoneme.npy
│ │ ├── LJ001-0008_phoneme.npy
│ │ ├── LJ001-0009_phoneme.npy
│ │ ├── LJ001-0010_phoneme.npy
│ │ ├── LJ001-0011_phoneme.npy
│ │ ├── LJ001-0012_phoneme.npy
│ │ ├── LJ001-0013_phoneme.npy
│ │ ├── LJ001-0014_phoneme.npy
│ │ ├── LJ001-0015_phoneme.npy
│ │ ├── LJ001-0016_phoneme.npy
│ │ ├── LJ001-0017_phoneme.npy
│ │ ├── LJ001-0018_phoneme.npy
│ │ ├── LJ001-0019_phoneme.npy
│ │ ├── LJ001-0020_phoneme.npy
│ │ ├── LJ001-0021_phoneme.npy
│ │ ├── LJ001-0022_phoneme.npy
│ │ ├── LJ001-0023_phoneme.npy
│ │ ├── LJ001-0024_phoneme.npy
│ │ ├── LJ001-0025_phoneme.npy
│ │ ├── LJ001-0026_phoneme.npy
│ │ ├── LJ001-0027_phoneme.npy
│ │ ├── LJ001-0028_phoneme.npy
│ │ ├── LJ001-0029_phoneme.npy
│ │ ├── LJ001-0030_phoneme.npy
│ │ ├── LJ001-0031_phoneme.npy
│ │ └── LJ001-0032_phoneme.npy
│ │ └── metadata.csv
├── bash_tests
│ ├── test_compute_statistics.sh
│ └── test_demo_server.sh
└── __init__.py
├── .dockerignore
├── requirements.notebooks.txt
├── requirements.dev.txt
├── images
├── model.png
├── tts_cli.gif
├── demo_server.gif
├── TTS-performance.png
├── tts_performance.png
├── coqui-log-green-TTS.png
└── example_model_output.png
├── .cardboardlint.yml
├── setup.cfg
├── run_bash_tests.sh
├── .github
├── ISSUE_TEMPLATE
│ ├── config.yml
│ └── feature_request.md
├── stale.yml
├── PR_TEMPLATE.md
└── workflows
│ ├── style_check.yml
│ ├── vocoder_tests.yml
│ ├── text_tests.yml
│ ├── aux_tests.yml
│ ├── data_tests.yml
│ ├── inference_tests.yml
│ ├── zoo_tests.yml
│ ├── tts_tests.yml
│ └── docker.yaml
├── notebooks
└── dataset_analysis
│ └── README.md
├── MANIFEST.in
├── recipes
├── vctk
│ └── download_vctk.sh
├── ljspeech
│ ├── download_ljspeech.sh
│ ├── README.md
│ ├── univnet
│ │ └── train.py
│ ├── hifigan
│ │ └── train_hifigan.py
│ ├── multiband_melgan
│ │ └── train_multiband_melgan.py
│ ├── wavegrad
│ │ └── train_wavegrad.py
│ ├── wavernn
│ │ └── train_wavernn.py
│ └── align_tts
│ │ └── train_aligntts.py
├── thorsten_DE
│ ├── download_thorsten_DE.sh
│ ├── README.md
│ ├── univnet
│ │ └── train_univnet.py
│ ├── hifigan
│ │ └── train_hifigan.py
│ ├── multiband_melgan
│ │ └── train_multiband_melgan.py
│ ├── wavegrad
│ │ └── train_wavegrad.py
│ └── wavernn
│ │ └── train_wavernn.py
├── README.md
├── blizzard2013
│ └── README.md
└── kokoro
│ └── tacotron2-DDC
│ └── run.sh
├── .readthedocs.yml
├── CITATION.cff
├── pyproject.toml
├── .pre-commit-config.yaml
├── Dockerfile
├── requirements.txt
├── hubconf.py
└── Makefile
/TTS/VERSION:
--------------------------------------------------------------------------------
1 | 0.7.1
--------------------------------------------------------------------------------
/docs/README.md:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/TTS/bin/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/TTS/encoder/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/TTS/server/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/TTS/tts/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/TTS/utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/TTS/vocoder/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/TTS/tts/utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/aux_tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/data_tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/text_tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/tts_tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/zoo_tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/TTS/encoder/utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/TTS/vocoder/layers/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/TTS/vocoder/utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/inference_tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/vocoder_tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
1 | .git/
2 | Dockerfile
3 |
--------------------------------------------------------------------------------
/TTS/tts/layers/align_tts/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/TTS/tts/layers/generic/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/TTS/tts/layers/glow_tts/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/TTS/tts/layers/tacotron/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/TTS/tts/utils/text/english/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/TTS/tts/utils/text/french/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/TTS/tts/utils/text/nepali/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/requirements.notebooks.txt:
--------------------------------------------------------------------------------
1 | bokeh==1.4.0
--------------------------------------------------------------------------------
/tests/aux_tests/test_stft_torch.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/TTS/tts/layers/feed_forward/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/TTS/tts/utils/monotonic_align/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/TTS/tts/utils/text/japanese/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/TTS/tts/utils/text/chinese_mandarin/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/TTS/encoder/requirements.txt:
--------------------------------------------------------------------------------
1 | umap-learn
2 | numpy>=1.17.0
3 |
--------------------------------------------------------------------------------
/TTS/tts/layers/__init__.py:
--------------------------------------------------------------------------------
1 | from TTS.tts.layers.losses import *
2 |
--------------------------------------------------------------------------------
/requirements.dev.txt:
--------------------------------------------------------------------------------
1 | black
2 | coverage
3 | isort
4 | nose2
5 | pylint==2.10.2
6 |
--------------------------------------------------------------------------------
/TTS/tts/utils/text/__init__.py:
--------------------------------------------------------------------------------
1 | from TTS.tts.utils.text.tokenizer import TTSTokenizer
2 |
--------------------------------------------------------------------------------
/images/model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/images/model.png
--------------------------------------------------------------------------------
/tests/inputs/language_ids.json:
--------------------------------------------------------------------------------
1 | {
2 | "en": 0,
3 | "fr-fr": 1,
4 | "pt-br": 2
5 | }
--------------------------------------------------------------------------------
/docs/source/contributing.md:
--------------------------------------------------------------------------------
1 | ```{include} ../../CONTRIBUTING.md
2 | :relative-images:
3 | ```
4 |
--------------------------------------------------------------------------------
/images/tts_cli.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/images/tts_cli.gif
--------------------------------------------------------------------------------
/images/demo_server.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/images/demo_server.gif
--------------------------------------------------------------------------------
/images/TTS-performance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/images/TTS-performance.png
--------------------------------------------------------------------------------
/images/tts_performance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/images/tts_performance.png
--------------------------------------------------------------------------------
/tests/inputs/example_1.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/inputs/example_1.wav
--------------------------------------------------------------------------------
/TTS/vocoder/pqmf_output.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/TTS/vocoder/pqmf_output.wav
--------------------------------------------------------------------------------
/docs/source/_static/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/docs/source/_static/logo.png
--------------------------------------------------------------------------------
/tests/data/dummy_speakers.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/dummy_speakers.pth
--------------------------------------------------------------------------------
/tests/inputs/scale_stats.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/inputs/scale_stats.npy
--------------------------------------------------------------------------------
/images/coqui-log-green-TTS.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/images/coqui-log-green-TTS.png
--------------------------------------------------------------------------------
/images/example_model_output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/images/example_model_output.png
--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | furo
2 | myst-parser == 0.15.1
3 | sphinx == 4.0.2
4 | sphinx_inline_tabs
5 | sphinx_copybutton
6 | linkify-it-py
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0001.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0001.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0001.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0001.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0002.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0002.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0002.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0002.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0003.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0003.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0003.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0003.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0004.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0004.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0004.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0004.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0005.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0005.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0005.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0005.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0006.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0006.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0006.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0006.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0007.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0007.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0007.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0007.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0008.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0008.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0008.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0008.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0009.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0009.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0009.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0009.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0010.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0010.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0010.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0010.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0011.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0011.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0011.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0011.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0012.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0012.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0012.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0012.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0013.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0013.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0013.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0013.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0014.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0014.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0014.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0014.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0015.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0015.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0015.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0015.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0016.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0016.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0016.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0016.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0017.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0017.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0017.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0017.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0018.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0018.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0018.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0018.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0019.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0019.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0019.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0019.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0020.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0020.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0020.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0020.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0021.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0021.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0021.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0021.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0022.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0022.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0022.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0022.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0023.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0023.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0023.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0023.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0024.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0024.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0024.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0024.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0025.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0025.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0025.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0025.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0026.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0026.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0026.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0026.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0027.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0027.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0027.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0027.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0028.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0028.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0028.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0028.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0029.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0029.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0029.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0029.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0030.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0030.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0030.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0030.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0031.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0031.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0031.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0031.wav
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0032.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0032.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/wavs/LJ001-0032.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/wavs/LJ001-0032.wav
--------------------------------------------------------------------------------
/TTS/server/static/coqui-log-green-TTS.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/TTS/server/static/coqui-log-green-TTS.png
--------------------------------------------------------------------------------
/docs/source/main_classes/trainer_api.md:
--------------------------------------------------------------------------------
1 | # Trainer API
2 |
3 | We made the trainer a separate project at https://github.com/coqui-ai/Trainer
4 |
--------------------------------------------------------------------------------
/.cardboardlint.yml:
--------------------------------------------------------------------------------
1 | linters:
2 | - pylint:
3 | # pylintrc: pylintrc
4 | filefilter: ['- test_*.py', '+ *.py', '- *.npy']
5 | # exclude:
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [build_py]
2 | build-lib=temp_build
3 |
4 | [bdist_wheel]
5 | bdist-dir=temp_build
6 |
7 | [install_lib]
8 | build-dir=temp_build
9 |
--------------------------------------------------------------------------------
/tests/data/ljspeech/f0_cache/pitch_stats.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/f0_cache/pitch_stats.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0001_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0001_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0002_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0002_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0003_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0003_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0004_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0004_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0005_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0005_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0006_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0006_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0007_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0007_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0008_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0008_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0009_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0009_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0010_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0010_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0011_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0011_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0012_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0012_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0013_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0013_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0014_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0014_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0015_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0015_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0016_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0016_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0017_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0017_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0018_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0018_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0019_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0019_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0020_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0020_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0021_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0021_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0022_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0022_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0023_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0023_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0024_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0024_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0025_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0025_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0026_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0026_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0027_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0027_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0028_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0028_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0029_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0029_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0030_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0030_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0031_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0031_phoneme.npy
--------------------------------------------------------------------------------
/tests/data/ljspeech/phoneme_cache/LJ001-0032_phoneme.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samiptimalsena/TTS/Nepali-TTS/tests/data/ljspeech/phoneme_cache/LJ001-0032_phoneme.npy
--------------------------------------------------------------------------------
/TTS/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | with open(os.path.join(os.path.dirname(__file__), "VERSION"), "r", encoding="utf-8") as f:
4 | version = f.read().strip()
5 |
6 | __version__ = version
7 |
8 |
--------------------------------------------------------------------------------
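The version string read above is exposed as a package attribute, so for this snapshot the following minimal check should print the value stored in `TTS/VERSION`:

```python
import TTS

# __version__ is populated from the TTS/VERSION file shown above ("0.7.1").
print(TTS.__version__)
```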
/run_bash_tests.sh:
--------------------------------------------------------------------------------
1 | set -e
2 | TF_CPP_MIN_LOG_LEVEL=3
3 |
4 | # runtime bash based tests
5 | # TODO: move these to python
6 | ./tests/bash_tests/test_demo_server.sh && \
7 | ./tests/bash_tests/test_compute_statistics.sh
8 |
--------------------------------------------------------------------------------
/TTS/tts/utils/monotonic_align/setup.py:
--------------------------------------------------------------------------------
1 | # from distutils.core import setup
2 | # from Cython.Build import cythonize
3 | # import numpy
4 |
5 | # setup(name='monotonic_align',
6 | # ext_modules=cythonize("core.pyx"),
7 | # include_dirs=[numpy.get_include()])
8 |
--------------------------------------------------------------------------------
/tests/bash_tests/test_compute_statistics.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | set -xe
3 | BASEDIR=$(dirname "$0")
4 | echo "$BASEDIR"
5 | # compute normalization statistics
6 | CUDA_VISIBLE_DEVICES="" python TTS/bin/compute_statistics.py --config_path $BASEDIR/../inputs/test_glow_tts.json --out_path $BASEDIR/../outputs/scale_stats.npy
7 |
8 |
--------------------------------------------------------------------------------
/docs/source/main_classes/speaker_manager.md:
--------------------------------------------------------------------------------
1 | # Speaker Manager API
2 |
3 | The {class}`TTS.tts.utils.speakers.SpeakerManager` organizes speaker-related data and information for 🐸TTS models. It is
4 | especially useful for multi-speaker models.
5 |
6 |
7 | ## Speaker Manager
8 | ```{eval-rst}
9 | .. automodule:: TTS.tts.utils.speakers
10 | :members:
11 | ```
--------------------------------------------------------------------------------
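A minimal usage sketch for the class documented above; the class path comes from the docs, but the constructor keyword and attribute names below are assumptions and may differ between 🐸TTS versions:

```python
from TTS.tts.utils.speakers import SpeakerManager

# Assumed constructor keyword: a JSON file mapping speaker names to IDs,
# e.g. one produced during multi-speaker training.
speaker_manager = SpeakerManager(speaker_id_file_path="speakers.json")

# Assumed attributes exposing the loaded speaker inventory.
print(speaker_manager.num_speakers)
print(speaker_manager.speaker_names)
```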
/.github/ISSUE_TEMPLATE/config.yml:
--------------------------------------------------------------------------------
1 | blank_issues_enabled: false
2 | contact_links:
3 | - name: CoquiTTS GitHub Discussions
4 | url: https://github.com/coqui-ai/TTS/discussions
5 | about: Please ask and answer questions here.
6 | - name: Coqui Security issue disclosure
7 | url: mailto:info@coqui.ai
8 | about: Please report security vulnerabilities here.
9 |
--------------------------------------------------------------------------------
/TTS/encoder/configs/speaker_encoder_config.py:
--------------------------------------------------------------------------------
1 | from dataclasses import asdict, dataclass
2 |
3 | from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig
4 |
5 |
6 | @dataclass
7 | class SpeakerEncoderConfig(BaseEncoderConfig):
8 | """Defines parameters for Speaker Encoder model."""
9 |
10 | model: str = "speaker_encoder"
11 | class_name_key: str = "speaker_name"
12 |
--------------------------------------------------------------------------------
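A small usage sketch for the dataclass above; the remaining fields are inherited from `BaseEncoderConfig`, which is not included in this snapshot, and are assumed to have defaults:

```python
from TTS.encoder.configs.speaker_encoder_config import SpeakerEncoderConfig

# Instantiate with defaults and inspect the fields defined above.
config = SpeakerEncoderConfig()
print(config.model)           # "speaker_encoder"
print(config.class_name_key)  # "speaker_name"
```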
/docs/source/main_classes/gan.md:
--------------------------------------------------------------------------------
1 | # GAN API
2 |
3 | The {class}`TTS.vocoder.models.gan.GAN` provides an easy way to implement new GAN-based models. You just need
4 | to define the model architectures for the generator and the discriminator networks and give them to the `GAN` class
5 | to do its ✨️.
6 |
7 |
8 | ## GAN
9 | ```{eval-rst}
10 | .. autoclass:: TTS.vocoder.models.gan.GAN
11 | :members:
12 | ```
--------------------------------------------------------------------------------
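As an illustration of the generator half only (not the `GAN` class API itself), the sketch below instantiates the MelGAN generator that the vocoder test further down in this snapshot also exercises; the discriminator and the config-driven wiring into `GAN` are omitted:

```python
import torch

from TTS.vocoder.models.melgan_generator import MelganGenerator

# Shapes follow tests/vocoder_tests/test_vocoder_melgan_generator.py below:
# a (batch, 80, frames) mel spectrogram maps to a (batch, 1, samples) waveform.
generator = MelganGenerator()
mel = torch.rand((1, 80, 64))
waveform = generator.inference(mel)
print(waveform.shape)  # torch.Size([1, 1, (64 + 4) * 256])
```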
/notebooks/dataset_analysis/README.md:
--------------------------------------------------------------------------------
1 | ## Simple Notebook to Analyze a Dataset
2 |
3 | Using this notebook, you can easily analyze a brand-new dataset, find exceptional cases, and define your training set.
4 |
5 | What we are looking for here is a reasonable distribution of instances in terms of sequence length, audio length, and word coverage.
6 |
7 | This notebook is inspired by https://github.com/MycroftAI/mimic2
8 |
--------------------------------------------------------------------------------
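A minimal sketch of the text-side distribution check the notebook performs, assuming an LJSpeech-style pipe-separated `metadata.csv` (like the one under `tests/data/ljspeech/`); the path and column layout are assumptions, not code taken from the notebook:

```python
import csv
from collections import Counter

char_lengths = []
word_counts = Counter()
with open("tests/data/ljspeech/metadata.csv", encoding="utf-8") as f:
    for row in csv.reader(f, delimiter="|"):
        text = row[-1]  # assumed: last column holds the (normalized) transcript
        char_lengths.append(len(text))
        word_counts.update(text.lower().split())

print("utterances:", len(char_lengths))
print("mean transcript length (chars):", sum(char_lengths) / max(len(char_lengths), 1))
print("most frequent words:", word_counts.most_common(10))
```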
/tests/bash_tests/test_demo_server.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -xe
3 |
4 | python -m TTS.server.server &
5 | SERVER_PID=$!
6 |
7 | echo 'Waiting for server...'
8 | sleep 30
9 |
10 | curl -o /tmp/audio.wav "http://localhost:5002/api/tts?text=synthesis%20schmynthesis"
11 | python -c 'import sys; import wave; print(wave.open(sys.argv[1]).getnframes())' /tmp/audio.wav
12 |
13 | kill $SERVER_PID
14 |
15 | rm /tmp/audio.wav
16 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.md
2 | include LICENSE.txt
3 | include requirements.*.txt
4 | include *.cff
5 | include requirements.txt
6 | include TTS/VERSION
7 | recursive-include TTS *.json
8 | recursive-include TTS *.html
9 | recursive-include TTS *.png
10 | recursive-include TTS *.md
11 | recursive-include TTS *.py
12 | recursive-include TTS *.pyx
13 | recursive-include images *.png
14 | recursive-exclude tests *
15 | prune tests*
16 |
--------------------------------------------------------------------------------
/TTS/encoder/configs/emotion_encoder_config.py:
--------------------------------------------------------------------------------
1 | from dataclasses import asdict, dataclass
2 |
3 | from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig
4 |
5 |
6 | @dataclass
7 | class EmotionEncoderConfig(BaseEncoderConfig):
8 | """Defines parameters for Emotion Encoder model."""
9 |
10 | model: str = "emotion_encoder"
11 | map_classid_to_classname: dict = None
12 | class_name_key: str = "emotion_name"
13 |
--------------------------------------------------------------------------------
/tests/vocoder_tests/test_vocoder_melgan_generator.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 |
4 | from TTS.vocoder.models.melgan_generator import MelganGenerator
5 |
6 |
7 | def test_melgan_generator():
8 | model = MelganGenerator()
9 | print(model)
10 | dummy_input = torch.rand((4, 80, 64))
11 | output = model(dummy_input)
12 | assert np.all(output.shape == (4, 1, 64 * 256))
13 | output = model.inference(dummy_input)
14 | assert np.all(output.shape == (4, 1, (64 + 4) * 256))
15 |
--------------------------------------------------------------------------------
/recipes/vctk/download_vctk.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # take the script's parent directory to prefix all the output paths.
3 | RUN_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
4 | echo $RUN_DIR
5 | # download VCTK dataset
6 | wget https://datashare.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip -O VCTK-Corpus-0.92.zip
7 | # extract
8 | mkdir VCTK
9 | unzip VCTK-Corpus-0.92 -d VCTK
10 | # move the extracted dataset under the recipe folder
11 | mv VCTK $RUN_DIR/recipes/vctk/
12 | rm VCTK-Corpus-0.92.zip
13 |
--------------------------------------------------------------------------------
/TTS/server/conf.json:
--------------------------------------------------------------------------------
1 | {
2 | "tts_path":"/media/erogol/data_ssd/Models/libri_tts/5049/", // tts model root folder
3 | "tts_file":"best_model.pth", // tts checkpoint file
4 | "tts_config":"config.json", // tts config.json file
5 | "tts_speakers": null, // json file listing speaker ids. null if no speaker embedding.
6 | "vocoder_config":null,
7 | "vocoder_file": null,
8 | "is_wavernn_batched":true,
9 | "port": 5002,
10 | "use_cuda": true,
11 | "debug": true
12 | }
13 |
--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
1 | # .readthedocs.yml
2 | # Read the Docs configuration file
3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
4 |
5 | # Required
6 | version: 2
7 |
8 | # Build documentation in the docs/ directory with Sphinx
9 | sphinx:
10 | builder: html
11 | configuration: docs/source/conf.py
12 |
13 | # Optionally set the version of Python and requirements required to build your docs
14 | python:
15 | version: 3.7
16 | install:
17 | - requirements: docs/requirements.txt
18 | - requirements: requirements.txt
--------------------------------------------------------------------------------
/docs/source/main_classes/dataset.md:
--------------------------------------------------------------------------------
1 | # Datasets
2 |
3 | ## TTS Dataset
4 |
5 | ```{eval-rst}
6 | .. autoclass:: TTS.tts.datasets.TTSDataset
7 | :members:
8 | ```
9 |
10 | ## Vocoder Dataset
11 |
12 | ```{eval-rst}
13 | .. autoclass:: TTS.vocoder.datasets.gan_dataset.GANDataset
14 | :members:
15 | ```
16 |
17 | ```{eval-rst}
18 | .. autoclass:: TTS.vocoder.datasets.wavegrad_dataset.WaveGradDataset
19 | :members:
20 | ```
21 |
22 | ```{eval-rst}
23 | .. autoclass:: TTS.vocoder.datasets.wavernn_dataset.WaveRNNDataset
24 | :members:
25 | ```
--------------------------------------------------------------------------------
/docs/source/main_classes/model_api.md:
--------------------------------------------------------------------------------
1 | # Model API
2 | Model API provides you a set of functions that easily make your model compatible with the `Trainer`,
3 | `Synthesizer` and `ModelZoo`.
4 |
5 | ## Base TTS Model
6 |
7 | ```{eval-rst}
8 | .. autoclass:: TTS.model.BaseModel
9 | :members:
10 | ```
11 |
12 | ## Base `tts` Model
13 |
14 | ```{eval-rst}
15 | .. autoclass:: TTS.tts.models.base_tts.BaseTTS
16 | :members:
17 | ```
18 |
19 | ## Base `vocoder` Model
20 |
21 | ```{eval-rst}
22 | .. autoclass:: TTS.vocoder.models.base_vocoder.BaseVocoder
23 | :members:
24 | ```
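As a rough sketch only: a new `tts` model subclasses `BaseTTS` and fills in the hooks the `Trainer` and `Synthesizer` call. The method names below are assumptions for illustration; the authoritative list of required hooks is in the generated references above.

```python
# Rough sketch only; the exact set of required hooks is defined by BaseTTS above.
from TTS.tts.models.base_tts import BaseTTS


class MyTTSModel(BaseTTS):
    def forward(self, *args, **kwargs):
        # training-time forward pass returning the model outputs
        ...

    def inference(self, *args, **kwargs):
        # inference-time forward pass used for synthesis
        ...
```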
--------------------------------------------------------------------------------
/TTS/tts/utils/measures.py:
--------------------------------------------------------------------------------
1 | def alignment_diagonal_score(alignments, binary=False):
2 | """
3 | Compute how diagonal alignment predictions are. It is useful
4 | to measure the alignment consistency of a model
5 | Args:
6 | alignments (torch.Tensor): batch of alignments.
7 | binary (bool): if True, ignore scores and consider attention
8 | as a binary mask.
9 | Shape:
10 | - alignments : :math:`[B, T_de, T_en]`
11 | """
12 | maxs = alignments.max(dim=1)[0]
13 | if binary:
14 | maxs[maxs > 0] = 1
15 | return maxs.mean(dim=1).mean(dim=0).item()
16 |
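Illustrative call with a dummy attention map of shape `[B, T_de, T_en]`:

```python
# Illustrative call with a dummy attention map of shape [B, T_de, T_en].
import torch

from TTS.tts.utils.measures import alignment_diagonal_score

alignments = torch.softmax(torch.rand(4, 50, 30), dim=-1)  # dummy attention weights
print(alignment_diagonal_score(alignments))  # scalar in [0, 1]
```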
--------------------------------------------------------------------------------
/TTS/tts/configs/tacotron2_config.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 |
3 | from TTS.tts.configs.tacotron_config import TacotronConfig
4 |
5 |
6 | @dataclass
7 | class Tacotron2Config(TacotronConfig):
8 | """Defines parameters for Tacotron2 based models.
9 |
10 | Example:
11 |
12 | >>> from TTS.tts.configs.tacotron2_config import Tacotron2Config
13 | >>> config = Tacotron2Config()
14 |
15 | Check `TacotronConfig` for argument descriptions.
16 | """
17 |
18 | model: str = "tacotron2"
19 | out_channels: int = 80
20 | encoder_in_features: int = 512
21 | decoder_in_features: int = 512
22 |
--------------------------------------------------------------------------------
/TTS/tts/models/__init__.py:
--------------------------------------------------------------------------------
1 | from typing import Dict, List, Union
2 |
3 | from TTS.utils.generic_utils import find_module
4 |
5 |
6 | def setup_model(config: "Coqpit", samples: Union[List[List], List[Dict]] = None) -> "BaseTTS":
7 | print(" > Using model: {}".format(config.model))
8 | # fetch the right model implementation.
9 | if "base_model" in config and config["base_model"] is not None:
10 | MyModel = find_module("TTS.tts.models", config.base_model.lower())
11 | else:
12 | MyModel = find_module("TTS.tts.models", config.model.lower())
13 | model = MyModel.init_from_config(config, samples)
14 | return model
15 |
--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
1 | cff-version: 1.2.0
2 | message: "If you want to cite 🐸💬, feel free to use this (but only if you loved it 😊)"
3 | title: "Coqui TTS"
4 | abstract: "A deep learning toolkit for Text-to-Speech, battle-tested in research and production"
5 | date-released: 2021-01-01
6 | authors:
7 | - family-names: "Eren"
8 | given-names: "Gölge"
9 | - name: "The Coqui TTS Team"
10 | version: 1.4
11 | doi: 10.5281/zenodo.6334862
12 | license: "MPL-2.0"
13 | url: "https://www.coqui.ai"
14 | repository-code: "https://github.com/coqui-ai/TTS"
15 | keywords:
16 | - machine learning
17 | - deep learning
18 | - artificial intelligence
19 | - text to speech
20 | - TTS
--------------------------------------------------------------------------------
/tests/vocoder_tests/test_vocoder_rwd.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 |
4 | from TTS.vocoder.models.random_window_discriminator import RandomWindowDiscriminator
5 |
6 |
7 | def test_rwd():
8 | layer = RandomWindowDiscriminator(
9 | cond_channels=80,
10 | window_sizes=(512, 1024, 2048, 4096, 8192),
11 | cond_disc_downsample_factors=[(8, 4, 2, 2, 2), (8, 4, 2, 2), (8, 4, 2), (8, 4), (4, 2, 2)],
12 | hop_length=256,
13 | )
14 | x = torch.rand([4, 1, 22050])
15 | c = torch.rand([4, 80, 22050 // 256])
16 |
17 | scores, _ = layer(x, c)
18 | assert len(scores) == 10
19 | assert np.all(scores[0].shape == (4, 1, 1))
20 |
--------------------------------------------------------------------------------
/recipes/ljspeech/download_ljspeech.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # take the script's parent directory to prefix all the output paths.
3 | RUN_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
4 | echo $RUN_DIR
5 | # download LJSpeech dataset
6 | wget http://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
7 | # extract
8 | tar -xjf LJSpeech-1.1.tar.bz2
9 | # create train-val splits
10 | shuf LJSpeech-1.1/metadata.csv > LJSpeech-1.1/metadata_shuf.csv
11 | head -n 12000 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_train.csv
12 | tail -n 1100 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_val.csv
13 | mv LJSpeech-1.1 $RUN_DIR/recipes/ljspeech/
14 | rm LJSpeech-1.1.tar.bz2
--------------------------------------------------------------------------------
/recipes/thorsten_DE/download_thorsten_DE.sh:
--------------------------------------------------------------------------------
1 | # create venv
2 | python3 -m venv env
3 | source env/bin/activate
4 | pip install pip --upgrade
5 |
6 | # download Thorsten_DE dataset
7 | pip install gdown
8 | gdown --id 1yKJM1LAOQpRVojKunD9r8WN_p5KzBxjc -O dataset.tgz
9 | tar -xzf dataset.tgz
10 |
11 | # create train-val splits
12 | shuf LJSpeech-1.1/metadata.csv > LJSpeech-1.1/metadata_shuf.csv
13 | head -n 20668 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_train.csv
14 | tail -n 2000 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_val.csv
15 |
16 | # rename dataset and remove archive
17 | mv LJSpeech-1.1 thorsten-de
18 | rm dataset.tgz
19 |
20 | # destroy venv
21 | rm -rf env
22 |
--------------------------------------------------------------------------------
/tests/inputs/server_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "tts_checkpoint":"checkpoint_10.pth", // tts checkpoint file
3 | "tts_config":"dummy_model_config.json", // tts config.json file
4 | "tts_speakers": null, // json file listing speaker ids. null if no speaker embedding.
5 | "wavernn_lib_path": null, // Rootpath to wavernn project folder to be imported. If this is null, model uses GL for speech synthesis.
6 | "wavernn_file": null, // wavernn checkpoint file name
7 | "wavernn_config": null, // wavernn config file
8 | "vocoder_config":null,
9 | "vocoder_checkpoint": null,
10 | "is_wavernn_batched":true,
11 | "port": 5002,
12 | "use_cuda": false,
13 | "debug": true
14 | }
15 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?= -j auto -WT --keep-going
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = source
9 | BUILDDIR = _build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools", "wheel", "cython==0.29.28", "numpy==1.21.6"]
3 |
4 | [flake8]
5 | max-line-length=120
6 |
7 | [tool.black]
8 | line-length = 120
9 | target-version = ['py39']
10 | exclude = '''
11 |
12 | (
13 | /(
14 | \.eggs # exclude a few common directories in the
15 | | \.git # root of the project
16 | | \.hg
17 | | \.mypy_cache
18 | | \.tox
19 | | \.venv
20 | | _build
21 | | buck-out
22 | | build
23 | | dist
24 | )/
25 | | foo.py # also separately exclude a file named foo.py in
26 | # the root of the project
27 | )
28 | '''
29 |
30 | [tool.isort]
31 | line_length = 120
32 | profile = "black"
33 | multi_line_output = 3
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: 'https://github.com/pre-commit/pre-commit-hooks'
3 | rev: v2.3.0
4 | hooks:
5 | - id: check-yaml
6 | - id: end-of-file-fixer
7 | - id: trailing-whitespace
8 | - repo: 'https://github.com/psf/black'
9 | rev: 20.8b1
10 | hooks:
11 | - id: black
12 | language_version: python3
13 | - repo: https://github.com/pycqa/isort
14 | rev: 5.8.0
15 | hooks:
16 | - id: isort
17 | name: isort (python)
18 | - id: isort
19 | name: isort (cython)
20 | types: [cython]
21 | - id: isort
22 | name: isort (pyi)
23 | types: [pyi]
24 | - repo: https://github.com/pycqa/pylint
25 | rev: v2.8.2
26 | hooks:
27 | - id: pylint
28 |
--------------------------------------------------------------------------------
/.github/stale.yml:
--------------------------------------------------------------------------------
1 | # Number of days of inactivity before an issue becomes stale
2 | daysUntilStale: 30
3 | # Number of days of inactivity before a stale issue is closed
4 | daysUntilClose: 7
5 | # Issues with these labels will never be considered stale
6 | exemptLabels:
7 | - pinned
8 | - security
9 | # Label to use when marking an issue as stale
10 | staleLabel: wontfix
11 | # Comment to post when marking an issue as stale. Set to `false` to disable
12 | markComment: >
13 | This issue has been automatically marked as stale because it has not had
14 | recent activity. It will be closed if no further activity occurs. Thank you
15 | for your contributions. You might also check our discussion channels.
16 | # Comment to post when closing a stale issue. Set to `false` to disable
17 | closeComment: false
18 |
19 |
--------------------------------------------------------------------------------
/TTS/utils/distribute.py:
--------------------------------------------------------------------------------
1 | # edited from https://github.com/fastai/imagenet-fast/blob/master/imagenet_nv/distributed.py
2 | import torch
3 | import torch.distributed as dist
4 |
5 |
6 | def reduce_tensor(tensor, num_gpus):
7 | rt = tensor.clone()
8 | dist.all_reduce(rt, op=dist.reduce_op.SUM)
9 | rt /= num_gpus
10 | return rt
11 |
12 |
13 | def init_distributed(rank, num_gpus, group_name, dist_backend, dist_url):
14 | assert torch.cuda.is_available(), "Distributed mode requires CUDA."
15 |
16 | # Set cuda device so everything is done on the right GPU.
17 | torch.cuda.set_device(rank % torch.cuda.device_count())
18 |
19 | # Initialize distributed communication
20 | dist.init_process_group(dist_backend, init_method=dist_url, world_size=num_gpus, rank=rank, group_name=group_name)
21 |
--------------------------------------------------------------------------------
/docs/source/main_classes/audio_processor.md:
--------------------------------------------------------------------------------
1 | # AudioProcessor API
2 |
3 | `TTS.utils.audio.AudioProcessor` is the core class for all the audio processing routines. It provides an API for
4 |
5 | - Feature extraction.
6 | - Sound normalization.
7 | - Reading and writing audio files.
8 | - Sampling audio signals.
9 | - Normalizing and denormalizing audio signals.
10 | - Griffin-Lim vocoder.
11 |
12 | The `AudioProcessor` needs to be initialized with `TTS.config.shared_configs.BaseAudioConfig`. Any model config
13 | must also inherit from or instantiate `BaseAudioConfig`.
14 |
15 | ## AudioProcessor
16 | ```{eval-rst}
17 | .. autoclass:: TTS.utils.audio.AudioProcessor
18 | :members:
19 | ```
20 |
21 | ## BaseAudioConfig
22 | ```{eval-rst}
23 | .. autoclass:: TTS.config.shared_configs.BaseAudioConfig
24 | :members:
25 | ```
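A minimal sketch of the `BaseAudioConfig` to `AudioProcessor` flow is below; the wav path is a placeholder.

```python
# Minimal sketch of the BaseAudioConfig -> AudioProcessor flow; the wav path is a placeholder.
from TTS.config.shared_configs import BaseAudioConfig
from TTS.utils.audio import AudioProcessor

audio_config = BaseAudioConfig(sample_rate=22050)
ap = AudioProcessor(**audio_config.to_dict())

wav = ap.load_wav("/path/to/sample.wav")  # read + resample to the configured rate
mel = ap.melspectrogram(wav)              # feature extraction, shape [num_mels, T]
```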
--------------------------------------------------------------------------------
/TTS/vocoder/configs/__init__.py:
--------------------------------------------------------------------------------
1 | import importlib
2 | import os
3 | from inspect import isclass
4 |
5 | # import all files under configs/
6 | configs_dir = os.path.dirname(__file__)
7 | for file in os.listdir(configs_dir):
8 | path = os.path.join(configs_dir, file)
9 | if not file.startswith("_") and not file.startswith(".") and (file.endswith(".py") or os.path.isdir(path)):
10 | config_name = file[: file.find(".py")] if file.endswith(".py") else file
11 | module = importlib.import_module("TTS.vocoder.configs." + config_name)
12 | for attribute_name in dir(module):
13 | attribute = getattr(module, attribute_name)
14 |
15 | if isclass(attribute):
16 | # Add the class to this package's variables
17 | globals()[attribute_name] = attribute
18 |
--------------------------------------------------------------------------------
/tests/vocoder_tests/test_vocoder_pqmf.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import soundfile as sf
4 | import torch
5 | from librosa.core import load
6 |
7 | from tests import get_tests_input_path, get_tests_output_path, get_tests_path
8 | from TTS.vocoder.layers.pqmf import PQMF
9 |
10 | TESTS_PATH = get_tests_path()
11 | WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav")
12 |
13 |
14 | def test_pqmf():
15 | w, sr = load(WAV_FILE)
16 |
17 | layer = PQMF(N=4, taps=62, cutoff=0.15, beta=9.0)
18 | w, sr = load(WAV_FILE)
19 | w2 = torch.from_numpy(w[None, None, :])
20 | b2 = layer.analysis(w2)
21 | w2_ = layer.synthesis(b2)
22 |
23 | print(w2_.max())
24 | print(w2_.min())
25 | print(w2_.mean())
26 | sf.write(os.path.join(get_tests_output_path(), "pqmf_output.wav"), w2_.flatten().detach(), sr)
27 |
--------------------------------------------------------------------------------
/TTS/tts/configs/__init__.py:
--------------------------------------------------------------------------------
1 | import importlib
2 | import os
3 | from inspect import isclass
4 |
5 | # import all files under configs/
6 | # configs_dir = os.path.dirname(__file__)
7 | # for file in os.listdir(configs_dir):
8 | # path = os.path.join(configs_dir, file)
9 | # if not file.startswith("_") and not file.startswith(".") and (file.endswith(".py") or os.path.isdir(path)):
10 | # config_name = file[: file.find(".py")] if file.endswith(".py") else file
11 | # module = importlib.import_module("TTS.tts.configs." + config_name)
12 | # for attribute_name in dir(module):
13 | # attribute = getattr(module, attribute_name)
14 |
15 | # if isclass(attribute):
16 | # # Add the class to this package's variables
17 | # globals()[attribute_name] = attribute
18 |
--------------------------------------------------------------------------------
/TTS/tts/utils/text/english/abbreviations.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | # List of (regular expression, replacement) pairs for abbreviations in english:
4 | abbreviations_en = [
5 | (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
6 | for x in [
7 | ("mrs", "misess"),
8 | ("mr", "mister"),
9 | ("dr", "doctor"),
10 | ("st", "saint"),
11 | ("co", "company"),
12 | ("jr", "junior"),
13 | ("maj", "major"),
14 | ("gen", "general"),
15 | ("drs", "doctors"),
16 | ("rev", "reverend"),
17 | ("lt", "lieutenant"),
18 | ("hon", "honorable"),
19 | ("sgt", "sergeant"),
20 | ("capt", "captain"),
21 | ("esq", "esquire"),
22 | ("ltd", "limited"),
23 | ("col", "colonel"),
24 | ("ft", "fort"),
25 | ]
26 | ]
27 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | ARG BASE=nvcr.io/nvidia/pytorch:22.03-py3
2 | FROM ${BASE}
3 | RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ make python3 python3-dev python3-pip python3-venv python3-wheel espeak espeak-ng libsndfile1-dev && rm -rf /var/lib/apt/lists/*
4 | RUN pip install llvmlite --ignore-installed
5 |
6 | # Create and activate virtual env
7 | ENV VIRTUAL_ENV=/venv
8 | RUN python3 -m venv $VIRTUAL_ENV
9 | ENV PATH="$VIRTUAL_ENV/bin:$PATH"
10 | RUN pip install -U pip setuptools wheel
11 |
12 | WORKDIR /root
13 | COPY requirements.txt /root
14 | COPY requirements.dev.txt /root
15 | COPY requirements.notebooks.txt /root
16 | RUN ["/bin/bash", "-c", "pip install -r <(cat requirements.txt requirements.dev.txt requirements.notebooks.txt)"]
17 | COPY . /root
18 | RUN make install
19 | ENTRYPOINT ["tts"]
20 | CMD ["--help"]
21 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: 🚀 Feature request
3 | about: Suggest a feature or an idea for this project
4 | title: '[Feature request] '
5 | labels: feature request
6 | assignees: ''
7 |
8 | ---
9 |
11 | **🚀 Feature Description**
12 |
13 |
14 |
15 | **Solution**
16 |
17 |
18 |
19 | **Alternative Solutions**
20 |
21 |
22 |
23 | **Additional context**
24 |
25 |
26 |
--------------------------------------------------------------------------------
/docs/source/_templates/page.html:
--------------------------------------------------------------------------------
1 | {% extends "!page.html" %}
2 | {% block scripts %}
3 | {{ super() }}
4 |
5 |
6 |
7 |
13 |
14 |
15 | - You can ask questions about TTS. Try
16 | - What is VITS?
17 | - How to train a TTS model?
18 | - What is the format of training data?
19 |
20 |
21 |
22 |
23 | {% endblock %}
24 |
--------------------------------------------------------------------------------
/recipes/ljspeech/README.md:
--------------------------------------------------------------------------------
1 | # 🐸💬 TTS LJspeech Recipes
2 |
3 | For running the recipes
4 |
5 | 1. Download the LJSpeech dataset, either manually from [its official website](https://keithito.com/LJ-Speech-Dataset/) or using ```download_ljspeech.sh```.
6 | 2. Go to your desired model folder and run the training.
7 |
8 | Running Python files. (Choose the desired GPU ID for your run and set ```CUDA_VISIBLE_DEVICES```)
9 | ```terminal
10 | CUDA_VISIBLE_DEVICES="0" python train_modelX.py
11 | ```
12 |
13 | Running bash scripts.
14 | ```terminal
15 | bash run.sh
16 | ```
17 |
18 | 💡 Note that these runs are just templates to help you start training your first model. They are not optimized for the best
19 | result. Double-check the configurations and feel free to share your experiments to find better parameters together 💪.
20 |
--------------------------------------------------------------------------------
/tests/text_tests/test_text_cleaners.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | from TTS.tts.utils.text.cleaners import english_cleaners, phoneme_cleaners
4 |
5 |
6 | def test_time() -> None:
7 | assert english_cleaners("It's 11:00") == "it's eleven a m"
8 | assert english_cleaners("It's 9:01") == "it's nine oh one a m"
9 | assert english_cleaners("It's 16:00") == "it's four p m"
10 | assert english_cleaners("It's 00:00 am") == "it's twelve a m"
11 |
12 |
13 | def test_currency() -> None:
14 | assert phoneme_cleaners("It's $10.50") == "It's ten dollars fifty cents"
15 | assert phoneme_cleaners("£1.1") == "one pound sterling one penny"
16 | assert phoneme_cleaners("¥1") == "one yen"
17 |
18 |
19 | def test_expand_numbers() -> None:
20 | assert phoneme_cleaners("-1") == "minus one"
21 | assert phoneme_cleaners("1") == "one"
22 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | # core deps
2 | numpy==1.21.6;python_version<"3.10"
3 | numpy==1.22.4;python_version=="3.10"
4 | cython==0.29.28
5 | scipy>=1.4.0
6 | torch>=1.7
7 | torchaudio
8 | soundfile
9 | librosa==0.8.0
10 | numba==0.55.1;python_version<"3.10"
11 | numba==0.55.2;python_version=="3.10"
12 | inflect
13 | tqdm
14 | anyascii
15 | pyyaml
16 | fsspec>=2021.04.0
17 | # deps for examples
18 | flask
19 | # deps for inference
20 | pysbd
21 | # deps for notebooks
22 | umap-learn==0.5.1
23 | pandas
24 | # deps for training
25 | matplotlib
26 | pyworld==0.2.10 # versions > 0.2.10 are not compatible with Python 3.10.x
27 | # coqui stack
28 | trainer
29 | # config management
30 | coqpit>=0.0.16
31 | # chinese g2p deps
32 | jieba
33 | pypinyin
34 | # japanese g2p deps
35 | mecab-python3==1.0.5
36 | unidic-lite==1.0.8
37 | # gruut+supported langs
38 | gruut[cs,de,es,fr,it,nl,pt,ru,sv]==2.2.3
39 |
--------------------------------------------------------------------------------
/tests/text_tests/test_japanese_phonemizer.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from TTS.tts.utils.text.japanese.phonemizer import japanese_text_to_phonemes
4 |
5 | _TEST_CASES = """
6 | どちらに行きますか?/dochiraniikimasuka?
7 | 今日は温泉に、行きます。/kyo:waoNseNni,ikimasu.
8 | 「A」から「Z」までです。/e:karazeqtomadedesu.
9 | そうですね!/so:desune!
10 | クジラは哺乳類です。/kujirawahonyu:ruidesu.
11 | ヴィディオを見ます。/bidioomimasu.
12 | 今日は8月22日です/kyo:wahachigatsuniju:ninichidesu
13 | xyzとαβγ/eqkusuwaizeqtotoarufabe:tagaNma
14 | 値段は$12.34です/nedaNwaju:niteNsaNyoNdorudesu
15 | """
16 |
17 |
18 | class TestText(unittest.TestCase):
19 | def test_japanese_text_to_phonemes(self):
20 | for line in _TEST_CASES.strip().split("\n"):
21 | text, phone = line.split("/")
22 | self.assertEqual(japanese_text_to_phonemes(text), phone)
23 |
24 |
25 | if __name__ == "__main__":
26 | unittest.main()
27 |
--------------------------------------------------------------------------------
/docs/source/installation.md:
--------------------------------------------------------------------------------
1 | # Installation
2 |
3 | 🐸TTS supports Python >=3.7, <3.11.0 and has been tested on Ubuntu 18.10, 19.10 and 20.10.
4 |
5 | ## Using `pip`
6 |
7 | `pip` is recommended if you want to use 🐸TTS only for inference.
8 |
9 | You can install from PyPI as follows:
10 |
11 | ```bash
12 | pip install TTS # from PyPI
13 | ```
14 |
15 | Or install from Github:
16 |
17 | ```bash
18 | pip install git+https://github.com/coqui-ai/TTS # from Github
19 | ```
20 |
21 | ## Installing From Source
22 |
23 | This is recommended for development and more control over 🐸TTS.
24 |
25 | ```bash
26 | git clone https://github.com/coqui-ai/TTS/
27 | cd TTS
28 | make system-deps # only on Linux systems.
29 | make install
30 | ```
31 |
32 | ## On Windows
33 | If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](https://stackoverflow.com/questions/66726331/
--------------------------------------------------------------------------------
/docs/source/models/glow_tts.md:
--------------------------------------------------------------------------------
1 | # Glow TTS
2 |
3 | Glow TTS is a normalizing flow model for text-to-speech. It is built on the generic Glow model previously
4 | used in computer vision and vocoder models. It uses "monotonic alignment search" (MAS) to find the text-to-speech alignment
5 | and uses the output to train a separate duration predictor network for faster inference run-time.
6 |
7 | ## Important resources & papers
8 | - GlowTTS: https://arxiv.org/abs/2005.11129
9 | - Glow (Generative Flow with invertible 1x1 Convolutions): https://arxiv.org/abs/1807.03039
10 | - Normalizing Flows: https://blog.evjang.com/2018/01/nf1.html
11 |
12 | ## GlowTTS Config
13 | ```{eval-rst}
14 | .. autoclass:: TTS.tts.configs.glow_tts_config.GlowTTSConfig
15 | :members:
16 | ```
17 |
18 | ## GlowTTS Model
19 | ```{eval-rst}
20 | .. autoclass:: TTS.tts.models.glow_tts.GlowTTS
21 | :members:
22 | ```
23 |
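Minimal config sketch, mirroring the config usage pattern shown for the other models in these docs:

```python
# Minimal config sketch; see the attribute list above for the available fields.
from TTS.tts.configs.glow_tts_config import GlowTTSConfig

config = GlowTTSConfig()
```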
--------------------------------------------------------------------------------
/.github/PR_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | # Pull request guidelines
2 |
3 | Welcome to the 🐸TTS project! We are excited to see your interest, and appreciate your support!
4 |
5 | This repository is governed by the Contributor Covenant Code of Conduct. For more details, see the [CODE_OF_CONDUCT.md](CODE_OF_CONDUCT.md) file.
6 |
7 | In order to make a good pull request, please see our [CONTRIBUTING.md](CONTRIBUTING.md) file.
8 |
9 | Before accepting your pull request, you will be asked to sign a [Contributor License Agreement](https://cla-assistant.io/coqui-ai/TTS).
10 |
11 | This [Contributor License Agreement](https://cla-assistant.io/coqui-ai/TTS):
12 |
13 | - Protects you, Coqui, and the users of the code.
14 | - Does not change your rights to use your contributions for any purpose.
15 | - Does not change the license of the 🐸TTS project. It just makes the terms of your contribution clearer and lets us know you are OK to contribute.
16 |
--------------------------------------------------------------------------------
/docs/source/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 |
13 | if "%1" == "" goto help
14 |
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | echo.
18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | echo.installed, then set the SPHINXBUILD environment variable to point
20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | echo.may add the Sphinx directory to PATH.
22 | echo.
23 | echo.If you don't have Sphinx installed, grab it from
24 | echo.http://sphinx-doc.org/
25 | exit /b 1
26 | )
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/tests/data_tests/test_dataset_formatters.py:
--------------------------------------------------------------------------------
1 | import os
2 | import unittest
3 |
4 | from tests import get_tests_input_path
5 | from TTS.tts.datasets.formatters import common_voice
6 |
7 |
8 | class TestTTSFormatters(unittest.TestCase):
9 | def test_common_voice_preprocessor(self): # pylint: disable=no-self-use
10 | root_path = get_tests_input_path()
11 | meta_file = "common_voice.tsv"
12 | items = common_voice(root_path, meta_file)
13 | assert items[0]["text"] == "The applicants are invited for coffee and visa is given immediately."
14 | assert items[0]["audio_file"] == os.path.join(get_tests_input_path(), "clips", "common_voice_en_20005954.wav")
15 |
16 | assert items[-1]["text"] == "Competition for limited resources has also resulted in some local conflicts."
17 | assert items[-1]["audio_file"] == os.path.join(get_tests_input_path(), "clips", "common_voice_en_19737074.wav")
18 |
--------------------------------------------------------------------------------
/recipes/thorsten_DE/README.md:
--------------------------------------------------------------------------------
1 | # 🐸💬 TTS Thorsten Recipes
2 |
3 | For running the recipes you need the [Thorsten-Voice](https://github.com/thorstenMueller/Thorsten-Voice) dataset.
4 |
5 | You can download it manually from [the official website](https://www.thorsten-voice.de/) or use ```download_thorsten_DE.sh```. Alternatively, running any of the **train_modelX.py** scripts will download the dataset if it is not already present.
6 |
7 | Then, go to your desired model folder and run the training.
8 |
9 | Running Python files. (Choose the desired GPU ID for your run and set ```CUDA_VISIBLE_DEVICES```)
10 | ```terminal
11 | CUDA_VISIBLE_DEVICES="0" python train_modelX.py
12 | ```
13 |
14 | 💡 Note that these runs are just templates to help you start training your first model. They are not optimized for the best
15 | result. Double-check the configurations and feel free to share your experiments to find better parameters together 💪.
16 |
--------------------------------------------------------------------------------
/tests/vocoder_tests/test_vocoder_parallel_wavegan_generator.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 |
4 | from TTS.vocoder.models.parallel_wavegan_generator import ParallelWaveganGenerator
5 |
6 |
7 | def test_pwgan_generator():
8 | model = ParallelWaveganGenerator(
9 | in_channels=1,
10 | out_channels=1,
11 | kernel_size=3,
12 | num_res_blocks=30,
13 | stacks=3,
14 | res_channels=64,
15 | gate_channels=128,
16 | skip_channels=64,
17 | aux_channels=80,
18 | dropout=0.0,
19 | bias=True,
20 | use_weight_norm=True,
21 | upsample_factors=[4, 4, 4, 4],
22 | )
23 | dummy_c = torch.rand((2, 80, 5))
24 | output = model(dummy_c)
25 | assert np.all(output.shape == (2, 1, 5 * 256)), output.shape
26 | model.remove_weight_norm()
27 | output = model.inference(dummy_c)
28 | assert np.all(output.shape == (2, 1, (5 + 4) * 256))
29 |
--------------------------------------------------------------------------------
/TTS/tts/layers/align_tts/duration_predictor.py:
--------------------------------------------------------------------------------
1 | from torch import nn
2 |
3 | from TTS.tts.layers.generic.pos_encoding import PositionalEncoding
4 | from TTS.tts.layers.generic.transformer import FFTransformerBlock
5 |
6 |
7 | class DurationPredictor(nn.Module):
8 | def __init__(self, num_chars, hidden_channels, hidden_channels_ffn, num_heads):
9 | super().__init__()
10 | self.embed = nn.Embedding(num_chars, hidden_channels)
11 | self.pos_enc = PositionalEncoding(hidden_channels, dropout_p=0.1)
12 | self.FFT = FFTransformerBlock(hidden_channels, num_heads, hidden_channels_ffn, 2, 0.1)
13 | self.out_layer = nn.Conv1d(hidden_channels, 1, 1)
14 |
15 | def forward(self, text, text_lengths):
16 | # B, L -> B, L
17 | emb = self.embed(text)
18 | emb = self.pos_enc(emb.transpose(1, 2))
19 | x = self.FFT(emb, text_lengths)
20 | x = self.out_layer(x).squeeze(-1)
21 | return x
22 |
--------------------------------------------------------------------------------
/docs/source/tts_datasets.md:
--------------------------------------------------------------------------------
1 | # TTS Datasets
2 |
3 | Some of the known public datasets that we successfully applied 🐸TTS:
4 |
5 | - [English - LJ Speech](https://keithito.com/LJ-Speech-Dataset/)
6 | - [English - Nancy](http://www.cstr.ed.ac.uk/projects/blizzard/2011/lessac_blizzard2011/)
7 | - [English - TWEB](https://www.kaggle.com/bryanpark/the-world-english-bible-speech-dataset)
8 | - [English - LibriTTS](https://openslr.org/60/)
9 | - [English - VCTK](https://datashare.ed.ac.uk/handle/10283/2950)
10 | - [Multilingual - M-AI-Labs](http://www.caito.de/2019/01/the-m-ailabs-speech-dataset/)
11 | - [Spanish](https://drive.google.com/file/d/1Sm_zyBo67XHkiFhcRSQ4YaHPYM0slO_e/view?usp=sharing) - thx! @carlfm01
12 | - [German - Thorsten OGVD](https://github.com/thorstenMueller/deep-learning-german-tts)
13 | - [Japanese - Kokoro](https://www.kaggle.com/kaiida/kokoro-speech-dataset-v11-small/version/1)
14 | - [Chinese](https://www.data-baker.com/data/index/source/)
15 |
16 | Let us know if you use 🐸TTS on a different dataset.
--------------------------------------------------------------------------------
/recipes/README.md:
--------------------------------------------------------------------------------
1 | # 🐸💬 TTS Training Recipes
2 |
3 | TTS recipes are intended to host scripts that run all the necessary steps to train a TTS model on a particular dataset.
4 |
5 | For each dataset, you need to download the dataset once. Then you run the training for the model you want.
6 |
7 | Run each script from the root TTS folder as follows.
8 |
9 | ```console
10 | $ sh ./recipes/<dataset>/download_<dataset>.sh
11 | $ python recipes/<dataset>/<model_name>/train.py
12 | ```
13 |
14 | For some datasets you might need to resample the audio files. For example, the VCTK dataset can be resampled to 22050 Hz as follows.
15 |
16 | ```console
17 | python TTS/bin/resample.py --input_dir recipes/vctk/VCTK/wav48_silence_trimmed --output_sr 22050 --output_dir recipes/vctk/VCTK/wav48_silence_trimmed --n_jobs 8 --file_ext flac
18 | ```
19 |
20 | If you train a new model using TTS, feel free to share your training to expand the list of recipes.
21 |
22 | You can also open a new discussion and share your progress with the 🐸 community.
--------------------------------------------------------------------------------
/tests/vocoder_tests/test_vocoder_melgan_discriminator.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 |
4 | from TTS.vocoder.models.melgan_discriminator import MelganDiscriminator
5 | from TTS.vocoder.models.melgan_multiscale_discriminator import MelganMultiscaleDiscriminator
6 |
7 |
8 | def test_melgan_discriminator():
9 | model = MelganDiscriminator()
10 | print(model)
11 | dummy_input = torch.rand((4, 1, 256 * 10))
12 | output, _ = model(dummy_input)
13 | assert np.all(output.shape == (4, 1, 10))
14 |
15 |
16 | def test_melgan_multi_scale_discriminator():
17 | model = MelganMultiscaleDiscriminator()
18 | print(model)
19 | dummy_input = torch.rand((4, 1, 256 * 16))
20 | scores, feats = model(dummy_input)
21 | assert len(scores) == 3
22 | assert len(scores) == len(feats)
23 | assert np.all(scores[0].shape == (4, 1, 64))
24 | assert np.all(feats[0][0].shape == (4, 16, 4096))
25 | assert np.all(feats[0][1].shape == (4, 64, 1024))
26 | assert np.all(feats[0][2].shape == (4, 256, 256))
27 |
--------------------------------------------------------------------------------
/TTS/tts/layers/align_tts/mdn.py:
--------------------------------------------------------------------------------
1 | from torch import nn
2 |
3 |
4 | class MDNBlock(nn.Module):
5 | """Mixture of Density Network implementation
6 | https://arxiv.org/pdf/2003.01950.pdf
7 | """
8 |
9 | def __init__(self, in_channels, out_channels):
10 | super().__init__()
11 | self.out_channels = out_channels
12 | self.conv1 = nn.Conv1d(in_channels, in_channels, 1)
13 | self.norm = nn.LayerNorm(in_channels)
14 | self.relu = nn.ReLU()
15 | self.dropout = nn.Dropout(0.1)
16 | self.conv2 = nn.Conv1d(in_channels, out_channels, 1)
17 |
18 | def forward(self, x):
19 | o = self.conv1(x)
20 | o = o.transpose(1, 2)
21 | o = self.norm(o)
22 | o = o.transpose(1, 2)
23 | o = self.relu(o)
24 | o = self.dropout(o)
25 | mu_sigma = self.conv2(o)
26 | # TODO: check this sigmoid
27 | # mu = torch.sigmoid(mu_sigma[:, :self.out_channels//2, :])
28 | mu = mu_sigma[:, : self.out_channels // 2, :]
29 | log_sigma = mu_sigma[:, self.out_channels // 2 :, :]
30 | return mu, log_sigma
31 |
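Illustrative usage of `MDNBlock` (shapes only):

```python
# Illustrative usage: the output channels are split into means and log-sigmas.
import torch

from TTS.tts.layers.align_tts.mdn import MDNBlock

block = MDNBlock(in_channels=256, out_channels=2 * 80)
x = torch.rand(4, 256, 120)  # [B, C_in, T]
mu, log_sigma = block(x)     # each has shape [B, 80, T]
```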
--------------------------------------------------------------------------------
/tests/inference_tests/test_synthesize.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from tests import get_tests_output_path, run_cli
4 |
5 |
6 | def test_synthesize():
7 | """Test synthesize.py with diffent arguments."""
8 | output_path = os.path.join(get_tests_output_path(), "output.wav")
9 | run_cli("tts --list_models")
10 |
11 | # single speaker model
12 | run_cli(f'tts --text "This is an example." --out_path "{output_path}"')
13 | run_cli(
14 | "tts --model_name tts_models/en/ljspeech/glow-tts " f'--text "This is an example." --out_path "{output_path}"'
15 | )
16 | run_cli(
17 | "tts --model_name tts_models/en/ljspeech/glow-tts "
18 | "--vocoder_name vocoder_models/en/ljspeech/multiband-melgan "
19 | f'--text "This is an example." --out_path "{output_path}"'
20 | )
21 |
22 | # multi-speaker SC-Glow model
23 | # run_cli("tts --model_name tts_models/en/vctk/sc-glow-tts --list_speaker_idxs")
24 | # run_cli(
25 | # f'tts --model_name tts_models/en/vctk/sc-glow-tts --speaker_idx "p304" '
26 | # f'--text "This is an example." --out_path "{output_path}"'
27 | # )
28 |
--------------------------------------------------------------------------------
/TTS/server/README.md:
--------------------------------------------------------------------------------
1 | # :frog: TTS demo server
2 | Before you use the server, make sure you [install](https://github.com/coqui-ai/TTS/tree/dev#install-tts) :frog: TTS properly. Then, you can follow the steps below.
3 |
4 | **Note:** If you install :frog:TTS using ```pip```, you can also use the ```tts-server``` end point on the terminal.
5 |
6 | Example runs:
7 |
8 | List officially released models.
9 | ```python TTS/server/server.py --list_models ```
10 |
11 | Run the server with the official models.
12 | ```python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/multiband-melgan```
13 |
14 | Run the server with the official models on a GPU.
15 | ```CUDA_VISIBLE_DEVICES="0" python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/multiband-melgan --use_cuda True```
16 |
17 | Run the server with custom models.
18 | ```python TTS/server/server.py --tts_checkpoint /path/to/tts/model.pth --tts_config /path/to/tts/config.json --vocoder_checkpoint /path/to/vocoder/model.pth --vocoder_config /path/to/vocoder/config.json```
19 |
--------------------------------------------------------------------------------
/TTS/vocoder/models/fullband_melgan_generator.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | from TTS.vocoder.models.melgan_generator import MelganGenerator
4 |
5 |
6 | class FullbandMelganGenerator(MelganGenerator):
7 | def __init__(
8 | self,
9 | in_channels=80,
10 | out_channels=1,
11 | proj_kernel=7,
12 | base_channels=512,
13 | upsample_factors=(2, 8, 2, 2),
14 | res_kernel=3,
15 | num_res_blocks=4,
16 | ):
17 | super().__init__(
18 | in_channels=in_channels,
19 | out_channels=out_channels,
20 | proj_kernel=proj_kernel,
21 | base_channels=base_channels,
22 | upsample_factors=upsample_factors,
23 | res_kernel=res_kernel,
24 | num_res_blocks=num_res_blocks,
25 | )
26 |
27 | @torch.no_grad()
28 | def inference(self, cond_features):
29 | cond_features = cond_features.to(self.layers[1].weight.device)
30 | cond_features = torch.nn.functional.pad(
31 | cond_features, (self.inference_padding, self.inference_padding), "replicate"
32 | )
33 | return self.layers(cond_features)
34 |
--------------------------------------------------------------------------------
/recipes/blizzard2013/README.md:
--------------------------------------------------------------------------------
1 | # How to get the Blizzard 2013 Dataset
2 |
3 | The Capacitron model is a variational-encoder extension of standard Tacotron-based models for modeling prosody.
4 |
5 | To take full advantage of the model, it is advised to train it with a dataset that contains a significant amount of prosodic information in the utterances. A tested candidate for such applications is the blizzard2013 dataset from the Blizzard Challenge, containing many hours of high-quality audiobook recordings.
6 |
7 | To get a license and download link for this dataset, you need to visit the [website](https://www.cstr.ed.ac.uk/projects/blizzard/2013/lessac_blizzard2013/license.html) of the Centre for Speech Technology Research of the University of Edinburgh.
8 |
9 | You will get access to the raw dataset in a couple of days. There are a few preprocessing steps you need to follow to be able to use the high-fidelity dataset.
10 |
11 | 1. Get the forced time alignments for the blizzard dataset from [here](https://github.com/mueller91/tts_alignments).
12 | 2. Segment the high fidelity audio-book files based on the instructions [here](https://github.com/Tomiinek/Blizzard2013_Segmentation).
--------------------------------------------------------------------------------
/docs/source/index.md:
--------------------------------------------------------------------------------
1 |
2 | ```{include} ../../README.md
3 | :relative-images:
4 | ```
5 | ----
6 |
7 | # Documentation Content
8 | ```{eval-rst}
9 | .. toctree::
10 | :maxdepth: 2
11 | :caption: Get started
12 |
13 | tutorial_for_nervous_beginners
14 | installation
15 | faq
16 | contributing
17 |
18 | .. toctree::
19 | :maxdepth: 2
20 | :caption: Using 🐸TTS
21 |
22 | inference
23 | implementing_a_new_model
24 | training_a_model
25 | finetuning
26 | configuration
27 | formatting_your_dataset
28 | what_makes_a_good_dataset
29 | tts_datasets
30 |
31 | .. toctree::
32 | :maxdepth: 2
33 | :caption: Main Classes
34 |
35 | main_classes/trainer_api
36 | main_classes/audio_processor
37 | main_classes/model_api
38 | main_classes/dataset
39 | main_classes/gan
40 | main_classes/speaker_manager
41 |
42 | .. toctree::
43 | :maxdepth: 2
44 | :caption: `tts` Models
45 |
46 | models/glow_tts.md
47 | models/vits.md
48 | models/forward_tts.md
49 | models/tacotron1-2.md
50 |
51 | .. toctree::
52 | :maxdepth: 2
53 | :caption: `vocoder` Models
54 |
55 | ```
56 |
57 |
--------------------------------------------------------------------------------
/TTS/bin/collect_env_info.py:
--------------------------------------------------------------------------------
1 | """Get detailed info about the working environment."""
2 | import os
3 | import platform
4 | import sys
5 |
6 | import numpy
7 | import torch
8 |
9 | sys.path += [os.path.abspath(".."), os.path.abspath(".")]
10 | import json
11 |
12 | import TTS
13 |
14 |
15 | def system_info():
16 | return {
17 | "OS": platform.system(),
18 | "architecture": platform.architecture(),
19 | "version": platform.version(),
20 | "processor": platform.processor(),
21 | "python": platform.python_version(),
22 | }
23 |
24 |
25 | def cuda_info():
26 | return {
27 | "GPU": [torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())],
28 | "available": torch.cuda.is_available(),
29 | "version": torch.version.cuda,
30 | }
31 |
32 |
33 | def package_info():
34 | return {
35 | "numpy": numpy.__version__,
36 | "PyTorch_version": torch.__version__,
37 | "PyTorch_debug": torch.version.debug,
38 | "TTS": TTS.__version__,
39 | }
40 |
41 |
42 | def main():
43 | details = {"System": system_info(), "CUDA": cuda_info(), "Packages": package_info()}
44 | print(json.dumps(details, indent=4, sort_keys=True))
45 |
46 |
47 | if __name__ == "__main__":
48 | main()
49 |
--------------------------------------------------------------------------------
/tests/inputs/common_voice.tsv:
--------------------------------------------------------------------------------
1 | client_id path sentence up_votes down_votes age gender accent locale segment
2 | 95324d489b122a800b840e0b0d068f7363a1a6c2cd2e7365672cc7033e38deaa794bd59edcf8196aa35c9791652b9085ac3839a98bb50ebab4a1e8538a94846b common_voice_en_20005954.mp3 The applicants are invited for coffee and visa is given immediately. 3 0 en
3 | 95324d489b122a800b840e0b0d068f7363a1a6c2cd2e7365672cc7033e38deaa794bd59edcf8196aa35c9791652b9085ac3839a98bb50ebab4a1e8538a94846b common_voice_en_20005955.mp3 Developmental robotics is related to, but differs from, evolutionary robotics. 2 0 en
4 | 95324d489b122a800b840e0b0d068f7363a1a6c2cd2e7365672cc7033e38deaa794bd59edcf8196aa35c9791652b9085ac3839a98bb50ebab4a1e8538a94846b common_voice_en_20005956.mp3 The musical was originally directed and choreographed by Alan Lund. 2 0 en
5 | 954a4181ae9fba89d1b1570f2ae148b3ee18ee2311de978e698f598db859f830d93d35574596d713518e8c96cdae01fce7a08c60c2e0a22bcf01e020924440a6 common_voice_en_19737073.mp3 He graduated from Columbia High School, in Brown County, South Dakota. 2 0 en
6 | 954a4181ae9fba89d1b1570f2ae148b3ee18ee2311de978e698f598db859f830d93d35574596d713518e8c96cdae01fce7a08c60c2e0a22bcf01e020924440a6 common_voice_en_19737074.mp3 Competition for limited resources has also resulted in some local conflicts. 2 0 en
7 |
--------------------------------------------------------------------------------
/TTS/tts/layers/feed_forward/duration_predictor.py:
--------------------------------------------------------------------------------
1 | from torch import nn
2 |
3 | from TTS.tts.layers.generic.res_conv_bn import Conv1dBN
4 |
5 |
6 | class DurationPredictor(nn.Module):
7 | """Speedy Speech duration predictor model.
8 | Predicts phoneme durations from encoder outputs.
9 |
10 | Note:
11 | Outputs interpreted as log(durations)
12 | To get actual durations, do exp transformation
13 |
14 | conv_BN_4x1 -> conv_BN_3x1 -> conv_BN_1x1 -> conv_1x1
15 |
16 | Args:
17 | hidden_channels (int): number of channels in the inner layers.
18 | """
19 |
20 | def __init__(self, hidden_channels):
21 |
22 | super().__init__()
23 |
24 | self.layers = nn.ModuleList(
25 | [
26 | Conv1dBN(hidden_channels, hidden_channels, 4, 1),
27 | Conv1dBN(hidden_channels, hidden_channels, 3, 1),
28 | Conv1dBN(hidden_channels, hidden_channels, 1, 1),
29 | nn.Conv1d(hidden_channels, 1, 1),
30 | ]
31 | )
32 |
33 | def forward(self, x, x_mask):
34 | """
35 | Shapes:
36 | x: [B, C, T]
37 | x_mask: [B, 1, T]
38 | """
39 | o = x
40 | for layer in self.layers:
41 | o = layer(o) * x_mask
42 | return o
43 |
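Illustrative usage; as the docstring notes, the outputs are log-durations, so exponentiate to get frame counts:

```python
# Illustrative usage; outputs are log(durations), exponentiate to get durations.
import torch

from TTS.tts.layers.feed_forward.duration_predictor import DurationPredictor

dp = DurationPredictor(hidden_channels=128)
x = torch.rand(2, 128, 50)     # encoder outputs [B, C, T]
x_mask = torch.ones(2, 1, 50)  # [B, 1, T]
log_durations = dp(x, x_mask)  # [B, 1, T]
durations = torch.exp(log_durations)
```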
--------------------------------------------------------------------------------
/TTS/tts/utils/text/chinese_mandarin/phonemizer.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 |
3 | import jieba
4 | import pypinyin
5 |
6 | from .pinyinToPhonemes import PINYIN_DICT
7 |
8 |
9 | def _chinese_character_to_pinyin(text: str) -> List[str]:
10 | pinyins = pypinyin.pinyin(text, style=pypinyin.Style.TONE3, heteronym=False, neutral_tone_with_five=True)
11 | pinyins_flat_list = [item for sublist in pinyins for item in sublist]
12 | return pinyins_flat_list
13 |
14 |
15 | def _chinese_pinyin_to_phoneme(pinyin: str) -> str:
16 | segment = pinyin[:-1]
17 | tone = pinyin[-1]
18 | phoneme = PINYIN_DICT.get(segment, [""])[0]
19 | return phoneme + tone
20 |
21 |
22 | def chinese_text_to_phonemes(text: str, seperator: str = "|") -> str:
23 | tokenized_text = jieba.cut(text, HMM=False)
24 | tokenized_text = " ".join(tokenized_text)
25 | pinyined_text: List[str] = _chinese_character_to_pinyin(tokenized_text)
26 |
27 | results: List[str] = []
28 |
29 | for token in pinyined_text:
30 | if token[-1] in "12345": # TODO transform to is_pinyin()
31 | pinyin_phonemes = _chinese_pinyin_to_phoneme(token)
32 |
33 | results += list(pinyin_phonemes)
34 | else: # is ponctuation or other
35 | results += list(token)
36 |
37 | return seperator.join(results)
38 |
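Illustrative call (the exact phoneme string depends on `PINYIN_DICT`):

```python
# Illustrative call; the exact output depends on PINYIN_DICT.
from TTS.tts.utils.text.chinese_mandarin.phonemizer import chinese_text_to_phonemes

print(chinese_text_to_phonemes("你好"))  # phonemes and tone marks joined by "|"
```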
--------------------------------------------------------------------------------
/recipes/ljspeech/univnet/train.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from trainer import Trainer, TrainerArgs
4 |
5 | from TTS.utils.audio import AudioProcessor
6 | from TTS.vocoder.configs import UnivnetConfig
7 | from TTS.vocoder.datasets.preprocess import load_wav_data
8 | from TTS.vocoder.models.gan import GAN
9 |
10 | output_path = os.path.dirname(os.path.abspath(__file__))
11 | config = UnivnetConfig(
12 | batch_size=64,
13 | eval_batch_size=16,
14 | num_loader_workers=4,
15 | num_eval_loader_workers=4,
16 | run_eval=True,
17 | test_delay_epochs=-1,
18 | epochs=1000,
19 | seq_len=8192,
20 | pad_short=2000,
21 | use_noise_augment=True,
22 | eval_split_size=10,
23 | print_step=25,
24 | print_eval=False,
25 | mixed_precision=False,
26 | lr_gen=1e-4,
27 | lr_disc=1e-4,
28 | data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"),
29 | output_path=output_path,
30 | )
31 |
32 | # init audio processor
33 | ap = AudioProcessor(**config.audio.to_dict())
34 |
35 | # load training samples
36 | eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)
37 |
38 | # init model
39 | model = GAN(config, ap)
40 |
41 | # init the trainer and 🚀
42 | trainer = Trainer(
43 | TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
44 | )
45 | trainer.fit()
46 |
--------------------------------------------------------------------------------
/recipes/ljspeech/hifigan/train_hifigan.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from trainer import Trainer, TrainerArgs
4 |
5 | from TTS.utils.audio import AudioProcessor
6 | from TTS.vocoder.configs import HifiganConfig
7 | from TTS.vocoder.datasets.preprocess import load_wav_data
8 | from TTS.vocoder.models.gan import GAN
9 |
10 | output_path = os.path.dirname(os.path.abspath(__file__))
11 |
12 | config = HifiganConfig(
13 | batch_size=32,
14 | eval_batch_size=16,
15 | num_loader_workers=4,
16 | num_eval_loader_workers=4,
17 | run_eval=True,
18 | test_delay_epochs=5,
19 | epochs=1000,
20 | seq_len=8192,
21 | pad_short=2000,
22 | use_noise_augment=True,
23 | eval_split_size=10,
24 | print_step=25,
25 | print_eval=False,
26 | mixed_precision=False,
27 | lr_gen=1e-4,
28 | lr_disc=1e-4,
29 | data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"),
30 | output_path=output_path,
31 | )
32 |
33 | # init audio processor
34 | ap = AudioProcessor(**config.audio.to_dict())
35 |
36 | # load training samples
37 | eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)
38 |
39 | # init model
40 | model = GAN(config, ap)
41 |
42 | # init the trainer and 🚀
43 | trainer = Trainer(
44 | TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
45 | )
46 | trainer.fit()
47 |
--------------------------------------------------------------------------------
/TTS/encoder/README.md:
--------------------------------------------------------------------------------
1 | ### Speaker Encoder
2 |
3 | This is an implementation of https://arxiv.org/abs/1710.10467. This model can be used for voice and speaker embedding.
4 |
5 | With the code here you can generate d-vectors for both multi-speaker and single-speaker TTS datasets, then visualise and explore them along with the associated audio files in an interactive chart.
6 |
7 | Below is an example showing embedding results of various speakers. You can generate the same plot with the provided notebook as demonstrated in [this video](https://youtu.be/KW3oO7JVa7Q).
8 |
9 | 
10 |
11 | Download a pretrained model from [Released Models](https://github.com/mozilla/TTS/wiki/Released-Models) page.
12 |
13 | To run the code, you need to follow the same flow as in TTS.
14 |
15 | - Define 'config.json' for your needs. Note that, audio parameters should match your TTS model.
16 | - Example training call ```python speaker_encoder/train.py --config_path speaker_encoder/config.json --data_path ~/Data/Libri-TTS/train-clean-360```
17 | - Generate embedding vectors ```python speaker_encoder/compute_embeddings.py --use_cuda true /model/path/best_model.pth model/config/path/config.json dataset/path/ output_path``` . This code parses all .wav files at the given dataset path and generates the same folder structure under the output path with the generated embedding files.
18 | - Watch training on Tensorboard as in TTS
19 |
--------------------------------------------------------------------------------
/TTS/tts/utils/text/english/time_norm.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | import inflect
4 |
5 | _inflect = inflect.engine()
6 |
7 | _time_re = re.compile(
8 | r"""\b
9 | ((0?[0-9])|(1[0-1])|(1[2-9])|(2[0-3])) # hours
10 | :
11 | ([0-5][0-9]) # minutes
12 | \s*(a\.m\.|am|pm|p\.m\.|a\.m|p\.m)? # am/pm
13 | \b""",
14 | re.IGNORECASE | re.X,
15 | )
16 |
17 |
18 | def _expand_num(n: int) -> str:
19 | return _inflect.number_to_words(n)
20 |
21 |
22 | def _expand_time_english(match: "re.Match") -> str:
23 | hour = int(match.group(1))
24 | past_noon = hour >= 12
25 | time = []
26 | if hour > 12:
27 | hour -= 12
28 | elif hour == 0:
29 | hour = 12
30 | past_noon = True
31 | time.append(_expand_num(hour))
32 |
33 | minute = int(match.group(6))
34 | if minute > 0:
35 | if minute < 10:
36 | time.append("oh")
37 | time.append(_expand_num(minute))
38 | am_pm = match.group(7)
39 | if am_pm is None:
40 | time.append("p m" if past_noon else "a m")
41 | else:
42 | time.extend(list(am_pm.replace(".", "")))
43 | return " ".join(time)
44 |
45 |
46 | def expand_time_english(text: str) -> str:
47 | return re.sub(_time_re, _expand_time_english, text)
48 |
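A quick usage sketch for `expand_time_english` (not part of the module; the sample sentence is made up):

```python
# Hypothetical usage of expand_time_english; the sentence is made up.
from TTS.tts.utils.text.english.time_norm import expand_time_english

print(expand_time_english("The train leaves at 5:30 pm."))
# -> "The train leaves at five thirty p m."
```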
--------------------------------------------------------------------------------
/recipes/ljspeech/multiband_melgan/train_multiband_melgan.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from trainer import Trainer, TrainerArgs
4 |
5 | from TTS.utils.audio import AudioProcessor
6 | from TTS.vocoder.configs import MultibandMelganConfig
7 | from TTS.vocoder.datasets.preprocess import load_wav_data
8 | from TTS.vocoder.models.gan import GAN
9 |
10 | output_path = os.path.dirname(os.path.abspath(__file__))
11 |
12 | config = MultibandMelganConfig(
13 | batch_size=32,
14 | eval_batch_size=16,
15 | num_loader_workers=4,
16 | num_eval_loader_workers=4,
17 | run_eval=True,
18 | test_delay_epochs=5,
19 | epochs=1000,
20 | seq_len=8192,
21 | pad_short=2000,
22 | use_noise_augment=True,
23 | eval_split_size=10,
24 | print_step=25,
25 | print_eval=False,
26 | mixed_precision=False,
27 | lr_gen=1e-4,
28 | lr_disc=1e-4,
29 | data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"),
30 | output_path=output_path,
31 | )
32 |
33 | # init audio processor
34 | ap = AudioProcessor(**config.audio.to_dict())
35 |
36 | # load training samples
37 | eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)
38 |
39 | # init model
40 | model = GAN(config, ap)
41 |
42 | # init the trainer and 🚀
43 | trainer = Trainer(
44 | TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
45 | )
46 | trainer.fit()
47 |
--------------------------------------------------------------------------------
/recipes/ljspeech/wavegrad/train_wavegrad.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from trainer import Trainer, TrainerArgs
4 |
5 | from TTS.utils.audio import AudioProcessor
6 | from TTS.vocoder.configs import WavegradConfig
7 | from TTS.vocoder.datasets.preprocess import load_wav_data
8 | from TTS.vocoder.models.wavegrad import Wavegrad
9 |
10 | output_path = os.path.dirname(os.path.abspath(__file__))
11 | config = WavegradConfig(
12 | batch_size=32,
13 | eval_batch_size=16,
14 | num_loader_workers=4,
15 | num_eval_loader_workers=4,
16 | run_eval=True,
17 | test_delay_epochs=-1,
18 | epochs=1000,
19 | seq_len=6144,
20 | pad_short=2000,
21 | use_noise_augment=True,
22 | eval_split_size=50,
23 | print_step=50,
24 | print_eval=True,
25 | mixed_precision=False,
26 | data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"),
27 | output_path=output_path,
28 | )
29 |
30 | # init audio processor
31 | ap = AudioProcessor(**config.audio.to_dict())
32 |
33 | # load training samples
34 | eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)
35 |
36 | # init model
37 | model = Wavegrad(config)
38 |
39 | # init the trainer and 🚀
40 | trainer = Trainer(
41 | TrainerArgs(),
42 | config,
43 | output_path,
44 | model=model,
45 | train_samples=train_samples,
46 | eval_samples=eval_samples,
47 | training_assets={"audio_processor": ap},
48 | )
49 | trainer.fit()
50 |
--------------------------------------------------------------------------------
/TTS/tts/utils/monotonic_align/core.pyx:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | cimport cython
4 | cimport numpy as np
5 |
6 | from cython.parallel import prange
7 |
8 |
9 | @cython.boundscheck(False)
10 | @cython.wraparound(False)
11 | cdef void maximum_path_each(int[:,::1] path, float[:,::1] value, int t_x, int t_y, float max_neg_val) nogil:
12 | cdef int x
13 | cdef int y
14 | cdef float v_prev
15 | cdef float v_cur
16 | cdef float tmp
17 | cdef int index = t_x - 1
18 |
19 | for y in range(t_y):
20 | for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)):
21 | if x == y:
22 | v_cur = max_neg_val
23 | else:
24 | v_cur = value[x, y-1]
25 | if x == 0:
26 | if y == 0:
27 | v_prev = 0.
28 | else:
29 | v_prev = max_neg_val
30 | else:
31 | v_prev = value[x-1, y-1]
32 | value[x, y] = max(v_cur, v_prev) + value[x, y]
33 |
34 | for y in range(t_y - 1, -1, -1):
35 | path[index, y] = 1
36 | if index != 0 and (index == y or value[index, y-1] < value[index-1, y-1]):
37 | index = index - 1
38 |
39 |
40 | @cython.boundscheck(False)
41 | @cython.wraparound(False)
42 | cpdef void maximum_path_c(int[:,:,::1] paths, float[:,:,::1] values, int[::1] t_xs, int[::1] t_ys, float max_neg_val=-1e9) nogil:
43 | cdef int b = values.shape[0]
44 |
45 | cdef int i
46 | for i in prange(b, nogil=True):
47 | maximum_path_each(paths[i], values[i], t_xs[i], t_ys[i], max_neg_val)
48 |
--------------------------------------------------------------------------------
/tests/vocoder_tests/test_hifigan_train.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import os
3 | import shutil
4 |
5 | from tests import get_device_id, get_tests_output_path, run_cli
6 | from TTS.vocoder.configs import HifiganConfig
7 |
8 | config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json")
9 | output_path = os.path.join(get_tests_output_path(), "train_outputs")
10 |
11 |
12 | config = HifiganConfig(
13 | batch_size=8,
14 | eval_batch_size=8,
15 | num_loader_workers=0,
16 | num_eval_loader_workers=0,
17 | run_eval=True,
18 | test_delay_epochs=-1,
19 | epochs=1,
20 | seq_len=1024,
21 | eval_split_size=1,
22 | print_step=1,
23 | print_eval=True,
24 | data_path="tests/data/ljspeech",
25 | output_path=output_path,
26 | )
27 | config.audio.do_trim_silence = True
28 | config.audio.trim_db = 60
29 | config.save_json(config_path)
30 |
31 | # train the model for one epoch
32 | command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} "
33 | run_cli(command_train)
34 |
35 | # Find latest folder
36 | continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime)
37 |
38 | # restore the model and continue training for one more epoch
39 | command_train = (
40 | f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} "
41 | )
42 | run_cli(command_train)
43 | shutil.rmtree(continue_path)
44 |
--------------------------------------------------------------------------------
/TTS/tts/layers/generic/gated_conv.py:
--------------------------------------------------------------------------------
1 | from torch import nn
2 |
3 | from .normalization import LayerNorm
4 |
5 |
6 | class GatedConvBlock(nn.Module):
7 | """Gated convolutional block as in https://arxiv.org/pdf/1612.08083.pdf
8 | Args:
9 | in_out_channels (int): number of input/output channels.
10 | kernel_size (int): convolution kernel size.
11 | dropout_p (float): dropout rate.
12 | """
13 |
14 | def __init__(self, in_out_channels, kernel_size, dropout_p, num_layers):
15 | super().__init__()
16 | # class arguments
17 | self.dropout_p = dropout_p
18 | self.num_layers = num_layers
19 | # define layers
20 | self.conv_layers = nn.ModuleList()
21 | self.norm_layers = nn.ModuleList()
22 | self.layers = nn.ModuleList()
23 | for _ in range(num_layers):
24 | self.conv_layers += [nn.Conv1d(in_out_channels, 2 * in_out_channels, kernel_size, padding=kernel_size // 2)]
25 | self.norm_layers += [LayerNorm(2 * in_out_channels)]
26 |
27 | def forward(self, x, x_mask):
28 | o = x
29 | res = x
30 | for idx in range(self.num_layers):
31 | o = nn.functional.dropout(o, p=self.dropout_p, training=self.training)
32 | o = self.conv_layers[idx](o * x_mask)
33 | o = self.norm_layers[idx](o)
34 | o = nn.functional.glu(o, dim=1)
35 | o = res + o
36 | res = o
37 | return o
38 |
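A quick shape-check sketch for `GatedConvBlock` (illustrative only; the tensor sizes are arbitrary). The block keeps the channel count and sequence length unchanged, since each gated convolution doubles the channels and the GLU halves them again:

```python
# Illustrative shape check for GatedConvBlock; sizes are arbitrary.
import torch

from TTS.tts.layers.generic.gated_conv import GatedConvBlock

block = GatedConvBlock(in_out_channels=64, kernel_size=5, dropout_p=0.1, num_layers=3)
x = torch.randn(2, 64, 100)     # (batch, channels, time)
x_mask = torch.ones(2, 1, 100)  # 1.0 for valid frames, 0.0 for padding
y = block(x, x_mask)
print(y.shape)  # torch.Size([2, 64, 100])
```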
--------------------------------------------------------------------------------
/tests/vocoder_tests/test_parallel_wavegan_train.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import os
3 | import shutil
4 |
5 | from tests import get_device_id, get_tests_output_path, run_cli
6 | from TTS.vocoder.configs import ParallelWaveganConfig
7 |
8 | config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json")
9 | output_path = os.path.join(get_tests_output_path(), "train_outputs")
10 |
11 | config = ParallelWaveganConfig(
12 | batch_size=4,
13 | eval_batch_size=4,
14 | num_loader_workers=0,
15 | num_eval_loader_workers=0,
16 | run_eval=True,
17 | test_delay_epochs=-1,
18 | epochs=1,
19 | seq_len=2048,
20 | eval_split_size=1,
21 | print_step=1,
22 | print_eval=True,
23 | data_path="tests/data/ljspeech",
24 | output_path=output_path,
25 | )
26 | config.audio.do_trim_silence = True
27 | config.audio.trim_db = 60
28 | config.save_json(config_path)
29 |
30 | # train the model for one epoch
31 | command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} "
32 | run_cli(command_train)
33 |
34 | # Find latest folder
35 | continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime)
36 |
37 | # restore the model and continue training for one more epoch
38 | command_train = (
39 | f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} "
40 | )
41 | run_cli(command_train)
42 | shutil.rmtree(continue_path)
43 |
--------------------------------------------------------------------------------
/TTS/vocoder/models/multiband_melgan_generator.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | from TTS.vocoder.layers.pqmf import PQMF
4 | from TTS.vocoder.models.melgan_generator import MelganGenerator
5 |
6 |
7 | class MultibandMelganGenerator(MelganGenerator):
8 | def __init__(
9 | self,
10 | in_channels=80,
11 | out_channels=4,
12 | proj_kernel=7,
13 | base_channels=384,
14 | upsample_factors=(2, 8, 2, 2),
15 | res_kernel=3,
16 | num_res_blocks=3,
17 | ):
18 | super().__init__(
19 | in_channels=in_channels,
20 | out_channels=out_channels,
21 | proj_kernel=proj_kernel,
22 | base_channels=base_channels,
23 | upsample_factors=upsample_factors,
24 | res_kernel=res_kernel,
25 | num_res_blocks=num_res_blocks,
26 | )
27 | self.pqmf_layer = PQMF(N=4, taps=62, cutoff=0.15, beta=9.0)
28 |
29 | def pqmf_analysis(self, x):
30 | return self.pqmf_layer.analysis(x)
31 |
32 | def pqmf_synthesis(self, x):
33 | return self.pqmf_layer.synthesis(x)
34 |
35 | @torch.no_grad()
36 | def inference(self, cond_features):
37 | cond_features = cond_features.to(self.layers[1].weight.device)
38 | cond_features = torch.nn.functional.pad(
39 | cond_features, (self.inference_padding, self.inference_padding), "replicate"
40 | )
41 | return self.pqmf_synthesis(self.layers(cond_features))
42 |
--------------------------------------------------------------------------------
/recipes/kokoro/tacotron2-DDC/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # take the script's parent directory to prefix all the output paths.
3 | RUN_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
4 | CORPUS=kokoro-speech-v1_1-small
5 | echo $RUN_DIR
6 | if [ \! -d $RUN_DIR/$CORPUS ] ; then
7 | echo "$RUN_DIR/$CORPUS doesn't exist."
8 | echo "Follow the instruction of https://github.com/kaiidams/Kokoro-Speech-Dataset to make the corpus."
9 | exit 1
10 | fi
11 | # create train-val splits
12 | shuf $RUN_DIR/$CORPUS/metadata.csv > $RUN_DIR/$CORPUS/metadata_shuf.csv
13 | head -n 8000 $RUN_DIR/$CORPUS/metadata_shuf.csv > $RUN_DIR/$CORPUS/metadata_train.csv
14 | tail -n 812 $RUN_DIR/$CORPUS/metadata_shuf.csv > $RUN_DIR/$CORPUS/metadata_val.csv
15 | # compute dataset mean and variance for normalization
16 | python TTS/bin/compute_statistics.py $RUN_DIR/tacotron2-DDC.json $RUN_DIR/scale_stats.npy --data_path $RUN_DIR/$CORPUS/wavs/
17 | # training ....
18 | # change the GPU id if needed
19 | CUDA_VISIBLE_DEVICES="0" python TTS/bin/train_tts.py --config_path $RUN_DIR/tacotron2-DDC.json \
20 | --coqpit.output_path $RUN_DIR \
21 | --coqpit.datasets.0.path $RUN_DIR/$CORPUS \
22 | --coqpit.audio.stats_path $RUN_DIR/scale_stats.npy \
23 | --coqpit.phoneme_cache_path $RUN_DIR/phoneme_cache \
--------------------------------------------------------------------------------
/recipes/ljspeech/wavernn/train_wavernn.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from trainer import Trainer, TrainerArgs
4 |
5 | from TTS.utils.audio import AudioProcessor
6 | from TTS.vocoder.configs import WavernnConfig
7 | from TTS.vocoder.datasets.preprocess import load_wav_data
8 | from TTS.vocoder.models.wavernn import Wavernn
9 |
10 | output_path = os.path.dirname(os.path.abspath(__file__))
11 | config = WavernnConfig(
12 | batch_size=64,
13 | eval_batch_size=16,
14 | num_loader_workers=4,
15 | num_eval_loader_workers=4,
16 | run_eval=True,
17 | test_delay_epochs=-1,
18 | epochs=10000,
19 | seq_len=1280,
20 | pad_short=2000,
21 | use_noise_augment=False,
22 | eval_split_size=10,
23 | print_step=25,
24 | print_eval=True,
25 | mixed_precision=False,
26 | lr=1e-4,
27 | grad_clip=4,
28 | data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"),
29 | output_path=output_path,
30 | )
31 |
32 | # init audio processor
33 | ap = AudioProcessor(**config.audio.to_dict())
34 |
35 | # load training samples
36 | eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)
37 |
38 | # init model
39 | model = Wavernn(config)
40 |
41 | # init the trainer and 🚀
42 | trainer = Trainer(
43 | TrainerArgs(),
44 | config,
45 | output_path,
46 | model=model,
47 | train_samples=train_samples,
48 | eval_samples=eval_samples,
49 | training_assets={"audio_processor": ap},
50 | )
51 | trainer.fit()
52 |
--------------------------------------------------------------------------------
/.github/workflows/style_check.yml:
--------------------------------------------------------------------------------
1 | name: style-check
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 | pull_request:
8 | types: [opened, synchronize, reopened]
9 | jobs:
10 | check_skip:
11 | runs-on: ubuntu-latest
12 | if: "! contains(github.event.head_commit.message, '[ci skip]')"
13 | steps:
14 | - run: echo "${{ github.event.head_commit.message }}"
15 |
16 | test:
17 | runs-on: ubuntu-latest
18 | strategy:
19 | fail-fast: false
20 | matrix:
21 | python-version: [3.9]
22 | experimental: [false]
23 | steps:
24 | - uses: actions/checkout@v2
25 | - name: Set up Python ${{ matrix.python-version }}
26 | uses: coqui-ai/setup-python@pip-cache-key-py-ver
27 | with:
28 | python-version: ${{ matrix.python-version }}
29 | architecture: x64
30 | cache: 'pip'
31 | cache-dependency-path: 'requirements*'
32 | - name: check OS
33 | run: cat /etc/os-release
34 | - name: Install dependencies
35 | run: |
36 | sudo apt-get update
37 | sudo apt-get install -y git make gcc
38 | make system-deps
39 | - name: Install/upgrade Python setup deps
40 | run: python3 -m pip install --upgrade pip setuptools wheel
41 | - name: Install TTS
42 | run: |
43 | python3 -m pip install .[all]
44 | python3 setup.py egg_info
45 | - name: Lint check
46 | run: |
47 | make lint
--------------------------------------------------------------------------------
/.github/workflows/vocoder_tests.yml:
--------------------------------------------------------------------------------
1 | name: vocoder-tests
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 | pull_request:
8 | types: [opened, synchronize, reopened]
9 | jobs:
10 | check_skip:
11 | runs-on: ubuntu-latest
12 | if: "! contains(github.event.head_commit.message, '[ci skip]')"
13 | steps:
14 | - run: echo "${{ github.event.head_commit.message }}"
15 |
16 | test:
17 | runs-on: ubuntu-latest
18 | strategy:
19 | fail-fast: false
20 | matrix:
21 | python-version: [3.7, 3.8, 3.9, "3.10"]
22 | experimental: [false]
23 | steps:
24 | - uses: actions/checkout@v2
25 | - name: Set up Python ${{ matrix.python-version }}
26 | uses: coqui-ai/setup-python@pip-cache-key-py-ver
27 | with:
28 | python-version: ${{ matrix.python-version }}
29 | architecture: x64
30 | cache: 'pip'
31 | cache-dependency-path: 'requirements*'
32 | - name: check OS
33 | run: cat /etc/os-release
34 | - name: Install dependencies
35 | run: |
36 | sudo apt-get update
37 | sudo apt-get install -y git make gcc
38 | make system-deps
39 | - name: Install/upgrade Python setup deps
40 | run: python3 -m pip install --upgrade pip setuptools wheel
41 | - name: Install TTS
42 | run: |
43 | python3 -m pip install .[all]
44 | python3 setup.py egg_info
45 | - name: Unit tests
46 | run: make test_vocoder
47 |
--------------------------------------------------------------------------------
/tests/vocoder_tests/test_vocoder_parallel_wavegan_discriminator.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 |
4 | from TTS.vocoder.models.parallel_wavegan_discriminator import (
5 | ParallelWaveganDiscriminator,
6 | ResidualParallelWaveganDiscriminator,
7 | )
8 |
9 |
10 | def test_pwgan_discriminator():
11 | model = ParallelWaveganDiscriminator(
12 | in_channels=1,
13 | out_channels=1,
14 | kernel_size=3,
15 | num_layers=10,
16 | conv_channels=64,
17 | dilation_factor=1,
18 | nonlinear_activation="LeakyReLU",
19 | nonlinear_activation_params={"negative_slope": 0.2},
20 | bias=True,
21 | )
22 | dummy_x = torch.rand((4, 1, 64 * 256))
23 | output = model(dummy_x)
24 | assert np.all(output.shape == (4, 1, 64 * 256))
25 | model.remove_weight_norm()
26 |
27 |
28 | def test_residual_pwgan_discriminator():
29 | model = ResidualParallelWaveganDiscriminator(
30 | in_channels=1,
31 | out_channels=1,
32 | kernel_size=3,
33 | num_layers=30,
34 | stacks=3,
35 | res_channels=64,
36 | gate_channels=128,
37 | skip_channels=64,
38 | dropout=0.0,
39 | bias=True,
40 | nonlinear_activation="LeakyReLU",
41 | nonlinear_activation_params={"negative_slope": 0.2},
42 | )
43 | dummy_x = torch.rand((4, 1, 64 * 256))
44 | output = model(dummy_x)
45 | assert np.all(output.shape == (4, 1, 64 * 256))
46 | model.remove_weight_norm()
47 |
--------------------------------------------------------------------------------
/TTS/encoder/utils/io.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import os
3 |
4 | from TTS.utils.io import save_fsspec
5 |
6 |
7 | def save_checkpoint(model, optimizer, model_loss, out_path, current_step):
8 | checkpoint_path = "checkpoint_{}.pth".format(current_step)
9 | checkpoint_path = os.path.join(out_path, checkpoint_path)
10 | print(" | | > Checkpoint saving : {}".format(checkpoint_path))
11 |
12 | new_state_dict = model.state_dict()
13 | state = {
14 | "model": new_state_dict,
15 | "optimizer": optimizer.state_dict() if optimizer is not None else None,
16 | "step": current_step,
17 | "loss": model_loss,
18 | "date": datetime.date.today().strftime("%B %d, %Y"),
19 | }
20 | save_fsspec(state, checkpoint_path)
21 |
22 |
23 | def save_best_model(model, optimizer, model_loss, best_loss, out_path, current_step):
24 | if model_loss < best_loss:
25 | new_state_dict = model.state_dict()
26 | state = {
27 | "model": new_state_dict,
28 | "optimizer": optimizer.state_dict(),
29 | "step": current_step,
30 | "loss": model_loss,
31 | "date": datetime.date.today().strftime("%B %d, %Y"),
32 | }
33 | best_loss = model_loss
34 | bestmodel_path = "best_model.pth"
35 | bestmodel_path = os.path.join(out_path, bestmodel_path)
36 | print("\n > BEST MODEL ({0:.5f}) : {1:}".format(model_loss, bestmodel_path))
37 | save_fsspec(state, bestmodel_path)
38 | return best_loss
39 |
--------------------------------------------------------------------------------
/tests/vocoder_tests/test_wavegrad_train.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import os
3 | import shutil
4 |
5 | from tests import get_device_id, get_tests_output_path, run_cli
6 | from TTS.vocoder.configs import WavegradConfig
7 |
8 | config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json")
9 | output_path = os.path.join(get_tests_output_path(), "train_outputs")
10 |
11 | config = WavegradConfig(
12 | batch_size=8,
13 | eval_batch_size=8,
14 | num_loader_workers=0,
15 | num_eval_loader_workers=0,
16 | run_eval=True,
17 | test_delay_epochs=-1,
18 | epochs=1,
19 | seq_len=8192,
20 | eval_split_size=1,
21 | print_step=1,
22 | print_eval=True,
23 | data_path="tests/data/ljspeech",
24 | output_path=output_path,
25 | test_noise_schedule={"min_val": 1e-6, "max_val": 1e-2, "num_steps": 2},
26 | )
27 | config.audio.do_trim_silence = True
28 | config.audio.trim_db = 60
29 | config.save_json(config_path)
30 |
31 | # train the model for one epoch
32 | command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} "
33 | run_cli(command_train)
34 |
35 | # Find latest folder
36 | continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime)
37 |
38 | # restore the model and continue training for one more epoch
39 | command_train = (
40 | f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} "
41 | )
42 | run_cli(command_train)
43 | shutil.rmtree(continue_path)
44 |
--------------------------------------------------------------------------------
/TTS/encoder/utils/visual.py:
--------------------------------------------------------------------------------
1 | import matplotlib
2 | import matplotlib.pyplot as plt
3 | import numpy as np
4 | import umap
5 |
6 | matplotlib.use("Agg")
7 |
8 |
9 | colormap = (
10 | np.array(
11 | [
12 | [76, 255, 0],
13 | [0, 127, 70],
14 | [255, 0, 0],
15 | [255, 217, 38],
16 | [0, 135, 255],
17 | [165, 0, 165],
18 | [255, 167, 255],
19 | [0, 255, 255],
20 | [255, 96, 38],
21 | [142, 76, 0],
22 | [33, 0, 127],
23 | [0, 0, 0],
24 | [183, 183, 183],
25 | ],
26 |         dtype=float,  # np.float was removed in NumPy 1.24+
27 | )
28 | / 255
29 | )
30 |
31 |
32 | def plot_embeddings(embeddings, num_classes_in_batch):
33 | num_utter_per_class = embeddings.shape[0] // num_classes_in_batch
34 |
35 | # if necessary get just the first 10 classes
36 | if num_classes_in_batch > 10:
37 | num_classes_in_batch = 10
38 | embeddings = embeddings[: num_classes_in_batch * num_utter_per_class]
39 |
40 | model = umap.UMAP()
41 | projection = model.fit_transform(embeddings)
42 | ground_truth = np.repeat(np.arange(num_classes_in_batch), num_utter_per_class)
43 | colors = [colormap[i] for i in ground_truth]
44 | fig, ax = plt.subplots(figsize=(16, 10))
45 | _ = ax.scatter(projection[:, 0], projection[:, 1], c=colors)
46 | plt.gca().set_aspect("equal", "datalim")
47 | plt.title("UMAP projection")
48 | plt.tight_layout()
49 | plt.savefig("umap")
50 | return fig
51 |
--------------------------------------------------------------------------------
/tests/text_tests/test_punctuation.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from TTS.tts.utils.text.punctuation import _DEF_PUNCS, Punctuation
4 |
5 |
6 | class PunctuationTest(unittest.TestCase):
7 | def setUp(self):
8 | self.punctuation = Punctuation()
9 | self.test_texts = [
10 |             ("This, is my text ... to be stripped !! from text?", "This is my text to be stripped from text"),
11 |             ("This, is my text ... to be stripped !! from text", "This is my text to be stripped from text"),
12 |             ("This, is my text ... to be stripped from text?", "This is my text to be stripped from text"),
13 |             ("This, is my text to be stripped from text", "This is my text to be stripped from text"),
14 | ]
15 |
16 | def test_get_set_puncs(self):
17 | self.punctuation.puncs = "-="
18 | self.assertEqual(self.punctuation.puncs, "-=")
19 |
20 | self.punctuation.puncs = _DEF_PUNCS
21 | self.assertEqual(self.punctuation.puncs, _DEF_PUNCS)
22 |
23 | def test_strip_punc(self):
24 | for text, gt in self.test_texts:
25 | text_striped = self.punctuation.strip(text)
26 | self.assertEqual(text_striped, gt)
27 |
28 | def test_strip_restore(self):
29 | for text, gt in self.test_texts:
30 | text_striped, puncs_map = self.punctuation.strip_to_restore(text)
31 | text_restored = self.punctuation.restore(text_striped, puncs_map)
32 | self.assertEqual(" ".join(text_striped), gt)
33 | self.assertEqual(text_restored[0], text)
34 |
--------------------------------------------------------------------------------
/tests/vocoder_tests/test_melgan_train.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import os
3 | import shutil
4 |
5 | from tests import get_device_id, get_tests_output_path, run_cli
6 | from TTS.vocoder.configs import MelganConfig
7 |
8 | config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json")
9 | output_path = os.path.join(get_tests_output_path(), "train_outputs")
10 |
11 | config = MelganConfig(
12 | batch_size=4,
13 | eval_batch_size=4,
14 | num_loader_workers=0,
15 | num_eval_loader_workers=0,
16 | run_eval=True,
17 | test_delay_epochs=-1,
18 | epochs=1,
19 | seq_len=2048,
20 | eval_split_size=1,
21 | print_step=1,
22 | discriminator_model_params={"base_channels": 16, "max_channels": 64, "downsample_factors": [4, 4, 4]},
23 | print_eval=True,
24 | data_path="tests/data/ljspeech",
25 | output_path=output_path,
26 | )
27 | config.audio.do_trim_silence = True
28 | config.audio.trim_db = 60
29 | config.save_json(config_path)
30 |
31 | # train the model for one epoch
32 | command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} "
33 | run_cli(command_train)
34 |
35 | # Find latest folder
36 | continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime)
37 |
38 | # restore the model and continue training for one more epoch
39 | command_train = (
40 | f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} "
41 | )
42 | run_cli(command_train)
43 | shutil.rmtree(continue_path)
44 |
--------------------------------------------------------------------------------
/tests/vocoder_tests/test_wavernn_train.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import os
3 | import shutil
4 |
5 | from tests import get_device_id, get_tests_output_path, run_cli
6 | from TTS.vocoder.configs import WavernnConfig
7 | from TTS.vocoder.models.wavernn import WavernnArgs
8 |
9 | config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json")
10 | output_path = os.path.join(get_tests_output_path(), "train_outputs")
11 |
12 |
13 | config = WavernnConfig(
14 | model_args=WavernnArgs(),
15 | batch_size=8,
16 | eval_batch_size=8,
17 | num_loader_workers=0,
18 | num_eval_loader_workers=0,
19 | run_eval=True,
20 | test_delay_epochs=-1,
21 | epochs=1,
22 | seq_len=256, # for shorter test time
23 | eval_split_size=1,
24 | print_step=1,
25 | print_eval=True,
26 | data_path="tests/data/ljspeech",
27 | output_path=output_path,
28 | )
29 | config.audio.do_trim_silence = True
30 | config.audio.trim_db = 60
31 | config.save_json(config_path)
32 |
33 | # train the model for one epoch
34 | command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} "
35 | run_cli(command_train)
36 |
37 | # Find latest folder
38 | continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime)
39 |
40 | # restore the model and continue training for one more epoch
41 | command_train = (
42 | f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} "
43 | )
44 | run_cli(command_train)
45 | shutil.rmtree(continue_path)
46 |
--------------------------------------------------------------------------------
/tests/vocoder_tests/test_fullband_melgan_train.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import os
3 | import shutil
4 |
5 | from tests import get_device_id, get_tests_output_path, run_cli
6 | from TTS.vocoder.configs import FullbandMelganConfig
7 |
8 | config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json")
9 | output_path = os.path.join(get_tests_output_path(), "train_outputs")
10 |
11 | config = FullbandMelganConfig(
12 | batch_size=8,
13 | eval_batch_size=8,
14 | num_loader_workers=0,
15 | num_eval_loader_workers=0,
16 | run_eval=True,
17 | test_delay_epochs=-1,
18 | epochs=1,
19 | seq_len=8192,
20 | eval_split_size=1,
21 | print_step=1,
22 | print_eval=True,
23 | data_path="tests/data/ljspeech",
24 | discriminator_model_params={"base_channels": 16, "max_channels": 64, "downsample_factors": [4, 4, 4]},
25 | output_path=output_path,
26 | )
27 | config.audio.do_trim_silence = True
28 | config.audio.trim_db = 60
29 | config.save_json(config_path)
30 |
31 | # train the model for one epoch
32 | command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} "
33 | run_cli(command_train)
34 |
35 | # Find latest folder
36 | continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime)
37 |
38 | # restore the model and continue training for one more epoch
39 | command_train = (
40 | f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} "
41 | )
42 | run_cli(command_train)
43 | shutil.rmtree(continue_path)
44 |
--------------------------------------------------------------------------------
/TTS/tts/utils/text/french/abbreviations.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | # List of (regular expression, replacement) pairs for abbreviations in french:
4 | abbreviations_fr = [
5 | (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
6 | for x in [
7 | ("M", "monsieur"),
8 | ("Mlle", "mademoiselle"),
9 | ("Mlles", "mesdemoiselles"),
10 | ("Mme", "Madame"),
11 | ("Mmes", "Mesdames"),
12 | ("N.B", "nota bene"),
13 | ("M", "monsieur"),
14 | ("p.c.q", "parce que"),
15 | ("Pr", "professeur"),
16 | ("qqch", "quelque chose"),
17 | ("rdv", "rendez-vous"),
18 | ("max", "maximum"),
19 | ("min", "minimum"),
20 | ("no", "numéro"),
21 | ("adr", "adresse"),
22 | ("dr", "docteur"),
23 | ("st", "saint"),
24 |         ("co", "compagnie"),
25 | ("jr", "junior"),
26 | ("sgt", "sergent"),
27 |         ("capt", "capitaine"),
28 | ("col", "colonel"),
29 | ("av", "avenue"),
30 | ("av. J.-C", "avant Jésus-Christ"),
31 | ("apr. J.-C", "après Jésus-Christ"),
32 | ("art", "article"),
33 | ("boul", "boulevard"),
34 | ("c.-à-d", "c’est-à-dire"),
35 | ("etc", "et cetera"),
36 | ("ex", "exemple"),
37 | ("excl", "exclusivement"),
38 | ("boul", "boulevard"),
39 | ]
40 | ] + [
41 | (re.compile("\\b%s" % x[0]), x[1])
42 | for x in [
43 | ("Mlle", "mademoiselle"),
44 | ("Mlles", "mesdemoiselles"),
45 | ("Mme", "Madame"),
46 | ("Mmes", "Mesdames"),
47 | ]
48 | ]
49 |
--------------------------------------------------------------------------------
/.github/workflows/text_tests.yml:
--------------------------------------------------------------------------------
1 | name: text-tests
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 | pull_request:
8 | types: [opened, synchronize, reopened]
9 | jobs:
10 | check_skip:
11 | runs-on: ubuntu-latest
12 | if: "! contains(github.event.head_commit.message, '[ci skip]')"
13 | steps:
14 | - run: echo "${{ github.event.head_commit.message }}"
15 |
16 | test:
17 | runs-on: ubuntu-latest
18 | strategy:
19 | fail-fast: false
20 | matrix:
21 | python-version: [3.7, 3.8, 3.9, "3.10"]
22 | experimental: [false]
23 | steps:
24 | - uses: actions/checkout@v2
25 | - name: Set up Python ${{ matrix.python-version }}
26 | uses: coqui-ai/setup-python@pip-cache-key-py-ver
27 | with:
28 | python-version: ${{ matrix.python-version }}
29 | architecture: x64
30 | cache: 'pip'
31 | cache-dependency-path: 'requirements*'
32 | - name: check OS
33 | run: cat /etc/os-release
34 | - name: Install dependencies
35 | run: |
36 | sudo apt-get update
37 | sudo apt-get install -y --no-install-recommends git make gcc
38 | sudo apt-get install espeak
39 | sudo apt-get install espeak-ng
40 | make system-deps
41 | - name: Install/upgrade Python setup deps
42 | run: python3 -m pip install --upgrade pip setuptools wheel
43 | - name: Install TTS
44 | run: |
45 | python3 -m pip install .[all]
46 | python3 setup.py egg_info
47 | - name: Unit tests
48 | run: make test_text
49 |
--------------------------------------------------------------------------------
/tests/vocoder_tests/test_multiband_melgan_train.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import os
3 | import shutil
4 |
5 | from tests import get_device_id, get_tests_output_path, run_cli
6 | from TTS.vocoder.configs import MultibandMelganConfig
7 |
8 | config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json")
9 | output_path = os.path.join(get_tests_output_path(), "train_outputs")
10 |
11 | config = MultibandMelganConfig(
12 | batch_size=8,
13 | eval_batch_size=8,
14 | num_loader_workers=0,
15 | num_eval_loader_workers=0,
16 | run_eval=True,
17 | test_delay_epochs=-1,
18 | epochs=1,
19 | seq_len=8192,
20 | eval_split_size=1,
21 | print_step=1,
22 | print_eval=True,
23 | steps_to_start_discriminator=1,
24 | data_path="tests/data/ljspeech",
25 | discriminator_model_params={"base_channels": 16, "max_channels": 64, "downsample_factors": [4, 4, 4]},
26 | output_path=output_path,
27 | )
28 | config.audio.do_trim_silence = True
29 | config.audio.trim_db = 60
30 | config.save_json(config_path)
31 |
32 | # train the model for one epoch
33 | command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} "
34 | run_cli(command_train)
35 |
36 | # Find latest folder
37 | continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime)
38 |
39 | # restore the model and continue training for one more epoch
40 | command_train = (
41 | f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} "
42 | )
43 | run_cli(command_train)
44 | shutil.rmtree(continue_path)
45 |
--------------------------------------------------------------------------------
/tests/vocoder_tests/test_vocoder_wavernn.py:
--------------------------------------------------------------------------------
1 | import random
2 |
3 | import numpy as np
4 | import torch
5 |
6 | from TTS.vocoder.configs import WavernnConfig
7 | from TTS.vocoder.models.wavernn import Wavernn, WavernnArgs
8 |
9 |
10 | def test_wavernn():
11 | config = WavernnConfig()
12 | config.model_args = WavernnArgs(
13 | rnn_dims=512,
14 | fc_dims=512,
15 | mode="mold",
16 | mulaw=False,
17 | pad=2,
18 | use_aux_net=True,
19 | use_upsample_net=True,
20 | upsample_factors=[4, 8, 8],
21 | feat_dims=80,
22 | compute_dims=128,
23 | res_out_dims=128,
24 | num_res_blocks=10,
25 | )
26 | config.audio.hop_length = 256
27 | config.audio.sample_rate = 2048
28 |
29 | dummy_x = torch.rand((2, 1280))
30 | dummy_m = torch.rand((2, 80, 9))
31 | y_size = random.randrange(20, 60)
32 | dummy_y = torch.rand((80, y_size))
33 |
34 | # mode: mold
35 | model = Wavernn(config)
36 | output = model(dummy_x, dummy_m)
37 | assert np.all(output.shape == (2, 1280, 30)), output.shape
38 |
39 | # mode: gauss
40 | config.model_args.mode = "gauss"
41 | model = Wavernn(config)
42 | output = model(dummy_x, dummy_m)
43 | assert np.all(output.shape == (2, 1280, 2)), output.shape
44 |
45 | # mode: quantized
46 | config.model_args.mode = 4
47 | model = Wavernn(config)
48 | output = model(dummy_x, dummy_m)
49 | assert np.all(output.shape == (2, 1280, 2**4)), output.shape
50 | output = model.inference(dummy_y, True, 5500, 550)
51 | assert np.all(output.shape == (256 * (y_size - 1),))
52 |
--------------------------------------------------------------------------------
/tests/inputs/test_vocoder_audio_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "audio":{
3 | "num_mels": 80, // size of the mel spec frame.
4 |         "num_freq": 513, // number of stft frequency levels. Size of the linear spectrogram frame.
5 | "sample_rate": 22050, // wav sample-rate. If different than the original data, it is resampled.
6 | "frame_length_ms": null, // stft window length in ms.
7 |         "frame_shift_ms": null, // stft window hop-length in ms.
8 | "hop_length": 256,
9 | "win_length": 1024,
10 |         "preemphasis": 0.97, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no pre-emphasis.
11 | "min_level_db": -100, // normalization range
12 | "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
13 | "power": 1.5, // value to sharpen wav signals after GL algorithm.
14 |         "griffin_lim_iters": 30, // griffin-lim iterations. 30-60 is a good range. The larger the value, the slower the generation.
15 | "signal_norm": true, // normalize the spec values in range [0, 1]
16 | "symmetric_norm": true, // move normalization to range [-1, 1]
17 | "clip_norm": true, // clip normalized values into the range.
18 | "max_norm": 4, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
19 | "mel_fmin": 0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
20 | "mel_fmax": 8000, // maximum freq level for mel-spec. Tune for dataset!!
21 | "do_trim_silence": false
22 | }
23 | }
24 |
25 |
--------------------------------------------------------------------------------
/tests/data/ljspeech/metadata.csv:
--------------------------------------------------------------------------------
1 | LJ001-0001|Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition|Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition
2 | LJ001-0002|in being comparatively modern.|in being comparatively modern.
3 | LJ001-0003|For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process|For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process
4 | LJ001-0004|produced the block books, which were the immediate predecessors of the true printed book,|produced the block books, which were the immediate predecessors of the true printed book,
5 | LJ001-0005|the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.|the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.
6 | LJ001-0006|And it is worth mention in passing that, as an example of fine typography,|And it is worth mention in passing that, as an example of fine typography,
7 | LJ001-0007|the earliest book printed with movable types, the Gutenberg, or "forty-two line Bible" of about 1455,|the earliest book printed with movable types, the Gutenberg, or "forty-two line Bible" of about fourteen fifty-five,
8 | LJ001-0008|has never been surpassed.|has never been surpassed.
9 |
--------------------------------------------------------------------------------
/TTS/bin/find_unique_chars.py:
--------------------------------------------------------------------------------
1 | """Find all the unique characters in a dataset"""
2 | import argparse
3 | from argparse import RawTextHelpFormatter
4 |
5 | from TTS.config import load_config
6 | from TTS.tts.datasets import load_tts_samples
7 |
8 |
9 | def main():
10 | # pylint: disable=bad-option-value
11 | parser = argparse.ArgumentParser(
12 | description="""Find all the unique characters or phonemes in a dataset.\n\n"""
13 | """
14 | Example runs:
15 |
16 | python TTS/bin/find_unique_chars.py --config_path config.json
17 | """,
18 | formatter_class=RawTextHelpFormatter,
19 | )
20 | parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True)
21 | args = parser.parse_args()
22 |
23 | c = load_config(args.config_path)
24 |
25 | # load all datasets
26 | train_items, eval_items = load_tts_samples(
27 | c.datasets, eval_split=True, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size
28 | )
29 |
30 | items = train_items + eval_items
31 |
32 | texts = "".join(item["text"] for item in items)
33 | chars = set(texts)
34 | lower_chars = filter(lambda c: c.islower(), chars)
35 | chars_force_lower = [c.lower() for c in chars]
36 | chars_force_lower = set(chars_force_lower)
37 |
38 | print(f" > Number of unique characters: {len(chars)}")
39 | print(f" > Unique characters: {''.join(sorted(chars))}")
40 | print(f" > Unique lower characters: {''.join(sorted(lower_chars))}")
41 | print(f" > Unique all forced to lower characters: {''.join(sorted(chars_force_lower))}")
42 |
43 |
44 | if __name__ == "__main__":
45 | main()
46 |
--------------------------------------------------------------------------------
/.github/workflows/aux_tests.yml:
--------------------------------------------------------------------------------
1 | name: aux-tests
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 | pull_request:
8 | types: [opened, synchronize, reopened]
9 | jobs:
10 | check_skip:
11 | runs-on: ubuntu-latest
12 | if: "! contains(github.event.head_commit.message, '[ci skip]')"
13 | steps:
14 | - run: echo "${{ github.event.head_commit.message }}"
15 |
16 | test:
17 | runs-on: ubuntu-latest
18 | strategy:
19 | fail-fast: false
20 | matrix:
21 | python-version: [3.7, 3.8, 3.9, "3.10"]
22 | experimental: [false]
23 | steps:
24 | - uses: actions/checkout@v2
25 | - name: Set up Python ${{ matrix.python-version }}
26 | uses: coqui-ai/setup-python@pip-cache-key-py-ver
27 | with:
28 | python-version: ${{ matrix.python-version }}
29 | architecture: x64
30 | cache: 'pip'
31 | cache-dependency-path: 'requirements*'
32 | - name: check OS
33 | run: cat /etc/os-release
34 | - name: Install dependencies
35 | run: |
36 | sudo apt-get update
37 | sudo apt-get install -y git make gcc
38 | make system-deps
39 | - name: Install/upgrade Python setup deps
40 | run: python3 -m pip install --upgrade pip setuptools wheel
41 | - name: Replace scarf urls
42 | run: |
43 | sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
44 | - name: Install TTS
45 | run: |
46 | python3 -m pip install .[all]
47 | python3 setup.py egg_info
48 | - name: Unit tests
49 | run: make test_aux
50 |
--------------------------------------------------------------------------------
/recipes/thorsten_DE/univnet/train_univnet.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from trainer import Trainer, TrainerArgs
4 |
5 | from TTS.utils.audio import AudioProcessor
6 | from TTS.utils.downloaders import download_thorsten_de
7 | from TTS.vocoder.configs import UnivnetConfig
8 | from TTS.vocoder.datasets.preprocess import load_wav_data
9 | from TTS.vocoder.models.gan import GAN
10 |
11 | output_path = os.path.dirname(os.path.abspath(__file__))
12 | config = UnivnetConfig(
13 | batch_size=64,
14 | eval_batch_size=16,
15 | num_loader_workers=4,
16 | num_eval_loader_workers=4,
17 | run_eval=True,
18 | test_delay_epochs=-1,
19 | epochs=1000,
20 | seq_len=8192,
21 | pad_short=2000,
22 | use_noise_augment=True,
23 | eval_split_size=10,
24 | print_step=25,
25 | print_eval=False,
26 | mixed_precision=False,
27 | lr_gen=1e-4,
28 | lr_disc=1e-4,
29 | data_path=os.path.join(output_path, "../thorsten-de/wavs/"),
30 | output_path=output_path,
31 | )
32 |
33 | # download dataset if not already present
34 | if not os.path.exists(config.data_path):
35 | print("Downloading dataset")
36 | download_path = os.path.abspath(os.path.join(os.path.abspath(config.data_path), "../../"))
37 | download_thorsten_de(download_path)
38 |
39 | # init audio processor
40 | ap = AudioProcessor(**config.audio.to_dict())
41 |
42 | # load training samples
43 | eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)
44 |
45 | # init model
46 | model = GAN(config, ap)
47 |
48 | # init the trainer and 🚀
49 | trainer = Trainer(
50 | TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
51 | )
52 | trainer.fit()
53 |
--------------------------------------------------------------------------------
/.github/workflows/data_tests.yml:
--------------------------------------------------------------------------------
1 | name: data-tests
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 | pull_request:
8 | types: [opened, synchronize, reopened]
9 | jobs:
10 | check_skip:
11 | runs-on: ubuntu-latest
12 | if: "! contains(github.event.head_commit.message, '[ci skip]')"
13 | steps:
14 | - run: echo "${{ github.event.head_commit.message }}"
15 |
16 | test:
17 | runs-on: ubuntu-latest
18 | strategy:
19 | fail-fast: false
20 | matrix:
21 | python-version: [3.7, 3.8, 3.9, "3.10"]
22 | experimental: [false]
23 | steps:
24 | - uses: actions/checkout@v2
25 | - name: Set up Python ${{ matrix.python-version }}
26 | uses: coqui-ai/setup-python@pip-cache-key-py-ver
27 | with:
28 | python-version: ${{ matrix.python-version }}
29 | architecture: x64
30 | cache: 'pip'
31 | cache-dependency-path: 'requirements*'
32 | - name: check OS
33 | run: cat /etc/os-release
34 | - name: Install dependencies
35 | run: |
36 | sudo apt-get update
37 | sudo apt-get install -y --no-install-recommends git make gcc
38 | make system-deps
39 | - name: Install/upgrade Python setup deps
40 | run: python3 -m pip install --upgrade pip setuptools wheel
41 | - name: Replace scarf urls
42 | run: |
43 | sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
44 | - name: Install TTS
45 | run: |
46 | python3 -m pip install .[all]
47 | python3 setup.py egg_info
48 | - name: Unit tests
49 | run: make data_tests
50 |
--------------------------------------------------------------------------------
/recipes/thorsten_DE/hifigan/train_hifigan.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from trainer import Trainer, TrainerArgs
4 |
5 | from TTS.utils.audio import AudioProcessor
6 | from TTS.utils.downloaders import download_thorsten_de
7 | from TTS.vocoder.configs import HifiganConfig
8 | from TTS.vocoder.datasets.preprocess import load_wav_data
9 | from TTS.vocoder.models.gan import GAN
10 |
11 | output_path = os.path.dirname(os.path.abspath(__file__))
12 |
13 | config = HifiganConfig(
14 | batch_size=32,
15 | eval_batch_size=16,
16 | num_loader_workers=4,
17 | num_eval_loader_workers=4,
18 | run_eval=True,
19 | test_delay_epochs=5,
20 | epochs=1000,
21 | seq_len=8192,
22 | pad_short=2000,
23 | use_noise_augment=True,
24 | eval_split_size=10,
25 | print_step=25,
26 | print_eval=False,
27 | mixed_precision=False,
28 | lr_gen=1e-4,
29 | lr_disc=1e-4,
30 | data_path=os.path.join(output_path, "../thorsten-de/wavs/"),
31 | output_path=output_path,
32 | )
33 |
34 | # download dataset if not already present
35 | if not os.path.exists(config.data_path):
36 | print("Downloading dataset")
37 | download_path = os.path.abspath(os.path.join(os.path.abspath(config.data_path), "../../"))
38 | download_thorsten_de(download_path)
39 |
40 | # init audio processor
41 | ap = AudioProcessor(**config.audio.to_dict())
42 |
43 | # load training samples
44 | eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)
45 |
46 | # init model
47 | model = GAN(config, ap)
48 |
49 | # init the trainer and 🚀
50 | trainer = Trainer(
51 | TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
52 | )
53 | trainer.fit()
54 |
--------------------------------------------------------------------------------
/TTS/utils/training.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 |
4 |
5 | def check_update(model, grad_clip, ignore_stopnet=False, amp_opt_params=None):
6 | r"""Check model gradient against unexpected jumps and failures"""
7 | skip_flag = False
8 | if ignore_stopnet:
9 | if not amp_opt_params:
10 | grad_norm = torch.nn.utils.clip_grad_norm_(
11 | [param for name, param in model.named_parameters() if "stopnet" not in name], grad_clip
12 | )
13 | else:
14 | grad_norm = torch.nn.utils.clip_grad_norm_(amp_opt_params, grad_clip)
15 | else:
16 | if not amp_opt_params:
17 | grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
18 | else:
19 | grad_norm = torch.nn.utils.clip_grad_norm_(amp_opt_params, grad_clip)
20 |
21 | # compatibility with different torch versions
22 | if isinstance(grad_norm, float):
23 | if np.isinf(grad_norm):
24 | print(" | > Gradient is INF !!")
25 | skip_flag = True
26 | else:
27 | if torch.isinf(grad_norm):
28 | print(" | > Gradient is INF !!")
29 | skip_flag = True
30 | return grad_norm, skip_flag
31 |
32 |
33 | def gradual_training_scheduler(global_step, config):
34 |     """Set up the gradual training schedule w.r.t. the number
35 |     of active GPUs."""
36 | num_gpus = torch.cuda.device_count()
37 | if num_gpus == 0:
38 | num_gpus = 1
39 | new_values = None
40 | # we set the scheduling wrt num_gpus
41 | for values in config.gradual_training:
42 | if global_step * num_gpus >= values[0]:
43 | new_values = values
44 | return new_values[1], new_values[2]
45 |
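A toy call to `gradual_training_scheduler`, assuming the Tacotron-style schedule entries `[first_step, r, batch_size]` used elsewhere in the configs (the numbers below are made up):

```python
# Toy example; schedule entries are assumed to be [first_step, r, batch_size].
from types import SimpleNamespace

from TTS.utils.training import gradual_training_scheduler

config = SimpleNamespace(gradual_training=[[0, 7, 64], [10000, 5, 64], [50000, 3, 32]])
r, batch_size = gradual_training_scheduler(20000, config)
print(r, batch_size)  # 5 64 (on a single-GPU or CPU machine)
```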
--------------------------------------------------------------------------------
/TTS/vocoder/models/melgan_multiscale_discriminator.py:
--------------------------------------------------------------------------------
1 | from torch import nn
2 |
3 | from TTS.vocoder.models.melgan_discriminator import MelganDiscriminator
4 |
5 |
6 | class MelganMultiscaleDiscriminator(nn.Module):
7 | def __init__(
8 | self,
9 | in_channels=1,
10 | out_channels=1,
11 | num_scales=3,
12 | kernel_sizes=(5, 3),
13 | base_channels=16,
14 | max_channels=1024,
15 | downsample_factors=(4, 4, 4),
16 | pooling_kernel_size=4,
17 | pooling_stride=2,
18 | pooling_padding=2,
19 | groups_denominator=4,
20 | ):
21 | super().__init__()
22 |
23 | self.discriminators = nn.ModuleList(
24 | [
25 | MelganDiscriminator(
26 | in_channels=in_channels,
27 | out_channels=out_channels,
28 | kernel_sizes=kernel_sizes,
29 | base_channels=base_channels,
30 | max_channels=max_channels,
31 | downsample_factors=downsample_factors,
32 | groups_denominator=groups_denominator,
33 | )
34 | for _ in range(num_scales)
35 | ]
36 | )
37 |
38 | self.pooling = nn.AvgPool1d(
39 | kernel_size=pooling_kernel_size, stride=pooling_stride, padding=pooling_padding, count_include_pad=False
40 | )
41 |
42 | def forward(self, x):
43 | scores = []
44 | feats = []
45 | for disc in self.discriminators:
46 | score, feat = disc(x)
47 | scores.append(score)
48 | feats.append(feat)
49 | x = self.pooling(x)
50 | return scores, feats
51 |
--------------------------------------------------------------------------------
/.github/workflows/inference_tests.yml:
--------------------------------------------------------------------------------
1 | name: inference_tests
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 | pull_request:
8 | types: [opened, synchronize, reopened]
9 | jobs:
10 | check_skip:
11 | runs-on: ubuntu-latest
12 | if: "! contains(github.event.head_commit.message, '[ci skip]')"
13 | steps:
14 | - run: echo "${{ github.event.head_commit.message }}"
15 |
16 | test:
17 | runs-on: ubuntu-latest
18 | strategy:
19 | fail-fast: false
20 | matrix:
21 | python-version: [3.7, 3.8, 3.9, "3.10"]
22 | experimental: [false]
23 | steps:
24 | - uses: actions/checkout@v2
25 | - name: Set up Python ${{ matrix.python-version }}
26 | uses: coqui-ai/setup-python@pip-cache-key-py-ver
27 | with:
28 | python-version: ${{ matrix.python-version }}
29 | architecture: x64
30 | cache: 'pip'
31 | cache-dependency-path: 'requirements*'
32 | - name: check OS
33 | run: cat /etc/os-release
34 | - name: Install dependencies
35 | run: |
36 | sudo apt-get update
37 | sudo apt-get install -y --no-install-recommends git make gcc
38 | make system-deps
39 | - name: Install/upgrade Python setup deps
40 | run: python3 -m pip install --upgrade pip setuptools wheel
41 | - name: Replace scarf urls
42 | run: |
43 | sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
44 | - name: Install TTS
45 | run: |
46 | python3 -m pip install .[all]
47 | python3 setup.py egg_info
48 | - name: Unit tests
49 | run: make inference_tests
50 |
--------------------------------------------------------------------------------
/docs/source/models/vits.md:
--------------------------------------------------------------------------------
1 | # VITS
2 |
3 | VITS (Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech)
4 | is an end-to-end (encoder and vocoder trained together) TTS model that takes advantage of SOTA DL techniques like GANs, VAEs,
5 | and Normalizing Flows. It does not require external alignment annotations and learns the text-to-audio alignment
6 | using MAS, as explained in the paper. The model architecture is a combination of the GlowTTS encoder and the HiFiGAN vocoder.
7 | It is a feed-forward model with a 67.12x real-time factor on a GPU. A minimal configuration sketch is given at the end of this page.
8 |
9 | 🐸 YourTTS is a multi-speaker and multi-lingual TTS model that can perform voice conversion and zero-shot speaker adaptation.
10 | It can also learn a new language or voice with ~1 minute of audio. This opens the door to training
11 | TTS models in low-resource languages. 🐸 YourTTS uses VITS as the backbone architecture coupled with a speaker encoder model.
12 |
13 | ## Important resources & papers
14 | - 🐸 YourTTS: https://arxiv.org/abs/2112.02418
15 | - VITS: https://arxiv.org/pdf/2106.06103.pdf
16 | - Neural Spline Flows: https://arxiv.org/abs/1906.04032
17 | - Variational Autoencoder: https://arxiv.org/pdf/1312.6114.pdf
18 | - Generative Adversarial Networks: https://arxiv.org/abs/1406.2661
19 | - HiFiGAN: https://arxiv.org/abs/2010.05646
20 | - Normalizing Flows: https://blog.evjang.com/2018/01/nf1.html
21 |
22 | ## VitsConfig
23 | ```{eval-rst}
24 | .. autoclass:: TTS.tts.configs.vits_config.VitsConfig
25 | :members:
26 | ```
27 |
28 | ## VitsArgs
29 | ```{eval-rst}
30 | .. autoclass:: TTS.tts.models.vits.VitsArgs
31 | :members:
32 | ```
33 |
34 | ## Vits Model
35 | ```{eval-rst}
36 | .. autoclass:: TTS.tts.models.vits.Vits
37 | :members:
38 | ```
39 |
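Below is the minimal configuration sketch referenced in the introduction. It only sets fields inherited from the shared training config; dataset definitions and model-specific arguments (`VitsArgs`) are intentionally omitted, so treat it as a starting point rather than a complete recipe:

```python
from TTS.tts.configs.vits_config import VitsConfig

# Minimal sketch; dataset and VitsArgs settings are omitted on purpose.
config = VitsConfig(
    run_name="vits_ljspeech",
    batch_size=32,
    eval_batch_size=16,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    run_eval=True,
    epochs=1000,
    print_step=25,
    mixed_precision=True,
    output_path="output/",
)
```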
--------------------------------------------------------------------------------
/.github/workflows/zoo_tests.yml:
--------------------------------------------------------------------------------
1 | name: zoo-tests
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 | pull_request:
8 | types: [opened, synchronize, reopened]
9 | jobs:
10 | check_skip:
11 | runs-on: ubuntu-latest
12 | if: "! contains(github.event.head_commit.message, '[ci skip]')"
13 | steps:
14 | - run: echo "${{ github.event.head_commit.message }}"
15 |
16 | test:
17 | runs-on: ubuntu-latest
18 | strategy:
19 | fail-fast: false
20 | matrix:
21 | python-version: [3.7, 3.8, 3.9, "3.10"]
22 | experimental: [false]
23 | steps:
24 | - uses: actions/checkout@v2
25 | - name: Set up Python ${{ matrix.python-version }}
26 | uses: coqui-ai/setup-python@pip-cache-key-py-ver
27 | with:
28 | python-version: ${{ matrix.python-version }}
29 | architecture: x64
30 | cache: 'pip'
31 | cache-dependency-path: 'requirements*'
32 | - name: check OS
33 | run: cat /etc/os-release
34 | - name: Install dependencies
35 | run: |
36 | sudo apt-get update
37 | sudo apt-get install -y git make gcc
38 | sudo apt-get install -y espeak espeak-ng
39 | make system-deps
40 | - name: Install/upgrade Python setup deps
41 | run: python3 -m pip install --upgrade pip setuptools wheel
42 | - name: Replace scarf urls
43 | run: |
44 | sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
45 | - name: Install TTS
46 | run: |
47 | python3 -m pip install .[all]
48 | python3 setup.py egg_info
49 | - name: Unit tests
50 | run: make test_zoo
51 |
--------------------------------------------------------------------------------
/recipes/thorsten_DE/multiband_melgan/train_multiband_melgan.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from trainer import Trainer, TrainerArgs
4 |
5 | from TTS.utils.audio import AudioProcessor
6 | from TTS.utils.downloaders import download_thorsten_de
7 | from TTS.vocoder.configs import MultibandMelganConfig
8 | from TTS.vocoder.datasets.preprocess import load_wav_data
9 | from TTS.vocoder.models.gan import GAN
10 |
11 | output_path = os.path.dirname(os.path.abspath(__file__))
12 |
13 | config = MultibandMelganConfig(
14 | batch_size=32,
15 | eval_batch_size=16,
16 | num_loader_workers=4,
17 | num_eval_loader_workers=4,
18 | run_eval=True,
19 | test_delay_epochs=5,
20 | epochs=1000,
21 | seq_len=8192,
22 | pad_short=2000,
23 | use_noise_augment=True,
24 | eval_split_size=10,
25 | print_step=25,
26 | print_eval=False,
27 | mixed_precision=False,
28 | lr_gen=1e-4,
29 | lr_disc=1e-4,
30 | data_path=os.path.join(output_path, "../thorsten-de/wavs/"),
31 | output_path=output_path,
32 | )
33 |
34 | # download dataset if not already present
35 | if not os.path.exists(config.data_path):
36 | print("Downloading dataset")
37 | download_path = os.path.abspath(os.path.join(os.path.abspath(config.data_path), "../../"))
38 | download_thorsten_de(download_path)
39 |
40 | # init audio processor
41 | ap = AudioProcessor(**config.audio.to_dict())
42 |
43 | # load training samples
44 | eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)
45 |
46 | # init model
47 | model = GAN(config, ap)
48 |
49 | # init the trainer and 🚀
50 | trainer = Trainer(
51 | TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
52 | )
53 | trainer.fit()
54 |
--------------------------------------------------------------------------------
/TTS/vocoder/layers/melgan.py:
--------------------------------------------------------------------------------
1 | from torch import nn
2 | from torch.nn.utils import weight_norm
3 |
4 |
5 | class ResidualStack(nn.Module):
6 | def __init__(self, channels, num_res_blocks, kernel_size):
7 | super().__init__()
8 |
9 | assert (kernel_size - 1) % 2 == 0, " [!] kernel_size has to be odd."
10 | base_padding = (kernel_size - 1) // 2
11 |
12 | self.blocks = nn.ModuleList()
13 | for idx in range(num_res_blocks):
14 | layer_kernel_size = kernel_size
15 | layer_dilation = layer_kernel_size**idx
16 | layer_padding = base_padding * layer_dilation
17 | self.blocks += [
18 | nn.Sequential(
19 | nn.LeakyReLU(0.2),
20 | nn.ReflectionPad1d(layer_padding),
21 | weight_norm(
22 | nn.Conv1d(channels, channels, kernel_size=kernel_size, dilation=layer_dilation, bias=True)
23 | ),
24 | nn.LeakyReLU(0.2),
25 | weight_norm(nn.Conv1d(channels, channels, kernel_size=1, bias=True)),
26 | )
27 | ]
28 |
29 | self.shortcuts = nn.ModuleList(
30 | [weight_norm(nn.Conv1d(channels, channels, kernel_size=1, bias=True)) for i in range(num_res_blocks)]
31 | )
32 |
33 | def forward(self, x):
34 | for block, shortcut in zip(self.blocks, self.shortcuts):
35 | x = shortcut(x) + block(x)
36 | return x
37 |
38 | def remove_weight_norm(self):
39 | for block, shortcut in zip(self.blocks, self.shortcuts):
40 | nn.utils.remove_weight_norm(block[2])
41 | nn.utils.remove_weight_norm(block[4])
42 | nn.utils.remove_weight_norm(shortcut)
43 |
--------------------------------------------------------------------------------
/recipes/thorsten_DE/wavegrad/train_wavegrad.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from trainer import Trainer, TrainerArgs
4 |
5 | from TTS.utils.audio import AudioProcessor
6 | from TTS.utils.downloaders import download_thorsten_de
7 | from TTS.vocoder.configs import WavegradConfig
8 | from TTS.vocoder.datasets.preprocess import load_wav_data
9 | from TTS.vocoder.models.wavegrad import Wavegrad
10 |
11 | output_path = os.path.dirname(os.path.abspath(__file__))
12 | config = WavegradConfig(
13 | batch_size=32,
14 | eval_batch_size=16,
15 | num_loader_workers=4,
16 | num_eval_loader_workers=4,
17 | run_eval=True,
18 | test_delay_epochs=-1,
19 | epochs=1000,
20 | seq_len=6144,
21 | pad_short=2000,
22 | use_noise_augment=True,
23 | eval_split_size=50,
24 | print_step=50,
25 | print_eval=True,
26 | mixed_precision=False,
27 | data_path=os.path.join(output_path, "../thorsten-de/wavs/"),
28 | output_path=output_path,
29 | )
30 |
31 | # download dataset if not already present
32 | if not os.path.exists(config.data_path):
33 | print("Downloading dataset")
34 | download_path = os.path.abspath(os.path.join(os.path.abspath(config.data_path), "../../"))
35 | download_thorsten_de(download_path)
36 |
37 | # init audio processor
38 | ap = AudioProcessor(**config.audio.to_dict())
39 |
40 | # load training samples
41 | eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)
42 |
43 | # init model
44 | model = Wavegrad(config)
45 |
46 | # init the trainer and 🚀
47 | trainer = Trainer(
48 | TrainerArgs(),
49 | config,
50 | output_path,
51 | model=model,
52 | train_samples=train_samples,
53 | eval_samples=eval_samples,
54 | training_assets={"audio_processor": ap},
55 | )
56 | trainer.fit()
57 |
--------------------------------------------------------------------------------
/.github/workflows/tts_tests.yml:
--------------------------------------------------------------------------------
1 | name: tts-tests
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 | pull_request:
8 | types: [opened, synchronize, reopened]
9 | jobs:
10 | check_skip:
11 | runs-on: ubuntu-latest
12 | if: "! contains(github.event.head_commit.message, '[ci skip]')"
13 | steps:
14 | - run: echo "${{ github.event.head_commit.message }}"
15 |
16 | test:
17 | runs-on: ubuntu-latest
18 | strategy:
19 | fail-fast: false
20 | matrix:
21 | python-version: [3.7, 3.8, 3.9, "3.10"]
22 | experimental: [false]
23 | steps:
24 | - uses: actions/checkout@v2
25 | - name: Set up Python ${{ matrix.python-version }}
26 | uses: coqui-ai/setup-python@pip-cache-key-py-ver
27 | with:
28 | python-version: ${{ matrix.python-version }}
29 | architecture: x64
30 | cache: 'pip'
31 | cache-dependency-path: 'requirements*'
32 | - name: check OS
33 | run: cat /etc/os-release
34 | - name: Install dependencies
35 | run: |
36 | sudo apt-get update
37 | sudo apt-get install -y --no-install-recommends git make gcc
38 | sudo apt-get install -y espeak
39 | sudo apt-get install -y espeak-ng
40 | make system-deps
41 | - name: Install/upgrade Python setup deps
42 | run: python3 -m pip install --upgrade pip setuptools wheel
43 | - name: Replace scarf urls
44 | run: |
45 | sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
46 | - name: Install TTS
47 | run: |
48 | python3 -m pip install .[all]
49 | python3 setup.py egg_info
50 | - name: Unit tests
51 | run: make test_tts
52 |
--------------------------------------------------------------------------------
/recipes/thorsten_DE/wavernn/train_wavernn.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from trainer import Trainer, TrainerArgs
4 |
5 | from TTS.utils.audio import AudioProcessor
6 | from TTS.utils.downloaders import download_thorsten_de
7 | from TTS.vocoder.configs import WavernnConfig
8 | from TTS.vocoder.datasets.preprocess import load_wav_data
9 | from TTS.vocoder.models.wavernn import Wavernn
10 |
11 | output_path = os.path.dirname(os.path.abspath(__file__))
12 | config = WavernnConfig(
13 | batch_size=64,
14 | eval_batch_size=16,
15 | num_loader_workers=4,
16 | num_eval_loader_workers=4,
17 | run_eval=True,
18 | test_delay_epochs=-1,
19 | epochs=10000,
20 | seq_len=1280,
21 | pad_short=2000,
22 | use_noise_augment=False,
23 | eval_split_size=10,
24 | print_step=25,
25 | print_eval=True,
26 | mixed_precision=False,
27 | lr=1e-4,
28 | grad_clip=4,
29 | data_path=os.path.join(output_path, "../thorsten-de/wavs/"),
30 | output_path=output_path,
31 | )
32 |
33 | # download dataset if not already present
34 | if not os.path.exists(config.data_path):
35 | print("Downloading dataset")
36 | download_path = os.path.abspath(os.path.join(os.path.abspath(config.data_path), "../../"))
37 | download_thorsten_de(download_path)
38 |
39 | # init audio processor
40 | ap = AudioProcessor(**config.audio.to_dict())
41 |
42 | # load training samples
43 | eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)
44 |
45 | # init model
46 | model = Wavernn(config)
47 |
48 | # init the trainer and 🚀
49 | trainer = Trainer(
50 | TrainerArgs(),
51 | config,
52 | output_path,
53 | model=model,
54 | train_samples=train_samples,
55 | eval_samples=eval_samples,
56 | training_assets={"audio_processor": ap},
57 | )
58 | trainer.fit()
59 |
--------------------------------------------------------------------------------
/TTS/vocoder/README.md:
--------------------------------------------------------------------------------
1 | # Mozilla TTS Vocoders (Experimental)
2 |
3 | Here are vocoder model implementations that can be combined with the other TTS models.
4 | 
5 | Currently, the following models are implemented:
6 |
7 | - Melgan
8 | - MultiBand-Melgan
9 | - ParallelWaveGAN
10 | - GAN-TTS (Discriminator Only)
11 |
12 | It is also very easy to adapt new vocoder models, as we provide a flexible and modular (but not too modular) framework.
13 |
14 | ## Training a model
15 |
16 | You can see an example [Colab Notebook]() (coming soon) for training MelGAN with the LJSpeech dataset.
17 | 
18 | In order to train a new model, you need to gather all wav files into a folder and set this folder as `data_path` in your `config.json`.
19 | 
20 | You need to define other relevant parameters in your `config.json` and then start training with the following command.
21 |
22 | ```CUDA_VISIBLE_DEVICES='0' python tts/bin/train_vocoder.py --config_path path/to/config.json```
23 |
24 | Example config files can be found under the `tts/vocoder/configs/` folder.
25 |
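If you prefer, the same parameters can also be generated programmatically with the config dataclasses shipped in `TTS.vocoder.configs` and dumped to JSON. A minimal sketch (field values are only illustrative):

```python
from TTS.vocoder.configs import MultibandMelganConfig

# Write a starter config.json; adjust the values and paths for your dataset.
config = MultibandMelganConfig(
    batch_size=32,
    eval_batch_size=16,
    epochs=1000,
    seq_len=8192,
    data_path="/path/to/wavs/",
    output_path="vocoder_output/",
)
config.save_json("config.json")
```
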
26 | You can continue a previous training run with the following command.
27 |
28 | ```CUDA_VISIBLE_DEVICES='0' python tts/bin/train_vocoder.py --continue_path path/to/your/model/folder```
29 |
30 | You can fine-tune a pre-trained model with the following command.
31 |
32 | ```CUDA_VISIBLE_DEVICES='0' python tts/bin/train_vocoder.py --restore_path path/to/your/model.pth```
33 |
34 | Restoring a model starts a new training run in a different folder. It only restores model weights from the given checkpoint file. However, continuing a training run resumes from the same directory where the previous run left off.
35 |
36 | You can also follow your training runs on Tensorboard as you do with our TTS models.
37 |
38 | ## Acknowledgement
39 | Thanks to @kan-bayashi for his [repository](https://github.com/kan-bayashi/ParallelWaveGAN) being the start point of our work.
40 |
--------------------------------------------------------------------------------
/TTS/vocoder/layers/pqmf.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | import torch.nn.functional as F
4 | from scipy import signal as sig
5 |
6 |
7 | # adapted from
8 | # https://github.com/kan-bayashi/ParallelWaveGAN/tree/master/parallel_wavegan
9 | class PQMF(torch.nn.Module):
10 | def __init__(self, N=4, taps=62, cutoff=0.15, beta=9.0):
11 | super().__init__()
12 |
13 | self.N = N
14 | self.taps = taps
15 | self.cutoff = cutoff
16 | self.beta = beta
17 |
18 | QMF = sig.firwin(taps + 1, cutoff, window=("kaiser", beta))
19 | H = np.zeros((N, len(QMF)))
20 | G = np.zeros((N, len(QMF)))
21 | for k in range(N):
22 | constant_factor = (
23 | (2 * k + 1) * (np.pi / (2 * N)) * (np.arange(taps + 1) - ((taps - 1) / 2))
24 | ) # TODO: (taps - 1) -> taps
25 | phase = (-1) ** k * np.pi / 4
26 | H[k] = 2 * QMF * np.cos(constant_factor + phase)
27 |
28 | G[k] = 2 * QMF * np.cos(constant_factor - phase)
29 |
30 | H = torch.from_numpy(H[:, None, :]).float()
31 | G = torch.from_numpy(G[None, :, :]).float()
32 |
33 | self.register_buffer("H", H)
34 | self.register_buffer("G", G)
35 |
36 | updown_filter = torch.zeros((N, N, N)).float()
37 | for k in range(N):
38 | updown_filter[k, k, 0] = 1.0
39 | self.register_buffer("updown_filter", updown_filter)
40 | self.N = N
41 |
42 | self.pad_fn = torch.nn.ConstantPad1d(taps // 2, 0.0)
43 |
44 | def forward(self, x):
45 | return self.analysis(x)
46 |
47 | def analysis(self, x):
48 | return F.conv1d(x, self.H, padding=self.taps // 2, stride=self.N)
49 |
50 | def synthesis(self, x):
51 | x = F.conv_transpose1d(x, self.updown_filter * self.N, stride=self.N)
52 | x = F.conv1d(x, self.G, padding=self.taps // 2)
53 | return x
54 |
--------------------------------------------------------------------------------
/tests/inputs/test_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "audio":{
3 | "audio_processor": "audio",
4 | "num_mels": 80,
5 | "fft_size": 1024,
6 | "sample_rate": 22050,
7 | "frame_length_ms": null,
8 | "frame_shift_ms": null,
9 | "hop_length": 256,
10 | "win_length": 1024,
11 | "preemphasis": 0.97,
12 | "min_level_db": -100,
13 | "ref_level_db": 20,
14 | "power": 1.5,
15 | "griffin_lim_iters": 30,
16 | "signal_norm": true,
17 | "symmetric_norm": true,
18 | "clip_norm": true,
19 | "max_norm": 4,
20 | "mel_fmin": 0,
21 | "mel_fmax": 8000,
22 | "do_trim_silence": false,
23 | "spec_gain": 20
24 | },
25 |
26 | "characters":{
27 | "pad": "_",
28 | "eos": "~",
29 | "bos": "^",
30 | "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ",
31 | "punctuations":"!'(),-.:;? ",
32 | "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫʲ"
33 | },
34 |
35 | "hidden_size": 128,
36 | "embedding_size": 256,
37 | "text_cleaner": "english_cleaners",
38 |
39 | "epochs": 2000,
40 | "lr": 0.003,
41 | "lr_patience": 5,
42 | "lr_decay": 0.5,
43 | "batch_size": 2,
44 | "r": 5,
45 | "mk": 1.0,
46 | "num_loader_workers": 0,
47 | "memory_size": 5,
48 |
49 | "save_step": 200,
50 | "data_path": "tests/data/ljspeech/",
51 | "output_path": "result",
52 | "min_seq_len": 0,
53 | "max_seq_len": 300,
54 | "log_dir": "tests/outputs/",
55 |
56 |
57 | "use_speaker_embedding": false,
58 | "use_gst": true,
59 | "gst": {
60 | "gst_style_input": null,
61 |
62 |
63 |
64 | "gst_use_speaker_embedding": true,
65 | "gst_embedding_dim": 512,
66 | "gst_num_heads": 4,
67 | "gst_num_style_tokens": 10
68 | }
69 | }
70 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from TTS.config import BaseDatasetConfig
4 | from TTS.utils.generic_utils import get_cuda
5 |
6 |
7 | def get_device_id():
8 | use_cuda, _ = get_cuda()
9 | if use_cuda:
10 | if "CUDA_VISIBLE_DEVICES" in os.environ and os.environ["CUDA_VISIBLE_DEVICES"] != "":
11 | GPU_ID = os.environ["CUDA_VISIBLE_DEVICES"].split(",")[0]
12 | else:
13 | GPU_ID = "0"
14 | else:
15 | GPU_ID = ""
16 | return GPU_ID
17 |
18 |
19 | def get_tests_path():
20 | """Returns the path to the test directory."""
21 | return os.path.dirname(os.path.realpath(__file__))
22 |
23 |
24 | def get_tests_input_path():
25 | """Returns the path to the test data directory."""
26 | return os.path.join(get_tests_path(), "inputs")
27 |
28 |
29 | def get_tests_data_path():
30 | """Returns the path to the test data directory."""
31 | return os.path.join(get_tests_path(), "data")
32 |
33 |
34 | def get_tests_output_path():
35 | """Returns the path to the directory for test outputs."""
36 | return os.path.join(get_tests_path(), "outputs")
37 |
38 |
39 | def run_cli(command):
40 | exit_status = os.system(command)
41 | assert exit_status == 0, f" [!] command `{command}` failed."
42 |
43 |
44 | def get_test_data_config():
45 | return BaseDatasetConfig(name="ljspeech", path="tests/data/ljspeech/", meta_file_train="metadata.csv")
46 |
47 |
48 | def assertHasAttr(test_obj, obj, intendedAttr):
49 | # from https://stackoverflow.com/questions/48078636/pythons-unittest-lacks-an-asserthasattr-method-what-should-i-use-instead
50 | testBool = hasattr(obj, intendedAttr)
51 | test_obj.assertTrue(testBool, msg=f"obj lacking an attribute. obj: {obj}, intendedAttr: {intendedAttr}")
52 |
53 |
54 | def assertHasNotAttr(test_obj, obj, intendedAttr):
55 | testBool = hasattr(obj, intendedAttr)
56 | test_obj.assertFalse(testBool, msg=f"obj should not have an attribute. obj: {obj}, intendedAttr: {intendedAttr}")
57 |
--------------------------------------------------------------------------------
/TTS/tts/utils/text/phonemizers/__init__.py:
--------------------------------------------------------------------------------
1 | from TTS.tts.utils.text.phonemizers.base import BasePhonemizer
2 | from TTS.tts.utils.text.phonemizers.espeak_wrapper import ESpeak
3 | from TTS.tts.utils.text.phonemizers.gruut_wrapper import Gruut
4 | from TTS.tts.utils.text.phonemizers.ja_jp_phonemizer import JA_JP_Phonemizer
5 | from TTS.tts.utils.text.phonemizers.zh_cn_phonemizer import ZH_CN_Phonemizer
6 |
7 | PHONEMIZERS = {b.name(): b for b in (ESpeak, Gruut, JA_JP_Phonemizer)}
8 |
9 |
10 | ESPEAK_LANGS = list(ESpeak.supported_languages().keys())
11 | GRUUT_LANGS = list(Gruut.supported_languages())
12 |
13 |
14 | # Dict setting default phonemizers for each language
15 | # Add Gruut languages
16 | _ = [Gruut.name()] * len(GRUUT_LANGS)
17 | DEF_LANG_TO_PHONEMIZER = dict(list(zip(GRUUT_LANGS, _)))
18 |
19 |
20 | # Add ESpeak languages and override any existing ones
21 | _ = [ESpeak.name()] * len(ESPEAK_LANGS)
22 | _new_dict = dict(list(zip(list(ESPEAK_LANGS), _)))
23 | DEF_LANG_TO_PHONEMIZER.update(_new_dict)
24 |
25 | # Force default for some languages
26 | DEF_LANG_TO_PHONEMIZER["en"] = DEF_LANG_TO_PHONEMIZER["en-us"]
27 | DEF_LANG_TO_PHONEMIZER["ja-jp"] = JA_JP_Phonemizer.name()
28 | DEF_LANG_TO_PHONEMIZER["zh-cn"] = ZH_CN_Phonemizer.name()
29 |
30 |
31 | def get_phonemizer_by_name(name: str, **kwargs) -> BasePhonemizer:
32 | """Initiate a phonemizer by name
33 |
34 | Args:
35 | name (str):
36 | Name of the phonemizer that should match `phonemizer.name()`.
37 |
38 | kwargs (dict):
39 | Extra keyword arguments that should be passed to the phonemizer.
40 | """
41 | if name == "espeak":
42 | return ESpeak(**kwargs)
43 | if name == "gruut":
44 | return Gruut(**kwargs)
45 | if name == "zh_cn_phonemizer":
46 | return ZH_CN_Phonemizer(**kwargs)
47 | if name == "ja_jp_phonemizer":
48 | return JA_JP_Phonemizer(**kwargs)
49 | raise ValueError(f"Phonemizer {name} not found")
50 |
51 |
52 | if __name__ == "__main__":
53 | print(DEF_LANG_TO_PHONEMIZER)
54 |
--------------------------------------------------------------------------------
/hubconf.py:
--------------------------------------------------------------------------------
1 | dependencies = [
2 | 'torch', 'gdown', 'pysbd', 'gruut', 'anyascii', 'pypinyin', 'coqpit', 'mecab-python3', 'unidic-lite'
3 | ]
4 | import torch
5 |
6 | from TTS.utils.manage import ModelManager
7 | from TTS.utils.synthesizer import Synthesizer
8 |
9 |
10 | def tts(model_name='tts_models/en/ljspeech/tacotron2-DCA',
11 | vocoder_name=None,
12 | use_cuda=False):
13 | """TTS entry point for PyTorch Hub that provides a Synthesizer object to synthesize speech from a give text.
14 |
15 | Example:
16 | >>> synthesizer = torch.hub.load('coqui-ai/TTS', 'tts', source='github')
17 | >>> wavs = synthesizer.tts("This is a test! This is also a test!!")
18 | wavs is a list of audio samples of the synthesized speech.
19 |
20 | Args:
21 | model_name (str, optional): One of the model names from .models.json. Defaults to 'tts_models/en/ljspeech/tacotron2-DCA'.
22 | vocoder_name (str, optional): One of the model names from .models.json. Defaults to 'vocoder_models/en/ljspeech/multiband-melgan'.
23 | use_cuda (bool, optional): Run inference on GPU if True. Defaults to False.
24 |
25 | Returns:
26 | TTS.utils.synthesizer.Synthesizer: Synthesizer object wrapping both vocoder and tts models.
27 | """
28 | manager = ModelManager()
29 |
30 | model_path, config_path, model_item = manager.download_model(model_name)
31 | vocoder_name = model_item[
32 | 'default_vocoder'] if vocoder_name is None else vocoder_name
33 | vocoder_path, vocoder_config_path, _ = manager.download_model(vocoder_name)
34 |
35 | # create synthesizer
36 | synt = Synthesizer(tts_checkpoint=model_path,
37 | tts_config_path=config_path,
38 | vocoder_checkpoint=vocoder_path,
39 | vocoder_config=vocoder_config_path,
40 | use_cuda=use_cuda)
41 | return synt
42 |
43 |
44 | if __name__ == '__main__':
45 | synthesizer = torch.hub.load('coqui-ai/TTS:dev', 'tts', source='github')
46 | synthesizer.tts("This is a test!")
47 |
--------------------------------------------------------------------------------
/TTS/model.py:
--------------------------------------------------------------------------------
1 | from abc import abstractmethod
2 | from typing import Dict
3 |
4 | import torch
5 | from coqpit import Coqpit
6 | from trainer import TrainerModel
7 |
8 | # pylint: skip-file
9 |
10 |
11 | class BaseTrainerModel(TrainerModel):
12 | """BaseTrainerModel model expanding TrainerModel with required functions by 🐸TTS.
13 |
14 | Every new 🐸TTS model must inherit it.
15 | """
16 |
17 | @staticmethod
18 | @abstractmethod
19 | def init_from_config(config: Coqpit):
20 | """Init the model and all its attributes from the given config.
21 |
22 | Override this depending on your model.
23 | """
24 | ...
25 |
26 | @abstractmethod
27 | def inference(self, input: torch.Tensor, aux_input={}) -> Dict:
28 | """Forward pass for inference.
29 |
30 | It must return a dictionary with the main model output and all the auxiliary outputs. The key ```model_outputs```
31 | is considered to be the main output and you can add any other auxiliary outputs as you want.
32 |
33 | We don't use `**kwargs` since it is problematic with the TorchScript API.
34 |
35 | Args:
36 | input (torch.Tensor): [description]
37 | aux_input (Dict): Auxiliary inputs like speaker embeddings, durations etc.
38 |
39 | Returns:
40 | Dict: [description]
41 | """
42 | outputs_dict = {"model_outputs": None}
43 | ...
44 | return outputs_dict
45 |
46 | @abstractmethod
47 | def load_checkpoint(self, config: Coqpit, checkpoint_path: str, eval: bool = False, strict: bool = True) -> None:
48 | """Load a model checkpoint gile and get ready for training or inference.
49 |
50 | Args:
51 | config (Coqpit): Model configuration.
52 | checkpoint_path (str): Path to the model checkpoint file.
53 | eval (bool, optional): If true, init model for inference else for training. Defaults to False.
54 | strict (bool, optional): Match all checkpoint keys to model's keys. Defaults to True.
55 | """
56 | ...
57 |
--------------------------------------------------------------------------------
/TTS/tts/utils/text/phonemizers/zh_cn_phonemizer.py:
--------------------------------------------------------------------------------
1 | from typing import Dict
2 |
3 | from TTS.tts.utils.text.chinese_mandarin.phonemizer import chinese_text_to_phonemes
4 | from TTS.tts.utils.text.phonemizers.base import BasePhonemizer
5 |
6 | _DEF_ZH_PUNCS = "、.,[]()?!〽~『』「」【】"
7 |
8 |
9 | class ZH_CN_Phonemizer(BasePhonemizer):
10 | """🐸TTS Zh-Cn phonemizer using functions in `TTS.tts.utils.text.chinese_mandarin.phonemizer`
11 |
12 | Args:
13 | punctuations (str):
14 | Set of characters to be treated as punctuation. Defaults to `_DEF_ZH_PUNCS`.
15 |
16 | keep_puncs (bool):
17 | If True, keep the punctuations after phonemization. Defaults to False.
18 |
19 | Example ::
20 |
21 | "这是,样本中文。" -> `d|ʒ|ø|4| |ʂ|ʏ|4| |,| |i|ɑ|ŋ|4|b|œ|n|3| |d|ʒ|o|ŋ|1|w|œ|n|2| |。`
22 |
23 | TODO: someone with Mandarin knowledge should check this implementation
24 | """
25 |
26 | language = "zh-cn"
27 |
28 | def __init__(self, punctuations=_DEF_ZH_PUNCS, keep_puncs=False, **kwargs): # pylint: disable=unused-argument
29 | super().__init__(self.language, punctuations=punctuations, keep_puncs=keep_puncs)
30 |
31 | @staticmethod
32 | def name():
33 | return "zh_cn_phonemizer"
34 |
35 | @staticmethod
36 | def phonemize_zh_cn(text: str, separator: str = "|") -> str:
37 | ph = chinese_text_to_phonemes(text, separator)
38 | return ph
39 |
40 | def _phonemize(self, text, separator):
41 | return self.phonemize_zh_cn(text, separator)
42 |
43 | @staticmethod
44 | def supported_languages() -> Dict:
45 | return {"zh-cn": "Japanese (Japan)"}
46 |
47 | def version(self) -> str:
48 | return "0.0.1"
49 |
50 | def is_available(self) -> bool:
51 | return True
52 |
53 |
54 | # if __name__ == "__main__":
55 | # text = "这是,样本中文。"
56 | # e = ZH_CN_Phonemizer()
57 | # print(e.supported_languages())
58 | # print(e.version())
59 | # print(e.language)
60 | # print(e.name())
61 | # print(e.is_available())
62 | # print("`" + e.phonemize(text) + "`")
63 |
--------------------------------------------------------------------------------
/TTS/encoder/configs/base_encoder_config.py:
--------------------------------------------------------------------------------
1 | from dataclasses import asdict, dataclass, field
2 | from typing import Dict, List
3 |
4 | from coqpit import MISSING
5 |
6 | from TTS.config.shared_configs import BaseAudioConfig, BaseDatasetConfig, BaseTrainingConfig
7 |
8 |
9 | @dataclass
10 | class BaseEncoderConfig(BaseTrainingConfig):
11 | """Defines parameters for a Generic Encoder model."""
12 |
13 | model: str = None
14 | audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
15 | datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()])
16 | # model params
17 | model_params: Dict = field(
18 | default_factory=lambda: {
19 | "model_name": "lstm",
20 | "input_dim": 80,
21 | "proj_dim": 256,
22 | "lstm_dim": 768,
23 | "num_lstm_layers": 3,
24 | "use_lstm_with_projection": True,
25 | }
26 | )
27 |
28 | audio_augmentation: Dict = field(default_factory=lambda: {})
29 |
30 | # training params
31 | epochs: int = 10000
32 | loss: str = "angleproto"
33 | grad_clip: float = 3.0
34 | lr: float = 0.0001
35 | optimizer: str = "radam"
36 | optimizer_params: Dict = field(default_factory=lambda: {"betas": [0.9, 0.999], "weight_decay": 0})
37 | lr_decay: bool = False
38 | warmup_steps: int = 4000
39 |
40 | # logging params
41 | tb_model_param_stats: bool = False
42 | steps_plot_stats: int = 10
43 | save_step: int = 1000
44 | print_step: int = 20
45 | run_eval: bool = False
46 |
47 | # data loader
48 | num_classes_in_batch: int = MISSING
49 | num_utter_per_class: int = MISSING
50 | eval_num_classes_in_batch: int = None
51 | eval_num_utter_per_class: int = None
52 |
53 | num_loader_workers: int = MISSING
54 | voice_len: float = 1.6
55 |
56 | def check_values(self):
57 | super().check_values()
58 | c = asdict(self)
59 | assert (
60 | c["model_params"]["input_dim"] == self.audio.num_mels
61 | ), " [!] model input dimendion must be equal to melspectrogram dimension."
62 |
--------------------------------------------------------------------------------
/docs/source/models/forward_tts.md:
--------------------------------------------------------------------------------
1 | # Forward TTS model(s)
2 |
3 | A general feed-forward TTS model implementation that can be configured into different architectures by setting different
4 | encoder and decoder networks. It can be trained with either pre-computed durations (from a pre-trained Tacotron) or
5 | an alignment network that learns the text-to-audio alignment from the input data.
6 |
7 | Currently we provide the following pre-configured architectures:
8 |
9 | - **FastSpeech:**
10 |
11 | It's a feed-forward TTS model that uses Feed-Forward Transformer (FFT) modules as the encoder and decoder.
12 |
13 | - **FastPitch:**
14 |
15 | It uses the same FastSpeech architecture but is conditioned on fundamental frequency (f0) contours, with the
16 | promise of more expressive speech.
17 |
18 | - **SpeedySpeech:**
19 |
20 | It uses Residual Convolution layers instead of Transformers, which leads to a more compute-friendly model.
21 |
22 | - **FastSpeech2 (TODO):**
23 |
24 | Similar to FastPitch, but it also uses spectral energy values as an additional input.
25 |
26 | ## Important resources & papers
27 | - FastPitch: https://arxiv.org/abs/2006.06873
28 | - SpeedySpeech: https://arxiv.org/abs/2008.03802
29 | - FastSpeech: https://arxiv.org/pdf/1905.09263
30 | - FastSpeech2: https://arxiv.org/abs/2006.04558
31 | - Aligner Network: https://arxiv.org/abs/2108.10447
32 | - What is Pitch: https://www.britannica.com/topic/pitch-speech
33 |
34 |
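## Example configuration

A minimal sketch of creating a `FastPitchConfig` and saving it for training with `TTS/bin/train_tts.py`, following the pattern used elsewhere in this repository. The values are illustrative; model-specific fields (e.g. pitch and duration predictor settings) are documented in the config references below.

```python
from TTS.tts.configs.fast_pitch_config import FastPitchConfig

# An illustrative starter configuration; adjust paths and sizes for your dataset.
config = FastPitchConfig(
    batch_size=32,
    eval_batch_size=16,
    run_eval=True,
    epochs=1000,
    text_cleaner="english_cleaners",
    use_phonemes=True,
    phoneme_language="en-us",
    phoneme_cache_path="phoneme_cache/",
    print_step=50,
    output_path="fast_pitch_output/",
)
config.save_json("fast_pitch_config.json")

# Training can then be launched with:
#   CUDA_VISIBLE_DEVICES=0 python TTS/bin/train_tts.py --config_path fast_pitch_config.json
```
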
35 | ## ForwardTTSArgs
36 | ```{eval-rst}
37 | .. autoclass:: TTS.tts.models.forward_tts.ForwardTTSArgs
38 | :members:
39 | ```
40 |
41 | ## ForwardTTS Model
42 | ```{eval-rst}
43 | .. autoclass:: TTS.tts.models.forward_tts.ForwardTTS
44 | :members:
45 | ```
46 |
47 | ## FastPitchConfig
48 | ```{eval-rst}
49 | .. autoclass:: TTS.tts.configs.fast_pitch_config.FastPitchConfig
50 | :members:
51 | ```
52 |
53 | ## SpeedySpeechConfig
54 | ```{eval-rst}
55 | .. autoclass:: TTS.tts.configs.speedy_speech_config.SpeedySpeechConfig
56 | :members:
57 | ```
58 |
59 | ## FastSpeechConfig
60 | ```{eval-rst}
61 | .. autoclass:: TTS.tts.configs.fast_speech_config.FastSpeechConfig
62 | :members:
63 | ```
64 |
65 |
66 |
--------------------------------------------------------------------------------
/TTS/vocoder/layers/hifigan.py:
--------------------------------------------------------------------------------
1 | from torch import nn
2 |
3 |
4 | # pylint: disable=dangerous-default-value
5 | class ResStack(nn.Module):
6 | def __init__(self, kernel, channel, padding, dilations=[1, 3, 5]):
7 | super().__init__()
8 | resstack = []
9 | for dilation in dilations:
10 | resstack += [
11 | nn.LeakyReLU(0.2),
12 | nn.ReflectionPad1d(dilation),
13 | nn.utils.weight_norm(nn.Conv1d(channel, channel, kernel_size=kernel, dilation=dilation)),
14 | nn.LeakyReLU(0.2),
15 | nn.ReflectionPad1d(padding),
16 | nn.utils.weight_norm(nn.Conv1d(channel, channel, kernel_size=1)),
17 | ]
18 | self.resstack = nn.Sequential(*resstack)
19 |
20 | self.shortcut = nn.utils.weight_norm(nn.Conv1d(channel, channel, kernel_size=1))
21 |
22 | def forward(self, x):
23 | x1 = self.shortcut(x)
24 | x2 = self.resstack(x)
25 | return x1 + x2
26 |
27 | def remove_weight_norm(self):
28 | nn.utils.remove_weight_norm(self.shortcut)
29 | nn.utils.remove_weight_norm(self.resstack[2])
30 | nn.utils.remove_weight_norm(self.resstack[5])
31 | nn.utils.remove_weight_norm(self.resstack[8])
32 | nn.utils.remove_weight_norm(self.resstack[11])
33 | nn.utils.remove_weight_norm(self.resstack[14])
34 | nn.utils.remove_weight_norm(self.resstack[17])
35 |
36 |
37 | class MRF(nn.Module):
38 | def __init__(self, kernels, channel, dilations=[1, 3, 5]): # # pylint: disable=dangerous-default-value
39 | super().__init__()
40 | self.resblock1 = ResStack(kernels[0], channel, 0, dilations)
41 | self.resblock2 = ResStack(kernels[1], channel, 6, dilations)
42 | self.resblock3 = ResStack(kernels[2], channel, 12, dilations)
43 |
44 | def forward(self, x):
45 | x1 = self.resblock1(x)
46 | x2 = self.resblock2(x)
47 | x3 = self.resblock3(x)
48 | return x1 + x2 + x3
49 |
50 | def remove_weight_norm(self):
51 | self.resblock1.remove_weight_norm()
52 | self.resblock2.remove_weight_norm()
53 | self.resblock3.remove_weight_norm()
54 |
--------------------------------------------------------------------------------
/TTS/tts/utils/text/phonemizers/multi_phonemizer.py:
--------------------------------------------------------------------------------
1 | from typing import Dict, List
2 |
3 | from TTS.tts.utils.text.phonemizers import DEF_LANG_TO_PHONEMIZER, get_phonemizer_by_name
4 |
5 |
6 | class MultiPhonemizer:
7 | """🐸TTS multi-phonemizer that operates phonemizers for multiple langugages
8 |
9 | Args:
10 | custom_lang_to_phonemizer (Dict):
11 | Custom phonemizer mapping if you want to change the defaults. In the format of
12 | `{"lang_code", "phonemizer_name"}`. When it is None, `DEF_LANG_TO_PHONEMIZER` is used. Defaults to `{}`.
13 |
14 | TODO: find a way to pass custom kwargs to the phonemizers
15 | """
16 |
17 | lang_to_phonemizer_name = DEF_LANG_TO_PHONEMIZER
18 | language = "multi-lingual"
19 |
20 | def __init__(self, custom_lang_to_phonemizer: Dict = {}) -> None: # pylint: disable=dangerous-default-value
21 | self.lang_to_phonemizer_name.update(custom_lang_to_phonemizer)
22 | self.lang_to_phonemizer = self.init_phonemizers(self.lang_to_phonemizer_name)
23 |
24 | @staticmethod
25 | def init_phonemizers(lang_to_phonemizer_name: Dict) -> Dict:
26 | lang_to_phonemizer = {}
27 | for k, v in lang_to_phonemizer_name.items():
28 | phonemizer = get_phonemizer_by_name(v, language=k)
29 | lang_to_phonemizer[k] = phonemizer
30 | return lang_to_phonemizer
31 |
32 | @staticmethod
33 | def name():
34 | return "multi-phonemizer"
35 |
36 | def phonemize(self, text, language, separator="|"):
37 | return self.lang_to_phonemizer[language].phonemize(text, separator)
38 |
39 | def supported_languages(self) -> List:
40 | return list(self.lang_to_phonemizer_name.keys())
41 |
42 |
43 | # if __name__ == "__main__":
44 | # texts = {
45 | # "tr": "Merhaba, bu Türkçe bit örnek!",
46 | # "en-us": "Hello, this is English example!",
47 | # "de": "Hallo, das ist ein Deutches Beipiel!",
48 | # "zh-cn": "这是中国的例子",
49 | # }
50 | # phonemes = {}
51 | # ph = MultiPhonemizer()
52 | # for lang, text in texts.items():
53 | # phoneme = ph.phonemize(text, lang)
54 | # phonemes[lang] = phoneme
55 | # print(phonemes)
56 |
--------------------------------------------------------------------------------
/TTS/vocoder/models/base_vocoder.py:
--------------------------------------------------------------------------------
1 | from coqpit import Coqpit
2 |
3 | from TTS.model import BaseTrainerModel
4 |
5 | # pylint: skip-file
6 |
7 |
8 | class BaseVocoder(BaseTrainerModel):
9 | """Base `vocoder` class. Every new `vocoder` model must inherit this.
10 |
11 | It defines `vocoder` specific functions on top of `Model`.
12 |
13 | Notes on input/output tensor shapes:
14 | Any input or output tensor of the model must be shaped as
15 |
16 | - 3D tensors `batch x time x channels`
17 | - 2D tensors `batch x channels`
18 | - 1D tensors `batch x 1`
19 | """
20 |
21 | def __init__(self, config):
22 | super().__init__()
23 | self._set_model_args(config)
24 |
25 | def _set_model_args(self, config: Coqpit):
26 | """Setup model args based on the config type.
27 |
28 | If the config is for training with a name like "*Config", then the model args are embedded in the
29 | config.model_args
30 |
31 | If the config is for the model with a name like "*Args", then we assign it directly.
32 | """
33 | # don't use isinstance() to avoid recursive imports
34 | if "Config" in config.__class__.__name__:
35 | if "characters" in config:
36 | _, self.config, num_chars = self.get_characters(config)
37 | self.config.num_chars = num_chars
38 | if hasattr(self.config, "model_args"):
39 | config.model_args.num_chars = num_chars
40 | if "model_args" in config:
41 | self.args = self.config.model_args
42 | # This is for backward compatibility
43 | if "model_params" in config:
44 | self.args = self.config.model_params
45 | else:
46 | self.config = config
47 | if "model_args" in config:
48 | self.args = self.config.model_args
49 | # This is for backward compatibility
50 | if "model_params" in config:
51 | self.args = self.config.model_params
52 | else:
53 | raise ValueError("config must be either a *Config or *Args")
54 |
--------------------------------------------------------------------------------
/tests/tts_tests/test_vits_d-vectors_train.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import os
3 | import shutil
4 |
5 | from tests import get_device_id, get_tests_output_path, run_cli
6 | from TTS.tts.configs.vits_config import VitsConfig
7 |
8 | config_path = os.path.join(get_tests_output_path(), "test_model_config.json")
9 | output_path = os.path.join(get_tests_output_path(), "train_outputs")
10 |
11 |
12 | config = VitsConfig(
13 | batch_size=2,
14 | eval_batch_size=2,
15 | num_loader_workers=0,
16 | num_eval_loader_workers=0,
17 | text_cleaner="english_cleaners",
18 | use_phonemes=True,
19 | phoneme_language="en-us",
20 | phoneme_cache_path="tests/data/ljspeech/phoneme_cache/",
21 | run_eval=True,
22 | test_delay_epochs=-1,
23 | epochs=1,
24 | print_step=1,
25 | print_eval=True,
26 | test_sentences=[
27 | ["Be a voice, not an echo.", "ljspeech-0"],
28 | ],
29 | )
30 | # set audio config
31 | config.audio.do_trim_silence = True
32 | config.audio.trim_db = 60
33 |
34 | # activate multispeaker d-vec mode
35 | config.model_args.use_d_vector_file = True
36 | config.model_args.d_vector_file = "tests/data/ljspeech/speakers.json"
37 | config.model_args.d_vector_dim = 256
38 |
39 |
40 | config.save_json(config_path)
41 |
42 | # train the model for one epoch
43 | command_train = (
44 | f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} "
45 | f"--coqpit.output_path {output_path} "
46 | "--coqpit.datasets.0.name ljspeech "
47 | "--coqpit.datasets.0.meta_file_train metadata.csv "
48 | "--coqpit.datasets.0.meta_file_val metadata.csv "
49 | "--coqpit.datasets.0.path tests/data/ljspeech "
50 | "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt "
51 | "--coqpit.test_delay_epochs 0"
52 | )
53 | run_cli(command_train)
54 |
55 | # Find latest folder
56 | continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime)
57 |
58 | # restore the model and continue training for one more epoch
59 | command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} "
60 | run_cli(command_train)
61 | shutil.rmtree(continue_path)
62 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | .DEFAULT_GOAL := help
2 | .PHONY: test system-deps dev-deps deps style lint install help docs
3 |
4 | help:
5 | @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
6 |
7 | target_dirs := tests TTS notebooks recipes
8 |
9 | test_all: ## run tests and don't stop on an error.
10 | nose2 --with-coverage --coverage TTS tests
11 | ./run_bash_tests.sh
12 |
13 | test: ## run tests.
14 | nose2 -F -v -B --with-coverage --coverage TTS tests
15 |
16 | test_vocoder: ## run vocoder tests.
17 | nose2 -F -v -B --with-coverage --coverage TTS tests.vocoder_tests
18 |
19 | test_tts: ## run tts tests.
20 | nose2 -F -v -B --with-coverage --coverage TTS tests.tts_tests
21 |
22 | test_aux: ## run aux tests.
23 | nose2 -F -v -B --with-coverage --coverage TTS tests.aux_tests
24 | ./run_bash_tests.sh
25 |
26 | test_zoo: ## run zoo tests.
27 | nose2 -F -v -B --with-coverage --coverage TTS tests.zoo_tests
28 |
29 | inference_tests: ## run inference tests.
30 | nose2 -F -v -B --with-coverage --coverage TTS tests.inference_tests
31 |
32 | data_tests: ## run data tests.
33 | nose2 -F -v -B --with-coverage --coverage TTS tests.data_tests
34 |
35 | test_text: ## run text tests.
36 | nose2 -F -v -B --with-coverage --coverage TTS tests.text_tests
37 |
38 | test_failed: ## only run tests that failed the last time.
39 | nose2 -F -v -B --with-coverage --coverage TTS tests
40 |
41 | style: ## update code style.
42 | black ${target_dirs}
43 | isort ${target_dirs}
44 |
45 | lint: ## run pylint linter.
46 | pylint ${target_dirs}
47 | black ${target_dirs} --check
48 | isort ${target_dirs} --check-only
49 |
50 | system-deps: ## install linux system deps
51 | sudo apt-get install -y libsndfile1-dev
52 |
53 | dev-deps: ## install development deps
54 | pip install -r requirements.dev.txt
55 |
56 | doc-deps: ## install docs dependencies
57 | pip install -r docs/requirements.txt
58 |
59 | build-docs: ## build the docs
60 | cd docs && make clean && make build
61 |
62 | hub-deps: ## install deps for torch hub use
63 | pip install -r requirements.hub.txt
64 |
65 | deps: ## install 🐸 requirements.
66 | pip install -r requirements.txt
67 |
68 | install: ## install 🐸 TTS for development.
69 | pip install -e .[all]
70 |
71 | docs: ## build the docs
72 | $(MAKE) -C docs clean && $(MAKE) -C docs html
73 |
--------------------------------------------------------------------------------
/TTS/tts/utils/text/phonemizers/ja_jp_phonemizer.py:
--------------------------------------------------------------------------------
1 | from typing import Dict
2 |
3 | from TTS.tts.utils.text.japanese.phonemizer import japanese_text_to_phonemes
4 | from TTS.tts.utils.text.phonemizers.base import BasePhonemizer
5 |
6 | _DEF_JA_PUNCS = "、.,[]()?!〽~『』「」【】"
7 |
8 | _TRANS_TABLE = {"、": ","}
9 |
10 |
11 | def trans(text):
12 | for i, j in _TRANS_TABLE.items():
13 | text = text.replace(i, j)
14 | return text
15 |
16 |
17 | class JA_JP_Phonemizer(BasePhonemizer):
18 | """🐸TTS Ja-Jp phonemizer using functions in `TTS.tts.utils.text.japanese.phonemizer`
19 |
20 | TODO: someone with JA knowledge should check this implementation
21 |
22 | Example:
23 |
24 | >>> from TTS.tts.utils.text.phonemizers import JA_JP_Phonemizer
25 | >>> phonemizer = JA_JP_Phonemizer()
26 | >>> phonemizer.phonemize("どちらに行きますか?", separator="|")
27 | 'd|o|c|h|i|r|a|n|i|i|k|i|m|a|s|u|k|a|?'
28 |
29 | """
30 |
31 | language = "ja-jp"
32 |
33 | def __init__(self, punctuations=_DEF_JA_PUNCS, keep_puncs=True, **kwargs): # pylint: disable=unused-argument
34 | super().__init__(self.language, punctuations=punctuations, keep_puncs=keep_puncs)
35 |
36 | @staticmethod
37 | def name():
38 | return "ja_jp_phonemizer"
39 |
40 | def _phonemize(self, text: str, separator: str = "|") -> str:
41 | ph = japanese_text_to_phonemes(text)
42 | if separator is not None and separator != "":
43 | return separator.join(ph)
44 | return ph
45 |
46 | def phonemize(self, text: str, separator="|") -> str:
47 | """Custom phonemize for JP_JA
48 |
49 | Skip pre-post processing steps used by the other phonemizers.
50 | """
51 | return self._phonemize(text, separator)
52 |
53 | @staticmethod
54 | def supported_languages() -> Dict:
55 | return {"ja-jp": "Japanese (Japan)"}
56 |
57 | def version(self) -> str:
58 | return "0.0.1"
59 |
60 | def is_available(self) -> bool:
61 | return True
62 |
63 |
64 | # if __name__ == "__main__":
65 | # text = "これは、電話をかけるための私の日本語の例のテキストです。"
66 | # e = JA_JP_Phonemizer()
67 | # print(e.supported_languages())
68 | # print(e.version())
69 | # print(e.language)
70 | # print(e.name())
71 | # print(e.is_available())
72 | # print("`" + e.phonemize(text) + "`")
73 |
--------------------------------------------------------------------------------
/TTS/vocoder/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 |
3 | from coqpit import Coqpit
4 | from torch.utils.data import Dataset
5 |
6 | from TTS.utils.audio import AudioProcessor
7 | from TTS.vocoder.datasets.gan_dataset import GANDataset
8 | from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data
9 | from TTS.vocoder.datasets.wavegrad_dataset import WaveGradDataset
10 | from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset
11 |
12 |
13 | def setup_dataset(config: Coqpit, ap: AudioProcessor, is_eval: bool, data_items: List, verbose: bool) -> Dataset:
14 | if "gan" in config.model.lower():
15 | dataset = GANDataset(
16 | ap=ap,
17 | items=data_items,
18 | seq_len=config.seq_len,
19 | hop_len=ap.hop_length,
20 | pad_short=config.pad_short,
21 | conv_pad=config.conv_pad,
22 | return_pairs=config.diff_samples_for_G_and_D if "diff_samples_for_G_and_D" in config else False,
23 | is_training=not is_eval,
24 | return_segments=not is_eval,
25 | use_noise_augment=config.use_noise_augment,
26 | use_cache=config.use_cache,
27 | verbose=verbose,
28 | )
29 | dataset.shuffle_mapping()
30 | elif config.model.lower() == "wavegrad":
31 | dataset = WaveGradDataset(
32 | ap=ap,
33 | items=data_items,
34 | seq_len=config.seq_len,
35 | hop_len=ap.hop_length,
36 | pad_short=config.pad_short,
37 | conv_pad=config.conv_pad,
38 | is_training=not is_eval,
39 | return_segments=True,
40 | use_noise_augment=False,
41 | use_cache=config.use_cache,
42 | verbose=verbose,
43 | )
44 | elif config.model.lower() == "wavernn":
45 | dataset = WaveRNNDataset(
46 | ap=ap,
47 | items=data_items,
48 | seq_len=config.seq_len,
49 | hop_len=ap.hop_length,
50 | pad=config.model_params.pad,
51 | mode=config.model_params.mode,
52 | mulaw=config.model_params.mulaw,
53 | is_training=not is_eval,
54 | verbose=verbose,
55 | )
56 | else:
57 | raise ValueError(f" [!] Dataset for model {config.model.lower()} cannot be found.")
58 | return dataset
59 |
--------------------------------------------------------------------------------
/.github/workflows/docker.yaml:
--------------------------------------------------------------------------------
1 | name: "Docker build and push"
2 | on:
3 | pull_request:
4 | push:
5 | branches:
6 | - main
7 | - dev
8 | tags:
9 | - v*
10 | jobs:
11 | docker-build:
12 | name: "Build and push Docker image"
13 | runs-on: ubuntu-20.04
14 | strategy:
15 | matrix:
16 | arch: ["amd64"]
17 | base:
18 | - "nvcr.io/nvidia/pytorch:22.03-py3" # GPU enabled
19 | - "ubuntu:20.04" # CPU only
20 | steps:
21 | - uses: actions/checkout@v2
22 | - name: Log in to the Container registry
23 | uses: docker/login-action@v1
24 | with:
25 | registry: ghcr.io
26 | username: ${{ github.actor }}
27 | password: ${{ secrets.GITHUB_TOKEN }}
28 | - name: Compute Docker tags, check VERSION file matches tag
29 | id: compute-tag
30 | run: |
31 | set -ex
32 | base="ghcr.io/coqui-ai/tts"
33 | tags="" # PR build
34 |
35 | if [[ ${{ matrix.base }} = "ubuntu:20.04" ]]; then
36 | base="ghcr.io/coqui-ai/tts-cpu"
37 | fi
38 |
39 | if [[ "${{ startsWith(github.ref, 'refs/heads/') }}" = "true" ]]; then
40 | # Push to branch
41 | github_ref="${{ github.ref }}"
42 | branch=${github_ref#*refs/heads/} # strip prefix to get branch name
43 | tags="${base}:${branch},${base}:${{ github.sha }},"
44 | elif [[ "${{ startsWith(github.ref, 'refs/tags/') }}" = "true" ]]; then
45 | VERSION="v$(cat TTS/VERSION)"
46 | if [[ "${{ github.ref }}" != "refs/tags/${VERSION}" ]]; then
47 | echo "Pushed tag does not match VERSION file. Aborting push."
48 | exit 1
49 | fi
50 | tags="${base}:${VERSION},${base}:latest,${base}:${{ github.sha }}"
51 | fi
52 | echo "::set-output name=tags::${tags}"
53 | - name: Set up QEMU
54 | uses: docker/setup-qemu-action@v1
55 | - name: Set up Docker Buildx
56 | id: buildx
57 | uses: docker/setup-buildx-action@v1
58 | - name: Build and push
59 | uses: docker/build-push-action@v2
60 | with:
61 | context: .
62 | platforms: linux/${{ matrix.arch }}
63 | push: ${{ github.event_name == 'push' }}
64 | build-args: "BASE=${{ matrix.base }}"
65 | tags: ${{ steps.compute-tag.outputs.tags }}
66 |
--------------------------------------------------------------------------------
/tests/vocoder_tests/test_wavegrad.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | import numpy as np
4 | import torch
5 | from torch import optim
6 |
7 | from TTS.vocoder.configs import WavegradConfig
8 | from TTS.vocoder.models.wavegrad import Wavegrad, WavegradArgs
9 |
10 | # pylint: disable=unused-variable
11 |
12 | torch.manual_seed(1)
13 | use_cuda = torch.cuda.is_available()
14 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
15 |
16 |
17 | class WavegradTrainTest(unittest.TestCase):
18 | def test_train_step(self): # pylint: disable=no-self-use
19 | """Test if all layers are updated in a basic training cycle"""
20 | input_dummy = torch.rand(8, 1, 20 * 300).to(device)
21 | mel_spec = torch.rand(8, 80, 20).to(device)
22 |
23 | criterion = torch.nn.L1Loss().to(device)
24 | args = WavegradArgs(
25 | in_channels=80,
26 | out_channels=1,
27 | upsample_factors=[5, 5, 3, 2, 2],
28 | upsample_dilations=[[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]],
29 | )
30 | config = WavegradConfig(model_params=args)
31 | model = Wavegrad(config)
32 |
33 | model_ref = Wavegrad(config)
34 | model.train()
35 | model.to(device)
36 | betas = np.linspace(1e-6, 1e-2, 1000)
37 | model.compute_noise_level(betas)
38 | model_ref.load_state_dict(model.state_dict())
39 | model_ref.to(device)
40 | count = 0
41 | for param, param_ref in zip(model.parameters(), model_ref.parameters()):
42 | assert (param - param_ref).sum() == 0, param
43 | count += 1
44 | optimizer = optim.Adam(model.parameters(), lr=0.001)
45 | for i in range(5):
46 | y_hat = model.forward(input_dummy, mel_spec, torch.rand(8).to(device))
47 | optimizer.zero_grad()
48 | loss = criterion(y_hat, input_dummy)
49 | loss.backward()
50 | optimizer.step()
51 | # check parameter changes
52 | count = 0
53 | for param, param_ref in zip(model.parameters(), model_ref.parameters()):
54 | # ignore the pre-highway layer since it is conditional
55 | # if count not in [145, 59]:
56 | assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format(
57 | count, param.shape, param, param_ref
58 | )
59 | count += 1
60 |
--------------------------------------------------------------------------------
/tests/tts_tests/test_tacotron_train.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import os
3 | import shutil
4 |
5 | from trainer import get_last_checkpoint
6 |
7 | from tests import get_device_id, get_tests_output_path, run_cli
8 | from TTS.tts.configs.tacotron_config import TacotronConfig
9 |
10 | config_path = os.path.join(get_tests_output_path(), "test_model_config.json")
11 | output_path = os.path.join(get_tests_output_path(), "train_outputs")
12 |
13 |
14 | config = TacotronConfig(
15 | batch_size=8,
16 | eval_batch_size=8,
17 | num_loader_workers=0,
18 | num_eval_loader_workers=0,
19 | text_cleaner="english_cleaners",
20 | use_phonemes=False,
21 | phoneme_language="en-us",
22 | phoneme_cache_path=os.path.join(get_tests_output_path(), "train_outputs/phoneme_cache/"),
23 | run_eval=True,
24 | test_delay_epochs=-1,
25 | epochs=1,
26 | print_step=1,
27 | test_sentences=[
28 | "Be a voice, not an echo.",
29 | ],
30 | print_eval=True,
31 | r=5,
32 | max_decoder_steps=50,
33 | )
34 | config.audio.do_trim_silence = True
35 | config.audio.trim_db = 60
36 | config.save_json(config_path)
37 |
38 | # train the model for one epoch
39 | command_train = (
40 | f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} "
41 | f"--coqpit.output_path {output_path} "
42 | "--coqpit.datasets.0.name ljspeech "
43 | "--coqpit.datasets.0.meta_file_train metadata.csv "
44 | "--coqpit.datasets.0.meta_file_val metadata.csv "
45 | "--coqpit.datasets.0.path tests/data/ljspeech "
46 | "--coqpit.test_delay_epochs 0"
47 | )
48 | run_cli(command_train)
49 |
50 | # Find latest folder
51 | continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime)
52 |
53 | # Inference using TTS API
54 | continue_config_path = os.path.join(continue_path, "config.json")
55 | continue_restore_path, _ = get_last_checkpoint(continue_path)
56 | out_wav_path = os.path.join(get_tests_output_path(), "output.wav")
57 |
58 | inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}"
59 | run_cli(inference_command)
60 |
61 | # restore the model and continue training for one more epoch
62 | command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} "
63 | run_cli(command_train)
64 | shutil.rmtree(continue_path)
65 |
--------------------------------------------------------------------------------
/TTS/bin/find_unique_phonemes.py:
--------------------------------------------------------------------------------
1 | """Find all the unique characters in a dataset"""
2 | import argparse
3 | import multiprocessing
4 | from argparse import RawTextHelpFormatter
5 |
6 | from tqdm.contrib.concurrent import process_map
7 |
8 | from TTS.config import load_config
9 | from TTS.tts.datasets import load_tts_samples
10 | from TTS.tts.utils.text.phonemizers.gruut_wrapper import Gruut
11 |
12 | phonemizer = Gruut(language="en-us")
13 |
14 |
15 | def compute_phonemes(item):
16 | try:
17 | text = item[0]
18 | ph = phonemizer.phonemize(text).split("|")
19 | except:
20 | return []
21 | return list(set(ph))
22 |
23 |
24 | def main():
25 | # pylint: disable=W0601
26 | global c
27 | # pylint: disable=bad-option-value
28 | parser = argparse.ArgumentParser(
29 | description="""Find all the unique characters or phonemes in a dataset.\n\n"""
30 | """
31 | Example runs:
32 |
33 | python TTS/bin/find_unique_phonemes.py --config_path config.json
34 | """,
35 | formatter_class=RawTextHelpFormatter,
36 | )
37 | parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True)
38 | args = parser.parse_args()
39 |
40 | c = load_config(args.config_path)
41 |
42 | # load all datasets
43 | train_items, eval_items = load_tts_samples(
44 | c.datasets, eval_split=True, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size
45 | )
46 | items = train_items + eval_items
47 | print("Num items:", len(items))
48 |
49 | is_lang_def = all(item["language"] for item in items)
50 |
51 | if not c.phoneme_language or not is_lang_def:
52 | raise ValueError("Phoneme language must be defined in config.")
53 |
54 | phonemes = process_map(compute_phonemes, items, max_workers=multiprocessing.cpu_count(), chunksize=15)
55 | phones = []
56 | for ph in phonemes:
57 | phones.extend(ph)
58 | phones = set(phones)
59 | lower_phones = filter(lambda c: c.islower(), phones)
60 | phones_force_lower = [c.lower() for c in phones]
61 | phones_force_lower = set(phones_force_lower)
62 |
63 | print(f" > Number of unique phonemes: {len(phones)}")
64 | print(f" > Unique phonemes: {''.join(sorted(phones))}")
65 | print(f" > Unique lower phonemes: {''.join(sorted(lower_phones))}")
66 | print(f" > Unique all forced to lower phonemes: {''.join(sorted(phones_force_lower))}")
67 |
68 |
69 | if __name__ == "__main__":
70 | main()
71 |
--------------------------------------------------------------------------------
/TTS/bin/train_tts.py:
--------------------------------------------------------------------------------
1 | import os
2 | from dataclasses import dataclass, field
3 |
4 | from trainer import Trainer, TrainerArgs
5 |
6 | from TTS.config import load_config, register_config
7 | from TTS.tts.datasets import load_tts_samples
8 | from TTS.tts.models import setup_model
9 |
10 |
11 | @dataclass
12 | class TrainTTSArgs(TrainerArgs):
13 | config_path: str = field(default=None, metadata={"help": "Path to the config file."})
14 |
15 |
16 | def main():
17 | """Run `tts` model training directly by a `config.json` file."""
18 | # init trainer args
19 | train_args = TrainTTSArgs()
20 | parser = train_args.init_argparse(arg_prefix="")
21 |
22 | # override trainer args from command-line args
23 | args, config_overrides = parser.parse_known_args()
24 | train_args.parse_args(args)
25 |
26 | # load config.json and register
27 | if args.config_path or args.continue_path:
28 | if args.config_path:
29 | # init from a file
30 | config = load_config(args.config_path)
31 | if len(config_overrides) > 0:
32 | config.parse_known_args(config_overrides, relaxed_parser=True)
33 | elif args.continue_path:
34 | # continue from a prev experiment
35 | config = load_config(os.path.join(args.continue_path, "config.json"))
36 | if len(config_overrides) > 0:
37 | config.parse_known_args(config_overrides, relaxed_parser=True)
38 | else:
39 | # init from console args
40 | from TTS.config.shared_configs import BaseTrainingConfig # pylint: disable=import-outside-toplevel
41 |
42 | config_base = BaseTrainingConfig()
43 | config_base.parse_known_args(config_overrides)
44 | config = register_config(config_base.model)()
45 |
46 | # load training samples
47 | train_samples, eval_samples = load_tts_samples(
48 | config.datasets,
49 | eval_split=True,
50 | eval_split_max_size=config.eval_split_max_size,
51 | eval_split_size=config.eval_split_size,
52 | )
53 |
54 | # init the model from config
55 | model = setup_model(config, train_samples + eval_samples)
56 |
57 | # init the trainer and 🚀
58 | trainer = Trainer(
59 | train_args,
60 | model.config,
61 | config.output_path,
62 | model=model,
63 | train_samples=train_samples,
64 | eval_samples=eval_samples,
65 | parse_command_line_args=False,
66 | )
67 | trainer.fit()
68 |
69 |
70 | if __name__ == "__main__":
71 | main()
72 |
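A rough sketch of the argument splitting the script above relies on, using plain `argparse` (the `--batch_size` override in the example list is illustrative, not a fixed CLI flag):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--config_path", type=str, default=None)

# parse_known_args() returns the recognised args plus the leftover tokens;
# the leftovers are what the script later applies to the loaded config as overrides.
args, config_overrides = parser.parse_known_args(["--config_path", "config.json", "--batch_size", "16"])
print(args.config_path)   # config.json
print(config_overrides)   # ['--batch_size', '16']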
--------------------------------------------------------------------------------
/TTS/tts/layers/glow_tts/duration_predictor.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 |
4 | from ..generic.normalization import LayerNorm
5 |
6 |
7 | class DurationPredictor(nn.Module):
8 | """Glow-TTS duration prediction model.
9 |
10 | ::
11 |
12 | [2 x (conv1d_kxk -> relu -> layer_norm -> dropout)] -> conv1d_1x1 -> durs
13 |
14 | Args:
15 | in_channels (int): Number of channels of the input tensor.
16 | hidden_channels (int): Number of hidden channels of the network.
17 | kernel_size (int): Kernel size for the conv layers.
18 | dropout_p (float): Dropout rate used after each conv layer.
19 | """
20 |
21 | def __init__(self, in_channels, hidden_channels, kernel_size, dropout_p, cond_channels=None, language_emb_dim=None):
22 | super().__init__()
23 |
24 | # add language embedding dim in the input
25 | if language_emb_dim:
26 | in_channels += language_emb_dim
27 |
28 | # class arguments
29 | self.in_channels = in_channels
30 | self.filter_channels = hidden_channels
31 | self.kernel_size = kernel_size
32 | self.dropout_p = dropout_p
33 | # layers
34 | self.drop = nn.Dropout(dropout_p)
35 | self.conv_1 = nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size // 2)
36 | self.norm_1 = LayerNorm(hidden_channels)
37 | self.conv_2 = nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size // 2)
38 | self.norm_2 = LayerNorm(hidden_channels)
39 | # output layer
40 | self.proj = nn.Conv1d(hidden_channels, 1, 1)
41 | if cond_channels is not None and cond_channels != 0:
42 | self.cond = nn.Conv1d(cond_channels, in_channels, 1)
43 |
44 | if language_emb_dim != 0 and language_emb_dim is not None:
45 | self.cond_lang = nn.Conv1d(language_emb_dim, in_channels, 1)
46 |
47 | def forward(self, x, x_mask, g=None, lang_emb=None):
48 | """
49 | Shapes:
50 | - x: :math:`[B, C, T]`
51 | - x_mask: :math:`[B, 1, T]`
52 | - g: :math:`[B, C, 1]`
53 | """
54 | if g is not None:
55 | x = x + self.cond(g)
56 |
57 | if lang_emb is not None:
58 | x = x + self.cond_lang(lang_emb)
59 |
60 | x = self.conv_1(x * x_mask)
61 | x = torch.relu(x)
62 | x = self.norm_1(x)
63 | x = self.drop(x)
64 | x = self.conv_2(x * x_mask)
65 | x = torch.relu(x)
66 | x = self.norm_2(x)
67 | x = self.drop(x)
68 | x = self.proj(x * x_mask)
69 | return x * x_mask
70 |
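A minimal shape check for the module above (the channel sizes and sequence length are illustrative):

import torch

from TTS.tts.layers.glow_tts.duration_predictor import DurationPredictor

dp = DurationPredictor(in_channels=192, hidden_channels=256, kernel_size=3, dropout_p=0.1)
x = torch.randn(2, 192, 50)      # [B, C, T] token representations
x_mask = torch.ones(2, 1, 50)    # [B, 1, T] padding mask
durations = dp(x, x_mask)
print(durations.shape)           # torch.Size([2, 1, 50]) -> one predicted duration value per time step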
--------------------------------------------------------------------------------
/TTS/utils/capacitron_optimizer.py:
--------------------------------------------------------------------------------
1 | from typing import Generator
2 |
3 | from trainer.trainer_utils import get_optimizer
4 |
5 |
6 | class CapacitronOptimizer:
7 | """Double optimizer class for the Capacitron model."""
8 |
9 | def __init__(self, config: dict, model_params: Generator) -> None:
10 | self.primary_params, self.secondary_params = self.split_model_parameters(model_params)
11 |
12 | optimizer_names = list(config.optimizer_params.keys())
13 | optimizer_parameters = list(config.optimizer_params.values())
14 |
15 | self.primary_optimizer = get_optimizer(
16 | optimizer_names[0],
17 | optimizer_parameters[0],
18 | config.lr,
19 | parameters=self.primary_params,
20 | )
21 |
22 | self.secondary_optimizer = get_optimizer(
23 | optimizer_names[1],
24 | self.extract_optimizer_parameters(optimizer_parameters[1]),
25 | optimizer_parameters[1]["lr"],
26 | parameters=self.secondary_params,
27 | )
28 |
29 | self.param_groups = self.primary_optimizer.param_groups
30 |
31 | def first_step(self):
32 | self.secondary_optimizer.step()
33 | self.secondary_optimizer.zero_grad()
34 | self.primary_optimizer.zero_grad()
35 |
36 | def step(self):
37 | # Update param groups to display the correct learning rate
38 | self.param_groups = self.primary_optimizer.param_groups
39 | self.primary_optimizer.step()
40 |
41 | def zero_grad(self):
42 | self.primary_optimizer.zero_grad()
43 | self.secondary_optimizer.zero_grad()
44 |
45 | def load_state_dict(self, state_dict):
46 | self.primary_optimizer.load_state_dict(state_dict[0])
47 | self.secondary_optimizer.load_state_dict(state_dict[1])
48 |
49 | def state_dict(self):
50 | return [self.primary_optimizer.state_dict(), self.secondary_optimizer.state_dict()]
51 |
52 | @staticmethod
53 | def split_model_parameters(model_params: Generator) -> list:
54 | primary_params = []
55 | secondary_params = []
56 | for name, param in model_params:
57 | if param.requires_grad:
58 | if name == "capacitron_vae_layer.beta":
59 | secondary_params.append(param)
60 | else:
61 | primary_params.append(param)
62 | return [iter(primary_params), iter(secondary_params)]
63 |
64 | @staticmethod
65 | def extract_optimizer_parameters(params: dict) -> dict:
66 | """Extract parameters that are not the learning rate"""
67 | return {k: v for k, v in params.items() if k != "lr"}
68 |
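A small sketch of the parameter split performed by `split_model_parameters` (the toy module below only mimics the hard-coded `capacitron_vae_layer.beta` name the split keys on; it is not a real Capacitron model):

import torch

from TTS.utils.capacitron_optimizer import CapacitronOptimizer


class ToyCapacitron(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = torch.nn.Linear(4, 4)
        # Register a parameter whose qualified name becomes "capacitron_vae_layer.beta".
        self.capacitron_vae_layer = torch.nn.Module()
        self.capacitron_vae_layer.beta = torch.nn.Parameter(torch.tensor(1.0))


model = ToyCapacitron()
primary, secondary = CapacitronOptimizer.split_model_parameters(model.named_parameters())
print(len(list(primary)), len(list(secondary)))  # 2 1 -> encoder weight/bias vs. the beta parameter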
--------------------------------------------------------------------------------
/recipes/ljspeech/align_tts/train_aligntts.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from trainer import Trainer, TrainerArgs
4 |
5 | from TTS.tts.configs.align_tts_config import AlignTTSConfig
6 | from TTS.tts.configs.shared_configs import BaseDatasetConfig
7 | from TTS.tts.datasets import load_tts_samples
8 | from TTS.tts.models.align_tts import AlignTTS
9 | from TTS.tts.utils.text.tokenizer import TTSTokenizer
10 | from TTS.utils.audio import AudioProcessor
11 |
12 | output_path = os.path.dirname(os.path.abspath(__file__))
13 |
14 | # init configs
15 | dataset_config = BaseDatasetConfig(
16 | name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
17 | )
18 | config = AlignTTSConfig(
19 | batch_size=32,
20 | eval_batch_size=16,
21 | num_loader_workers=4,
22 | num_eval_loader_workers=4,
23 | run_eval=True,
24 | test_delay_epochs=-1,
25 | epochs=1000,
26 | text_cleaner="english_cleaners",
27 | use_phonemes=False,
28 | phoneme_language="en-us",
29 | phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
30 | print_step=25,
31 | print_eval=True,
32 | mixed_precision=False,
33 | output_path=output_path,
34 | datasets=[dataset_config],
35 | )
36 |
37 | # INITIALIZE THE AUDIO PROCESSOR
38 | # Audio processor is used for feature extraction and audio I/O.
39 | # It mainly serves the dataloader and the training loggers.
40 | ap = AudioProcessor.init_from_config(config)
41 |
42 | # INITIALIZE THE TOKENIZER
43 | # Tokenizer is used to convert text to sequences of token IDs.
44 | # If characters are not defined in the config, default characters are added to the config.
45 | tokenizer, config = TTSTokenizer.init_from_config(config)
46 |
47 | # LOAD DATA SAMPLES
48 | # Each sample is a list of ```[text, audio_file_path, speaker_name]```
49 | # You can define your custom sample loader returning the list of samples.
50 | # Or define your custom formatter and pass it to `load_tts_samples` (a sketch is given after this file).
51 | # Check `TTS.tts.datasets.load_tts_samples` for more details.
52 | train_samples, eval_samples = load_tts_samples(
53 | dataset_config,
54 | eval_split=True,
55 | eval_split_max_size=config.eval_split_max_size,
56 | eval_split_size=config.eval_split_size,
57 | )
58 |
59 | # init model
60 | model = AlignTTS(config, ap, tokenizer)
61 |
62 | # INITIALIZE THE TRAINER
63 | # Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training,
64 | # distributed training, etc.
65 | trainer = Trainer(
66 | TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
67 | )
68 |
69 | # AND... 3,2,1... 🚀
70 | trainer.fit()
71 |
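As noted in the LOAD DATA SAMPLES comment, a custom formatter can be passed to `load_tts_samples`. A rough sketch under two assumptions: the metadata file uses an `audio_id|transcription` layout, and each sample follows the `[text, audio_file_path, speaker_name]` structure described above (adjust both to the installed 🐸TTS version):

import os


def my_formatter(root_path, meta_file, **kwargs):  # pylint: disable=unused-argument
    """Parse a simple `audio_id|transcription` metadata file into TTS samples."""
    samples = []
    with open(os.path.join(root_path, meta_file), encoding="utf-8") as f:
        for line in f:
            audio_id, text = line.strip().split("|", maxsplit=1)
            wav_path = os.path.join(root_path, "wavs", f"{audio_id}.wav")
            samples.append([text, wav_path, "my_speaker"])
    return samples


# train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, formatter=my_formatter)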
--------------------------------------------------------------------------------