├── .gitignore ├── .gitmodules ├── LICENSE ├── README.md ├── README_zh.md ├── colab_webui.ipynb ├── docs ├── download.png ├── linly_logo.png ├── linly_watermark.png └── webui.png ├── env.example ├── examples ├── .DS_Store └── bk_music.mp3 ├── font └── SimHei.ttf ├── gui.py ├── requirements.txt ├── requirements_module.txt ├── scripts ├── download_models.sh ├── huggingface_download.py └── modelscope_download.py ├── submodules ├── TTS │ ├── CITATION.cff │ ├── CODE_OF_CONDUCT.md │ ├── CODE_OWNERS.rst │ ├── CONTRIBUTING.md │ ├── Dockerfile │ ├── LICENSE.txt │ ├── MANIFEST.in │ ├── Makefile │ ├── README.md │ ├── TTS │ │ ├── .models.json │ │ ├── VERSION │ │ ├── __init__.py │ │ ├── api.py │ │ ├── bin │ │ │ ├── __init__.py │ │ │ ├── collect_env_info.py │ │ │ ├── compute_attention_masks.py │ │ │ ├── compute_embeddings.py │ │ │ ├── compute_statistics.py │ │ │ ├── eval_encoder.py │ │ │ ├── extract_tts_spectrograms.py │ │ │ ├── find_unique_chars.py │ │ │ ├── find_unique_phonemes.py │ │ │ ├── remove_silence_using_vad.py │ │ │ ├── resample.py │ │ │ ├── synthesize.py │ │ │ ├── train_encoder.py │ │ │ ├── train_tts.py │ │ │ ├── train_vocoder.py │ │ │ └── tune_wavegrad.py │ │ ├── config │ │ │ ├── __init__.py │ │ │ └── shared_configs.py │ │ ├── demos │ │ │ └── xtts_ft_demo │ │ │ │ ├── requirements.txt │ │ │ │ ├── utils │ │ │ │ ├── formatter.py │ │ │ │ └── gpt_train.py │ │ │ │ └── xtts_demo.py │ │ ├── encoder │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── configs │ │ │ │ ├── base_encoder_config.py │ │ │ │ ├── emotion_encoder_config.py │ │ │ │ └── speaker_encoder_config.py │ │ │ ├── dataset.py │ │ │ ├── losses.py │ │ │ ├── models │ │ │ │ ├── base_encoder.py │ │ │ │ ├── lstm.py │ │ │ │ └── resnet.py │ │ │ ├── requirements.txt │ │ │ └── utils │ │ │ │ ├── __init__.py │ │ │ │ ├── generic_utils.py │ │ │ │ ├── prepare_voxceleb.py │ │ │ │ ├── training.py │ │ │ │ └── visual.py │ │ ├── model.py │ │ ├── server │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── conf.json │ │ │ ├── server.py │ │ │ ├── static │ │ │ │ └── coqui-log-green-TTS.png │ │ │ └── templates │ │ │ │ ├── details.html │ │ │ │ └── index.html │ │ ├── tts │ │ │ ├── __init__.py │ │ │ ├── configs │ │ │ │ ├── __init__.py │ │ │ │ ├── align_tts_config.py │ │ │ │ ├── bark_config.py │ │ │ │ ├── delightful_tts_config.py │ │ │ │ ├── fast_pitch_config.py │ │ │ │ ├── fast_speech_config.py │ │ │ │ ├── fastspeech2_config.py │ │ │ │ ├── glow_tts_config.py │ │ │ │ ├── neuralhmm_tts_config.py │ │ │ │ ├── overflow_config.py │ │ │ │ ├── shared_configs.py │ │ │ │ ├── speedy_speech_config.py │ │ │ │ ├── tacotron2_config.py │ │ │ │ ├── tacotron_config.py │ │ │ │ ├── tortoise_config.py │ │ │ │ ├── vits_config.py │ │ │ │ └── xtts_config.py │ │ │ ├── datasets │ │ │ │ ├── __init__.py │ │ │ │ ├── dataset.py │ │ │ │ └── formatters.py │ │ │ ├── layers │ │ │ │ ├── __init__.py │ │ │ │ ├── align_tts │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── duration_predictor.py │ │ │ │ │ └── mdn.py │ │ │ │ ├── bark │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── hubert │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── hubert_manager.py │ │ │ │ │ │ ├── kmeans_hubert.py │ │ │ │ │ │ └── tokenizer.py │ │ │ │ │ ├── inference_funcs.py │ │ │ │ │ ├── load_model.py │ │ │ │ │ ├── model.py │ │ │ │ │ └── model_fine.py │ │ │ │ ├── delightful_tts │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── acoustic_model.py │ │ │ │ │ ├── conformer.py │ │ │ │ │ ├── conv_layers.py │ │ │ │ │ ├── encoders.py │ │ │ │ │ ├── energy_adaptor.py │ │ │ │ │ ├── kernel_predictor.py │ │ │ │ │ ├── networks.py │ │ │ │ │ ├── phoneme_prosody_predictor.py │ │ │ │ │ 
├── pitch_adaptor.py │ │ │ │ │ └── variance_predictor.py │ │ │ │ ├── feed_forward │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── decoder.py │ │ │ │ │ ├── duration_predictor.py │ │ │ │ │ └── encoder.py │ │ │ │ ├── generic │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── aligner.py │ │ │ │ │ ├── gated_conv.py │ │ │ │ │ ├── normalization.py │ │ │ │ │ ├── pos_encoding.py │ │ │ │ │ ├── res_conv_bn.py │ │ │ │ │ ├── time_depth_sep_conv.py │ │ │ │ │ ├── transformer.py │ │ │ │ │ └── wavenet.py │ │ │ │ ├── glow_tts │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── decoder.py │ │ │ │ │ ├── duration_predictor.py │ │ │ │ │ ├── encoder.py │ │ │ │ │ ├── glow.py │ │ │ │ │ └── transformer.py │ │ │ │ ├── losses.py │ │ │ │ ├── overflow │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── common_layers.py │ │ │ │ │ ├── decoder.py │ │ │ │ │ ├── neural_hmm.py │ │ │ │ │ └── plotting_utils.py │ │ │ │ ├── tacotron │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── attentions.py │ │ │ │ │ ├── capacitron_layers.py │ │ │ │ │ ├── common_layers.py │ │ │ │ │ ├── gst_layers.py │ │ │ │ │ ├── tacotron.py │ │ │ │ │ └── tacotron2.py │ │ │ │ ├── tortoise │ │ │ │ │ ├── arch_utils.py │ │ │ │ │ ├── audio_utils.py │ │ │ │ │ ├── autoregressive.py │ │ │ │ │ ├── classifier.py │ │ │ │ │ ├── clvp.py │ │ │ │ │ ├── diffusion.py │ │ │ │ │ ├── diffusion_decoder.py │ │ │ │ │ ├── dpm_solver.py │ │ │ │ │ ├── random_latent_generator.py │ │ │ │ │ ├── tokenizer.py │ │ │ │ │ ├── transformer.py │ │ │ │ │ ├── utils.py │ │ │ │ │ ├── vocoder.py │ │ │ │ │ ├── wav2vec_alignment.py │ │ │ │ │ └── xtransformers.py │ │ │ │ ├── vits │ │ │ │ │ ├── discriminator.py │ │ │ │ │ ├── networks.py │ │ │ │ │ ├── stochastic_duration_predictor.py │ │ │ │ │ └── transforms.py │ │ │ │ └── xtts │ │ │ │ │ ├── dvae.py │ │ │ │ │ ├── gpt.py │ │ │ │ │ ├── gpt_inference.py │ │ │ │ │ ├── hifigan_decoder.py │ │ │ │ │ ├── latent_encoder.py │ │ │ │ │ ├── perceiver_encoder.py │ │ │ │ │ ├── stream_generator.py │ │ │ │ │ ├── tokenizer.py │ │ │ │ │ ├── trainer │ │ │ │ │ ├── dataset.py │ │ │ │ │ └── gpt_trainer.py │ │ │ │ │ ├── xtts_manager.py │ │ │ │ │ └── zh_num2words.py │ │ │ ├── models │ │ │ │ ├── __init__.py │ │ │ │ ├── align_tts.py │ │ │ │ ├── bark.py │ │ │ │ ├── base_tacotron.py │ │ │ │ ├── base_tts.py │ │ │ │ ├── delightful_tts.py │ │ │ │ ├── forward_tts.py │ │ │ │ ├── glow_tts.py │ │ │ │ ├── neuralhmm_tts.py │ │ │ │ ├── overflow.py │ │ │ │ ├── tacotron.py │ │ │ │ ├── tacotron2.py │ │ │ │ ├── tortoise.py │ │ │ │ ├── vits.py │ │ │ │ └── xtts.py │ │ │ └── utils │ │ │ │ ├── __init__.py │ │ │ │ ├── assets │ │ │ │ └── tortoise │ │ │ │ │ └── tokenizer.json │ │ │ │ ├── data.py │ │ │ │ ├── fairseq.py │ │ │ │ ├── helpers.py │ │ │ │ ├── languages.py │ │ │ │ ├── managers.py │ │ │ │ ├── measures.py │ │ │ │ ├── monotonic_align │ │ │ │ ├── __init__.py │ │ │ │ ├── core.pyx │ │ │ │ └── setup.py │ │ │ │ ├── speakers.py │ │ │ │ ├── ssim.py │ │ │ │ ├── synthesis.py │ │ │ │ ├── text │ │ │ │ ├── __init__.py │ │ │ │ ├── bangla │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── phonemizer.py │ │ │ │ ├── belarusian │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── phonemizer.py │ │ │ │ ├── characters.py │ │ │ │ ├── chinese_mandarin │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── numbers.py │ │ │ │ │ ├── phonemizer.py │ │ │ │ │ └── pinyinToPhonemes.py │ │ │ │ ├── cleaners.py │ │ │ │ ├── cmudict.py │ │ │ │ ├── english │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── abbreviations.py │ │ │ │ │ ├── number_norm.py │ │ │ │ │ └── time_norm.py │ │ │ │ ├── french │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── abbreviations.py │ │ │ │ ├── japanese │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── phonemizer.py │ │ 
│ │ ├── korean │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── ko_dictionary.py │ │ │ │ │ ├── korean.py │ │ │ │ │ └── phonemizer.py │ │ │ │ ├── phonemizers │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── bangla_phonemizer.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── belarusian_phonemizer.py │ │ │ │ │ ├── espeak_wrapper.py │ │ │ │ │ ├── gruut_wrapper.py │ │ │ │ │ ├── ja_jp_phonemizer.py │ │ │ │ │ ├── ko_kr_phonemizer.py │ │ │ │ │ ├── multi_phonemizer.py │ │ │ │ │ └── zh_cn_phonemizer.py │ │ │ │ ├── punctuation.py │ │ │ │ └── tokenizer.py │ │ │ │ └── visual.py │ │ ├── utils │ │ │ ├── __init__.py │ │ │ ├── audio │ │ │ │ ├── __init__.py │ │ │ │ ├── numpy_transforms.py │ │ │ │ ├── processor.py │ │ │ │ └── torch_transforms.py │ │ │ ├── callbacks.py │ │ │ ├── capacitron_optimizer.py │ │ │ ├── distribute.py │ │ │ ├── download.py │ │ │ ├── downloaders.py │ │ │ ├── generic_utils.py │ │ │ ├── io.py │ │ │ ├── manage.py │ │ │ ├── radam.py │ │ │ ├── samplers.py │ │ │ ├── synthesizer.py │ │ │ ├── training.py │ │ │ └── vad.py │ │ ├── vc │ │ │ ├── configs │ │ │ │ ├── __init__.py │ │ │ │ ├── freevc_config.py │ │ │ │ └── shared_configs.py │ │ │ ├── models │ │ │ │ ├── __init__.py │ │ │ │ ├── base_vc.py │ │ │ │ └── freevc.py │ │ │ └── modules │ │ │ │ ├── __init__.py │ │ │ │ └── freevc │ │ │ │ ├── __init__.py │ │ │ │ ├── commons.py │ │ │ │ ├── mel_processing.py │ │ │ │ ├── modules.py │ │ │ │ ├── speaker_encoder │ │ │ │ ├── __init__.py │ │ │ │ ├── audio.py │ │ │ │ ├── hparams.py │ │ │ │ └── speaker_encoder.py │ │ │ │ └── wavlm │ │ │ │ ├── __init__.py │ │ │ │ ├── config.json │ │ │ │ ├── modules.py │ │ │ │ └── wavlm.py │ │ └── vocoder │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── configs │ │ │ ├── __init__.py │ │ │ ├── fullband_melgan_config.py │ │ │ ├── hifigan_config.py │ │ │ ├── melgan_config.py │ │ │ ├── multiband_melgan_config.py │ │ │ ├── parallel_wavegan_config.py │ │ │ ├── shared_configs.py │ │ │ ├── univnet_config.py │ │ │ ├── wavegrad_config.py │ │ │ └── wavernn_config.py │ │ │ ├── datasets │ │ │ ├── __init__.py │ │ │ ├── gan_dataset.py │ │ │ ├── preprocess.py │ │ │ ├── wavegrad_dataset.py │ │ │ └── wavernn_dataset.py │ │ │ ├── layers │ │ │ ├── __init__.py │ │ │ ├── hifigan.py │ │ │ ├── losses.py │ │ │ ├── lvc_block.py │ │ │ ├── melgan.py │ │ │ ├── parallel_wavegan.py │ │ │ ├── pqmf.py │ │ │ ├── qmf.dat │ │ │ ├── upsample.py │ │ │ └── wavegrad.py │ │ │ ├── models │ │ │ ├── __init__.py │ │ │ ├── base_vocoder.py │ │ │ ├── fullband_melgan_generator.py │ │ │ ├── gan.py │ │ │ ├── hifigan_discriminator.py │ │ │ ├── hifigan_generator.py │ │ │ ├── melgan_discriminator.py │ │ │ ├── melgan_generator.py │ │ │ ├── melgan_multiscale_discriminator.py │ │ │ ├── multiband_melgan_generator.py │ │ │ ├── parallel_wavegan_discriminator.py │ │ │ ├── parallel_wavegan_generator.py │ │ │ ├── random_window_discriminator.py │ │ │ ├── univnet_discriminator.py │ │ │ ├── univnet_generator.py │ │ │ ├── wavegrad.py │ │ │ └── wavernn.py │ │ │ ├── pqmf_output.wav │ │ │ └── utils │ │ │ ├── __init__.py │ │ │ ├── distribution.py │ │ │ └── generic_utils.py │ ├── dockerfiles │ │ └── Dockerfile.dev │ ├── docs │ │ ├── Makefile │ │ ├── README.md │ │ ├── requirements.txt │ │ └── source │ │ │ ├── _static │ │ │ └── logo.png │ │ │ ├── _templates │ │ │ └── page.html │ │ │ ├── conf.py │ │ │ ├── configuration.md │ │ │ ├── contributing.md │ │ │ ├── docker_images.md │ │ │ ├── faq.md │ │ │ ├── finetuning.md │ │ │ ├── formatting_your_dataset.md │ │ │ ├── implementing_a_new_language_frontend.md │ │ │ ├── implementing_a_new_model.md │ │ │ ├── index.md │ │ │ ├── 
inference.md │ │ │ ├── installation.md │ │ │ ├── main_classes │ │ │ ├── audio_processor.md │ │ │ ├── dataset.md │ │ │ ├── gan.md │ │ │ ├── model_api.md │ │ │ ├── speaker_manager.md │ │ │ └── trainer_api.md │ │ │ ├── make.bat │ │ │ ├── marytts.md │ │ │ ├── models │ │ │ ├── bark.md │ │ │ ├── forward_tts.md │ │ │ ├── glow_tts.md │ │ │ ├── overflow.md │ │ │ ├── tacotron1-2.md │ │ │ ├── tortoise.md │ │ │ ├── vits.md │ │ │ └── xtts.md │ │ │ ├── training_a_model.md │ │ │ ├── tts_datasets.md │ │ │ ├── tutorial_for_nervous_beginners.md │ │ │ └── what_makes_a_good_dataset.md │ ├── hubconf.py │ ├── images │ │ ├── TTS-performance.png │ │ ├── coqui-log-green-TTS.png │ │ ├── demo_server.gif │ │ ├── example_model_output.png │ │ ├── model.png │ │ ├── tts_cli.gif │ │ └── tts_performance.png │ ├── notebooks │ │ ├── ExtractTTSpectrogram.ipynb │ │ ├── PlotUmapLibriTTS.ipynb │ │ ├── TestAttention.ipynb │ │ ├── Tortoise.ipynb │ │ ├── Tutorial_1_use-pretrained-TTS.ipynb │ │ ├── Tutorial_2_train_your_first_TTS_model.ipynb │ │ └── dataset_analysis │ │ │ ├── AnalyzeDataset.ipynb │ │ │ ├── CheckDatasetSNR.ipynb │ │ │ ├── CheckPitch.ipynb │ │ │ ├── CheckSpectrograms.ipynb │ │ │ ├── PhonemeCoverage.ipynb │ │ │ ├── README.md │ │ │ └── analyze.py │ ├── pyproject.toml │ ├── recipes │ │ ├── README.md │ │ ├── bel-alex73 │ │ │ ├── .gitignore │ │ │ ├── README.md │ │ │ ├── choose_speaker.ipynb │ │ │ ├── docker-prepare-start.sh │ │ │ ├── docker-prepare │ │ │ │ ├── Dockerfile │ │ │ │ └── runtime.sh │ │ │ ├── dump_config.py │ │ │ ├── train_glowtts.py │ │ │ └── train_hifigan.py │ │ ├── blizzard2013 │ │ │ ├── README.md │ │ │ ├── tacotron1-Capacitron │ │ │ │ └── train_capacitron_t1.py │ │ │ └── tacotron2-Capacitron │ │ │ │ └── train_capacitron_t2.py │ │ ├── kokoro │ │ │ └── tacotron2-DDC │ │ │ │ ├── run.sh │ │ │ │ └── tacotron2-DDC.json │ │ ├── ljspeech │ │ │ ├── README.md │ │ │ ├── align_tts │ │ │ │ └── train_aligntts.py │ │ │ ├── delightful_tts │ │ │ │ └── train_delightful_tts.py │ │ │ ├── download_ljspeech.sh │ │ │ ├── fast_pitch │ │ │ │ └── train_fast_pitch.py │ │ │ ├── fast_speech │ │ │ │ └── train_fast_speech.py │ │ │ ├── fastspeech2 │ │ │ │ └── train_fastspeech2.py │ │ │ ├── glow_tts │ │ │ │ └── train_glowtts.py │ │ │ ├── hifigan │ │ │ │ └── train_hifigan.py │ │ │ ├── multiband_melgan │ │ │ │ └── train_multiband_melgan.py │ │ │ ├── neuralhmm_tts │ │ │ │ └── train_neuralhmmtts.py │ │ │ ├── overflow │ │ │ │ ├── lj_parameters.pt │ │ │ │ └── train_overflow.py │ │ │ ├── speedy_speech │ │ │ │ └── train_speedy_speech.py │ │ │ ├── tacotron2-Capacitron │ │ │ │ └── train_capacitron_t2.py │ │ │ ├── tacotron2-DCA │ │ │ │ └── train_tacotron_dca.py │ │ │ ├── tacotron2-DDC │ │ │ │ └── train_tacotron_ddc.py │ │ │ ├── univnet │ │ │ │ └── train.py │ │ │ ├── vits_tts │ │ │ │ └── train_vits.py │ │ │ ├── wavegrad │ │ │ │ └── train_wavegrad.py │ │ │ ├── wavernn │ │ │ │ └── train_wavernn.py │ │ │ ├── xtts_v1 │ │ │ │ └── train_gpt_xtts.py │ │ │ └── xtts_v2 │ │ │ │ └── train_gpt_xtts.py │ │ ├── multilingual │ │ │ ├── cml_yourtts │ │ │ │ └── train_yourtts.py │ │ │ └── vits_tts │ │ │ │ ├── train_vits_tts.py │ │ │ │ └── train_vits_tts_phonemes.py │ │ ├── thorsten_DE │ │ │ ├── README.md │ │ │ ├── align_tts │ │ │ │ └── train_aligntts.py │ │ │ ├── download_thorsten_DE.sh │ │ │ ├── glow_tts │ │ │ │ └── train_glowtts.py │ │ │ ├── hifigan │ │ │ │ └── train_hifigan.py │ │ │ ├── multiband_melgan │ │ │ │ └── train_multiband_melgan.py │ │ │ ├── speedy_speech │ │ │ │ └── train_speedy_speech.py │ │ │ ├── tacotron2-DDC │ │ │ │ └── train_tacotron_ddc.py │ │ │ ├── 
univnet │ │ │ │ └── train_univnet.py │ │ │ ├── vits_tts │ │ │ │ └── train_vits.py │ │ │ ├── wavegrad │ │ │ │ └── train_wavegrad.py │ │ │ └── wavernn │ │ │ │ └── train_wavernn.py │ │ └── vctk │ │ │ ├── delightful_tts │ │ │ └── train_delightful_tts.py │ │ │ ├── download_vctk.sh │ │ │ ├── fast_pitch │ │ │ └── train_fast_pitch.py │ │ │ ├── fast_speech │ │ │ └── train_fast_speech.py │ │ │ ├── glow_tts │ │ │ └── train_glow_tts.py │ │ │ ├── resnet_speaker_encoder │ │ │ └── train_encoder.py │ │ │ ├── speedy_speech │ │ │ └── train_speedy_speech.py │ │ │ ├── tacotron-DDC │ │ │ └── train_tacotron-DDC.py │ │ │ ├── tacotron2-DDC │ │ │ └── train_tacotron2-ddc.py │ │ │ ├── tacotron2 │ │ │ └── train_tacotron2.py │ │ │ ├── vits │ │ │ └── train_vits.py │ │ │ └── yourtts │ │ │ └── train_yourtts.py │ ├── requirements.dev.txt │ ├── requirements.ja.txt │ ├── requirements.notebooks.txt │ ├── requirements.txt │ ├── run_bash_tests.sh │ ├── scripts │ │ └── sync_readme.py │ ├── setup.cfg │ ├── setup.py │ └── tests │ │ ├── __init__.py │ │ ├── aux_tests │ │ ├── __init__.py │ │ ├── test_audio_processor.py │ │ ├── test_embedding_manager.py │ │ ├── test_extract_tts_spectrograms.py │ │ ├── test_find_unique_phonemes.py │ │ ├── test_numpy_transforms.py │ │ ├── test_readme.py │ │ ├── test_speaker_encoder.py │ │ ├── test_speaker_encoder_train.py │ │ ├── test_speaker_manager.py │ │ └── test_stft_torch.py │ │ ├── bash_tests │ │ ├── test_compute_statistics.sh │ │ └── test_demo_server.sh │ │ ├── data │ │ ├── dummy_speakers.json │ │ ├── dummy_speakers.pth │ │ ├── dummy_speakers2.json │ │ └── ljspeech │ │ │ ├── f0_cache │ │ │ └── pitch_stats.npy │ │ │ ├── metadata.csv │ │ │ ├── metadata_attn_mask.txt │ │ │ ├── metadata_flac.csv │ │ │ ├── metadata_mp3.csv │ │ │ ├── metadata_wav.csv │ │ │ ├── speakers.json │ │ │ └── wavs │ │ │ ├── LJ001-0001.flac │ │ │ ├── LJ001-0001.mp3 │ │ │ ├── LJ001-0001.npy │ │ │ ├── LJ001-0001.wav │ │ │ ├── LJ001-0002.flac │ │ │ ├── LJ001-0002.mp3 │ │ │ ├── LJ001-0002.npy │ │ │ ├── LJ001-0002.wav │ │ │ ├── LJ001-0003.flac │ │ │ ├── LJ001-0003.mp3 │ │ │ ├── LJ001-0003.npy │ │ │ ├── LJ001-0003.wav │ │ │ ├── LJ001-0004.flac │ │ │ ├── LJ001-0004.mp3 │ │ │ ├── LJ001-0004.npy │ │ │ ├── LJ001-0004.wav │ │ │ ├── LJ001-0005.flac │ │ │ ├── LJ001-0005.mp3 │ │ │ ├── LJ001-0005.npy │ │ │ ├── LJ001-0005.wav │ │ │ ├── LJ001-0006.flac │ │ │ ├── LJ001-0006.mp3 │ │ │ ├── LJ001-0006.npy │ │ │ ├── LJ001-0006.wav │ │ │ ├── LJ001-0007.flac │ │ │ ├── LJ001-0007.mp3 │ │ │ ├── LJ001-0007.npy │ │ │ ├── LJ001-0007.wav │ │ │ ├── LJ001-0008.flac │ │ │ ├── LJ001-0008.mp3 │ │ │ ├── LJ001-0008.npy │ │ │ ├── LJ001-0008.wav │ │ │ ├── LJ001-0009.flac │ │ │ ├── LJ001-0009.mp3 │ │ │ ├── LJ001-0009.npy │ │ │ ├── LJ001-0009.wav │ │ │ ├── LJ001-0010.flac │ │ │ ├── LJ001-0010.mp3 │ │ │ ├── LJ001-0010.npy │ │ │ ├── LJ001-0010.wav │ │ │ ├── LJ001-0011.flac │ │ │ ├── LJ001-0011.mp3 │ │ │ ├── LJ001-0011.npy │ │ │ ├── LJ001-0011.wav │ │ │ ├── LJ001-0012.flac │ │ │ ├── LJ001-0012.mp3 │ │ │ ├── LJ001-0012.npy │ │ │ ├── LJ001-0012.wav │ │ │ ├── LJ001-0013.flac │ │ │ ├── LJ001-0013.mp3 │ │ │ ├── LJ001-0013.npy │ │ │ ├── LJ001-0013.wav │ │ │ ├── LJ001-0014.flac │ │ │ ├── LJ001-0014.mp3 │ │ │ ├── LJ001-0014.npy │ │ │ ├── LJ001-0014.wav │ │ │ ├── LJ001-0015.flac │ │ │ ├── LJ001-0015.mp3 │ │ │ ├── LJ001-0015.npy │ │ │ ├── LJ001-0015.wav │ │ │ ├── LJ001-0016.flac │ │ │ ├── LJ001-0016.mp3 │ │ │ ├── LJ001-0016.npy │ │ │ ├── LJ001-0016.wav │ │ │ ├── LJ001-0017.flac │ │ │ ├── LJ001-0017.mp3 │ │ │ ├── LJ001-0017.npy │ │ │ ├── LJ001-0017.wav │ │ │ ├── LJ001-0018.flac │ │ 
│ ├── LJ001-0018.mp3 │ │ │ ├── LJ001-0018.npy │ │ │ ├── LJ001-0018.wav │ │ │ ├── LJ001-0019.flac │ │ │ ├── LJ001-0019.mp3 │ │ │ ├── LJ001-0019.npy │ │ │ ├── LJ001-0019.wav │ │ │ ├── LJ001-0020.flac │ │ │ ├── LJ001-0020.mp3 │ │ │ ├── LJ001-0020.npy │ │ │ ├── LJ001-0020.wav │ │ │ ├── LJ001-0021.flac │ │ │ ├── LJ001-0021.mp3 │ │ │ ├── LJ001-0021.npy │ │ │ ├── LJ001-0021.wav │ │ │ ├── LJ001-0022.flac │ │ │ ├── LJ001-0022.mp3 │ │ │ ├── LJ001-0022.npy │ │ │ ├── LJ001-0022.wav │ │ │ ├── LJ001-0023.flac │ │ │ ├── LJ001-0023.mp3 │ │ │ ├── LJ001-0023.npy │ │ │ ├── LJ001-0023.wav │ │ │ ├── LJ001-0024.flac │ │ │ ├── LJ001-0024.mp3 │ │ │ ├── LJ001-0024.npy │ │ │ ├── LJ001-0024.wav │ │ │ ├── LJ001-0025.flac │ │ │ ├── LJ001-0025.mp3 │ │ │ ├── LJ001-0025.npy │ │ │ ├── LJ001-0025.wav │ │ │ ├── LJ001-0026.flac │ │ │ ├── LJ001-0026.mp3 │ │ │ ├── LJ001-0026.npy │ │ │ ├── LJ001-0026.wav │ │ │ ├── LJ001-0027.flac │ │ │ ├── LJ001-0027.mp3 │ │ │ ├── LJ001-0027.npy │ │ │ ├── LJ001-0027.wav │ │ │ ├── LJ001-0028.flac │ │ │ ├── LJ001-0028.mp3 │ │ │ ├── LJ001-0028.npy │ │ │ ├── LJ001-0028.wav │ │ │ ├── LJ001-0029.flac │ │ │ ├── LJ001-0029.mp3 │ │ │ ├── LJ001-0029.npy │ │ │ ├── LJ001-0029.wav │ │ │ ├── LJ001-0030.flac │ │ │ ├── LJ001-0030.mp3 │ │ │ ├── LJ001-0030.npy │ │ │ ├── LJ001-0030.wav │ │ │ ├── LJ001-0031.flac │ │ │ ├── LJ001-0031.mp3 │ │ │ ├── LJ001-0031.npy │ │ │ ├── LJ001-0031.wav │ │ │ ├── LJ001-0032.flac │ │ │ ├── LJ001-0032.mp3 │ │ │ ├── LJ001-0032.npy │ │ │ └── LJ001-0032.wav │ │ ├── data_tests │ │ ├── __init__.py │ │ ├── test_dataset_formatters.py │ │ ├── test_loader.py │ │ └── test_samplers.py │ │ ├── inference_tests │ │ ├── __init__.py │ │ ├── test_synthesize.py │ │ └── test_synthesizer.py │ │ ├── inputs │ │ ├── common_voice.tsv │ │ ├── dummy_model_config.json │ │ ├── example_1.wav │ │ ├── language_ids.json │ │ ├── scale_stats.npy │ │ ├── server_config.json │ │ ├── test_align_tts.json │ │ ├── test_config.json │ │ ├── test_glow_tts.json │ │ ├── test_speaker_encoder_config.json │ │ ├── test_speedy_speech.json │ │ ├── test_tacotron2_config.json │ │ ├── test_tacotron_bd_config.json │ │ ├── test_tacotron_config.json │ │ ├── test_vocoder_audio_config.json │ │ ├── test_vocoder_multiband_melgan_config.json │ │ ├── test_vocoder_wavegrad.json │ │ ├── test_vocoder_wavernn_config.json │ │ └── xtts_vocab.json │ │ ├── text_tests │ │ ├── __init__.py │ │ ├── test_belarusian_phonemizer.py │ │ ├── test_characters.py │ │ ├── test_japanese_phonemizer.py │ │ ├── test_korean_phonemizer.py │ │ ├── test_phonemizer.py │ │ ├── test_punctuation.py │ │ ├── test_text_cleaners.py │ │ └── test_tokenizer.py │ │ ├── tts_tests │ │ ├── __init__.py │ │ ├── test_helpers.py │ │ ├── test_losses.py │ │ ├── test_neuralhmm_tts_train.py │ │ ├── test_overflow.py │ │ ├── test_overflow_train.py │ │ ├── test_speedy_speech_train.py │ │ ├── test_tacotron2_d-vectors_train.py │ │ ├── test_tacotron2_model.py │ │ ├── test_tacotron2_speaker_emb_train.py │ │ ├── test_tacotron2_train.py │ │ ├── test_tacotron_layers.py │ │ ├── test_tacotron_model.py │ │ ├── test_tacotron_train.py │ │ ├── test_vits.py │ │ ├── test_vits_d-vectors_train.py │ │ ├── test_vits_multilingual_speaker_emb_train.py │ │ ├── test_vits_multilingual_train-d_vectors.py │ │ ├── test_vits_speaker_emb_train.py │ │ └── test_vits_train.py │ │ ├── tts_tests2 │ │ ├── __init__.py │ │ ├── test_align_tts_train.py │ │ ├── test_delightful_tts_d-vectors_train.py │ │ ├── test_delightful_tts_emb_spk.py │ │ ├── test_delightful_tts_layers.py │ │ ├── test_delightful_tts_train.py │ │ ├── 
test_fast_pitch_speaker_emb_train.py │ │ ├── test_fast_pitch_train.py │ │ ├── test_fastspeech_2_speaker_emb_train.py │ │ ├── test_fastspeech_2_train.py │ │ ├── test_feed_forward_layers.py │ │ ├── test_forward_tts.py │ │ ├── test_glow_tts.py │ │ ├── test_glow_tts_d-vectors_train.py │ │ ├── test_glow_tts_speaker_emb_train.py │ │ └── test_glow_tts_train.py │ │ ├── vc_tests │ │ ├── __init__.py │ │ └── test_freevc.py │ │ ├── vocoder_tests │ │ ├── __init__.py │ │ ├── test_fullband_melgan_train.py │ │ ├── test_hifigan_train.py │ │ ├── test_melgan_train.py │ │ ├── test_multiband_melgan_train.py │ │ ├── test_parallel_wavegan_train.py │ │ ├── test_vocoder_gan_datasets.py │ │ ├── test_vocoder_losses.py │ │ ├── test_vocoder_melgan_discriminator.py │ │ ├── test_vocoder_melgan_generator.py │ │ ├── test_vocoder_parallel_wavegan_discriminator.py │ │ ├── test_vocoder_parallel_wavegan_generator.py │ │ ├── test_vocoder_pqmf.py │ │ ├── test_vocoder_rwd.py │ │ ├── test_vocoder_wavernn.py │ │ ├── test_vocoder_wavernn_datasets.py │ │ ├── test_wavegrad.py │ │ ├── test_wavegrad_layers.py │ │ ├── test_wavegrad_train.py │ │ └── test_wavernn_train.py │ │ ├── xtts_tests │ │ ├── test_xtts_gpt_train.py │ │ └── test_xtts_v2-0_gpt_train.py │ │ └── zoo_tests │ │ ├── __init__.py │ │ └── test_models.py ├── demucs │ ├── CODE_OF_CONDUCT.md │ ├── CONTRIBUTING.md │ ├── Demucs.ipynb │ ├── LICENSE │ ├── MANIFEST.in │ ├── Makefile │ ├── README.md │ ├── conf │ │ ├── config.yaml │ │ ├── dset │ │ │ ├── aetl.yaml │ │ │ ├── auto_extra_test.yaml │ │ │ ├── auto_mus.yaml │ │ │ ├── extra44.yaml │ │ │ ├── extra_mmi_goodclean.yaml │ │ │ ├── extra_test.yaml │ │ │ ├── musdb44.yaml │ │ │ ├── sdx23_bleeding.yaml │ │ │ └── sdx23_labelnoise.yaml │ │ ├── svd │ │ │ ├── base.yaml │ │ │ ├── base2.yaml │ │ │ └── default.yaml │ │ └── variant │ │ │ ├── default.yaml │ │ │ ├── example.yaml │ │ │ └── finetune.yaml │ ├── demucs.png │ ├── demucs │ │ ├── __init__.py │ │ ├── __main__.py │ │ ├── api.py │ │ ├── apply.py │ │ ├── audio.py │ │ ├── augment.py │ │ ├── demucs.py │ │ ├── distrib.py │ │ ├── ema.py │ │ ├── evaluate.py │ │ ├── grids │ │ │ ├── __init__.py │ │ │ ├── _explorers.py │ │ │ ├── mdx.py │ │ │ ├── mdx_extra.py │ │ │ ├── mdx_refine.py │ │ │ ├── mmi.py │ │ │ ├── mmi_ft.py │ │ │ ├── repro.py │ │ │ ├── repro_ft.py │ │ │ └── sdx23.py │ │ ├── hdemucs.py │ │ ├── htdemucs.py │ │ ├── pretrained.py │ │ ├── py.typed │ │ ├── remote │ │ │ ├── files.txt │ │ │ ├── hdemucs_mmi.yaml │ │ │ ├── htdemucs.yaml │ │ │ ├── htdemucs_6s.yaml │ │ │ ├── htdemucs_ft.yaml │ │ │ ├── mdx.yaml │ │ │ ├── mdx_extra.yaml │ │ │ ├── mdx_extra_q.yaml │ │ │ ├── mdx_q.yaml │ │ │ ├── repro_mdx_a.yaml │ │ │ ├── repro_mdx_a_hybrid_only.yaml │ │ │ └── repro_mdx_a_time_only.yaml │ │ ├── repitch.py │ │ ├── repo.py │ │ ├── separate.py │ │ ├── solver.py │ │ ├── spec.py │ │ ├── states.py │ │ ├── svd.py │ │ ├── train.py │ │ ├── transformer.py │ │ ├── utils.py │ │ ├── wav.py │ │ └── wdemucs.py │ ├── docs │ │ ├── api.md │ │ ├── linux.md │ │ ├── mac.md │ │ ├── mdx.md │ │ ├── release.md │ │ ├── sdx23.md │ │ ├── training.md │ │ └── windows.md │ ├── environment-cpu.yml │ ├── environment-cuda.yml │ ├── hubconf.py │ ├── mypy.ini │ ├── outputs.tar.gz │ ├── requirements.txt │ ├── requirements_minimal.txt │ ├── setup.cfg │ ├── setup.py │ ├── test.mp3 │ └── tools │ │ ├── __init__.py │ │ ├── automix.py │ │ ├── bench.py │ │ ├── convert.py │ │ ├── export.py │ │ └── test_pretrained.py ├── whisper │ ├── CHANGELOG.md │ ├── LICENSE │ ├── MANIFEST.in │ ├── README.md │ ├── approach.png │ ├── data │ │ ├── README.md │ │ 
└── meanwhile.json │ ├── language-breakdown.svg │ ├── model-card.md │ ├── notebooks │ │ ├── LibriSpeech.ipynb │ │ └── Multilingual_ASR.ipynb │ ├── pyproject.toml │ ├── requirements.txt │ ├── setup.py │ ├── tests │ │ ├── conftest.py │ │ ├── jfk.flac │ │ ├── test_audio.py │ │ ├── test_normalizer.py │ │ ├── test_timing.py │ │ ├── test_tokenizer.py │ │ └── test_transcribe.py │ └── whisper │ │ ├── __init__.py │ │ ├── __main__.py │ │ ├── assets │ │ ├── gpt2.tiktoken │ │ ├── mel_filters.npz │ │ └── multilingual.tiktoken │ │ ├── audio.py │ │ ├── decoding.py │ │ ├── model.py │ │ ├── normalizers │ │ ├── __init__.py │ │ ├── basic.py │ │ ├── english.json │ │ └── english.py │ │ ├── timing.py │ │ ├── tokenizer.py │ │ ├── transcribe.py │ │ ├── triton_ops.py │ │ ├── utils.py │ │ └── version.py └── whisperX │ ├── EXAMPLES.md │ ├── LICENSE │ ├── MANIFEST.in │ ├── README.md │ ├── figures │ └── pipeline.png │ ├── requirements.txt │ ├── setup.py │ └── whisperx │ ├── SubtitlesProcessor.py │ ├── __init__.py │ ├── __main__.py │ ├── alignment.py │ ├── asr.py │ ├── assets │ └── mel_filters.npz │ ├── audio.py │ ├── conjunctions.py │ ├── diarize.py │ ├── transcribe.py │ ├── types.py │ ├── utils.py │ └── vad.py ├── tabs ├── __init__.py ├── asr_tab.py ├── demucs_tab.py ├── download_tab.py ├── full_auto_tab.py ├── linly_talker_tab.py ├── settings_tab.py ├── translation_tab.py ├── tts_tab.py └── video_tab.py ├── tools ├── cn_tx.py ├── do_everything.py ├── step000_video_downloader.py ├── step010_demucs_vr.py ├── step020_asr.py ├── step021_asr_whisperx.py ├── step022_asr_funasr.py ├── step030_translation.py ├── step031_translation_openai.py ├── step032_translation_llm.py ├── step033_translation_translator.py ├── step034_translation_ernie.py ├── step035_translation_qwen.py ├── step036_translation_ollama.py ├── step040_tts.py ├── step041_tts_bytedance.py ├── step042_tts_xtts.py ├── step043_tts_cosyvoice.py ├── step044_tts_edge_tts.py ├── step050_synthesize_video.py └── utils.py ├── ui_components.py ├── webui.py └── 问题参考汇总.md /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "CosyVoice"] 2 | path = CosyVoice 3 | url = https://github.com/FunAudioLLM/CosyVoice.git 4 | -------------------------------------------------------------------------------- /docs/download.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/docs/download.png -------------------------------------------------------------------------------- /docs/linly_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/docs/linly_logo.png -------------------------------------------------------------------------------- /docs/linly_watermark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/docs/linly_watermark.png -------------------------------------------------------------------------------- /docs/webui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/docs/webui.png -------------------------------------------------------------------------------- /env.example: 
-------------------------------------------------------------------------------- 1 | OPENAI_API_KEY = 'sk-***' 2 | OPENAI_API_BASE = 3 | # MODEL_NAME = 'gpt-3.5-turbo' 4 | MODEL_NAME = 'qwen/Qwen1.5-4B-Chat' 5 | # HF_TOKEN for downloading models 6 | HF_TOKEN = '' 7 | 8 | # Volcano Engine (ByteDance) 9 | BYTEDANCE_APPID = 10 | BYTEDANCE_ACCESS_TOKEN = 11 | 12 | # If downloading models from huggingface fails, uncomment the line below 13 | # HF_ENDPOINT = 'https://hf-mirror.com' 14 | BILI_BASE64 = 15 | 16 | # Baidu API 17 | BAIDU_API_KEY='' 18 | BAIDU_SECRET_KEY='' -------------------------------------------------------------------------------- /examples/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/examples/.DS_Store -------------------------------------------------------------------------------- /examples/bk_music.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/examples/bk_music.mp3 -------------------------------------------------------------------------------- /font/SimHei.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/font/SimHei.ttf -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # PyTorch and its dependencies 2 | # These libraries include PyTorch and its related packages, supporting CUDA 11.8. 3 | # --extra-index-url https://download.pytorch.org/whl/cu118 4 | # torch 5 | # torchvision 6 | # torchaudio 7 | 8 | numpy==1.26.3 9 | transformers==4.39.3 10 | translators 11 | edge-tts 12 | gradio 13 | loguru 14 | yt-dlp 15 | scipy 16 | python-dotenv 17 | openai 18 | audiostretchy 19 | modelscope 20 | 21 | # ASR 22 | # git+https://github.com/m-bain/whisperx.git 23 | # git+https://github.com/facebookresearch/demucs#egg=demucs 24 | funasr 25 | 26 | # googletrans 27 | 28 | # Qwen 29 | accelerate 30 | 31 | # CosyVoice 32 | HyperPyYAML==1.2.2 33 | librosa==0.10.2 34 | WeTextProcessing==1.0.3 35 | wget==3.2 36 | # openai-whisper==20231117 37 | modelscope 38 | diffusers==0.27.2 39 | gdown==5.1.0 40 | pyarrow 41 | conformer==0.3.2 42 | lightning==2.2.4 43 | requests 44 | dotenv 45 | loguru 46 | moviepy 47 | # ctranslate2==3.24.0 -------------------------------------------------------------------------------- /requirements_module.txt: -------------------------------------------------------------------------------- 1 | submodules/demucs 2 | submodules/whisper 3 | submodules/whisperX 4 | submodules/TTS -------------------------------------------------------------------------------- /scripts/download_models.sh: -------------------------------------------------------------------------------- 1 | # Download the wav2vec2 model to the given path; skip the download if the file already exists 2 | mkdir -p models/ASR/whisper && wget -nc https://download.pytorch.org/torchaudio/models/wav2vec2_fairseq_base_ls960_asr_ls960.pth \ 3 | -O models/ASR/whisper/wav2vec2_fairseq_base_ls960_asr_ls960.pth 4 | 5 | # Run the download script 6 | python scripts/modelscope_download.py --------------------------------------------------------------------------------
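The variables in `env.example` above are read at runtime via python-dotenv (listed in requirements.txt). A minimal sketch, assuming a `.env` file copied from `env.example`, of how a script would typically consume them:

```python
import os

from dotenv import load_dotenv  # provided by the python-dotenv package

load_dotenv()  # load key/value pairs from a local .env file into the environment

openai_key = os.getenv('OPENAI_API_KEY')                      # e.g. 'sk-***'
model_name = os.getenv('MODEL_NAME', 'qwen/Qwen1.5-4B-Chat')  # default mirrors env.example
hf_token = os.getenv('HF_TOKEN', '')                          # optional Hugging Face token
```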
/scripts/huggingface_download.py: -------------------------------------------------------------------------------- 1 | # pip install huggingface_hub 2 | from huggingface_hub import snapshot_download 3 | 4 | # https://huggingface.co/coqui/XTTS-v2 5 | snapshot_download('coqui/XTTS-v2', local_dir='models/TTS/XTTS-v2', resume_download=True, local_dir_use_symlinks=False) 6 | 7 | # https://huggingface.co/FunAudioLLM/CosyVoice-300M 8 | # snapshot_download('FunAudioLLM/CosyVoice-300M', local_dir='models/TTS/CosyVoice-300M', resume_download=True, local_dir_use_symlinks=False) 9 | 10 | # https://huggingface.co/Qwen/Qwen1.5-4B-Chat 11 | snapshot_download('Qwen/Qwen1.5-4B-Chat', local_dir='models/LLM/Qwen1.5-4B-Chat', resume_download=True, local_dir_use_symlinks=False) 12 | 13 | # https://huggingface.co/Qwen/Qwen1.5-1.8B-Chat 14 | snapshot_download('Qwen/Qwen1.5-1.8B-Chat', local_dir='models/LLM/Qwen1.5-1.8B-Chat', resume_download=True, local_dir_use_symlinks=False) 15 | 16 | # https://huggingface.co/Systran/faster-whisper-large-v3 17 | snapshot_download('Systran/faster-whisper-large-v3', local_dir='models/ASR/whisper/faster-whisper-large-v3', resume_download=True, local_dir_use_symlinks=False) 18 | 19 | # Access must be requested before this can be downloaded automatically 20 | # https://huggingface.co/pyannote/speaker-diarization-3.1 21 | # snapshot_download('pyannote/speaker-diarization-3.1', local_dir='models/ASR/whisper/speaker-diarization-3.1', resume_download=True, local_dir_use_symlinks=False) 22 | -------------------------------------------------------------------------------- /scripts/modelscope_download.py: -------------------------------------------------------------------------------- 1 | # pip install modelscope 2 | from modelscope import snapshot_download 3 | 4 | # https://modelscope.cn/models/AI-ModelScope/XTTS-v2 5 | snapshot_download('AI-ModelScope/XTTS-v2', local_dir='models/TTS/XTTS-v2') 6 | 7 | # https://modelscope.cn/models/iic/CosyVoice-300M 8 | # snapshot_download('iic/CosyVoice-300M', local_dir='models/TTS/CosyVoice-300M') 9 | 10 | # https://modelscope.cn/models/qwen/qwen1.5-4b-chat 11 | snapshot_download('qwen/Qwen1.5-4B-Chat', local_dir='models/LLM/Qwen1.5-4B-Chat') 12 | 13 | # https://modelscope.cn/models/qwen/Qwen1.5-1.8B-Chat 14 | # snapshot_download('qwen/Qwen1.5-1.8B-Chat', local_dir='models/LLM/Qwen1.5-1.8B-Chat') 15 | 16 | # https://modelscope.cn/models/keepitsimple/faster-whisper-large-v3 17 | snapshot_download('keepitsimple/faster-whisper-large-v3', local_dir='models/ASR/whisper/faster-whisper-large-v3') 18 | 19 | # Access must be requested before this can be downloaded automatically 20 | # https://modelscope.cn/models/mirror013/speaker-diarization-3.1 21 | # snapshot_download('mirror013/speaker-diarization-3.1', local_dir='models/ASR/whisper/speaker-diarization-3.1') 22 | -------------------------------------------------------------------------------- /submodules/TTS/CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you want to cite 🐸💬, feel free to use this (but only if you loved it 😊)" 3 | title: "Coqui TTS" 4 | abstract: "A deep learning toolkit for Text-to-Speech, battle-tested in research and production" 5 | date-released: 2021-01-01 6 | authors: 7 | - family-names: "Eren" 8 | given-names: "Gölge" 9 | - name: "The Coqui TTS Team" 10 | version: 1.4 11 | doi: 10.5281/zenodo.6334862 12 | license: "MPL-2.0" 13 | url: "https://www.coqui.ai" 14 | repository-code: "https://github.com/coqui-ai/TTS" 15 | keywords: 16 | - machine learning 17 | - deep learning 18 | - artificial intelligence 19 | - text to speech 20 | - TTS -------------------------------------------------------------------------------- /submodules/TTS/Dockerfile:
-------------------------------------------------------------------------------- 1 | ARG BASE=nvidia/cuda:11.8.0-base-ubuntu22.04 2 | FROM ${BASE} 3 | 4 | RUN apt-get update && apt-get upgrade -y 5 | RUN apt-get install -y --no-install-recommends gcc g++ make python3 python3-dev python3-pip python3-venv python3-wheel espeak-ng libsndfile1-dev && rm -rf /var/lib/apt/lists/* 6 | RUN pip3 install llvmlite --ignore-installed 7 | 8 | # Install Dependencies: 9 | RUN pip3 install torch torchaudio --extra-index-url https://download.pytorch.org/whl/cu118 10 | RUN rm -rf /root/.cache/pip 11 | 12 | # Copy TTS repository contents: 13 | WORKDIR /root 14 | COPY . /root 15 | 16 | RUN make install 17 | 18 | ENTRYPOINT ["tts"] 19 | CMD ["--help"] 20 | -------------------------------------------------------------------------------- /submodules/TTS/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include LICENSE.txt 3 | include requirements.*.txt 4 | include *.cff 5 | include requirements.txt 6 | include TTS/VERSION 7 | recursive-include TTS *.json 8 | recursive-include TTS *.html 9 | recursive-include TTS *.png 10 | recursive-include TTS *.md 11 | recursive-include TTS *.py 12 | recursive-include TTS *.pyx 13 | recursive-include images *.png 14 | recursive-exclude tests * 15 | prune tests* 16 | -------------------------------------------------------------------------------- /submodules/TTS/TTS/VERSION: -------------------------------------------------------------------------------- 1 | 0.22.0 2 | -------------------------------------------------------------------------------- /submodules/TTS/TTS/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | with open(os.path.join(os.path.dirname(__file__), "VERSION"), "r", encoding="utf-8") as f: 4 | version = f.read().strip() 5 | 6 | __version__ = version 7 | -------------------------------------------------------------------------------- /submodules/TTS/TTS/bin/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/TTS/bin/__init__.py -------------------------------------------------------------------------------- /submodules/TTS/TTS/bin/collect_env_info.py: -------------------------------------------------------------------------------- 1 | """Get detailed info about the working environment.""" 2 | import os 3 | import platform 4 | import sys 5 | 6 | import numpy 7 | import torch 8 | 9 | sys.path += [os.path.abspath(".."), os.path.abspath(".")] 10 | import json 11 | 12 | import TTS 13 | 14 | 15 | def system_info(): 16 | return { 17 | "OS": platform.system(), 18 | "architecture": platform.architecture(), 19 | "version": platform.version(), 20 | "processor": platform.processor(), 21 | "python": platform.python_version(), 22 | } 23 | 24 | 25 | def cuda_info(): 26 | return { 27 | "GPU": [torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())], 28 | "available": torch.cuda.is_available(), 29 | "version": torch.version.cuda, 30 | } 31 | 32 | 33 | def package_info(): 34 | return { 35 | "numpy": numpy.__version__, 36 | "PyTorch_version": torch.__version__, 37 | "PyTorch_debug": torch.version.debug, 38 | "TTS": TTS.__version__, 39 | } 40 | 41 | 42 | def main(): 43 | details = {"System": system_info(), "CUDA": cuda_info(), "Packages": package_info()} 44 | 
print(json.dumps(details, indent=4, sort_keys=True)) 45 | 46 | 47 | if __name__ == "__main__": 48 | main() 49 | -------------------------------------------------------------------------------- /submodules/TTS/TTS/demos/xtts_ft_demo/requirements.txt: -------------------------------------------------------------------------------- 1 | faster_whisper==0.9.0 2 | gradio==4.7.1 -------------------------------------------------------------------------------- /submodules/TTS/TTS/encoder/README.md: -------------------------------------------------------------------------------- 1 | ### Speaker Encoder 2 | 3 | This is an implementation of https://arxiv.org/abs/1710.10467. This model can be used for voice and speaker embedding. 4 | 5 | With the code here you can generate d-vectors for both multi-speaker and single-speaker TTS datasets, then visualise and explore them along with the associated audio files in an interactive chart. 6 | 7 | Below is an example showing embedding results of various speakers. You can generate the same plot with the provided notebook as demonstrated in [this video](https://youtu.be/KW3oO7JVa7Q). 8 | 9 | ![](umap.png) 10 | 11 | Download a pretrained model from the [Released Models](https://github.com/mozilla/TTS/wiki/Released-Models) page. 12 | 13 | To run the code, you need to follow the same flow as in TTS. 14 | 15 | - Define 'config.json' for your needs. Note that audio parameters should match your TTS model. 16 | - Example training call ```python speaker_encoder/train.py --config_path speaker_encoder/config.json --data_path ~/Data/Libri-TTS/train-clean-360``` 17 | - Generate embedding vectors ```python speaker_encoder/compute_embeddings.py --use_cuda true /model/path/best_model.pth model/config/path/config.json dataset/path/ output_path```. This code parses all .wav files at the given dataset path and generates the same folder structure under the output path with the generated embedding files.
18 | - Watch training on Tensorboard as in TTS 19 | -------------------------------------------------------------------------------- /submodules/TTS/TTS/encoder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/TTS/encoder/__init__.py -------------------------------------------------------------------------------- /submodules/TTS/TTS/encoder/configs/emotion_encoder_config.py: -------------------------------------------------------------------------------- 1 | from dataclasses import asdict, dataclass 2 | 3 | from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig 4 | 5 | 6 | @dataclass 7 | class EmotionEncoderConfig(BaseEncoderConfig): 8 | """Defines parameters for Emotion Encoder model.""" 9 | 10 | model: str = "emotion_encoder" 11 | map_classid_to_classname: dict = None 12 | class_name_key: str = "emotion_name" 13 | -------------------------------------------------------------------------------- /submodules/TTS/TTS/encoder/configs/speaker_encoder_config.py: -------------------------------------------------------------------------------- 1 | from dataclasses import asdict, dataclass 2 | 3 | from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig 4 | 5 | 6 | @dataclass 7 | class SpeakerEncoderConfig(BaseEncoderConfig): 8 | """Defines parameters for Speaker Encoder model.""" 9 | 10 | model: str = "speaker_encoder" 11 | class_name_key: str = "speaker_name" 12 | -------------------------------------------------------------------------------- /submodules/TTS/TTS/encoder/requirements.txt: -------------------------------------------------------------------------------- 1 | umap-learn 2 | numpy>=1.17.0 3 | -------------------------------------------------------------------------------- /submodules/TTS/TTS/encoder/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/TTS/encoder/utils/__init__.py -------------------------------------------------------------------------------- /submodules/TTS/TTS/encoder/utils/visual.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | import umap 5 | 6 | matplotlib.use("Agg") 7 | 8 | 9 | colormap = ( 10 | np.array( 11 | [ 12 | [76, 255, 0], 13 | [0, 127, 70], 14 | [255, 0, 0], 15 | [255, 217, 38], 16 | [0, 135, 255], 17 | [165, 0, 165], 18 | [255, 167, 255], 19 | [0, 255, 255], 20 | [255, 96, 38], 21 | [142, 76, 0], 22 | [33, 0, 127], 23 | [0, 0, 0], 24 | [183, 183, 183], 25 | ], 26 | dtype=float, 27 | ) 28 | / 255 29 | ) 30 | 31 | 32 | def plot_embeddings(embeddings, num_classes_in_batch): 33 | num_utter_per_class = embeddings.shape[0] // num_classes_in_batch 34 | 35 | # if necessary get just the first 10 classes 36 | if num_classes_in_batch > 10: 37 | num_classes_in_batch = 10 38 | embeddings = embeddings[: num_classes_in_batch * num_utter_per_class] 39 | 40 | model = umap.UMAP() 41 | projection = model.fit_transform(embeddings) 42 | ground_truth = np.repeat(np.arange(num_classes_in_batch), num_utter_per_class) 43 | colors = [colormap[i] for i in ground_truth] 44 | fig, ax = plt.subplots(figsize=(16, 10)) 45 | _ = ax.scatter(projection[:, 0], projection[:, 1], c=colors) 46 | plt.gca().set_aspect("equal", 
"datalim") 47 | plt.title("UMAP projection") 48 | plt.tight_layout() 49 | plt.savefig("umap") 50 | return fig 51 | -------------------------------------------------------------------------------- /submodules/TTS/TTS/server/README.md: -------------------------------------------------------------------------------- 1 | # :frog: TTS demo server 2 | Before you use the server, make sure you [install](https://github.com/coqui-ai/TTS/tree/dev#install-tts)) :frog: TTS properly. Then, you can follow the steps below. 3 | 4 | **Note:** If you install :frog:TTS using ```pip```, you can also use the ```tts-server``` end point on the terminal. 5 | 6 | Examples runs: 7 | 8 | List officially released models. 9 | ```python TTS/server/server.py --list_models ``` 10 | 11 | Run the server with the official models. 12 | ```python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/multiband-melgan``` 13 | 14 | Run the server with the official models on a GPU. 15 | ```CUDA_VISIBLE_DEVICES="0" python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/multiband-melgan --use_cuda True``` 16 | 17 | Run the server with a custom models. 18 | ```python TTS/server/server.py --tts_checkpoint /path/to/tts/model.pth --tts_config /path/to/tts/config.json --vocoder_checkpoint /path/to/vocoder/model.pth --vocoder_config /path/to/vocoder/config.json``` 19 | -------------------------------------------------------------------------------- /submodules/TTS/TTS/server/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/TTS/server/__init__.py -------------------------------------------------------------------------------- /submodules/TTS/TTS/server/conf.json: -------------------------------------------------------------------------------- 1 | { 2 | "tts_path":"/media/erogol/data_ssd/Models/libri_tts/5049/", // tts model root folder 3 | "tts_file":"best_model.pth", // tts checkpoint file 4 | "tts_config":"config.json", // tts config.json file 5 | "tts_speakers": null, // json file listing speaker ids. null if no speaker embedding. 
6 | "vocoder_config":null, 7 | "vocoder_file": null, 8 | "is_wavernn_batched":true, 9 | "port": 5002, 10 | "use_cuda": true, 11 | "debug": true 12 | } 13 | -------------------------------------------------------------------------------- /submodules/TTS/TTS/server/static/coqui-log-green-TTS.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/TTS/server/static/coqui-log-green-TTS.png -------------------------------------------------------------------------------- /submodules/TTS/TTS/tts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/TTS/tts/__init__.py -------------------------------------------------------------------------------- /submodules/TTS/TTS/tts/configs/__init__.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import os 3 | from inspect import isclass 4 | 5 | # import all files under configs/ 6 | # configs_dir = os.path.dirname(__file__) 7 | # for file in os.listdir(configs_dir): 8 | # path = os.path.join(configs_dir, file) 9 | # if not file.startswith("_") and not file.startswith(".") and (file.endswith(".py") or os.path.isdir(path)): 10 | # config_name = file[: file.find(".py")] if file.endswith(".py") else file 11 | # module = importlib.import_module("TTS.tts.configs." + config_name) 12 | # for attribute_name in dir(module): 13 | # attribute = getattr(module, attribute_name) 14 | 15 | # if isclass(attribute): 16 | # # Add the class to this package's variables 17 | # globals()[attribute_name] = attribute 18 | -------------------------------------------------------------------------------- /submodules/TTS/TTS/tts/configs/tacotron2_config.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | from TTS.tts.configs.tacotron_config import TacotronConfig 4 | 5 | 6 | @dataclass 7 | class Tacotron2Config(TacotronConfig): 8 | """Defines parameters for Tacotron2 based models. 9 | 10 | Example: 11 | 12 | >>> from TTS.tts.configs.tacotron2_config import Tacotron2Config 13 | >>> config = Tacotron2Config() 14 | 15 | Check `TacotronConfig` for argument descriptions. 
16 | """ 17 | 18 | model: str = "tacotron2" 19 | out_channels: int = 80 20 | encoder_in_features: int = 512 21 | decoder_in_features: int = 512 22 | -------------------------------------------------------------------------------- /submodules/TTS/TTS/tts/layers/__init__.py: -------------------------------------------------------------------------------- 1 | from TTS.tts.layers.losses import * 2 | -------------------------------------------------------------------------------- /submodules/TTS/TTS/tts/layers/align_tts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/TTS/tts/layers/align_tts/__init__.py -------------------------------------------------------------------------------- /submodules/TTS/TTS/tts/layers/align_tts/duration_predictor.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | from TTS.tts.layers.generic.pos_encoding import PositionalEncoding 4 | from TTS.tts.layers.generic.transformer import FFTransformerBlock 5 | 6 | 7 | class DurationPredictor(nn.Module): 8 | def __init__(self, num_chars, hidden_channels, hidden_channels_ffn, num_heads): 9 | super().__init__() 10 | self.embed = nn.Embedding(num_chars, hidden_channels) 11 | self.pos_enc = PositionalEncoding(hidden_channels, dropout_p=0.1) 12 | self.FFT = FFTransformerBlock(hidden_channels, num_heads, hidden_channels_ffn, 2, 0.1) 13 | self.out_layer = nn.Conv1d(hidden_channels, 1, 1) 14 | 15 | def forward(self, text, text_lengths): 16 | # B, L -> B, L 17 | emb = self.embed(text) 18 | emb = self.pos_enc(emb.transpose(1, 2)) 19 | x = self.FFT(emb, text_lengths) 20 | x = self.out_layer(x).squeeze(-1) 21 | return x 22 | -------------------------------------------------------------------------------- /submodules/TTS/TTS/tts/layers/align_tts/mdn.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | 4 | class MDNBlock(nn.Module): 5 | """Mixture of Density Network implementation 6 | https://arxiv.org/pdf/2003.01950.pdf 7 | """ 8 | 9 | def __init__(self, in_channels, out_channels): 10 | super().__init__() 11 | self.out_channels = out_channels 12 | self.conv1 = nn.Conv1d(in_channels, in_channels, 1) 13 | self.norm = nn.LayerNorm(in_channels) 14 | self.relu = nn.ReLU() 15 | self.dropout = nn.Dropout(0.1) 16 | self.conv2 = nn.Conv1d(in_channels, out_channels, 1) 17 | 18 | def forward(self, x): 19 | o = self.conv1(x) 20 | o = o.transpose(1, 2) 21 | o = self.norm(o) 22 | o = o.transpose(1, 2) 23 | o = self.relu(o) 24 | o = self.dropout(o) 25 | mu_sigma = self.conv2(o) 26 | # TODO: check this sigmoid 27 | # mu = torch.sigmoid(mu_sigma[:, :self.out_channels//2, :]) 28 | mu = mu_sigma[:, : self.out_channels // 2, :] 29 | log_sigma = mu_sigma[:, self.out_channels // 2 :, :] 30 | return mu, log_sigma 31 | -------------------------------------------------------------------------------- /submodules/TTS/TTS/tts/layers/bark/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/TTS/tts/layers/bark/__init__.py -------------------------------------------------------------------------------- /submodules/TTS/TTS/tts/layers/bark/hubert/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/TTS/tts/layers/bark/hubert/__init__.py -------------------------------------------------------------------------------- /submodules/TTS/TTS/tts/layers/bark/hubert/hubert_manager.py: -------------------------------------------------------------------------------- 1 | # From https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer 2 | 3 | import os.path 4 | import shutil 5 | import urllib.request 6 | 7 | import huggingface_hub 8 | 9 | 10 | class HubertManager: 11 | @staticmethod 12 | def make_sure_hubert_installed( 13 | download_url: str = "https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt", model_path: str = "" 14 | ): 15 | if not os.path.isfile(model_path): 16 | print("Downloading HuBERT base model") 17 | urllib.request.urlretrieve(download_url, model_path) 18 | print("Downloaded HuBERT") 19 | return model_path 20 | return None 21 | 22 | @staticmethod 23 | def make_sure_tokenizer_installed( 24 | model: str = "quantifier_hubert_base_ls960_14.pth", 25 | repo: str = "GitMylo/bark-voice-cloning", 26 | model_path: str = "", 27 | ): 28 | model_dir = os.path.dirname(model_path) 29 | if not os.path.isfile(model_path): 30 | print("Downloading HuBERT custom tokenizer") 31 | huggingface_hub.hf_hub_download(repo, model, local_dir=model_dir, local_dir_use_symlinks=False) 32 | shutil.move(os.path.join(model_dir, model), model_path) 33 | print("Downloaded tokenizer") 34 | return model_path 35 | return None 36 | -------------------------------------------------------------------------------- /submodules/TTS/TTS/tts/layers/delightful_tts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/TTS/tts/layers/delightful_tts/__init__.py -------------------------------------------------------------------------------- /submodules/TTS/TTS/tts/layers/feed_forward/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/TTS/tts/layers/feed_forward/__init__.py -------------------------------------------------------------------------------- /submodules/TTS/TTS/tts/layers/feed_forward/duration_predictor.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | from TTS.tts.layers.generic.res_conv_bn import Conv1dBN 4 | 5 | 6 | class DurationPredictor(nn.Module): 7 | """Speedy Speech duration predictor model. 8 | Predicts phoneme durations from encoder outputs. 9 | 10 | Note: 11 | Outputs interpreted as log(durations) 12 | To get actual durations, do exp transformation 13 | 14 | conv_BN_4x1 -> conv_BN_3x1 -> conv_BN_1x1 -> conv_1x1 15 | 16 | Args: 17 | hidden_channels (int): number of channels in the inner layers. 
18 | """ 19 | 20 | def __init__(self, hidden_channels): 21 | super().__init__() 22 | 23 | self.layers = nn.ModuleList( 24 | [ 25 | Conv1dBN(hidden_channels, hidden_channels, 4, 1), 26 | Conv1dBN(hidden_channels, hidden_channels, 3, 1), 27 | Conv1dBN(hidden_channels, hidden_channels, 1, 1), 28 | nn.Conv1d(hidden_channels, 1, 1), 29 | ] 30 | ) 31 | 32 | def forward(self, x, x_mask): 33 | """ 34 | Shapes: 35 | x: [B, C, T] 36 | x_mask: [B, 1, T] 37 | """ 38 | o = x 39 | for layer in self.layers: 40 | o = layer(o) * x_mask 41 | return o 42 | -------------------------------------------------------------------------------- /submodules/TTS/TTS/tts/layers/generic/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/TTS/tts/layers/generic/__init__.py -------------------------------------------------------------------------------- /submodules/TTS/TTS/tts/layers/generic/gated_conv.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | from .normalization import LayerNorm 4 | 5 | 6 | class GatedConvBlock(nn.Module): 7 | """Gated convolutional block as in https://arxiv.org/pdf/1612.08083.pdf 8 | Args: 9 | in_out_channels (int): number of input/output channels. 10 | kernel_size (int): convolution kernel size. 11 | dropout_p (float): dropout rate. 12 | """ 13 | 14 | def __init__(self, in_out_channels, kernel_size, dropout_p, num_layers): 15 | super().__init__() 16 | # class arguments 17 | self.dropout_p = dropout_p 18 | self.num_layers = num_layers 19 | # define layers 20 | self.conv_layers = nn.ModuleList() 21 | self.norm_layers = nn.ModuleList() 22 | self.layers = nn.ModuleList() 23 | for _ in range(num_layers): 24 | self.conv_layers += [nn.Conv1d(in_out_channels, 2 * in_out_channels, kernel_size, padding=kernel_size // 2)] 25 | self.norm_layers += [LayerNorm(2 * in_out_channels)] 26 | 27 | def forward(self, x, x_mask): 28 | o = x 29 | res = x 30 | for idx in range(self.num_layers): 31 | o = nn.functional.dropout(o, p=self.dropout_p, training=self.training) 32 | o = self.conv_layers[idx](o * x_mask) 33 | o = self.norm_layers[idx](o) 34 | o = nn.functional.glu(o, dim=1) 35 | o = res + o 36 | res = o 37 | return o 38 | -------------------------------------------------------------------------------- /submodules/TTS/TTS/tts/layers/glow_tts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/TTS/tts/layers/glow_tts/__init__.py -------------------------------------------------------------------------------- /submodules/TTS/TTS/tts/layers/overflow/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/TTS/tts/layers/overflow/__init__.py -------------------------------------------------------------------------------- /submodules/TTS/TTS/tts/layers/tacotron/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/TTS/tts/layers/tacotron/__init__.py -------------------------------------------------------------------------------- 
/submodules/TTS/TTS/tts/layers/tortoise/tokenizer.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | from tokenizers import Tokenizer 5 | 6 | from TTS.tts.utils.text.cleaners import english_cleaners 7 | 8 | DEFAULT_VOCAB_FILE = os.path.join( 9 | os.path.dirname(os.path.realpath(__file__)), "../../utils/assets/tortoise/tokenizer.json" 10 | ) 11 | 12 | 13 | class VoiceBpeTokenizer: 14 | def __init__(self, vocab_file=DEFAULT_VOCAB_FILE, vocab_str=None): 15 | self.tokenizer = None 16 | if vocab_file is not None: 17 | self.tokenizer = Tokenizer.from_file(vocab_file) 18 | if vocab_str is not None: 19 | self.tokenizer = Tokenizer.from_str(vocab_str) 20 | 21 | def preprocess_text(self, txt): 22 | txt = english_cleaners(txt) 23 | return txt 24 | 25 | def encode(self, txt): 26 | txt = self.preprocess_text(txt) 27 | txt = txt.replace(" ", "[SPACE]") 28 | return self.tokenizer.encode(txt).ids 29 | 30 | def decode(self, seq): 31 | if isinstance(seq, torch.Tensor): 32 | seq = seq.cpu().numpy() 33 | txt = self.tokenizer.decode(seq, skip_special_tokens=False).replace(" ", "") 34 | txt = txt.replace("[SPACE]", " ") 35 | txt = txt.replace("[STOP]", "") 36 | txt = txt.replace("[UNK]", "") 37 | return txt 38 | -------------------------------------------------------------------------------- /submodules/TTS/TTS/tts/layers/xtts/xtts_manager.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | class SpeakerManager(): 4 | def __init__(self, speaker_file_path=None): 5 | self.speakers = torch.load(speaker_file_path) 6 | 7 | @property 8 | def name_to_id(self): 9 | return self.speakers.keys() 10 | 11 | @property 12 | def num_speakers(self): 13 | return len(self.name_to_id) 14 | 15 | @property 16 | def speaker_names(self): 17 | return list(self.name_to_id.keys()) 18 | 19 | 20 | class LanguageManager(): 21 | def __init__(self, config): 22 | self.langs = config["languages"] 23 | 24 | @property 25 | def name_to_id(self): 26 | return self.langs 27 | 28 | @property 29 | def num_languages(self): 30 | return len(self.name_to_id) 31 | 32 | @property 33 | def language_names(self): 34 | return list(self.name_to_id) 35 | -------------------------------------------------------------------------------- /submodules/TTS/TTS/tts/models/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, Union 2 | 3 | from TTS.utils.generic_utils import find_module 4 | 5 | 6 | def setup_model(config: "Coqpit", samples: Union[List[List], List[Dict]] = None) -> "BaseTTS": 7 | print(" > Using model: {}".format(config.model)) 8 | # fetch the right model implementation. 
9 | if "base_model" in config and config["base_model"] is not None: 10 | MyModel = find_module("TTS.tts.models", config.base_model.lower()) 11 | else: 12 | MyModel = find_module("TTS.tts.models", config.model.lower()) 13 | model = MyModel.init_from_config(config=config, samples=samples) 14 | return model 15 | -------------------------------------------------------------------------------- /submodules/TTS/TTS/tts/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/TTS/tts/utils/__init__.py -------------------------------------------------------------------------------- /submodules/TTS/TTS/tts/utils/measures.py: -------------------------------------------------------------------------------- 1 | def alignment_diagonal_score(alignments, binary=False): 2 | """ 3 | Compute how diagonal alignment predictions are. It is useful 4 | to measure the alignment consistency of a model 5 | Args: 6 | alignments (torch.Tensor): batch of alignments. 7 | binary (bool): if True, ignore scores and consider attention 8 | as a binary mask. 9 | Shape: 10 | - alignments : :math:`[B, T_de, T_en]` 11 | """ 12 | maxs = alignments.max(dim=1)[0] 13 | if binary: 14 | maxs[maxs > 0] = 1 15 | return maxs.mean(dim=1).mean(dim=0).item() 16 | -------------------------------------------------------------------------------- /submodules/TTS/TTS/tts/utils/monotonic_align/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/TTS/tts/utils/monotonic_align/__init__.py -------------------------------------------------------------------------------- /submodules/TTS/TTS/tts/utils/monotonic_align/core.pyx: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | cimport cython 4 | cimport numpy as np 5 | 6 | from cython.parallel import prange 7 | 8 | 9 | @cython.boundscheck(False) 10 | @cython.wraparound(False) 11 | cdef void maximum_path_each(int[:,::1] path, float[:,::1] value, int t_x, int t_y, float max_neg_val) nogil: 12 | cdef int x 13 | cdef int y 14 | cdef float v_prev 15 | cdef float v_cur 16 | cdef float tmp 17 | cdef int index = t_x - 1 18 | 19 | for y in range(t_y): 20 | for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)): 21 | if x == y: 22 | v_cur = max_neg_val 23 | else: 24 | v_cur = value[x, y-1] 25 | if x == 0: 26 | if y == 0: 27 | v_prev = 0. 
28 | else: 29 | v_prev = max_neg_val 30 | else: 31 | v_prev = value[x-1, y-1] 32 | value[x, y] = max(v_cur, v_prev) + value[x, y] 33 | 34 | for y in range(t_y - 1, -1, -1): 35 | path[index, y] = 1 36 | if index != 0 and (index == y or value[index, y-1] < value[index-1, y-1]): 37 | index = index - 1 38 | 39 | 40 | @cython.boundscheck(False) 41 | @cython.wraparound(False) 42 | cpdef void maximum_path_c(int[:,:,::1] paths, float[:,:,::1] values, int[::1] t_xs, int[::1] t_ys, float max_neg_val=-1e9) nogil: 43 | cdef int b = values.shape[0] 44 | 45 | cdef int i 46 | for i in prange(b, nogil=True): 47 | maximum_path_each(paths[i], values[i], t_xs[i], t_ys[i], max_neg_val) 48 | -------------------------------------------------------------------------------- /submodules/TTS/TTS/tts/utils/monotonic_align/setup.py: -------------------------------------------------------------------------------- 1 | # from distutils.core import setup 2 | # from Cython.Build import cythonize 3 | # import numpy 4 | 5 | # setup(name='monotonic_align', 6 | # ext_modules=cythonize("core.pyx"), 7 | # include_dirs=[numpy.get_include()]) 8 | -------------------------------------------------------------------------------- /submodules/TTS/TTS/tts/utils/text/__init__.py: -------------------------------------------------------------------------------- 1 | from TTS.tts.utils.text.tokenizer import TTSTokenizer 2 | -------------------------------------------------------------------------------- /submodules/TTS/TTS/tts/utils/text/bangla/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/TTS/tts/utils/text/bangla/__init__.py -------------------------------------------------------------------------------- /submodules/TTS/TTS/tts/utils/text/belarusian/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/TTS/tts/utils/text/belarusian/__init__.py -------------------------------------------------------------------------------- /submodules/TTS/TTS/tts/utils/text/belarusian/phonemizer.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | finder = None 4 | 5 | 6 | def init(): 7 | try: 8 | import jpype 9 | import jpype.imports 10 | except ModuleNotFoundError: 11 | raise ModuleNotFoundError( 12 | "Belarusian phonemizer requires to install module 'jpype1' manually. Try `pip install jpype1`." 
13 |     )
14 | 
15 |     try:
16 |         jar_path = os.environ["BEL_FANETYKA_JAR"]
17 |     except KeyError:
18 |         raise KeyError("You need to define the 'BEL_FANETYKA_JAR' environment variable as the path to the fanetyka.jar file")
19 | 
20 |     jpype.startJVM(classpath=[jar_path])
21 | 
22 |     # import the Java modules
23 |     from org.alex73.korpus.base import GrammarDB2, GrammarFinder
24 | 
25 |     grammar_db = GrammarDB2.initializeFromJar()
26 |     global finder
27 |     finder = GrammarFinder(grammar_db)
28 | 
29 | 
30 | def belarusian_text_to_phonemes(text: str) -> str:
31 |     # Initialize only on first run
32 |     if finder is None:
33 |         init()
34 | 
35 |     from org.alex73.fanetyka.impl import FanetykaText
36 | 
37 |     return str(FanetykaText(finder, text).ipa)
38 | 
--------------------------------------------------------------------------------
/submodules/TTS/TTS/tts/utils/text/chinese_mandarin/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/TTS/tts/utils/text/chinese_mandarin/__init__.py
--------------------------------------------------------------------------------
/submodules/TTS/TTS/tts/utils/text/chinese_mandarin/phonemizer.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 | 
3 | import jieba
4 | import pypinyin
5 | 
6 | from .pinyinToPhonemes import PINYIN_DICT
7 | 
8 | 
9 | def _chinese_character_to_pinyin(text: str) -> List[str]:
10 |     pinyins = pypinyin.pinyin(text, style=pypinyin.Style.TONE3, heteronym=False, neutral_tone_with_five=True)
11 |     pinyins_flat_list = [item for sublist in pinyins for item in sublist]
12 |     return pinyins_flat_list
13 | 
14 | 
15 | def _chinese_pinyin_to_phoneme(pinyin: str) -> str:
16 |     segment = pinyin[:-1]
17 |     tone = pinyin[-1]
18 |     phoneme = PINYIN_DICT.get(segment, [""])[0]
19 |     return phoneme + tone
20 | 
21 | 
22 | def chinese_text_to_phonemes(text: str, seperator: str = "|") -> str:
23 |     tokenized_text = jieba.cut(text, HMM=False)
24 |     tokenized_text = " ".join(tokenized_text)
25 |     pinyined_text: List[str] = _chinese_character_to_pinyin(tokenized_text)
26 | 
27 |     results: List[str] = []
28 | 
29 |     for token in pinyined_text:
30 |         if token[-1] in "12345":  # TODO transform to is_pinyin()
31 |             pinyin_phonemes = _chinese_pinyin_to_phoneme(token)
32 | 
33 |             results += list(pinyin_phonemes)
34 |         else:  # is punctuation or other
35 |             results += list(token)
36 | 
37 |     return seperator.join(results)
38 | 
--------------------------------------------------------------------------------
/submodules/TTS/TTS/tts/utils/text/english/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/TTS/tts/utils/text/english/__init__.py
--------------------------------------------------------------------------------
/submodules/TTS/TTS/tts/utils/text/english/abbreviations.py:
--------------------------------------------------------------------------------
1 | import re
2 | 
3 | # List of (regular expression, replacement) pairs for abbreviations in English:
4 | abbreviations_en = [
5 |     (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
6 |     for x in [
7 |         ("mrs", "misess"),
8 |         ("mr", "mister"),
9 |         ("dr", "doctor"),
10 |         ("st", "saint"),
11 |         ("co", "company"),
12 |         ("jr", "junior"),
13 |         ("maj", "major"),
14 |         ("gen", "general"),
15 |         ("drs", "doctors"),
16 |         ("rev", "reverend"),
17 |         ("lt", "lieutenant"),
18 |         ("hon", "honorable"),
19 |         ("sgt", "sergeant"),
20 |         ("capt", "captain"),
21 |         ("esq", "esquire"),
22 |         ("ltd", "limited"),
23 |         ("col", "colonel"),
24 |         ("ft", "fort"),
25 |     ]
26 | ]
27 | 
--------------------------------------------------------------------------------
/submodules/TTS/TTS/tts/utils/text/english/time_norm.py:
--------------------------------------------------------------------------------
1 | import re
2 | 
3 | import inflect
4 | 
5 | _inflect = inflect.engine()
6 | 
7 | _time_re = re.compile(
8 |     r"""\b
9 |     ((0?[0-9])|(1[0-1])|(1[2-9])|(2[0-3]))  # hours
10 |     :
11 |     ([0-5][0-9])  # minutes
12 |     \s*(a\.m\.|am|pm|p\.m\.|a\.m|p\.m)?  # am/pm (dots escaped once; "\\." in this raw string would require a literal backslash, so "a.m."/"p.m." would never match)
13 |     \b""",
14 |     re.IGNORECASE | re.X,
15 | )
16 | 
17 | 
18 | def _expand_num(n: int) -> str:
19 |     return _inflect.number_to_words(n)
20 | 
21 | 
22 | def _expand_time_english(match: "re.Match") -> str:
23 |     hour = int(match.group(1))
24 |     past_noon = hour >= 12
25 |     time = []
26 |     if hour > 12:
27 |         hour -= 12
28 |     elif hour == 0:
29 |         hour = 12
30 |         past_noon = True
31 |     time.append(_expand_num(hour))
32 | 
33 |     minute = int(match.group(6))
34 |     if minute > 0:
35 |         if minute < 10:
36 |             time.append("oh")
37 |         time.append(_expand_num(minute))
38 |     am_pm = match.group(7)
39 |     if am_pm is None:
40 |         time.append("p m" if past_noon else "a m")
41 |     else:
42 |         time.extend(list(am_pm.replace(".", "")))
43 |     return " ".join(time)
44 | 
45 | 
46 | def expand_time_english(text: str) -> str:
47 |     return re.sub(_time_re, _expand_time_english, text)
48 | 
--------------------------------------------------------------------------------
/submodules/TTS/TTS/tts/utils/text/french/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/TTS/tts/utils/text/french/__init__.py
--------------------------------------------------------------------------------
/submodules/TTS/TTS/tts/utils/text/japanese/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/TTS/tts/utils/text/japanese/__init__.py
--------------------------------------------------------------------------------
/submodules/TTS/TTS/tts/utils/text/korean/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/TTS/tts/utils/text/korean/__init__.py
--------------------------------------------------------------------------------
/submodules/TTS/TTS/tts/utils/text/korean/ko_dictionary.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # Add the word you want to the dictionary.
3 | etc_dictionary = {"1+1": "원플러스원", "2+1": "투플러스원"} 4 | 5 | 6 | english_dictionary = { 7 | "KOREA": "코리아", 8 | "IDOL": "아이돌", 9 | "IT": "아이티", 10 | "IQ": "아이큐", 11 | "UP": "업", 12 | "DOWN": "다운", 13 | "PC": "피씨", 14 | "CCTV": "씨씨티비", 15 | "SNS": "에스엔에스", 16 | "AI": "에이아이", 17 | "CEO": "씨이오", 18 | "A": "에이", 19 | "B": "비", 20 | "C": "씨", 21 | "D": "디", 22 | "E": "이", 23 | "F": "에프", 24 | "G": "지", 25 | "H": "에이치", 26 | "I": "아이", 27 | "J": "제이", 28 | "K": "케이", 29 | "L": "엘", 30 | "M": "엠", 31 | "N": "엔", 32 | "O": "오", 33 | "P": "피", 34 | "Q": "큐", 35 | "R": "알", 36 | "S": "에스", 37 | "T": "티", 38 | "U": "유", 39 | "V": "브이", 40 | "W": "더블유", 41 | "X": "엑스", 42 | "Y": "와이", 43 | "Z": "제트", 44 | } 45 | -------------------------------------------------------------------------------- /submodules/TTS/TTS/tts/utils/text/korean/korean.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # Code based on https://github.com/carpedm20/multi-speaker-tacotron-tensorflow/blob/master/text/korean.py 3 | import re 4 | 5 | from TTS.tts.utils.text.korean.ko_dictionary import english_dictionary, etc_dictionary 6 | 7 | 8 | def normalize(text): 9 | text = text.strip() 10 | text = re.sub("[⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]", "", text) 11 | text = normalize_with_dictionary(text, etc_dictionary) 12 | text = normalize_english(text) 13 | text = text.lower() 14 | return text 15 | 16 | 17 | def normalize_with_dictionary(text, dic): 18 | if any(key in text for key in dic.keys()): 19 | pattern = re.compile("|".join(re.escape(key) for key in dic.keys())) 20 | return pattern.sub(lambda x: dic[x.group()], text) 21 | return text 22 | 23 | 24 | def normalize_english(text): 25 | def fn(m): 26 | word = m.group() 27 | if word in english_dictionary: 28 | return english_dictionary.get(word) 29 | return word 30 | 31 | text = re.sub("([A-Za-z]+)", fn, text) 32 | return text 33 | -------------------------------------------------------------------------------- /submodules/TTS/TTS/tts/utils/text/korean/phonemizer.py: -------------------------------------------------------------------------------- 1 | from jamo import hangul_to_jamo 2 | 3 | from TTS.tts.utils.text.korean.korean import normalize 4 | 5 | g2p = None 6 | 7 | 8 | def korean_text_to_phonemes(text, character: str = "hangeul") -> str: 9 | """ 10 | 11 | The input and output values look the same, but they are different in Unicode. 
12 | 
13 |     example :
14 | 
15 |         input = '하늘' (Unicode : \ud558\ub298), (하 + 늘)
16 |         output = '하늘' (Unicode : \u1112\u1161\u1102\u1173\u11af), (ᄒ + ᅡ + ᄂ + ᅳ + ᆯ)
17 | 
18 |     """
19 |     global g2p  # pylint: disable=global-statement
20 |     if g2p is None:
21 |         from g2pkk import G2p
22 | 
23 |         g2p = G2p()
24 | 
25 |     if character == "english":
26 |         from anyascii import anyascii
27 | 
28 |         text = normalize(text)
29 |         text = g2p(text)
30 |         text = anyascii(text)
31 |         return text
32 | 
33 |     text = normalize(text)
34 |     text = g2p(text)
35 |     text = list(hangul_to_jamo(text))  # '하늘' --> ['ᄒ', 'ᅡ', 'ᄂ', 'ᅳ', 'ᆯ']
36 |     return "".join(text)
37 | 
--------------------------------------------------------------------------------
/submodules/TTS/TTS/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/TTS/utils/__init__.py
--------------------------------------------------------------------------------
/submodules/TTS/TTS/utils/audio/__init__.py:
--------------------------------------------------------------------------------
1 | from TTS.utils.audio.processor import AudioProcessor
2 | 
--------------------------------------------------------------------------------
/submodules/TTS/TTS/utils/distribute.py:
--------------------------------------------------------------------------------
1 | # edited from https://github.com/fastai/imagenet-fast/blob/master/imagenet_nv/distributed.py
2 | import torch
3 | import torch.distributed as dist
4 | 
5 | 
6 | def reduce_tensor(tensor, num_gpus):
7 |     rt = tensor.clone()
8 |     dist.all_reduce(rt, op=dist.ReduceOp.SUM)  # ReduceOp: the lowercase reduce_op alias is deprecated
9 |     rt /= num_gpus
10 |     return rt
11 | 
12 | 
13 | def init_distributed(rank, num_gpus, group_name, dist_backend, dist_url):
14 |     assert torch.cuda.is_available(), "Distributed mode requires CUDA."
15 | 
16 |     # Set cuda device so everything is done on the right GPU.
17 |     torch.cuda.set_device(rank % torch.cuda.device_count())
18 | 
19 |     # Initialize distributed communication
20 |     dist.init_process_group(dist_backend, init_method=dist_url, world_size=num_gpus, rank=rank, group_name=group_name)
21 | 
--------------------------------------------------------------------------------
/submodules/TTS/TTS/vc/configs/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/TTS/vc/configs/__init__.py
--------------------------------------------------------------------------------
/submodules/TTS/TTS/vc/models/__init__.py:
--------------------------------------------------------------------------------
1 | import importlib
2 | import re
3 | from typing import Dict, List, Union
4 | 
5 | 
6 | def to_camel(text):
7 |     text = text.capitalize()
8 |     return re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text)
9 | 
10 | 
11 | def setup_model(config: "Coqpit", samples: Union[List[List], List[Dict]] = None) -> "BaseVC":
12 |     print(" > Using model: {}".format(config.model))
13 |     # fetch the right model implementation.
14 | if "model" in config and config["model"].lower() == "freevc": 15 | MyModel = importlib.import_module("TTS.vc.models.freevc").FreeVC 16 | model = MyModel.init_from_config(config, samples) 17 | return model 18 | -------------------------------------------------------------------------------- /submodules/TTS/TTS/vc/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/TTS/vc/modules/__init__.py -------------------------------------------------------------------------------- /submodules/TTS/TTS/vc/modules/freevc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/TTS/vc/modules/freevc/__init__.py -------------------------------------------------------------------------------- /submodules/TTS/TTS/vc/modules/freevc/speaker_encoder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/TTS/vc/modules/freevc/speaker_encoder/__init__.py -------------------------------------------------------------------------------- /submodules/TTS/TTS/vc/modules/freevc/speaker_encoder/hparams.py: -------------------------------------------------------------------------------- 1 | ## Mel-filterbank 2 | mel_window_length = 25 # In milliseconds 3 | mel_window_step = 10 # In milliseconds 4 | mel_n_channels = 40 5 | 6 | 7 | ## Audio 8 | sampling_rate = 16000 9 | # Number of spectrogram frames in a partial utterance 10 | partials_n_frames = 160 # 1600 ms 11 | 12 | 13 | ## Voice Activation Detection 14 | # Window size of the VAD. Must be either 10, 20 or 30 milliseconds. 15 | # This sets the granularity of the VAD. Should not need to be changed. 16 | vad_window_length = 30 # In milliseconds 17 | # Number of frames to average together when performing the moving average smoothing. 18 | # The larger this value, the larger the VAD variations must be to not get smoothed out. 19 | vad_moving_average_width = 8 20 | # Maximum number of consecutive silent frames a segment can have. 
21 | vad_max_silence_length = 6 22 | 23 | 24 | ## Audio volume normalization 25 | audio_norm_target_dBFS = -30 26 | 27 | 28 | ## Model parameters 29 | model_hidden_size = 256 30 | model_embedding_size = 256 31 | model_num_layers = 3 32 | -------------------------------------------------------------------------------- /submodules/TTS/TTS/vc/modules/freevc/wavlm/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import urllib.request 3 | 4 | import torch 5 | 6 | from TTS.utils.generic_utils import get_user_data_dir 7 | from TTS.vc.modules.freevc.wavlm.wavlm import WavLM, WavLMConfig 8 | 9 | model_uri = "https://github.com/coqui-ai/TTS/releases/download/v0.13.0_models/WavLM-Large.pt" 10 | 11 | 12 | def get_wavlm(device="cpu"): 13 | """Download the model and return the model object.""" 14 | 15 | output_path = get_user_data_dir("tts") 16 | 17 | output_path = os.path.join(output_path, "wavlm") 18 | if not os.path.exists(output_path): 19 | os.makedirs(output_path) 20 | 21 | output_path = os.path.join(output_path, "WavLM-Large.pt") 22 | if not os.path.exists(output_path): 23 | print(f" > Downloading WavLM model to {output_path} ...") 24 | urllib.request.urlretrieve(model_uri, output_path) 25 | 26 | checkpoint = torch.load(output_path, map_location=torch.device(device)) 27 | cfg = WavLMConfig(checkpoint["cfg"]) 28 | wavlm = WavLM(cfg).to(device) 29 | wavlm.load_state_dict(checkpoint["model"]) 30 | wavlm.eval() 31 | return wavlm 32 | 33 | 34 | if __name__ == "__main__": 35 | wavlm = get_wavlm() 36 | -------------------------------------------------------------------------------- /submodules/TTS/TTS/vocoder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/TTS/vocoder/__init__.py -------------------------------------------------------------------------------- /submodules/TTS/TTS/vocoder/configs/__init__.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import os 3 | from inspect import isclass 4 | 5 | # import all files under configs/ 6 | configs_dir = os.path.dirname(__file__) 7 | for file in os.listdir(configs_dir): 8 | path = os.path.join(configs_dir, file) 9 | if not file.startswith("_") and not file.startswith(".") and (file.endswith(".py") or os.path.isdir(path)): 10 | config_name = file[: file.find(".py")] if file.endswith(".py") else file 11 | module = importlib.import_module("TTS.vocoder.configs." 
+ config_name) 12 | for attribute_name in dir(module): 13 | attribute = getattr(module, attribute_name) 14 | 15 | if isclass(attribute): 16 | # Add the class to this package's variables 17 | globals()[attribute_name] = attribute 18 | -------------------------------------------------------------------------------- /submodules/TTS/TTS/vocoder/layers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/TTS/vocoder/layers/__init__.py -------------------------------------------------------------------------------- /submodules/TTS/TTS/vocoder/models/fullband_melgan_generator.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from TTS.vocoder.models.melgan_generator import MelganGenerator 4 | 5 | 6 | class FullbandMelganGenerator(MelganGenerator): 7 | def __init__( 8 | self, 9 | in_channels=80, 10 | out_channels=1, 11 | proj_kernel=7, 12 | base_channels=512, 13 | upsample_factors=(2, 8, 2, 2), 14 | res_kernel=3, 15 | num_res_blocks=4, 16 | ): 17 | super().__init__( 18 | in_channels=in_channels, 19 | out_channels=out_channels, 20 | proj_kernel=proj_kernel, 21 | base_channels=base_channels, 22 | upsample_factors=upsample_factors, 23 | res_kernel=res_kernel, 24 | num_res_blocks=num_res_blocks, 25 | ) 26 | 27 | @torch.no_grad() 28 | def inference(self, cond_features): 29 | cond_features = cond_features.to(self.layers[1].weight.device) 30 | cond_features = torch.nn.functional.pad( 31 | cond_features, (self.inference_padding, self.inference_padding), "replicate" 32 | ) 33 | return self.layers(cond_features) 34 | -------------------------------------------------------------------------------- /submodules/TTS/TTS/vocoder/models/multiband_melgan_generator.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from TTS.vocoder.layers.pqmf import PQMF 4 | from TTS.vocoder.models.melgan_generator import MelganGenerator 5 | 6 | 7 | class MultibandMelganGenerator(MelganGenerator): 8 | def __init__( 9 | self, 10 | in_channels=80, 11 | out_channels=4, 12 | proj_kernel=7, 13 | base_channels=384, 14 | upsample_factors=(2, 8, 2, 2), 15 | res_kernel=3, 16 | num_res_blocks=3, 17 | ): 18 | super().__init__( 19 | in_channels=in_channels, 20 | out_channels=out_channels, 21 | proj_kernel=proj_kernel, 22 | base_channels=base_channels, 23 | upsample_factors=upsample_factors, 24 | res_kernel=res_kernel, 25 | num_res_blocks=num_res_blocks, 26 | ) 27 | self.pqmf_layer = PQMF(N=4, taps=62, cutoff=0.15, beta=9.0) 28 | 29 | def pqmf_analysis(self, x): 30 | return self.pqmf_layer.analysis(x) 31 | 32 | def pqmf_synthesis(self, x): 33 | return self.pqmf_layer.synthesis(x) 34 | 35 | @torch.no_grad() 36 | def inference(self, cond_features): 37 | cond_features = cond_features.to(self.layers[1].weight.device) 38 | cond_features = torch.nn.functional.pad( 39 | cond_features, (self.inference_padding, self.inference_padding), "replicate" 40 | ) 41 | return self.pqmf_synthesis(self.layers(cond_features)) 42 | -------------------------------------------------------------------------------- /submodules/TTS/TTS/vocoder/pqmf_output.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/TTS/vocoder/pqmf_output.wav 
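A minimal inference sketch for the `MultibandMelganGenerator` above. This is not from the repository; it assumes the default constructor arguments and a random mel input, so the output is only meaningful once trained weights are loaded:

```python
import torch

from TTS.vocoder.models.multiband_melgan_generator import MultibandMelganGenerator

model = MultibandMelganGenerator()  # defaults: 80-channel mel input, 4 PQMF sub-bands
model.eval()

mel = torch.randn(1, 80, 100)  # conditioning features [B, C_mel, T]
with torch.no_grad():
    wav = model.inference(mel)  # generator stack, then PQMF synthesis recombines the sub-bands
print(wav.shape)  # [1, 1, T_wav]; T_wav scales with the upsample factors (2*8*2*2 per band, times 4 bands)
```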
-------------------------------------------------------------------------------- /submodules/TTS/TTS/vocoder/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/TTS/vocoder/utils/__init__.py -------------------------------------------------------------------------------- /submodules/TTS/dockerfiles/Dockerfile.dev: -------------------------------------------------------------------------------- 1 | ARG BASE=nvidia/cuda:11.8.0-base-ubuntu22.04 2 | FROM ${BASE} 3 | 4 | # Install OS dependencies: 5 | RUN apt-get update && apt-get upgrade -y 6 | RUN apt-get install -y --no-install-recommends \ 7 | gcc g++ \ 8 | make \ 9 | python3 python3-dev python3-pip python3-venv python3-wheel \ 10 | espeak-ng libsndfile1-dev \ 11 | && rm -rf /var/lib/apt/lists/* 12 | 13 | # Install Major Python Dependencies: 14 | RUN pip3 install llvmlite --ignore-installed 15 | RUN pip3 install torch torchaudio --extra-index-url https://download.pytorch.org/whl/cu118 16 | RUN rm -rf /root/.cache/pip 17 | 18 | WORKDIR /root 19 | 20 | # Copy Dependency Lock Files: 21 | COPY \ 22 | Makefile \ 23 | pyproject.toml \ 24 | setup.py \ 25 | requirements.dev.txt \ 26 | requirements.ja.txt \ 27 | requirements.notebooks.txt \ 28 | requirements.txt \ 29 | /root/ 30 | 31 | # Install Project Dependencies 32 | # Separate stage to limit re-downloading: 33 | RUN pip install \ 34 | -r requirements.txt \ 35 | -r requirements.dev.txt \ 36 | -r requirements.ja.txt \ 37 | -r requirements.notebooks.txt 38 | 39 | # Copy TTS repository contents: 40 | COPY . /root 41 | 42 | # Installing the TTS package itself: 43 | RUN make install 44 | 45 | -------------------------------------------------------------------------------- /submodules/TTS/docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= -j auto -WT --keep-going 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 | 
--------------------------------------------------------------------------------
/submodules/TTS/docs/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/docs/README.md
--------------------------------------------------------------------------------
/submodules/TTS/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | furo
2 | myst-parser == 2.0.0
3 | sphinx == 7.2.5
4 | sphinx_inline_tabs
5 | sphinx_copybutton
6 | linkify-it-py
--------------------------------------------------------------------------------
/submodules/TTS/docs/source/_static/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/docs/source/_static/logo.png
--------------------------------------------------------------------------------
/submodules/TTS/docs/source/_templates/page.html:
--------------------------------------------------------------------------------
1 | {% extends "!page.html" %}
2 | {% block scripts %}
3 |     {{ super() }}
4 | {% endblock %}
--------------------------------------------------------------------------------
/submodules/TTS/docs/source/contributing.md:
--------------------------------------------------------------------------------
1 | ```{include} ../../CONTRIBUTING.md
2 | :relative-images:
3 | ```
4 | 
--------------------------------------------------------------------------------
/submodules/TTS/docs/source/implementing_a_new_language_frontend.md:
--------------------------------------------------------------------------------
1 | # Implementing a New Language Frontend
2 | 
3 | - Language frontends are located under `TTS.tts.utils.text`
4 | - Each supported language has a separate folder.
5 | - Each folder contains all the utilities for processing the text input.
6 | - `TTS.tts.utils.text.phonemizers` contains the main phonemizer for a language. This is the class that uses the utilities
7 | from the previous step and is used to convert the text to phonemes or graphemes for the model.
8 | - After you implement your phonemizer, you need to add it to `TTS/tts/utils/text/phonemizers/__init__.py` so that the
9 | language code in the model config - `config.phoneme_language` - is mapped to the phonemizer class and the phonemizer is instantiated automatically.
10 | - You should also add tests to `tests/text_tests` if you want to make a PR.
11 | 
12 | We suggest you check the available implementations as references. Good luck!
13 | 
--------------------------------------------------------------------------------
/submodules/TTS/docs/source/index.md:
--------------------------------------------------------------------------------
1 | 
2 | ```{include} ../../README.md
3 | :relative-images:
4 | ```
5 | ----
6 | 
7 | # Documentation Content
8 | ```{eval-rst}
9 | .. toctree::
10 |     :maxdepth: 2
11 |     :caption: Get started
12 | 
13 |     tutorial_for_nervous_beginners
14 |     installation
15 |     faq
16 |     contributing
17 | 
18 | .. toctree::
19 |     :maxdepth: 2
20 |     :caption: Using 🐸TTS
21 | 
22 |     inference
23 |     docker_images
24 |     implementing_a_new_model
25 |     implementing_a_new_language_frontend
26 |     training_a_model
27 |     finetuning
28 |     configuration
29 |     formatting_your_dataset
30 |     what_makes_a_good_dataset
31 |     tts_datasets
32 |     marytts
33 | 
34 | .. toctree::
35 |     :maxdepth: 2
36 |     :caption: Main Classes
37 | 
38 |     main_classes/trainer_api
39 |     main_classes/audio_processor
40 |     main_classes/model_api
41 |     main_classes/dataset
42 |     main_classes/gan
43 |     main_classes/speaker_manager
44 | 
45 | .. toctree::
46 |     :maxdepth: 2
47 |     :caption: `tts` Models
48 | 
49 |     models/glow_tts.md
50 |     models/vits.md
51 |     models/forward_tts.md
52 |     models/tacotron1-2.md
53 |     models/overflow.md
54 |     models/tortoise.md
55 |     models/bark.md
56 |     models/xtts.md
57 | 
58 | .. toctree::
59 |     :maxdepth: 2
60 |     :caption: `vocoder` Models
61 | 
62 | ```
--------------------------------------------------------------------------------
/submodules/TTS/docs/source/installation.md:
--------------------------------------------------------------------------------
1 | # Installation
2 | 
3 | 🐸TTS supports Python >=3.7, <3.11.0 and is tested on Ubuntu 18.10, 19.10 and 20.10.
4 | 
5 | ## Using `pip`
6 | 
7 | `pip` is recommended if you want to use 🐸TTS only for inference.
8 | 
9 | You can install from PyPI as follows:
10 | 
11 | ```bash
12 | pip install TTS  # from PyPI
13 | ```
14 | 
15 | Or install from GitHub:
16 | 
17 | ```bash
18 | pip install git+https://github.com/coqui-ai/TTS  # from GitHub
19 | ```
20 | 
21 | ## Installing From Source
22 | 
23 | This is recommended for development and more control over 🐸TTS.
24 | 
25 | ```bash
26 | git clone https://github.com/coqui-ai/TTS/
27 | cd TTS
28 | make system-deps  # only on Linux systems.
29 | make install
30 | ```
31 | 
32 | ## On Windows
33 | If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](https://stackoverflow.com/questions/66726331/
--------------------------------------------------------------------------------
/submodules/TTS/docs/source/main_classes/audio_processor.md:
--------------------------------------------------------------------------------
1 | # AudioProcessor API
2 | 
3 | `TTS.utils.audio.AudioProcessor` is the core class for all the audio processing routines. It provides an API for
4 | 
5 | - Feature extraction.
6 | - Sound normalization.
7 | - Reading and writing audio files.
8 | - Sampling audio signals.
9 | - Normalizing and denormalizing audio signals.
10 | - Griffin-Lim vocoder.
11 | 
12 | The `AudioProcessor` needs to be initialized with `TTS.config.shared_configs.BaseAudioConfig`. Any model config
13 | must also inherit from or instantiate `BaseAudioConfig`.
14 | 
15 | ## AudioProcessor
16 | ```{eval-rst}
17 | .. autoclass:: TTS.utils.audio.AudioProcessor
18 |     :members:
19 | ```
20 | 
21 | ## BaseAudioConfig
22 | ```{eval-rst}
23 | .. autoclass:: TTS.config.shared_configs.BaseAudioConfig
24 |     :members:
25 | ```
--------------------------------------------------------------------------------
/submodules/TTS/docs/source/main_classes/dataset.md:
--------------------------------------------------------------------------------
1 | # Datasets
2 | 
3 | ## TTS Dataset
4 | 
5 | ```{eval-rst}
6 | .. autoclass:: TTS.tts.datasets.TTSDataset
7 |     :members:
8 | ```
9 | 
10 | ## Vocoder Dataset
11 | 
12 | ```{eval-rst}
13 | .. autoclass:: TTS.vocoder.datasets.gan_dataset.GANDataset
14 |     :members:
15 | ```
16 | 
17 | ```{eval-rst}
18 | ..
autoclass:: TTS.vocoder.datasets.wavegrad_dataset.WaveGradDataset 19 | :members: 20 | ``` 21 | 22 | ```{eval-rst} 23 | .. autoclass:: TTS.vocoder.datasets.wavernn_dataset.WaveRNNDataset 24 | :members: 25 | ``` -------------------------------------------------------------------------------- /submodules/TTS/docs/source/main_classes/gan.md: -------------------------------------------------------------------------------- 1 | # GAN API 2 | 3 | The {class}`TTS.vocoder.models.gan.GAN` provides an easy way to implementing new GAN based models. You just need 4 | to define the model architectures for the generator and the discriminator networks and give them to the `GAN` class 5 | to do its ✨️. 6 | 7 | 8 | ## GAN 9 | ```{eval-rst} 10 | .. autoclass:: TTS.vocoder.models.gan.GAN 11 | :members: 12 | ``` -------------------------------------------------------------------------------- /submodules/TTS/docs/source/main_classes/model_api.md: -------------------------------------------------------------------------------- 1 | # Model API 2 | Model API provides you a set of functions that easily make your model compatible with the `Trainer`, 3 | `Synthesizer` and `ModelZoo`. 4 | 5 | ## Base TTS Model 6 | 7 | ```{eval-rst} 8 | .. autoclass:: TTS.model.BaseTrainerModel 9 | :members: 10 | ``` 11 | 12 | ## Base tts Model 13 | 14 | ```{eval-rst} 15 | .. autoclass:: TTS.tts.models.base_tts.BaseTTS 16 | :members: 17 | ``` 18 | 19 | ## Base vocoder Model 20 | 21 | ```{eval-rst} 22 | .. autoclass:: TTS.vocoder.models.base_vocoder.BaseVocoder 23 | :members: 24 | ``` -------------------------------------------------------------------------------- /submodules/TTS/docs/source/main_classes/speaker_manager.md: -------------------------------------------------------------------------------- 1 | # Speaker Manager API 2 | 3 | The {class}`TTS.tts.utils.speakers.SpeakerManager` organize speaker related data and information for 🐸TTS models. It is 4 | especially useful for multi-speaker models. 5 | 6 | 7 | ## Speaker Manager 8 | ```{eval-rst} 9 | .. automodule:: TTS.tts.utils.speakers 10 | :members: 11 | ``` -------------------------------------------------------------------------------- /submodules/TTS/docs/source/main_classes/trainer_api.md: -------------------------------------------------------------------------------- 1 | # Trainer API 2 | 3 | We made the trainer a separate project on https://github.com/coqui-ai/Trainer 4 | -------------------------------------------------------------------------------- /submodules/TTS/docs/source/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 
23 | echo.If you don't have Sphinx installed, grab it from
24 | echo.http://sphinx-doc.org/
25 | exit /b 1
26 | )
27 | 
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 | 
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 | 
34 | :end
35 | popd
36 | 
--------------------------------------------------------------------------------
/submodules/TTS/docs/source/models/glow_tts.md:
--------------------------------------------------------------------------------
1 | # Glow TTS
2 | 
3 | Glow TTS is a normalizing flow model for text-to-speech. It is built on the generic Glow model that was previously
4 | used in computer vision and vocoder models. It uses "monotonic alignment search" (MAS) to find the text-to-speech alignment
5 | and uses the output to train a separate duration predictor network for faster inference run-time.
6 | 
7 | ## Important resources & papers
8 | - GlowTTS: https://arxiv.org/abs/2005.11129
9 | - Glow (Generative Flow with invertible 1x1 Convolutions): https://arxiv.org/abs/1807.03039
10 | - Normalizing Flows: https://blog.evjang.com/2018/01/nf1.html
11 | 
12 | ## GlowTTS Config
13 | ```{eval-rst}
14 | .. autoclass:: TTS.tts.configs.glow_tts_config.GlowTTSConfig
15 |     :members:
16 | ```
17 | 
18 | ## GlowTTS Model
19 | ```{eval-rst}
20 | .. autoclass:: TTS.tts.models.glow_tts.GlowTTS
21 |     :members:
22 | ```
23 | 
--------------------------------------------------------------------------------
/submodules/TTS/docs/source/tts_datasets.md:
--------------------------------------------------------------------------------
1 | # TTS Datasets
2 | 
3 | Some of the known public datasets to which we have successfully applied 🐸TTS:
4 | 
5 | - [English - LJ Speech](https://keithito.com/LJ-Speech-Dataset/)
6 | - [English - Nancy](http://www.cstr.ed.ac.uk/projects/blizzard/2011/lessac_blizzard2011/)
7 | - [English - TWEB](https://www.kaggle.com/bryanpark/the-world-english-bible-speech-dataset)
8 | - [English - LibriTTS](https://openslr.org/60/)
9 | - [English - VCTK](https://datashare.ed.ac.uk/handle/10283/2950)
10 | - [Multilingual - M-AI-Labs](http://www.caito.de/2019/01/the-m-ailabs-speech-dataset/)
11 | - [Spanish](https://drive.google.com/file/d/1Sm_zyBo67XHkiFhcRSQ4YaHPYM0slO_e/view?usp=sharing) - thx! @carlfm01
12 | - [German - Thorsten OGVD](https://github.com/thorstenMueller/deep-learning-german-tts)
13 | - [Japanese - Kokoro](https://www.kaggle.com/kaiida/kokoro-speech-dataset-v11-small/version/1)
14 | - [Chinese](https://www.data-baker.com/data/index/source/)
15 | - [Ukrainian - LADA](https://github.com/egorsmkv/ukrainian-tts-datasets/tree/main/lada)
16 | 
17 | Let us know if you use 🐸TTS on a different dataset.
18 | 
--------------------------------------------------------------------------------
/submodules/TTS/images/TTS-performance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/images/TTS-performance.png
--------------------------------------------------------------------------------
/submodules/TTS/images/coqui-log-green-TTS.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/images/coqui-log-green-TTS.png
--------------------------------------------------------------------------------
/submodules/TTS/images/demo_server.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/images/demo_server.gif
--------------------------------------------------------------------------------
/submodules/TTS/images/example_model_output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/images/example_model_output.png
--------------------------------------------------------------------------------
/submodules/TTS/images/model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/images/model.png
--------------------------------------------------------------------------------
/submodules/TTS/images/tts_cli.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/images/tts_cli.gif
--------------------------------------------------------------------------------
/submodules/TTS/images/tts_performance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/images/tts_performance.png
--------------------------------------------------------------------------------
/submodules/TTS/notebooks/dataset_analysis/README.md:
--------------------------------------------------------------------------------
1 | ## Simple Notebook to Analyze a Dataset
2 | 
3 | Using this notebook, you can easily analyze a brand-new dataset, find exceptional cases, and define your training set.
4 | 
5 | What we are looking for here is a reasonable distribution of instances in terms of sequence length, audio length and word coverage.
6 | 
7 | This notebook is inspired by https://github.com/MycroftAI/mimic2
8 | 
--------------------------------------------------------------------------------
/submodules/TTS/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = [
3 |     "setuptools",
4 |     "wheel",
5 |     "cython~=0.29.30",
6 |     "numpy>=1.22.0",
7 |     "packaging",
8 | ]
9 | 
10 | [flake8]
11 | max-line-length=120
12 | 
13 | [tool.black]
14 | line-length = 120
15 | target-version = ['py39']
16 | 
17 | [tool.isort]
18 | line_length = 120
19 | profile = "black"
20 | multi_line_output = 3
21 | 
--------------------------------------------------------------------------------
/submodules/TTS/recipes/README.md:
--------------------------------------------------------------------------------
1 | # 🐸💬 TTS Training Recipes
2 | 
3 | TTS recipes are intended to host scripts that run all the necessary steps to train a TTS model on a particular dataset.
4 | 
5 | For each dataset, you need to download the dataset once. Then you run the training for the model you want.
6 | 
7 | Run each script from the root TTS folder as follows.
8 | 
9 | ```console
10 | $ sh ./recipes/<dataset>/download_<dataset>.sh
11 | $ python recipes/<dataset>/<model_name>/train.py
12 | ```
13 | 
14 | For some datasets you might need to resample the audio files. For example, the VCTK dataset can be resampled to 22050 Hz as follows.
15 | 
16 | ```console
17 | python TTS/bin/resample.py --input_dir recipes/vctk/VCTK/wav48_silence_trimmed --output_sr 22050 --output_dir recipes/vctk/VCTK/wav48_silence_trimmed --n_jobs 8 --file_ext flac
18 | ```
19 | 
20 | If you train a new model using TTS, feel free to share your training to expand the list of recipes.
21 | 
22 | You can also open a new discussion and share your progress with the 🐸 community.
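All of the recipe training scripts follow the same skeleton: build a config, initialize the audio processor and tokenizer, load samples, and hand everything to the `Trainer`. A condensed sketch of that skeleton (illustrative, loosely mirroring the LJSpeech GlowTTS recipe; paths and parameters are placeholders):

```python
import os

from trainer import Trainer, TrainerArgs

from TTS.tts.configs.glow_tts_config import GlowTTSConfig
from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.glow_tts import GlowTTS
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor

output_path = os.path.dirname(os.path.abspath(__file__))
dataset_config = BaseDatasetConfig(
    formatter="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
)
config = GlowTTSConfig(batch_size=32, run_eval=True, datasets=[dataset_config], output_path=output_path)

ap = AudioProcessor.init_from_config(config)               # audio feature settings
tokenizer, config = TTSTokenizer.init_from_config(config)  # text-to-token frontend
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)

model = GlowTTS(config, ap, tokenizer, speaker_manager=None)
trainer = Trainer(
    TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
)
trainer.fit()
```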
-------------------------------------------------------------------------------- /submodules/TTS/recipes/bel-alex73/.gitignore: -------------------------------------------------------------------------------- 1 | /docker-prepare/*.txt 2 | -------------------------------------------------------------------------------- /submodules/TTS/recipes/bel-alex73/docker-prepare-start.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | cd $( dirname -- "$0"; ) 5 | 6 | cp ../../requirements*.txt docker-prepare/ 7 | 8 | docker build -t tts-learn -f docker-prepare/Dockerfile docker-prepare/ 9 | 10 | mkdir -p ../../../storage 11 | docker run --rm -it \ 12 | -p 2525:2525 \ 13 | --shm-size=256M \ 14 | --name tts-learn-run \ 15 | -v $(pwd)/../../:/a/TTS \ 16 | -v $(pwd)/../../../cv-corpus:/a/cv-corpus \ 17 | -v $(pwd)/../../../fanetyka/:/a/fanetyka/ \ 18 | -v $(pwd)/../../../storage:/storage \ 19 | tts-learn 20 | -------------------------------------------------------------------------------- /submodules/TTS/recipes/bel-alex73/docker-prepare/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:22.04 2 | 3 | RUN apt -y update 4 | RUN apt -y upgrade 5 | RUN apt -y install --no-install-recommends pip ffmpeg openjdk-19-jre-headless 6 | 7 | RUN mkdir /a/ 8 | ADD requirements*.txt /a/ 9 | WORKDIR /a/ 10 | RUN pip install -r requirements.txt -r requirements.dev.txt -r requirements.notebooks.txt 11 | RUN pip install seaborn pydub notebook 12 | 13 | RUN apt -y install --no-install-recommends gcc libpython3.10-dev 14 | 15 | ADD runtime.sh /a/ 16 | 17 | WORKDIR /a/TTS/ 18 | CMD /a/runtime.sh 19 | -------------------------------------------------------------------------------- /submodules/TTS/recipes/bel-alex73/docker-prepare/runtime.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd /a/TTS 4 | pip install -e .[all,dev,notebooks] 5 | 6 | LANG=C.utf8 bash 7 | -------------------------------------------------------------------------------- /submodules/TTS/recipes/bel-alex73/dump_config.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | 4 | from train_glowtts import config 5 | 6 | s = json.dumps(config, default=vars, indent=2) 7 | s = re.sub(r'"test_sentences":\s*\[\],', "", s) 8 | print(s) 9 | -------------------------------------------------------------------------------- /submodules/TTS/recipes/blizzard2013/README.md: -------------------------------------------------------------------------------- 1 | # How to get the Blizzard 2013 Dataset 2 | 3 | The Capacitron model is a variational encoder extension of standard Tacotron based models to model prosody. 4 | 5 | To take full advantage of the model, it is advised to train the model with a dataset that contains a significant amount of prosodic information in the utterances. A tested candidate for such applications is the blizzard2013 dataset from the Blizzard Challenge, containing many hours of high quality audio book recordings. 6 | 7 | To get a license and download link for this dataset, you need to visit the [website](https://www.cstr.ed.ac.uk/projects/blizzard/2013/lessac_blizzard2013/license.html) of the Centre for Speech Technology Research of the University of Edinburgh. 8 | 9 | You get access to the raw dataset in a couple of days. 
There are a few preprocessing steps you need to do to be able to use the high fidelity dataset. 10 | 11 | 1. Get the forced time alignments for the blizzard dataset from [here](https://github.com/mueller91/tts_alignments). 12 | 2. Segment the high fidelity audio-book files based on the instructions [here](https://github.com/Tomiinek/Blizzard2013_Segmentation). -------------------------------------------------------------------------------- /submodules/TTS/recipes/kokoro/tacotron2-DDC/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # take the scripts's parent's directory to prefix all the output paths. 3 | RUN_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" 4 | CORPUS=kokoro-speech-v1_1-small 5 | echo $RUN_DIR 6 | if [ \! -d $RUN_DIR/$CORPUS ] ; then 7 | echo "$RUN_DIR/$CORPUS doesn't exist." 8 | echo "Follow the instruction of https://github.com/kaiidams/Kokoro-Speech-Dataset to make the corpus." 9 | exit 1 10 | fi 11 | # create train-val splits 12 | shuf $RUN_DIR/$CORPUS/metadata.csv > $RUN_DIR/$CORPUS/metadata_shuf.csv 13 | head -n 8000 $RUN_DIR/$CORPUS/metadata_shuf.csv > $RUN_DIR/$CORPUS/metadata_train.csv 14 | tail -n 812 $RUN_DIR/$CORPUS/metadata_shuf.csv > $RUN_DIR/$CORPUS/metadata_val.csv 15 | # compute dataset mean and variance for normalization 16 | python TTS/bin/compute_statistics.py $RUN_DIR/tacotron2-DDC.json $RUN_DIR/scale_stats.npy --data_path $RUN_DIR/$CORPUS/wavs/ 17 | # training .... 18 | # change the GPU id if needed 19 | CUDA_VISIBLE_DEVICES="0" python TTS/bin/train_tts.py --config_path $RUN_DIR/tacotron2-DDC.json \ 20 | --coqpit.output_path $RUN_DIR \ 21 | --coqpit.datasets.0.path $RUN_DIR/$CORPUS \ 22 | --coqpit.audio.stats_path $RUN_DIR/scale_stats.npy \ 23 | --coqpit.phoneme_cache_path $RUN_DIR/phoneme_cache \ -------------------------------------------------------------------------------- /submodules/TTS/recipes/ljspeech/README.md: -------------------------------------------------------------------------------- 1 | # 🐸💬 TTS LJspeech Recipes 2 | 3 | For running the recipes 4 | 5 | 1. Download the LJSpeech dataset here either manually from [its official website](https://keithito.com/LJ-Speech-Dataset/) or using ```download_ljspeech.sh```. 6 | 2. Go to your desired model folder and run the training. 7 | 8 | Running Python files. (Choose the desired GPU ID for your run and set ```CUDA_VISIBLE_DEVICES```) 9 | ```terminal 10 | CUDA_VISIBLE_DEVICES="0" python train_modelX.py 11 | ``` 12 | 13 | Running bash scripts. 14 | ```terminal 15 | bash run.sh 16 | ``` 17 | 18 | 💡 Note that these runs are just templates to help you start training your first model. They are not optimized for the best 19 | result. Double-check the configurations and feel free to share your experiments to find better parameters together 💪. 20 | -------------------------------------------------------------------------------- /submodules/TTS/recipes/ljspeech/download_ljspeech.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # take the scripts's parent's directory to prefix all the output paths. 
3 | RUN_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" 4 | echo $RUN_DIR 5 | # download LJSpeech dataset 6 | wget http://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2 7 | # extract 8 | tar -xjf LJSpeech-1.1.tar.bz2 9 | # create train-val splits 10 | shuf LJSpeech-1.1/metadata.csv > LJSpeech-1.1/metadata_shuf.csv 11 | head -n 12000 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_train.csv 12 | tail -n 1100 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_val.csv 13 | mv LJSpeech-1.1 $RUN_DIR/recipes/ljspeech/ 14 | rm LJSpeech-1.1.tar.bz2 -------------------------------------------------------------------------------- /submodules/TTS/recipes/ljspeech/hifigan/train_hifigan.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from trainer import Trainer, TrainerArgs 4 | 5 | from TTS.utils.audio import AudioProcessor 6 | from TTS.vocoder.configs import HifiganConfig 7 | from TTS.vocoder.datasets.preprocess import load_wav_data 8 | from TTS.vocoder.models.gan import GAN 9 | 10 | output_path = os.path.dirname(os.path.abspath(__file__)) 11 | 12 | config = HifiganConfig( 13 | batch_size=32, 14 | eval_batch_size=16, 15 | num_loader_workers=4, 16 | num_eval_loader_workers=4, 17 | run_eval=True, 18 | test_delay_epochs=5, 19 | epochs=1000, 20 | seq_len=8192, 21 | pad_short=2000, 22 | use_noise_augment=True, 23 | eval_split_size=10, 24 | print_step=25, 25 | print_eval=False, 26 | mixed_precision=False, 27 | lr_gen=1e-4, 28 | lr_disc=1e-4, 29 | data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"), 30 | output_path=output_path, 31 | ) 32 | 33 | # init audio processor 34 | ap = AudioProcessor(**config.audio.to_dict()) 35 | 36 | # load training samples 37 | eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size) 38 | 39 | # init model 40 | model = GAN(config, ap) 41 | 42 | # init the trainer and 🚀 43 | trainer = Trainer( 44 | TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples 45 | ) 46 | trainer.fit() 47 | -------------------------------------------------------------------------------- /submodules/TTS/recipes/ljspeech/multiband_melgan/train_multiband_melgan.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from trainer import Trainer, TrainerArgs 4 | 5 | from TTS.utils.audio import AudioProcessor 6 | from TTS.vocoder.configs import MultibandMelganConfig 7 | from TTS.vocoder.datasets.preprocess import load_wav_data 8 | from TTS.vocoder.models.gan import GAN 9 | 10 | output_path = os.path.dirname(os.path.abspath(__file__)) 11 | 12 | config = MultibandMelganConfig( 13 | batch_size=32, 14 | eval_batch_size=16, 15 | num_loader_workers=4, 16 | num_eval_loader_workers=4, 17 | run_eval=True, 18 | test_delay_epochs=5, 19 | epochs=1000, 20 | seq_len=8192, 21 | pad_short=2000, 22 | use_noise_augment=True, 23 | eval_split_size=10, 24 | print_step=25, 25 | print_eval=False, 26 | mixed_precision=False, 27 | lr_gen=1e-4, 28 | lr_disc=1e-4, 29 | data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"), 30 | output_path=output_path, 31 | ) 32 | 33 | # init audio processor 34 | ap = AudioProcessor(**config.audio.to_dict()) 35 | 36 | # load training samples 37 | eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size) 38 | 39 | # init model 40 | model = GAN(config, ap) 41 | 42 | # init the trainer and 🚀 43 | trainer = Trainer( 44 | TrainerArgs(), config, 
output_path, model=model, train_samples=train_samples, eval_samples=eval_samples 45 | ) 46 | trainer.fit() 47 | -------------------------------------------------------------------------------- /submodules/TTS/recipes/ljspeech/overflow/lj_parameters.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/recipes/ljspeech/overflow/lj_parameters.pt -------------------------------------------------------------------------------- /submodules/TTS/recipes/ljspeech/univnet/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from trainer import Trainer, TrainerArgs 4 | 5 | from TTS.utils.audio import AudioProcessor 6 | from TTS.vocoder.configs import UnivnetConfig 7 | from TTS.vocoder.datasets.preprocess import load_wav_data 8 | from TTS.vocoder.models.gan import GAN 9 | 10 | output_path = os.path.dirname(os.path.abspath(__file__)) 11 | config = UnivnetConfig( 12 | batch_size=64, 13 | eval_batch_size=16, 14 | num_loader_workers=4, 15 | num_eval_loader_workers=4, 16 | run_eval=True, 17 | test_delay_epochs=-1, 18 | epochs=1000, 19 | seq_len=8192, 20 | pad_short=2000, 21 | use_noise_augment=True, 22 | eval_split_size=10, 23 | print_step=25, 24 | print_eval=False, 25 | mixed_precision=False, 26 | lr_gen=1e-4, 27 | lr_disc=1e-4, 28 | data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"), 29 | output_path=output_path, 30 | ) 31 | 32 | # init audio processor 33 | ap = AudioProcessor(**config.audio.to_dict()) 34 | 35 | # load training samples 36 | eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size) 37 | 38 | # init model 39 | model = GAN(config, ap) 40 | 41 | # init the trainer and 🚀 42 | trainer = Trainer( 43 | TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples 44 | ) 45 | trainer.fit() 46 | -------------------------------------------------------------------------------- /submodules/TTS/recipes/ljspeech/wavegrad/train_wavegrad.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from trainer import Trainer, TrainerArgs 4 | 5 | from TTS.utils.audio import AudioProcessor 6 | from TTS.vocoder.configs import WavegradConfig 7 | from TTS.vocoder.datasets.preprocess import load_wav_data 8 | from TTS.vocoder.models.wavegrad import Wavegrad 9 | 10 | output_path = os.path.dirname(os.path.abspath(__file__)) 11 | config = WavegradConfig( 12 | batch_size=32, 13 | eval_batch_size=16, 14 | num_loader_workers=4, 15 | num_eval_loader_workers=4, 16 | run_eval=True, 17 | test_delay_epochs=-1, 18 | epochs=1000, 19 | seq_len=6144, 20 | pad_short=2000, 21 | use_noise_augment=True, 22 | eval_split_size=50, 23 | print_step=50, 24 | print_eval=True, 25 | mixed_precision=False, 26 | data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"), 27 | output_path=output_path, 28 | ) 29 | 30 | # init audio processor 31 | ap = AudioProcessor(**config.audio.to_dict()) 32 | 33 | # load training samples 34 | eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size) 35 | 36 | # init model 37 | model = Wavegrad(config) 38 | 39 | # init the trainer and 🚀 40 | trainer = Trainer( 41 | TrainerArgs(), 42 | config, 43 | output_path, 44 | model=model, 45 | train_samples=train_samples, 46 | eval_samples=eval_samples, 47 | training_assets={"audio_processor": ap}, 48 | ) 49 | 
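# fit() runs the full loop: it alternates training and evaluation epochs,
# logs to the console every `print_step` steps, and writes checkpoints
# under `output_path`.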
trainer.fit() 50 | -------------------------------------------------------------------------------- /submodules/TTS/recipes/ljspeech/wavernn/train_wavernn.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from trainer import Trainer, TrainerArgs 4 | 5 | from TTS.utils.audio import AudioProcessor 6 | from TTS.vocoder.configs import WavernnConfig 7 | from TTS.vocoder.datasets.preprocess import load_wav_data 8 | from TTS.vocoder.models.wavernn import Wavernn 9 | 10 | output_path = os.path.dirname(os.path.abspath(__file__)) 11 | config = WavernnConfig( 12 | batch_size=64, 13 | eval_batch_size=16, 14 | num_loader_workers=4, 15 | num_eval_loader_workers=4, 16 | run_eval=True, 17 | test_delay_epochs=-1, 18 | epochs=10000, 19 | seq_len=1280, 20 | pad_short=2000, 21 | use_noise_augment=False, 22 | eval_split_size=10, 23 | print_step=25, 24 | print_eval=True, 25 | mixed_precision=False, 26 | lr=1e-4, 27 | grad_clip=4, 28 | data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"), 29 | output_path=output_path, 30 | ) 31 | 32 | # init audio processor 33 | ap = AudioProcessor(**config.audio.to_dict()) 34 | 35 | # load training samples 36 | eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size) 37 | 38 | # init model 39 | model = Wavernn(config) 40 | 41 | # init the trainer and 🚀 42 | trainer = Trainer( 43 | TrainerArgs(), 44 | config, 45 | output_path, 46 | model=model, 47 | train_samples=train_samples, 48 | eval_samples=eval_samples, 49 | training_assets={"audio_processor": ap}, 50 | ) 51 | trainer.fit() 52 | -------------------------------------------------------------------------------- /submodules/TTS/recipes/thorsten_DE/README.md: -------------------------------------------------------------------------------- 1 | # 🐸💬 TTS Thorsten Recipes 2 | 3 | For running the recipes you need the [Thorsten-Voice](https://github.com/thorstenMueller/Thorsten-Voice) dataset. 4 | 5 | You can download it manually from [the official website](https://www.thorsten-voice.de/) or use ```download_thorsten_DE.sh```; alternatively, running any of the **train_modelX.py** scripts will download the dataset if it is not already present. 6 | 7 | Then, go to your desired model folder and run the training. 8 | 9 | Running Python files. (Choose the desired GPU ID for your run and set ```CUDA_VISIBLE_DEVICES```) 10 | ```terminal 11 | CUDA_VISIBLE_DEVICES="0" python train_modelX.py 12 | ``` 13 | 14 | 💡 Note that these runs are just templates to help you start training your first model. They are not optimized for the best 15 | result. Double-check the configurations and feel free to share your experiments to find better parameters together 💪.
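For example, to train a Glow-TTS model on thorsten-de (a minimal sketch; the recipe folder and script name below are assumptions, so double-check what is actually available under ```recipes/thorsten_DE/```):
```terminal
cd recipes/thorsten_DE/glow_tts
CUDA_VISIBLE_DEVICES="0" python train_glowtts.py
```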
16 | -------------------------------------------------------------------------------- /submodules/TTS/recipes/thorsten_DE/download_thorsten_DE.sh: -------------------------------------------------------------------------------- 1 | # create venv 2 | python3 -m venv env 3 | source env/bin/activate 4 | pip install pip --upgrade 5 | 6 | # download Thorsten_DE dataset 7 | pip install gdown 8 | gdown --id 1yKJM1LAOQpRVojKunD9r8WN_p5KzBxjc -O dataset.tgz 9 | tar -xzf dataset.tgz 10 | 11 | # create train-val splits (the archive extracts to a folder named LJSpeech-1.1) 12 | shuf LJSpeech-1.1/metadata.csv > LJSpeech-1.1/metadata_shuf.csv 13 | head -n 20668 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_train.csv 14 | tail -n 2000 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_val.csv 15 | 16 | # rename dataset and remove archive 17 | mv LJSpeech-1.1 thorsten-de 18 | rm dataset.tgz 19 | 20 | # destroy venv 21 | rm -rf env 22 | -------------------------------------------------------------------------------- /submodules/TTS/recipes/vctk/download_vctk.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # take the script's parent directory to prefix all the output paths. 3 | RUN_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" 4 | echo $RUN_DIR 5 | # download VCTK dataset 6 | wget https://datashare.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip -O VCTK-Corpus-0.92.zip 7 | # extract 8 | mkdir VCTK 9 | unzip VCTK-Corpus-0.92 -d VCTK 10 | # move the dataset into the recipe folder 11 | mv VCTK $RUN_DIR/recipes/vctk/ 12 | rm VCTK-Corpus-0.92.zip 13 | -------------------------------------------------------------------------------- /submodules/TTS/requirements.dev.txt: -------------------------------------------------------------------------------- 1 | black 2 | coverage 3 | isort 4 | nose2 5 | pylint==2.10.2 6 | -------------------------------------------------------------------------------- /submodules/TTS/requirements.ja.txt: -------------------------------------------------------------------------------- 1 | # These cause some compatibility issues on some systems and are not strictly necessary 2 | # japanese g2p deps 3 | mecab-python3==1.0.6 4 | unidic-lite==1.0.8 5 | cutlet 6 | -------------------------------------------------------------------------------- /submodules/TTS/requirements.notebooks.txt: -------------------------------------------------------------------------------- 1 | bokeh==1.4.0 -------------------------------------------------------------------------------- /submodules/TTS/requirements.txt: -------------------------------------------------------------------------------- 1 | # core deps 2 | # numpy==1.22.0;python_version<="3.10" 3 | # numpy>=1.24.3;python_version>"3.10" 4 | cython>=0.29.30 5 | scipy>=1.11.2 6 | # torch>=2.1 7 | # torchaudio 8 | soundfile>=0.12.0 9 | librosa>=0.10.0 10 | scikit-learn>=1.3.0 11 | numba==0.55.1;python_version<"3.9" 12 | numba>=0.57.0;python_version>="3.9" 13 | inflect>=5.6.0 14 | tqdm>=4.64.1 15 | anyascii>=0.3.0 16 | pyyaml>=6.0 17 | fsspec>=2023.6.0 # <= 2023.9.1 makes aux tests fail 18 | aiohttp>=3.8.1 19 | packaging>=23.1 20 | mutagen==1.47.0 21 | # deps for examples 22 | flask>=2.0.1 23 | # deps for inference 24 | pysbd>=0.3.4 25 | # deps for notebooks 26 | umap-learn>=0.5.1 27 | pandas>=1.4,<2.0 28 | # deps for training 29 | matplotlib>=3.7.0 30 | # coqui stack 31 | trainer>=0.0.36 32 | # config management 33 | coqpit>=0.0.16 34 | # chinese g2p deps 35 | jieba 36 | pypinyin 37 | # korean 38 | hangul_romanize 39
| # gruut + supported langs 40 | gruut[de,es,fr]==2.2.3 41 | # deps for korean 42 | jamo 43 | nltk 44 | g2pkk>=0.1.1 45 | # deps for bangla 46 | bangla 47 | bnnumerizer 48 | bnunicodenormalizer 49 | # deps for tortoise 50 | einops>=0.6.0 51 | transformers>=4.33.0 52 | # deps for bark 53 | encodec>=0.1.1 54 | # deps for XTTS 55 | unidecode>=1.3.2 56 | num2words 57 | spacy[ja]>=3 -------------------------------------------------------------------------------- /submodules/TTS/run_bash_tests.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | TF_CPP_MIN_LOG_LEVEL=3 3 | 4 | # runtime bash based tests 5 | # TODO: move these to python 6 | ./tests/bash_tests/test_demo_server.sh && \ 7 | ./tests/bash_tests/test_compute_statistics.sh 8 | -------------------------------------------------------------------------------- /submodules/TTS/scripts/sync_readme.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from pathlib import Path 3 | 4 | 5 | def replace_between_markers(content, marker: str, replacement: str) -> str: 6 | start_marker = f"<!-- begin-{marker} -->\n\n" 7 | end_marker = f"\n\n<!-- end-{marker} -->\n" 8 | start_index = content.index(start_marker) + len(start_marker) 9 | end_index = content.index(end_marker) 10 | content = content[:start_index] + replacement + content[end_index:] 11 | return content 12 | 13 | 14 | def sync_readme(): 15 | ap = argparse.ArgumentParser() 16 | ap.add_argument("--check", action="store_true", default=False) 17 | args = ap.parse_args() 18 | readme_path = Path(__file__).parent.parent / "README.md" 19 | orig_content = readme_path.read_text() 20 | from TTS.bin.synthesize import description 21 | 22 | new_content = replace_between_markers(orig_content, "tts-readme", description.strip()) 23 | if args.check: 24 | if orig_content != new_content: 25 | print("README.md is out of sync; please edit TTS/bin/TTS_README.md and run scripts/sync_readme.py") 26 | exit(42) 27 | readme_path.write_text(new_content) 28 | print("Updated README.md") 29 | 30 | 31 | if __name__ == "__main__": 32 | sync_readme() 33 | -------------------------------------------------------------------------------- /submodules/TTS/setup.cfg: -------------------------------------------------------------------------------- 1 | [build_py] 2 | build_lib=temp_build 3 | 4 | [bdist_wheel] 5 | bdist_dir=temp_build 6 | 7 | [install_lib] 8 | build_dir=temp_build 9 | -------------------------------------------------------------------------------- /submodules/TTS/tests/aux_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/aux_tests/__init__.py -------------------------------------------------------------------------------- /submodules/TTS/tests/aux_tests/test_readme.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import sys 3 | from pathlib import Path 4 | 5 | 6 | def test_readme_up_to_date(): 7 | root = Path(__file__).parent.parent.parent 8 | sync_readme = root / "scripts" / "sync_readme.py" 9 | subprocess.check_call([sys.executable, str(sync_readme), "--check"], cwd=root) 10 | -------------------------------------------------------------------------------- /submodules/TTS/tests/aux_tests/test_stft_torch.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/aux_tests/test_stft_torch.py -------------------------------------------------------------------------------- /submodules/TTS/tests/bash_tests/test_compute_statistics.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -xe 3 | BASEDIR=$(dirname "$0") 4 | echo "$BASEDIR" 5 | # compute normalization statistics 6 | CUDA_VISIBLE_DEVICES="" python TTS/bin/compute_statistics.py --config_path $BASEDIR/../inputs/test_glow_tts.json --out_path $BASEDIR/../outputs/scale_stats.npy 7 | 8 | -------------------------------------------------------------------------------- /submodules/TTS/tests/bash_tests/test_demo_server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -xe 3 | 4 | python -m TTS.server.server & 5 | SERVER_PID=$! 6 | 7 | echo 'Waiting for server...' 8 | sleep 30 9 | 10 | curl -o /tmp/audio.wav "http://localhost:5002/api/tts?text=synthesis%20schmynthesis" 11 | python -c 'import sys; import wave; print(wave.open(sys.argv[1]).getnframes())' /tmp/audio.wav 12 | 13 | kill $SERVER_PID 14 | 15 | rm /tmp/audio.wav 16 | -------------------------------------------------------------------------------- /submodules/TTS/tests/data/dummy_speakers.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/dummy_speakers.pth -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/f0_cache/pitch_stats.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/f0_cache/pitch_stats.npy -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0001.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0001.flac -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0001.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0001.mp3 -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0001.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0001.npy -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0001.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0001.wav
-------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0002.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0002.flac -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0002.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0002.mp3 -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0002.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0002.npy -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0002.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0002.wav -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0003.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0003.flac -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0003.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0003.mp3 -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0003.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0003.npy -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0003.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0003.wav -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0004.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0004.flac -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0004.mp3: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0004.mp3 -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0004.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0004.npy -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0004.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0004.wav -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0005.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0005.flac -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0005.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0005.mp3 -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0005.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0005.npy -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0005.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0005.wav -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0006.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0006.flac -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0006.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0006.mp3 -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0006.npy: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0006.npy -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0006.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0006.wav -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0007.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0007.flac -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0007.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0007.mp3 -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0007.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0007.npy -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0007.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0007.wav -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0008.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0008.flac -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0008.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0008.mp3 -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0008.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0008.npy -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0008.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0008.wav 
-------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0009.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0009.flac -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0009.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0009.mp3 -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0009.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0009.npy -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0009.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0009.wav -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0010.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0010.flac -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0010.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0010.mp3 -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0010.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0010.npy -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0010.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0010.wav -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0011.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0011.flac -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0011.mp3: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0011.mp3 -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0011.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0011.npy -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0011.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0011.wav -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0012.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0012.flac -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0012.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0012.mp3 -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0012.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0012.npy -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0012.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0012.wav -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0013.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0013.flac -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0013.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0013.mp3 -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0013.npy: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0013.npy -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0013.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0013.wav -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0014.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0014.flac -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0014.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0014.mp3 -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0014.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0014.npy -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0014.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0014.wav -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0015.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0015.flac -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0015.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0015.mp3 -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0015.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0015.npy -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0015.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0015.wav 
-------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0016.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0016.flac -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0016.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0016.mp3 -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0016.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0016.npy -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0016.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0016.wav -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0017.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0017.flac -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0017.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0017.mp3 -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0017.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0017.npy -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0017.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0017.wav -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0018.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0018.flac -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0018.mp3: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0018.mp3 -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0018.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0018.npy -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0018.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0018.wav -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0019.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0019.flac -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0019.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0019.mp3 -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0019.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0019.npy -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0019.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0019.wav -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0020.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0020.flac -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0020.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0020.mp3 -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0020.npy: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0020.npy -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0020.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0020.wav -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0021.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0021.flac -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0021.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0021.mp3 -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0021.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0021.npy -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0021.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0021.wav -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0022.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0022.flac -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0022.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0022.mp3 -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0022.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0022.npy -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0022.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0022.wav 
-------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0023.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0023.flac -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0023.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0023.mp3 -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0023.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0023.npy -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0023.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0023.wav -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0024.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0024.flac -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0024.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0024.mp3 -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0024.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0024.npy -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0024.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0024.wav -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0025.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0025.flac -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0025.mp3: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0025.mp3 -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0025.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0025.npy -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0025.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0025.wav -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0026.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0026.flac -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0026.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0026.mp3 -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0026.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0026.npy -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0026.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0026.wav -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0027.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0027.flac -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0027.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0027.mp3 -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0027.npy: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0027.npy -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0027.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0027.wav -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0028.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0028.flac -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0028.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0028.mp3 -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0028.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0028.npy -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0028.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0028.wav -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0029.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0029.flac -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0029.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0029.mp3 -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0029.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0029.npy -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0029.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0029.wav 
-------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0030.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0030.flac -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0030.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0030.mp3 -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0030.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0030.npy -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0030.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0030.wav -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0031.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0031.flac -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0031.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0031.mp3 -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0031.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0031.npy -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0031.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0031.wav -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0032.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0032.flac -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0032.mp3: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0032.mp3 -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0032.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0032.npy -------------------------------------------------------------------------------- /submodules/TTS/tests/data/ljspeech/wavs/LJ001-0032.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data/ljspeech/wavs/LJ001-0032.wav -------------------------------------------------------------------------------- /submodules/TTS/tests/data_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/data_tests/__init__.py -------------------------------------------------------------------------------- /submodules/TTS/tests/data_tests/test_dataset_formatters.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | from tests import get_tests_input_path 5 | from TTS.tts.datasets.formatters import common_voice 6 | 7 | 8 | class TestTTSFormatters(unittest.TestCase): 9 | def test_common_voice_preprocessor(self): # pylint: disable=no-self-use 10 | root_path = get_tests_input_path() 11 | meta_file = "common_voice.tsv" 12 | items = common_voice(root_path, meta_file) 13 | assert items[0]["text"] == "The applicants are invited for coffee and visa is given immediately." 14 | assert items[0]["audio_file"] == os.path.join(get_tests_input_path(), "clips", "common_voice_en_20005954.wav") 15 | 16 | assert items[-1]["text"] == "Competition for limited resources has also resulted in some local conflicts." 17 | assert items[-1]["audio_file"] == os.path.join(get_tests_input_path(), "clips", "common_voice_en_19737074.wav") 18 | -------------------------------------------------------------------------------- /submodules/TTS/tests/inference_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/inference_tests/__init__.py -------------------------------------------------------------------------------- /submodules/TTS/tests/inference_tests/test_synthesize.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from tests import get_tests_output_path, run_cli 4 | 5 | 6 | def test_synthesize(): 7 | """Test synthesize.py with different arguments.""" 8 | output_path = os.path.join(get_tests_output_path(), "output.wav") 9 | run_cli("tts --list_models") 10 | 11 | # single speaker model 12 | run_cli(f'tts --text "This is an example." --out_path "{output_path}"') 13 | run_cli( 14 | "tts --model_name tts_models/en/ljspeech/glow-tts " f'--text "This is an example." 
--out_path "{output_path}"' 15 | ) 16 | run_cli( 17 | "tts --model_name tts_models/en/ljspeech/glow-tts " 18 | "--vocoder_name vocoder_models/en/ljspeech/multiband-melgan " 19 | f'--text "This is an example." --out_path "{output_path}"' 20 | ) 21 | -------------------------------------------------------------------------------- /submodules/TTS/tests/inputs/common_voice.tsv: -------------------------------------------------------------------------------- 1 | client_id path sentence up_votes down_votes age gender accent locale segment 2 | 95324d489b122a800b840e0b0d068f7363a1a6c2cd2e7365672cc7033e38deaa794bd59edcf8196aa35c9791652b9085ac3839a98bb50ebab4a1e8538a94846b common_voice_en_20005954.mp3 The applicants are invited for coffee and visa is given immediately. 3 0 en 3 | 95324d489b122a800b840e0b0d068f7363a1a6c2cd2e7365672cc7033e38deaa794bd59edcf8196aa35c9791652b9085ac3839a98bb50ebab4a1e8538a94846b common_voice_en_20005955.mp3 Developmental robotics is related to, but differs from, evolutionary robotics. 2 0 en 4 | 95324d489b122a800b840e0b0d068f7363a1a6c2cd2e7365672cc7033e38deaa794bd59edcf8196aa35c9791652b9085ac3839a98bb50ebab4a1e8538a94846b common_voice_en_20005956.mp3 The musical was originally directed and choreographed by Alan Lund. 2 0 en 5 | 954a4181ae9fba89d1b1570f2ae148b3ee18ee2311de978e698f598db859f830d93d35574596d713518e8c96cdae01fce7a08c60c2e0a22bcf01e020924440a6 common_voice_en_19737073.mp3 He graduated from Columbia High School, in Brown County, South Dakota. 2 0 en 6 | 954a4181ae9fba89d1b1570f2ae148b3ee18ee2311de978e698f598db859f830d93d35574596d713518e8c96cdae01fce7a08c60c2e0a22bcf01e020924440a6 common_voice_en_19737074.mp3 Competition for limited resources has also resulted in some local conflicts. 2 0 en 7 | -------------------------------------------------------------------------------- /submodules/TTS/tests/inputs/example_1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/inputs/example_1.wav -------------------------------------------------------------------------------- /submodules/TTS/tests/inputs/language_ids.json: -------------------------------------------------------------------------------- 1 | { 2 | "en": 0, 3 | "fr-fr": 1, 4 | "pt-br": 2 5 | } -------------------------------------------------------------------------------- /submodules/TTS/tests/inputs/scale_stats.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/inputs/scale_stats.npy -------------------------------------------------------------------------------- /submodules/TTS/tests/inputs/server_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "tts_checkpoint":"checkpoint_10.pth", // tts checkpoint file 3 | "tts_config":"dummy_model_config.json", // tts config.json file 4 | "tts_speakers": null, // json file listing speaker ids. null if no speaker embedding. 5 | "wavernn_lib_path": null, // Rootpath to wavernn project folder to be imported. If this is null, model uses GL for speech synthesis. 
6 | "wavernn_file": null, // wavernn checkpoint file name 7 | "wavernn_config": null, // wavernn config file 8 | "vocoder_config":null, 9 | "vocoder_checkpoint": null, 10 | "is_wavernn_batched":true, 11 | "port": 5002, 12 | "use_cuda": false, 13 | "debug": true 14 | } 15 | -------------------------------------------------------------------------------- /submodules/TTS/tests/text_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/text_tests/__init__.py -------------------------------------------------------------------------------- /submodules/TTS/tests/text_tests/test_belarusian_phonemizer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | import warnings 4 | 5 | from TTS.tts.utils.text.belarusian.phonemizer import belarusian_text_to_phonemes 6 | 7 | _TEST_CASES = """ 8 | Фанетычны канвертар/fanʲɛˈtɨt͡ʂnɨ kanˈvʲɛrtar 9 | Гэтак мы працавалі/ˈɣɛtak ˈmɨ prat͡saˈvalʲi 10 | """ 11 | 12 | 13 | class TestText(unittest.TestCase): 14 | def test_belarusian_text_to_phonemes(self): 15 | try: 16 | os.environ["BEL_FANETYKA_JAR"] 17 | except KeyError: 18 | warnings.warn( 19 | "You need to define 'BEL_FANETYKA_JAR' environment variable as path to the fanetyka.jar file to test Belarusian phonemizer", 20 | Warning, 21 | ) 22 | return 23 | 24 | for line in _TEST_CASES.strip().split("\n"): 25 | text, phonemes = line.split("/") 26 | self.assertEqual(belarusian_text_to_phonemes(text), phonemes) 27 | 28 | 29 | if __name__ == "__main__": 30 | unittest.main() 31 | -------------------------------------------------------------------------------- /submodules/TTS/tests/text_tests/test_japanese_phonemizer.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from TTS.tts.utils.text.japanese.phonemizer import japanese_text_to_phonemes 4 | 5 | _TEST_CASES = """ 6 | どちらに行きますか?/dochiraniikimasuka? 7 | 今日は温泉に、行きます。/kyo:waoNseNni,ikimasu. 8 | 「A」から「Z」までです。/e:karazeqtomadedesu. 9 | そうですね!/so:desune! 10 | クジラは哺乳類です。/kujirawahonyu:ruidesu. 11 | ヴィディオを見ます。/bidioomimasu. 12 | 今日は8月22日です/kyo:wahachigatsuniju:ninichidesu 13 | xyzとαβγ/eqkusuwaizeqtotoarufabe:tagaNma 14 | 値段は$12.34です/nedaNwaju:niteNsaNyoNdorudesu 15 | """ 16 | 17 | 18 | class TestText(unittest.TestCase): 19 | def test_japanese_text_to_phonemes(self): 20 | for line in _TEST_CASES.strip().split("\n"): 21 | text, phone = line.split("/") 22 | self.assertEqual(japanese_text_to_phonemes(text), phone) 23 | 24 | 25 | if __name__ == "__main__": 26 | unittest.main() 27 | -------------------------------------------------------------------------------- /submodules/TTS/tests/text_tests/test_korean_phonemizer.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from TTS.tts.utils.text.korean.phonemizer import korean_text_to_phonemes 4 | 5 | _TEST_CASES = """ 6 | 포상은 열심히 한 아이에게만 주어지기 때문에 포상인 것입니다./포상으 녈심히 하 나이에게만 주어지기 때무네 포상인 거심니다. 7 | 오늘은 8월 31일 입니다./오느른 파뤌 삼시비리 림니다. 8 | 친구 100명 만들기가 목표입니다./친구 뱅명 만들기가 목표임니다. 9 | A부터 Z까지 입니다./에이부터 제트까지 임니다. 10 | 이게 제 마음이에요./이게 제 마으미에요. 11 | """ 12 | _TEST_CASES_EN = """ 13 | 이제야 이쪽을 보는구나./IJeYa IJjoGeul BoNeunGuNa. 14 | 크고 맛있는 cake를 부탁해요./KeuGo MaSinNeun KeIKeuLeul BuTaKaeYo. 15 | 전부 거짓말이야./JeonBu GeoJinMaLiYa. 16 | 좋은 노래를 찾았어요./JoEun NoLaeLeul ChaJaSseoYo. 
17 | """ 18 | 19 | 20 | class TestText(unittest.TestCase): 21 | def test_korean_text_to_phonemes(self): 22 | for line in _TEST_CASES.strip().split("\n"): 23 | text, phone = line.split("/") 24 | self.assertEqual(korean_text_to_phonemes(text), phone) 25 | for line in _TEST_CASES_EN.strip().split("\n"): 26 | text, phone = line.split("/") 27 | self.assertEqual(korean_text_to_phonemes(text, character="english"), phone) 28 | 29 | 30 | if __name__ == "__main__": 31 | unittest.main() 32 | -------------------------------------------------------------------------------- /submodules/TTS/tests/text_tests/test_text_cleaners.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from TTS.tts.utils.text.cleaners import english_cleaners, phoneme_cleaners 4 | 5 | 6 | def test_time() -> None: 7 | assert english_cleaners("It's 11:00") == "it's eleven a m" 8 | assert english_cleaners("It's 9:01") == "it's nine oh one a m" 9 | assert english_cleaners("It's 16:00") == "it's four p m" 10 | assert english_cleaners("It's 00:00 am") == "it's twelve a m" 11 | 12 | 13 | def test_currency() -> None: 14 | assert phoneme_cleaners("It's $10.50") == "It's ten dollars fifty cents" 15 | assert phoneme_cleaners("£1.1") == "one pound sterling one penny" 16 | assert phoneme_cleaners("¥1") == "one yen" 17 | 18 | 19 | def test_expand_numbers() -> None: 20 | assert phoneme_cleaners("-1") == "minus one" 21 | assert phoneme_cleaners("1") == "one" 22 | -------------------------------------------------------------------------------- /submodules/TTS/tests/tts_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/tts_tests/__init__.py -------------------------------------------------------------------------------- /submodules/TTS/tests/tts_tests2/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/tts_tests2/__init__.py -------------------------------------------------------------------------------- /submodules/TTS/tests/vc_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/vc_tests/__init__.py -------------------------------------------------------------------------------- /submodules/TTS/tests/vocoder_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/vocoder_tests/__init__.py -------------------------------------------------------------------------------- /submodules/TTS/tests/vocoder_tests/test_hifigan_train.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import shutil 4 | 5 | from tests import get_device_id, get_tests_output_path, run_cli 6 | from TTS.vocoder.configs import HifiganConfig 7 | 8 | config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") 9 | output_path = os.path.join(get_tests_output_path(), "train_outputs") 10 | 11 | 12 | config = HifiganConfig( 13 | batch_size=8, 14 | eval_batch_size=8, 15 | 
num_loader_workers=0, 16 | num_eval_loader_workers=0, 17 | run_eval=True, 18 | test_delay_epochs=-1, 19 | epochs=1, 20 | seq_len=1024, 21 | eval_split_size=1, 22 | print_step=1, 23 | print_eval=True, 24 | data_path="tests/data/ljspeech", 25 | output_path=output_path, 26 | ) 27 | config.audio.do_trim_silence = True 28 | config.audio.trim_db = 60 29 | config.save_json(config_path) 30 | 31 | # train the model for one epoch 32 | command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " 33 | run_cli(command_train) 34 | 35 | # Find latest folder 36 | continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) 37 | 38 | # restore the model and continue training for one more epoch 39 | command_train = ( 40 | f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " 41 | ) 42 | run_cli(command_train) 43 | shutil.rmtree(continue_path) 44 | -------------------------------------------------------------------------------- /submodules/TTS/tests/vocoder_tests/test_melgan_train.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import shutil 4 | 5 | from tests import get_device_id, get_tests_output_path, run_cli 6 | from TTS.vocoder.configs import MelganConfig 7 | 8 | config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") 9 | output_path = os.path.join(get_tests_output_path(), "train_outputs") 10 | 11 | config = MelganConfig( 12 | batch_size=4, 13 | eval_batch_size=4, 14 | num_loader_workers=0, 15 | num_eval_loader_workers=0, 16 | run_eval=True, 17 | test_delay_epochs=-1, 18 | epochs=1, 19 | seq_len=2048, 20 | eval_split_size=1, 21 | print_step=1, 22 | discriminator_model_params={"base_channels": 16, "max_channels": 64, "downsample_factors": [4, 4, 4]}, 23 | print_eval=True, 24 | data_path="tests/data/ljspeech", 25 | output_path=output_path, 26 | ) 27 | config.audio.do_trim_silence = True 28 | config.audio.trim_db = 60 29 | config.save_json(config_path) 30 | 31 | # train the model for one epoch 32 | command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " 33 | run_cli(command_train) 34 | 35 | # Find latest folder 36 | continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) 37 | 38 | # restore the model and continue training for one more epoch 39 | command_train = ( 40 | f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " 41 | ) 42 | run_cli(command_train) 43 | shutil.rmtree(continue_path) 44 | -------------------------------------------------------------------------------- /submodules/TTS/tests/vocoder_tests/test_parallel_wavegan_train.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import shutil 4 | 5 | from tests import get_device_id, get_tests_output_path, run_cli 6 | from TTS.vocoder.configs import ParallelWaveganConfig 7 | 8 | config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") 9 | output_path = os.path.join(get_tests_output_path(), "train_outputs") 10 | 11 | config = ParallelWaveganConfig( 12 | batch_size=4, 13 | eval_batch_size=4, 14 | num_loader_workers=0, 15 | num_eval_loader_workers=0, 16 | run_eval=True, 17 | test_delay_epochs=-1, 18 | epochs=1, 19 | seq_len=2048, 20 | eval_split_size=1, 21 | print_step=1, 22 | 
print_eval=True, 23 | data_path="tests/data/ljspeech", 24 | output_path=output_path, 25 | ) 26 | config.audio.do_trim_silence = True 27 | config.audio.trim_db = 60 28 | config.save_json(config_path) 29 | 30 | # train the model for one epoch 31 | command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " 32 | run_cli(command_train) 33 | 34 | # Find latest folder 35 | continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) 36 | 37 | # restore the model and continue training for one more epoch 38 | command_train = ( 39 | f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " 40 | ) 41 | run_cli(command_train) 42 | shutil.rmtree(continue_path) 43 | -------------------------------------------------------------------------------- /submodules/TTS/tests/vocoder_tests/test_vocoder_melgan_discriminator.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | from TTS.vocoder.models.melgan_discriminator import MelganDiscriminator 5 | from TTS.vocoder.models.melgan_multiscale_discriminator import MelganMultiscaleDiscriminator 6 | 7 | 8 | def test_melgan_discriminator(): 9 | model = MelganDiscriminator() 10 | print(model) 11 | dummy_input = torch.rand((4, 1, 256 * 10)) 12 | output, _ = model(dummy_input) 13 | assert np.all(output.shape == (4, 1, 10)) 14 | 15 | 16 | def test_melgan_multi_scale_discriminator(): 17 | model = MelganMultiscaleDiscriminator() 18 | print(model) 19 | dummy_input = torch.rand((4, 1, 256 * 16)) 20 | scores, feats = model(dummy_input) 21 | assert len(scores) == 3 22 | assert len(scores) == len(feats) 23 | assert np.all(scores[0].shape == (4, 1, 64)) 24 | assert np.all(feats[0][0].shape == (4, 16, 4096)) 25 | assert np.all(feats[0][1].shape == (4, 64, 1024)) 26 | assert np.all(feats[0][2].shape == (4, 256, 256)) 27 | -------------------------------------------------------------------------------- /submodules/TTS/tests/vocoder_tests/test_vocoder_melgan_generator.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | from TTS.vocoder.models.melgan_generator import MelganGenerator 5 | 6 | 7 | def test_melgan_generator(): 8 | model = MelganGenerator() 9 | print(model) 10 | dummy_input = torch.rand((4, 80, 64)) 11 | output = model(dummy_input) 12 | assert np.all(output.shape == (4, 1, 64 * 256)) 13 | output = model.inference(dummy_input) 14 | assert np.all(output.shape == (4, 1, (64 + 4) * 256)) 15 | -------------------------------------------------------------------------------- /submodules/TTS/tests/vocoder_tests/test_vocoder_parallel_wavegan_discriminator.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | from TTS.vocoder.models.parallel_wavegan_discriminator import ( 5 | ParallelWaveganDiscriminator, 6 | ResidualParallelWaveganDiscriminator, 7 | ) 8 | 9 | 10 | def test_pwgan_disciminator(): 11 | model = ParallelWaveganDiscriminator( 12 | in_channels=1, 13 | out_channels=1, 14 | kernel_size=3, 15 | num_layers=10, 16 | conv_channels=64, 17 | dilation_factor=1, 18 | nonlinear_activation="LeakyReLU", 19 | nonlinear_activation_params={"negative_slope": 0.2}, 20 | bias=True, 21 | ) 22 | dummy_x = torch.rand((4, 1, 64 * 256)) 23 | output = model(dummy_x) 24 | assert np.all(output.shape == (4, 1, 64 * 256)) 25 | 
model.remove_weight_norm() 26 | 27 | 28 | def test_redisual_pwgan_disciminator(): 29 | model = ResidualParallelWaveganDiscriminator( 30 | in_channels=1, 31 | out_channels=1, 32 | kernel_size=3, 33 | num_layers=30, 34 | stacks=3, 35 | res_channels=64, 36 | gate_channels=128, 37 | skip_channels=64, 38 | dropout=0.0, 39 | bias=True, 40 | nonlinear_activation="LeakyReLU", 41 | nonlinear_activation_params={"negative_slope": 0.2}, 42 | ) 43 | dummy_x = torch.rand((4, 1, 64 * 256)) 44 | output = model(dummy_x) 45 | assert np.all(output.shape == (4, 1, 64 * 256)) 46 | model.remove_weight_norm() 47 | -------------------------------------------------------------------------------- /submodules/TTS/tests/vocoder_tests/test_vocoder_parallel_wavegan_generator.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | from TTS.vocoder.models.parallel_wavegan_generator import ParallelWaveganGenerator 5 | 6 | 7 | def test_pwgan_generator(): 8 | model = ParallelWaveganGenerator( 9 | in_channels=1, 10 | out_channels=1, 11 | kernel_size=3, 12 | num_res_blocks=30, 13 | stacks=3, 14 | res_channels=64, 15 | gate_channels=128, 16 | skip_channels=64, 17 | aux_channels=80, 18 | dropout=0.0, 19 | bias=True, 20 | use_weight_norm=True, 21 | upsample_factors=[4, 4, 4, 4], 22 | ) 23 | dummy_c = torch.rand((2, 80, 5)) 24 | output = model(dummy_c) 25 | assert np.all(output.shape == (2, 1, 5 * 256)), output.shape 26 | model.remove_weight_norm() 27 | output = model.inference(dummy_c) 28 | assert np.all(output.shape == (2, 1, (5 + 4) * 256)) 29 | -------------------------------------------------------------------------------- /submodules/TTS/tests/vocoder_tests/test_vocoder_pqmf.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import soundfile as sf 4 | import torch 5 | from librosa.core import load 6 | 7 | from tests import get_tests_input_path, get_tests_output_path, get_tests_path 8 | from TTS.vocoder.layers.pqmf import PQMF 9 | 10 | TESTS_PATH = get_tests_path() 11 | WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") 12 | 13 | 14 | def test_pqmf(): 15 | w, sr = load(WAV_FILE) 16 | 17 | layer = PQMF(N=4, taps=62, cutoff=0.15, beta=9.0) 18 | w, sr = load(WAV_FILE) 19 | w2 = torch.from_numpy(w[None, None, :]) 20 | b2 = layer.analysis(w2) 21 | w2_ = layer.synthesis(b2) 22 | 23 | print(w2_.max()) 24 | print(w2_.min()) 25 | print(w2_.mean()) 26 | sf.write(os.path.join(get_tests_output_path(), "pqmf_output.wav"), w2_.flatten().detach(), sr) 27 | -------------------------------------------------------------------------------- /submodules/TTS/tests/vocoder_tests/test_vocoder_rwd.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | from TTS.vocoder.models.random_window_discriminator import RandomWindowDiscriminator 5 | 6 | 7 | def test_rwd(): 8 | layer = RandomWindowDiscriminator( 9 | cond_channels=80, 10 | window_sizes=(512, 1024, 2048, 4096, 8192), 11 | cond_disc_downsample_factors=[(8, 4, 2, 2, 2), (8, 4, 2, 2), (8, 4, 2), (8, 4), (4, 2, 2)], 12 | hop_length=256, 13 | ) 14 | x = torch.rand([4, 1, 22050]) 15 | c = torch.rand([4, 80, 22050 // 256]) 16 | 17 | scores, _ = layer(x, c) 18 | assert len(scores) == 10 19 | assert np.all(scores[0].shape == (4, 1, 1)) 20 | -------------------------------------------------------------------------------- /submodules/TTS/tests/vocoder_tests/test_wavegrad_train.py: 
-------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import shutil 4 | 5 | from tests import get_device_id, get_tests_output_path, run_cli 6 | from TTS.vocoder.configs import WavegradConfig 7 | 8 | config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") 9 | output_path = os.path.join(get_tests_output_path(), "train_outputs") 10 | 11 | config = WavegradConfig( 12 | batch_size=8, 13 | eval_batch_size=8, 14 | num_loader_workers=0, 15 | num_eval_loader_workers=0, 16 | run_eval=True, 17 | test_delay_epochs=-1, 18 | epochs=1, 19 | seq_len=8192, 20 | eval_split_size=1, 21 | print_step=1, 22 | print_eval=True, 23 | data_path="tests/data/ljspeech", 24 | output_path=output_path, 25 | test_noise_schedule={"min_val": 1e-6, "max_val": 1e-2, "num_steps": 2}, 26 | ) 27 | config.audio.do_trim_silence = True 28 | config.audio.trim_db = 60 29 | config.save_json(config_path) 30 | 31 | # train the model for one epoch 32 | command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " 33 | run_cli(command_train) 34 | 35 | # Find latest folder 36 | continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) 37 | 38 | # restore the model and continue training for one more epoch 39 | command_train = ( 40 | f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " 41 | ) 42 | run_cli(command_train) 43 | shutil.rmtree(continue_path) 44 | -------------------------------------------------------------------------------- /submodules/TTS/tests/vocoder_tests/test_wavernn_train.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import shutil 4 | 5 | from tests import get_device_id, get_tests_output_path, run_cli 6 | from TTS.vocoder.configs import WavernnConfig 7 | from TTS.vocoder.models.wavernn import WavernnArgs 8 | 9 | config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") 10 | output_path = os.path.join(get_tests_output_path(), "train_outputs") 11 | 12 | 13 | config = WavernnConfig( 14 | model_args=WavernnArgs(), 15 | batch_size=8, 16 | eval_batch_size=8, 17 | num_loader_workers=0, 18 | num_eval_loader_workers=0, 19 | run_eval=True, 20 | test_delay_epochs=-1, 21 | epochs=1, 22 | seq_len=256, # for shorter test time 23 | eval_split_size=1, 24 | print_step=1, 25 | print_eval=True, 26 | data_path="tests/data/ljspeech", 27 | output_path=output_path, 28 | ) 29 | config.audio.do_trim_silence = True 30 | config.audio.trim_db = 60 31 | config.save_json(config_path) 32 | 33 | # train the model for one epoch 34 | command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " 35 | run_cli(command_train) 36 | 37 | # Find latest folder 38 | continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) 39 | 40 | # restore the model and continue training for one more epoch 41 | command_train = ( 42 | f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " 43 | ) 44 | run_cli(command_train) 45 | shutil.rmtree(continue_path) 46 | -------------------------------------------------------------------------------- /submodules/TTS/tests/zoo_tests/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/TTS/tests/zoo_tests/__init__.py -------------------------------------------------------------------------------- /submodules/demucs/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to Demucs 2 | 3 | ## Pull Requests 4 | 5 | In order to accept your pull request, we need you to submit a CLA. You only need 6 | to do this once to work on any of Facebook's open source projects. 7 | 8 | Complete your CLA here: 9 | 10 | Demucs is the implementation of a research paper. 11 | Therefore, we do not plan on accepting many pull requests for new features. 12 | We certainly welcome them for bug fixes. 13 | 14 | 15 | ## Issues 16 | 17 | We use GitHub issues to track public bugs. Please ensure your description is 18 | clear and has sufficient instructions to be able to reproduce the issue. 19 | 20 | 21 | ## License 22 | By contributing to this repository, you agree that your contributions will be licensed 23 | under the LICENSE file in the root directory of this source tree. 24 | -------------------------------------------------------------------------------- /submodules/demucs/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Meta Platforms, Inc. and affiliates. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /submodules/demucs/MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-exclude env * 2 | recursive-include conf *.yaml 3 | include Makefile 4 | include LICENSE 5 | include demucs.png 6 | include outputs.tar.gz 7 | include test.mp3 8 | include requirements.txt 9 | include requirements_minimal.txt 10 | include mypy.ini 11 | include demucs/py.typed 12 | include demucs/remote/*.txt 13 | include demucs/remote/*.yaml 14 | -------------------------------------------------------------------------------- /submodules/demucs/Makefile: -------------------------------------------------------------------------------- 1 | all: linter tests 2 | 3 | linter: 4 | flake8 demucs 5 | mypy demucs 6 | 7 | tests: test_train test_eval 8 | 9 | test_train: tests/musdb 10 | _DORA_TEST_PATH=/tmp/demucs python3 -m dora run --clear \ 11 | dset.musdb=./tests/musdb dset.segment=4 dset.shift=2 epochs=2 model=demucs \ 12 | demucs.depth=2 demucs.channels=4 test.sdr=false misc.num_workers=0 test.workers=0 \ 13 | test.shifts=0 14 | 15 | test_eval: 16 | python3 -m demucs -n demucs_unittest test.mp3 17 | python3 -m demucs -n demucs_unittest --two-stems=vocals test.mp3 18 | python3 -m demucs -n demucs_unittest --mp3 test.mp3 19 | python3 -m demucs -n demucs_unittest --flac --int24 test.mp3 20 | python3 -m demucs -n demucs_unittest --int24 --clip-mode clamp test.mp3 21 | python3 -m demucs -n demucs_unittest --segment 8 test.mp3 22 | python3 -m demucs.api -n demucs_unittest --segment 8 test.mp3 23 | python3 -m demucs --list-models 24 | 25 | tests/musdb: 26 | test -e tests || mkdir tests 27 | python3 -c 'import musdb; musdb.DB("tests/tmp", download=True)' 28 | musdbconvert tests/tmp tests/musdb 29 | 30 | dist: 31 | python3 setup.py sdist 32 | 33 | clean: 34 | rm -r dist build *.egg-info 35 | 36 | .PHONY: linter dist test_train test_eval 37 | -------------------------------------------------------------------------------- /submodules/demucs/conf/dset/aetl.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # automix dataset with Musdb, extra training data and the test set of Musdb. 4 | # This used even more remixes than auto_extra_test. 5 | dset: 6 | wav: /checkpoint/defossez/datasets/aetl 7 | samplerate: 44100 8 | channels: 2 9 | epochs: 320 10 | max_batches: 500 11 | 12 | augment: 13 | shift_same: true 14 | scale: 15 | proba: 0. 16 | remix: 17 | proba: 0 18 | repitch: 19 | proba: 0 20 | -------------------------------------------------------------------------------- /submodules/demucs/conf/dset/auto_extra_test.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # automix dataset with Musdb, extra training data and the test set of Musdb. 4 | dset: 5 | wav: /checkpoint/defossez/datasets/automix_extra_test2 6 | samplerate: 44100 7 | channels: 2 8 | epochs: 320 9 | max_batches: 500 10 | 11 | augment: 12 | shift_same: true 13 | scale: 14 | proba: 0. 15 | remix: 16 | proba: 0 17 | repitch: 18 | proba: 0 19 | -------------------------------------------------------------------------------- /submodules/demucs/conf/dset/auto_mus.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # Automix dataset based on musdb train set. 
4 | dset: 5 | wav: /checkpoint/defossez/datasets/automix_musdb 6 | samplerate: 44100 7 | channels: 2 8 | epochs: 360 9 | max_batches: 300 10 | test: 11 | every: 4 12 | 13 | augment: 14 | shift_same: true 15 | scale: 16 | proba: 0.5 17 | remix: 18 | proba: 0 19 | repitch: 20 | proba: 0 21 | -------------------------------------------------------------------------------- /submodules/demucs/conf/dset/extra44.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # Musdb + extra tracks 4 | dset: 5 | wav: /checkpoint/defossez/datasets/allstems_44/ 6 | samplerate: 44100 7 | channels: 2 8 | epochs: 320 9 | -------------------------------------------------------------------------------- /submodules/demucs/conf/dset/extra_mmi_goodclean.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # Musdb + extra tracks 4 | dset: 5 | wav: /checkpoint/defossez/datasets/allstems_44/ 6 | wav2: /checkpoint/defossez/datasets/mmi44_goodclean 7 | samplerate: 44100 8 | channels: 2 9 | wav2_weight: null 10 | wav2_valid: false 11 | valid_samples: 100 12 | epochs: 1200 13 | -------------------------------------------------------------------------------- /submodules/demucs/conf/dset/extra_test.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # Musdb + extra tracks + test set from musdb. 4 | dset: 5 | wav: /checkpoint/defossez/datasets/allstems_test_44/ 6 | samplerate: 44100 7 | channels: 2 8 | epochs: 320 9 | max_batches: 700 10 | test: 11 | sdr: false 12 | every: 500 13 | -------------------------------------------------------------------------------- /submodules/demucs/conf/dset/musdb44.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | dset: 4 | samplerate: 44100 5 | channels: 2 -------------------------------------------------------------------------------- /submodules/demucs/conf/dset/sdx23_bleeding.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # Musdb + extra tracks 4 | dset: 5 | wav: /shared/home/defossez/data/datasets/moisesdb23_bleeding_v1.0/ 6 | use_musdb: false 7 | samplerate: 44100 8 | channels: 2 9 | backend: soundfile # must use soundfile as some mixture would clip with sox. 10 | epochs: 320 11 | -------------------------------------------------------------------------------- /submodules/demucs/conf/dset/sdx23_labelnoise.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # Musdb + extra tracks 4 | dset: 5 | wav: /shared/home/defossez/data/datasets/moisesdb23_labelnoise_v1.0 6 | use_musdb: false 7 | samplerate: 44100 8 | channels: 2 9 | backend: soundfile # must use soundfile as some mixture would clip with sox. 10 | epochs: 320 11 | -------------------------------------------------------------------------------- /submodules/demucs/conf/svd/base.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | svd: 4 | penalty: 0 5 | min_size: 1 6 | dim: 50 7 | niters: 4 8 | powm: false 9 | proba: 1 10 | conv_only: false 11 | convtr: false # ideally this should be true, but some models were trained with this to false. 
12 | 13 | optim: 14 | beta2: 0.9998 -------------------------------------------------------------------------------- /submodules/demucs/conf/svd/base2.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | svd: 4 | penalty: 0 5 | min_size: 1 6 | dim: 100 7 | niters: 4 8 | powm: false 9 | proba: 1 10 | conv_only: false 11 | convtr: true 12 | 13 | optim: 14 | beta2: 0.9998 -------------------------------------------------------------------------------- /submodules/demucs/conf/svd/default.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | -------------------------------------------------------------------------------- /submodules/demucs/conf/variant/default.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | -------------------------------------------------------------------------------- /submodules/demucs/conf/variant/example.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | model: hdemucs 4 | hdemucs: 5 | channels: 32 -------------------------------------------------------------------------------- /submodules/demucs/conf/variant/finetune.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | epochs: 4 4 | batch_size: 16 5 | optim: 6 | lr: 0.0006 7 | test: 8 | every: 1 9 | sdr: false 10 | dset: 11 | segment: 28 12 | shift: 2 13 | 14 | augment: 15 | scale: 16 | proba: 0 17 | shift_same: true 18 | remix: 19 | proba: 0 20 | -------------------------------------------------------------------------------- /submodules/demucs/demucs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/demucs/demucs.png -------------------------------------------------------------------------------- /submodules/demucs/demucs/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | __version__ = "4.1.0a2" 8 | -------------------------------------------------------------------------------- /submodules/demucs/demucs/__main__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from .separate import main 8 | 9 | if __name__ == '__main__': 10 | main() 11 | -------------------------------------------------------------------------------- /submodules/demucs/demucs/grids/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/demucs/demucs/grids/__init__.py -------------------------------------------------------------------------------- /submodules/demucs/demucs/grids/mdx.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 
2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | """ 7 | Main training for the Track A MDX models. 8 | """ 9 | 10 | from ._explorers import MyExplorer 11 | from ..train import main 12 | 13 | 14 | TRACK_A = ['0d19c1c6', '7ecf8ec1', 'c511e2ab', '7d865c68'] 15 | 16 | 17 | @MyExplorer 18 | def explorer(launcher): 19 | launcher.slurm_( 20 | gpus=8, 21 | time=3 * 24 * 60, 22 | partition='learnlab') 23 | 24 | # Reproduce results from MDX competition Track A 25 | # This trains the first round of models. Once this is trained, 26 | # you will need to schedule `mdx_refine`. 27 | for sig in TRACK_A: 28 | xp = main.get_xp_from_sig(sig) 29 | parent = xp.cfg.continue_from 30 | xp = main.get_xp_from_sig(parent) 31 | launcher(xp.argv) 32 | launcher(xp.argv, {'quant.diffq': 1e-4}) 33 | launcher(xp.argv, {'quant.diffq': 3e-4}) 34 | -------------------------------------------------------------------------------- /submodules/demucs/demucs/grids/mdx_extra.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | """ 7 | Main training for the Track B MDX models. 8 | """ 9 | 10 | from ._explorers import MyExplorer 11 | from ..train import main 12 | 13 | TRACK_B = ['e51eebcc', 'a1d90b5c', '5d2d6c55', 'cfa93e08'] 14 | 15 | 16 | @MyExplorer 17 | def explorer(launcher): 18 | launcher.slurm_( 19 | gpus=8, 20 | time=3 * 24 * 60, 21 | partition='learnlab') 22 | 23 | # Reproduce results from MDX competition Track B 24 | # This trains the Track B models on the extra datasets, 25 | # along with their DiffQ-quantized variants. 26 | for sig in TRACK_B: 27 | while sig is not None: 28 | xp = main.get_xp_from_sig(sig) 29 | sig = xp.cfg.continue_from 30 | 31 | for dset in ['extra44', 'extra_test']: 32 | sub = launcher.bind(xp.argv, dset=dset) 33 | sub() 34 | if dset == 'extra_test': 35 | sub({'quant.diffq': 1e-4}) 36 | sub({'quant.diffq': 3e-4}) 37 | -------------------------------------------------------------------------------- /submodules/demucs/demucs/grids/mdx_refine.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | """ 7 | Main training for the Track A MDX models. 8 | """ 9 | 10 | from ._explorers import MyExplorer 11 | from .mdx import TRACK_A 12 | from ..train import main 13 | 14 | 15 | @MyExplorer 16 | def explorer(launcher): 17 | launcher.slurm_( 18 | gpus=8, 19 | time=3 * 24 * 60, 20 | partition='learnlab') 21 | 22 | # Reproduce results from MDX competition Track A 23 | # WARNING: all the experiments in the `mdx` grid must have completed. 
24 | for sig in TRACK_A: 25 | xp = main.get_xp_from_sig(sig) 26 | launcher(xp.argv) 27 | for diffq in [1e-4, 3e-4]: 28 | xp_src = main.get_xp_from_sig(xp.cfg.continue_from) 29 | q_argv = [f'quant.diffq={diffq}'] 30 | actual_src = main.get_xp(xp_src.argv + q_argv) 31 | actual_src.link.load() 32 | assert len(actual_src.link.history) == actual_src.cfg.epochs 33 | argv = xp.argv + q_argv + [f'continue_from="{actual_src.sig}"'] 34 | launcher(argv) 35 | -------------------------------------------------------------------------------- /submodules/demucs/demucs/grids/sdx23.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from ._explorers import MyExplorer 8 | from dora import Launcher 9 | 10 | 11 | @MyExplorer 12 | def explorer(launcher: Launcher): 13 | launcher.slurm_(gpus=8, time=3 * 24 * 60, partition="speechgpt,learnfair", 14 | mem_per_gpu=None, constraint='') 15 | launcher.bind_({"dset.use_musdb": False}) 16 | 17 | with launcher.job_array(): 18 | launcher(dset='sdx23_bleeding') 19 | launcher(dset='sdx23_labelnoise') 20 | -------------------------------------------------------------------------------- /submodules/demucs/demucs/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/demucs/demucs/py.typed -------------------------------------------------------------------------------- /submodules/demucs/demucs/remote/files.txt: -------------------------------------------------------------------------------- 1 | # MDX Models 2 | root: mdx_final/ 3 | 0d19c1c6-0f06f20e.th 4 | 5d2d6c55-db83574e.th 5 | 7d865c68-3d5dd56b.th 6 | 7ecf8ec1-70f50cc9.th 7 | a1d90b5c-ae9d2452.th 8 | c511e2ab-fe698775.th 9 | cfa93e08-61801ae1.th 10 | e51eebcc-c1b80bdd.th 11 | 6b9c2ca1-3fd82607.th 12 | b72baf4e-8778635e.th 13 | 42e558d4-196e0e1b.th 14 | 305bc58f-18378783.th 15 | 14fc6a69-a89dd0ee.th 16 | 464b36d7-e5a9386e.th 17 | 7fd6ef75-a905dd85.th 18 | 83fc094f-4a16d450.th 19 | 1ef250f1-592467ce.th 20 | 902315c2-b39ce9c9.th 21 | 9a6b4851-03af0aa6.th 22 | fa0cb7f9-100d8bf4.th 23 | # Hybrid Transformer models 24 | root: hybrid_transformer/ 25 | 955717e8-8726e21a.th 26 | f7e0c4bc-ba3fe64a.th 27 | d12395a8-e57c48e6.th 28 | 92cfc3b6-ef3bcb9c.th 29 | 04573f0d-f3cf25b2.th 30 | 75fc33f5-1941ce65.th 31 | # Experimental 6 sources model 32 | 5c90dfd2-34c22ccb.th 33 | -------------------------------------------------------------------------------- /submodules/demucs/demucs/remote/hdemucs_mmi.yaml: -------------------------------------------------------------------------------- 1 | models: ['75fc33f5'] 2 | segment: 44 3 | -------------------------------------------------------------------------------- /submodules/demucs/demucs/remote/htdemucs.yaml: -------------------------------------------------------------------------------- 1 | models: ['955717e8'] 2 | -------------------------------------------------------------------------------- /submodules/demucs/demucs/remote/htdemucs_6s.yaml: -------------------------------------------------------------------------------- 1 | models: ['5c90dfd2'] 2 | -------------------------------------------------------------------------------- /submodules/demucs/demucs/remote/htdemucs_ft.yaml: 
-------------------------------------------------------------------------------- 1 | models: ['f7e0c4bc', 'd12395a8', '92cfc3b6', '04573f0d'] 2 | weights: [ 3 | [1., 0., 0., 0.], 4 | [0., 1., 0., 0.], 5 | [0., 0., 1., 0.], 6 | [0., 0., 0., 1.], 7 | ] -------------------------------------------------------------------------------- /submodules/demucs/demucs/remote/mdx.yaml: -------------------------------------------------------------------------------- 1 | models: ['0d19c1c6', '7ecf8ec1', 'c511e2ab', '7d865c68'] 2 | weights: [ 3 | [1., 1., 0., 0.], 4 | [0., 1., 0., 0.], 5 | [1., 0., 1., 1.], 6 | [1., 0., 1., 1.], 7 | ] 8 | segment: 44 9 | -------------------------------------------------------------------------------- /submodules/demucs/demucs/remote/mdx_extra.yaml: -------------------------------------------------------------------------------- 1 | models: ['e51eebcc', 'a1d90b5c', '5d2d6c55', 'cfa93e08'] 2 | segment: 44 -------------------------------------------------------------------------------- /submodules/demucs/demucs/remote/mdx_extra_q.yaml: -------------------------------------------------------------------------------- 1 | models: ['83fc094f', '464b36d7', '14fc6a69', '7fd6ef75'] 2 | segment: 44 3 | -------------------------------------------------------------------------------- /submodules/demucs/demucs/remote/mdx_q.yaml: -------------------------------------------------------------------------------- 1 | models: ['6b9c2ca1', 'b72baf4e', '42e558d4', '305bc58f'] 2 | weights: [ 3 | [1., 1., 0., 0.], 4 | [0., 1., 0., 0.], 5 | [1., 0., 1., 1.], 6 | [1., 0., 1., 1.], 7 | ] 8 | segment: 44 9 | -------------------------------------------------------------------------------- /submodules/demucs/demucs/remote/repro_mdx_a.yaml: -------------------------------------------------------------------------------- 1 | models: ['9a6b4851', '1ef250f1', 'fa0cb7f9', '902315c2'] 2 | segment: 44 3 | -------------------------------------------------------------------------------- /submodules/demucs/demucs/remote/repro_mdx_a_hybrid_only.yaml: -------------------------------------------------------------------------------- 1 | models: ['fa0cb7f9', '902315c2', 'fa0cb7f9', '902315c2'] 2 | segment: 44 3 | -------------------------------------------------------------------------------- /submodules/demucs/demucs/remote/repro_mdx_a_time_only.yaml: -------------------------------------------------------------------------------- 1 | models: ['9a6b4851', '9a6b4851', '1ef250f1', '1ef250f1'] 2 | segment: 44 3 | -------------------------------------------------------------------------------- /submodules/demucs/demucs/wdemucs.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # For compat 7 | from .hdemucs import HDemucs 8 | 9 | WDemucs = HDemucs 10 | -------------------------------------------------------------------------------- /submodules/demucs/docs/linux.md: -------------------------------------------------------------------------------- 1 | # Linux support for Demucs 2 | 3 | If your distribution has at least Python 3.8, and you just wish to separate 4 | tracks with Demucs, not train it, you can just run 5 | 6 | ```bash 7 | pip3 install --user -U demucs 8 | # Then anytime you want to use demucs, just do 9 | python3 -m demucs -d cpu PATH_TO_AUDIO_FILE_1 10 | # If you have added the user specific pip bin/ folder to your path, you can also do 11 | demucs -d cpu PATH_TO_AUDIO_FILE_1 12 | ``` 13 | 14 | If Python is too old, or you want to be able to train, I recommend [installing Miniconda][miniconda], with Python 3.8 or more. 15 | 16 | ```bash 17 | conda activate 18 | pip3 install -U demucs 19 | # Then anytime you want to use demucs, first do conda activate, then 20 | demucs -d cpu PATH_TO_AUDIO_FILE_1 21 | ``` 22 | 23 | Of course, you can also use a specific env for Demucs. 24 | 25 | **Important, torchaudio 0.12 update:** Torchaudio no longer supports decoding mp3s without ffmpeg installed. You must have ffmpeg installed, either through Anaconda (`conda install ffmpeg -c conda-forge`) or as a distribution package (e.g. `sudo apt-get install ffmpeg`). 26 | 27 | 28 | [miniconda]: https://docs.conda.io/en/latest/miniconda.html#linux-installers 29 | -------------------------------------------------------------------------------- /submodules/demucs/docs/mac.md: -------------------------------------------------------------------------------- 1 | # macOS support for Demucs 2 | 3 | If you have a sufficiently recent version of macOS, you can just run 4 | 5 | ```bash 6 | python3 -m pip install --user -U demucs 7 | # Then anytime you want to use demucs, just do 8 | python3 -m demucs -d cpu PATH_TO_AUDIO_FILE_1 9 | # If you have added the user specific pip bin/ folder to your path, you can also do 10 | demucs -d cpu PATH_TO_AUDIO_FILE_1 11 | ``` 12 | 13 | If you do not already have Anaconda installed or much experience with the terminal on macOS, here are some detailed instructions: 14 | 15 | 1. Download [Anaconda 3.8 (or more recent) 64-bit for macOS][anaconda]: 16 | 2. Open [Anaconda Prompt in macOS][prompt] 17 | 3. Follow these commands: 18 | ```bash 19 | conda activate 20 | pip3 install -U demucs 21 | # Then anytime you want to use demucs, first do conda activate, then 22 | demucs -d cpu PATH_TO_AUDIO_FILE_1 23 | ``` 24 | 25 | **Important, torchaudio 0.12 update:** Torchaudio no longer supports decoding mp3s without ffmpeg installed. You must have ffmpeg installed, either through Anaconda (`conda install ffmpeg -c conda-forge`) or with Homebrew for instance (`brew install ffmpeg`). 
26 | 27 | [anaconda]: https://www.anaconda.com/download 28 | [prompt]: https://docs.anaconda.com/anaconda/user-guide/getting-started/#open-nav-mac 29 | -------------------------------------------------------------------------------- /submodules/demucs/environment-cpu.yml: -------------------------------------------------------------------------------- 1 | name: demucs 2 | 3 | channels: 4 | - pytorch 5 | - conda-forge 6 | 7 | dependencies: 8 | - python>=3.8,<3.10 9 | - ffmpeg>=4.2 10 | - pytorch>=1.8.1 11 | - torchaudio>=0.8 12 | - tqdm>=4.36 13 | - pip 14 | - pip: 15 | - diffq>=0.2 16 | - dora-search 17 | - einops 18 | - hydra-colorlog>=1.1 19 | - hydra-core>=1.1 20 | - julius>=0.2.3 21 | - lameenc>=1.2 22 | - openunmix 23 | - musdb>=0.4.0 24 | - museval>=0.4.0 25 | - soundfile 26 | - submitit 27 | - treetable>=0.2.3 28 | 29 | -------------------------------------------------------------------------------- /submodules/demucs/environment-cuda.yml: -------------------------------------------------------------------------------- 1 | name: demucs 2 | 3 | channels: 4 | - pytorch 5 | - conda-forge 6 | 7 | dependencies: 8 | - python>=3.8,<3.10 9 | - ffmpeg>=4.2 10 | - pytorch>=1.8.1 11 | - torchaudio>=0.8 12 | - cudatoolkit>=10 13 | - tqdm>=4.36 14 | - pip 15 | - pip: 16 | - diffq>=0.2 17 | - dora-search 18 | - einops 19 | - hydra-colorlog>=1.1 20 | - hydra-core>=1.1 21 | - julius>=0.2.3 22 | - lameenc>=1.2 23 | - openunmix 24 | - musdb>=0.4.0 25 | - museval>=0.4.0 26 | - soundfile 27 | - submitit 28 | - treetable>=0.2.3 29 | -------------------------------------------------------------------------------- /submodules/demucs/hubconf.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | dependencies = ['dora-search', 'julius', 'lameenc', 'openunmix', 'pyyaml', 8 | 'torch', 'torchaudio', 'tqdm'] 9 | 10 | from demucs.pretrained import get_model 11 | 12 | -------------------------------------------------------------------------------- /submodules/demucs/mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | 3 | [mypy-treetable,torchaudio.*,diffq,yaml,tqdm,lameenc,musdb,museval,openunmix.*,einops,xformers.*] 4 | ignore_missing_imports = True 5 | 6 | -------------------------------------------------------------------------------- /submodules/demucs/outputs.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/demucs/outputs.tar.gz -------------------------------------------------------------------------------- /submodules/demucs/requirements.txt: -------------------------------------------------------------------------------- 1 | # please make sure you have already a pytorch install that is cuda enabled! 
--------------------------------------------------------------------------------
/submodules/demucs/mypy.ini:
--------------------------------------------------------------------------------
[mypy]

[mypy-treetable,torchaudio.*,diffq,yaml,tqdm,lameenc,musdb,museval,openunmix.*,einops,xformers.*]
ignore_missing_imports = True

--------------------------------------------------------------------------------
/submodules/demucs/outputs.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/demucs/outputs.tar.gz
--------------------------------------------------------------------------------
/submodules/demucs/requirements.txt:
--------------------------------------------------------------------------------
# Please make sure you already have a CUDA-enabled PyTorch install!
dora-search>=0.1.12
diffq>=0.2.1
einops
flake8
hydra-colorlog>=1.1
hydra-core>=1.1
julius>=0.2.3
lameenc>=1.2
museval
mypy
openunmix
pyyaml
submitit
# torch>=1.8.1
# torchaudio>=0.8,<2.1
tqdm
treetable
soundfile>=0.10.3;sys_platform=="win32"
--------------------------------------------------------------------------------
/submodules/demucs/requirements_minimal.txt:
--------------------------------------------------------------------------------
# Please make sure you already have a CUDA-enabled PyTorch install!
dora-search
einops
julius>=0.2.3
lameenc>=1.2
openunmix
pyyaml
# torch>=1.8.1
# torchaudio>=0.8,<2.1
tqdm
--------------------------------------------------------------------------------
/submodules/demucs/setup.cfg:
--------------------------------------------------------------------------------
[pep8]
max-line-length = 100

[flake8]
max-line-length = 100

[yapf]
column_limit = 100
--------------------------------------------------------------------------------
/submodules/demucs/test.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/demucs/test.mp3
--------------------------------------------------------------------------------
/submodules/demucs/tools/__init__.py:
--------------------------------------------------------------------------------
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
--------------------------------------------------------------------------------
/submodules/demucs/tools/test_pretrained.py:
--------------------------------------------------------------------------------
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# Script to evaluate pretrained models.

from argparse import ArgumentParser
import logging
import sys

import torch

from demucs import train, pretrained, evaluate


def main():
    torch.set_num_threads(1)
    logging.basicConfig(stream=sys.stderr, level=logging.INFO)
    parser = ArgumentParser("tools.test_pretrained",
                            description="Evaluate pre-trained models or bags of models "
                                        "on MusDB.")
    pretrained.add_model_flags(parser)
    parser.add_argument('overrides', nargs='*',
                        help='Extra overrides, e.g. test.shifts=2.')
    args = parser.parse_args()

    xp = train.main.get_xp(args.overrides)
    with xp.enter():
        solver = train.get_solver(xp.cfg)

    model = pretrained.get_model_from_args(args)
    solver.model = model.to(solver.device)
    solver.model.eval()

    with torch.no_grad():
        results = evaluate.evaluate(solver, xp.cfg.test.sdr)
    print(results)


if __name__ == '__main__':
    main()
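For reference, an invocation might look like the following. The model-selection flags come from pretrained.add_model_flags, which is not shown here, so treat the -n flag and the model name as assumptions; the trailing argument is one of the Dora/Hydra overrides the help text mentions:

```bash
# Hypothetical invocation; flag and model names are assumptions.
python3 -m tools.test_pretrained -n mdx_extra_q test.shifts=2
```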
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/submodules/whisper/MANIFEST.in:
--------------------------------------------------------------------------------
include requirements.txt
include README.md
include LICENSE
include whisper/assets/*
include whisper/normalizers/english.json
--------------------------------------------------------------------------------
/submodules/whisper/approach.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/whisper/approach.png
--------------------------------------------------------------------------------
/submodules/whisper/pyproject.toml:
--------------------------------------------------------------------------------
[tool.black]

[tool.isort]
profile = "black"
include_trailing_comma = true
line_length = 88
multi_line_output = 3

--------------------------------------------------------------------------------
/submodules/whisper/requirements.txt:
--------------------------------------------------------------------------------
numba
numpy
# torch
tqdm
more-itertools
tiktoken
triton>=2.0.0,<3;platform_machine=="x86_64" and (sys_platform=="linux" or sys_platform=="linux2")
--------------------------------------------------------------------------------
/submodules/whisper/setup.py:
--------------------------------------------------------------------------------
import platform
import sys
from pathlib import Path

import pkg_resources
from setuptools import find_packages, setup


def read_version(fname="whisper/version.py"):
    # Execute version.py and pull __version__ out of its namespace.
    exec(compile(open(fname, encoding="utf-8").read(), fname, "exec"))
    return locals()["__version__"]


requirements = []
if sys.platform.startswith("linux") and platform.machine() == "x86_64":
    requirements.append("triton>=2.0.0,<3")

setup(
    name="openai-whisper",
    py_modules=["whisper"],
    version=read_version(),
    description="Robust Speech Recognition via Large-Scale Weak Supervision",
    long_description=open("README.md", encoding="utf-8").read(),
    long_description_content_type="text/markdown",
    readme="README.md",
    python_requires=">=3.8",
    author="OpenAI",
    url="https://github.com/openai/whisper",
    license="MIT",
    packages=find_packages(exclude=["tests*"]),
    install_requires=requirements
    + [
        str(r)
        for r in pkg_resources.parse_requirements(
            Path(__file__).with_name("requirements.txt").open()
        )
    ],
    entry_points={
        "console_scripts": ["whisper=whisper.transcribe:cli"],
    },
    include_package_data=True,
    extras_require={"dev": ["pytest", "scipy", "black", "flake8", "isort"]},
)
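The console_scripts entry point above installs the `whisper` CLI, and the same package is usable from Python. A minimal usage sketch (the model size and the audio path are placeholders):

```python
import whisper

# Download/load a pretrained checkpoint and transcribe a file.
model = whisper.load_model("base")
result = model.transcribe("audio.mp3")
print(result["text"])
```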
--------------------------------------------------------------------------------
/submodules/whisper/tests/conftest.py:
--------------------------------------------------------------------------------
import random as rand

import numpy
import pytest


def pytest_configure(config):
    config.addinivalue_line("markers", "requires_cuda")


@pytest.fixture
def random():
    rand.seed(42)
    numpy.random.seed(42)
--------------------------------------------------------------------------------
/submodules/whisper/tests/jfk.flac:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/whisper/tests/jfk.flac
--------------------------------------------------------------------------------
/submodules/whisper/tests/test_audio.py:
--------------------------------------------------------------------------------
import os.path

import numpy as np

from whisper.audio import SAMPLE_RATE, load_audio, log_mel_spectrogram


def test_audio():
    audio_path = os.path.join(os.path.dirname(__file__), "jfk.flac")
    audio = load_audio(audio_path)
    assert audio.ndim == 1
    assert SAMPLE_RATE * 10 < audio.shape[0] < SAMPLE_RATE * 12
    assert 0 < audio.std() < 1

    mel_from_audio = log_mel_spectrogram(audio)
    mel_from_file = log_mel_spectrogram(audio_path)

    assert np.allclose(mel_from_audio, mel_from_file)
    assert mel_from_audio.max() - mel_from_audio.min() <= 2.0
--------------------------------------------------------------------------------
/submodules/whisper/tests/test_tokenizer.py:
--------------------------------------------------------------------------------
import pytest

from whisper.tokenizer import get_tokenizer


@pytest.mark.parametrize("multilingual", [True, False])
def test_tokenizer(multilingual):
    tokenizer = get_tokenizer(multilingual=multilingual)
    assert tokenizer.sot in tokenizer.sot_sequence
    assert len(tokenizer.all_language_codes) == len(tokenizer.all_language_tokens)
    assert all(c < tokenizer.timestamp_begin for c in tokenizer.all_language_tokens)


def test_multilingual_tokenizer():
    gpt2_tokenizer = get_tokenizer(multilingual=False)
    multilingual_tokenizer = get_tokenizer(multilingual=True)

    text = "다람쥐 헌 쳇바퀴에 타고파"
    gpt2_tokens = gpt2_tokenizer.encode(text)
    multilingual_tokens = multilingual_tokenizer.encode(text)

    assert gpt2_tokenizer.decode(gpt2_tokens) == text
    assert multilingual_tokenizer.decode(multilingual_tokens) == text
    assert len(gpt2_tokens) > len(multilingual_tokens)


def test_split_on_unicode():
    multilingual_tokenizer = get_tokenizer(multilingual=True)

    tokens = [8404, 871, 287, 6, 246, 526, 3210, 20378]
    words, word_tokens = multilingual_tokenizer.split_tokens_on_unicode(tokens)

    assert words == [" elle", " est", " l", "'", "\ufffd", "é", "rit", "oire"]
    assert word_tokens == [[8404], [871], [287], [6], [246], [526], [3210], [20378]]
--------------------------------------------------------------------------------
/submodules/whisper/whisper/__main__.py:
--------------------------------------------------------------------------------
from .transcribe import cli

cli()
--------------------------------------------------------------------------------
/submodules/whisper/whisper/assets/mel_filters.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/whisper/whisper/assets/mel_filters.npz
--------------------------------------------------------------------------------
/submodules/whisper/whisper/normalizers/__init__.py:
--------------------------------------------------------------------------------
from .basic import BasicTextNormalizer as BasicTextNormalizer
from .english import EnglishTextNormalizer as EnglishTextNormalizer
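These re-exports are the text normalizers typically used to make transcript comparisons (e.g. WER) fair; a quick sketch (the exact normalized output is illustrative, not verified here):

```python
from whisper.normalizers import EnglishTextNormalizer

normalizer = EnglishTextNormalizer()
# Lowercases, strips punctuation, and standardizes spelled-out forms
# so that reference and hypothesis transcripts compare fairly.
print(normalizer("Mr. Smith, it's 10:30 AM!"))
```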
--------------------------------------------------------------------------------
/submodules/whisper/whisper/version.py:
--------------------------------------------------------------------------------
__version__ = "20231117"
--------------------------------------------------------------------------------
/submodules/whisperX/EXAMPLES.md:
--------------------------------------------------------------------------------
# More Examples

## Other Languages

For non-English ASR, it is best to use the `large` whisper model. Alignment models are picked automatically for the chosen language from the default [lists](https://github.com/m-bain/whisperX/blob/main/whisperx/alignment.py#L18).

Default alignment models are currently provided and tested for {en, fr, de, es, it, ja, zh, nl}.

If the detected language is not in this list, you need to find a phoneme-based ASR model on the [huggingface model hub](https://huggingface.co/models) and test it on your data (see the sketch at the end of this file).

### French
    whisperx --model large --language fr examples/sample_fr_01.wav

https://user-images.githubusercontent.com/36994049/208298804-31c49d6f-6787-444e-a53f-e93c52706752.mov

### German
    whisperx --model large --language de examples/sample_de_01.wav

https://user-images.githubusercontent.com/36994049/208298811-e36002ba-3698-4731-97d4-0aebd07e0eb3.mov

### Italian
    whisperx --model large --language it examples/sample_it_01.wav

https://user-images.githubusercontent.com/36994049/208298819-6f462b2c-8cae-4c54-b8e1-90855794efc7.mov

### Japanese
    whisperx --model large --language ja examples/sample_ja_01.wav

https://user-images.githubusercontent.com/19920981/208731743-311f2360-b73b-4c60-809d-aaf3cd7e06f4.mov
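For a language outside the default list, a phoneme-level wav2vec2 checkpoint from the Hugging Face hub can be passed in explicitly. A sketch of the pattern (the Greek model name and audio path are illustrative assumptions, not tested recommendations):

```python
import whisperx

device = "cuda"
audio = whisperx.load_audio("examples/sample_el_01.wav")  # hypothetical file

# model_name overrides the default alignment model for the language code.
align_model, metadata = whisperx.load_align_model(
    language_code="el", device=device,
    model_name="jonatasgrosman/wav2vec2-large-xlsr-53-greek",
)
```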
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/submodules/whisperX/MANIFEST.in:
--------------------------------------------------------------------------------
include whisperx/assets/*
include whisperx/assets/gpt2/*
include whisperx/assets/multilingual/*
include whisperx/normalizers/english.json
--------------------------------------------------------------------------------
/submodules/whisperX/figures/pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/submodules/whisperX/figures/pipeline.png
--------------------------------------------------------------------------------
/submodules/whisperX/requirements.txt:
--------------------------------------------------------------------------------
# torch>=2
# torchaudio>=2
faster-whisper==1.0.0
transformers
pandas
setuptools>=65
nltk
--------------------------------------------------------------------------------
/submodules/whisperX/setup.py:
--------------------------------------------------------------------------------
import os

import pkg_resources
from setuptools import find_packages, setup

setup(
    name="whisperx",
    py_modules=["whisperx"],
    version="3.1.1",
    description="Time-Accurate Automatic Speech Recognition using Whisper.",
    readme="README.md",
    python_requires=">=3.8",
    author="Max Bain",
    url="https://github.com/m-bain/whisperx",
    license="BSD-2-Clause",
    packages=find_packages(exclude=["tests*"]),
    install_requires=[
        str(r)
        for r in pkg_resources.parse_requirements(
            open(os.path.join(os.path.dirname(__file__), "requirements.txt"))
        )
    ]
    + ["pyannote.audio==3.1.1"],
    entry_points={
        "console_scripts": ["whisperx=whisperx.transcribe:cli"],
    },
    include_package_data=True,
    extras_require={"dev": ["pytest"]},
)
--------------------------------------------------------------------------------
/submodules/whisperX/whisperx/__init__.py:
--------------------------------------------------------------------------------
from .transcribe import load_model
from .alignment import load_align_model, align
from .audio import load_audio
from .diarize import assign_word_speakers, DiarizationPipeline
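These four re-exports are the public API used throughout this project; a minimal transcribe-then-align sketch (the model size, device, batch size, and audio path are placeholder assumptions):

```python
import whisperx

device = "cuda"
audio = whisperx.load_audio("audio.mp3")

# 1. Transcribe with the batched Whisper backend.
model = whisperx.load_model("large-v2", device, compute_type="float16")
result = model.transcribe(audio, batch_size=16)

# 2. Align the segments to get word-level timestamps.
align_model, metadata = whisperx.load_align_model(
    language_code=result["language"], device=device)
result = whisperx.align(result["segments"], align_model, metadata, audio, device)

print(result["word_segments"][:5])
```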
7 | """ 8 | word: str 9 | start: float 10 | end: float 11 | score: float 12 | 13 | class SingleCharSegment(TypedDict): 14 | """ 15 | A single char of a speech. 16 | """ 17 | char: str 18 | start: float 19 | end: float 20 | score: float 21 | 22 | 23 | class SingleSegment(TypedDict): 24 | """ 25 | A single segment (up to multiple sentences) of a speech. 26 | """ 27 | 28 | start: float 29 | end: float 30 | text: str 31 | 32 | 33 | class SingleAlignedSegment(TypedDict): 34 | """ 35 | A single segment (up to multiple sentences) of a speech with word alignment. 36 | """ 37 | 38 | start: float 39 | end: float 40 | text: str 41 | words: List[SingleWordSegment] 42 | chars: Optional[List[SingleCharSegment]] 43 | 44 | 45 | class TranscriptionResult(TypedDict): 46 | """ 47 | A list of segments and word segments of a speech. 48 | """ 49 | segments: List[SingleSegment] 50 | language: str 51 | 52 | 53 | class AlignedTranscriptionResult(TypedDict): 54 | """ 55 | A list of segments and word segments of a speech. 56 | """ 57 | segments: List[SingleAlignedSegment] 58 | word_segments: List[SingleWordSegment] 59 | -------------------------------------------------------------------------------- /tabs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kedreamix/Linly-Dubbing/5677191ee544afae8250cee8e801c03839bcba24/tabs/__init__.py -------------------------------------------------------------------------------- /tabs/linly_talker_tab.py: -------------------------------------------------------------------------------- 1 | import os 2 | from PySide6.QtWidgets import (QWidget, QVBoxLayout, QLabel, QLineEdit, 3 | QComboBox, QMessageBox) 4 | 5 | from ui_components import VideoPlayer 6 | 7 | 8 | class LinlyTalkerTab(QWidget): 9 | def __init__(self, parent=None): 10 | super().__init__(parent) 11 | self.layout = QVBoxLayout(self) 12 | 13 | # 视频文件夹 14 | self.video_folder = QLineEdit("videos") 15 | self.layout.addWidget(QLabel("视频文件夹")) 16 | self.layout.addWidget(self.video_folder) 17 | 18 | # AI配音方式 19 | self.talker_method = QComboBox() 20 | self.talker_method.addItems(['Wav2Lip', 'Wav2Lipv2', 'SadTalker']) 21 | self.layout.addWidget(QLabel("AI配音方式")) 22 | self.layout.addWidget(self.talker_method) 23 | 24 | # 施工中提示 25 | construction_label = QLabel("施工中,请静候佳音 可参考 https://github.com/Kedreamix/Linly-Talker") 26 | construction_label.setOpenExternalLinks(True) 27 | self.layout.addWidget(construction_label) 28 | 29 | # 状态显示 30 | self.status_label = QLabel("功能开发中") 31 | self.layout.addWidget(QLabel("合成状态:")) 32 | self.layout.addWidget(self.status_label) 33 | 34 | # 视频播放器 35 | self.video_player = VideoPlayer("合成视频") 36 | self.layout.addWidget(self.video_player) 37 | 38 | self.setLayout(self.layout) -------------------------------------------------------------------------------- /tools/step031_translation_openai.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | from openai import OpenAI 4 | from dotenv import load_dotenv 5 | from loguru import logger 6 | 7 | extra_body = { 8 | 'repetition_penalty': 1.1, 9 | } 10 | model_name = os.getenv('MODEL_NAME', 'gpt-3.5-turbo') 11 | def openai_response(messages): 12 | client = OpenAI( 13 | # This is the default and can be omitted 14 | base_url=os.getenv('OPENAI_API_BASE', 'https://api.openai.com/v1'), 15 | api_key=os.getenv('OPENAI_API_KEY') 16 | ) 17 | if 'gpt' not in model_name: 18 | model_name = 'gpt-3.5-turbo' 19 | response = 
--------------------------------------------------------------------------------
/tools/step033_translation_translator.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import translators as ts
from dotenv import load_dotenv
from loguru import logger
load_dotenv()


def translator_response(messages, to_language='zh-CN', translator_server='bing'):
    if '中文' in to_language:
        to_language = 'zh-CN'
    elif 'English' in to_language:
        to_language = 'en'
    translation = ''
    for retry in range(3):
        try:
            translation = ts.translate_text(query_text=messages, translator=translator_server,
                                            from_language='auto', to_language=to_language)
            break
        except Exception as e:
            logger.info(f'translate failed! {e}')
            print('translate failed!')
    return translation


if __name__ == '__main__':
    response = translator_response('Hello, how are you?', '中文', 'bing')
    print(response)
    response = translator_response('你好,最近怎么样? ', 'en', 'google')
    print(response)
--------------------------------------------------------------------------------
/tools/step035_translation_qwen.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import os
from openai import OpenAI
from dotenv import load_dotenv

load_dotenv()

extra_body = {
    'repetition_penalty': 1.1,
}
model_name = os.getenv('QWEN_MODEL_ID', 'qwen-max-2025-01-25')


def qwen_response(messages):
    client = OpenAI(
        # DashScope exposes an OpenAI-compatible endpoint.
        base_url=os.getenv('QWEN_API_BASE', 'https://dashscope.aliyuncs.com/compatible-mode/v1'),
        api_key=os.getenv('QWEN_API_KEY')
    )
    response = client.chat.completions.create(
        model=model_name,
        messages=messages,
        timeout=240,
        extra_body=extra_body
    )
    return response.choices[0].message.content


if __name__ == '__main__':
    test_message = [{"role": "user", "content": "你好,介绍一下你自己"}]
    response = qwen_response(test_message)
    print(response)
--------------------------------------------------------------------------------
/tools/step044_tts_edge_tts.py:
--------------------------------------------------------------------------------
import os
from loguru import logger


# Voice presets for Chinese/English/Japanese/Cantonese/Korean
language_map = {
    '中文': 'zh-CN-XiaoxiaoNeural',
    'English': 'en-US-MichelleNeural',
    'Japanese': 'ja-JP-NanamiNeural',
    '粤语': 'zh-HK-HiuMaanNeural',
    'Korean': 'ko-KR-SunHiNeural'
}


def tts(text, output_path, target_language='中文', voice='zh-CN-XiaoxiaoNeural'):
    if os.path.exists(output_path):
        logger.info(f'TTS for "{text}" already exists')
        return
    for retry in range(3):
        try:
            # Note: edge-tts writes an .mp3 next to the requested .wav path.
            os.system(f'edge-tts --text "{text}" --write-media "{output_path.replace(".wav", ".mp3")}" --voice {voice}')
            logger.info(f'TTS {text}')
            break
        except Exception as e:
            logger.warning(f'TTS for "{text}" failed')
            logger.warning(e)


if __name__ == '__main__':
    speaker_wav = r'videos/村长台钓加拿大/20240805 英文无字幕 阿里这小子在水城威尼斯发来问候/audio_vocals.wav'
    while True:
        text = input('请输入:')
        tts(text, f'playground/{text}.wav', target_language='中文')
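Shelling out to the edge-tts CLI via os.system works, but it is fragile around quoting in the text. For reference, the same package also exposes an async Python API; a sketch, assuming the edge-tts package (which provides edge_tts.Communicate) is installed:

```python
import asyncio

import edge_tts


async def tts_async(text: str, output_path: str,
                    voice: str = 'zh-CN-XiaoxiaoNeural') -> None:
    # Stream the synthesized audio straight to a file, with no shell quoting.
    communicate = edge_tts.Communicate(text, voice)
    await communicate.save(output_path)


if __name__ == '__main__':
    asyncio.run(tts_async('你好,世界', 'playground/hello.mp3'))
```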
".mp3")}" --voice {voice}') 30 | logger.info(f'TTS {text}') 31 | break 32 | except Exception as e: 33 | logger.warning(f'TTS {text} 失败') 34 | logger.warning(e) 35 | 36 | 37 | if __name__ == '__main__': 38 | speaker_wav = r'videos/村长台钓加拿大/20240805 英文无字幕 阿里这小子在水城威尼斯发来问候/audio_vocals.wav' 39 | while True: 40 | text = input('请输入:') 41 | tts(text, f'playground/{text}.wav', target_language='中文') 42 | 43 | -------------------------------------------------------------------------------- /问题参考汇总.md: -------------------------------------------------------------------------------- 1 | # 问题参考汇总 2 | 3 | ## 目录 Content 4 | 5 | - [yt-dlp下载失败](#yt-dlp下载失败) 6 | - [Could not load library libcudnn_ops_infer.so.8](#could-not-load-library-libcudnn_ops_inferso8) 7 | 8 | ## yt-dlp下载失败 9 | 10 | 有时下载失败可能是由于缺少cookie引起的。可以通过以下命令生成一个`cookies.txt`文件,并将其放在程序的根目录下解决问题(可在本地生成然后上传到服务器)。 11 | 12 | > 参考链接:https://github.com/yt-dlp/yt-dlp/wiki/FAQ 13 | 14 | ```bash 15 | yt-dlp --cookies-from-browser chrome --cookies cookies.txt 16 | ``` 17 | 18 | ## Could not load library libcudnn_ops_infer.so.8 19 | 20 | 此错误通常是由于找不到库文件路径,可以通过设置`torch`的路径来解决,下面的命令可以用来设置环境变量`LD_LIBRARY_PATH` 21 | 22 | > 参考链接:https://github.com/SYSTRAN/faster-whisper/issues/516 23 | 24 | ```bash 25 | export LD_LIBRARY_PATH=`python3 -c 'import os; import torch; print(os.path.dirname(os.path.dirname(torch.__file__)) +"/nvidia/cudnn/lib")'`:$LD_LIBRARY_PATH 26 | # 你也可以尝试以下命令 27 | # export LD_LIBRARY_PATH=`python3 -c 'import os; import nvidia.cublas.lib; import nvidia.cudnn.lib; print(os.path.dirname(nvidia.cublas.lib.__file__) + ":" + os.path.dirname(nvidia.cudnn.lib.__file__))'`:$LD_LIBRARY_PATH 28 | ``` 29 | 30 | --------------------------------------------------------------------------------