├── .DS_Store ├── Communication.md ├── LICENSE ├── assets ├── .DS_Store ├── QR.png ├── adapted_crosstalk_cover.png ├── adapted_standupcomedy_cover.png ├── airencuoguo_cover.png ├── audio_performance.jpg ├── cover_16-9.png ├── cover_3-4.png ├── cover_4-3.png ├── crosstalk_original_cover.png ├── dune_news_cover.png ├── dune_original_cover.png ├── dune_youtube.png ├── edit_workflow.png ├── eva1.png ├── eva2.png ├── eva3.jpg ├── eva4.jpg ├── eval1_audio_new.png ├── eval1_video_new.png ├── framework.jpg ├── grok4.png ├── interstella_cover.png ├── interstella_cover_love.png ├── joylife_cover.png ├── logo.png ├── logo_new.png ├── masterma_cover.png ├── masterma_original_cover.png ├── nezha_cover.png ├── openai_news_cover.png ├── overview.png ├── spiderman_cover.jpg ├── spiderman_cover.png ├── spiderman_new.jpg ├── standup_original_cover.png ├── tech_news_original_cover.png ├── titanic_cover.png ├── video_performance.jpg ├── xiaomingjianmo1_cover.png ├── xiaomingjianmo_findyourproblem_meme.png ├── xiaomingjianmo_mvp_cover.png ├── xiaomingjianmo_original_cover.png └── youhebuke_cover.png ├── dataset ├── presentation_style │ ├── commentary_present.txt │ ├── summarization_present.txt │ └── video_overview_present.txt └── voice │ └── ava_16k.wav ├── demos_documents.md ├── environment ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-310.pyc │ └── utils.cpython-310.pyc ├── agents │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-310.pyc │ │ ├── base.cpython-310.pyc │ │ ├── graph.cpython-310.pyc │ │ └── multi.cpython-310.pyc │ ├── base.py │ └── multi.py ├── config │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-310.pyc │ │ ├── config.cpython-310.pyc │ │ └── llm.cpython-310.pyc │ ├── check.py │ ├── config.py │ ├── config.yml │ ├── graph.txt │ ├── intents.yml │ ├── llm.py │ ├── registry.json │ └── user.yml ├── roles │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-310.pyc │ │ ├── audio_extractor.cpython-310.pyc │ │ ├── 
loudness_normalizer.cpython-310.pyc │ │ ├── merge.cpython-310.pyc │ │ ├── mixer.cpython-310.pyc │ │ ├── resampler.cpython-310.pyc │ │ ├── separator.cpython-310.pyc │ │ ├── transcriber.cpython-310.pyc │ │ ├── vid_conversion.cpython-310.pyc │ │ ├── vid_editor.cpython-310.pyc │ │ ├── vid_editor_base.cpython-310.pyc │ │ ├── vid_preloader.cpython-310.pyc │ │ ├── vid_searcher.cpython-310.pyc │ │ ├── vid_searcher_base.cpython-310.pyc │ │ └── voice_generator.cpython-310.pyc │ ├── audio_extractor.py │ ├── cross_talk │ │ ├── __pycache__ │ │ │ ├── cross_talk_adapter.cpython-310.pyc │ │ │ ├── cross_talk_conversion.cpython-310.pyc │ │ │ └── cross_talk_synth.cpython-310.pyc │ │ ├── cross_talk_adapter.py │ │ ├── cross_talk_conversion.py │ │ └── cross_talk_synth.py │ ├── loudness_normalizer.py │ ├── merge.py │ ├── mixer.py │ ├── resampler.py │ ├── separator.py │ ├── stand_up │ │ ├── __pycache__ │ │ │ ├── stand_up_adapter.cpython-310.pyc │ │ │ ├── stand_up_conversion.cpython-310.pyc │ │ │ └── stand_up_synth.cpython-310.pyc │ │ ├── stand_up_adapter.py │ │ ├── stand_up_conversion.py │ │ └── stand_up_synth.py │ ├── svc │ │ ├── __pycache__ │ │ │ ├── svc_adapter.cpython-310.pyc │ │ │ ├── svc_analyzer.cpython-310.pyc │ │ │ ├── svc_conversion.cpython-310.pyc │ │ │ ├── svc_coverist.cpython-310.pyc │ │ │ └── svc_single.cpython-310.pyc │ │ ├── svc_adapter.py │ │ ├── svc_analyzer.py │ │ ├── svc_conversion.py │ │ ├── svc_coverist.py │ │ └── svc_single.py │ ├── transcriber.py │ ├── tts │ │ ├── __pycache__ │ │ │ ├── tts_infer.cpython-310.pyc │ │ │ ├── tts_replace.cpython-310.pyc │ │ │ ├── tts_slicer.cpython-310.pyc │ │ │ └── tts_writer.cpython-310.pyc │ │ ├── tts_infer.py │ │ ├── tts_replace.py │ │ ├── tts_slicer.py │ │ └── tts_writer.py │ ├── vid_comm │ │ ├── __pycache__ │ │ │ └── comm_story_gen.cpython-310.pyc │ │ └── comm_story_gen.py │ ├── vid_conversion.py │ ├── vid_editor.py │ ├── vid_news │ │ ├── __pycache__ │ │ │ └── news_story_gen.cpython-310.pyc │ │ └── news_story_gen.py │ ├── 
vid_preloader.py │ ├── vid_qa │ │ ├── __pycache__ │ │ │ ├── content_loader copy.cpython-310.pyc │ │ │ ├── content_loader.cpython-310.pyc │ │ │ └── content_loader_base.cpython-310.pyc │ │ └── content_loader.py │ ├── vid_rhythm │ │ ├── __pycache__ │ │ │ ├── rhythm_detector.cpython-310.pyc │ │ │ └── rhythm_story_gen.cpython-310.pyc │ │ ├── rhythm_detector.py │ │ └── rhythm_story_gen.py │ ├── vid_searcher.py │ ├── vid_summ │ │ ├── __pycache__ │ │ │ ├── summ_loader.cpython-310.pyc │ │ │ └── summ_loader_base.cpython-310.pyc │ │ └── summ_loader.py │ └── voice_generator.py └── utils.py ├── main.py ├── pyproject.toml ├── readme.md ├── readme_zh.md ├── requirements.txt └── tools ├── .gitkeep ├── CosyVoice ├── .gitignore ├── .gitmodules ├── CODE_OF_CONDUCT.md ├── FAQ.md ├── LICENSE ├── README.md ├── cosyvoice │ ├── __init__.py │ ├── bin │ │ ├── average_model.py │ │ ├── export_jit.py │ │ ├── export_onnx.py │ │ ├── export_trt.sh │ │ ├── inference.py │ │ └── train.py │ ├── cli │ │ ├── __init__.py │ │ ├── cosyvoice.py │ │ ├── frontend.py │ │ └── model.py │ ├── dataset │ │ ├── __init__.py │ │ ├── dataset.py │ │ └── processor.py │ ├── flow │ │ ├── decoder.py │ │ ├── flow.py │ │ ├── flow_matching.py │ │ └── length_regulator.py │ ├── hifigan │ │ ├── discriminator.py │ │ ├── f0_predictor.py │ │ ├── generator.py │ │ └── hifigan.py │ ├── llm │ │ └── llm.py │ ├── tokenizer │ │ ├── assets │ │ │ └── multilingual_zh_ja_yue_char_del.tiktoken │ │ └── tokenizer.py │ ├── transformer │ │ ├── __init__.py │ │ ├── activation.py │ │ ├── attention.py │ │ ├── convolution.py │ │ ├── decoder.py │ │ ├── decoder_layer.py │ │ ├── embedding.py │ │ ├── encoder.py │ │ ├── encoder_layer.py │ │ ├── label_smoothing_loss.py │ │ ├── positionwise_feed_forward.py │ │ ├── subsampling.py │ │ └── upsample_encoder.py │ └── utils │ │ ├── __init__.py │ │ ├── class_utils.py │ │ ├── common.py │ │ ├── executor.py │ │ ├── file_utils.py │ │ ├── frontend_utils.py │ │ ├── losses.py │ │ ├── mask.py │ │ ├── scheduler.py │ │ └── 
train_utils.py ├── docker │ └── Dockerfile ├── examples │ ├── libritts │ │ ├── cosyvoice │ │ │ ├── conf │ │ │ │ ├── cosyvoice.fromscratch.yaml │ │ │ │ ├── cosyvoice.yaml │ │ │ │ └── ds_stage2.json │ │ │ ├── cosyvoice │ │ │ ├── local │ │ │ │ ├── download_and_untar.sh │ │ │ │ └── prepare_data.py │ │ │ ├── path.sh │ │ │ ├── run.sh │ │ │ ├── tools │ │ │ └── tts_text.json │ │ └── cosyvoice2 │ │ │ ├── cosyvoice │ │ │ └── tools │ └── magicdata-read │ │ └── cosyvoice │ │ ├── conf │ │ ├── cosyvoice.fromscratch.yaml │ │ ├── cosyvoice.yaml │ │ └── ds_stage2.json │ │ ├── cosyvoice │ │ ├── local │ │ ├── download_and_untar.sh │ │ └── prepare_data.py │ │ ├── path.sh │ │ ├── run.sh │ │ ├── tools │ │ └── tts_text.json ├── requirements.txt ├── runtime │ └── python │ │ ├── Dockerfile │ │ ├── fastapi │ │ ├── client.py │ │ └── server.py │ │ └── grpc │ │ ├── client.py │ │ ├── cosyvoice.proto │ │ └── server.py ├── third_party │ └── Matcha-TTS │ │ ├── .env.example │ │ ├── .github │ │ ├── PULL_REQUEST_TEMPLATE.md │ │ ├── codecov.yml │ │ ├── dependabot.yml │ │ └── release-drafter.yml │ │ ├── .gitignore │ │ ├── .pre-commit-config.yaml │ │ ├── .project-root │ │ ├── .pylintrc │ │ ├── LICENSE │ │ ├── MANIFEST.in │ │ ├── Makefile │ │ ├── README.md │ │ ├── configs │ │ ├── __init__.py │ │ ├── callbacks │ │ │ ├── default.yaml │ │ │ ├── model_checkpoint.yaml │ │ │ ├── model_summary.yaml │ │ │ ├── none.yaml │ │ │ └── rich_progress_bar.yaml │ │ ├── debug │ │ │ ├── default.yaml │ │ │ ├── fdr.yaml │ │ │ ├── limit.yaml │ │ │ ├── overfit.yaml │ │ │ └── profiler.yaml │ │ ├── eval.yaml │ │ ├── experiment │ │ │ ├── hifi_dataset_piper_phonemizer.yaml │ │ │ ├── ljspeech.yaml │ │ │ ├── ljspeech_min_memory.yaml │ │ │ └── multispeaker.yaml │ │ ├── extras │ │ │ └── default.yaml │ │ ├── hparams_search │ │ │ └── mnist_optuna.yaml │ │ ├── hydra │ │ │ └── default.yaml │ │ ├── local │ │ │ └── .gitkeep │ │ ├── logger │ │ │ ├── aim.yaml │ │ │ ├── comet.yaml │ │ │ ├── csv.yaml │ │ │ ├── many_loggers.yaml │ │ │ ├── 
mlflow.yaml │ │ │ ├── neptune.yaml │ │ │ ├── tensorboard.yaml │ │ │ └── wandb.yaml │ │ ├── model │ │ │ ├── cfm │ │ │ │ └── default.yaml │ │ │ ├── decoder │ │ │ │ └── default.yaml │ │ │ ├── encoder │ │ │ │ └── default.yaml │ │ │ ├── matcha.yaml │ │ │ └── optimizer │ │ │ │ └── adam.yaml │ │ ├── paths │ │ │ └── default.yaml │ │ ├── train.yaml │ │ └── trainer │ │ │ ├── cpu.yaml │ │ │ ├── ddp.yaml │ │ │ ├── ddp_sim.yaml │ │ │ ├── default.yaml │ │ │ ├── gpu.yaml │ │ │ └── mps.yaml │ │ ├── matcha │ │ ├── VERSION │ │ ├── __init__.py │ │ ├── app.py │ │ ├── cli.py │ │ ├── hifigan │ │ │ ├── LICENSE │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── config.py │ │ │ ├── denoiser.py │ │ │ ├── env.py │ │ │ ├── meldataset.py │ │ │ ├── models.py │ │ │ └── xutils.py │ │ ├── models │ │ │ ├── __init__.py │ │ │ ├── baselightningmodule.py │ │ │ ├── components │ │ │ │ ├── __init__.py │ │ │ │ ├── decoder.py │ │ │ │ ├── flow_matching.py │ │ │ │ ├── text_encoder.py │ │ │ │ └── transformer.py │ │ │ └── matcha_tts.py │ │ ├── onnx │ │ │ ├── __init__.py │ │ │ ├── export.py │ │ │ └── infer.py │ │ ├── text │ │ │ ├── __init__.py │ │ │ ├── cleaners.py │ │ │ ├── numbers.py │ │ │ └── symbols.py │ │ ├── train.py │ │ └── utils │ │ │ ├── __init__.py │ │ │ ├── audio.py │ │ │ ├── generate_data_statistics.py │ │ │ ├── instantiators.py │ │ │ ├── logging_utils.py │ │ │ ├── model.py │ │ │ ├── monotonic_align │ │ │ ├── __init__.py │ │ │ ├── core.pyx │ │ │ └── setup.py │ │ │ ├── pylogger.py │ │ │ ├── rich_utils.py │ │ │ └── utils.py │ │ ├── notebooks │ │ └── .gitkeep │ │ ├── pyproject.toml │ │ ├── requirements.txt │ │ ├── scripts │ │ └── schedule.sh │ │ ├── setup.py │ │ └── synthesis.ipynb ├── tools │ ├── extract_embedding.py │ ├── extract_speech_token.py │ └── make_parquet_list.py └── webui.py ├── DiffSinger ├── .gitignore ├── LICENSE ├── README.md ├── configs │ ├── config_base.yaml │ ├── singing │ │ ├── base.yaml │ │ └── fs2.yaml │ └── tts │ │ ├── base.yaml │ │ ├── base_zh.yaml │ │ ├── fs2.yaml │ │ ├── 
hifigan.yaml │ │ ├── lj │ │ ├── base_mel2wav.yaml │ │ ├── base_text2mel.yaml │ │ ├── fs2.yaml │ │ ├── hifigan.yaml │ │ └── pwg.yaml │ │ └── pwg.yaml ├── data │ ├── processed │ │ └── ljspeech │ │ │ ├── dict.txt │ │ │ ├── metadata_phone.csv │ │ │ ├── mfa_dict.txt │ │ │ └── phone_set.json │ └── 有何不可.json ├── data_gen │ ├── singing │ │ └── binarize.py │ └── tts │ │ ├── base_binarizer.py │ │ ├── bin │ │ └── binarize.py │ │ ├── binarizer_zh.py │ │ ├── data_gen_utils.py │ │ └── txt_processors │ │ ├── base_text_processor.py │ │ ├── en.py │ │ ├── zh.py │ │ └── zh_g2pM.py ├── diff.py ├── docs │ ├── README-SVS-opencpop-cascade.md │ ├── README-SVS-opencpop-e2e.md │ ├── README-SVS-opencpop-pndm.md │ ├── README-SVS-popcs.md │ ├── README-SVS.md │ ├── README-TTS-pndm.md │ └── README-TTS.md ├── inference │ └── svs │ │ ├── base_svs_infer.py │ │ ├── ds_cascade.py │ │ ├── ds_e2e.py │ │ ├── gradio │ │ ├── gradio_settings.yaml │ │ └── infer.py │ │ └── opencpop │ │ ├── cpop_pinyin2ph.txt │ │ └── map.py ├── modules │ ├── __init__.py │ ├── commons │ │ ├── common_layers.py │ │ ├── espnet_positional_embedding.py │ │ └── ssim.py │ ├── diffsinger_midi │ │ └── fs2.py │ ├── fastspeech │ │ ├── fs2.py │ │ ├── pe.py │ │ └── tts_modules.py │ ├── hifigan │ │ ├── hifigan.py │ │ └── mel_utils.py │ └── parallel_wavegan │ │ ├── __init__.py │ │ ├── layers │ │ ├── __init__.py │ │ ├── causal_conv.py │ │ ├── pqmf.py │ │ ├── residual_block.py │ │ ├── residual_stack.py │ │ ├── tf_layers.py │ │ └── upsample.py │ │ ├── losses │ │ ├── __init__.py │ │ └── stft_loss.py │ │ ├── models │ │ ├── __init__.py │ │ ├── melgan.py │ │ ├── parallel_wavegan.py │ │ └── source.py │ │ ├── optimizers │ │ ├── __init__.py │ │ └── radam.py │ │ ├── stft_loss.py │ │ └── utils │ │ ├── __init__.py │ │ └── utils.py ├── requirements.txt ├── requirements_2080.txt ├── requirements_3090.txt ├── resources │ ├── apply_form.md │ ├── diffspeech-fs2-1.png │ ├── diffspeech-fs2-2.png │ ├── diffspeech-fs2.png │ ├── model_a.png │ ├── model_b.png │ └── 
tfb.png ├── tasks │ ├── base_task.py │ ├── run.py │ └── tts │ │ ├── fs2.py │ │ ├── fs2_utils.py │ │ ├── pe.py │ │ └── tts.py ├── usr │ ├── .gitkeep │ ├── __init__.py │ ├── configs │ │ ├── base.yaml │ │ ├── lj_ds_beta6.yaml │ │ ├── lj_ds_pndm.yaml │ │ ├── midi │ │ │ ├── cascade │ │ │ │ └── opencs │ │ │ │ │ ├── aux_rel.yaml │ │ │ │ │ ├── ds60_rel.yaml │ │ │ │ │ └── opencpop_statis.yaml │ │ │ ├── e2e │ │ │ │ ├── opencpop │ │ │ │ │ ├── ds1000.yaml │ │ │ │ │ └── ds100_adj_rel.yaml │ │ │ │ └── popcs │ │ │ │ │ └── ds100_adj_rel.yaml │ │ │ └── pe.yaml │ │ ├── popcs_ds_beta6.yaml │ │ ├── popcs_ds_beta6_offline.yaml │ │ └── popcs_fs2.yaml │ ├── diff │ │ ├── candidate_decoder.py │ │ ├── diffusion.py │ │ ├── net.py │ │ └── shallow_diffusion_tts.py │ ├── diffsinger_task.py │ ├── diffspeech_task.py │ └── task.py ├── utils │ ├── __init__.py │ ├── audio.py │ ├── cwt.py │ ├── hparams.py │ ├── indexed_datasets.py │ ├── multiprocess_utils.py │ ├── pitch_utils.py │ ├── pl_utils.py │ ├── plot.py │ ├── text_encoder.py │ ├── text_norm.py │ ├── training_utils.py │ └── tts_utils.py └── vocoders │ ├── __init__.py │ ├── base_vocoder.py │ ├── hifigan.py │ ├── pwg.py │ └── vocoder_utils.py ├── ImageBind ├── .assets │ ├── bird_audio.wav │ ├── bird_image.jpg │ ├── car_audio.wav │ ├── car_image.jpg │ ├── dog_audio.wav │ └── dog_image.jpg ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── build │ └── lib │ │ └── imagebind │ │ ├── __init__.py │ │ ├── bpe │ │ └── bpe_simple_vocab_16e6.txt.gz │ │ ├── data.py │ │ └── models │ │ ├── __init__.py │ │ ├── helpers.py │ │ ├── imagebind_model.py │ │ ├── multimodal_preprocessors.py │ │ └── transformer.py ├── imagebind.egg-info │ ├── PKG-INFO │ ├── SOURCES.txt │ ├── dependency_links.txt │ └── top_level.txt ├── imagebind │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-310.pyc │ │ └── data.cpython-310.pyc │ ├── bpe │ │ └── bpe_simple_vocab_16e6.txt.gz │ ├── data.py │ └── models │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── 
__init__.cpython-310.pyc │ │ ├── helpers.cpython-310.pyc │ │ ├── imagebind_model.cpython-310.pyc │ │ ├── multimodal_preprocessors.cpython-310.pyc │ │ └── transformer.cpython-310.pyc │ │ ├── helpers.py │ │ ├── imagebind_model.py │ │ ├── multimodal_preprocessors.py │ │ └── transformer.py ├── model_card.md ├── requirements.txt └── setup.py ├── audio-preprocess ├── LICENSE ├── README.md ├── README.zh.md ├── fap-complete.zsh ├── fish_audio_preprocess.egg-info │ ├── PKG-INFO │ ├── SOURCES.txt │ ├── dependency_links.txt │ ├── entry_points.txt │ └── top_level.txt ├── fish_audio_preprocess │ ├── __init__.py │ ├── __pycache__ │ │ └── __init__.cpython-310.pyc │ ├── cli │ │ ├── __main__.py │ │ ├── __pycache__ │ │ │ ├── __main__.cpython-310.pyc │ │ │ ├── convert_to_wav.cpython-310.pyc │ │ │ ├── frequency.cpython-310.pyc │ │ │ ├── length.cpython-310.pyc │ │ │ ├── loudness_norm.cpython-310.pyc │ │ │ ├── merge_lab.cpython-310.pyc │ │ │ ├── merge_short.cpython-310.pyc │ │ │ ├── resample.cpython-310.pyc │ │ │ ├── separate_audio.cpython-310.pyc │ │ │ ├── slice_audio.cpython-310.pyc │ │ │ └── transcribe.cpython-310.pyc │ │ ├── convert_to_wav.py │ │ ├── frequency.py │ │ ├── length.py │ │ ├── loudness_norm.py │ │ ├── merge_lab.py │ │ ├── merge_short.py │ │ ├── resample.py │ │ ├── separate_audio.py │ │ ├── slice_audio.py │ │ └── transcribe.py │ └── utils │ │ ├── __pycache__ │ │ ├── file.cpython-310.pyc │ │ ├── loudness_norm.cpython-310.pyc │ │ ├── separate_audio.cpython-310.pyc │ │ ├── slice_audio.cpython-310.pyc │ │ ├── slice_audio_v2.cpython-310.pyc │ │ └── transcribe.cpython-310.pyc │ │ ├── file.py │ │ ├── loudness_norm.py │ │ ├── separate_audio.py │ │ ├── slice_audio.py │ │ ├── slice_audio_v2.py │ │ └── transcribe.py ├── pyproject.toml └── tools │ └── lint.py ├── fish-speech ├── .dockerignore ├── .gitignore ├── .pre-commit-config.yaml ├── .project-root ├── .readthedocs.yaml ├── API_FLAGS.txt ├── LICENSE ├── README.md ├── docker-compose.dev.yml ├── dockerfile ├── dockerfile.dev ├── 
docs │ ├── CNAME │ ├── README.ja.md │ ├── README.ko.md │ ├── README.pt-BR.md │ ├── README.zh.md │ ├── assets │ │ └── figs │ │ │ ├── VS_1.jpg │ │ │ ├── VS_1_pt-BR.png │ │ │ ├── agent_gradio.png │ │ │ ├── diagram.png │ │ │ ├── diagrama.png │ │ │ └── logo-circle.png │ ├── en │ │ ├── finetune.md │ │ ├── index.md │ │ ├── inference.md │ │ ├── samples.md │ │ └── start_agent.md │ ├── ja │ │ ├── finetune.md │ │ ├── index.md │ │ ├── inference.md │ │ ├── samples.md │ │ └── start_agent.md │ ├── ko │ │ ├── finetune.md │ │ ├── index.md │ │ ├── inference.md │ │ ├── samples.md │ │ └── start_agent.md │ ├── pt │ │ ├── finetune.md │ │ ├── index.md │ │ ├── inference.md │ │ ├── samples.md │ │ └── start_agent.md │ ├── requirements.txt │ ├── stylesheets │ │ └── extra.css │ └── zh │ │ ├── finetune.md │ │ ├── index.md │ │ ├── inference.md │ │ ├── samples.md │ │ └── start_agent.md ├── entrypoint.sh ├── fish_speech │ ├── callbacks │ │ ├── __init__.py │ │ └── grad_norm.py │ ├── configs │ │ ├── base.yaml │ │ ├── firefly_gan_vq.yaml │ │ ├── lora │ │ │ └── r_8_alpha_16.yaml │ │ └── text2semantic_finetune.yaml │ ├── conversation.py │ ├── datasets │ │ ├── concat_repeat.py │ │ ├── protos │ │ │ ├── text-data.proto │ │ │ ├── text_data_pb2.py │ │ │ └── text_data_stream.py │ │ ├── semantic.py │ │ └── vqgan.py │ ├── i18n │ │ ├── README.md │ │ ├── __init__.py │ │ ├── core.py │ │ ├── locale │ │ │ ├── en_US.json │ │ │ ├── es_ES.json │ │ │ ├── ja_JP.json │ │ │ ├── ko_KR.json │ │ │ ├── pt_BR.json │ │ │ └── zh_CN.json │ │ └── scan.py │ ├── inference_engine │ │ ├── __init__.py │ │ ├── reference_loader.py │ │ ├── utils.py │ │ └── vq_manager.py │ ├── models │ │ ├── text2semantic │ │ │ ├── __init__.py │ │ │ ├── inference.py │ │ │ ├── lit_module.py │ │ │ ├── llama.py │ │ │ └── lora.py │ │ └── vqgan │ │ │ ├── __init__.py │ │ │ ├── inference.py │ │ │ ├── modules │ │ │ ├── firefly.py │ │ │ └── fsq.py │ │ │ └── utils.py │ ├── scheduler.py │ ├── text │ │ ├── __init__.py │ │ ├── clean.py │ │ └── spliter.py │ ├── 
tokenizer.py │ ├── train.py │ └── utils │ │ ├── __init__.py │ │ ├── braceexpand.py │ │ ├── context.py │ │ ├── file.py │ │ ├── instantiators.py │ │ ├── logger.py │ │ ├── logging_utils.py │ │ ├── rich_utils.py │ │ ├── schema.py │ │ ├── spectrogram.py │ │ └── utils.py ├── inference.ipynb ├── mkdocs.yml ├── pyproject.toml ├── pyrightconfig.json ├── temp │ └── codes_0.npy └── tools │ ├── api_client.py │ ├── api_server.py │ ├── download_models.py │ ├── e2e_webui.py │ ├── export_onnx.py │ ├── extract_model.py │ ├── fish_e2e.py │ ├── llama │ ├── build_dataset.py │ ├── eval_in_context.py │ ├── merge_lora.py │ └── quantize.py │ ├── run_webui.py │ ├── server │ ├── agent │ │ ├── __init__.py │ │ ├── generate.py │ │ ├── generation_utils.py │ │ └── pre_generation_utils.py │ ├── api_utils.py │ ├── exception_handler.py │ ├── inference.py │ ├── model_manager.py │ ├── model_utils.py │ └── views.py │ ├── smart_pad.py │ ├── vqgan │ ├── create_train_split.py │ └── extract_vq.py │ ├── webui │ ├── __init__.py │ ├── inference.py │ └── variables.py │ └── whisper_asr.py ├── seed-vc ├── .gitignore ├── EVAL.md ├── LICENSE ├── README-JA.md ├── README-ZH.md ├── README.md ├── app.py ├── app_svc.py ├── app_vc.py ├── assets │ └── real-time-demo.webm ├── baselines │ ├── cosyvoice.py │ ├── dnsmos │ │ ├── dnsmos_computor.py │ │ ├── model_v8.onnx │ │ └── sig_bak_ovr.onnx │ └── openvoice.py ├── campplus_cn_common.bin ├── conda-nix-vc-py310.yaml ├── configs │ ├── config.json │ ├── hifigan.yml │ └── presets │ │ ├── config_dit_mel_seed_uvit_whisper_base_f0_44k.yml │ │ ├── config_dit_mel_seed_uvit_whisper_small_wavenet.yml │ │ └── config_dit_mel_seed_uvit_xlsr_tiny.yml ├── dac │ ├── __init__.py │ ├── __main__.py │ ├── model │ │ ├── __init__.py │ │ ├── base.py │ │ ├── dac.py │ │ ├── discriminator.py │ │ └── encodec.py │ ├── nn │ │ ├── __init__.py │ │ ├── layers.py │ │ ├── loss.py │ │ └── quantize.py │ └── utils │ │ ├── __init__.py │ │ ├── decode.py │ │ └── encode.py ├── data │ └── ft_dataset.py ├── eval.py 
├── examples │ ├── reference │ │ ├── azuma_0.wav │ │ ├── dingzhen_0.wav │ │ ├── s1p1.wav │ │ ├── s1p2.wav │ │ ├── s2p1.wav │ │ ├── s2p2.wav │ │ ├── s3p1.wav │ │ ├── s3p2.wav │ │ ├── s4p1.wav │ │ ├── s4p2.wav │ │ ├── teio_0.wav │ │ └── trump_0.wav │ └── source │ │ ├── TECHNOPOLIS - 2085 [vocals]_[cut_14sec].wav │ │ ├── Wiz Khalifa,Charlie Puth - See You Again [vocals]_[cut_28sec].wav │ │ ├── glados_0.wav │ │ ├── jay_0.wav │ │ ├── source_s1.wav │ │ ├── source_s2.wav │ │ ├── source_s3.wav │ │ ├── source_s4.wav │ │ └── yae_0.wav ├── hf_utils.py ├── inference.py ├── modules │ ├── alias_free_torch │ │ ├── __init__.py │ │ ├── act.py │ │ ├── filter.py │ │ └── resample.py │ ├── audio.py │ ├── bigvgan │ │ ├── activations.py │ │ ├── alias_free_activation │ │ │ ├── cuda │ │ │ │ ├── __init__.py │ │ │ │ ├── activation1d.py │ │ │ │ ├── anti_alias_activation.cpp │ │ │ │ ├── anti_alias_activation_cuda.cu │ │ │ │ ├── compat.h │ │ │ │ ├── load.py │ │ │ │ └── type_shim.h │ │ │ └── torch │ │ │ │ ├── __init__.py │ │ │ │ ├── act.py │ │ │ │ ├── filter.py │ │ │ │ └── resample.py │ │ ├── bigvgan.py │ │ ├── config.json │ │ ├── env.py │ │ ├── meldataset.py │ │ └── utils.py │ ├── campplus │ │ ├── DTDNN.py │ │ ├── classifier.py │ │ └── layers.py │ ├── commons.py │ ├── diffusion_transformer.py │ ├── encodec.py │ ├── flow_matching.py │ ├── gpt_fast │ │ ├── generate.py │ │ ├── model.py │ │ └── quantize.py │ ├── hifigan │ │ ├── f0_predictor.py │ │ └── generator.py │ ├── layers.py │ ├── length_regulator.py │ ├── openvoice │ │ ├── __init__.py │ │ ├── api.py │ │ ├── attentions.py │ │ ├── checkpoints_v2 │ │ │ └── converter │ │ │ │ └── config.json │ │ ├── commons.py │ │ ├── mel_processing.py │ │ ├── models.py │ │ ├── modules.py │ │ ├── openvoice_app.py │ │ ├── se_extractor.py │ │ ├── transforms.py │ │ └── utils.py │ ├── quantize.py │ ├── rmvpe.py │ ├── vocos │ │ ├── __init__.py │ │ ├── heads.py │ │ ├── helpers.py │ │ ├── loss.py │ │ ├── models.py │ │ ├── modules.py │ │ ├── pretrained.py │ │ └── 
spectral_ops.py │ └── wavenet.py ├── optimizers.py ├── real-time-gui.py ├── requirements-mac.txt ├── requirements.txt ├── ruff.toml └── train.py └── videorag ├── __init__.py ├── __pycache__ ├── __init__.cpython-310.pyc ├── _opcontent.cpython-310.pyc ├── _utils.cpython-310.pyc ├── base.cpython-310.pyc └── videoragcontent.cpython-310.pyc ├── _opcontent.py ├── _storage ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-310.pyc │ ├── kv_json.cpython-310.pyc │ └── vdb_nanovectordb.cpython-310.pyc ├── kv_json.py └── vdb_nanovectordb.py ├── _utils.py ├── _videoutil ├── .ipynb_checkpoints │ └── caption-checkpoint.py ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-310.pyc │ ├── asr.cpython-310.pyc │ ├── caption.cpython-310.pyc │ ├── feature.cpython-310.pyc │ └── split.cpython-310.pyc ├── asr.py ├── caption.py ├── feature.py └── split.py ├── base.py └── videoragcontent.py /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/.DS_Store -------------------------------------------------------------------------------- /Communication.md: -------------------------------------------------------------------------------- 1 | We provide QR codes for joining the HKUDS discussion groups on WeChat and Feishu. 
2 | 3 | You can join by scanning the QR codes below: 4 | 5 | WeChat QR Code 6 | 7 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 ✨Data Intelligence Lab@HKU✨ 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /assets/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/.DS_Store -------------------------------------------------------------------------------- /assets/QR.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/QR.png -------------------------------------------------------------------------------- /assets/adapted_crosstalk_cover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/adapted_crosstalk_cover.png -------------------------------------------------------------------------------- /assets/adapted_standupcomedy_cover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/adapted_standupcomedy_cover.png -------------------------------------------------------------------------------- /assets/airencuoguo_cover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/airencuoguo_cover.png -------------------------------------------------------------------------------- /assets/audio_performance.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/audio_performance.jpg -------------------------------------------------------------------------------- /assets/cover_16-9.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/cover_16-9.png -------------------------------------------------------------------------------- /assets/cover_3-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/cover_3-4.png -------------------------------------------------------------------------------- /assets/cover_4-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/cover_4-3.png -------------------------------------------------------------------------------- /assets/crosstalk_original_cover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/crosstalk_original_cover.png -------------------------------------------------------------------------------- /assets/dune_news_cover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/dune_news_cover.png -------------------------------------------------------------------------------- /assets/dune_original_cover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/dune_original_cover.png -------------------------------------------------------------------------------- /assets/dune_youtube.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/dune_youtube.png -------------------------------------------------------------------------------- /assets/edit_workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/edit_workflow.png -------------------------------------------------------------------------------- /assets/eva1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/eva1.png -------------------------------------------------------------------------------- /assets/eva2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/eva2.png -------------------------------------------------------------------------------- /assets/eva3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/eva3.jpg -------------------------------------------------------------------------------- /assets/eva4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/eva4.jpg -------------------------------------------------------------------------------- /assets/eval1_audio_new.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/eval1_audio_new.png -------------------------------------------------------------------------------- /assets/eval1_video_new.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/eval1_video_new.png -------------------------------------------------------------------------------- /assets/framework.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/framework.jpg -------------------------------------------------------------------------------- /assets/grok4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/grok4.png -------------------------------------------------------------------------------- /assets/interstella_cover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/interstella_cover.png -------------------------------------------------------------------------------- /assets/interstella_cover_love.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/interstella_cover_love.png -------------------------------------------------------------------------------- /assets/joylife_cover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/joylife_cover.png -------------------------------------------------------------------------------- /assets/logo.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/logo.png -------------------------------------------------------------------------------- /assets/logo_new.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/logo_new.png -------------------------------------------------------------------------------- /assets/masterma_cover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/masterma_cover.png -------------------------------------------------------------------------------- /assets/masterma_original_cover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/masterma_original_cover.png -------------------------------------------------------------------------------- /assets/nezha_cover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/nezha_cover.png -------------------------------------------------------------------------------- /assets/openai_news_cover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/openai_news_cover.png -------------------------------------------------------------------------------- /assets/overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/overview.png 
-------------------------------------------------------------------------------- /assets/spiderman_cover.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/spiderman_cover.jpg -------------------------------------------------------------------------------- /assets/spiderman_cover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/spiderman_cover.png -------------------------------------------------------------------------------- /assets/spiderman_new.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/spiderman_new.jpg -------------------------------------------------------------------------------- /assets/standup_original_cover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/standup_original_cover.png -------------------------------------------------------------------------------- /assets/tech_news_original_cover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/tech_news_original_cover.png -------------------------------------------------------------------------------- /assets/titanic_cover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/titanic_cover.png -------------------------------------------------------------------------------- /assets/video_performance.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/video_performance.jpg -------------------------------------------------------------------------------- /assets/xiaomingjianmo1_cover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/xiaomingjianmo1_cover.png -------------------------------------------------------------------------------- /assets/xiaomingjianmo_findyourproblem_meme.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/xiaomingjianmo_findyourproblem_meme.png -------------------------------------------------------------------------------- /assets/xiaomingjianmo_mvp_cover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/xiaomingjianmo_mvp_cover.png -------------------------------------------------------------------------------- /assets/xiaomingjianmo_original_cover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/xiaomingjianmo_original_cover.png -------------------------------------------------------------------------------- /assets/youhebuke_cover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/youhebuke_cover.png -------------------------------------------------------------------------------- /dataset/voice/ava_16k.wav: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/dataset/voice/ava_16k.wav -------------------------------------------------------------------------------- /environment/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/__init__.py -------------------------------------------------------------------------------- /environment/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /environment/__pycache__/utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/__pycache__/utils.cpython-310.pyc -------------------------------------------------------------------------------- /environment/agents/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/agents/__init__.py -------------------------------------------------------------------------------- /environment/agents/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/agents/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- 
/environment/agents/__pycache__/base.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/agents/__pycache__/base.cpython-310.pyc -------------------------------------------------------------------------------- /environment/agents/__pycache__/graph.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/agents/__pycache__/graph.cpython-310.pyc -------------------------------------------------------------------------------- /environment/agents/__pycache__/multi.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/agents/__pycache__/multi.cpython-310.pyc -------------------------------------------------------------------------------- /environment/config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/config/__init__.py -------------------------------------------------------------------------------- /environment/config/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/config/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /environment/config/__pycache__/config.cpython-310.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/config/__pycache__/config.cpython-310.pyc -------------------------------------------------------------------------------- /environment/config/__pycache__/llm.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/config/__pycache__/llm.cpython-310.pyc -------------------------------------------------------------------------------- /environment/config/check.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | print(os.path.splitext("dataset/找自己问题.wav")[0]) -------------------------------------------------------------------------------- /environment/config/config.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | import os 3 | 4 | 5 | def _load_config(config_path='environment/config/config.yml'): 6 | if not os.path.exists(config_path): 7 | raise FileNotFoundError(f"Config file not found: {config_path}") 8 | 9 | with open(config_path, 'r', encoding='utf-8') as f: 10 | return yaml.safe_load(f) 11 | 12 | 13 | config = _load_config() 14 | -------------------------------------------------------------------------------- /environment/config/config.yml: -------------------------------------------------------------------------------- 1 | llm: 2 | 3 | # Video Remixing/TTS/SVC/Stand-up/CrossTalk 4 | deepseek_api_key: "" 5 | deepseek_base_url: "" 6 | 7 | # Agentic Graph Router/TTS/SVC/Stand-up/CrossTalk 8 | claude_api_key: "" 9 | claude_base_url: "" 10 | 11 | # Video Editing/Overview/Summarization/QA/Commentary Video 12 | gpt_api_key: "" 13 | gpt_base_url: "" 14 | 15 | # MLLM for caption and fine-grained video understanding 16 | gemini_api_key: "" 17 | gemini_base_url: "" 18 | 19 | 20 | 21 | 22 | # api_key: "" # 
# Cache of the agent registry (agent name -> module path); filled on first use.
_registry = None


def get_agent_class(agent_name: str):
    """Dynamically load and return the class registered under ``agent_name``.

    The registry JSON file is read once, on the first call, and cached in the
    module-level ``_registry``. The registered module is then imported and the
    attribute named ``agent_name`` is looked up on it.

    Args:
        agent_name: Registry key; also the attribute name fetched from the
            imported module.

    Returns:
        The attribute named ``agent_name`` of the registered module.

    Raises:
        ValueError: If ``agent_name`` is not a key of the registry.
        ImportError: If the module cannot be imported or does not define
            ``agent_name``. The underlying exception is attached as
            ``__cause__``.
    """
    global _registry

    # Load the registry lazily on first use.
    if _registry is None:
        registry_path = "environment/config/registry.json"
        with open(registry_path, 'r', encoding='utf-8') as f:
            _registry = json.load(f)

    # Look up the module path for this agent.
    if agent_name not in _registry:
        raise ValueError(f"Agent {agent_name} not registered")

    module_path = _registry[agent_name]

    try:
        # Import the module dynamically, then fetch the class object from it.
        module = importlib.import_module(module_path)
        return getattr(module, agent_name)
    except (ImportError, AttributeError) as e:
        # Chain explicitly so the root cause is reported as the direct cause
        # of the failure rather than as an incidental context.
        raise ImportError(f"Load {agent_name} failed: {e}") from e
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /environment/roles/__pycache__/audio_extractor.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/__pycache__/audio_extractor.cpython-310.pyc -------------------------------------------------------------------------------- /environment/roles/__pycache__/loudness_normalizer.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/__pycache__/loudness_normalizer.cpython-310.pyc -------------------------------------------------------------------------------- /environment/roles/__pycache__/merge.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/__pycache__/merge.cpython-310.pyc -------------------------------------------------------------------------------- /environment/roles/__pycache__/mixer.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/__pycache__/mixer.cpython-310.pyc -------------------------------------------------------------------------------- /environment/roles/__pycache__/resampler.cpython-310.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/__pycache__/resampler.cpython-310.pyc -------------------------------------------------------------------------------- /environment/roles/__pycache__/separator.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/__pycache__/separator.cpython-310.pyc -------------------------------------------------------------------------------- /environment/roles/__pycache__/transcriber.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/__pycache__/transcriber.cpython-310.pyc -------------------------------------------------------------------------------- /environment/roles/__pycache__/vid_conversion.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/__pycache__/vid_conversion.cpython-310.pyc -------------------------------------------------------------------------------- /environment/roles/__pycache__/vid_editor.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/__pycache__/vid_editor.cpython-310.pyc -------------------------------------------------------------------------------- /environment/roles/__pycache__/vid_editor_base.cpython-310.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/__pycache__/vid_editor_base.cpython-310.pyc -------------------------------------------------------------------------------- /environment/roles/__pycache__/vid_preloader.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/__pycache__/vid_preloader.cpython-310.pyc -------------------------------------------------------------------------------- /environment/roles/__pycache__/vid_searcher.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/__pycache__/vid_searcher.cpython-310.pyc -------------------------------------------------------------------------------- /environment/roles/__pycache__/vid_searcher_base.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/__pycache__/vid_searcher_base.cpython-310.pyc -------------------------------------------------------------------------------- /environment/roles/__pycache__/voice_generator.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/__pycache__/voice_generator.cpython-310.pyc -------------------------------------------------------------------------------- /environment/roles/cross_talk/__pycache__/cross_talk_adapter.cpython-310.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/cross_talk/__pycache__/cross_talk_adapter.cpython-310.pyc -------------------------------------------------------------------------------- /environment/roles/cross_talk/__pycache__/cross_talk_conversion.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/cross_talk/__pycache__/cross_talk_conversion.cpython-310.pyc -------------------------------------------------------------------------------- /environment/roles/cross_talk/__pycache__/cross_talk_synth.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/cross_talk/__pycache__/cross_talk_synth.cpython-310.pyc -------------------------------------------------------------------------------- /environment/roles/stand_up/__pycache__/stand_up_adapter.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/stand_up/__pycache__/stand_up_adapter.cpython-310.pyc -------------------------------------------------------------------------------- /environment/roles/stand_up/__pycache__/stand_up_conversion.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/stand_up/__pycache__/stand_up_conversion.cpython-310.pyc -------------------------------------------------------------------------------- /environment/roles/stand_up/__pycache__/stand_up_synth.cpython-310.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/stand_up/__pycache__/stand_up_synth.cpython-310.pyc -------------------------------------------------------------------------------- /environment/roles/svc/__pycache__/svc_adapter.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/svc/__pycache__/svc_adapter.cpython-310.pyc -------------------------------------------------------------------------------- /environment/roles/svc/__pycache__/svc_analyzer.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/svc/__pycache__/svc_analyzer.cpython-310.pyc -------------------------------------------------------------------------------- /environment/roles/svc/__pycache__/svc_conversion.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/svc/__pycache__/svc_conversion.cpython-310.pyc -------------------------------------------------------------------------------- /environment/roles/svc/__pycache__/svc_coverist.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/svc/__pycache__/svc_coverist.cpython-310.pyc -------------------------------------------------------------------------------- /environment/roles/svc/__pycache__/svc_single.cpython-310.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/svc/__pycache__/svc_single.cpython-310.pyc -------------------------------------------------------------------------------- /environment/roles/tts/__pycache__/tts_infer.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/tts/__pycache__/tts_infer.cpython-310.pyc -------------------------------------------------------------------------------- /environment/roles/tts/__pycache__/tts_replace.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/tts/__pycache__/tts_replace.cpython-310.pyc -------------------------------------------------------------------------------- /environment/roles/tts/__pycache__/tts_slicer.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/tts/__pycache__/tts_slicer.cpython-310.pyc -------------------------------------------------------------------------------- /environment/roles/tts/__pycache__/tts_writer.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/tts/__pycache__/tts_writer.cpython-310.pyc -------------------------------------------------------------------------------- /environment/roles/vid_comm/__pycache__/comm_story_gen.cpython-310.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/vid_comm/__pycache__/comm_story_gen.cpython-310.pyc -------------------------------------------------------------------------------- /environment/roles/vid_news/__pycache__/news_story_gen.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/vid_news/__pycache__/news_story_gen.cpython-310.pyc -------------------------------------------------------------------------------- /environment/roles/vid_qa/__pycache__/content_loader copy.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/vid_qa/__pycache__/content_loader copy.cpython-310.pyc -------------------------------------------------------------------------------- /environment/roles/vid_qa/__pycache__/content_loader.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/vid_qa/__pycache__/content_loader.cpython-310.pyc -------------------------------------------------------------------------------- /environment/roles/vid_qa/__pycache__/content_loader_base.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/vid_qa/__pycache__/content_loader_base.cpython-310.pyc -------------------------------------------------------------------------------- /environment/roles/vid_rhythm/__pycache__/rhythm_detector.cpython-310.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/vid_rhythm/__pycache__/rhythm_detector.cpython-310.pyc -------------------------------------------------------------------------------- /environment/roles/vid_rhythm/__pycache__/rhythm_story_gen.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/vid_rhythm/__pycache__/rhythm_story_gen.cpython-310.pyc -------------------------------------------------------------------------------- /environment/roles/vid_summ/__pycache__/summ_loader.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/vid_summ/__pycache__/summ_loader.cpython-310.pyc -------------------------------------------------------------------------------- /environment/roles/vid_summ/__pycache__/summ_loader_base.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/vid_summ/__pycache__/summ_loader_base.cpython-310.pyc -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | --extra-index-url https://download.pytorch.org/whl/cu121 2 | --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/ 3 | -e . 
-------------------------------------------------------------------------------- /tools/.gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /tools/CosyVoice/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # Visual Studio Code files 7 | .vscode 8 | .vs 9 | 10 | # PyCharm files 11 | .idea 12 | 13 | # Eclipse Project settings 14 | *.*project 15 | .settings 16 | 17 | # Sublime Text settings 18 | *.sublime-workspace 19 | *.sublime-project 20 | 21 | # Editor temporaries 22 | *.swn 23 | *.swo 24 | *.swp 25 | *.swm 26 | *~ 27 | 28 | # IPython notebook checkpoints 29 | .ipynb_checkpoints 30 | 31 | # macOS dir files 32 | .DS_Store 33 | 34 | exp 35 | data 36 | raw_wav 37 | tensorboard 38 | **/*build* 39 | 40 | # Clangd files 41 | .cache 42 | compile_commands.json 43 | 44 | # train/inference files 45 | *.wav 46 | *.m4a 47 | *.aac 48 | *.pt 49 | pretrained_models/* 50 | *_pb2_grpc.py 51 | *_pb2.py 52 | *.tar -------------------------------------------------------------------------------- /tools/CosyVoice/.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "third_party/Matcha-TTS"] 2 | path = third_party/Matcha-TTS 3 | url = https://github.com/shivammehta25/Matcha-TTS.git 4 | -------------------------------------------------------------------------------- /tools/CosyVoice/FAQ.md: -------------------------------------------------------------------------------- 1 | ## ModuleNotFoundError: No module named 'matcha' 2 | 3 | Matcha-TTS is a third_party module. Please check `third_party` directory. If there is no `Matcha-TTS`, execute `git submodule update --init --recursive`. 
4 | 5 | run `export PYTHONPATH=third_party/Matcha-TTS` if you want to use `from cosyvoice.cli.cosyvoice import CosyVoice` in python script. 6 | 7 | ## cannot find resource.zip or cannot unzip resource.zip 8 | 9 | Please make sure you have git-lfs installed. Execute 10 | 11 | ```sh 12 | git clone https://www.modelscope.cn/iic/CosyVoice-ttsfrd.git pretrained_models/CosyVoice-ttsfrd 13 | cd pretrained_models/CosyVoice-ttsfrd/ 14 | unzip resource.zip -d . 15 | pip install ttsfrd-0.3.6-cp38-cp38-linux_x86_64.whl 16 | ``` 17 | -------------------------------------------------------------------------------- /tools/CosyVoice/cosyvoice/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/CosyVoice/cosyvoice/__init__.py -------------------------------------------------------------------------------- /tools/CosyVoice/cosyvoice/bin/export_trt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2024 Alibaba Inc. All Rights Reserved. 
3 | # download tensorrt from https://developer.nvidia.com/tensorrt/download/10x, check your system and cuda for compatibility 4 | # for example for linux + cuda12.4, you can download https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.0.1/tars/TensorRT-10.0.1.6.Linux.x86_64-gnu.cuda-12.4.tar.gz 5 | TRT_DIR= 6 | MODEL_DIR= 7 | 8 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$TRT_DIR/lib:/usr/local/cuda/lib64 9 | $TRT_DIR/bin/trtexec --onnx=$MODEL_DIR/flow.decoder.estimator.fp32.onnx --saveEngine=$MODEL_DIR/flow.decoder.estimator.fp32.mygpu.plan --minShapes=x:2x80x4,mask:2x1x4,mu:2x80x4,cond:2x80x4 --optShapes=x:2x80x193,mask:2x1x193,mu:2x80x193,cond:2x80x193 --maxShapes=x:2x80x6800,mask:2x1x6800,mu:2x80x6800,cond:2x80x6800 --inputIOFormats=fp32:chw,fp32:chw,fp32:chw,fp32:chw,fp32:chw,fp32:chw --outputIOFormats=fp32:chw 10 | $TRT_DIR/bin/trtexec --onnx=$MODEL_DIR/flow.decoder.estimator.fp32.onnx --saveEngine=$MODEL_DIR/flow.decoder.estimator.fp16.mygpu.plan --fp16 --minShapes=x:2x80x4,mask:2x1x4,mu:2x80x4,cond:2x80x4 --optShapes=x:2x80x193,mask:2x1x193,mu:2x80x193,cond:2x80x193 --maxShapes=x:2x80x6800,mask:2x1x6800,mu:2x80x6800,cond:2x80x6800 --inputIOFormats=fp16:chw,fp16:chw,fp16:chw,fp16:chw,fp16:chw,fp16:chw --outputIOFormats=fp16:chw 11 | -------------------------------------------------------------------------------- /tools/CosyVoice/cosyvoice/cli/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/CosyVoice/cosyvoice/cli/__init__.py -------------------------------------------------------------------------------- /tools/CosyVoice/cosyvoice/dataset/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/CosyVoice/cosyvoice/dataset/__init__.py
import torch
import torch.nn.functional as F


def tpr_loss(disc_real_outputs, disc_generated_outputs, tau):
    """Sum a median-relative penalty, capped at ``tau``, over paired tensors.

    For every pair ``(dr, dg)`` the median ``m_DG`` of ``dr - dg`` is taken,
    the squared deviation ``((dr - dg) - m_DG) ** 2`` is averaged over the
    elements where ``dr < dg + m_DG``, and ``tau - relu(tau - L_rel)``
    (i.e. ``min(L_rel, tau)``) is added to the running total.

    Args:
        disc_real_outputs: Iterable of tensors (presumably discriminator
            outputs for real audio -- confirm against the caller).
        disc_generated_outputs: Iterable of tensors paired element-wise with
            ``disc_real_outputs`` via ``zip`` (extra items on the longer side
            are ignored).
        tau: Upper bound applied to each pair's contribution.

    Returns:
        The accumulated loss tensor, or the int ``0`` when there are no pairs.
    """
    loss = 0
    for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
        m_DG = torch.median(dr - dg)
        # Squared deviation from the median, kept only where dr < dg + m_DG.
        selected = (((dr - dg) - m_DG) ** 2)[dr < dg + m_DG]
        if selected.numel() > 0:
            L_rel = torch.mean(selected)
        else:
            # No element satisfied the mask (e.g. single-element or all-equal
            # outputs). torch.mean over an empty selection is NaN and would
            # poison the whole loss; the empty sum is 0 and keeps the
            # selection's dtype and device.
            L_rel = selected.sum()
        loss += tau - F.relu(tau - L_rel)
    return loss


def mel_loss(real_speech, generated_speech, mel_transforms):
    """Sum the L1 distance between transformed real and generated speech.

    Each callable in ``mel_transforms`` is applied to ``real_speech`` and then
    to ``generated_speech``; ``F.l1_loss`` of the two results is accumulated.

    Args:
        real_speech: Input passed to every transform as the reference.
        generated_speech: Input passed to every transform as the prediction.
        mel_transforms: Iterable of callables returning tensors accepted by
            ``F.l1_loss`` (presumably mel-spectrogram transforms -- confirm
            against the caller).

    Returns:
        The accumulated loss tensor, or the int ``0`` when ``mel_transforms``
        is empty.
    """
    loss = 0
    for transform in mel_transforms:
        mel_r = transform(real_speech)
        mel_g = transform(generated_speech)
        loss += F.l1_loss(mel_g, mel_r)
    return loss
"initial_scale_power": 16, 11 | "loss_scale_window": 256, 12 | "hysteresis": 2, 13 | "consecutive_hysteresis": false, 14 | "min_loss_scale": 1 15 | }, 16 | "bf16": { 17 | "enabled": false 18 | }, 19 | "zero_force_ds_cpu_optimizer": false, 20 | "zero_optimization": { 21 | "stage": 2, 22 | "offload_optimizer": { 23 | "device": "none", 24 | "pin_memory": true 25 | }, 26 | "allgather_partitions": true, 27 | "allgather_bucket_size": 5e8, 28 | "overlap_comm": false, 29 | "reduce_scatter": true, 30 | "reduce_bucket_size": 5e8, 31 | "contiguous_gradients" : true 32 | }, 33 | "optimizer": { 34 | "type": "AdamW", 35 | "params": { 36 | "lr": 0.001, 37 | "weight_decay": 0.0001, 38 | "torch_adam": true, 39 | "adam_w_mode": true 40 | } 41 | } 42 | } -------------------------------------------------------------------------------- /tools/CosyVoice/examples/libritts/cosyvoice/cosyvoice: -------------------------------------------------------------------------------- 1 | ../../../cosyvoice -------------------------------------------------------------------------------- /tools/CosyVoice/examples/libritts/cosyvoice/path.sh: -------------------------------------------------------------------------------- 1 | # NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C 2 | export PYTHONIOENCODING=UTF-8 3 | export PYTHONPATH=../../../:../../../third_party/Matcha-TTS:$PYTHONPATH 4 | -------------------------------------------------------------------------------- /tools/CosyVoice/examples/libritts/cosyvoice/tools: -------------------------------------------------------------------------------- 1 | ../../../tools -------------------------------------------------------------------------------- /tools/CosyVoice/examples/libritts/cosyvoice/tts_text.json: -------------------------------------------------------------------------------- 1 | { 2 | "1089_134686_000002_000000": [ 3 | "hello, my name is Jack. What is your name?" 
4 | ] 5 | } -------------------------------------------------------------------------------- /tools/CosyVoice/examples/libritts/cosyvoice2/cosyvoice: -------------------------------------------------------------------------------- 1 | ../../../cosyvoice -------------------------------------------------------------------------------- /tools/CosyVoice/examples/libritts/cosyvoice2/tools: -------------------------------------------------------------------------------- 1 | ../../../tools -------------------------------------------------------------------------------- /tools/CosyVoice/examples/magicdata-read/cosyvoice/conf/ds_stage2.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": 1, 3 | "gradient_accumulation_steps": 1, 4 | "steps_per_print": 100, 5 | "gradient_clipping": 5, 6 | "fp16": { 7 | "enabled": false, 8 | "auto_cast": false, 9 | "loss_scale": 0, 10 | "initial_scale_power": 16, 11 | "loss_scale_window": 256, 12 | "hysteresis": 2, 13 | "consecutive_hysteresis": false, 14 | "min_loss_scale": 1 15 | }, 16 | "bf16": { 17 | "enabled": false 18 | }, 19 | "zero_force_ds_cpu_optimizer": false, 20 | "zero_optimization": { 21 | "stage": 2, 22 | "offload_optimizer": { 23 | "device": "none", 24 | "pin_memory": true 25 | }, 26 | "allgather_partitions": true, 27 | "allgather_bucket_size": 5e8, 28 | "overlap_comm": false, 29 | "reduce_scatter": true, 30 | "reduce_bucket_size": 5e8, 31 | "contiguous_gradients" : true 32 | }, 33 | "optimizer": { 34 | "type": "AdamW", 35 | "params": { 36 | "lr": 0.001, 37 | "weight_decay": 0.0001, 38 | "torch_adam": true, 39 | "adam_w_mode": true 40 | } 41 | } 42 | } -------------------------------------------------------------------------------- /tools/CosyVoice/examples/magicdata-read/cosyvoice/cosyvoice: -------------------------------------------------------------------------------- 1 | ../../../cosyvoice 
-------------------------------------------------------------------------------- /tools/CosyVoice/examples/magicdata-read/cosyvoice/path.sh: -------------------------------------------------------------------------------- 1 | # NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C 2 | export PYTHONIOENCODING=UTF-8 3 | export PYTHONPATH=../../../:../../../third_party/Matcha-TTS:$PYTHONPATH 4 | -------------------------------------------------------------------------------- /tools/CosyVoice/examples/magicdata-read/cosyvoice/tools: -------------------------------------------------------------------------------- 1 | ../../../tools -------------------------------------------------------------------------------- /tools/CosyVoice/examples/magicdata-read/cosyvoice/tts_text.json: -------------------------------------------------------------------------------- 1 | { 2 | "38_5718_20170915093303": [ 3 | "我想这出最好歌曲把歌词发到网上请别人帮我作曲急急", 4 | "叫他明天早上差五分儿九点去机场" 5 | ], 6 | "38_5721_20170915091235": [ 7 | "变温室调到零下两度档", 8 | "交谈中请勿轻信汇款信息陌生电话请勿使用外挂软件" 9 | ], 10 | "38_5733_20170915130323": [ 11 | "这是老鹰乐队的一首经典歌曲", 12 | "我急用这段音乐我自己找到一段但是有现场杂音" 13 | ], 14 | "38_5836_20170916221414": [ 15 | "给我播一个陶喆的专辑", 16 | "这套餐好贵呀我发这么多短信贵死了" 17 | ] 18 | } -------------------------------------------------------------------------------- /tools/CosyVoice/requirements.txt: -------------------------------------------------------------------------------- 1 | --extra-index-url https://download.pytorch.org/whl/cu121 2 | --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/ # https://github.com/microsoft/onnxruntime/issues/21684 3 | conformer==0.3.2 4 | deepspeed==0.14.2; sys_platform == 'linux' 5 | diffusers==0.29.0 6 | gdown==5.1.0 7 | gradio==5.4.0 8 | grpcio==1.57.0 9 | grpcio-tools==1.57.0 10 | hydra-core==1.3.2 11 | HyperPyYAML==1.2.2 12 | inflect==7.3.1 13 | librosa==0.10.2 14 | lightning==2.2.4 15 | matplotlib==3.7.5 16 | 
modelscope==1.15.0 17 | networkx==3.1 18 | omegaconf==2.3.0 19 | onnx==1.16.0 20 | onnxruntime-gpu==1.18.0; sys_platform == 'linux' 21 | onnxruntime==1.18.0; sys_platform == 'darwin' or sys_platform == 'win32' 22 | openai-whisper==20231117 23 | protobuf==4.25 24 | pydantic==2.7.0 25 | pyworld==0.3.4 26 | rich==13.7.1 27 | soundfile==0.12.1 28 | tensorboard==2.14.0 29 | tensorrt-cu12==10.0.1; sys_platform == 'linux' 30 | tensorrt-cu12-bindings==10.0.1; sys_platform == 'linux' 31 | tensorrt-cu12-libs==10.0.1; sys_platform == 'linux' 32 | torch==2.3.1 33 | torchaudio==2.3.1 34 | transformers==4.40.1 35 | uvicorn==0.30.0 36 | wget==3.2 37 | fastapi==0.115.6 38 | fastapi-cli==0.0.4 39 | WeTextProcessing==1.0.3 40 | -------------------------------------------------------------------------------- /tools/CosyVoice/runtime/python/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime 2 | ENV DEBIAN_FRONTEND=noninteractive 3 | 4 | WORKDIR /opt/CosyVoice 5 | 6 | RUN sed -i s@/archive.ubuntu.com/@/mirrors.aliyun.com/@g /etc/apt/sources.list 7 | RUN apt-get update -y 8 | RUN apt-get -y install git unzip git-lfs g++ 9 | RUN git lfs install 10 | RUN git clone --recursive https://github.com/FunAudioLLM/CosyVoice.git 11 | # here we use python==3.10 because we cannot find an image which have both python3.8 and torch2.0.1-cu118 installed 12 | RUN cd CosyVoice && pip3 install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host=mirrors.aliyun.com 13 | RUN cd CosyVoice/runtime/python/grpc && python3 -m grpc_tools.protoc -I. --python_out=. --grpc_python_out=. 
cosyvoice.proto -------------------------------------------------------------------------------- /tools/CosyVoice/runtime/python/grpc/cosyvoice.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | 3 | package cosyvoice; 4 | option go_package = "protos/"; 5 | 6 | service CosyVoice{ 7 | rpc Inference(Request) returns (stream Response) {} 8 | } 9 | 10 | message Request{ 11 | oneof RequestPayload { 12 | sftRequest sft_request = 1; 13 | zeroshotRequest zero_shot_request = 2; 14 | crosslingualRequest cross_lingual_request = 3; 15 | instructRequest instruct_request = 4; 16 | } 17 | } 18 | 19 | message sftRequest{ 20 | string spk_id = 1; 21 | string tts_text = 2; 22 | } 23 | 24 | message zeroshotRequest{ 25 | string tts_text = 1; 26 | string prompt_text = 2; 27 | bytes prompt_audio = 3; 28 | } 29 | 30 | message crosslingualRequest{ 31 | string tts_text = 1; 32 | bytes prompt_audio = 2; 33 | } 34 | 35 | message instructRequest{ 36 | string tts_text = 1; 37 | string spk_id = 2; 38 | string instruct_text = 3; 39 | } 40 | 41 | message Response{ 42 | bytes tts_audio = 1; 43 | } -------------------------------------------------------------------------------- /tools/CosyVoice/third_party/Matcha-TTS/.env.example: -------------------------------------------------------------------------------- 1 | # example of file for storing private and user specific environment variables, like keys or system paths 2 | # rename it to ".env" (excluded from version control by default) 3 | # .env is loaded by train.py automatically 4 | # hydra allows you to reference variables in .yaml configs with special syntax: ${oc.env:MY_VAR} 5 | 6 | MY_VAR="/home/user/my/system/path" 7 | -------------------------------------------------------------------------------- /tools/CosyVoice/third_party/Matcha-TTS/.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ## What does this PR do? 
2 | 3 | 9 | 10 | Fixes #\ 11 | 12 | ## Before submitting 13 | 14 | - [ ] Did you make sure **title is self-explanatory** and **the description concisely explains the PR**? 15 | - [ ] Did you make sure your **PR does only one thing**, instead of bundling different changes together? 16 | - [ ] Did you list all the **breaking changes** introduced by this pull request? 17 | - [ ] Did you **test your PR locally** with `pytest` command? 18 | - [ ] Did you **run pre-commit hooks** with `pre-commit run -a` command? 19 | 20 | ## Did you have fun? 21 | 22 | Make sure you had fun coding 🙃 23 | -------------------------------------------------------------------------------- /tools/CosyVoice/third_party/Matcha-TTS/.github/codecov.yml: -------------------------------------------------------------------------------- 1 | coverage: 2 | status: 3 | # measures overall project coverage 4 | project: 5 | default: 6 | threshold: 100% # how much decrease in coverage is needed to not consider success 7 | 8 | # measures PR or single commit coverage 9 | patch: 10 | default: 11 | threshold: 100% # how much decrease in coverage is needed to not consider success 12 | 13 | 14 | # project: off 15 | # patch: off 16 | -------------------------------------------------------------------------------- /tools/CosyVoice/third_party/Matcha-TTS/.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 
3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "pip" # See documentation for possible values 9 | directory: "/" # Location of package manifests 10 | target-branch: "dev" 11 | schedule: 12 | interval: "daily" 13 | ignore: 14 | - dependency-name: "pytorch-lightning" 15 | update-types: ["version-update:semver-patch"] 16 | - dependency-name: "torchmetrics" 17 | update-types: ["version-update:semver-patch"] 18 | -------------------------------------------------------------------------------- /tools/CosyVoice/third_party/Matcha-TTS/.github/release-drafter.yml: -------------------------------------------------------------------------------- 1 | name-template: "v$RESOLVED_VERSION" 2 | tag-template: "v$RESOLVED_VERSION" 3 | 4 | categories: 5 | - title: "🚀 Features" 6 | labels: 7 | - "feature" 8 | - "enhancement" 9 | - title: "🐛 Bug Fixes" 10 | labels: 11 | - "fix" 12 | - "bugfix" 13 | - "bug" 14 | - title: "🧹 Maintenance" 15 | labels: 16 | - "maintenance" 17 | - "dependencies" 18 | - "refactoring" 19 | - "cosmetic" 20 | - "chore" 21 | - title: "📝️ Documentation" 22 | labels: 23 | - "documentation" 24 | - "docs" 25 | 26 | change-template: "- $TITLE @$AUTHOR (#$NUMBER)" 27 | change-title-escapes: '\<*_&' # You can add # and @ to disable mentions 28 | 29 | version-resolver: 30 | major: 31 | labels: 32 | - "major" 33 | minor: 34 | labels: 35 | - "minor" 36 | patch: 37 | labels: 38 | - "patch" 39 | default: patch 40 | 41 | template: | 42 | ## Changes 43 | 44 | $CHANGES 45 | -------------------------------------------------------------------------------- /tools/CosyVoice/third_party/Matcha-TTS/.project-root: -------------------------------------------------------------------------------- 1 | # this file is required for inferring the project root directory 2 | # do not delete 3 | 
-------------------------------------------------------------------------------- /tools/CosyVoice/third_party/Matcha-TTS/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Shivam Mehta 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /tools/CosyVoice/third_party/Matcha-TTS/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include LICENSE.txt 3 | include requirements.*.txt 4 | include *.cff 5 | include requirements.txt 6 | include matcha/VERSION 7 | recursive-include matcha *.json 8 | recursive-include matcha *.html 9 | recursive-include matcha *.png 10 | recursive-include matcha *.md 11 | recursive-include matcha *.py 12 | recursive-include matcha *.pyx 13 | recursive-exclude tests * 14 | prune tests* 15 | -------------------------------------------------------------------------------- /tools/CosyVoice/third_party/Matcha-TTS/Makefile: -------------------------------------------------------------------------------- 1 | 2 | help: ## Show help 3 | @grep -E '^[.a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' 4 | 5 | clean: ## Clean autogenerated files 6 | rm -rf dist 7 | find . -type f -name "*.DS_Store" -ls -delete 8 | find . | grep -E "(__pycache__|\.pyc|\.pyo)" | xargs rm -rf 9 | find . | grep -E ".pytest_cache" | xargs rm -rf 10 | find . 
| grep -E ".ipynb_checkpoints" | xargs rm -rf 11 | rm -f .coverage 12 | 13 | clean-logs: ## Clean logs 14 | rm -rf logs/** 15 | 16 | create-package: ## Create wheel and tar gz 17 | rm -rf dist/ 18 | python setup.py bdist_wheel --plat-name=manylinux1_x86_64 19 | python setup.py sdist 20 | python -m twine upload dist/* --verbose --skip-existing 21 | 22 | format: ## Run pre-commit hooks 23 | pre-commit run -a 24 | 25 | sync: ## Merge changes from main branch to your current branch 26 | git pull 27 | git pull origin main 28 | 29 | test: ## Run not slow tests 30 | pytest -k "not slow" 31 | 32 | test-full: ## Run all tests 33 | pytest 34 | 35 | train-ljspeech: ## Train the model 36 | python matcha/train.py experiment=ljspeech 37 | 38 | train-ljspeech-min: ## Train the model with minimum memory 39 | python matcha/train.py experiment=ljspeech_min_memory 40 | 41 | start_app: ## Start the app 42 | python matcha/app.py 43 | -------------------------------------------------------------------------------- /tools/CosyVoice/third_party/Matcha-TTS/configs/__init__.py: -------------------------------------------------------------------------------- 1 | # this file is needed here to include configs when building project as a package 2 | -------------------------------------------------------------------------------- /tools/CosyVoice/third_party/Matcha-TTS/configs/callbacks/default.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - model_checkpoint.yaml 3 | - model_summary.yaml 4 | - rich_progress_bar.yaml 5 | - _self_ 6 | -------------------------------------------------------------------------------- /tools/CosyVoice/third_party/Matcha-TTS/configs/callbacks/model_checkpoint.yaml: -------------------------------------------------------------------------------- 1 | # https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.callbacks.ModelCheckpoint.html 2 | 3 | model_checkpoint: 4 | _target_: 
lightning.pytorch.callbacks.ModelCheckpoint 5 | dirpath: ${paths.output_dir}/checkpoints # directory to save the model file 6 | filename: checkpoint_{epoch:03d} # checkpoint filename 7 | monitor: epoch # name of the logged metric which determines when model is improving 8 | verbose: False # verbosity mode 9 | save_last: true # additionally always save an exact copy of the last checkpoint to a file last.ckpt 10 | save_top_k: 10 # save k best models (determined by above metric) 11 | mode: "max" # "max" means higher metric value is better, can be also "min" 12 | auto_insert_metric_name: True # when True, the checkpoints filenames will contain the metric name 13 | save_weights_only: False # if True, then only the model’s weights will be saved 14 | every_n_train_steps: null # number of training steps between checkpoints 15 | train_time_interval: null # checkpoints are monitored at the specified time interval 16 | every_n_epochs: 100 # number of epochs between checkpoints 17 | save_on_train_epoch_end: null # whether to run checkpointing at the end of the training epoch or the end of validation 18 | -------------------------------------------------------------------------------- /tools/CosyVoice/third_party/Matcha-TTS/configs/callbacks/model_summary.yaml: -------------------------------------------------------------------------------- 1 | # https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.callbacks.RichModelSummary.html 2 | 3 | model_summary: 4 | _target_: lightning.pytorch.callbacks.RichModelSummary 5 | max_depth: 3 # the maximum depth of layer nesting that the summary will include 6 | -------------------------------------------------------------------------------- /tools/CosyVoice/third_party/Matcha-TTS/configs/callbacks/none.yaml: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/CosyVoice/third_party/Matcha-TTS/configs/callbacks/none.yaml -------------------------------------------------------------------------------- /tools/CosyVoice/third_party/Matcha-TTS/configs/callbacks/rich_progress_bar.yaml: -------------------------------------------------------------------------------- 1 | # https://lightning.ai/docs/pytorch/latest/api/lightning.pytorch.callbacks.RichProgressBar.html 2 | 3 | rich_progress_bar: 4 | _target_: lightning.pytorch.callbacks.RichProgressBar 5 | -------------------------------------------------------------------------------- /tools/CosyVoice/third_party/Matcha-TTS/configs/debug/default.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # default debugging setup, runs 1 full epoch 4 | # other debugging configs can inherit from this one 5 | 6 | # overwrite task name so debugging logs are stored in separate folder 7 | task_name: "debug" 8 | 9 | # disable callbacks and loggers during debugging 10 | # callbacks: null 11 | # logger: null 12 | 13 | extras: 14 | ignore_warnings: False 15 | enforce_tags: False 16 | 17 | # sets level of all command line loggers to 'DEBUG' 18 | # https://hydra.cc/docs/tutorials/basic/running_your_app/logging/ 19 | hydra: 20 | job_logging: 21 | root: 22 | level: DEBUG 23 | 24 | # use this to also set hydra loggers to 'DEBUG' 25 | # verbose: True 26 | 27 | trainer: 28 | max_epochs: 1 29 | accelerator: cpu # debuggers don't like gpus 30 | devices: 1 # debuggers don't like multiprocessing 31 | detect_anomaly: true # raise exception if NaN or +/-inf is detected in any tensor 32 | 33 | data: 34 | num_workers: 0 # debuggers don't like multiprocessing 35 | pin_memory: False # disable gpu memory pin 36 | -------------------------------------------------------------------------------- 
/tools/CosyVoice/third_party/Matcha-TTS/configs/debug/fdr.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # runs 1 train, 1 validation and 1 test step 4 | 5 | defaults: 6 | - default 7 | 8 | trainer: 9 | fast_dev_run: true 10 | -------------------------------------------------------------------------------- /tools/CosyVoice/third_party/Matcha-TTS/configs/debug/limit.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # uses only 1% of the training data and 5% of validation/test data 4 | 5 | defaults: 6 | - default 7 | 8 | trainer: 9 | max_epochs: 3 10 | limit_train_batches: 0.01 11 | limit_val_batches: 0.05 12 | limit_test_batches: 0.05 13 | -------------------------------------------------------------------------------- /tools/CosyVoice/third_party/Matcha-TTS/configs/debug/overfit.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # overfits to 3 batches 4 | 5 | defaults: 6 | - default 7 | 8 | trainer: 9 | max_epochs: 20 10 | overfit_batches: 3 11 | 12 | # model ckpt and early stopping need to be disabled during overfitting 13 | callbacks: null 14 | -------------------------------------------------------------------------------- /tools/CosyVoice/third_party/Matcha-TTS/configs/debug/profiler.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # runs with execution time profiling 4 | 5 | defaults: 6 | - default 7 | 8 | trainer: 9 | max_epochs: 1 10 | # profiler: "simple" 11 | profiler: "advanced" 12 | # profiler: "pytorch" 13 | accelerator: gpu 14 | 15 | limit_train_batches: 0.02 16 | -------------------------------------------------------------------------------- /tools/CosyVoice/third_party/Matcha-TTS/configs/eval.yaml: 
-------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | defaults: 4 | - _self_ 5 | - data: mnist # choose datamodule with `test_dataloader()` for evaluation 6 | - model: mnist 7 | - logger: null 8 | - trainer: default 9 | - paths: default 10 | - extras: default 11 | - hydra: default 12 | 13 | task_name: "eval" 14 | 15 | tags: ["dev"] 16 | 17 | # passing checkpoint path is necessary for evaluation 18 | ckpt_path: ??? 19 | -------------------------------------------------------------------------------- /tools/CosyVoice/third_party/Matcha-TTS/configs/experiment/hifi_dataset_piper_phonemizer.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # to execute this experiment run: 4 | # python train.py experiment=hifi_dataset_piper_phonemizer 5 | 6 | defaults: 7 | - override /data: hi-fi_en-US_female.yaml 8 | 9 | # all parameters below will be merged with parameters from default configurations set above 10 | # this allows you to overwrite only specified parameters 11 | 12 | tags: ["hi-fi", "single_speaker", "piper_phonemizer", "en_US", "female"] 13 | 14 | run_name: hi-fi_en-US_female_piper_phonemizer 15 | -------------------------------------------------------------------------------- /tools/CosyVoice/third_party/Matcha-TTS/configs/experiment/ljspeech.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # to execute this experiment run: 4 | # python train.py experiment=ljspeech 5 | 6 | defaults: 7 | - override /data: ljspeech.yaml 8 | 9 | # all parameters below will be merged with parameters from default configurations set above 10 | # this allows you to overwrite only specified parameters 11 | 12 | tags: ["ljspeech"] 13 | 14 | run_name: ljspeech 15 | --------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/configs/experiment/ljspeech_min_memory.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # to execute this experiment run: 4 | # python train.py experiment=ljspeech_min_memory 5 | 6 | defaults: 7 | - override /data: ljspeech.yaml 8 | 9 | # all parameters below will be merged with parameters from default configurations set above 10 | # this allows you to overwrite only specified parameters 11 | 12 | tags: ["ljspeech"] 13 | 14 | run_name: ljspeech_min 15 | 16 | 17 | model: 18 | out_size: 172 19 | -------------------------------------------------------------------------------- /tools/CosyVoice/third_party/Matcha-TTS/configs/experiment/multispeaker.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | # to execute this experiment run: 4 | # python train.py experiment=multispeaker 5 | 6 | defaults: 7 | - override /data: vctk.yaml 8 | 9 | # all parameters below will be merged with parameters from default configurations set above 10 | # this allows you to overwrite only specified parameters 11 | 12 | tags: ["multispeaker"] 13 | 14 | run_name: multispeaker 15 | -------------------------------------------------------------------------------- /tools/CosyVoice/third_party/Matcha-TTS/configs/extras/default.yaml: -------------------------------------------------------------------------------- 1 | # disable python warnings if they annoy you 2 | ignore_warnings: False 3 | 4 | # ask user for tags if none are provided in the config 5 | enforce_tags: True 6 | 7 | # pretty print config tree at the start of the run using Rich library 8 | print_config: True 9 | -------------------------------------------------------------------------------- /tools/CosyVoice/third_party/Matcha-TTS/configs/hydra/default.yaml: -------------------------------------------------------------------------------- 1 | #
https://hydra.cc/docs/configure_hydra/intro/ 2 | 3 | # enable color logging 4 | defaults: 5 | - override hydra_logging: colorlog 6 | - override job_logging: colorlog 7 | 8 | # output directory, generated dynamically on each run 9 | run: 10 | dir: ${paths.log_dir}/${task_name}/${run_name}/runs/${now:%Y-%m-%d}_${now:%H-%M-%S} 11 | sweep: 12 | dir: ${paths.log_dir}/${task_name}/${run_name}/multiruns/${now:%Y-%m-%d}_${now:%H-%M-%S} 13 | subdir: ${hydra.job.num} 14 | 15 | job_logging: 16 | handlers: 17 | file: 18 | # Incorporates fix from https://github.com/facebookresearch/hydra/pull/2242 19 | filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log 20 | -------------------------------------------------------------------------------- /tools/CosyVoice/third_party/Matcha-TTS/configs/local/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/CosyVoice/third_party/Matcha-TTS/configs/local/.gitkeep -------------------------------------------------------------------------------- /tools/CosyVoice/third_party/Matcha-TTS/configs/logger/aim.yaml: -------------------------------------------------------------------------------- 1 | # https://aimstack.io/ 2 | 3 | # example usage in lightning module: 4 | # https://github.com/aimhubio/aim/blob/main/examples/pytorch_lightning_track.py 5 | 6 | # open the Aim UI with the following command (run in the folder containing the `.aim` folder): 7 | # `aim up` 8 | 9 | aim: 10 | _target_: aim.pytorch_lightning.AimLogger 11 | repo: ${paths.root_dir} # .aim folder will be created here 12 | # repo: "aim://ip_address:port" # can instead provide IP address pointing to Aim remote tracking server which manages the repo, see https://aimstack.readthedocs.io/en/latest/using/remote_tracking.html# 13 | 14 | # aim allows to group runs under experiment name 15 | experiment: null # any string, set to "default" if not 
specified 16 | 17 | train_metric_prefix: "train/" 18 | val_metric_prefix: "val/" 19 | test_metric_prefix: "test/" 20 | 21 | # sets the tracking interval in seconds for system usage metrics (CPU, GPU, memory, etc.) 22 | system_tracking_interval: 10 # set to null to disable system metrics tracking 23 | 24 | # enable/disable logging of system params such as installed packages, git info, env vars, etc. 25 | log_system_params: true 26 | 27 | # enable/disable tracking console logs (default value is true) 28 | capture_terminal_logs: false # set to false to avoid infinite console log loop issue https://github.com/aimhubio/aim/issues/2550 29 | -------------------------------------------------------------------------------- /tools/CosyVoice/third_party/Matcha-TTS/configs/logger/comet.yaml: -------------------------------------------------------------------------------- 1 | # https://www.comet.ml 2 | 3 | comet: 4 | _target_: lightning.pytorch.loggers.comet.CometLogger 5 | api_key: ${oc.env:COMET_API_TOKEN} # api key is loaded from environment variable 6 | save_dir: "${paths.output_dir}" 7 | project_name: "lightning-hydra-template" 8 | rest_api_key: null 9 | # experiment_name: "" 10 | experiment_key: null # set to resume experiment 11 | offline: False 12 | prefix: "" 13 | -------------------------------------------------------------------------------- /tools/CosyVoice/third_party/Matcha-TTS/configs/logger/csv.yaml: -------------------------------------------------------------------------------- 1 | # csv logger built in lightning 2 | 3 | csv: 4 | _target_: lightning.pytorch.loggers.csv_logs.CSVLogger 5 | save_dir: "${paths.output_dir}" 6 | name: "csv/" 7 | prefix: "" 8 | -------------------------------------------------------------------------------- /tools/CosyVoice/third_party/Matcha-TTS/configs/logger/many_loggers.yaml: -------------------------------------------------------------------------------- 1 | # train with many loggers at once 2 | 3 | defaults: 4 | # - comet 5 | - 
csv 6 | # - mlflow 7 | # - neptune 8 | - tensorboard 9 | - wandb 10 | -------------------------------------------------------------------------------- /tools/CosyVoice/third_party/Matcha-TTS/configs/logger/mlflow.yaml: -------------------------------------------------------------------------------- 1 | # https://mlflow.org 2 | 3 | mlflow: 4 | _target_: lightning.pytorch.loggers.mlflow.MLFlowLogger 5 | # experiment_name: "" 6 | # run_name: "" 7 | tracking_uri: ${paths.log_dir}/mlflow/mlruns # run `mlflow ui` command inside the `logs/mlflow/` dir to open the UI 8 | tags: null 9 | # save_dir: "./mlruns" 10 | prefix: "" 11 | artifact_location: null 12 | # run_id: "" 13 | -------------------------------------------------------------------------------- /tools/CosyVoice/third_party/Matcha-TTS/configs/logger/neptune.yaml: -------------------------------------------------------------------------------- 1 | # https://neptune.ai 2 | 3 | neptune: 4 | _target_: lightning.pytorch.loggers.neptune.NeptuneLogger 5 | api_key: ${oc.env:NEPTUNE_API_TOKEN} # api key is loaded from environment variable 6 | project: username/lightning-hydra-template 7 | # name: "" 8 | log_model_checkpoints: True 9 | prefix: "" 10 | -------------------------------------------------------------------------------- /tools/CosyVoice/third_party/Matcha-TTS/configs/logger/tensorboard.yaml: -------------------------------------------------------------------------------- 1 | # https://www.tensorflow.org/tensorboard/ 2 | 3 | tensorboard: 4 | _target_: lightning.pytorch.loggers.tensorboard.TensorBoardLogger 5 | save_dir: "${paths.output_dir}/tensorboard/" 6 | name: null 7 | log_graph: False 8 | default_hp_metric: True 9 | prefix: "" 10 | # version: "" 11 | -------------------------------------------------------------------------------- /tools/CosyVoice/third_party/Matcha-TTS/configs/logger/wandb.yaml: -------------------------------------------------------------------------------- 1 | # https://wandb.ai 2 | 3 | 
wandb: 4 | _target_: lightning.pytorch.loggers.wandb.WandbLogger 5 | # name: "" # name of the run (normally generated by wandb) 6 | save_dir: "${paths.output_dir}" 7 | offline: False 8 | id: null # pass correct id to resume experiment! 9 | anonymous: null # enable anonymous logging 10 | project: "lightning-hydra-template" 11 | log_model: False # upload lightning ckpts 12 | prefix: "" # a string to put at the beginning of metric keys 13 | # entity: "" # set to name of your wandb team 14 | group: "" 15 | tags: [] 16 | job_type: "" 17 | -------------------------------------------------------------------------------- /tools/CosyVoice/third_party/Matcha-TTS/configs/model/cfm/default.yaml: -------------------------------------------------------------------------------- 1 | name: CFM 2 | solver: euler 3 | sigma_min: 1e-4 4 | -------------------------------------------------------------------------------- /tools/CosyVoice/third_party/Matcha-TTS/configs/model/decoder/default.yaml: -------------------------------------------------------------------------------- 1 | channels: [256, 256] 2 | dropout: 0.05 3 | attention_head_dim: 64 4 | n_blocks: 1 5 | num_mid_blocks: 2 6 | num_heads: 2 7 | act_fn: snakebeta 8 | -------------------------------------------------------------------------------- /tools/CosyVoice/third_party/Matcha-TTS/configs/model/encoder/default.yaml: -------------------------------------------------------------------------------- 1 | encoder_type: RoPE Encoder 2 | encoder_params: 3 | n_feats: ${model.n_feats} 4 | n_channels: 192 5 | filter_channels: 768 6 | filter_channels_dp: 256 7 | n_heads: 2 8 | n_layers: 6 9 | kernel_size: 3 10 | p_dropout: 0.1 11 | spk_emb_dim: 64 12 | n_spks: 1 13 | prenet: true 14 | 15 | duration_predictor_params: 16 | filter_channels_dp: ${model.encoder.encoder_params.filter_channels_dp} 17 | kernel_size: 3 18 | p_dropout: ${model.encoder.encoder_params.p_dropout} 19 | 
-------------------------------------------------------------------------------- /tools/CosyVoice/third_party/Matcha-TTS/configs/model/matcha.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - encoder: default.yaml 4 | - decoder: default.yaml 5 | - cfm: default.yaml 6 | - optimizer: adam.yaml 7 | 8 | _target_: matcha.models.matcha_tts.MatchaTTS 9 | n_vocab: 178 10 | n_spks: ${data.n_spks} 11 | spk_emb_dim: 64 12 | n_feats: 80 13 | data_statistics: ${data.data_statistics} 14 | out_size: null # Must be divisible by 4 15 | prior_loss: true 16 | -------------------------------------------------------------------------------- /tools/CosyVoice/third_party/Matcha-TTS/configs/model/optimizer/adam.yaml: -------------------------------------------------------------------------------- 1 | _target_: torch.optim.Adam 2 | _partial_: true 3 | lr: 1e-4 4 | weight_decay: 0.0 5 | -------------------------------------------------------------------------------- /tools/CosyVoice/third_party/Matcha-TTS/configs/paths/default.yaml: -------------------------------------------------------------------------------- 1 | # path to root directory 2 | # this requires PROJECT_ROOT environment variable to exist 3 | # you can replace it with "." 
if you want the root to be the current working directory 4 | root_dir: ${oc.env:PROJECT_ROOT} 5 | 6 | # path to data directory 7 | data_dir: ${paths.root_dir}/data/ 8 | 9 | # path to logging directory 10 | log_dir: ${paths.root_dir}/logs/ 11 | 12 | # path to output directory, created dynamically by hydra 13 | # path generation pattern is specified in `configs/hydra/default.yaml` 14 | # use it to store all files generated during the run, like ckpts and metrics 15 | output_dir: ${hydra:runtime.output_dir} 16 | 17 | # path to working directory 18 | work_dir: ${hydra:runtime.cwd} 19 | -------------------------------------------------------------------------------- /tools/CosyVoice/third_party/Matcha-TTS/configs/trainer/cpu.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - default 3 | 4 | accelerator: cpu 5 | devices: 1 6 | -------------------------------------------------------------------------------- /tools/CosyVoice/third_party/Matcha-TTS/configs/trainer/ddp.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - default 3 | 4 | strategy: ddp 5 | 6 | accelerator: gpu 7 | devices: [0,1] 8 | num_nodes: 1 9 | sync_batchnorm: True 10 | -------------------------------------------------------------------------------- /tools/CosyVoice/third_party/Matcha-TTS/configs/trainer/ddp_sim.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - default 3 | 4 | # simulate DDP on CPU, useful for debugging 5 | accelerator: cpu 6 | devices: 2 7 | strategy: ddp_spawn 8 | -------------------------------------------------------------------------------- /tools/CosyVoice/third_party/Matcha-TTS/configs/trainer/default.yaml: -------------------------------------------------------------------------------- 1 | _target_: lightning.pytorch.trainer.Trainer 2 | 3 | default_root_dir: ${paths.output_dir} 4 | 5 | max_epochs: -1 6 | 7 | 
accelerator: gpu 8 | devices: [0] 9 | 10 | # mixed precision for extra speed-up 11 | precision: 16-mixed 12 | 13 | # perform a validation loop every N training epochs 14 | check_val_every_n_epoch: 1 15 | 16 | # set True to ensure deterministic results 17 | # makes training slower but gives more reproducibility than just setting seeds 18 | deterministic: False 19 | 20 | gradient_clip_val: 5.0 21 | -------------------------------------------------------------------------------- /tools/CosyVoice/third_party/Matcha-TTS/configs/trainer/gpu.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - default 3 | 4 | accelerator: gpu 5 | devices: 1 6 | -------------------------------------------------------------------------------- /tools/CosyVoice/third_party/Matcha-TTS/configs/trainer/mps.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - default 3 | 4 | accelerator: mps 5 | devices: 1 6 | -------------------------------------------------------------------------------- /tools/CosyVoice/third_party/Matcha-TTS/matcha/VERSION: -------------------------------------------------------------------------------- 1 | 0.0.5.1 2 | -------------------------------------------------------------------------------- /tools/CosyVoice/third_party/Matcha-TTS/matcha/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/CosyVoice/third_party/Matcha-TTS/matcha/__init__.py -------------------------------------------------------------------------------- /tools/CosyVoice/third_party/Matcha-TTS/matcha/hifigan/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Jungil Kong 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and 
associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /tools/CosyVoice/third_party/Matcha-TTS/matcha/hifigan/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/CosyVoice/third_party/Matcha-TTS/matcha/hifigan/__init__.py -------------------------------------------------------------------------------- /tools/CosyVoice/third_party/Matcha-TTS/matcha/hifigan/config.py: -------------------------------------------------------------------------------- 1 | v1 = { 2 | "resblock": "1", 3 | "num_gpus": 0, 4 | "batch_size": 16, 5 | "learning_rate": 0.0004, 6 | "adam_b1": 0.8, 7 | "adam_b2": 0.99, 8 | "lr_decay": 0.999, 9 | "seed": 1234, 10 | "upsample_rates": [8, 8, 2, 2], 11 | "upsample_kernel_sizes": [16, 16, 4, 4], 12 | "upsample_initial_channel": 512, 13 | "resblock_kernel_sizes": [3, 7, 11], 14 | "resblock_dilation_sizes": [[1, 
class AttrDict(dict):
    """Dictionary whose entries are also reachable as attributes.

    The instance ``__dict__`` is bound to the mapping itself, so ``d.key``
    and ``d["key"]`` always read and write the same entry.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Alias attribute storage to the dict so both views stay in sync.
        self.__dict__ = self


def build_env(config, config_name, path):
    """Copy the config file ``config`` to ``path/config_name``.

    ``path`` is created when missing. Nothing is done when ``config`` is
    already the destination path (plain string comparison).
    """
    t_path = os.path.join(path, config_name)
    if config != t_path:
        os.makedirs(path, exist_ok=True)
        # Reuse the destination computed above instead of joining it again.
        shutil.copyfile(config, t_path)


def plot_spectrogram(spectrogram):
    """Draw ``spectrogram`` as an image with a colorbar and return the figure."""
    fig, ax = plt.subplots(figsize=(10, 2))
    im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none")
    plt.colorbar(im, ax=ax)

    # Render before closing so the returned figure already holds drawn content.
    fig.canvas.draw()
    plt.close()

    return fig


def init_weights(m, mean=0.0, std=0.01):
    """Fill ``m.weight`` from N(mean, std) when the class name contains "Conv"."""
    classname = m.__class__.__name__
    if "Conv" in classname:
        m.weight.data.normal_(mean, std)


def apply_weight_norm(m):
    """Apply ``weight_norm`` to ``m`` when its class name contains "Conv"."""
    classname = m.__class__.__name__
    if "Conv" in classname:
        weight_norm(m)


def get_padding(kernel_size, dilation=1):
    """Return ``(kernel_size * dilation - dilation) / 2`` truncated to an int."""
    return int((kernel_size * dilation - dilation) / 2)


def load_checkpoint(filepath, device):
    """Load a checkpoint from ``filepath`` and map its storages to ``device``.

    Raises:
        FileNotFoundError: if ``filepath`` is not an existing file.
    """
    # An explicit check instead of ``assert``: asserts vanish under ``python -O``.
    if not os.path.isfile(filepath):
        raise FileNotFoundError(f"Checkpoint file not found: '{filepath}'")
    print(f"Loading '{filepath}'")
    # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
    checkpoint_dict = torch.load(filepath, map_location=device)
    print("Complete.")
    return checkpoint_dict


def save_checkpoint(filepath, obj):
    """Serialize ``obj`` to ``filepath`` with ``torch.save``."""
    print(f"Saving checkpoint to {filepath}")
    torch.save(obj, filepath)
    print("Complete.")


def scan_checkpoint(cp_dir, prefix):
    """Return the lexicographically last ``prefix????????`` path in ``cp_dir``.

    The pattern matches exactly eight characters after ``prefix``. Returns
    ``None`` when nothing matches.
    """
    pattern = os.path.join(cp_dir, prefix + "????????")
    cp_list = glob.glob(pattern)
    if not cp_list:
        return None
    # max() picks the same element as sorted(...)[-1] without sorting the list.
    return max(cp_list)
def maximum_path(value, mask):
    """Cython optimised version.

    value: [b, t_x, t_y]
    mask: [b, t_x, t_y]

    Returns the 0/1 path written by ``maximum_path_c`` as a tensor on the
    device and with the dtype of ``value * mask``.
    """
    value = value * mask
    device = value.device
    dtype = value.dtype
    # The Cython kernel works on CPU float32 / int32 NumPy buffers.
    value = value.detach().cpu().numpy().astype(np.float32)
    # Allocate the int32 output directly rather than a float array that is
    # immediately converted.
    path = np.zeros_like(value, dtype=np.int32)
    mask = mask.detach().cpu().numpy()

    # Per-item valid lengths: count unmasked entries along each axis, read
    # from the first column of the reduced mask.
    t_x_max = mask.sum(1)[:, 0].astype(np.int32)
    t_y_max = mask.sum(2)[:, 0].astype(np.int32)
    # Fills ``path`` in place.
    maximum_path_c(path, value, t_x_max, t_y_max)
    return torch.from_numpy(path).to(device=device, dtype=dtype)
def get_pylogger(name: str = __name__) -> logging.Logger:
    """Initializes a multi-GPU-friendly python command line logger.

    Repeated calls with the same name return the same logger and patch its
    methods only once.

    :param name: The name of the logger, defaults to ``__name__``.

    :return: A logger object.
    """
    logger = logging.getLogger(name)

    # logging.getLogger returns the same object for a given name, so without
    # this guard every call would wrap the already wrapped methods again.
    if getattr(logger, "_rank_zero_patched", False):
        return logger

    # this ensures all logging levels get marked with the rank zero decorator
    # otherwise logs would get multiplied for each GPU process in multi-GPU setup
    logging_levels = ("debug", "info", "warning", "error", "exception", "fatal", "critical")
    for level in logging_levels:
        setattr(logger, level, rank_zero_only(getattr(logger, level)))
    logger._rank_zero_patched = True

    return logger
"pragma: nocover", 48 | "raise NotImplementedError", 49 | "raise NotImplementedError()", 50 | "if __name__ == .__main__.:", 51 | ] 52 | -------------------------------------------------------------------------------- /tools/CosyVoice/third_party/Matcha-TTS/requirements.txt: -------------------------------------------------------------------------------- 1 | # --------- pytorch --------- # 2 | torch>=2.0.0 3 | torchvision>=0.15.0 4 | lightning>=2.0.0 5 | torchmetrics>=0.11.4 6 | 7 | # --------- hydra --------- # 8 | hydra-core==1.3.2 9 | hydra-colorlog==1.2.0 10 | hydra-optuna-sweeper==1.2.0 11 | 12 | # --------- loggers --------- # 13 | # wandb 14 | # neptune-client 15 | # mlflow 16 | # comet-ml 17 | # aim>=3.16.2 # no lower than 3.16.2, see https://github.com/aimhubio/aim/issues/2550 18 | 19 | # --------- others --------- # 20 | rootutils # standardizing the project root setup 21 | pre-commit # hooks for applying linters on commit 22 | rich # beautiful text formatting in terminal 23 | pytest # tests 24 | # sh # for running bash commands in some tests (linux/macos only) 25 | phonemizer # phonemization of text 26 | tensorboard 27 | librosa 28 | Cython 29 | numpy 30 | einops 31 | inflect 32 | Unidecode 33 | scipy 34 | torchaudio 35 | matplotlib 36 | pandas 37 | conformer==0.3.2 38 | diffusers==0.25.0 39 | notebook 40 | ipywidgets 41 | gradio==3.43.2 42 | gdown 43 | wget 44 | seaborn 45 | piper_phonemize 46 | -------------------------------------------------------------------------------- /tools/CosyVoice/third_party/Matcha-TTS/scripts/schedule.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Schedule execution of many runs 3 | # Run from root folder with: bash scripts/schedule.sh 4 | 5 | python src/train.py trainer.max_epochs=5 logger=csv 6 | 7 | python src/train.py trainer.max_epochs=10 logger=csv 8 | -------------------------------------------------------------------------------- 
/tools/DiffSinger/.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | *.pyc 3 | __pycache__/ 4 | *.sh 5 | local_tools/ -------------------------------------------------------------------------------- /tools/DiffSinger/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Jinglin Liu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /tools/DiffSinger/configs/config_base.yaml: -------------------------------------------------------------------------------- 1 | # task 2 | binary_data_dir: '' 3 | work_dir: '' # experiment directory. 
4 | infer: false # infer 5 | seed: 1234 6 | debug: false 7 | save_codes: 8 | - configs 9 | - modules 10 | - tasks 11 | - utils 12 | - usr 13 | 14 | ############# 15 | # dataset 16 | ############# 17 | ds_workers: 1 18 | test_num: 100 19 | valid_num: 100 20 | endless_ds: false 21 | sort_by_len: true 22 | 23 | ######### 24 | # train and eval 25 | ######### 26 | load_ckpt: '' 27 | save_ckpt: true 28 | save_best: false 29 | num_ckpt_keep: 3 30 | clip_grad_norm: 0 31 | accumulate_grad_batches: 1 32 | log_interval: 100 33 | num_sanity_val_steps: 5 # steps of validation at the beginning 34 | check_val_every_n_epoch: 10 35 | val_check_interval: 2000 36 | max_epochs: 1000 37 | max_updates: 160000 38 | max_tokens: 31250 39 | max_sentences: 100000 40 | max_eval_tokens: -1 41 | max_eval_sentences: -1 42 | test_input_dir: '' 43 | -------------------------------------------------------------------------------- /tools/DiffSinger/configs/singing/base.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - configs/tts/base.yaml 3 | - configs/tts/base_zh.yaml 4 | 5 | 6 | datasets: [] 7 | test_prefixes: [] 8 | test_num: 0 9 | valid_num: 0 10 | 11 | pre_align_cls: data_gen.singing.pre_align.SingingPreAlign 12 | binarizer_cls: data_gen.singing.binarize.SingingBinarizer 13 | pre_align_args: 14 | use_tone: false # for ZH 15 | forced_align: mfa 16 | use_sox: true 17 | hop_size: 128 # Hop size. 18 | fft_size: 512 # FFT size. 19 | win_size: 512 # Window size. 20 | max_frames: 8000 21 | fmin: 50 # Minimum freq in mel basis calculation. 22 | fmax: 11025 # Maximum frequency in mel basis calculation. 
23 | pitch_type: frame 24 | 25 | hidden_size: 256 26 | mel_loss: "ssim:0.5|l1:0.5" 27 | lambda_f0: 0.0 28 | lambda_uv: 0.0 29 | lambda_energy: 0.0 30 | lambda_ph_dur: 0.0 31 | lambda_sent_dur: 0.0 32 | lambda_word_dur: 0.0 33 | predictor_grad: 0.0 34 | use_spk_embed: true 35 | use_spk_id: false 36 | 37 | max_tokens: 20000 38 | max_updates: 400000 39 | num_spk: 100 40 | save_f0: true 41 | use_gt_dur: true 42 | use_gt_f0: true 43 | -------------------------------------------------------------------------------- /tools/DiffSinger/configs/singing/fs2.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - configs/tts/fs2.yaml 3 | - configs/singing/base.yaml 4 | -------------------------------------------------------------------------------- /tools/DiffSinger/configs/tts/base_zh.yaml: -------------------------------------------------------------------------------- 1 | pre_align_args: 2 | txt_processor: zh_g2pM 3 | binarizer_cls: data_gen.tts.binarizer_zh.ZhBinarizer -------------------------------------------------------------------------------- /tools/DiffSinger/configs/tts/hifigan.yaml: -------------------------------------------------------------------------------- 1 | base_config: configs/tts/pwg.yaml 2 | task_cls: tasks.vocoder.hifigan.HifiGanTask 3 | resblock: "1" 4 | adam_b1: 0.8 5 | adam_b2: 0.99 6 | upsample_rates: [ 8,8,2,2 ] 7 | upsample_kernel_sizes: [ 16,16,4,4 ] 8 | upsample_initial_channel: 128 9 | resblock_kernel_sizes: [ 3,7,11 ] 10 | resblock_dilation_sizes: [ [ 1,3,5 ], [ 1,3,5 ], [ 1,3,5 ] ] 11 | 12 | lambda_mel: 45.0 13 | 14 | max_samples: 8192 15 | max_sentences: 16 16 | 17 | generator_params: 18 | lr: 0.0002 # Generator's learning rate. 19 | aux_context_window: 0 # Context window size for auxiliary feature. 20 | discriminator_optimizer_params: 21 | lr: 0.0002 # Discriminator's learning rate. 
-------------------------------------------------------------------------------- /tools/DiffSinger/configs/tts/lj/base_mel2wav.yaml: -------------------------------------------------------------------------------- 1 | raw_data_dir: 'data/raw/LJSpeech-1.1' 2 | processed_data_dir: 'data/processed/ljspeech' 3 | binary_data_dir: 'data/binary/ljspeech_wav' 4 | -------------------------------------------------------------------------------- /tools/DiffSinger/configs/tts/lj/base_text2mel.yaml: -------------------------------------------------------------------------------- 1 | raw_data_dir: 'data/raw/LJSpeech-1.1' 2 | processed_data_dir: 'data/processed/ljspeech' 3 | binary_data_dir: 'data/binary/ljspeech' 4 | pre_align_cls: data_gen.tts.lj.pre_align.LJPreAlign 5 | 6 | pitch_type: cwt 7 | mel_loss: l1 8 | num_test_samples: 20 9 | test_ids: [ 68, 70, 74, 87, 110, 172, 190, 215, 231, 294, 10 | 316, 324, 402, 422, 485, 500, 505, 508, 509, 519 ] 11 | use_energy_embed: false 12 | test_num: 523 13 | valid_num: 348 -------------------------------------------------------------------------------- /tools/DiffSinger/configs/tts/lj/fs2.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - configs/tts/fs2.yaml 3 | - configs/tts/lj/base_text2mel.yaml -------------------------------------------------------------------------------- /tools/DiffSinger/configs/tts/lj/hifigan.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - configs/tts/hifigan.yaml 3 | - configs/tts/lj/base_mel2wav.yaml -------------------------------------------------------------------------------- /tools/DiffSinger/configs/tts/lj/pwg.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - configs/tts/pwg.yaml 3 | - configs/tts/lj/base_mel2wav.yaml -------------------------------------------------------------------------------- 
/tools/DiffSinger/data/processed/ljspeech/dict.txt: -------------------------------------------------------------------------------- 1 | ! ! 2 | , , 3 | . . 4 | ; ; 5 | 6 | 7 | ? ? 8 | AA0 AA0 9 | AA1 AA1 10 | AA2 AA2 11 | AE0 AE0 12 | AE1 AE1 13 | AE2 AE2 14 | AH0 AH0 15 | AH1 AH1 16 | AH2 AH2 17 | AO0 AO0 18 | AO1 AO1 19 | AO2 AO2 20 | AW0 AW0 21 | AW1 AW1 22 | AW2 AW2 23 | AY0 AY0 24 | AY1 AY1 25 | AY2 AY2 26 | B B 27 | CH CH 28 | D D 29 | DH DH 30 | EH0 EH0 31 | EH1 EH1 32 | EH2 EH2 33 | ER0 ER0 34 | ER1 ER1 35 | ER2 ER2 36 | EY0 EY0 37 | EY1 EY1 38 | EY2 EY2 39 | F F 40 | G G 41 | HH HH 42 | IH0 IH0 43 | IH1 IH1 44 | IH2 IH2 45 | IY0 IY0 46 | IY1 IY1 47 | IY2 IY2 48 | JH JH 49 | K K 50 | L L 51 | M M 52 | N N 53 | NG NG 54 | OW0 OW0 55 | OW1 OW1 56 | OW2 OW2 57 | OY0 OY0 58 | OY1 OY1 59 | OY2 OY2 60 | P P 61 | R R 62 | S S 63 | SH SH 64 | T T 65 | TH TH 66 | UH0 UH0 67 | UH1 UH1 68 | UH2 UH2 69 | UW0 UW0 70 | UW1 UW1 71 | UW2 UW2 72 | V V 73 | W W 74 | Y Y 75 | Z Z 76 | ZH ZH 77 | | | 78 | -------------------------------------------------------------------------------- /tools/DiffSinger/data/processed/ljspeech/phone_set.json: -------------------------------------------------------------------------------- 1 | ["!", ",", ".", ";", "", "", "?", "AA0", "AA1", "AA2", "AE0", "AE1", "AE2", "AH0", "AH1", "AH2", "AO0", "AO1", "AO2", "AW0", "AW1", "AW2", "AY0", "AY1", "AY2", "B", "CH", "D", "DH", "EH0", "EH1", "EH2", "ER0", "ER1", "ER2", "EY0", "EY1", "EY2", "F", "G", "HH", "IH0", "IH1", "IH2", "IY0", "IY1", "IY2", "JH", "K", "L", "M", "N", "NG", "OW0", "OW1", "OW2", "OY0", "OY1", "OY2", "P", "R", "S", "SH", "T", "TH", "UH0", "UH1", "UH2", "UW0", "UW1", "UW2", "V", "W", "Y", "Z", "ZH", "|"] -------------------------------------------------------------------------------- /tools/DiffSinger/data/有何不可.json: -------------------------------------------------------------------------------- 1 | {"text": 
"天空好想下雨,我好想住你隔壁。傻站在你家楼下,抬起头数乌云。如果场景里出现一架钢琴,我会唱歌给你听。哪怕(好)多盆水往下淋,夏天快要过去。请你少买冰淇淋,天凉就别穿短裙。别再那么淘气,如果有时不那么开心。我愿意将格洛米借给你,你其实明白我心意。为你唱这首歌没有什么风格,它仅仅代表着我想给你快乐。为你解冻冰河 为你做一只扑火的飞蛾,没有什么事情是不值得。为你唱这首歌没有什么风格,它仅仅代表着我希望你快乐。为你辗转反侧为你放弃世界有何不可,夏末秋凉里带一点温热有换季的颜色。天空好想下雨,我好想住你隔壁。傻站在你家楼下,抬起头数乌云。如果场景里出现一架钢琴,我会唱歌给你听。哪怕(好)多盆水往下淋,夏天快要过去。请你少买冰淇淋,天凉就别穿短裙。别再那么淘气,如果有时不那么开心。我愿意将格洛米借给你,你其实明白我心意。为你唱这首歌没有什么风格,它仅仅代表着我想给你快乐。为你解冻冰河为你做一只扑火的飞蛾,没有什么事情是不值得。为你唱这首歌没有什么风格,它仅仅代表着我希望你快乐。为你辗转反侧为你放弃世界有何不可,夏末秋凉里带一点温热。为你解冻冰河为你做一只扑火的飞蛾,没有什么事情是不值得。为你唱这首歌没有什么风格,它仅仅代表着我希望你快乐。为你辗转反侧为你放弃世界有何不可,夏末秋凉里带一点温热有换季的颜色。", 2 | "notes": } -------------------------------------------------------------------------------- /tools/DiffSinger/data_gen/tts/bin/binarize.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | os.environ["OMP_NUM_THREADS"] = "1" 4 | 5 | import importlib 6 | from utils.hparams import set_hparams, hparams 7 | 8 | 9 | def binarize(): 10 | binarizer_cls = hparams.get("binarizer_cls", 'data_gen.tts.base_binarizer.BaseBinarizer') 11 | pkg = ".".join(binarizer_cls.split(".")[:-1]) 12 | cls_name = binarizer_cls.split(".")[-1] 13 | binarizer_cls = getattr(importlib.import_module(pkg), cls_name) 14 | print("| Binarizer: ", binarizer_cls) 15 | binarizer_cls().process() 16 | 17 | 18 | if __name__ == '__main__': 19 | set_hparams() 20 | binarize() 21 | -------------------------------------------------------------------------------- /tools/DiffSinger/data_gen/tts/txt_processors/base_text_processor.py: -------------------------------------------------------------------------------- 1 | class BaseTxtProcessor: 2 | @staticmethod 3 | def sp_phonemes(): 4 | return ['|'] 5 | 6 | @classmethod 7 | def process(cls, txt, pre_align_args): 8 | raise NotImplementedError 9 | -------------------------------------------------------------------------------- /tools/DiffSinger/inference/svs/opencpop/map.py: -------------------------------------------------------------------------------- 1 | def 
cpop_pinyin2ph_func(): 2 | # In the README file of opencpop dataset, they defined a "pinyin to phoneme mapping table" 3 | pinyin2phs = {'AP': 'AP', 'SP': 'SP'} 4 | with open('inference/svs/opencpop/cpop_pinyin2ph.txt') as rf: 5 | for line in rf.readlines(): 6 | elements = [x.strip() for x in line.split('|') if x.strip() != ''] 7 | pinyin2phs[elements[0]] = elements[1] 8 | return pinyin2phs -------------------------------------------------------------------------------- /tools/DiffSinger/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/DiffSinger/modules/__init__.py -------------------------------------------------------------------------------- /tools/DiffSinger/modules/parallel_wavegan/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/DiffSinger/modules/parallel_wavegan/__init__.py -------------------------------------------------------------------------------- /tools/DiffSinger/modules/parallel_wavegan/layers/__init__.py: -------------------------------------------------------------------------------- 1 | from .causal_conv import * # NOQA 2 | from .pqmf import * # NOQA 3 | from .residual_block import * # NOQA 4 | from modules.parallel_wavegan.layers.residual_stack import * # NOQA 5 | from .upsample import * # NOQA 6 | -------------------------------------------------------------------------------- /tools/DiffSinger/modules/parallel_wavegan/losses/__init__.py: -------------------------------------------------------------------------------- 1 | from .stft_loss import * # NOQA 2 | -------------------------------------------------------------------------------- /tools/DiffSinger/modules/parallel_wavegan/models/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .melgan import * # NOQA 2 | from .parallel_wavegan import * # NOQA 3 | -------------------------------------------------------------------------------- /tools/DiffSinger/modules/parallel_wavegan/optimizers/__init__.py: -------------------------------------------------------------------------------- 1 | from torch.optim import * # NOQA 2 | from .radam import * # NOQA 3 | -------------------------------------------------------------------------------- /tools/DiffSinger/modules/parallel_wavegan/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import * # NOQA 2 | -------------------------------------------------------------------------------- /tools/DiffSinger/requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib 2 | librosa==0.8.0 3 | tqdm 4 | pandas 5 | numba==0.53.1 6 | numpy==1.19.2 7 | scipy==1.5.4 8 | PyYAML==5.3.1 9 | tensorboardX 10 | pyloudnorm 11 | setuptools>=41.0.0 12 | g2p_en 13 | resemblyzer 14 | webrtcvad 15 | tensorboard==2.6.0 16 | scikit-learn==0.24.1 17 | scikit-image==0.16.2 18 | textgrid 19 | jiwer 20 | pycwt 21 | PyWavelets 22 | praat-parselmouth==0.3.3 23 | jieba 24 | einops 25 | chardet 26 | pretty-midi==0.2.9 27 | pytorch-lightning==0.7.1 28 | h5py==3.1.0 29 | pypinyin==0.39.0 30 | g2pM==0.1.2.5 -------------------------------------------------------------------------------- /tools/DiffSinger/requirements_3090.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.15.0 2 | appdirs==1.4.4 3 | audioread==2.1.9 4 | beautifulsoup4==4.10.0 5 | certifi==2021.10.8 6 | cffi==1.15.0 7 | charset-normalizer==2.0.7 8 | cycler==0.11.0 9 | Cython==0.29.24 10 | decorator==4.4.2 11 | dlib==19.22.1 12 | einops==0.3.2 13 | future==0.18.2 14 | g2p-en==2.1.0 15 | google==3.0.0 16 | grpcio==1.42.0 17 | 
h5py==2.8.0 18 | horology==1.2.0 19 | idna==3.3 20 | imageio==2.10.1 21 | imageio-ffmpeg==0.4.5 22 | importlib-metadata==4.8.1 23 | joblib==1.1.0 24 | kiwisolver==1.3.2 25 | librosa==0.8.0 26 | llvmlite==0.31.0 27 | Markdown==3.3.4 28 | matplotlib==3.4.3 29 | miditoolkit==0.1.7 30 | moviepy==1.0.3 31 | numba==0.48.0 32 | numpy==1.20.0 33 | opencv-python==4.5.4.58 34 | packaging==21.2 35 | pandas==1.3.4 36 | Pillow==8.4.0 37 | pooch==1.5.2 38 | praat-parselmouth==0.3.3 39 | proglog==0.1.9 40 | protobuf==3.19.1 41 | pycparser==2.20 42 | pycwt==0.3.0a22 43 | pydub==0.25.1 44 | pyloudnorm==0.1.0 45 | pyparsing==2.4.7 46 | pypinyin==0.43.0 47 | python-dateutil==2.8.2 48 | pytorch-lightning==0.7.1 49 | pytorch-ssim==0.1 50 | pytz==2021.3 51 | pyworld==0.3.0 52 | PyYAML==6.0 53 | requests==2.26.0 54 | resampy==0.2.2 55 | Resemblyzer==0.1.1.dev0 56 | scikit-image==0.16.2 57 | scikit-learn==0.22 58 | scipy==1.3.0 59 | six==1.16.0 60 | sklearn==0.0 61 | SoundFile==0.10.3.post1 62 | soupsieve==2.3 63 | sympy==1.9 64 | tensorboard==1.15.0 65 | tensorboardX==2.4 66 | test-tube==0.7.5 67 | TextGrid==1.5 68 | torch @ https://download.pytorch.org/whl/nightly/cu113/torch-1.10.0.dev20210907%2Bcu113-cp37-cp37m-linux_x86_64.whl 69 | torchvision==0.9.1 70 | tqdm==4.62.3 71 | typing-extensions==3.10.0.2 72 | urllib3==1.26.7 73 | uuid==1.30 74 | webrtcvad==2.0.10 75 | Werkzeug==2.0.2 76 | zipp==3.6.0 77 | -------------------------------------------------------------------------------- /tools/DiffSinger/resources/apply_form.md: -------------------------------------------------------------------------------- 1 | # The way to apply for PopCS 2 | Thanks for your attention to our works. Please write the email to jinglinliu@zju.edu.cn with: 3 | 4 | " 5 | 6 | name: *** 7 | 8 | affiliations: *** (school or institution) 9 | 10 | research fields: *** 11 | 12 | We want to apply for PopCS and agree to the dataset license: CC by-nc-sa 4.0 (NonCommercial!). 
13 | 14 | We accept full responsibility for our use of the dataset and shall defend and indemnify the authors of DiffSinger, against any and all claims arising from our use of the dataset, including but not limited to our use of any copies of copyrighted audio files that we may create from the dataset. 15 | 16 | We hereby represent that we are fully authorized to enter into this agreement on behalf of my employer. 17 | 18 | We will cite your paper if these codes or data have been used. We will not distribute the download link to others without informing the authors of DiffSinger. 19 | 20 | " 21 | 22 | Then we will provide the download link to you. 23 | 24 | **Please note that, if you are using PopCS, it means that you have accepted the terms above.** 25 | 26 | **Please use your Official Email Address (like xxx@zju.edu.cn)! Thank you!** -------------------------------------------------------------------------------- /tools/DiffSinger/resources/diffspeech-fs2-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/DiffSinger/resources/diffspeech-fs2-1.png -------------------------------------------------------------------------------- /tools/DiffSinger/resources/diffspeech-fs2-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/DiffSinger/resources/diffspeech-fs2-2.png -------------------------------------------------------------------------------- /tools/DiffSinger/resources/diffspeech-fs2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/DiffSinger/resources/diffspeech-fs2.png -------------------------------------------------------------------------------- 
/tools/DiffSinger/resources/model_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/DiffSinger/resources/model_a.png -------------------------------------------------------------------------------- /tools/DiffSinger/resources/model_b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/DiffSinger/resources/model_b.png -------------------------------------------------------------------------------- /tools/DiffSinger/resources/tfb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/DiffSinger/resources/tfb.png -------------------------------------------------------------------------------- /tools/DiffSinger/tasks/run.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | from utils.hparams import set_hparams, hparams 3 | 4 | 5 | def run_task(): 6 | assert hparams['task_cls'] != '' 7 | pkg = ".".join(hparams["task_cls"].split(".")[:-1]) 8 | cls_name = hparams["task_cls"].split(".")[-1] 9 | task_cls = getattr(importlib.import_module(pkg), cls_name) 10 | task_cls.start() 11 | 12 | 13 | if __name__ == '__main__': 14 | set_hparams() 15 | run_task() 16 | -------------------------------------------------------------------------------- /tools/DiffSinger/usr/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/DiffSinger/usr/.gitkeep -------------------------------------------------------------------------------- /tools/DiffSinger/usr/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/DiffSinger/usr/__init__.py -------------------------------------------------------------------------------- /tools/DiffSinger/usr/configs/base.yaml: -------------------------------------------------------------------------------- 1 | task_cls: usr.task.DiffFsTask 2 | pitch_type: frame 3 | timesteps: 100 4 | dilation_cycle_length: 1 5 | residual_layers: 20 6 | residual_channels: 256 7 | lr: 0.001 8 | decay_steps: 50000 9 | keep_bins: 80 10 | spec_min: [ ] 11 | spec_max: [ ] 12 | 13 | content_cond_steps: [ ] # [ 0, 10000 ] 14 | spk_cond_steps: [ ] # [ 0, 10000 ] 15 | # train and eval 16 | fs2_ckpt: '' 17 | max_updates: 400000 18 | # max_updates: 200000 19 | use_gt_dur: true 20 | use_gt_f0: true 21 | gen_tgt_spk_id: -1 22 | max_sentences: 48 23 | num_sanity_val_steps: 1 24 | num_valid_plots: 1 25 | -------------------------------------------------------------------------------- /tools/DiffSinger/usr/configs/lj_ds_pndm.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - ./lj_ds_beta6.yaml 3 | 4 | fs2_ckpt: '' 5 | gaussian_start: True 6 | max_beta: 0.02 7 | timesteps: 1000 8 | K_step: 1000 9 | pndm_speedup: 10 10 | 11 | pitch_type: frame 12 | use_pitch_embed: false # using diffusion to model pitch curve 13 | lambda_f0: 0. 14 | lambda_uv: 0. 15 | #rel_pos: true 16 | 17 | max_updates: 320000 18 | -------------------------------------------------------------------------------- /tools/DiffSinger/usr/configs/midi/cascade/opencs/aux_rel.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - configs/singing/fs2.yaml 3 | - usr/configs/midi/cascade/opencs/opencpop_statis.yaml 4 | 5 | audio_sample_rate: 24000 6 | hop_size: 128 # Hop size. 7 | fft_size: 512 # FFT size. 
8 | win_size: 512 # Window size. 9 | fmin: 30 10 | fmax: 12000 11 | min_level_db: -120 12 | 13 | binarization_args: 14 | with_wav: true 15 | with_spk_embed: false 16 | with_align: true 17 | raw_data_dir: 'data/raw/opencpop/segments' 18 | processed_data_dir: 'xxx' 19 | binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer 20 | 21 | 22 | binary_data_dir: 'data/binary/opencpop-midi-dp' 23 | use_midi: true # for midi exp 24 | use_gt_f0: false # for midi exp 25 | use_gt_dur: false # for further midi exp 26 | lambda_f0: 1.0 27 | lambda_uv: 1.0 28 | #lambda_energy: 0.1 29 | lambda_ph_dur: 1.0 30 | lambda_sent_dur: 1.0 31 | lambda_word_dur: 1.0 32 | predictor_grad: 0.1 33 | pe_enable: false 34 | pe_ckpt: '' 35 | 36 | num_spk: 1 37 | test_prefixes: [ 38 | '2044', 39 | '2086', 40 | '2092', 41 | '2093', 42 | '2100', 43 | ] 44 | 45 | task_cls: usr.diffsinger_task.AuxDecoderMIDITask 46 | #vocoder: usr.singingvocoder.highgan.HighGAN 47 | #vocoder_ckpt: checkpoints/h_2_model/checkpoint-530000steps.pkl 48 | vocoder: vocoders.hifigan.HifiGAN 49 | vocoder_ckpt: checkpoints/0109_hifigan_bigpopcs_hop128 50 | 51 | use_nsf: true 52 | 53 | # config for experiments 54 | max_frames: 5000 55 | max_tokens: 40000 56 | predictor_layers: 5 57 | rel_pos: true 58 | dur_predictor_layers: 5 # * 59 | 60 | use_spk_embed: false 61 | num_valid_plots: 10 62 | max_updates: 160000 63 | save_gt: true -------------------------------------------------------------------------------- /tools/DiffSinger/usr/configs/midi/cascade/opencs/ds60_rel.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - usr/configs/popcs_ds_beta6.yaml 3 | - usr/configs/midi/cascade/opencs/opencpop_statis.yaml 4 | 5 | binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer 6 | binary_data_dir: 'data/binary/opencpop-midi-dp' 7 | 8 | #switch_midi2f0_step: 174000 9 | use_midi: true # for midi exp 10 | use_gt_f0: false # for midi exp 11 | use_gt_dur: false # for further midi exp 
12 | lambda_f0: 1.0 13 | lambda_uv: 1.0 14 | #lambda_energy: 0.1 15 | lambda_ph_dur: 1.0 16 | lambda_sent_dur: 1.0 17 | lambda_word_dur: 1.0 18 | predictor_grad: 0.1 19 | pe_enable: false 20 | pe_ckpt: '' 21 | 22 | fs2_ckpt: 'checkpoints/0302_opencpop_fs_midi/model_ckpt_steps_160000.ckpt' # 23 | #num_valid_plots: 0 24 | task_cls: usr.diffsinger_task.DiffSingerMIDITask 25 | 26 | K_step: 60 27 | max_tokens: 40000 28 | predictor_layers: 5 29 | dilation_cycle_length: 4 # * 30 | rel_pos: true 31 | dur_predictor_layers: 5 # * 32 | max_updates: 160000 33 | gaussian_start: false 34 | -------------------------------------------------------------------------------- /tools/DiffSinger/usr/configs/midi/e2e/opencpop/ds1000.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - usr/configs/popcs_ds_beta6.yaml 3 | - usr/configs/midi/cascade/opencs/opencpop_statis.yaml 4 | 5 | binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer 6 | binary_data_dir: 'data/binary/opencpop-midi-dp' 7 | 8 | #switch_midi2f0_step: 174000 9 | use_midi: true # for midi exp 10 | use_gt_dur: false # for further midi exp 11 | lambda_ph_dur: 1.0 12 | lambda_sent_dur: 1.0 13 | lambda_word_dur: 1.0 14 | predictor_grad: 0.1 15 | dur_predictor_layers: 5 # * 16 | 17 | 18 | fs2_ckpt: '' # 19 | #num_valid_plots: 0 20 | task_cls: usr.diffsinger_task.DiffSingerMIDITask 21 | 22 | # for diffusion schedule 23 | timesteps: 1000 24 | K_step: 1000 25 | max_beta: 0.02 26 | max_tokens: 36000 27 | max_updates: 320000 28 | gaussian_start: True 29 | pndm_speedup: 40 30 | 31 | use_pitch_embed: false 32 | use_gt_f0: false # for midi exp 33 | 34 | lambda_f0: 0. 35 | lambda_uv: 0. 
36 | dilation_cycle_length: 4 # * 37 | rel_pos: true 38 | predictor_layers: 5 39 | pe_enable: true 40 | pe_ckpt: 'checkpoints/0102_xiaoma_pe' 41 | 42 | 43 | -------------------------------------------------------------------------------- /tools/DiffSinger/usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - usr/configs/popcs_ds_beta6.yaml 3 | - usr/configs/midi/cascade/opencs/opencpop_statis.yaml 4 | 5 | binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer 6 | binary_data_dir: 'data/binary/opencpop-midi-dp' 7 | 8 | #switch_midi2f0_step: 174000 9 | use_midi: true # for midi exp 10 | use_gt_dur: false # for further midi exp 11 | lambda_ph_dur: 1.0 12 | lambda_sent_dur: 1.0 13 | lambda_word_dur: 1.0 14 | predictor_grad: 0.1 15 | dur_predictor_layers: 5 # * 16 | 17 | 18 | fs2_ckpt: '' # 19 | #num_valid_plots: 0 20 | task_cls: usr.diffsinger_task.DiffSingerMIDITask 21 | 22 | K_step: 100 23 | max_tokens: 40000 24 | max_updates: 160000 25 | gaussian_start: True 26 | 27 | use_pitch_embed: false 28 | use_gt_f0: false # for midi exp 29 | 30 | lambda_f0: 0. 31 | lambda_uv: 0. 
32 | dilation_cycle_length: 4 # * 33 | rel_pos: true 34 | predictor_layers: 5 35 | pe_enable: true 36 | pe_ckpt: 'checkpoints/0102_xiaoma_pe' 37 | -------------------------------------------------------------------------------- /tools/DiffSinger/usr/configs/midi/e2e/popcs/ds100_adj_rel.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - usr/configs/popcs_ds_beta6.yaml 3 | - usr/configs/midi/cascade/popcs/popcs_statis.yaml 4 | 5 | binarizer_cls: data_gen.singing.binarize.MidiSingingBinarizer 6 | binary_data_dir: 'data/binary/popcs-midi-dp' 7 | 8 | #switch_midi2f0_step: 174000 9 | use_midi: true # for midi exp 10 | use_gt_dur: false # for further midi exp 11 | lambda_ph_dur: 1.0 12 | lambda_sent_dur: 1.0 13 | lambda_word_dur: 1.0 14 | predictor_grad: 0.1 15 | dur_predictor_layers: 5 # * 16 | 17 | 18 | fs2_ckpt: '' # 19 | #num_valid_plots: 0 20 | task_cls: usr.diffsinger_task.DiffSingerMIDITask 21 | 22 | K_step: 100 23 | max_tokens: 40000 24 | max_updates: 160000 25 | gaussian_start: True 26 | 27 | use_pitch_embed: false 28 | use_gt_f0: false # for midi exp 29 | 30 | lambda_f0: 0. 31 | lambda_uv: 0. 32 | dilation_cycle_length: 4 # * 33 | rel_pos: true 34 | predictor_layers: 5 35 | pe_enable: true 36 | pe_ckpt: 'checkpoints/0102_xiaoma_pe' 37 | -------------------------------------------------------------------------------- /tools/DiffSinger/usr/configs/midi/pe.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - configs/tts/lj/fs2.yaml 3 | 4 | max_frames: 8000 5 | audio_sample_rate: 24000 6 | hop_size: 128 # Hop size. 7 | fft_size: 512 # FFT size. 8 | win_size: 512 # Window size. 
9 | fmin: 30 10 | fmax: 12000 11 | min_level_db: -120 12 | 13 | binary_data_dir: 'xxx' 14 | 15 | pitch_type: frame 16 | task_cls: tasks.tts.pe.PitchExtractionTask 17 | pitch_extractor_conv_layers: 2 18 | 19 | 20 | # config for experiments 21 | max_tokens: 20000 22 | use_spk_embed: false 23 | num_valid_plots: 10 24 | max_updates: 60000 -------------------------------------------------------------------------------- /tools/DiffSinger/usr/configs/popcs_ds_beta6_offline.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - ./popcs_ds_beta6.yaml 3 | 4 | fs2_ckpt: checkpoints/popcs_fs2_pmf0_1230/model_ckpt_steps_160000.ckpt # to be infer 5 | num_valid_plots: 0 6 | task_cls: usr.diffsinger_task.DiffSingerOfflineTask 7 | 8 | # tmp: 9 | #pe_enable: true 10 | #pe_ckpt: '' 11 | vocoder: vocoders.hifigan.HifiGAN 12 | vocoder_ckpt: checkpoints/0109_hifigan_bigpopcs_hop128 -------------------------------------------------------------------------------- /tools/DiffSinger/usr/configs/popcs_fs2.yaml: -------------------------------------------------------------------------------- 1 | base_config: 2 | - configs/singing/fs2.yaml 3 | 4 | audio_sample_rate: 24000 5 | hop_size: 128 # Hop size. 6 | fft_size: 512 # FFT size. 7 | win_size: 512 # Window size. 
8 | fmin: 30 9 | fmax: 12000 10 | min_level_db: -120 11 | 12 | binarization_args: 13 | with_wav: true 14 | with_spk_embed: false 15 | with_align: true 16 | raw_data_dir: 'data/raw/popcs' 17 | processed_data_dir: 'data/processed/popcs' 18 | binary_data_dir: 'data/binary/popcs-pmf0' 19 | num_spk: 1 20 | datasets: [ 21 | 'popcs', 22 | ] 23 | test_prefixes: [ 24 | 'popcs-说散就散', 25 | 'popcs-隐形的翅膀', 26 | ] 27 | 28 | task_cls: tasks.tts.fs2.FastSpeech2Task 29 | #vocoder: usr.singingvocoder.highgan.HighGAN 30 | #vocoder_ckpt: checkpoints/h_2_model/checkpoint-530000steps.pkl 31 | vocoder: vocoders.hifigan.HifiGAN 32 | vocoder_ckpt: checkpoints/0109_hifigan_bigpopcs_hop128 33 | use_nsf: true 34 | 35 | # config for experiments 36 | max_tokens: 18000 37 | use_spk_embed: false 38 | num_valid_plots: 10 39 | max_updates: 160000 40 | save_gt: true 41 | 42 | # tmp: 43 | #pe_enable: true 44 | #pe_ckpt: '' -------------------------------------------------------------------------------- /tools/DiffSinger/utils/training_utils.py: -------------------------------------------------------------------------------- 1 | from utils.hparams import hparams 2 | 3 | 4 | class RSQRTSchedule(object): 5 | def __init__(self, optimizer): 6 | super().__init__() 7 | self.optimizer = optimizer 8 | self.constant_lr = hparams['lr'] 9 | self.warmup_updates = hparams['warmup_updates'] 10 | self.hidden_size = hparams['hidden_size'] 11 | self.lr = hparams['lr'] 12 | for param_group in optimizer.param_groups: 13 | param_group['lr'] = self.lr 14 | self.step(0) 15 | 16 | def step(self, num_updates): 17 | constant_lr = self.constant_lr 18 | warmup = min(num_updates / self.warmup_updates, 1.0) 19 | rsqrt_decay = max(self.warmup_updates, num_updates) ** -0.5 20 | rsqrt_hidden = self.hidden_size ** -0.5 21 | self.lr = max(constant_lr * warmup * rsqrt_decay * rsqrt_hidden, 1e-7) 22 | for param_group in self.optimizer.param_groups: 23 | param_group['lr'] = self.lr 24 | return self.lr 25 | 26 | def get_lr(self): 27 | 
return self.optimizer.param_groups[0]['lr'] 28 | -------------------------------------------------------------------------------- /tools/DiffSinger/utils/tts_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from collections import defaultdict 4 | 5 | 6 | def make_positions(tensor, padding_idx): 7 | """Replace non-padding symbols with their position numbers. 8 | Position numbers begin at padding_idx+1. Padding symbols are ignored. 9 | """ 10 | # The series of casts and type-conversions here are carefully 11 | # balanced to both work with ONNX export and XLA. In particular XLA 12 | # prefers ints, cumsum defaults to output longs, and ONNX doesn't know 13 | # how to handle the dtype kwarg in cumsum. 14 | mask = tensor.ne(padding_idx).int() 15 | return ( 16 | torch.cumsum(mask, dim=1).type_as(mask) * mask 17 | ).long() + padding_idx 18 | 19 | 20 | def softmax(x, dim): 21 | return F.softmax(x, dim=dim, dtype=torch.float32) 22 | -------------------------------------------------------------------------------- /tools/DiffSinger/vocoders/__init__.py: -------------------------------------------------------------------------------- 1 | from vocoders import hifigan 2 | -------------------------------------------------------------------------------- /tools/DiffSinger/vocoders/base_vocoder.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | VOCODERS = {} 3 | 4 | 5 | def register_vocoder(cls): 6 | VOCODERS[cls.__name__.lower()] = cls 7 | VOCODERS[cls.__name__] = cls 8 | return cls 9 | 10 | 11 | def get_vocoder_cls(hparams): 12 | if hparams['vocoder'] in VOCODERS: 13 | return VOCODERS[hparams['vocoder']] 14 | else: 15 | vocoder_cls = hparams['vocoder'] 16 | pkg = ".".join(vocoder_cls.split(".")[:-1]) 17 | cls_name = vocoder_cls.split(".")[-1] 18 | vocoder_cls = getattr(importlib.import_module(pkg), cls_name) 19 | return 
vocoder_cls 20 | 21 | 22 | class BaseVocoder: 23 | def spec2wav(self, mel): 24 | """ 25 | 26 | :param mel: [T, 80] 27 | :return: wav: [T'] 28 | """ 29 | 30 | raise NotImplementedError 31 | 32 | @staticmethod 33 | def wav2spec(wav_fn): 34 | """ 35 | 36 | :param wav_fn: str 37 | :return: wav, mel: [T, 80] 38 | """ 39 | raise NotImplementedError 40 | -------------------------------------------------------------------------------- /tools/DiffSinger/vocoders/vocoder_utils.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | 3 | from utils.hparams import hparams 4 | import numpy as np 5 | 6 | 7 | def denoise(wav, v=0.1): 8 | spec = librosa.stft(y=wav, n_fft=hparams['fft_size'], hop_length=hparams['hop_size'], 9 | win_length=hparams['win_size'], pad_mode='constant') 10 | spec_m = np.abs(spec) 11 | spec_m = np.clip(spec_m - v, a_min=0, a_max=None) 12 | spec_a = np.angle(spec) 13 | 14 | return librosa.istft(spec_m * np.exp(1j * spec_a), hop_length=hparams['hop_size'], 15 | win_length=hparams['win_size']) 16 | -------------------------------------------------------------------------------- /tools/ImageBind/.assets/bird_audio.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/ImageBind/.assets/bird_audio.wav -------------------------------------------------------------------------------- /tools/ImageBind/.assets/bird_image.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/ImageBind/.assets/bird_image.jpg -------------------------------------------------------------------------------- /tools/ImageBind/.assets/car_audio.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/ImageBind/.assets/car_audio.wav -------------------------------------------------------------------------------- /tools/ImageBind/.assets/car_image.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/ImageBind/.assets/car_image.jpg -------------------------------------------------------------------------------- /tools/ImageBind/.assets/dog_audio.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/ImageBind/.assets/dog_audio.wav -------------------------------------------------------------------------------- /tools/ImageBind/.assets/dog_image.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/ImageBind/.assets/dog_image.jpg -------------------------------------------------------------------------------- /tools/ImageBind/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to ImageBind 2 | We want to make contributing to this project as easy and transparent as 3 | possible. 4 | 5 | ## Pull Requests 6 | We actively welcome your pull requests. 7 | 8 | 1. Fork the repo and create your branch from `main`. 9 | 2. If you've added code that should be tested, add tests. 10 | 3. If you've changed APIs, update the documentation. 11 | 4. Ensure the test suite passes. 12 | 5. Make sure your code lints. 13 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 14 | 15 | ## Contributor License Agreement ("CLA") 16 | In order to accept your pull request, we need you to submit a CLA. 
You only need 17 | to do this once to work on any of Meta's open source projects. 18 | 19 | Complete your CLA here: <https://code.facebook.com/cla> 20 | 21 | ## Issues 22 | We use GitHub issues to track public bugs. Please ensure your description is 23 | clear and has sufficient instructions to be able to reproduce the issue. 24 | 25 | Meta has a [bounty program](https://www.facebook.com/whitehat/) for the safe 26 | disclosure of security bugs. In those cases, please go through the process 27 | outlined on that page and do not file a public issue. 28 | 29 | ## License 30 | By contributing to ImageBind, you agree that your contributions will be licensed 31 | under the [LICENSE](LICENSE) file in the root directory of this source tree. 32 | -------------------------------------------------------------------------------- /tools/ImageBind/build/lib/imagebind/__init__.py: -------------------------------------------------------------------------------- 1 | from imagebind import data 2 | from imagebind.models import imagebind_model 3 | from imagebind.models.imagebind_model import ModalityType -------------------------------------------------------------------------------- /tools/ImageBind/build/lib/imagebind/bpe/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/ImageBind/build/lib/imagebind/bpe/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /tools/ImageBind/build/lib/imagebind/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/ImageBind/build/lib/imagebind/models/__init__.py -------------------------------------------------------------------------------- /tools/ImageBind/imagebind.egg-info/SOURCES.txt: 
-------------------------------------------------------------------------------- 1 | LICENSE 2 | README.md 3 | setup.py 4 | imagebind/__init__.py 5 | imagebind/data.py 6 | imagebind.egg-info/PKG-INFO 7 | imagebind.egg-info/SOURCES.txt 8 | imagebind.egg-info/dependency_links.txt 9 | imagebind.egg-info/top_level.txt 10 | imagebind/bpe/bpe_simple_vocab_16e6.txt.gz 11 | imagebind/models/__init__.py 12 | imagebind/models/helpers.py 13 | imagebind/models/imagebind_model.py 14 | imagebind/models/multimodal_preprocessors.py 15 | imagebind/models/transformer.py -------------------------------------------------------------------------------- /tools/ImageBind/imagebind.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | https://download.pytorch.org/whl/cu113 2 | -------------------------------------------------------------------------------- /tools/ImageBind/imagebind.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | imagebind 2 | -------------------------------------------------------------------------------- /tools/ImageBind/imagebind/__init__.py: -------------------------------------------------------------------------------- 1 | from imagebind import data 2 | from imagebind.models import imagebind_model 3 | from imagebind.models.imagebind_model import ModalityType -------------------------------------------------------------------------------- /tools/ImageBind/imagebind/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/ImageBind/imagebind/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /tools/ImageBind/imagebind/__pycache__/data.cpython-310.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/ImageBind/imagebind/__pycache__/data.cpython-310.pyc -------------------------------------------------------------------------------- /tools/ImageBind/imagebind/bpe/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/ImageBind/imagebind/bpe/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /tools/ImageBind/imagebind/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/ImageBind/imagebind/models/__init__.py -------------------------------------------------------------------------------- /tools/ImageBind/imagebind/models/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/ImageBind/imagebind/models/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /tools/ImageBind/imagebind/models/__pycache__/helpers.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/ImageBind/imagebind/models/__pycache__/helpers.cpython-310.pyc -------------------------------------------------------------------------------- /tools/ImageBind/imagebind/models/__pycache__/imagebind_model.cpython-310.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/ImageBind/imagebind/models/__pycache__/imagebind_model.cpython-310.pyc -------------------------------------------------------------------------------- /tools/ImageBind/imagebind/models/__pycache__/multimodal_preprocessors.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/ImageBind/imagebind/models/__pycache__/multimodal_preprocessors.cpython-310.pyc -------------------------------------------------------------------------------- /tools/ImageBind/imagebind/models/__pycache__/transformer.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/ImageBind/imagebind/models/__pycache__/transformer.cpython-310.pyc -------------------------------------------------------------------------------- /tools/ImageBind/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/ImageBind/requirements.txt -------------------------------------------------------------------------------- /tools/ImageBind/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | with open('requirements.txt') as f: 4 | required = f.read().splitlines() 5 | 6 | setup( 7 | name='imagebind', 8 | version='0.1.0', 9 | packages=find_packages(), 10 | package_data={ 11 | 'imagebind': ['bpe/bpe_simple_vocab_16e6.txt.gz'], 12 | }, 13 | description='A brief description of the package', 14 | long_description=open('README.md', encoding='utf-8').read(), 15 | long_description_content_type="text/markdown", 16 | 
url='https://github.com/facebookresearch/ImageBind', 17 | classifiers=[ 18 | 'Programming Language :: Python :: 3', 19 | 'License :: Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International', 20 | ], 21 | install_requires=required, 22 | dependency_links=['https://download.pytorch.org/whl/cu113'], 23 | ) 24 | -------------------------------------------------------------------------------- /tools/audio-preprocess/README.md: -------------------------------------------------------------------------------- 1 | # Fish Audio Preprocessor 2 | 3 | [![PyPI Version](https://img.shields.io/pypi/v/fish-audio-preprocess.svg)](https://pypi.python.org/pypi/fish-audio-preprocess) 4 | 5 | [中文文档](README.zh.md) 6 | 7 | This repo contains some scripts for audio processing. Main features include: 8 | 9 | - [x] Video/audio to wav 10 | - [x] Audio vocal separation 11 | - [x] Automatic audio slicing 12 | - [x] Audio loudness matching 13 | - [x] Audio data statistics (supports determining audio length) 14 | - [x] Audio resampling 15 | - [x] Audio transcription (.lab) 16 | - [x] Audio transcription via FunASR (use `--model-type funasr` to enable; see the code for detailed usage) 17 | - [ ] Audio transcription via WhisperX 18 | - [ ] Merge .lab files (example: `fap merge-lab ./dataset list.txt "{PATH}|spkname|JP|{TEXT}"`) 19 | 20 | ([ ] indicates not completed, [x] indicates completed) 21 | 22 | **This code has been tested on Ubuntu 22.04 / 20.04 + Python 3.10. If you encounter problems on other versions, feedback is welcome.** 23 | 24 | ## Getting Started: 25 | 26 | ``` 27 | pip install -e . 
28 | fap --help 29 | ``` 30 | 31 | ## Reference 32 | 33 | - [Batch Whisper](https://github.com/Blair-Johnson/batch-whisper) 34 | -------------------------------------------------------------------------------- /tools/audio-preprocess/README.zh.md: -------------------------------------------------------------------------------- 1 | # Fish Audio Preprocessor 2 | 3 | [![PyPI Version](https://img.shields.io/pypi/v/fish-audio-preprocess.svg)](https://pypi.python.org/pypi/fish-audio-preprocess) 4 | 5 | [English Document](README.md) 6 | 7 | 这个 Repo 包含了一些用于处理音频的脚本. 主要包含以下功能: 8 | 9 | - [x] 视频/音频转 wav 10 | - [x] 音频人声分离 11 | - [x] 音频自动切片 12 | - [x] 音频响度匹配 13 | - [x] 音频数据统计(支持判断音频长度) 14 | - [x] 音频重采样 15 | - [x] 音频打标 (.lab) 16 | - [x] 音频打标 FunASR(使用 `--model-type funasr` 开启, 详细使用方法可查看代码) 17 | - [ ] 音频打标 WhisperX 18 | - [ ] .lab 标注合并为 .list 文件 (示例: `fap merge-lab ./dataset list.txt "{PATH}|spkname|JP|{TEXT}"`) 19 | 20 | ([ ] 表示未完成, [x] 表示已完成) 21 | 22 | **本代码已在 Ubuntu 22.04 / 20.04 + Python 3.10 测试过, 如果在其他版本遇到问题, 欢迎反馈** 23 | 24 | ## 上手指南: 25 | 26 | ``` 27 | pip install -e . 28 | fap --help 29 | ``` 30 | 31 | ## 引用 32 | 33 | - [Batch Whisper](https://github.com/Blair-Johnson/batch-whisper) 34 | -------------------------------------------------------------------------------- /tools/audio-preprocess/fap-complete.zsh: -------------------------------------------------------------------------------- 1 | #compdef fap 2 | 3 | _fap_completion() { 4 | local -a completions 5 | local -a completions_with_descriptions 6 | local -a response 7 | (( ! 
$+commands[fap] )) && return 1 8 | 9 | response=("${(@f)$(env COMP_WORDS="${words[*]}" COMP_CWORD=$((CURRENT-1)) _FAP_COMPLETE=zsh_complete fap)}") 10 | 11 | for type key descr in ${response}; do 12 | if [[ "$type" == "plain" ]]; then 13 | if [[ "$descr" == "_" ]]; then 14 | completions+=("$key") 15 | else 16 | completions_with_descriptions+=("$key":"$descr") 17 | fi 18 | elif [[ "$type" == "dir" ]]; then 19 | _path_files -/ 20 | elif [[ "$type" == "file" ]]; then 21 | _path_files -f 22 | fi 23 | done 24 | 25 | if [ -n "$completions_with_descriptions" ]; then 26 | _describe -V unsorted completions_with_descriptions -U 27 | fi 28 | 29 | if [ -n "$completions" ]; then 30 | compadd -U -V unsorted -a completions 31 | fi 32 | } 33 | 34 | compdef _fap_completion fap; 35 | 36 | -------------------------------------------------------------------------------- /tools/audio-preprocess/fish_audio_preprocess.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 2.4 2 | Name: fish-audio-preprocess 3 | Version: 0.2.8 4 | Summary: Preprocess audio data 5 | Author-email: Lengyue 6 | License: Apache 7 | Requires-Python: >=3.9 8 | Description-Content-Type: text/markdown 9 | License-File: LICENSE 10 | Dynamic: license-file 11 | 12 | # Fish Audio Preprocessor 13 | 14 | [![PyPI Version](https://img.shields.io/pypi/v/fish-audio-preprocess.svg)](https://pypi.python.org/pypi/fish-audio-preprocess) 15 | 16 | [中文文档](README.zh.md) 17 | 18 | This repo contains some scripts for audio processing. 
Main features include: 19 | 20 | - [x] Video/audio to wav 21 | - [x] Audio vocal separation 22 | - [x] Automatic audio slicing 23 | - [x] Audio loudness matching 24 | - [x] Audio data statistics (supports determining audio length) 25 | - [x] Audio resampling 26 | - [x] Audio transcribe (.lab) 27 | - [x] Audio transcribe via FunASR (use `--model-type funasr` to enable, detailed usage can be found at code) 28 | - [ ] Audio transcribe via WhisperX 29 | - [ ] Merge .lab files (example: `fap merge-lab ./dataset list.txt "{PATH}|spkname|JP|{TEXT}"`) 30 | 31 | ([ ] indicates not completed, [x] indicates completed) 32 | 33 | **This code has been tested on Ubuntu 22.04 / 20.04 + Python 3.10. If you encounter problems on other versions, feedback is welcome.** 34 | 35 | ## Getting Started: 36 | 37 | ``` 38 | pip install -e . 39 | fap --help 40 | ``` 41 | 42 | ## Reference 43 | 44 | - [Batch Whisper](https://github.com/Blair-Johnson/batch-whisper) 45 | -------------------------------------------------------------------------------- /tools/audio-preprocess/fish_audio_preprocess.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /tools/audio-preprocess/fish_audio_preprocess.egg-info/entry_points.txt: -------------------------------------------------------------------------------- 1 | [console_scripts] 2 | fap = fish_audio_preprocess.cli.__main__:cli 3 | -------------------------------------------------------------------------------- /tools/audio-preprocess/fish_audio_preprocess.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | fish_audio_preprocess 2 | -------------------------------------------------------------------------------- /tools/audio-preprocess/fish_audio_preprocess/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/audio-preprocess/fish_audio_preprocess/__init__.py -------------------------------------------------------------------------------- /tools/audio-preprocess/fish_audio_preprocess/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/audio-preprocess/fish_audio_preprocess/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /tools/audio-preprocess/fish_audio_preprocess/cli/__main__.py: -------------------------------------------------------------------------------- 1 | import click 2 | import richuru 3 | from loguru import logger 4 | 5 | from fish_audio_preprocess.cli.merge_lab import merge_lab 6 | 7 | from .convert_to_wav import to_wav 8 | from .frequency import frequency 9 | from .length import length 10 | from .loudness_norm import loudness_norm 11 | from .merge_short import merge_short 12 | from .resample import resample 13 | from .separate_audio import separate 14 | from .slice_audio import slice_audio, slice_audio_v2 15 | from .transcribe import transcribe 16 | 17 | 18 | @click.group() 19 | @click.option("--debug/--no-debug", default=False) 20 | def cli(debug: bool): 21 | """An audio preprocessing CLI.""" 22 | 23 | if debug: 24 | richuru.install() 25 | logger.info("Debug mode is on") 26 | 27 | 28 | # Register subcommands 29 | cli.add_command(length) 30 | cli.add_command(frequency) 31 | 32 | cli.add_command(to_wav) 33 | cli.add_command(separate) 34 | cli.add_command(loudness_norm) 35 | cli.add_command(slice_audio) 36 | cli.add_command(slice_audio_v2) 37 | cli.add_command(resample) 38 | cli.add_command(transcribe) 39 | cli.add_command(merge_short) 40 | cli.add_command(merge_lab) 41 | 42 | 43 | if __name__ == "__main__": 44 | to_wav() 45 | 
-------------------------------------------------------------------------------- /tools/audio-preprocess/fish_audio_preprocess/cli/__pycache__/__main__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/audio-preprocess/fish_audio_preprocess/cli/__pycache__/__main__.cpython-310.pyc -------------------------------------------------------------------------------- /tools/audio-preprocess/fish_audio_preprocess/cli/__pycache__/convert_to_wav.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/audio-preprocess/fish_audio_preprocess/cli/__pycache__/convert_to_wav.cpython-310.pyc -------------------------------------------------------------------------------- /tools/audio-preprocess/fish_audio_preprocess/cli/__pycache__/frequency.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/audio-preprocess/fish_audio_preprocess/cli/__pycache__/frequency.cpython-310.pyc -------------------------------------------------------------------------------- /tools/audio-preprocess/fish_audio_preprocess/cli/__pycache__/length.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/audio-preprocess/fish_audio_preprocess/cli/__pycache__/length.cpython-310.pyc -------------------------------------------------------------------------------- /tools/audio-preprocess/fish_audio_preprocess/cli/__pycache__/loudness_norm.cpython-310.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/audio-preprocess/fish_audio_preprocess/cli/__pycache__/loudness_norm.cpython-310.pyc -------------------------------------------------------------------------------- /tools/audio-preprocess/fish_audio_preprocess/cli/__pycache__/merge_lab.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/audio-preprocess/fish_audio_preprocess/cli/__pycache__/merge_lab.cpython-310.pyc -------------------------------------------------------------------------------- /tools/audio-preprocess/fish_audio_preprocess/cli/__pycache__/merge_short.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/audio-preprocess/fish_audio_preprocess/cli/__pycache__/merge_short.cpython-310.pyc -------------------------------------------------------------------------------- /tools/audio-preprocess/fish_audio_preprocess/cli/__pycache__/resample.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/audio-preprocess/fish_audio_preprocess/cli/__pycache__/resample.cpython-310.pyc -------------------------------------------------------------------------------- /tools/audio-preprocess/fish_audio_preprocess/cli/__pycache__/separate_audio.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/audio-preprocess/fish_audio_preprocess/cli/__pycache__/separate_audio.cpython-310.pyc -------------------------------------------------------------------------------- 
/tools/audio-preprocess/fish_audio_preprocess/cli/__pycache__/slice_audio.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/audio-preprocess/fish_audio_preprocess/cli/__pycache__/slice_audio.cpython-310.pyc -------------------------------------------------------------------------------- /tools/audio-preprocess/fish_audio_preprocess/cli/__pycache__/transcribe.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/audio-preprocess/fish_audio_preprocess/cli/__pycache__/transcribe.cpython-310.pyc -------------------------------------------------------------------------------- /tools/audio-preprocess/fish_audio_preprocess/utils/__pycache__/file.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/audio-preprocess/fish_audio_preprocess/utils/__pycache__/file.cpython-310.pyc -------------------------------------------------------------------------------- /tools/audio-preprocess/fish_audio_preprocess/utils/__pycache__/loudness_norm.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/audio-preprocess/fish_audio_preprocess/utils/__pycache__/loudness_norm.cpython-310.pyc -------------------------------------------------------------------------------- /tools/audio-preprocess/fish_audio_preprocess/utils/__pycache__/separate_audio.cpython-310.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/audio-preprocess/fish_audio_preprocess/utils/__pycache__/separate_audio.cpython-310.pyc -------------------------------------------------------------------------------- /tools/audio-preprocess/fish_audio_preprocess/utils/__pycache__/slice_audio.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/audio-preprocess/fish_audio_preprocess/utils/__pycache__/slice_audio.cpython-310.pyc -------------------------------------------------------------------------------- /tools/audio-preprocess/fish_audio_preprocess/utils/__pycache__/slice_audio_v2.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/audio-preprocess/fish_audio_preprocess/utils/__pycache__/slice_audio_v2.cpython-310.pyc -------------------------------------------------------------------------------- /tools/audio-preprocess/fish_audio_preprocess/utils/__pycache__/transcribe.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/audio-preprocess/fish_audio_preprocess/utils/__pycache__/transcribe.cpython-310.pyc -------------------------------------------------------------------------------- /tools/audio-preprocess/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | authors = [ 3 | {name = "Lengyue", email = "lengyue@lengyue.me"}, 4 | ] 5 | dependencies = [ 6 | ] 7 | description = "Preprocess audio data" 8 | license = {text = "Apache"} 9 | name = "fish-audio-preprocess" 10 | readme = "README.md" 11 | requires-python = ">=3.9" 12 | 
version = "0.2.8" 13 | 14 | [project.scripts] 15 | fap = "fish_audio_preprocess.cli.__main__:cli" 16 | 17 | [build-system] 18 | build-backend = "setuptools.build_meta" 19 | requires = ["setuptools", "setuptools-scm"] 20 | 21 | [tool.setuptools] 22 | packages = ["fish_audio_preprocess"] 23 | 24 | [tool.isort] 25 | profile = "black" 26 | -------------------------------------------------------------------------------- /tools/audio-preprocess/tools/lint.py: -------------------------------------------------------------------------------- 1 | import subprocess as sp 2 | 3 | # Black 4 | sp.run(["black", "fish_audio_preprocess", "tools"]) 5 | 6 | # Isort 7 | sp.run(["isort", "fish_audio_preprocess", "tools"]) 8 | -------------------------------------------------------------------------------- /tools/fish-speech/.dockerignore: -------------------------------------------------------------------------------- 1 | .git 2 | .github 3 | results 4 | data 5 | *.filelist 6 | /data_server/target 7 | checkpoints 8 | -------------------------------------------------------------------------------- /tools/fish-speech/.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .pgx.* 3 | .pdm-python 4 | /fish_speech.egg-info 5 | __pycache__ 6 | /results 7 | /data 8 | /*.test.sh 9 | *.filelist 10 | filelists 11 | /fish_speech/text/cmudict_cache.pickle 12 | /checkpoints 13 | /.vscode 14 | /data_server/target 15 | /*.npy 16 | /*.wav 17 | /*.mp3 18 | /*.lab 19 | /results 20 | /data 21 | /.idea 22 | ffmpeg.exe 23 | ffprobe.exe 24 | asr-label* 25 | /.cache 26 | /fishenv 27 | /.locale 28 | /demo-audios 29 | /references 30 | /example 31 | /faster_whisper 32 | /.gradio 33 | *log 34 | -------------------------------------------------------------------------------- /tools/fish-speech/.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | ci: 2 | autoupdate_schedule: monthly 3 | 4 | repos: 5 | - 
repo: https://github.com/pycqa/isort 6 | rev: 6.0.1 7 | hooks: 8 | - id: isort 9 | args: [--profile=black] 10 | 11 | - repo: https://github.com/psf/black 12 | rev: 25.1.0 13 | hooks: 14 | - id: black 15 | 16 | - repo: https://github.com/pre-commit/pre-commit-hooks 17 | rev: v5.0.0 18 | hooks: 19 | - id: end-of-file-fixer 20 | - id: check-yaml 21 | - id: check-json 22 | - id: mixed-line-ending 23 | args: ["--fix=lf"] 24 | - id: check-added-large-files 25 | args: ["--maxkb=5000"] 26 | -------------------------------------------------------------------------------- /tools/fish-speech/.project-root: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/fish-speech/.project-root -------------------------------------------------------------------------------- /tools/fish-speech/.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # Read the Docs configuration file for MkDocs projects 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 3 | 4 | # Required 5 | version: 2 6 | 7 | # Set the version of Python and other tools you might need 8 | build: 9 | os: ubuntu-22.04 10 | tools: 11 | python: "3.12" 12 | 13 | mkdocs: 14 | configuration: mkdocs.yml 15 | 16 | # Optionally declare the Python requirements required to build your docs 17 | python: 18 | install: 19 | - requirements: docs/requirements.txt 20 | -------------------------------------------------------------------------------- /tools/fish-speech/API_FLAGS.txt: -------------------------------------------------------------------------------- 1 | # --infer 2 | --api 3 | --listen 0.0.0.0:8080 \ 4 | --llama-checkpoint-path "checkpoints/fish-speech-1.5" \ 5 | --decoder-checkpoint-path "checkpoints/fish-speech-1.5/firefly-gan-vq-fsq-8x1024-21hz-generator.pth" \ 6 | --decoder-config-name firefly_gan_vq 7 | 
-------------------------------------------------------------------------------- /tools/fish-speech/docker-compose.dev.yml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | 3 | services: 4 | fish-speech: 5 | build: 6 | context: . 7 | dockerfile: dockerfile.dev 8 | container_name: fish-speech 9 | volumes: 10 | - ./:/exp 11 | deploy: 12 | resources: 13 | reservations: 14 | devices: 15 | - driver: nvidia 16 | count: all 17 | capabilities: [gpu] 18 | command: tail -f /dev/null 19 | -------------------------------------------------------------------------------- /tools/fish-speech/dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.12-slim-bookworm AS stage-1 2 | ARG TARGETARCH 3 | 4 | ARG HUGGINGFACE_MODEL=fish-speech-1.5 5 | ARG HF_ENDPOINT=https://huggingface.co 6 | 7 | WORKDIR /opt/fish-speech 8 | 9 | RUN set -ex \ 10 | && pip install huggingface_hub \ 11 | && HF_ENDPOINT=${HF_ENDPOINT} huggingface-cli download --resume-download fishaudio/${HUGGINGFACE_MODEL} --local-dir checkpoints/${HUGGINGFACE_MODEL} 12 | 13 | FROM python:3.12-slim-bookworm 14 | ARG TARGETARCH 15 | 16 | ARG DEPENDENCIES=" \ 17 | ca-certificates \ 18 | libsox-dev \ 19 | build-essential \ 20 | cmake \ 21 | libasound-dev \ 22 | portaudio19-dev \ 23 | libportaudio2 \ 24 | libportaudiocpp0 \ 25 | ffmpeg" 26 | 27 | RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ 28 | --mount=type=cache,target=/var/lib/apt,sharing=locked \ 29 | set -ex \ 30 | && rm -f /etc/apt/apt.conf.d/docker-clean \ 31 | && echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' >/etc/apt/apt.conf.d/keep-cache \ 32 | && apt-get update \ 33 | && apt-get -y install --no-install-recommends ${DEPENDENCIES} \ 34 | && echo "no" | dpkg-reconfigure dash 35 | 36 | WORKDIR /opt/fish-speech 37 | 38 | COPY . . 
39 | 40 | RUN --mount=type=cache,target=/root/.cache,sharing=locked \ 41 | set -ex \ 42 | && pip install -e .[stable] 43 | 44 | COPY --from=stage-1 /opt/fish-speech/checkpoints /opt/fish-speech/checkpoints 45 | 46 | ENV GRADIO_SERVER_NAME="0.0.0.0" 47 | 48 | EXPOSE 7860 49 | 50 | CMD ["./entrypoint.sh"] 51 | -------------------------------------------------------------------------------- /tools/fish-speech/dockerfile.dev: -------------------------------------------------------------------------------- 1 | ARG VERSION=dev 2 | ARG BASE_IMAGE=ghcr.io/fishaudio/fish-speech:${VERSION} 3 | 4 | FROM ${BASE_IMAGE} 5 | 6 | ARG TOOLS=" \ 7 | git \ 8 | curl \ 9 | build-essential \ 10 | ffmpeg \ 11 | libsm6 \ 12 | libxext6 \ 13 | libjpeg-dev \ 14 | zlib1g-dev \ 15 | aria2 \ 16 | zsh \ 17 | openssh-server \ 18 | sudo \ 19 | protobuf-compiler \ 20 | libasound-dev \ 21 | portaudio19-dev \ 22 | libportaudio2 \ 23 | libportaudiocpp0 \ 24 | cmake" 25 | 26 | RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ 27 | --mount=type=cache,target=/var/lib/apt,sharing=locked \ 28 | set -ex \ 29 | && apt-get update \ 30 | && apt-get -y install --no-install-recommends ${TOOLS} 31 | 32 | # Install oh-my-zsh so your terminal looks nice 33 | RUN sh -c "$(curl https://raw.githubusercontent.com/robbyrussell/oh-my-zsh/master/tools/install.sh)" "" --unattended 34 | 35 | # Set zsh as default shell 36 | RUN chsh -s /usr/bin/zsh 37 | ENV SHELL=/usr/bin/zsh 38 | -------------------------------------------------------------------------------- /tools/fish-speech/docs/CNAME: -------------------------------------------------------------------------------- 1 | speech.fish.audio 2 | -------------------------------------------------------------------------------- /tools/fish-speech/docs/assets/figs/VS_1.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/fish-speech/docs/assets/figs/VS_1.jpg -------------------------------------------------------------------------------- /tools/fish-speech/docs/assets/figs/VS_1_pt-BR.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/fish-speech/docs/assets/figs/VS_1_pt-BR.png -------------------------------------------------------------------------------- /tools/fish-speech/docs/assets/figs/agent_gradio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/fish-speech/docs/assets/figs/agent_gradio.png -------------------------------------------------------------------------------- /tools/fish-speech/docs/assets/figs/diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/fish-speech/docs/assets/figs/diagram.png -------------------------------------------------------------------------------- /tools/fish-speech/docs/assets/figs/diagrama.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/fish-speech/docs/assets/figs/diagrama.png -------------------------------------------------------------------------------- /tools/fish-speech/docs/assets/figs/logo-circle.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/fish-speech/docs/assets/figs/logo-circle.png -------------------------------------------------------------------------------- 
/tools/fish-speech/docs/requirements.txt: -------------------------------------------------------------------------------- 1 | mkdocs-material 2 | mkdocs-static-i18n[material] 3 | mkdocs[i18n] 4 | -------------------------------------------------------------------------------- /tools/fish-speech/docs/stylesheets/extra.css: -------------------------------------------------------------------------------- 1 | .md-grid { 2 | max-width: 1440px; 3 | } 4 | -------------------------------------------------------------------------------- /tools/fish-speech/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CUDA_ENABLED=${CUDA_ENABLED:-true} 4 | DEVICE="" 5 | 6 | if [ "${CUDA_ENABLED}" != "true" ]; then 7 | DEVICE="--device cpu" 8 | fi 9 | 10 | exec python tools/run_webui.py ${DEVICE} 11 | -------------------------------------------------------------------------------- /tools/fish-speech/fish_speech/callbacks/__init__.py: -------------------------------------------------------------------------------- 1 | from .grad_norm import GradNormMonitor 2 | 3 | __all__ = ["GradNormMonitor"] 4 | -------------------------------------------------------------------------------- /tools/fish-speech/fish_speech/configs/firefly_gan_vq.yaml: -------------------------------------------------------------------------------- 1 | _target_: fish_speech.models.vqgan.modules.firefly.FireflyArchitecture 2 | spec_transform: 3 | _target_: fish_speech.utils.spectrogram.LogMelSpectrogram 4 | sample_rate: 44100 5 | n_mels: 160 6 | n_fft: 2048 7 | hop_length: 512 8 | win_length: 2048 9 | backbone: 10 | _target_: fish_speech.models.vqgan.modules.firefly.ConvNeXtEncoder 11 | input_channels: 160 12 | depths: [3, 3, 9, 3] 13 | dims: [128, 256, 384, 512] 14 | drop_path_rate: 0.2 15 | kernel_size: 7 16 | head: 17 | _target_: fish_speech.models.vqgan.modules.firefly.HiFiGANGenerator 18 | hop_length: 512 19 | upsample_rates: [8, 8, 2, 2, 2] # aka. 
# Every record on the stream is framed as a native-endian unsigned 32-bit
# length ("I") followed by exactly that many bytes of serialized protobuf.
_LENGTH_PREFIX = struct.Struct("I")


def _read_record(f):
    """Read one length-prefixed record from the binary file object *f*.

    Returns:
        ``(head, payload)`` where *head* is the raw length prefix and
        *payload* the record bytes, or ``None`` when the stream is cleanly
        exhausted (no bytes left before a length prefix).

    Raises:
        EOFError: if the stream ends in the middle of a length prefix or in
            the middle of a payload, i.e. the file is truncated.
    """
    head = f.read(_LENGTH_PREFIX.size)
    if not head:
        return None
    if len(head) < _LENGTH_PREFIX.size:
        raise EOFError(
            f"Truncated stream: expected a {_LENGTH_PREFIX.size}-byte length "
            f"prefix, got {len(head)} byte(s)"
        )

    (size,) = _LENGTH_PREFIX.unpack(head)
    payload = f.read(size)
    if len(payload) < size:
        raise EOFError(
            f"Truncated stream: record declares {size} byte(s), "
            f"only {len(payload)} available"
        )
    return head, payload


def read_pb_stream(f):
    """Yield each ``TextData`` message parsed from the length-prefixed stream *f*.

    Raises:
        EOFError: if the stream is truncated mid-record.
    """
    while (record := _read_record(f)) is not None:
        text_data = TextData()
        text_data.ParseFromString(record[1])
        yield text_data


def write_pb_stream(f, text_data):
    """Serialize *text_data* and append it to *f* as one length-prefixed record."""
    f.write(pack_pb_stream(text_data))


def pack_pb_stream(text_data):
    """Return *text_data* serialized as one length-prefixed record (bytes)."""
    buf = text_data.SerializeToString()
    return _LENGTH_PREFIX.pack(len(buf)) + buf


def split_pb_stream(f):
    """Yield each raw record (length prefix + payload) from *f* without parsing it.

    Raises:
        EOFError: if the stream is truncated mid-record.
    """
    while (record := _read_record(f)) is not None:
        head, payload = record
        yield head + payload
def load_language_list(language):
    """Load and return the translation table for *language*.

    The table is read from ``<I18N_FILE_PATH>/<language>.json`` (UTF-8) and
    returned exactly as ``json.load`` produced it.
    """
    locale_file = I18N_FILE_PATH / f"{language}.json"
    with open(locale_file, "r", encoding="utf-8") as handle:
        return json.load(handle)


class I18nAuto:
    """Callable translator that picks its language once, at construction.

    Language selection order:
      1. the content of a ``.locale`` file in the current working directory,
      2. otherwise the system default locale,
      3. ``DEFAULT_LANGUAGE`` when no JSON table exists for the chosen code.
    """

    def __init__(self):
        override_file = Path(".locale")

        if override_file.exists():
            with open(override_file, "r", encoding="utf-8") as handle:
                chosen = handle.read().strip()
        else:
            # getlocale() can't identify the system's language ((None, None)),
            # hence getdefaultlocale().
            # NOTE(review): locale.getdefaultlocale() is deprecated since
            # Python 3.11 - plan a replacement before it is removed.
            chosen = locale.getdefaultlocale()[0]

        # Unknown (or undetectable) languages fall back to the default table.
        if not (I18N_FILE_PATH / f"{chosen}.json").exists():
            chosen = DEFAULT_LANGUAGE

        self.language = chosen
        self.language_map = load_language_list(chosen)

    def __call__(self, key):
        """Return the translation of *key*, or *key* itself when untranslated."""
        return self.language_map.get(key, key)

    def __repr__(self):
        return "Use Language: " + self.language
@dataclass
class InferenceResult:
    """One event produced during inference."""

    # Kind of event; restricted to the four literal values below.
    code: Literal["header", "segment", "error", "final"]
    # Optional (int, ndarray) pair - presumably (sample_rate, samples);
    # TODO confirm against the engine that produces these results.
    audio: Optional[Tuple[int, np.ndarray]]
    # The exception attached to the event, if any.
    error: Optional[Exception]


def wav_chunk_header(
    sample_rate: int = 44100, bit_depth: int = 16, channels: int = 1
) -> bytes:
    """Return the bytes of a WAV header describing an empty PCM stream.

    Args:
        sample_rate: Frame rate written into the header, in Hz.
        bit_depth: Bits per sample; converted to a byte width with ``// 8``.
        channels: Number of audio channels.

    Returns:
        Whatever the ``wave`` module writes when a writer configured with the
        parameters above is closed without any frames.
    """
    with io.BytesIO() as sink:
        # The header is only emitted when the wave writer is closed, so the
        # bytes must be collected after the inner ``with`` has exited.
        with wave.open(sink, "wb") as writer:
            writer.setnchannels(channels)
            writer.setsampwidth(bit_depth // 8)
            writer.setframerate(sample_rate)
        return sink.getvalue()
num_training_steps: int, 9 | num_cycles: float = 0.5, 10 | final_lr_ratio: float = 0.0, 11 | ): 12 | if 0 < num_warmup_steps < 1: # float mode 13 | num_warmup_steps = int(num_warmup_steps * num_training_steps) 14 | 15 | if current_step < num_warmup_steps: 16 | return float(current_step) / float(max(1, num_warmup_steps)) 17 | 18 | progress = float(current_step - num_warmup_steps) / float( 19 | max(1, num_training_steps - num_warmup_steps) 20 | ) 21 | 22 | return max( 23 | final_lr_ratio, 24 | 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)), 25 | ) 26 | 27 | 28 | def get_constant_schedule_with_warmup_lr_lambda( 29 | current_step: int, 30 | *, 31 | num_warmup_steps: int | float, 32 | num_training_steps: int | None = None, 33 | ): 34 | if 0 < num_warmup_steps < 1: # float mode 35 | num_warmup_steps = int(num_warmup_steps * num_training_steps) 36 | 37 | if current_step < num_warmup_steps: 38 | return float(current_step) / float(max(1, num_warmup_steps)) 39 | 40 | return 1.0 41 | -------------------------------------------------------------------------------- /tools/fish-speech/fish_speech/text/__init__.py: -------------------------------------------------------------------------------- 1 | from .clean import clean_text 2 | from .spliter import split_text 3 | 4 | __all__ = ["clean_text", "split_text"] 5 | -------------------------------------------------------------------------------- /tools/fish-speech/fish_speech/text/clean.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | SYMBOLS_MAPPING = { 4 | "‘": "'", 5 | "’": "'", 6 | } 7 | 8 | REPLACE_SYMBOL_REGEX = re.compile( 9 | "|".join(re.escape(p) for p in SYMBOLS_MAPPING.keys()) 10 | ) 11 | 12 | 13 | EMOJI_REGEX = re.compile( 14 | "[" 15 | "\U0001f600-\U0001f64f" # emoticons 16 | "\U0001f300-\U0001f5ff" # symbols & pictographs 17 | "\U0001f680-\U0001f6ff" # transport & map symbols 18 | "\U0001f1e0-\U0001f1ff" # flags (iOS) 19 | "]+", 20 | 
def clean_text(text):
    """Normalize *text* before it is handed to the rest of the pipeline.

    Steps, in order:
      1. strip leading/trailing whitespace;
      2. replace every character listed in ``SYMBOLS_MAPPING`` with its
         mapped counterpart;
      3. delete everything matched by ``EMOJI_REGEX``;
      4. collapse each run of two or more ASCII commas into a single comma.

    Runs of periods ("...") are intentionally left untouched - only commas
    are collapsed.
    """
    stripped = text.strip()

    normalized = REPLACE_SYMBOL_REGEX.sub(
        lambda match: SYMBOLS_MAPPING[match.group()], stripped
    )

    without_emoji = EMOJI_REGEX.sub(r"", normalized)

    # The pattern only ever matches commas, so the replacement is a comma.
    return re.sub(r"[,]{2,}", ",", without_emoji)
@rank_zero_only
def log_hyperparameters(object_dict: dict) -> None:
    """Send a curated set of hyperparameters to every logger of the trainer.

    Args:
        object_dict: Mapping that must contain ``"cfg"`` (the run
            configuration), ``"model"`` (an object exposing ``parameters()``)
            and ``"trainer"`` (an object exposing ``logger`` / ``loggers``).

    The logged dictionary contains the ``model``, ``data`` and ``trainer``
    config sections (required), total / trainable / non-trainable parameter
    counts, and the optional ``callbacks``, ``extras``, ``task_name``,
    ``tags``, ``ckpt_path`` and ``seed`` entries (``None`` when absent).
    Nothing is logged when the trainer has no logger.
    """
    cfg = object_dict["cfg"]
    model = object_dict["model"]
    trainer = object_dict["trainer"]

    if not trainer.logger:
        # NOTE(review): `log` is imported as `fish_speech.utils.logger`;
        # confirm that object really exposes `warning()`.
        log.warning("Logger not found! Skipping hyperparameter logging...")
        return

    hparams = {"model": cfg["model"]}

    # Count parameters in a single pass over the model.
    total = 0
    trainable = 0
    for param in model.parameters():
        size = param.numel()
        total += size
        if param.requires_grad:
            trainable += size
    hparams["model/params/total"] = total
    hparams["model/params/trainable"] = trainable
    hparams["model/params/non_trainable"] = total - trainable

    # Required config sections: a missing key raises KeyError.
    for section in ("data", "trainer"):
        hparams[section] = cfg[section]

    # Optional config entries: missing keys are recorded as None.
    for optional in ("callbacks", "extras", "task_name", "tags", "ckpt_path", "seed"):
        hparams[optional] = cfg.get(optional)

    # send hparams to all loggers
    for sink in trainer.loggers:
        sink.log_hyperparams(hparams)
"lengyue@lengyue.me"}, 6 | ] 7 | description = "Fish Speech" 8 | readme = "README.md" 9 | requires-python = ">=3.10" 10 | keywords = ["TTS", "Speech"] 11 | license = {text = "CC BY-NC-SA 4.0"} 12 | classifiers = [ 13 | "Programming Language :: Python :: 3", 14 | ] 15 | dependencies = [ 16 | "numpy<=1.26.4", 17 | "transformers>=4.45.2", 18 | "datasets==2.18.0", 19 | "lightning>=2.1.0", 20 | "hydra-core>=1.3.2", 21 | "tensorboard>=2.14.1", 22 | "natsort>=8.4.0", 23 | "einops>=0.7.0", 24 | "librosa>=0.10.1", 25 | "rich>=13.5.3", 26 | "gradio>5.0.0", 27 | "wandb>=0.15.11", 28 | "grpcio>=1.58.0", 29 | "kui>=1.6.0", 30 | "uvicorn>=0.30.0", 31 | "loguru>=0.6.0", 32 | "loralib>=0.1.2", 33 | "pyrootutils>=1.0.4", 34 | "vector_quantize_pytorch==1.14.24", 35 | "resampy>=0.4.3", 36 | "einx[torch]==0.2.2", 37 | "zstandard>=0.22.0", 38 | "pydub", 39 | "pyaudio", 40 | "faster_whisper", 41 | "modelscope==1.17.1", 42 | "funasr==1.1.5", 43 | "opencc-python-reimplemented==0.1.7", 44 | "silero-vad", 45 | "ormsgpack", 46 | "tiktoken>=0.8.0", 47 | "pydantic==2.9.2", 48 | "cachetools", 49 | ] 50 | 51 | [project.optional-dependencies] 52 | stable = [ 53 | "torch<=2.4.1", 54 | "torchaudio", 55 | ] 56 | 57 | [build-system] 58 | requires = ["setuptools", "setuptools-scm"] 59 | build-backend = "setuptools.build_meta" 60 | 61 | [tool.setuptools] 62 | packages = ["fish_speech", "tools"] 63 | -------------------------------------------------------------------------------- /tools/fish-speech/pyrightconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "exclude": [ 3 | "data", 4 | "filelists" 5 | ] 6 | } 7 | -------------------------------------------------------------------------------- /tools/fish-speech/temp/codes_0.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/fish-speech/temp/codes_0.npy 
def check_and_download_files(repo_id, file_list, local_dir):
    """Make sure every file in *file_list* exists under *local_dir*.

    Args:
        repo_id: Hugging Face repository to download missing files from.
        file_list: File names (relative to the repository root) to check.
        local_dir: Destination directory; created when it does not exist.

    Files that are already present are skipped; missing ones are fetched
    with ``hf_hub_download``. A status line is printed for every file.
    """
    os.makedirs(local_dir, exist_ok=True)

    for name in file_list:
        if os.path.exists(os.path.join(local_dir, name)):
            # Already on disk - nothing to do.
            print(f"{name} 已存在,跳过下载。")
            continue

        # Missing locally - download from the Hugging Face repository.
        print(f"{name} 不存在,从 Hugging Face 仓库下载...")
        hf_hub_download(
            repo_id=repo_id,
            filename=name,
            resume_download=True,
            local_dir=local_dir,
            local_dir_use_symlinks=False,
        )
| if model_path == output_path: 11 | logger.error("Model path and output path are the same") 12 | return 13 | 14 | logger.info(f"Loading model from {model_path}") 15 | state_dict = torch.load(model_path, map_location="cpu")["state_dict"] 16 | torch.save(state_dict, output_path) 17 | logger.info(f"Model saved to {output_path}") 18 | 19 | 20 | if __name__ == "__main__": 21 | main() 22 | -------------------------------------------------------------------------------- /tools/fish-speech/tools/server/exception_handler.py: -------------------------------------------------------------------------------- 1 | import traceback 2 | from http import HTTPStatus 3 | 4 | from kui.asgi import HTTPException, JSONResponse 5 | 6 | 7 | class ExceptionHandler: 8 | 9 | async def http_exception_handler(self, exc: HTTPException): 10 | return JSONResponse( 11 | dict( 12 | statusCode=exc.status_code, 13 | message=exc.content, 14 | error=HTTPStatus(exc.status_code).phrase, 15 | ), 16 | exc.status_code, 17 | exc.headers, 18 | ) 19 | 20 | async def other_exception_handler(self, exc: Exception): 21 | traceback.print_exc() 22 | 23 | status = HTTPStatus.INTERNAL_SERVER_ERROR 24 | return JSONResponse( 25 | dict(statusCode=status, message=str(exc), error=status.phrase), 26 | status, 27 | ) 28 | -------------------------------------------------------------------------------- /tools/fish-speech/tools/server/inference.py: -------------------------------------------------------------------------------- 1 | from http import HTTPStatus 2 | 3 | import numpy as np 4 | from kui.asgi import HTTPException 5 | 6 | from fish_speech.inference_engine import TTSInferenceEngine 7 | from fish_speech.utils.schema import ServeTTSRequest 8 | 9 | AMPLITUDE = 32768 # Needs an explaination 10 | 11 | 12 | def inference_wrapper(req: ServeTTSRequest, engine: TTSInferenceEngine): 13 | """ 14 | Wrapper for the inference function. 15 | Used in the API server. 
16 | """ 17 | count = 0 18 | for result in engine.inference(req): 19 | match result.code: 20 | case "header": 21 | if isinstance(result.audio, tuple): 22 | yield result.audio[1] 23 | 24 | case "error": 25 | raise HTTPException( 26 | HTTPStatus.INTERNAL_SERVER_ERROR, 27 | content=str(result.error), 28 | ) 29 | 30 | case "segment": 31 | count += 1 32 | if isinstance(result.audio, tuple): 33 | yield (result.audio[1] * AMPLITUDE).astype(np.int16).tobytes() 34 | 35 | case "final": 36 | count += 1 37 | if isinstance(result.audio, tuple): 38 | yield result.audio[1] 39 | return None # Stop the generator 40 | 41 | if count == 0: 42 | raise HTTPException( 43 | HTTPStatus.INTERNAL_SERVER_ERROR, 44 | content="No audio generated, please check the input text.", 45 | ) 46 | -------------------------------------------------------------------------------- /tools/fish-speech/tools/webui/variables.py: -------------------------------------------------------------------------------- 1 | from fish_speech.i18n import i18n 2 | 3 | HEADER_MD = f"""# Fish Speech 4 | 5 | {i18n("A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).")} 6 | 7 | {i18n("You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1.5).")} 8 | 9 | {i18n("Related code and weights are released under CC BY-NC-SA 4.0 License.")} 10 | 11 | {i18n("We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.")} 12 | """ 13 | 14 | TEXTBOX_PLACEHOLDER = i18n("Put your text here.") 15 | -------------------------------------------------------------------------------- /tools/seed-vc/.gitignore: -------------------------------------------------------------------------------- 1 | # general things to ignore 2 | .DS_Store 3 | build/ 4 | build_contrib/ 5 | dist/ 6 | .cache/ 7 | *.egg-info/ 8 | *.egg 9 | *.py[cod] 10 | __pycache__/ 11 | *.so 12 | *~ 13 | 14 
@torch.no_grad()
def convert(source_path, reference_path, output_path):
    """Run CosyVoice voice conversion of the source onto the reference voice.

    Args:
        source_path: Audio file whose content is converted.
        reference_path: Audio file providing the target voice (the prompt).
        output_path: Accepted for interface compatibility; not used here.

    Returns:
        ``(reference_16k, converted_16k)``: the reference audio as loaded at
        16 kHz and the converted speech resampled from 22050 Hz to 16 kHz.
    """
    reference_16k = load_wav(reference_path, 16000)
    source_16k = load_wav(source_path, 16000)

    # NOTE(review): if inference_vc yields several chunks only the last one
    # is returned, and if it yields none this raises UnboundLocalError -
    # confirm stream=False always produces exactly one chunk.
    chunks = cosyvoice.inference_vc(source_16k, reference_16k, stream=False)
    for chunk in chunks:
        converted_16k = torchaudio.functional.resample(
            chunk['tts_speech'], 22050, 16000
        )
    return reference_16k, converted_16k
def convert(source_path, reference_path, output_path):
    """Run OpenVoice tone-colour conversion and return 16 kHz waveforms.

    Args:
        source_path: Audio file to convert.
        reference_path: Audio file providing the target speaker.
        output_path: Where the converter writes the converted audio.

    Returns:
        ``(ref_wav_16k, output_wav_16k)``: the reference file and the freshly
        written output file, each loaded at 16 kHz and returned as a tensor
        with a leading dimension of size 1.
    """

    def _load_16k(path):
        # Load at 16 kHz and add a leading dimension.
        samples, _ = librosa.load(path, sr=16000)
        return torch.tensor(samples).unsqueeze(0)

    # Speaker embeddings: target (reference) first, then source.
    target_se, _ = se_extractor.get_se(reference_path, tone_color_converter, vad=False)
    source_se, _ = se_extractor.get_se(source_path, tone_color_converter, vad=False)

    tone_color_converter.convert(
        audio_src_path=source_path,
        src_se=source_se,
        tgt_se=target_se,
        output_path=output_path,
        message="@Myshell",
    )

    # The output file only exists after the conversion above has run.
    return _load_16k(reference_path), _load_16k(output_path)
/tools/seed-vc/campplus_cn_common.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/seed-vc/campplus_cn_common.bin -------------------------------------------------------------------------------- /tools/seed-vc/conda-nix-vc-py310.yaml: -------------------------------------------------------------------------------- 1 | name: py310-nix-vc 2 | channels: 3 | - pytorch-nightly 4 | - conda-forge 5 | - nvidia 6 | dependencies: 7 | - python=3.10.14 8 | - pytorch-cuda=12.4 9 | - pytorch 10 | - torchvision 11 | - torchaudio 12 | - pip 13 | - pip: 14 | - scipy 15 | - huggingface-hub 16 | - onnxruntime-gpu 17 | - librosa 18 | - munch 19 | - einops 20 | - openai-whisper 21 | - ruff 22 | - yapf 23 | - isort 24 | - ipython 25 | - jedi-language-server 26 | -------------------------------------------------------------------------------- /tools/seed-vc/configs/config.json: -------------------------------------------------------------------------------- 1 | {"reference_audio_path": "D:/FAcodec/test_waves/kobe_0.wav", "sg_hostapi": "MME", "sg_wasapi_exclusive": false, "sg_input_device": "\u9ea6\u514b\u98ce (Razer BlackShark V2 HS 2.4", "sg_output_device": "\u626c\u58f0\u5668 (Razer BlackShark V2 HS 2.4", "sr_type": "sr_model", "diffusion_steps": 10.0, "inference_cfg_rate": 0.0, "max_prompt_length": 3.0, "block_time": 0.7, "crossfade_length": 0.04, "extra_time": 0.5, "extra_time_right": 0.02} -------------------------------------------------------------------------------- /tools/seed-vc/configs/hifigan.yml: -------------------------------------------------------------------------------- 1 | hift: 2 | in_channels: 80 3 | base_channels: 512 4 | nb_harmonics: 8 5 | sampling_rate: 22050 6 | nsf_alpha: 0.1 7 | nsf_sigma: 0.003 8 | nsf_voiced_threshold: 10 9 | upsample_rates: [8, 8] 10 | upsample_kernel_sizes: [16, 16] 11 | istft_params: 12 | n_fft: 16 13 |
hop_len: 4 14 | resblock_kernel_sizes: [3, 7, 11] 15 | resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]] 16 | source_resblock_kernel_sizes: [7, 11] 17 | source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]] 18 | lrelu_slope: 0.1 19 | audio_limit: 0.99 20 | f0_predictor: 21 | num_class: 1 22 | in_channels: 80 23 | cond_channels: 512 24 | 25 | pretrained_model_path: "checkpoints/hift.pt" 26 | -------------------------------------------------------------------------------- /tools/seed-vc/dac/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "1.0.0" 2 | 3 | # preserved here for legacy reasons 4 | __model_version__ = "latest" 5 | 6 | import audiotools 7 | 8 | audiotools.ml.BaseModel.INTERN += ["dac.**"] 9 | audiotools.ml.BaseModel.EXTERN += ["einops"] 10 | 11 | 12 | from . import nn 13 | from . import model 14 | from . import utils 15 | from .model import DAC 16 | from .model import DACFile 17 | -------------------------------------------------------------------------------- /tools/seed-vc/dac/__main__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import argbind 4 | 5 | from dac.utils import download 6 | from dac.utils.decode import decode 7 | from dac.utils.encode import encode 8 | 9 | STAGES = ["encode", "decode", "download"] 10 | 11 | 12 | def run(stage: str): 13 | """Run stages. 14 | 15 | Parameters 16 | ---------- 17 | stage : str 18 | Stage to run 19 | """ 20 | if stage not in STAGES: 21 | raise ValueError(f"Unknown command: {stage}. 
def WNConv1d(*args, **kwargs):
    """Build an ``nn.Conv1d`` from the given arguments, wrapped in weight norm."""
    conv = nn.Conv1d(*args, **kwargs)
    return weight_norm(conv)


def WNConvTranspose1d(*args, **kwargs):
    """Build an ``nn.ConvTranspose1d`` from the given arguments, wrapped in weight norm."""
    deconv = nn.ConvTranspose1d(*args, **kwargs)
    return weight_norm(deconv)


# Scripting this brings model speed up 1.4x
@torch.jit.script
def snake(x, alpha):
    """Apply ``x + sin(alpha * x) ** 2 / (alpha + 1e-9)`` element-wise.

    ``x`` is flattened to three dimensions (its first two are kept, the rest
    merged) for the computation and restored to its original shape on return.
    The ``1e-9`` term guards the division against ``alpha == 0``.
    """
    original_shape = x.shape
    flat = x.reshape(original_shape[0], original_shape[1], -1)
    inv_alpha = torch.reciprocal(alpha + 1e-9)
    periodic = torch.sin(alpha * flat).pow(2)
    return (flat + inv_alpha * periodic).reshape(original_shape)


class Snake1d(nn.Module):
    """``snake`` activation with a learnable ``alpha`` of shape ``(1, channels, 1)``.

    ``alpha`` is initialised to ones and broadcast against the input, so the
    input's second dimension is presumably the channel axis.
    """

    def __init__(self, channels):
        super().__init__()
        initial_alpha = torch.ones(1, channels, 1)
        self.alpha = nn.Parameter(initial_alpha)

    def forward(self, x):
        """Return ``snake(x, self.alpha)``."""
        return snake(x, self.alpha)
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/seed-vc/examples/reference/s2p2.wav -------------------------------------------------------------------------------- /tools/seed-vc/examples/reference/s3p1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/seed-vc/examples/reference/s3p1.wav -------------------------------------------------------------------------------- /tools/seed-vc/examples/reference/s3p2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/seed-vc/examples/reference/s3p2.wav -------------------------------------------------------------------------------- /tools/seed-vc/examples/reference/s4p1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/seed-vc/examples/reference/s4p1.wav -------------------------------------------------------------------------------- /tools/seed-vc/examples/reference/s4p2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/seed-vc/examples/reference/s4p2.wav -------------------------------------------------------------------------------- /tools/seed-vc/examples/reference/teio_0.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/seed-vc/examples/reference/teio_0.wav -------------------------------------------------------------------------------- /tools/seed-vc/examples/reference/trump_0.wav: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/seed-vc/examples/reference/trump_0.wav -------------------------------------------------------------------------------- /tools/seed-vc/examples/source/TECHNOPOLIS - 2085 [vocals]_[cut_14sec].wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/seed-vc/examples/source/TECHNOPOLIS - 2085 [vocals]_[cut_14sec].wav -------------------------------------------------------------------------------- /tools/seed-vc/examples/source/Wiz Khalifa,Charlie Puth - See You Again [vocals]_[cut_28sec].wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/seed-vc/examples/source/Wiz Khalifa,Charlie Puth - See You Again [vocals]_[cut_28sec].wav -------------------------------------------------------------------------------- /tools/seed-vc/examples/source/glados_0.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/seed-vc/examples/source/glados_0.wav -------------------------------------------------------------------------------- /tools/seed-vc/examples/source/jay_0.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/seed-vc/examples/source/jay_0.wav -------------------------------------------------------------------------------- /tools/seed-vc/examples/source/source_s1.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/seed-vc/examples/source/source_s1.wav -------------------------------------------------------------------------------- /tools/seed-vc/examples/source/source_s2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/seed-vc/examples/source/source_s2.wav -------------------------------------------------------------------------------- /tools/seed-vc/examples/source/source_s3.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/seed-vc/examples/source/source_s3.wav -------------------------------------------------------------------------------- /tools/seed-vc/examples/source/source_s4.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/seed-vc/examples/source/source_s4.wav -------------------------------------------------------------------------------- /tools/seed-vc/examples/source/yae_0.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/seed-vc/examples/source/yae_0.wav -------------------------------------------------------------------------------- /tools/seed-vc/hf_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | from huggingface_hub import hf_hub_download 3 | 4 | 5 | def load_custom_model_from_hf(repo_id, model_filename="pytorch_model.bin", config_filename="config.yml"): 6 | os.makedirs("./checkpoints", exist_ok=True) 7 | model_path = hf_hub_download(repo_id=repo_id, filename=model_filename, 
cache_dir="./checkpoints") 8 | if config_filename is None: 9 | return model_path 10 | config_path = hf_hub_download(repo_id=repo_id, filename=config_filename, cache_dir="./checkpoints") 11 | 12 | return model_path, config_path -------------------------------------------------------------------------------- /tools/seed-vc/modules/alias_free_torch/__init__.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 2 | 3 | from .filter import * 4 | from .resample import * 5 | from .act import * 6 | -------------------------------------------------------------------------------- /tools/seed-vc/modules/alias_free_torch/act.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 2 | 3 | import torch.nn as nn 4 | from .resample import UpSample1d, DownSample1d 5 | 6 | 7 | class Activation1d(nn.Module): 8 | def __init__( 9 | self, 10 | activation, 11 | up_ratio: int = 2, 12 | down_ratio: int = 2, 13 | up_kernel_size: int = 12, 14 | down_kernel_size: int = 12, 15 | ): 16 | super().__init__() 17 | self.up_ratio = up_ratio 18 | self.down_ratio = down_ratio 19 | self.act = activation 20 | self.upsample = UpSample1d(up_ratio, up_kernel_size) 21 | self.downsample = DownSample1d(down_ratio, down_kernel_size) 22 | 23 | # x: [B,C,T] 24 | def forward(self, x): 25 | x = self.upsample(x) 26 | x = self.act(x) 27 | x = self.downsample(x) 28 | 29 | return x 30 | -------------------------------------------------------------------------------- /tools/seed-vc/modules/bigvgan/alias_free_activation/cuda/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/seed-vc/modules/bigvgan/alias_free_activation/cuda/__init__.py 
-------------------------------------------------------------------------------- /tools/seed-vc/modules/bigvgan/alias_free_activation/cuda/anti_alias_activation.cpp: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include 18 | 19 | extern "C" torch::Tensor fwd_cuda(torch::Tensor const &input, torch::Tensor const &up_filter, torch::Tensor const &down_filter, torch::Tensor const &alpha, torch::Tensor const &beta); 20 | 21 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 22 | m.def("forward", &fwd_cuda, "Anti-Alias Activation forward (CUDA)"); 23 | } -------------------------------------------------------------------------------- /tools/seed-vc/modules/bigvgan/alias_free_activation/cuda/compat.h: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | /*This code is copied from NVIDIA apex: 18 | * https://github.com/NVIDIA/apex 19 | * with minor changes. */ 20 | 21 | #ifndef TORCH_CHECK 22 | #define TORCH_CHECK AT_CHECK 23 | #endif 24 | 25 | #ifdef VERSION_GE_1_3 26 | #define DATA_PTR data_ptr 27 | #else 28 | #define DATA_PTR data 29 | #endif 30 | -------------------------------------------------------------------------------- /tools/seed-vc/modules/bigvgan/alias_free_activation/torch/__init__.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 2 | # LICENSE is in incl_licenses directory. 
3 | 4 | import torch.nn as nn 5 | from .resample import UpSample1d, DownSample1d 6 | 7 | 8 | class Activation1d(nn.Module): 9 | def __init__( 10 | self, 11 | activation, 12 | up_ratio: int = 2, 13 | down_ratio: int = 2, 14 | up_kernel_size: int = 12, 15 | down_kernel_size: int = 12, 16 | ): 17 | super().__init__() 18 | self.up_ratio = up_ratio 19 | self.down_ratio = down_ratio 20 | self.act = activation 21 | self.upsample = UpSample1d(up_ratio, up_kernel_size) 22 | self.downsample = DownSample1d(down_ratio, down_kernel_size) 23 | 24 | # x: [B,C,T] 25 | def forward(self, x): 26 | x = self.upsample(x) 27 | x = self.act(x) 28 | x = self.downsample(x) 29 | 30 | return x 31 | -------------------------------------------------------------------------------- /tools/seed-vc/modules/bigvgan/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "resblock": "1", 3 | "num_gpus": 0, 4 | "batch_size": 32, 5 | "learning_rate": 0.0001, 6 | "adam_b1": 0.8, 7 | "adam_b2": 0.99, 8 | "lr_decay": 0.9999996, 9 | "seed": 1234, 10 | 11 | "upsample_rates": [4,4,2,2,2,2], 12 | "upsample_kernel_sizes": [8,8,4,4,4,4], 13 | "upsample_initial_channel": 1536, 14 | "resblock_kernel_sizes": [3,7,11], 15 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 16 | 17 | "use_tanh_at_final": false, 18 | "use_bias_at_final": false, 19 | 20 | "activation": "snakebeta", 21 | "snake_logscale": true, 22 | 23 | "use_cqtd_instead_of_mrd": true, 24 | "cqtd_filters": 128, 25 | "cqtd_max_filters": 1024, 26 | "cqtd_filters_scale": 1, 27 | "cqtd_dilations": [1, 2, 4], 28 | "cqtd_hop_lengths": [512, 256, 256], 29 | "cqtd_n_octaves": [9, 9, 9], 30 | "cqtd_bins_per_octaves": [24, 36, 48], 31 | 32 | "mpd_reshapes": [2, 3, 5, 7, 11], 33 | "use_spectral_norm": false, 34 | "discriminator_channel_mult": 1, 35 | 36 | "use_multiscale_melloss": true, 37 | "lambda_melloss": 15, 38 | 39 | "clip_grad_norm": 500, 40 | 41 | "segment_size": 65536, 42 | "num_mels": 80, 43 | 
"num_freq": 1025, 44 | "n_fft": 1024, 45 | "hop_size": 256, 46 | "win_size": 1024, 47 | 48 | "sampling_rate": 22050, 49 | 50 | "fmin": 0, 51 | "fmax": null, 52 | "fmax_for_loss": null, 53 | 54 | "normalize_volume": true, 55 | 56 | "num_workers": 4, 57 | 58 | "dist_config": { 59 | "dist_backend": "nccl", 60 | "dist_url": "tcp://localhost:54321", 61 | "world_size": 1 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /tools/seed-vc/modules/bigvgan/env.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/jik876/hifi-gan under the MIT license. 2 | # LICENSE is in incl_licenses directory. 3 | 4 | import os 5 | import shutil 6 | 7 | 8 | class AttrDict(dict): 9 | def __init__(self, *args, **kwargs): 10 | super(AttrDict, self).__init__(*args, **kwargs) 11 | self.__dict__ = self 12 | 13 | 14 | def build_env(config, config_name, path): 15 | t_path = os.path.join(path, config_name) 16 | if config != t_path: 17 | os.makedirs(path, exist_ok=True) 18 | shutil.copyfile(config, os.path.join(path, config_name)) -------------------------------------------------------------------------------- /tools/seed-vc/modules/openvoice/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/seed-vc/modules/openvoice/__init__.py -------------------------------------------------------------------------------- /tools/seed-vc/modules/openvoice/checkpoints_v2/converter/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_version_": "v2", 3 | "data": { 4 | "sampling_rate": 22050, 5 | "filter_length": 1024, 6 | "hop_length": 256, 7 | "win_length": 1024, 8 | "n_speakers": 0 9 | }, 10 | "model": { 11 | "zero_g": true, 12 | "inter_channels": 192, 13 | "hidden_channels": 192, 14 | "filter_channels": 
768, 15 | "n_heads": 2, 16 | "n_layers": 6, 17 | "kernel_size": 3, 18 | "p_dropout": 0.1, 19 | "resblock": "1", 20 | "resblock_kernel_sizes": [ 21 | 3, 22 | 7, 23 | 11 24 | ], 25 | "resblock_dilation_sizes": [ 26 | [ 27 | 1, 28 | 3, 29 | 5 30 | ], 31 | [ 32 | 1, 33 | 3, 34 | 5 35 | ], 36 | [ 37 | 1, 38 | 3, 39 | 5 40 | ] 41 | ], 42 | "upsample_rates": [ 43 | 8, 44 | 8, 45 | 2, 46 | 2 47 | ], 48 | "upsample_initial_channel": 512, 49 | "upsample_kernel_sizes": [ 50 | 16, 51 | 16, 52 | 4, 53 | 4 54 | ], 55 | "gin_channels": 256 56 | } 57 | } -------------------------------------------------------------------------------- /tools/seed-vc/modules/vocos/__init__.py: -------------------------------------------------------------------------------- 1 | from .pretrained import Vocos 2 | 3 | 4 | __version__ = "0.1.0" 5 | -------------------------------------------------------------------------------- /tools/seed-vc/requirements-mac.txt: -------------------------------------------------------------------------------- 1 | --extra-index-url https://download.pytorch.org/whl/cu121 2 | torch --pre --extra-index-url https://download.pytorch.org/whl/nightly/cpu 3 | torchvision --pre --extra-index-url https://download.pytorch.org/whl/nightly/cpu 4 | torchaudio --pre --extra-index-url https://download.pytorch.org/whl/nightly/cpu 5 | scipy==1.13.1 6 | librosa==0.10.2 7 | huggingface-hub==0.23.4 8 | munch==4.0.0 9 | einops==0.8.0 10 | descript-audio-codec==1.0.0 11 | gradio==4.44.0 12 | pydub==0.25.1 13 | resemblyzer 14 | jiwer==3.0.3 15 | transformers==4.46.3 16 | FreeSimpleGUI==5.1.1 17 | soundfile==0.12.1 18 | sounddevice==0.5.0 19 | modelscope==1.18.1 20 | funasr==1.1.5 21 | numpy==1.26.4 22 | pyyaml 23 | python-dotenv 24 | -------------------------------------------------------------------------------- /tools/seed-vc/requirements.txt: -------------------------------------------------------------------------------- 1 | --extra-index-url https://download.pytorch.org/whl/cu121 2 | 
torch==2.4.0 3 | torchvision==0.19.0 4 | torchaudio==2.4.0 5 | scipy==1.13.1 6 | librosa==0.10.2 7 | huggingface-hub==0.23.4 8 | munch==4.0.0 9 | einops==0.8.0 10 | descript-audio-codec==1.0.0 11 | gradio==4.44.0 12 | pydub==0.25.1 13 | resemblyzer 14 | jiwer==3.0.3 15 | transformers==4.46.3 16 | FreeSimpleGUI==5.1.1 17 | soundfile==0.12.1 18 | sounddevice==0.5.0 19 | modelscope==1.18.1 20 | funasr==1.1.5 21 | numpy==1.26.4 22 | pyyaml 23 | python-dotenv 24 | -------------------------------------------------------------------------------- /tools/videorag/__init__.py: -------------------------------------------------------------------------------- 1 | from .videoragcontent import VideoRAG, QueryParam -------------------------------------------------------------------------------- /tools/videorag/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/videorag/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /tools/videorag/__pycache__/_opcontent.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/videorag/__pycache__/_opcontent.cpython-310.pyc -------------------------------------------------------------------------------- /tools/videorag/__pycache__/_utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/videorag/__pycache__/_utils.cpython-310.pyc -------------------------------------------------------------------------------- /tools/videorag/__pycache__/base.cpython-310.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/videorag/__pycache__/base.cpython-310.pyc -------------------------------------------------------------------------------- /tools/videorag/__pycache__/videoragcontent.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/videorag/__pycache__/videoragcontent.cpython-310.pyc -------------------------------------------------------------------------------- /tools/videorag/_storage/__init__.py: -------------------------------------------------------------------------------- 1 | from .vdb_nanovectordb import NanoVectorDBVideoSegmentStorage 2 | from .kv_json import JsonKVStorage 3 | -------------------------------------------------------------------------------- /tools/videorag/_storage/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/videorag/_storage/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /tools/videorag/_storage/__pycache__/kv_json.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/videorag/_storage/__pycache__/kv_json.cpython-310.pyc -------------------------------------------------------------------------------- /tools/videorag/_storage/__pycache__/vdb_nanovectordb.cpython-310.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/videorag/_storage/__pycache__/vdb_nanovectordb.cpython-310.pyc -------------------------------------------------------------------------------- /tools/videorag/_storage/kv_json.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dataclasses import dataclass 3 | 4 | from .._utils import load_json, logger, write_json 5 | from ..base import ( 6 | BaseKVStorage, 7 | ) 8 | 9 | 10 | @dataclass 11 | class JsonKVStorage(BaseKVStorage): 12 | def __post_init__(self): 13 | working_dir = self.global_config["working_dir"] 14 | self._file_name = os.path.join(working_dir, f"kv_store_{self.namespace}.json") 15 | self._data = load_json(self._file_name) or {} 16 | logger.info(f"Load KV {self.namespace} with {len(self._data)} data") 17 | 18 | async def all_keys(self) -> list[str]: 19 | return list(self._data.keys()) 20 | 21 | async def index_done_callback(self): 22 | write_json(self._data, self._file_name) 23 | 24 | async def get_by_id(self, id): 25 | return self._data.get(id, None) 26 | 27 | async def get_by_ids(self, ids, fields=None): 28 | if fields is None: 29 | return [self._data.get(id, None) for id in ids] 30 | return [ 31 | ( 32 | {k: v for k, v in self._data[id].items() if k in fields} 33 | if self._data.get(id, None) 34 | else None 35 | ) 36 | for id in ids 37 | ] 38 | 39 | async def filter_keys(self, data: list[str]) -> set[str]: 40 | return set([s for s in data if s not in self._data]) 41 | 42 | async def upsert(self, data: dict[str, dict]): 43 | self._data.update(data) 44 | 45 | async def drop(self): 46 | self._data = {} 47 | -------------------------------------------------------------------------------- /tools/videorag/_utils.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import html 3 | import json 4 | import logging 5 | import os 6 | import re 7 | import 
numbers 8 | from dataclasses import dataclass 9 | from functools import wraps 10 | from hashlib import md5 11 | from typing import Any, Union 12 | 13 | import numpy as np 14 | import tiktoken 15 | 16 | logger = logging.getLogger("nano-graphrag") 17 | ENCODER = None 18 | 19 | 20 | def always_get_an_event_loop() -> asyncio.AbstractEventLoop: 21 | try: 22 | # If there is already an event loop, use it. 23 | loop = asyncio.get_event_loop() 24 | except RuntimeError: 25 | # If in a sub-thread, create a new event loop. 26 | logger.info("Creating a new event loop in a sub-thread.") 27 | loop = asyncio.new_event_loop() 28 | asyncio.set_event_loop(loop) 29 | return loop 30 | 31 | 32 | 33 | def write_json(json_obj, file_name): 34 | with open(file_name, "w", encoding="utf-8") as f: 35 | json.dump(json_obj, f, indent=2, ensure_ascii=False) 36 | 37 | 38 | def load_json(file_name): 39 | if not os.path.exists(file_name): 40 | return None 41 | with open(file_name, encoding="utf-8") as f: 42 | return json.load(f) 43 | 44 | 45 | 46 | 47 | # Utils types ----------------------------------------------------------------------- 48 | @dataclass 49 | class EmbeddingFunc: 50 | embedding_dim: int 51 | max_token_size: int 52 | func: callable 53 | 54 | async def __call__(self, *args, **kwargs) -> np.ndarray: 55 | return await self.func(*args, **kwargs) 56 | 57 | 58 | -------------------------------------------------------------------------------- /tools/videorag/_videoutil/__init__.py: -------------------------------------------------------------------------------- 1 | from .split import split_video, saving_video_segments 2 | from .asr import speech_to_text 3 | from .caption import segment_caption, merge_segment_information 4 | from .feature import encode_video_segments, encode_string_query -------------------------------------------------------------------------------- /tools/videorag/_videoutil/__pycache__/__init__.cpython-310.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/videorag/_videoutil/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /tools/videorag/_videoutil/__pycache__/asr.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/videorag/_videoutil/__pycache__/asr.cpython-310.pyc -------------------------------------------------------------------------------- /tools/videorag/_videoutil/__pycache__/caption.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/videorag/_videoutil/__pycache__/caption.cpython-310.pyc -------------------------------------------------------------------------------- /tools/videorag/_videoutil/__pycache__/feature.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/videorag/_videoutil/__pycache__/feature.cpython-310.pyc -------------------------------------------------------------------------------- /tools/videorag/_videoutil/__pycache__/split.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/videorag/_videoutil/__pycache__/split.cpython-310.pyc -------------------------------------------------------------------------------- /tools/videorag/_videoutil/feature.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import pickle 4 | 
from tqdm import tqdm 5 | from imagebind import data 6 | from imagebind.models import imagebind_model 7 | from imagebind.models.imagebind_model import ImageBindModel, ModalityType 8 | 9 | 10 | def encode_video_segments(video_paths, embedder: ImageBindModel): 11 | device = next(embedder.parameters()).device 12 | inputs = { 13 | ModalityType.VISION: data.load_and_transform_video_data(video_paths, device), 14 | } 15 | with torch.no_grad(): 16 | embeddings = embedder(inputs)[ModalityType.VISION] 17 | embeddings = embeddings.cpu() 18 | return embeddings 19 | 20 | def encode_string_query(query:str, embedder: ImageBindModel): 21 | device = next(embedder.parameters()).device 22 | inputs = { 23 | ModalityType.TEXT: data.load_and_transform_text([query], device), 24 | } 25 | with torch.no_grad(): 26 | embeddings = embedder(inputs)[ModalityType.TEXT] 27 | embeddings = embeddings.cpu() 28 | return embeddings --------------------------------------------------------------------------------