├── .DS_Store
├── Communication.md
├── LICENSE
├── assets
    ├── .DS_Store
    ├── QR.png
    ├── adapted_crosstalk_cover.png
    ├── adapted_standupcomedy_cover.png
    ├── airencuoguo_cover.png
    ├── audio_performance.jpg
    ├── cover_16-9.png
    ├── cover_3-4.png
    ├── cover_4-3.png
    ├── crosstalk_original_cover.png
    ├── dune_news_cover.png
    ├── dune_original_cover.png
    ├── dune_youtube.png
    ├── edit_workflow.png
    ├── eva1.png
    ├── eva2.png
    ├── eva3.jpg
    ├── eva4.jpg
    ├── eval1_audio_new.png
    ├── eval1_video_new.png
    ├── framework.jpg
    ├── grok4.png
    ├── interstella_cover.png
    ├── interstella_cover_love.png
    ├── joylife_cover.png
    ├── logo.png
    ├── logo_new.png
    ├── masterma_cover.png
    ├── masterma_original_cover.png
    ├── nezha_cover.png
    ├── openai_news_cover.png
    ├── overview.png
    ├── spiderman_cover.jpg
    ├── spiderman_cover.png
    ├── spiderman_new.jpg
    ├── standup_original_cover.png
    ├── tech_news_original_cover.png
    ├── titanic_cover.png
    ├── video_performance.jpg
    ├── xiaomingjianmo1_cover.png
    ├── xiaomingjianmo_findyourproblem_meme.png
    ├── xiaomingjianmo_mvp_cover.png
    ├── xiaomingjianmo_original_cover.png
    └── youhebuke_cover.png
├── dataset
    ├── presentation_style
    │   ├── commentary_present.txt
    │   ├── summarization_present.txt
    │   └── video_overview_present.txt
    └── voice
    │   └── ava_16k.wav
├── demos_documents.md
├── environment
    ├── __init__.py
    ├── __pycache__
    │   ├── __init__.cpython-310.pyc
    │   └── utils.cpython-310.pyc
    ├── agents
    │   ├── __init__.py
    │   ├── __pycache__
    │   │   ├── __init__.cpython-310.pyc
    │   │   ├── base.cpython-310.pyc
    │   │   ├── graph.cpython-310.pyc
    │   │   └── multi.cpython-310.pyc
    │   ├── base.py
    │   └── multi.py
    ├── config
    │   ├── __init__.py
    │   ├── __pycache__
    │   │   ├── __init__.cpython-310.pyc
    │   │   ├── config.cpython-310.pyc
    │   │   └── llm.cpython-310.pyc
    │   ├── check.py
    │   ├── config.py
    │   ├── config.yml
    │   ├── graph.txt
    │   ├── intents.yml
    │   ├── llm.py
    │   ├── registry.json
    │   └── user.yml
    ├── roles
    │   ├── __init__.py
    │   ├── __pycache__
    │   │   ├── __init__.cpython-310.pyc
    │   │   ├── audio_extractor.cpython-310.pyc
    │   │   ├── loudness_normalizer.cpython-310.pyc
    │   │   ├── merge.cpython-310.pyc
    │   │   ├── mixer.cpython-310.pyc
    │   │   ├── resampler.cpython-310.pyc
    │   │   ├── separator.cpython-310.pyc
    │   │   ├── transcriber.cpython-310.pyc
    │   │   ├── vid_conversion.cpython-310.pyc
    │   │   ├── vid_editor.cpython-310.pyc
    │   │   ├── vid_editor_base.cpython-310.pyc
    │   │   ├── vid_preloader.cpython-310.pyc
    │   │   ├── vid_searcher.cpython-310.pyc
    │   │   ├── vid_searcher_base.cpython-310.pyc
    │   │   └── voice_generator.cpython-310.pyc
    │   ├── audio_extractor.py
    │   ├── cross_talk
    │   │   ├── __pycache__
    │   │   │   ├── cross_talk_adapter.cpython-310.pyc
    │   │   │   ├── cross_talk_conversion.cpython-310.pyc
    │   │   │   └── cross_talk_synth.cpython-310.pyc
    │   │   ├── cross_talk_adapter.py
    │   │   ├── cross_talk_conversion.py
    │   │   └── cross_talk_synth.py
    │   ├── loudness_normalizer.py
    │   ├── merge.py
    │   ├── mixer.py
    │   ├── resampler.py
    │   ├── separator.py
    │   ├── stand_up
    │   │   ├── __pycache__
    │   │   │   ├── stand_up_adapter.cpython-310.pyc
    │   │   │   ├── stand_up_conversion.cpython-310.pyc
    │   │   │   └── stand_up_synth.cpython-310.pyc
    │   │   ├── stand_up_adapter.py
    │   │   ├── stand_up_conversion.py
    │   │   └── stand_up_synth.py
    │   ├── svc
    │   │   ├── __pycache__
    │   │   │   ├── svc_adapter.cpython-310.pyc
    │   │   │   ├── svc_analyzer.cpython-310.pyc
    │   │   │   ├── svc_conversion.cpython-310.pyc
    │   │   │   ├── svc_coverist.cpython-310.pyc
    │   │   │   └── svc_single.cpython-310.pyc
    │   │   ├── svc_adapter.py
    │   │   ├── svc_analyzer.py
    │   │   ├── svc_conversion.py
    │   │   ├── svc_coverist.py
    │   │   └── svc_single.py
    │   ├── transcriber.py
    │   ├── tts
    │   │   ├── __pycache__
    │   │   │   ├── tts_infer.cpython-310.pyc
    │   │   │   ├── tts_replace.cpython-310.pyc
    │   │   │   ├── tts_slicer.cpython-310.pyc
    │   │   │   └── tts_writer.cpython-310.pyc
    │   │   ├── tts_infer.py
    │   │   ├── tts_replace.py
    │   │   ├── tts_slicer.py
    │   │   └── tts_writer.py
    │   ├── vid_comm
    │   │   ├── __pycache__
    │   │   │   └── comm_story_gen.cpython-310.pyc
    │   │   └── comm_story_gen.py
    │   ├── vid_conversion.py
    │   ├── vid_editor.py
    │   ├── vid_news
    │   │   ├── __pycache__
    │   │   │   └── news_story_gen.cpython-310.pyc
    │   │   └── news_story_gen.py
    │   ├── vid_preloader.py
    │   ├── vid_qa
    │   │   ├── __pycache__
    │   │   │   ├── content_loader copy.cpython-310.pyc
    │   │   │   ├── content_loader.cpython-310.pyc
    │   │   │   └── content_loader_base.cpython-310.pyc
    │   │   └── content_loader.py
    │   ├── vid_rhythm
    │   │   ├── __pycache__
    │   │   │   ├── rhythm_detector.cpython-310.pyc
    │   │   │   └── rhythm_story_gen.cpython-310.pyc
    │   │   ├── rhythm_detector.py
    │   │   └── rhythm_story_gen.py
    │   ├── vid_searcher.py
    │   ├── vid_summ
    │   │   ├── __pycache__
    │   │   │   ├── summ_loader.cpython-310.pyc
    │   │   │   └── summ_loader_base.cpython-310.pyc
    │   │   └── summ_loader.py
    │   └── voice_generator.py
    └── utils.py
├── main.py
├── pyproject.toml
├── readme.md
├── readme_zh.md
├── requirements.txt
└── tools
    ├── .gitkeep
    ├── CosyVoice
        ├── .gitignore
        ├── .gitmodules
        ├── CODE_OF_CONDUCT.md
        ├── FAQ.md
        ├── LICENSE
        ├── README.md
        ├── cosyvoice
        │   ├── __init__.py
        │   ├── bin
        │   │   ├── average_model.py
        │   │   ├── export_jit.py
        │   │   ├── export_onnx.py
        │   │   ├── export_trt.sh
        │   │   ├── inference.py
        │   │   └── train.py
        │   ├── cli
        │   │   ├── __init__.py
        │   │   ├── cosyvoice.py
        │   │   ├── frontend.py
        │   │   └── model.py
        │   ├── dataset
        │   │   ├── __init__.py
        │   │   ├── dataset.py
        │   │   └── processor.py
        │   ├── flow
        │   │   ├── decoder.py
        │   │   ├── flow.py
        │   │   ├── flow_matching.py
        │   │   └── length_regulator.py
        │   ├── hifigan
        │   │   ├── discriminator.py
        │   │   ├── f0_predictor.py
        │   │   ├── generator.py
        │   │   └── hifigan.py
        │   ├── llm
        │   │   └── llm.py
        │   ├── tokenizer
        │   │   ├── assets
        │   │   │   └── multilingual_zh_ja_yue_char_del.tiktoken
        │   │   └── tokenizer.py
        │   ├── transformer
        │   │   ├── __init__.py
        │   │   ├── activation.py
        │   │   ├── attention.py
        │   │   ├── convolution.py
        │   │   ├── decoder.py
        │   │   ├── decoder_layer.py
        │   │   ├── embedding.py
        │   │   ├── encoder.py
        │   │   ├── encoder_layer.py
        │   │   ├── label_smoothing_loss.py
        │   │   ├── positionwise_feed_forward.py
        │   │   ├── subsampling.py
        │   │   └── upsample_encoder.py
        │   └── utils
        │   │   ├── __init__.py
        │   │   ├── class_utils.py
        │   │   ├── common.py
        │   │   ├── executor.py
        │   │   ├── file_utils.py
        │   │   ├── frontend_utils.py
        │   │   ├── losses.py
        │   │   ├── mask.py
        │   │   ├── scheduler.py
        │   │   └── train_utils.py
        ├── docker
        │   └── Dockerfile
        ├── examples
        │   ├── libritts
        │   │   ├── cosyvoice
        │   │   │   ├── conf
        │   │   │   │   ├── cosyvoice.fromscratch.yaml
        │   │   │   │   ├── cosyvoice.yaml
        │   │   │   │   └── ds_stage2.json
        │   │   │   ├── cosyvoice
        │   │   │   ├── local
        │   │   │   │   ├── download_and_untar.sh
        │   │   │   │   └── prepare_data.py
        │   │   │   ├── path.sh
        │   │   │   ├── run.sh
        │   │   │   ├── tools
        │   │   │   └── tts_text.json
        │   │   └── cosyvoice2
        │   │   │   ├── cosyvoice
        │   │   │   └── tools
        │   └── magicdata-read
        │   │   └── cosyvoice
        │   │       ├── conf
        │   │           ├── cosyvoice.fromscratch.yaml
        │   │           ├── cosyvoice.yaml
        │   │           └── ds_stage2.json
        │   │       ├── cosyvoice
        │   │       ├── local
        │   │           ├── download_and_untar.sh
        │   │           └── prepare_data.py
        │   │       ├── path.sh
        │   │       ├── run.sh
        │   │       ├── tools
        │   │       └── tts_text.json
        ├── requirements.txt
        ├── runtime
        │   └── python
        │   │   ├── Dockerfile
        │   │   ├── fastapi
        │   │       ├── client.py
        │   │       └── server.py
        │   │   └── grpc
        │   │       ├── client.py
        │   │       ├── cosyvoice.proto
        │   │       └── server.py
        ├── third_party
        │   └── Matcha-TTS
        │   │   ├── .env.example
        │   │   ├── .github
        │   │       ├── PULL_REQUEST_TEMPLATE.md
        │   │       ├── codecov.yml
        │   │       ├── dependabot.yml
        │   │       └── release-drafter.yml
        │   │   ├── .gitignore
        │   │   ├── .pre-commit-config.yaml
        │   │   ├── .project-root
        │   │   ├── .pylintrc
        │   │   ├── LICENSE
        │   │   ├── MANIFEST.in
        │   │   ├── Makefile
        │   │   ├── README.md
        │   │   ├── configs
        │   │       ├── __init__.py
        │   │       ├── callbacks
        │   │       │   ├── default.yaml
        │   │       │   ├── model_checkpoint.yaml
        │   │       │   ├── model_summary.yaml
        │   │       │   ├── none.yaml
        │   │       │   └── rich_progress_bar.yaml
        │   │       ├── debug
        │   │       │   ├── default.yaml
        │   │       │   ├── fdr.yaml
        │   │       │   ├── limit.yaml
        │   │       │   ├── overfit.yaml
        │   │       │   └── profiler.yaml
        │   │       ├── eval.yaml
        │   │       ├── experiment
        │   │       │   ├── hifi_dataset_piper_phonemizer.yaml
        │   │       │   ├── ljspeech.yaml
        │   │       │   ├── ljspeech_min_memory.yaml
        │   │       │   └── multispeaker.yaml
        │   │       ├── extras
        │   │       │   └── default.yaml
        │   │       ├── hparams_search
        │   │       │   └── mnist_optuna.yaml
        │   │       ├── hydra
        │   │       │   └── default.yaml
        │   │       ├── local
        │   │       │   └── .gitkeep
        │   │       ├── logger
        │   │       │   ├── aim.yaml
        │   │       │   ├── comet.yaml
        │   │       │   ├── csv.yaml
        │   │       │   ├── many_loggers.yaml
        │   │       │   ├── mlflow.yaml
        │   │       │   ├── neptune.yaml
        │   │       │   ├── tensorboard.yaml
        │   │       │   └── wandb.yaml
        │   │       ├── model
        │   │       │   ├── cfm
        │   │       │   │   └── default.yaml
        │   │       │   ├── decoder
        │   │       │   │   └── default.yaml
        │   │       │   ├── encoder
        │   │       │   │   └── default.yaml
        │   │       │   ├── matcha.yaml
        │   │       │   └── optimizer
        │   │       │   │   └── adam.yaml
        │   │       ├── paths
        │   │       │   └── default.yaml
        │   │       ├── train.yaml
        │   │       └── trainer
        │   │       │   ├── cpu.yaml
        │   │       │   ├── ddp.yaml
        │   │       │   ├── ddp_sim.yaml
        │   │       │   ├── default.yaml
        │   │       │   ├── gpu.yaml
        │   │       │   └── mps.yaml
        │   │   ├── matcha
        │   │       ├── VERSION
        │   │       ├── __init__.py
        │   │       ├── app.py
        │   │       ├── cli.py
        │   │       ├── hifigan
        │   │       │   ├── LICENSE
        │   │       │   ├── README.md
        │   │       │   ├── __init__.py
        │   │       │   ├── config.py
        │   │       │   ├── denoiser.py
        │   │       │   ├── env.py
        │   │       │   ├── meldataset.py
        │   │       │   ├── models.py
        │   │       │   └── xutils.py
        │   │       ├── models
        │   │       │   ├── __init__.py
        │   │       │   ├── baselightningmodule.py
        │   │       │   ├── components
        │   │       │   │   ├── __init__.py
        │   │       │   │   ├── decoder.py
        │   │       │   │   ├── flow_matching.py
        │   │       │   │   ├── text_encoder.py
        │   │       │   │   └── transformer.py
        │   │       │   └── matcha_tts.py
        │   │       ├── onnx
        │   │       │   ├── __init__.py
        │   │       │   ├── export.py
        │   │       │   └── infer.py
        │   │       ├── text
        │   │       │   ├── __init__.py
        │   │       │   ├── cleaners.py
        │   │       │   ├── numbers.py
        │   │       │   └── symbols.py
        │   │       ├── train.py
        │   │       └── utils
        │   │       │   ├── __init__.py
        │   │       │   ├── audio.py
        │   │       │   ├── generate_data_statistics.py
        │   │       │   ├── instantiators.py
        │   │       │   ├── logging_utils.py
        │   │       │   ├── model.py
        │   │       │   ├── monotonic_align
        │   │       │       ├── __init__.py
        │   │       │       ├── core.pyx
        │   │       │       └── setup.py
        │   │       │   ├── pylogger.py
        │   │       │   ├── rich_utils.py
        │   │       │   └── utils.py
        │   │   ├── notebooks
        │   │       └── .gitkeep
        │   │   ├── pyproject.toml
        │   │   ├── requirements.txt
        │   │   ├── scripts
        │   │       └── schedule.sh
        │   │   ├── setup.py
        │   │   └── synthesis.ipynb
        ├── tools
        │   ├── extract_embedding.py
        │   ├── extract_speech_token.py
        │   └── make_parquet_list.py
        └── webui.py
    ├── DiffSinger
        ├── .gitignore
        ├── LICENSE
        ├── README.md
        ├── configs
        │   ├── config_base.yaml
        │   ├── singing
        │   │   ├── base.yaml
        │   │   └── fs2.yaml
        │   └── tts
        │   │   ├── base.yaml
        │   │   ├── base_zh.yaml
        │   │   ├── fs2.yaml
        │   │   ├── hifigan.yaml
        │   │   ├── lj
        │   │       ├── base_mel2wav.yaml
        │   │       ├── base_text2mel.yaml
        │   │       ├── fs2.yaml
        │   │       ├── hifigan.yaml
        │   │       └── pwg.yaml
        │   │   └── pwg.yaml
        ├── data
        │   ├── processed
        │   │   └── ljspeech
        │   │   │   ├── dict.txt
        │   │   │   ├── metadata_phone.csv
        │   │   │   ├── mfa_dict.txt
        │   │   │   └── phone_set.json
        │   └── 有何不可.json
        ├── data_gen
        │   ├── singing
        │   │   └── binarize.py
        │   └── tts
        │   │   ├── base_binarizer.py
        │   │   ├── bin
        │   │       └── binarize.py
        │   │   ├── binarizer_zh.py
        │   │   ├── data_gen_utils.py
        │   │   └── txt_processors
        │   │       ├── base_text_processor.py
        │   │       ├── en.py
        │   │       ├── zh.py
        │   │       └── zh_g2pM.py
        ├── diff.py
        ├── docs
        │   ├── README-SVS-opencpop-cascade.md
        │   ├── README-SVS-opencpop-e2e.md
        │   ├── README-SVS-opencpop-pndm.md
        │   ├── README-SVS-popcs.md
        │   ├── README-SVS.md
        │   ├── README-TTS-pndm.md
        │   └── README-TTS.md
        ├── inference
        │   └── svs
        │   │   ├── base_svs_infer.py
        │   │   ├── ds_cascade.py
        │   │   ├── ds_e2e.py
        │   │   ├── gradio
        │   │       ├── gradio_settings.yaml
        │   │       └── infer.py
        │   │   └── opencpop
        │   │       ├── cpop_pinyin2ph.txt
        │   │       └── map.py
        ├── modules
        │   ├── __init__.py
        │   ├── commons
        │   │   ├── common_layers.py
        │   │   ├── espnet_positional_embedding.py
        │   │   └── ssim.py
        │   ├── diffsinger_midi
        │   │   └── fs2.py
        │   ├── fastspeech
        │   │   ├── fs2.py
        │   │   ├── pe.py
        │   │   └── tts_modules.py
        │   ├── hifigan
        │   │   ├── hifigan.py
        │   │   └── mel_utils.py
        │   └── parallel_wavegan
        │   │   ├── __init__.py
        │   │   ├── layers
        │   │       ├── __init__.py
        │   │       ├── causal_conv.py
        │   │       ├── pqmf.py
        │   │       ├── residual_block.py
        │   │       ├── residual_stack.py
        │   │       ├── tf_layers.py
        │   │       └── upsample.py
        │   │   ├── losses
        │   │       ├── __init__.py
        │   │       └── stft_loss.py
        │   │   ├── models
        │   │       ├── __init__.py
        │   │       ├── melgan.py
        │   │       ├── parallel_wavegan.py
        │   │       └── source.py
        │   │   ├── optimizers
        │   │       ├── __init__.py
        │   │       └── radam.py
        │   │   ├── stft_loss.py
        │   │   └── utils
        │   │       ├── __init__.py
        │   │       └── utils.py
        ├── requirements.txt
        ├── requirements_2080.txt
        ├── requirements_3090.txt
        ├── resources
        │   ├── apply_form.md
        │   ├── diffspeech-fs2-1.png
        │   ├── diffspeech-fs2-2.png
        │   ├── diffspeech-fs2.png
        │   ├── model_a.png
        │   ├── model_b.png
        │   └── tfb.png
        ├── tasks
        │   ├── base_task.py
        │   ├── run.py
        │   └── tts
        │   │   ├── fs2.py
        │   │   ├── fs2_utils.py
        │   │   ├── pe.py
        │   │   └── tts.py
        ├── usr
        │   ├── .gitkeep
        │   ├── __init__.py
        │   ├── configs
        │   │   ├── base.yaml
        │   │   ├── lj_ds_beta6.yaml
        │   │   ├── lj_ds_pndm.yaml
        │   │   ├── midi
        │   │   │   ├── cascade
        │   │   │   │   └── opencs
        │   │   │   │   │   ├── aux_rel.yaml
        │   │   │   │   │   ├── ds60_rel.yaml
        │   │   │   │   │   └── opencpop_statis.yaml
        │   │   │   ├── e2e
        │   │   │   │   ├── opencpop
        │   │   │   │   │   ├── ds1000.yaml
        │   │   │   │   │   └── ds100_adj_rel.yaml
        │   │   │   │   └── popcs
        │   │   │   │   │   └── ds100_adj_rel.yaml
        │   │   │   └── pe.yaml
        │   │   ├── popcs_ds_beta6.yaml
        │   │   ├── popcs_ds_beta6_offline.yaml
        │   │   └── popcs_fs2.yaml
        │   ├── diff
        │   │   ├── candidate_decoder.py
        │   │   ├── diffusion.py
        │   │   ├── net.py
        │   │   └── shallow_diffusion_tts.py
        │   ├── diffsinger_task.py
        │   ├── diffspeech_task.py
        │   └── task.py
        ├── utils
        │   ├── __init__.py
        │   ├── audio.py
        │   ├── cwt.py
        │   ├── hparams.py
        │   ├── indexed_datasets.py
        │   ├── multiprocess_utils.py
        │   ├── pitch_utils.py
        │   ├── pl_utils.py
        │   ├── plot.py
        │   ├── text_encoder.py
        │   ├── text_norm.py
        │   ├── training_utils.py
        │   └── tts_utils.py
        └── vocoders
        │   ├── __init__.py
        │   ├── base_vocoder.py
        │   ├── hifigan.py
        │   ├── pwg.py
        │   └── vocoder_utils.py
    ├── ImageBind
        ├── .assets
        │   ├── bird_audio.wav
        │   ├── bird_image.jpg
        │   ├── car_audio.wav
        │   ├── car_image.jpg
        │   ├── dog_audio.wav
        │   └── dog_image.jpg
        ├── CODE_OF_CONDUCT.md
        ├── CONTRIBUTING.md
        ├── LICENSE
        ├── README.md
        ├── build
        │   └── lib
        │   │   └── imagebind
        │   │       ├── __init__.py
        │   │       ├── bpe
        │   │           └── bpe_simple_vocab_16e6.txt.gz
        │   │       ├── data.py
        │   │       └── models
        │   │           ├── __init__.py
        │   │           ├── helpers.py
        │   │           ├── imagebind_model.py
        │   │           ├── multimodal_preprocessors.py
        │   │           └── transformer.py
        ├── imagebind.egg-info
        │   ├── PKG-INFO
        │   ├── SOURCES.txt
        │   ├── dependency_links.txt
        │   └── top_level.txt
        ├── imagebind
        │   ├── __init__.py
        │   ├── __pycache__
        │   │   ├── __init__.cpython-310.pyc
        │   │   └── data.cpython-310.pyc
        │   ├── bpe
        │   │   └── bpe_simple_vocab_16e6.txt.gz
        │   ├── data.py
        │   └── models
        │   │   ├── __init__.py
        │   │   ├── __pycache__
        │   │       ├── __init__.cpython-310.pyc
        │   │       ├── helpers.cpython-310.pyc
        │   │       ├── imagebind_model.cpython-310.pyc
        │   │       ├── multimodal_preprocessors.cpython-310.pyc
        │   │       └── transformer.cpython-310.pyc
        │   │   ├── helpers.py
        │   │   ├── imagebind_model.py
        │   │   ├── multimodal_preprocessors.py
        │   │   └── transformer.py
        ├── model_card.md
        ├── requirements.txt
        └── setup.py
    ├── audio-preprocess
        ├── LICENSE
        ├── README.md
        ├── README.zh.md
        ├── fap-complete.zsh
        ├── fish_audio_preprocess.egg-info
        │   ├── PKG-INFO
        │   ├── SOURCES.txt
        │   ├── dependency_links.txt
        │   ├── entry_points.txt
        │   └── top_level.txt
        ├── fish_audio_preprocess
        │   ├── __init__.py
        │   ├── __pycache__
        │   │   └── __init__.cpython-310.pyc
        │   ├── cli
        │   │   ├── __main__.py
        │   │   ├── __pycache__
        │   │   │   ├── __main__.cpython-310.pyc
        │   │   │   ├── convert_to_wav.cpython-310.pyc
        │   │   │   ├── frequency.cpython-310.pyc
        │   │   │   ├── length.cpython-310.pyc
        │   │   │   ├── loudness_norm.cpython-310.pyc
        │   │   │   ├── merge_lab.cpython-310.pyc
        │   │   │   ├── merge_short.cpython-310.pyc
        │   │   │   ├── resample.cpython-310.pyc
        │   │   │   ├── separate_audio.cpython-310.pyc
        │   │   │   ├── slice_audio.cpython-310.pyc
        │   │   │   └── transcribe.cpython-310.pyc
        │   │   ├── convert_to_wav.py
        │   │   ├── frequency.py
        │   │   ├── length.py
        │   │   ├── loudness_norm.py
        │   │   ├── merge_lab.py
        │   │   ├── merge_short.py
        │   │   ├── resample.py
        │   │   ├── separate_audio.py
        │   │   ├── slice_audio.py
        │   │   └── transcribe.py
        │   └── utils
        │   │   ├── __pycache__
        │   │       ├── file.cpython-310.pyc
        │   │       ├── loudness_norm.cpython-310.pyc
        │   │       ├── separate_audio.cpython-310.pyc
        │   │       ├── slice_audio.cpython-310.pyc
        │   │       ├── slice_audio_v2.cpython-310.pyc
        │   │       └── transcribe.cpython-310.pyc
        │   │   ├── file.py
        │   │   ├── loudness_norm.py
        │   │   ├── separate_audio.py
        │   │   ├── slice_audio.py
        │   │   ├── slice_audio_v2.py
        │   │   └── transcribe.py
        ├── pyproject.toml
        └── tools
        │   └── lint.py
    ├── fish-speech
        ├── .dockerignore
        ├── .gitignore
        ├── .pre-commit-config.yaml
        ├── .project-root
        ├── .readthedocs.yaml
        ├── API_FLAGS.txt
        ├── LICENSE
        ├── README.md
        ├── docker-compose.dev.yml
        ├── dockerfile
        ├── dockerfile.dev
        ├── docs
        │   ├── CNAME
        │   ├── README.ja.md
        │   ├── README.ko.md
        │   ├── README.pt-BR.md
        │   ├── README.zh.md
        │   ├── assets
        │   │   └── figs
        │   │   │   ├── VS_1.jpg
        │   │   │   ├── VS_1_pt-BR.png
        │   │   │   ├── agent_gradio.png
        │   │   │   ├── diagram.png
        │   │   │   ├── diagrama.png
        │   │   │   └── logo-circle.png
        │   ├── en
        │   │   ├── finetune.md
        │   │   ├── index.md
        │   │   ├── inference.md
        │   │   ├── samples.md
        │   │   └── start_agent.md
        │   ├── ja
        │   │   ├── finetune.md
        │   │   ├── index.md
        │   │   ├── inference.md
        │   │   ├── samples.md
        │   │   └── start_agent.md
        │   ├── ko
        │   │   ├── finetune.md
        │   │   ├── index.md
        │   │   ├── inference.md
        │   │   ├── samples.md
        │   │   └── start_agent.md
        │   ├── pt
        │   │   ├── finetune.md
        │   │   ├── index.md
        │   │   ├── inference.md
        │   │   ├── samples.md
        │   │   └── start_agent.md
        │   ├── requirements.txt
        │   ├── stylesheets
        │   │   └── extra.css
        │   └── zh
        │   │   ├── finetune.md
        │   │   ├── index.md
        │   │   ├── inference.md
        │   │   ├── samples.md
        │   │   └── start_agent.md
        ├── entrypoint.sh
        ├── fish_speech
        │   ├── callbacks
        │   │   ├── __init__.py
        │   │   └── grad_norm.py
        │   ├── configs
        │   │   ├── base.yaml
        │   │   ├── firefly_gan_vq.yaml
        │   │   ├── lora
        │   │   │   └── r_8_alpha_16.yaml
        │   │   └── text2semantic_finetune.yaml
        │   ├── conversation.py
        │   ├── datasets
        │   │   ├── concat_repeat.py
        │   │   ├── protos
        │   │   │   ├── text-data.proto
        │   │   │   ├── text_data_pb2.py
        │   │   │   └── text_data_stream.py
        │   │   ├── semantic.py
        │   │   └── vqgan.py
        │   ├── i18n
        │   │   ├── README.md
        │   │   ├── __init__.py
        │   │   ├── core.py
        │   │   ├── locale
        │   │   │   ├── en_US.json
        │   │   │   ├── es_ES.json
        │   │   │   ├── ja_JP.json
        │   │   │   ├── ko_KR.json
        │   │   │   ├── pt_BR.json
        │   │   │   └── zh_CN.json
        │   │   └── scan.py
        │   ├── inference_engine
        │   │   ├── __init__.py
        │   │   ├── reference_loader.py
        │   │   ├── utils.py
        │   │   └── vq_manager.py
        │   ├── models
        │   │   ├── text2semantic
        │   │   │   ├── __init__.py
        │   │   │   ├── inference.py
        │   │   │   ├── lit_module.py
        │   │   │   ├── llama.py
        │   │   │   └── lora.py
        │   │   └── vqgan
        │   │   │   ├── __init__.py
        │   │   │   ├── inference.py
        │   │   │   ├── modules
        │   │   │       ├── firefly.py
        │   │   │       └── fsq.py
        │   │   │   └── utils.py
        │   ├── scheduler.py
        │   ├── text
        │   │   ├── __init__.py
        │   │   ├── clean.py
        │   │   └── spliter.py
        │   ├── tokenizer.py
        │   ├── train.py
        │   └── utils
        │   │   ├── __init__.py
        │   │   ├── braceexpand.py
        │   │   ├── context.py
        │   │   ├── file.py
        │   │   ├── instantiators.py
        │   │   ├── logger.py
        │   │   ├── logging_utils.py
        │   │   ├── rich_utils.py
        │   │   ├── schema.py
        │   │   ├── spectrogram.py
        │   │   └── utils.py
        ├── inference.ipynb
        ├── mkdocs.yml
        ├── pyproject.toml
        ├── pyrightconfig.json
        ├── temp
        │   └── codes_0.npy
        └── tools
        │   ├── api_client.py
        │   ├── api_server.py
        │   ├── download_models.py
        │   ├── e2e_webui.py
        │   ├── export_onnx.py
        │   ├── extract_model.py
        │   ├── fish_e2e.py
        │   ├── llama
        │       ├── build_dataset.py
        │       ├── eval_in_context.py
        │       ├── merge_lora.py
        │       └── quantize.py
        │   ├── run_webui.py
        │   ├── server
        │       ├── agent
        │       │   ├── __init__.py
        │       │   ├── generate.py
        │       │   ├── generation_utils.py
        │       │   └── pre_generation_utils.py
        │       ├── api_utils.py
        │       ├── exception_handler.py
        │       ├── inference.py
        │       ├── model_manager.py
        │       ├── model_utils.py
        │       └── views.py
        │   ├── smart_pad.py
        │   ├── vqgan
        │       ├── create_train_split.py
        │       └── extract_vq.py
        │   ├── webui
        │       ├── __init__.py
        │       ├── inference.py
        │       └── variables.py
        │   └── whisper_asr.py
    ├── seed-vc
        ├── .gitignore
        ├── EVAL.md
        ├── LICENSE
        ├── README-JA.md
        ├── README-ZH.md
        ├── README.md
        ├── app.py
        ├── app_svc.py
        ├── app_vc.py
        ├── assets
        │   └── real-time-demo.webm
        ├── baselines
        │   ├── cosyvoice.py
        │   ├── dnsmos
        │   │   ├── dnsmos_computor.py
        │   │   ├── model_v8.onnx
        │   │   └── sig_bak_ovr.onnx
        │   └── openvoice.py
        ├── campplus_cn_common.bin
        ├── conda-nix-vc-py310.yaml
        ├── configs
        │   ├── config.json
        │   ├── hifigan.yml
        │   └── presets
        │   │   ├── config_dit_mel_seed_uvit_whisper_base_f0_44k.yml
        │   │   ├── config_dit_mel_seed_uvit_whisper_small_wavenet.yml
        │   │   └── config_dit_mel_seed_uvit_xlsr_tiny.yml
        ├── dac
        │   ├── __init__.py
        │   ├── __main__.py
        │   ├── model
        │   │   ├── __init__.py
        │   │   ├── base.py
        │   │   ├── dac.py
        │   │   ├── discriminator.py
        │   │   └── encodec.py
        │   ├── nn
        │   │   ├── __init__.py
        │   │   ├── layers.py
        │   │   ├── loss.py
        │   │   └── quantize.py
        │   └── utils
        │   │   ├── __init__.py
        │   │   ├── decode.py
        │   │   └── encode.py
        ├── data
        │   └── ft_dataset.py
        ├── eval.py
        ├── examples
        │   ├── reference
        │   │   ├── azuma_0.wav
        │   │   ├── dingzhen_0.wav
        │   │   ├── s1p1.wav
        │   │   ├── s1p2.wav
        │   │   ├── s2p1.wav
        │   │   ├── s2p2.wav
        │   │   ├── s3p1.wav
        │   │   ├── s3p2.wav
        │   │   ├── s4p1.wav
        │   │   ├── s4p2.wav
        │   │   ├── teio_0.wav
        │   │   └── trump_0.wav
        │   └── source
        │   │   ├── TECHNOPOLIS - 2085 [vocals]_[cut_14sec].wav
        │   │   ├── Wiz Khalifa,Charlie Puth - See You Again [vocals]_[cut_28sec].wav
        │   │   ├── glados_0.wav
        │   │   ├── jay_0.wav
        │   │   ├── source_s1.wav
        │   │   ├── source_s2.wav
        │   │   ├── source_s3.wav
        │   │   ├── source_s4.wav
        │   │   └── yae_0.wav
        ├── hf_utils.py
        ├── inference.py
        ├── modules
        │   ├── alias_free_torch
        │   │   ├── __init__.py
        │   │   ├── act.py
        │   │   ├── filter.py
        │   │   └── resample.py
        │   ├── audio.py
        │   ├── bigvgan
        │   │   ├── activations.py
        │   │   ├── alias_free_activation
        │   │   │   ├── cuda
        │   │   │   │   ├── __init__.py
        │   │   │   │   ├── activation1d.py
        │   │   │   │   ├── anti_alias_activation.cpp
        │   │   │   │   ├── anti_alias_activation_cuda.cu
        │   │   │   │   ├── compat.h
        │   │   │   │   ├── load.py
        │   │   │   │   └── type_shim.h
        │   │   │   └── torch
        │   │   │   │   ├── __init__.py
        │   │   │   │   ├── act.py
        │   │   │   │   ├── filter.py
        │   │   │   │   └── resample.py
        │   │   ├── bigvgan.py
        │   │   ├── config.json
        │   │   ├── env.py
        │   │   ├── meldataset.py
        │   │   └── utils.py
        │   ├── campplus
        │   │   ├── DTDNN.py
        │   │   ├── classifier.py
        │   │   └── layers.py
        │   ├── commons.py
        │   ├── diffusion_transformer.py
        │   ├── encodec.py
        │   ├── flow_matching.py
        │   ├── gpt_fast
        │   │   ├── generate.py
        │   │   ├── model.py
        │   │   └── quantize.py
        │   ├── hifigan
        │   │   ├── f0_predictor.py
        │   │   └── generator.py
        │   ├── layers.py
        │   ├── length_regulator.py
        │   ├── openvoice
        │   │   ├── __init__.py
        │   │   ├── api.py
        │   │   ├── attentions.py
        │   │   ├── checkpoints_v2
        │   │   │   └── converter
        │   │   │   │   └── config.json
        │   │   ├── commons.py
        │   │   ├── mel_processing.py
        │   │   ├── models.py
        │   │   ├── modules.py
        │   │   ├── openvoice_app.py
        │   │   ├── se_extractor.py
        │   │   ├── transforms.py
        │   │   └── utils.py
        │   ├── quantize.py
        │   ├── rmvpe.py
        │   ├── vocos
        │   │   ├── __init__.py
        │   │   ├── heads.py
        │   │   ├── helpers.py
        │   │   ├── loss.py
        │   │   ├── models.py
        │   │   ├── modules.py
        │   │   ├── pretrained.py
        │   │   └── spectral_ops.py
        │   └── wavenet.py
        ├── optimizers.py
        ├── real-time-gui.py
        ├── requirements-mac.txt
        ├── requirements.txt
        ├── ruff.toml
        └── train.py
    └── videorag
        ├── __init__.py
        ├── __pycache__
            ├── __init__.cpython-310.pyc
            ├── _opcontent.cpython-310.pyc
            ├── _utils.cpython-310.pyc
            ├── base.cpython-310.pyc
            └── videoragcontent.cpython-310.pyc
        ├── _opcontent.py
        ├── _storage
            ├── __init__.py
            ├── __pycache__
            │   ├── __init__.cpython-310.pyc
            │   ├── kv_json.cpython-310.pyc
            │   └── vdb_nanovectordb.cpython-310.pyc
            ├── kv_json.py
            └── vdb_nanovectordb.py
        ├── _utils.py
        ├── _videoutil
            ├── .ipynb_checkpoints
            │   └── caption-checkpoint.py
            ├── __init__.py
            ├── __pycache__
            │   ├── __init__.cpython-310.pyc
            │   ├── asr.cpython-310.pyc
            │   ├── caption.cpython-310.pyc
            │   ├── feature.cpython-310.pyc
            │   └── split.cpython-310.pyc
            ├── asr.py
            ├── caption.py
            ├── feature.py
            └── split.py
        ├── base.py
        └── videoragcontent.py


/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/.DS_Store


--------------------------------------------------------------------------------
/Communication.md:
--------------------------------------------------------------------------------
1 | We provide QR codes for joining the HKUDS discussion groups on WeChat and Feishu.
2 | 
3 | You can join by scanning the QR codes below:
4 | 
5 | <img src="https://github.com/HKUDS/.github/blob/main/profile/QR.png" alt="WeChat QR Code" width="400"/>
6 | 
7 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2025 ✨Data Intelligence Lab@HKU✨
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/assets/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/.DS_Store


--------------------------------------------------------------------------------
/assets/QR.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/QR.png


--------------------------------------------------------------------------------
/assets/adapted_crosstalk_cover.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/adapted_crosstalk_cover.png


--------------------------------------------------------------------------------
/assets/adapted_standupcomedy_cover.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/adapted_standupcomedy_cover.png


--------------------------------------------------------------------------------
/assets/airencuoguo_cover.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/airencuoguo_cover.png


--------------------------------------------------------------------------------
/assets/audio_performance.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/audio_performance.jpg


--------------------------------------------------------------------------------
/assets/cover_16-9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/cover_16-9.png


--------------------------------------------------------------------------------
/assets/cover_3-4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/cover_3-4.png


--------------------------------------------------------------------------------
/assets/cover_4-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/cover_4-3.png


--------------------------------------------------------------------------------
/assets/crosstalk_original_cover.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/crosstalk_original_cover.png


--------------------------------------------------------------------------------
/assets/dune_news_cover.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/dune_news_cover.png


--------------------------------------------------------------------------------
/assets/dune_original_cover.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/dune_original_cover.png


--------------------------------------------------------------------------------
/assets/dune_youtube.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/dune_youtube.png


--------------------------------------------------------------------------------
/assets/edit_workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/edit_workflow.png


--------------------------------------------------------------------------------
/assets/eva1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/eva1.png


--------------------------------------------------------------------------------
/assets/eva2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/eva2.png


--------------------------------------------------------------------------------
/assets/eva3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/eva3.jpg


--------------------------------------------------------------------------------
/assets/eva4.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/eva4.jpg


--------------------------------------------------------------------------------
/assets/eval1_audio_new.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/eval1_audio_new.png


--------------------------------------------------------------------------------
/assets/eval1_video_new.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/eval1_video_new.png


--------------------------------------------------------------------------------
/assets/framework.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/framework.jpg


--------------------------------------------------------------------------------
/assets/grok4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/grok4.png


--------------------------------------------------------------------------------
/assets/interstella_cover.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/interstella_cover.png


--------------------------------------------------------------------------------
/assets/interstella_cover_love.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/interstella_cover_love.png


--------------------------------------------------------------------------------
/assets/joylife_cover.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/joylife_cover.png


--------------------------------------------------------------------------------
/assets/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/logo.png


--------------------------------------------------------------------------------
/assets/logo_new.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/logo_new.png


--------------------------------------------------------------------------------
/assets/masterma_cover.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/masterma_cover.png


--------------------------------------------------------------------------------
/assets/masterma_original_cover.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/masterma_original_cover.png


--------------------------------------------------------------------------------
/assets/nezha_cover.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/nezha_cover.png


--------------------------------------------------------------------------------
/assets/openai_news_cover.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/openai_news_cover.png


--------------------------------------------------------------------------------
/assets/overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/overview.png


--------------------------------------------------------------------------------
/assets/spiderman_cover.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/spiderman_cover.jpg


--------------------------------------------------------------------------------
/assets/spiderman_cover.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/spiderman_cover.png


--------------------------------------------------------------------------------
/assets/spiderman_new.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/spiderman_new.jpg


--------------------------------------------------------------------------------
/assets/standup_original_cover.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/standup_original_cover.png


--------------------------------------------------------------------------------
/assets/tech_news_original_cover.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/tech_news_original_cover.png


--------------------------------------------------------------------------------
/assets/titanic_cover.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/titanic_cover.png


--------------------------------------------------------------------------------
/assets/video_performance.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/video_performance.jpg


--------------------------------------------------------------------------------
/assets/xiaomingjianmo1_cover.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/xiaomingjianmo1_cover.png


--------------------------------------------------------------------------------
/assets/xiaomingjianmo_findyourproblem_meme.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/xiaomingjianmo_findyourproblem_meme.png


--------------------------------------------------------------------------------
/assets/xiaomingjianmo_mvp_cover.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/xiaomingjianmo_mvp_cover.png


--------------------------------------------------------------------------------
/assets/xiaomingjianmo_original_cover.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/xiaomingjianmo_original_cover.png


--------------------------------------------------------------------------------
/assets/youhebuke_cover.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/assets/youhebuke_cover.png


--------------------------------------------------------------------------------
/dataset/voice/ava_16k.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/dataset/voice/ava_16k.wav


--------------------------------------------------------------------------------
/environment/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/__init__.py


--------------------------------------------------------------------------------
/environment/__pycache__/__init__.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/__pycache__/__init__.cpython-310.pyc


--------------------------------------------------------------------------------
/environment/__pycache__/utils.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/__pycache__/utils.cpython-310.pyc


--------------------------------------------------------------------------------
/environment/agents/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/agents/__init__.py


--------------------------------------------------------------------------------
/environment/agents/__pycache__/__init__.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/agents/__pycache__/__init__.cpython-310.pyc


--------------------------------------------------------------------------------
/environment/agents/__pycache__/base.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/agents/__pycache__/base.cpython-310.pyc


--------------------------------------------------------------------------------
/environment/agents/__pycache__/graph.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/agents/__pycache__/graph.cpython-310.pyc


--------------------------------------------------------------------------------
/environment/agents/__pycache__/multi.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/agents/__pycache__/multi.cpython-310.pyc


--------------------------------------------------------------------------------
/environment/config/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/config/__init__.py


--------------------------------------------------------------------------------
/environment/config/__pycache__/__init__.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/config/__pycache__/__init__.cpython-310.pyc


--------------------------------------------------------------------------------
/environment/config/__pycache__/config.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/config/__pycache__/config.cpython-310.pyc


--------------------------------------------------------------------------------
/environment/config/__pycache__/llm.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/config/__pycache__/llm.cpython-310.pyc


--------------------------------------------------------------------------------
/environment/config/check.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
3 | print(os.path.splitext("dataset/找自己问题.wav")[0])


--------------------------------------------------------------------------------
/environment/config/config.py:
--------------------------------------------------------------------------------
 1 | import yaml
 2 | import os
 3 | 
 4 | 
 5 | def _load_config(config_path='environment/config/config.yml'):
 6 |         if not os.path.exists(config_path):
 7 |             raise FileNotFoundError(f"Config file not found: {config_path}")
 8 | 
 9 |         with open(config_path, 'r', encoding='utf-8') as f:
10 |             return yaml.safe_load(f)
11 | 
12 | 
13 | config = _load_config()
14 | 


--------------------------------------------------------------------------------
/environment/config/config.yml:
--------------------------------------------------------------------------------
 1 | llm:
 2 | 
 3 |   # Video Remixing/TTS/SVC/Stand-up/CrossTalk
 4 |   deepseek_api_key: ""  
 5 |   deepseek_base_url: ""  
 6 | 
 7 |   # Agentic Graph Router/TTS/SVC/Stand-up/CrossTalk
 8 |   claude_api_key: ""  
 9 |   claude_base_url: ""
10 | 
11 |   # Video Editing/Overview/Summarization/QA/Commentary Video
12 |   gpt_api_key: ""  
13 |   gpt_base_url: ""  
14 | 
15 |   # Multimodal LLM (MLLM) for captioning and fine-grained video understanding
16 |   gemini_api_key: ""  
17 |   gemini_base_url: ""  
18 | 
19 | 
20 | 
21 | 
22 | #  api_key: ""  # Default/fallback API key
23 | #  base_url: ""  # Default/fallback base URL
24 | 
25 | #  deepseek_api_key: ""  
26 | #  deepseek_base_url: ""  
27 | 
28 |   # Agentic Graph Router
29 | #  claude_api_key: ""  
30 | #  claude_base_url: ""
31 | 
32 |   # Video Editing/Overview/Summarization/QA/Text to Commentary Video
33 | #  gpt_api_key: ""  
34 | #  gpt_base_url: ""  
35 | 
36 |   # MLLM for caption and fine-grained video understanding
37 | #  gemini_api_key: ""  
38 | #  gemini_base_url: ""  
39 | 


--------------------------------------------------------------------------------
/environment/config/user.yml:
--------------------------------------------------------------------------------
1 | #user:
2 | #  reqs: 我将提供给你一个音乐MP4，我希望你进行歌词的改编，以及用我指定的音色克隆，并且用我给的视频素材制作一个全新的音乐视频
3 | 
4 | # TTS
5 | reqs: |
6 |   I would like you to adapt the content of a video with the following specific requirements:


--------------------------------------------------------------------------------
/environment/roles/__init__.py:
--------------------------------------------------------------------------------
 1 | import importlib
 2 | import json
 3 | 
 4 | _registry = None
 5 | 
 6 | 
 7 | def get_agent_class(agent_name: str):
 8 |     """动态加载Agent类的核心方法"""
 9 |     global _registry
10 | 
11 |     # 首次加载注册表
12 |     if _registry is None:
13 |         registry_path = "environment/config/registry.json"
14 |         with open(registry_path, 'r', encoding='utf-8') as f:
15 |             _registry = json.load(f)
16 | 
17 |     # 查找模块路径
18 |     if agent_name not in _registry:
19 |         raise ValueError(f"Agent {agent_name} not registered")
20 | 
21 |     module_path = _registry[agent_name]
22 | 
23 |     try:
24 |         # 动态导入模块
25 |         module = importlib.import_module(module_path)
26 |         # 获取类对象
27 |         return getattr(module, agent_name)
28 |     except (ImportError, AttributeError) as e:
29 |         raise ImportError(f"Load {agent_name} failed: {str(e)}")


--------------------------------------------------------------------------------
/environment/roles/__pycache__/__init__.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/__pycache__/__init__.cpython-310.pyc


--------------------------------------------------------------------------------
/environment/roles/__pycache__/audio_extractor.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/__pycache__/audio_extractor.cpython-310.pyc


--------------------------------------------------------------------------------
/environment/roles/__pycache__/loudness_normalizer.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/__pycache__/loudness_normalizer.cpython-310.pyc


--------------------------------------------------------------------------------
/environment/roles/__pycache__/merge.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/__pycache__/merge.cpython-310.pyc


--------------------------------------------------------------------------------
/environment/roles/__pycache__/mixer.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/__pycache__/mixer.cpython-310.pyc


--------------------------------------------------------------------------------
/environment/roles/__pycache__/resampler.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/__pycache__/resampler.cpython-310.pyc


--------------------------------------------------------------------------------
/environment/roles/__pycache__/separator.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/__pycache__/separator.cpython-310.pyc


--------------------------------------------------------------------------------
/environment/roles/__pycache__/transcriber.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/__pycache__/transcriber.cpython-310.pyc


--------------------------------------------------------------------------------
/environment/roles/__pycache__/vid_conversion.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/__pycache__/vid_conversion.cpython-310.pyc


--------------------------------------------------------------------------------
/environment/roles/__pycache__/vid_editor.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/__pycache__/vid_editor.cpython-310.pyc


--------------------------------------------------------------------------------
/environment/roles/__pycache__/vid_editor_base.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/__pycache__/vid_editor_base.cpython-310.pyc


--------------------------------------------------------------------------------
/environment/roles/__pycache__/vid_preloader.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/__pycache__/vid_preloader.cpython-310.pyc


--------------------------------------------------------------------------------
/environment/roles/__pycache__/vid_searcher.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/__pycache__/vid_searcher.cpython-310.pyc


--------------------------------------------------------------------------------
/environment/roles/__pycache__/vid_searcher_base.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/__pycache__/vid_searcher_base.cpython-310.pyc


--------------------------------------------------------------------------------
/environment/roles/__pycache__/voice_generator.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/__pycache__/voice_generator.cpython-310.pyc


--------------------------------------------------------------------------------
/environment/roles/cross_talk/__pycache__/cross_talk_adapter.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/cross_talk/__pycache__/cross_talk_adapter.cpython-310.pyc


--------------------------------------------------------------------------------
/environment/roles/cross_talk/__pycache__/cross_talk_conversion.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/cross_talk/__pycache__/cross_talk_conversion.cpython-310.pyc


--------------------------------------------------------------------------------
/environment/roles/cross_talk/__pycache__/cross_talk_synth.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/cross_talk/__pycache__/cross_talk_synth.cpython-310.pyc


--------------------------------------------------------------------------------
/environment/roles/stand_up/__pycache__/stand_up_adapter.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/stand_up/__pycache__/stand_up_adapter.cpython-310.pyc


--------------------------------------------------------------------------------
/environment/roles/stand_up/__pycache__/stand_up_conversion.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/stand_up/__pycache__/stand_up_conversion.cpython-310.pyc


--------------------------------------------------------------------------------
/environment/roles/stand_up/__pycache__/stand_up_synth.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/stand_up/__pycache__/stand_up_synth.cpython-310.pyc


--------------------------------------------------------------------------------
/environment/roles/svc/__pycache__/svc_adapter.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/svc/__pycache__/svc_adapter.cpython-310.pyc


--------------------------------------------------------------------------------
/environment/roles/svc/__pycache__/svc_analyzer.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/svc/__pycache__/svc_analyzer.cpython-310.pyc


--------------------------------------------------------------------------------
/environment/roles/svc/__pycache__/svc_conversion.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/svc/__pycache__/svc_conversion.cpython-310.pyc


--------------------------------------------------------------------------------
/environment/roles/svc/__pycache__/svc_coverist.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/svc/__pycache__/svc_coverist.cpython-310.pyc


--------------------------------------------------------------------------------
/environment/roles/svc/__pycache__/svc_single.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/svc/__pycache__/svc_single.cpython-310.pyc


--------------------------------------------------------------------------------
/environment/roles/tts/__pycache__/tts_infer.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/tts/__pycache__/tts_infer.cpython-310.pyc


--------------------------------------------------------------------------------
/environment/roles/tts/__pycache__/tts_replace.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/tts/__pycache__/tts_replace.cpython-310.pyc


--------------------------------------------------------------------------------
/environment/roles/tts/__pycache__/tts_slicer.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/tts/__pycache__/tts_slicer.cpython-310.pyc


--------------------------------------------------------------------------------
/environment/roles/tts/__pycache__/tts_writer.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/tts/__pycache__/tts_writer.cpython-310.pyc


--------------------------------------------------------------------------------
/environment/roles/vid_comm/__pycache__/comm_story_gen.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/vid_comm/__pycache__/comm_story_gen.cpython-310.pyc


--------------------------------------------------------------------------------
/environment/roles/vid_news/__pycache__/news_story_gen.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/vid_news/__pycache__/news_story_gen.cpython-310.pyc


--------------------------------------------------------------------------------
/environment/roles/vid_qa/__pycache__/content_loader copy.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/vid_qa/__pycache__/content_loader copy.cpython-310.pyc


--------------------------------------------------------------------------------
/environment/roles/vid_qa/__pycache__/content_loader.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/vid_qa/__pycache__/content_loader.cpython-310.pyc


--------------------------------------------------------------------------------
/environment/roles/vid_qa/__pycache__/content_loader_base.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/vid_qa/__pycache__/content_loader_base.cpython-310.pyc


--------------------------------------------------------------------------------
/environment/roles/vid_rhythm/__pycache__/rhythm_detector.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/vid_rhythm/__pycache__/rhythm_detector.cpython-310.pyc


--------------------------------------------------------------------------------
/environment/roles/vid_rhythm/__pycache__/rhythm_story_gen.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/vid_rhythm/__pycache__/rhythm_story_gen.cpython-310.pyc


--------------------------------------------------------------------------------
/environment/roles/vid_summ/__pycache__/summ_loader.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/vid_summ/__pycache__/summ_loader.cpython-310.pyc


--------------------------------------------------------------------------------
/environment/roles/vid_summ/__pycache__/summ_loader_base.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/environment/roles/vid_summ/__pycache__/summ_loader_base.cpython-310.pyc


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | --extra-index-url https://download.pytorch.org/whl/cu121
2 | --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/
3 | -e .


--------------------------------------------------------------------------------
/tools/.gitkeep:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | *$py.class
 5 | 
 6 | # Visual Studio Code files
 7 | .vscode
 8 | .vs
 9 | 
10 | # PyCharm files
11 | .idea
12 | 
13 | # Eclipse Project settings
14 | *.*project
15 | .settings
16 | 
17 | # Sublime Text settings
18 | *.sublime-workspace
19 | *.sublime-project
20 | 
21 | # Editor temporaries
22 | *.swn
23 | *.swo
24 | *.swp
25 | *.swm
26 | *~
27 | 
28 | # IPython notebook checkpoints
29 | .ipynb_checkpoints
30 | 
31 | # macOS dir files
32 | .DS_Store
33 | 
34 | exp
35 | data
36 | raw_wav
37 | tensorboard
38 | **/*build*
39 | 
40 | # Clangd files
41 | .cache
42 | compile_commands.json
43 | 
44 | # train/inference files
45 | *.wav
46 | *.m4a
47 | *.aac
48 | *.pt
49 | pretrained_models/*
50 | *_pb2_grpc.py
51 | *_pb2.py
52 | *.tar


--------------------------------------------------------------------------------
/tools/CosyVoice/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "third_party/Matcha-TTS"]
2 | 	path = third_party/Matcha-TTS
3 | 	url = https://github.com/shivammehta25/Matcha-TTS.git
4 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/FAQ.md:
--------------------------------------------------------------------------------
 1 | ## ModuleNotFoundError: No module named 'matcha'
 2 | 
 3 | Matcha-TTS is a third-party module. Please check the `third_party` directory. If there is no `Matcha-TTS` directory in it, execute `git submodule update --init --recursive`.
 4 | 
 5 | Run `export PYTHONPATH=third_party/Matcha-TTS` if you want to use `from cosyvoice.cli.cosyvoice import CosyVoice` in a Python script.
 6 | 
 7 | ## cannot find resource.zip or cannot unzip resource.zip
 8 | 
 9 | Please make sure you have git-lfs installed. Execute
10 | 
11 | ```sh
12 | git clone https://www.modelscope.cn/iic/CosyVoice-ttsfrd.git pretrained_models/CosyVoice-ttsfrd
13 | cd pretrained_models/CosyVoice-ttsfrd/
14 | unzip resource.zip -d .
15 | pip install ttsfrd-0.3.6-cp38-cp38-linux_x86_64.whl
16 | ```
17 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/cosyvoice/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/CosyVoice/cosyvoice/__init__.py


--------------------------------------------------------------------------------
/tools/CosyVoice/cosyvoice/bin/export_trt.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Copyright 2024 Alibaba Inc. All Rights Reserved.
 3 | # download tensorrt from https://developer.nvidia.com/tensorrt/download/10x, check your system and cuda for compatibility
 4 | # for example for linux + cuda12.4, you can download https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.0.1/tars/TensorRT-10.0.1.6.Linux.x86_64-gnu.cuda-12.4.tar.gz
 5 | TRT_DIR=<YOUR_TRT_DIR>
 6 | MODEL_DIR=<COSYVOICE2_MODEL_DIR>
 7 | 
 8 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$TRT_DIR/lib:/usr/local/cuda/lib64
 9 | $TRT_DIR/bin/trtexec --onnx=$MODEL_DIR/flow.decoder.estimator.fp32.onnx --saveEngine=$MODEL_DIR/flow.decoder.estimator.fp32.mygpu.plan --minShapes=x:2x80x4,mask:2x1x4,mu:2x80x4,cond:2x80x4 --optShapes=x:2x80x193,mask:2x1x193,mu:2x80x193,cond:2x80x193 --maxShapes=x:2x80x6800,mask:2x1x6800,mu:2x80x6800,cond:2x80x6800 --inputIOFormats=fp32:chw,fp32:chw,fp32:chw,fp32:chw,fp32:chw,fp32:chw --outputIOFormats=fp32:chw
10 | $TRT_DIR/bin/trtexec --onnx=$MODEL_DIR/flow.decoder.estimator.fp32.onnx --saveEngine=$MODEL_DIR/flow.decoder.estimator.fp16.mygpu.plan --fp16 --minShapes=x:2x80x4,mask:2x1x4,mu:2x80x4,cond:2x80x4 --optShapes=x:2x80x193,mask:2x1x193,mu:2x80x193,cond:2x80x193 --maxShapes=x:2x80x6800,mask:2x1x6800,mu:2x80x6800,cond:2x80x6800 --inputIOFormats=fp16:chw,fp16:chw,fp16:chw,fp16:chw,fp16:chw,fp16:chw --outputIOFormats=fp16:chw
11 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/cosyvoice/cli/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/CosyVoice/cosyvoice/cli/__init__.py


--------------------------------------------------------------------------------
/tools/CosyVoice/cosyvoice/dataset/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/CosyVoice/cosyvoice/dataset/__init__.py


--------------------------------------------------------------------------------
/tools/CosyVoice/cosyvoice/transformer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/CosyVoice/cosyvoice/transformer/__init__.py


--------------------------------------------------------------------------------
/tools/CosyVoice/cosyvoice/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/CosyVoice/cosyvoice/utils/__init__.py


--------------------------------------------------------------------------------
/tools/CosyVoice/cosyvoice/utils/losses.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn.functional as F
 3 | 
 4 | 
 5 | def tpr_loss(disc_real_outputs, disc_generated_outputs, tau):
 6 |     loss = 0
 7 |     for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
 8 |         m_DG = torch.median((dr - dg))
 9 |         L_rel = torch.mean((((dr - dg) - m_DG) ** 2)[dr < dg + m_DG])
10 |         loss += tau - F.relu(tau - L_rel)
11 |     return loss
12 | 
13 | 
14 | def mel_loss(real_speech, generated_speech, mel_transforms):
15 |     loss = 0
16 |     for transform in mel_transforms:
17 |         mel_r = transform(real_speech)
18 |         mel_g = transform(generated_speech)
19 |         loss += F.l1_loss(mel_g, mel_r)
20 |     return loss
21 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/examples/libritts/cosyvoice/conf/ds_stage2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "train_micro_batch_size_per_gpu": 1,
 3 |   "gradient_accumulation_steps": 1,
 4 |   "steps_per_print": 100,
 5 |   "gradient_clipping": 5,
 6 |   "fp16": {
 7 |     "enabled": false,
 8 |     "auto_cast": false,
 9 |     "loss_scale": 0,
10 |     "initial_scale_power": 16,
11 |     "loss_scale_window": 256,
12 |     "hysteresis": 2,
13 |     "consecutive_hysteresis": false,
14 |     "min_loss_scale": 1
15 |   },
16 |   "bf16": {
17 |     "enabled": false
18 |   },
19 |   "zero_force_ds_cpu_optimizer": false,
20 |   "zero_optimization": {
21 |     "stage": 2,
22 |     "offload_optimizer": {
23 |       "device": "none",
24 |       "pin_memory": true
25 |     },
26 |     "allgather_partitions": true,
27 |     "allgather_bucket_size": 5e8,
28 |     "overlap_comm": false,
29 |     "reduce_scatter": true,
30 |     "reduce_bucket_size": 5e8,
31 |     "contiguous_gradients" : true
32 |   },
33 |   "optimizer": {
34 |     "type": "AdamW",
35 |     "params": {
36 |         "lr": 0.001,
37 |         "weight_decay": 0.0001,
38 |         "torch_adam": true,
39 |         "adam_w_mode": true
40 |     }
41 |   }
42 | }


--------------------------------------------------------------------------------
/tools/CosyVoice/examples/libritts/cosyvoice/cosyvoice:
--------------------------------------------------------------------------------
1 | ../../../cosyvoice


--------------------------------------------------------------------------------
/tools/CosyVoice/examples/libritts/cosyvoice/path.sh:
--------------------------------------------------------------------------------
1 | # NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
2 | export PYTHONIOENCODING=UTF-8
3 | export PYTHONPATH=../../../:../../../third_party/Matcha-TTS:$PYTHONPATH
4 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/examples/libritts/cosyvoice/tools:
--------------------------------------------------------------------------------
1 | ../../../tools


--------------------------------------------------------------------------------
/tools/CosyVoice/examples/libritts/cosyvoice/tts_text.json:
--------------------------------------------------------------------------------
1 | {
2 |   "1089_134686_000002_000000": [
3 |     "hello, my name is Jack. What is your name?"
4 |   ]
5 | }


--------------------------------------------------------------------------------
/tools/CosyVoice/examples/libritts/cosyvoice2/cosyvoice:
--------------------------------------------------------------------------------
1 | ../../../cosyvoice


--------------------------------------------------------------------------------
/tools/CosyVoice/examples/libritts/cosyvoice2/tools:
--------------------------------------------------------------------------------
1 | ../../../tools


--------------------------------------------------------------------------------
/tools/CosyVoice/examples/magicdata-read/cosyvoice/conf/ds_stage2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "train_micro_batch_size_per_gpu": 1,
 3 |   "gradient_accumulation_steps": 1,
 4 |   "steps_per_print": 100,
 5 |   "gradient_clipping": 5,
 6 |   "fp16": {
 7 |     "enabled": false,
 8 |     "auto_cast": false,
 9 |     "loss_scale": 0,
10 |     "initial_scale_power": 16,
11 |     "loss_scale_window": 256,
12 |     "hysteresis": 2,
13 |     "consecutive_hysteresis": false,
14 |     "min_loss_scale": 1
15 |   },
16 |   "bf16": {
17 |     "enabled": false
18 |   },
19 |   "zero_force_ds_cpu_optimizer": false,
20 |   "zero_optimization": {
21 |     "stage": 2,
22 |     "offload_optimizer": {
23 |       "device": "none",
24 |       "pin_memory": true
25 |     },
26 |     "allgather_partitions": true,
27 |     "allgather_bucket_size": 5e8,
28 |     "overlap_comm": false,
29 |     "reduce_scatter": true,
30 |     "reduce_bucket_size": 5e8,
31 |     "contiguous_gradients" : true
32 |   },
33 |   "optimizer": {
34 |     "type": "AdamW",
35 |     "params": {
36 |         "lr": 0.001,
37 |         "weight_decay": 0.0001,
38 |         "torch_adam": true,
39 |         "adam_w_mode": true
40 |     }
41 |   }
42 | }


--------------------------------------------------------------------------------
/tools/CosyVoice/examples/magicdata-read/cosyvoice/cosyvoice:
--------------------------------------------------------------------------------
1 | ../../../cosyvoice


--------------------------------------------------------------------------------
/tools/CosyVoice/examples/magicdata-read/cosyvoice/path.sh:
--------------------------------------------------------------------------------
1 | # NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
2 | export PYTHONIOENCODING=UTF-8
3 | export PYTHONPATH=../../../:../../../third_party/Matcha-TTS:$PYTHONPATH
4 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/examples/magicdata-read/cosyvoice/tools:
--------------------------------------------------------------------------------
1 | ../../../tools


--------------------------------------------------------------------------------
/tools/CosyVoice/examples/magicdata-read/cosyvoice/tts_text.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "38_5718_20170915093303": [
 3 |     "我想这出最好歌曲把歌词发到网上请别人帮我作曲急急",
 4 |     "叫他明天早上差五分儿九点去机场"
 5 |   ],
 6 |   "38_5721_20170915091235": [
 7 |     "变温室调到零下两度档",
 8 |     "交谈中请勿轻信汇款信息陌生电话请勿使用外挂软件"
 9 |   ],
10 |   "38_5733_20170915130323": [
11 |     "这是老鹰乐队的一首经典歌曲",
12 |     "我急用这段音乐我自己找到一段但是有现场杂音"
13 |   ],
14 |   "38_5836_20170916221414": [
15 |     "给我播一个陶喆的专辑",
16 |     "这套餐好贵呀我发这么多短信贵死了"
17 |   ]
18 | }


--------------------------------------------------------------------------------
/tools/CosyVoice/requirements.txt:
--------------------------------------------------------------------------------
 1 | --extra-index-url https://download.pytorch.org/whl/cu121
 2 | --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/ # https://github.com/microsoft/onnxruntime/issues/21684
 3 | conformer==0.3.2
 4 | deepspeed==0.14.2; sys_platform == 'linux'
 5 | diffusers==0.29.0
 6 | gdown==5.1.0
 7 | gradio==5.4.0
 8 | grpcio==1.57.0
 9 | grpcio-tools==1.57.0
10 | hydra-core==1.3.2
11 | HyperPyYAML==1.2.2
12 | inflect==7.3.1
13 | librosa==0.10.2
14 | lightning==2.2.4
15 | matplotlib==3.7.5
16 | modelscope==1.15.0
17 | networkx==3.1
18 | omegaconf==2.3.0
19 | onnx==1.16.0
20 | onnxruntime-gpu==1.18.0; sys_platform == 'linux'
21 | onnxruntime==1.18.0; sys_platform == 'darwin' or sys_platform == 'win32'
22 | openai-whisper==20231117
23 | protobuf==4.25
24 | pydantic==2.7.0
25 | pyworld==0.3.4
26 | rich==13.7.1
27 | soundfile==0.12.1
28 | tensorboard==2.14.0
29 | tensorrt-cu12==10.0.1; sys_platform == 'linux'
30 | tensorrt-cu12-bindings==10.0.1; sys_platform == 'linux'
31 | tensorrt-cu12-libs==10.0.1; sys_platform == 'linux'
32 | torch==2.3.1
33 | torchaudio==2.3.1
34 | transformers==4.40.1
35 | uvicorn==0.30.0
36 | wget==3.2
37 | fastapi==0.115.6
38 | fastapi-cli==0.0.4
39 | WeTextProcessing==1.0.3
40 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/runtime/python/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime
 2 | ENV DEBIAN_FRONTEND=noninteractive
 3 | 
 4 | WORKDIR /opt/CosyVoice
 5 | 
 6 | RUN sed -i s@/archive.ubuntu.com/@/mirrors.aliyun.com/@g /etc/apt/sources.list
 7 | RUN apt-get update -y
 8 | RUN apt-get -y install git unzip git-lfs g++
 9 | RUN git lfs install
10 | RUN git clone --recursive https://github.com/FunAudioLLM/CosyVoice.git
11 | # here we use python==3.10 because we cannot find an image that has both python3.8 and torch2.0.1-cu118 installed
12 | RUN cd CosyVoice && pip3 install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host=mirrors.aliyun.com
13 | RUN cd CosyVoice/runtime/python/grpc && python3 -m grpc_tools.protoc -I. --python_out=. --grpc_python_out=. cosyvoice.proto


--------------------------------------------------------------------------------
/tools/CosyVoice/runtime/python/grpc/cosyvoice.proto:
--------------------------------------------------------------------------------
 1 | syntax = "proto3";
 2 | 
 3 | package cosyvoice;
 4 | option go_package = "protos/";
 5 | 
 6 | service CosyVoice{
 7 |   rpc Inference(Request) returns (stream Response) {}
 8 | }
 9 | 
10 | message Request{
11 |   oneof RequestPayload {
12 |     sftRequest sft_request = 1;
13 |     zeroshotRequest zero_shot_request = 2;
14 |     crosslingualRequest cross_lingual_request = 3;
15 |     instructRequest instruct_request = 4;
16 |   }
17 | }
18 | 
19 | message sftRequest{
20 |   string spk_id = 1;
21 |   string tts_text = 2;
22 | }
23 | 
24 | message zeroshotRequest{
25 |   string tts_text = 1;
26 |   string prompt_text = 2;
27 |   bytes prompt_audio = 3;
28 | }
29 | 
30 | message crosslingualRequest{
31 |   string tts_text = 1;
32 |   bytes prompt_audio = 2;
33 | }
34 | 
35 | message instructRequest{
36 |   string tts_text = 1;
37 |   string spk_id = 2;
38 |   string instruct_text = 3;
39 | }
40 | 
41 | message Response{
42 |   bytes tts_audio = 1;
43 | }


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/.env.example:
--------------------------------------------------------------------------------
1 | # example of file for storing private and user specific environment variables, like keys or system paths
2 | # rename it to ".env" (excluded from version control by default)
3 | # .env is loaded by train.py automatically
4 | # hydra allows you to reference variables in .yaml configs with special syntax: ${oc.env:MY_VAR}
5 | 
6 | MY_VAR="/home/user/my/system/path"
7 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
 1 | ## What does this PR do?
 2 | 
 3 | <!--
 4 | Please include a summary of the change and which issue is fixed.
 5 | Please also include relevant motivation and context.
 6 | List any dependencies that are required for this change.
 7 | List all the breaking changes introduced by this pull request.
 8 | -->
 9 | 
10 | Fixes #\<issue_number>
11 | 
12 | ## Before submitting
13 | 
14 | - [ ] Did you make sure **title is self-explanatory** and **the description concisely explains the PR**?
15 | - [ ] Did you make sure your **PR does only one thing**, instead of bundling different changes together?
16 | - [ ] Did you list all the **breaking changes** introduced by this pull request?
17 | - [ ] Did you **test your PR locally** with `pytest` command?
18 | - [ ] Did you **run pre-commit hooks** with `pre-commit run -a` command?
19 | 
20 | ## Did you have fun?
21 | 
22 | Make sure you had fun coding 🙃
23 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/.github/codecov.yml:
--------------------------------------------------------------------------------
 1 | coverage:
 2 |   status:
 3 |     # measures overall project coverage
 4 |     project:
 5 |       default:
 6 |         threshold: 100% # maximum decrease in coverage that is still reported as success
 7 | 
 8 |     # measures PR or single commit coverage
 9 |     patch:
10 |       default:
11 |         threshold: 100% # maximum decrease in coverage that is still reported as success
12 | 
13 | 
14 |     # project: off
15 |     # patch: off
16 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/.github/dependabot.yml:
--------------------------------------------------------------------------------
 1 | # To get started with Dependabot version updates, you'll need to specify which
 2 | # package ecosystems to update and where the package manifests are located.
 3 | # Please see the documentation for all configuration options:
 4 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
 5 | 
 6 | version: 2
 7 | updates:
 8 |   - package-ecosystem: "pip" # See documentation for possible values
 9 |     directory: "/" # Location of package manifests
10 |     target-branch: "dev"
11 |     schedule:
12 |       interval: "daily"
13 |     ignore:
14 |       - dependency-name: "pytorch-lightning"
15 |         update-types: ["version-update:semver-patch"]
16 |       - dependency-name: "torchmetrics"
17 |         update-types: ["version-update:semver-patch"]
18 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/.github/release-drafter.yml:
--------------------------------------------------------------------------------
 1 | name-template: "v$RESOLVED_VERSION"
 2 | tag-template: "v$RESOLVED_VERSION"
 3 | 
 4 | categories:
 5 |   - title: "🚀 Features"
 6 |     labels:
 7 |       - "feature"
 8 |       - "enhancement"
 9 |   - title: "🐛 Bug Fixes"
10 |     labels:
11 |       - "fix"
12 |       - "bugfix"
13 |       - "bug"
14 |   - title: "🧹 Maintenance"
15 |     labels:
16 |       - "maintenance"
17 |       - "dependencies"
18 |       - "refactoring"
19 |       - "cosmetic"
20 |       - "chore"
21 |   - title: "📝️ Documentation"
22 |     labels:
23 |       - "documentation"
24 |       - "docs"
25 | 
26 | change-template: "- $TITLE @$AUTHOR (#$NUMBER)"
27 | change-title-escapes: '\<*_&' # You can add # and @ to disable mentions
28 | 
29 | version-resolver:
30 |   major:
31 |     labels:
32 |       - "major"
33 |   minor:
34 |     labels:
35 |       - "minor"
36 |   patch:
37 |     labels:
38 |       - "patch"
39 |   default: patch
40 | 
41 | template: |
42 |   ## Changes
43 | 
44 |   $CHANGES
45 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/.project-root:
--------------------------------------------------------------------------------
1 | # this file is required for inferring the project root directory
2 | # do not delete
3 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2023 Shivam Mehta
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/MANIFEST.in:
--------------------------------------------------------------------------------
 1 | include README.md
 2 | include LICENSE.txt
 3 | include requirements.*.txt
 4 | include *.cff
 5 | include requirements.txt
 6 | include matcha/VERSION
 7 | recursive-include matcha *.json
 8 | recursive-include matcha *.html
 9 | recursive-include matcha *.png
10 | recursive-include matcha *.md
11 | recursive-include matcha *.py
12 | recursive-include matcha *.pyx
13 | recursive-exclude tests *
14 | prune tests*
15 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/Makefile:
--------------------------------------------------------------------------------
 1 | 
 2 | help:  ## Show help
 3 | 	@grep -E '^[.a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
 4 | 
 5 | clean: ## Clean autogenerated files
 6 | 	rm -rf dist
 7 | 	find . -type f -name "*.DS_Store" -ls -delete
 8 | 	find . | grep -E "(__pycache__|\.pyc|\.pyo)" | xargs rm -rf
 9 | 	find . | grep -E ".pytest_cache" | xargs rm -rf
10 | 	find . | grep -E ".ipynb_checkpoints" | xargs rm -rf
11 | 	rm -f .coverage
12 | 
13 | clean-logs: ## Clean logs
14 | 	rm -rf logs/**
15 | 
16 | create-package: ## Create wheel and tar gz
17 | 	rm -rf dist/
18 | 	python setup.py bdist_wheel --plat-name=manylinux1_x86_64
19 | 	python setup.py sdist
20 | 	python -m twine upload  dist/* --verbose --skip-existing
21 | 
22 | format: ## Run pre-commit hooks
23 | 	pre-commit run -a
24 | 
25 | sync: ## Merge changes from main branch to your current branch
26 | 	git pull
27 | 	git pull origin main
28 | 
29 | test: ## Run not slow tests
30 | 	pytest -k "not slow"
31 | 
32 | test-full: ## Run all tests
33 | 	pytest
34 | 
35 | train-ljspeech: ## Train the model
36 | 	python matcha/train.py experiment=ljspeech
37 | 
38 | train-ljspeech-min: ## Train the model with minimum memory
39 | 	python matcha/train.py experiment=ljspeech_min_memory
40 | 
41 | start_app: ## Start the app
42 | 	python matcha/app.py
43 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/configs/__init__.py:
--------------------------------------------------------------------------------
1 | # this file is needed here to include configs when building project as a package
2 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/configs/callbacks/default.yaml:
--------------------------------------------------------------------------------
1 | defaults:
2 |   - model_checkpoint.yaml
3 |   - model_summary.yaml
4 |   - rich_progress_bar.yaml
5 |   - _self_
6 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/configs/callbacks/model_checkpoint.yaml:
--------------------------------------------------------------------------------
 1 | # https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.callbacks.ModelCheckpoint.html
 2 | 
 3 | model_checkpoint:
 4 |   _target_: lightning.pytorch.callbacks.ModelCheckpoint
 5 |   dirpath: ${paths.output_dir}/checkpoints # directory to save the model file
 6 |   filename: checkpoint_{epoch:03d}  # checkpoint filename
 7 |   monitor: epoch # name of the logged metric which determines when model is improving
 8 |   verbose: False # verbosity mode
 9 |   save_last: true # additionally always save an exact copy of the last checkpoint to a file last.ckpt
10 |   save_top_k: 10 # save k best models (determined by above metric)
11 |   mode: "max" # "max" means higher metric value is better, can be also "min"
12 |   auto_insert_metric_name: True # when True, the checkpoint filenames will contain the metric name
13 |   save_weights_only: False # if True, then only the model’s weights will be saved
14 |   every_n_train_steps: null # number of training steps between checkpoints
15 |   train_time_interval: null # checkpoints are monitored at the specified time interval
16 |   every_n_epochs: 100 # number of epochs between checkpoints
17 |   save_on_train_epoch_end: null # whether to run checkpointing at the end of the training epoch or the end of validation
18 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/configs/callbacks/model_summary.yaml:
--------------------------------------------------------------------------------
1 | # https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.callbacks.RichModelSummary.html
2 | 
3 | model_summary:
4 |   _target_: lightning.pytorch.callbacks.RichModelSummary
5 |   max_depth: 3 # the maximum depth of layer nesting that the summary will include
6 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/configs/callbacks/none.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/CosyVoice/third_party/Matcha-TTS/configs/callbacks/none.yaml


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/configs/callbacks/rich_progress_bar.yaml:
--------------------------------------------------------------------------------
1 | # https://lightning.ai/docs/pytorch/latest/api/lightning.pytorch.callbacks.RichProgressBar.html
2 | 
3 | rich_progress_bar:
4 |   _target_: lightning.pytorch.callbacks.RichProgressBar
5 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/configs/debug/default.yaml:
--------------------------------------------------------------------------------
 1 | # @package _global_
 2 | 
 3 | # default debugging setup, runs 1 full epoch
 4 | # other debugging configs can inherit from this one
 5 | 
 6 | # overwrite task name so debugging logs are stored in separate folder
 7 | task_name: "debug"
 8 | 
 9 | # disable callbacks and loggers during debugging
10 | # callbacks: null
11 | # logger: null
12 | 
13 | extras:
14 |   ignore_warnings: False
15 |   enforce_tags: False
16 | 
17 | # sets level of all command line loggers to 'DEBUG'
18 | # https://hydra.cc/docs/tutorials/basic/running_your_app/logging/
19 | hydra:
20 |   job_logging:
21 |     root:
22 |       level: DEBUG
23 | 
24 |   # use this to also set hydra loggers to 'DEBUG'
25 |   # verbose: True
26 | 
27 | trainer:
28 |   max_epochs: 1
29 |   accelerator: cpu # debuggers don't like gpus
30 |   devices: 1 # debuggers don't like multiprocessing
31 |   detect_anomaly: true # raise exception if NaN or +/-inf is detected in any tensor
32 | 
33 | data:
34 |   num_workers: 0 # debuggers don't like multiprocessing
35 |   pin_memory: False # disable gpu memory pin
36 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/configs/debug/fdr.yaml:
--------------------------------------------------------------------------------
 1 | # @package _global_
 2 | 
 3 | # runs 1 train, 1 validation and 1 test step
 4 | 
 5 | defaults:
 6 |   - default
 7 | 
 8 | trainer:
 9 |   fast_dev_run: true
10 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/configs/debug/limit.yaml:
--------------------------------------------------------------------------------
 1 | # @package _global_
 2 | 
 3 | # uses only 1% of the training data and 5% of validation/test data
 4 | 
 5 | defaults:
 6 |   - default
 7 | 
 8 | trainer:
 9 |   max_epochs: 3
10 |   limit_train_batches: 0.01
11 |   limit_val_batches: 0.05
12 |   limit_test_batches: 0.05
13 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/configs/debug/overfit.yaml:
--------------------------------------------------------------------------------
 1 | # @package _global_
 2 | 
 3 | # overfits to 3 batches
 4 | 
 5 | defaults:
 6 |   - default
 7 | 
 8 | trainer:
 9 |   max_epochs: 20
10 |   overfit_batches: 3
11 | 
12 | # model ckpt and early stopping need to be disabled during overfitting
13 | callbacks: null
14 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/configs/debug/profiler.yaml:
--------------------------------------------------------------------------------
 1 | # @package _global_
 2 | 
 3 | # runs with execution time profiling
 4 | 
 5 | defaults:
 6 |   - default
 7 | 
 8 | trainer:
 9 |   max_epochs: 1
10 |   # profiler: "simple"
11 |   profiler: "advanced"
12 |   # profiler: "pytorch"
13 |   accelerator: gpu
14 | 
15 |   limit_train_batches: 0.02
16 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/configs/eval.yaml:
--------------------------------------------------------------------------------
 1 | # @package _global_
 2 | 
 3 | defaults:
 4 |   - _self_
 5 |   - data: mnist # choose datamodule with `test_dataloader()` for evaluation
 6 |   - model: mnist
 7 |   - logger: null
 8 |   - trainer: default
 9 |   - paths: default
10 |   - extras: default
11 |   - hydra: default
12 | 
13 | task_name: "eval"
14 | 
15 | tags: ["dev"]
16 | 
17 | # passing checkpoint path is necessary for evaluation
18 | ckpt_path: ???
19 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/configs/experiment/hifi_dataset_piper_phonemizer.yaml:
--------------------------------------------------------------------------------
 1 | # @package _global_
 2 | 
 3 | # to execute this experiment run:
 4 | # python train.py experiment=hifi_dataset_piper_phonemizer
 5 | 
 6 | defaults:
 7 |   - override /data: hi-fi_en-US_female.yaml
 8 | 
 9 | # all parameters below will be merged with parameters from default configurations set above
10 | # this allows you to overwrite only specified parameters
11 | 
12 | tags: ["hi-fi", "single_speaker", "piper_phonemizer", "en_US", "female"]
13 | 
14 | run_name: hi-fi_en-US_female_piper_phonemizer
15 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/configs/experiment/ljspeech.yaml:
--------------------------------------------------------------------------------
 1 | # @package _global_
 2 | 
 3 | # to execute this experiment run:
 4 | # python train.py experiment=ljspeech
 5 | 
 6 | defaults:
 7 |   - override /data: ljspeech.yaml
 8 | 
 9 | # all parameters below will be merged with parameters from default configurations set above
10 | # this allows you to overwrite only specified parameters
11 | 
12 | tags: ["ljspeech"]
13 | 
14 | run_name: ljspeech
15 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/configs/experiment/ljspeech_min_memory.yaml:
--------------------------------------------------------------------------------
 1 | # @package _global_
 2 | 
 3 | # to execute this experiment run:
 4 | # python train.py experiment=ljspeech_min_memory
 5 | 
 6 | defaults:
 7 |   - override /data: ljspeech.yaml
 8 | 
 9 | # all parameters below will be merged with parameters from default configurations set above
10 | # this allows you to overwrite only specified parameters
11 | 
12 | tags: ["ljspeech"]
13 | 
14 | run_name: ljspeech_min
15 | 
16 | 
17 | model:
18 |   out_size: 172
19 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/configs/experiment/multispeaker.yaml:
--------------------------------------------------------------------------------
 1 | # @package _global_
 2 | 
 3 | # to execute this experiment run:
 4 | # python train.py experiment=multispeaker
 5 | 
 6 | defaults:
 7 |   - override /data: vctk.yaml
 8 | 
 9 | # all parameters below will be merged with parameters from default configurations set above
10 | # this allows you to overwrite only specified parameters
11 | 
12 | tags: ["multispeaker"]
13 | 
14 | run_name: multispeaker
15 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/configs/extras/default.yaml:
--------------------------------------------------------------------------------
1 | # disable python warnings if they annoy you
2 | ignore_warnings: False
3 | 
4 | # ask user for tags if none are provided in the config
5 | enforce_tags: True
6 | 
7 | # pretty print config tree at the start of the run using Rich library
8 | print_config: True
9 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/configs/hydra/default.yaml:
--------------------------------------------------------------------------------
 1 | # https://hydra.cc/docs/configure_hydra/intro/
 2 | 
 3 | # enable color logging
 4 | defaults:
 5 |   - override hydra_logging: colorlog
 6 |   - override job_logging: colorlog
 7 | 
 8 | # output directory, generated dynamically on each run
 9 | run:
10 |   dir: ${paths.log_dir}/${task_name}/${run_name}/runs/${now:%Y-%m-%d}_${now:%H-%M-%S}
11 | sweep:
12 |   dir: ${paths.log_dir}/${task_name}/${run_name}/multiruns/${now:%Y-%m-%d}_${now:%H-%M-%S}
13 |   subdir: ${hydra.job.num}
14 | 
15 | job_logging:
16 |   handlers:
17 |     file:
18 |       # Incorporates fix from https://github.com/facebookresearch/hydra/pull/2242
19 |       filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
20 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/configs/local/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/CosyVoice/third_party/Matcha-TTS/configs/local/.gitkeep


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/configs/logger/aim.yaml:
--------------------------------------------------------------------------------
 1 | # https://aimstack.io/
 2 | 
 3 | # example usage in lightning module:
 4 | # https://github.com/aimhubio/aim/blob/main/examples/pytorch_lightning_track.py
 5 | 
 6 | # open the Aim UI with the following command (run in the folder containing the `.aim` folder):
 7 | # `aim up`
 8 | 
 9 | aim:
10 |   _target_: aim.pytorch_lightning.AimLogger
11 |   repo: ${paths.root_dir} # .aim folder will be created here
12 |   # repo: "aim://ip_address:port" # can instead provide IP address pointing to Aim remote tracking server which manages the repo, see https://aimstack.readthedocs.io/en/latest/using/remote_tracking.html#
13 | 
14 |   # aim allows grouping runs under an experiment name
15 |   experiment: null # any string, set to "default" if not specified
16 | 
17 |   train_metric_prefix: "train/"
18 |   val_metric_prefix: "val/"
19 |   test_metric_prefix: "test/"
20 | 
21 |   # sets the tracking interval in seconds for system usage metrics (CPU, GPU, memory, etc.)
22 |   system_tracking_interval: 10 # set to null to disable system metrics tracking
23 | 
24 |   # enable/disable logging of system params such as installed packages, git info, env vars, etc.
25 |   log_system_params: true
26 | 
27 |   # enable/disable tracking console logs (default value is true)
28 |   capture_terminal_logs: false # set to false to avoid infinite console log loop issue https://github.com/aimhubio/aim/issues/2550
29 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/configs/logger/comet.yaml:
--------------------------------------------------------------------------------
 1 | # https://www.comet.ml
 2 | 
 3 | comet:
 4 |   _target_: lightning.pytorch.loggers.comet.CometLogger
 5 |   api_key: ${oc.env:COMET_API_TOKEN} # api key is loaded from environment variable
 6 |   save_dir: "${paths.output_dir}"
 7 |   project_name: "lightning-hydra-template"
 8 |   rest_api_key: null
 9 |   # experiment_name: ""
10 |   experiment_key: null # set to resume experiment
11 |   offline: False
12 |   prefix: ""
13 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/configs/logger/csv.yaml:
--------------------------------------------------------------------------------
1 | # csv logger built in lightning
2 | 
3 | csv:
4 |   _target_: lightning.pytorch.loggers.csv_logs.CSVLogger
5 |   save_dir: "${paths.output_dir}"
6 |   name: "csv/"
7 |   prefix: ""
8 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/configs/logger/many_loggers.yaml:
--------------------------------------------------------------------------------
 1 | # train with many loggers at once
 2 | 
 3 | defaults:
 4 |   # - comet
 5 |   - csv
 6 |   # - mlflow
 7 |   # - neptune
 8 |   - tensorboard
 9 |   - wandb
10 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/configs/logger/mlflow.yaml:
--------------------------------------------------------------------------------
 1 | # https://mlflow.org
 2 | 
 3 | mlflow:
 4 |   _target_: lightning.pytorch.loggers.mlflow.MLFlowLogger
 5 |   # experiment_name: ""
 6 |   # run_name: ""
 7 |   tracking_uri: ${paths.log_dir}/mlflow/mlruns # run `mlflow ui` command inside the `logs/mlflow/` dir to open the UI
 8 |   tags: null
 9 |   # save_dir: "./mlruns"
10 |   prefix: ""
11 |   artifact_location: null
12 |   # run_id: ""
13 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/configs/logger/neptune.yaml:
--------------------------------------------------------------------------------
 1 | # https://neptune.ai
 2 | 
 3 | neptune:
 4 |   _target_: lightning.pytorch.loggers.neptune.NeptuneLogger
 5 |   api_key: ${oc.env:NEPTUNE_API_TOKEN} # api key is loaded from environment variable
 6 |   project: username/lightning-hydra-template
 7 |   # name: ""
 8 |   log_model_checkpoints: True
 9 |   prefix: ""
10 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/configs/logger/tensorboard.yaml:
--------------------------------------------------------------------------------
 1 | # https://www.tensorflow.org/tensorboard/
 2 | 
 3 | tensorboard:
 4 |   _target_: lightning.pytorch.loggers.tensorboard.TensorBoardLogger
 5 |   save_dir: "${paths.output_dir}/tensorboard/"
 6 |   name: null
 7 |   log_graph: False
 8 |   default_hp_metric: True
 9 |   prefix: ""
10 |   # version: ""
11 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/configs/logger/wandb.yaml:
--------------------------------------------------------------------------------
 1 | # https://wandb.ai
 2 | 
 3 | wandb:
 4 |   _target_: lightning.pytorch.loggers.wandb.WandbLogger
 5 |   # name: "" # name of the run (normally generated by wandb)
 6 |   save_dir: "${paths.output_dir}"
 7 |   offline: False
 8 |   id: null # pass correct id to resume experiment!
 9 |   anonymous: null # enable anonymous logging
10 |   project: "lightning-hydra-template"
11 |   log_model: False # upload lightning ckpts
12 |   prefix: "" # a string to put at the beginning of metric keys
13 |   # entity: "" # set to name of your wandb team
14 |   group: ""
15 |   tags: []
16 |   job_type: ""
17 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/configs/model/cfm/default.yaml:
--------------------------------------------------------------------------------
1 | name: CFM
2 | solver: euler
3 | sigma_min: 1e-4
4 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/configs/model/decoder/default.yaml:
--------------------------------------------------------------------------------
1 | channels: [256, 256]
2 | dropout: 0.05
3 | attention_head_dim: 64
4 | n_blocks: 1
5 | num_mid_blocks: 2
6 | num_heads: 2
7 | act_fn: snakebeta
8 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/configs/model/encoder/default.yaml:
--------------------------------------------------------------------------------
 1 | encoder_type: RoPE Encoder
 2 | encoder_params:
 3 |   n_feats: ${model.n_feats}
 4 |   n_channels: 192
 5 |   filter_channels: 768
 6 |   filter_channels_dp: 256
 7 |   n_heads: 2
 8 |   n_layers: 6
 9 |   kernel_size: 3
10 |   p_dropout: 0.1
11 |   spk_emb_dim: 64
12 |   n_spks: 1
13 |   prenet: true
14 | 
15 | duration_predictor_params:
16 |   filter_channels_dp: ${model.encoder.encoder_params.filter_channels_dp}
17 |   kernel_size: 3
18 |   p_dropout: ${model.encoder.encoder_params.p_dropout}
19 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/configs/model/matcha.yaml:
--------------------------------------------------------------------------------
 1 | defaults:
 2 |   - _self_
 3 |   - encoder: default.yaml
 4 |   - decoder: default.yaml
 5 |   - cfm: default.yaml
 6 |   - optimizer: adam.yaml
 7 | 
 8 | _target_: matcha.models.matcha_tts.MatchaTTS
 9 | n_vocab: 178
10 | n_spks: ${data.n_spks}
11 | spk_emb_dim: 64
12 | n_feats: 80
13 | data_statistics: ${data.data_statistics}
14 | out_size: null # Must be divisible by 4
15 | prior_loss: true
16 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/configs/model/optimizer/adam.yaml:
--------------------------------------------------------------------------------
1 | _target_: torch.optim.Adam
2 | _partial_: true
3 | lr: 1e-4
4 | weight_decay: 0.0
5 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/configs/paths/default.yaml:
--------------------------------------------------------------------------------
 1 | # path to root directory
 2 | # this requires PROJECT_ROOT environment variable to exist
 3 | # you can replace it with "." if you want the root to be the current working directory
 4 | root_dir: ${oc.env:PROJECT_ROOT}
 5 | 
 6 | # path to data directory
 7 | data_dir: ${paths.root_dir}/data/
 8 | 
 9 | # path to logging directory
10 | log_dir: ${paths.root_dir}/logs/
11 | 
12 | # path to output directory, created dynamically by hydra
13 | # path generation pattern is specified in `configs/hydra/default.yaml`
14 | # use it to store all files generated during the run, like ckpts and metrics
15 | output_dir: ${hydra:runtime.output_dir}
16 | 
17 | # path to working directory
18 | work_dir: ${hydra:runtime.cwd}
19 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/configs/trainer/cpu.yaml:
--------------------------------------------------------------------------------
1 | defaults:
2 |   - default
3 | 
4 | accelerator: cpu
5 | devices: 1
6 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/configs/trainer/ddp.yaml:
--------------------------------------------------------------------------------
 1 | defaults:
 2 |   - default
 3 | 
 4 | strategy: ddp
 5 | 
 6 | accelerator: gpu
 7 | devices: [0,1]
 8 | num_nodes: 1
 9 | sync_batchnorm: True
10 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/configs/trainer/ddp_sim.yaml:
--------------------------------------------------------------------------------
1 | defaults:
2 |   - default
3 | 
4 | # simulate DDP on CPU, useful for debugging
5 | accelerator: cpu
6 | devices: 2
7 | strategy: ddp_spawn
8 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/configs/trainer/default.yaml:
--------------------------------------------------------------------------------
 1 | _target_: lightning.pytorch.trainer.Trainer
 2 | 
 3 | default_root_dir: ${paths.output_dir}
 4 | 
 5 | max_epochs: -1
 6 | 
 7 | accelerator: gpu
 8 | devices: [0]
 9 | 
10 | # mixed precision for extra speed-up
11 | precision: 16-mixed
12 | 
13 | # perform a validation loop every N training epochs
14 | check_val_every_n_epoch: 1
15 | 
16 | # set True to ensure deterministic results
17 | # makes training slower but gives more reproducibility than just setting seeds
18 | deterministic: False
19 | 
20 | gradient_clip_val: 5.0
21 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/configs/trainer/gpu.yaml:
--------------------------------------------------------------------------------
1 | defaults:
2 |   - default
3 | 
4 | accelerator: gpu
5 | devices: 1
6 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/configs/trainer/mps.yaml:
--------------------------------------------------------------------------------
1 | defaults:
2 |   - default
3 | 
4 | accelerator: mps
5 | devices: 1
6 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/matcha/VERSION:
--------------------------------------------------------------------------------
1 | 0.0.5.1
2 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/matcha/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/CosyVoice/third_party/Matcha-TTS/matcha/__init__.py


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/matcha/hifigan/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 Jungil Kong
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/matcha/hifigan/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/CosyVoice/third_party/Matcha-TTS/matcha/hifigan/__init__.py


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/matcha/hifigan/config.py:
--------------------------------------------------------------------------------
# Hyperparameter dictionary for the vocoder code in this `hifigan` package.
# NOTE(review): the sibling modules are marked "from https://github.com/jik876/hifi-gan";
# this presumably mirrors that project's "v1" configuration - confirm against upstream.
v1 = {
    "resblock": "1",  # stored as a string, not an int
    "num_gpus": 0,
    "batch_size": 16,
    "learning_rate": 0.0004,
    "adam_b1": 0.8,
    "adam_b2": 0.99,
    "lr_decay": 0.999,
    "seed": 1234,
    # The two upsample lists have the same length (one entry per stage, presumably).
    "upsample_rates": [8, 8, 2, 2],
    "upsample_kernel_sizes": [16, 16, 4, 4],
    "upsample_initial_channel": 512,
    # One dilation list per entry of resblock_kernel_sizes.
    "resblock_kernel_sizes": [3, 7, 11],
    "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
    "resblock_initial_channel": 256,
    "segment_size": 8192,
    # Spectrogram / audio settings.
    "num_mels": 80,
    "num_freq": 1025,
    "n_fft": 1024,
    "hop_size": 256,
    "win_size": 1024,
    "sampling_rate": 22050,  # presumably Hz - TODO confirm
    "fmin": 0,
    "fmax": 8000,
    "fmax_loss": None,
    "num_workers": 4,
    "dist_config": {"dist_backend": "nccl", "dist_url": "tcp://localhost:54321", "world_size": 1},
}
29 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/matcha/hifigan/env.py:
--------------------------------------------------------------------------------
 1 | """ from https://github.com/jik876/hifi-gan """
 2 | 
 3 | import os
 4 | import shutil
 5 | 
 6 | 
 7 | class AttrDict(dict):
 8 |     def __init__(self, *args, **kwargs):
 9 |         super().__init__(*args, **kwargs)
10 |         self.__dict__ = self
11 | 
12 | 
def build_env(config, config_name, path):
    """Copy the file ``config`` into directory ``path`` under the name ``config_name``.

    Args:
        config: Path of the existing configuration file to copy.
        config_name: File name to give the copy inside ``path``.
        path: Destination directory; created (with parents) if it is missing.

    Nothing is copied when ``config`` already is the destination file. This
    holds even when the two paths are spelled differently (for example
    ``dir/./cfg`` versus ``dir/cfg``), a case in which ``shutil.copyfile``
    would otherwise raise ``shutil.SameFileError``.
    """
    t_path = os.path.join(path, config_name)
    if config == t_path:
        return

    os.makedirs(path, exist_ok=True)
    try:
        shutil.copyfile(config, t_path)
    except shutil.SameFileError:
        # Source and destination resolve to the same file on disk, so the
        # config is already in place and there is nothing to copy.
        pass
18 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/matcha/hifigan/xutils.py:
--------------------------------------------------------------------------------
 1 | """ from https://github.com/jik876/hifi-gan """
 2 | 
 3 | import glob
 4 | import os
 5 | 
 6 | import matplotlib
 7 | import torch
 8 | from torch.nn.utils import weight_norm
 9 | 
10 | matplotlib.use("Agg")
11 | import matplotlib.pylab as plt
12 | 
13 | 
def plot_spectrogram(spectrogram):
    """Render ``spectrogram`` as an image with a colorbar and return the figure.

    The canvas is drawn and the current pyplot figure is closed before
    returning; the figure object itself is handed back to the caller.
    """
    figure, axis = plt.subplots(figsize=(10, 2))
    image = axis.imshow(
        spectrogram,
        aspect="auto",
        origin="lower",  # first row of the array is shown at the bottom
        interpolation="none",
    )
    plt.colorbar(image, ax=axis)

    figure.canvas.draw()
    plt.close()

    return figure
23 | 
24 | 
def init_weights(m, mean=0.0, std=0.01):
    """Fill ``m.weight`` in place from a normal distribution for Conv-like modules.

    A module is selected when its class name contains the substring "Conv";
    any other module is left untouched.
    """
    layer_kind = m.__class__.__name__
    if "Conv" not in layer_kind:
        return
    m.weight.data.normal_(mean, std)
29 | 
30 | 
def apply_weight_norm(m):
    """Apply ``weight_norm`` to ``m`` when its class name contains "Conv".

    Modules whose class name does not contain that substring are left as is.
    """
    layer_kind = m.__class__.__name__
    if "Conv" not in layer_kind:
        return
    weight_norm(m)
35 | 
36 | 
def get_padding(kernel_size, dilation=1):
    """Return ``(kernel_size * dilation - dilation) / 2`` truncated to an int."""
    dilated_span = kernel_size * dilation - dilation
    return int(dilated_span / 2)
39 | 
40 | 
def load_checkpoint(filepath, device):
    """Load the object stored at ``filepath`` with ``torch.load``.

    ``device`` is forwarded as ``map_location``. Progress is printed to
    stdout. An ``AssertionError`` is raised when ``filepath`` is not an
    existing regular file.
    """
    assert os.path.isfile(filepath)
    print(f"Loading '{filepath}'")
    # NOTE(review): depending on the installed torch version's defaults,
    # torch.load may unpickle arbitrary objects; only load trusted files.
    loaded = torch.load(filepath, map_location=device)
    print("Complete.")
    return loaded
47 | 
48 | 
def save_checkpoint(filepath, obj):
    """Write ``obj`` to ``filepath`` with ``torch.save``, printing progress to stdout."""
    start_message = f"Saving checkpoint to {filepath}"
    print(start_message)
    torch.save(obj, filepath)
    print("Complete.")
53 | 
54 | 
def scan_checkpoint(cp_dir, prefix):
    """Return the lexicographically last path in ``cp_dir`` named ``prefix`` + 8 characters.

    Returns ``None`` when no path matches the glob pattern.
    """
    # Each "?" matches exactly one character, so only eight-character suffixes qualify.
    candidates = glob.glob(os.path.join(cp_dir, prefix + "????????"))
    return max(candidates) if candidates else None
61 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/matcha/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/CosyVoice/third_party/Matcha-TTS/matcha/models/__init__.py


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/matcha/models/components/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/CosyVoice/third_party/Matcha-TTS/matcha/models/components/__init__.py


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/matcha/onnx/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/CosyVoice/third_party/Matcha-TTS/matcha/onnx/__init__.py


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/matcha/text/symbols.py:
--------------------------------------------------------------------------------
 1 | """ from https://github.com/keithito/tacotron
 2 | 
 3 | Defines the set of symbols used in text input to the model.
 4 | """
 5 | _pad = "_"
 6 | _punctuation = ';:,.!?¡¿—…"«»“” '
 7 | _letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
 8 | _letters_ipa = (
 9 |     "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
10 | )
11 | 
12 | 
# Export all symbols:
# The list is built in this order: pad, punctuation, ASCII letters, IPA letters.
symbols = [_pad, *_punctuation, *_letters, *_letters_ipa]

# Special symbol ids
# SPACE_ID is the position of the space character within ``symbols``.
SPACE_ID = symbols.index(" ")
18 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/matcha/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from matcha.utils.instantiators import instantiate_callbacks, instantiate_loggers
2 | from matcha.utils.logging_utils import log_hyperparameters
3 | from matcha.utils.pylogger import get_pylogger
4 | from matcha.utils.rich_utils import enforce_tags, print_config_tree
5 | from matcha.utils.utils import extras, get_metric_value, task_wrapper
6 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/matcha/utils/monotonic_align/__init__.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import torch
 3 | 
 4 | from matcha.utils.monotonic_align.core import maximum_path_c
 5 | 
 6 | 
def maximum_path(value, mask):
    """Cython optimised version.

    Wrapper around ``maximum_path_c``: moves the inputs to NumPy on the CPU,
    runs the compiled routine, and returns the resulting path as a tensor on
    the device and with the dtype of ``value * mask``.

    value: [b, t_x, t_y]
    mask: [b, t_x, t_y]
    """
    # Zero out entries outside the mask before handing the scores over.
    value = value * mask
    # Remember device and dtype so the result can be converted back at the end.
    device = value.device
    dtype = value.dtype
    value = value.data.cpu().numpy().astype(np.float32)
    # Output buffer with the same shape as value; maximum_path_c writes into it.
    path = np.zeros_like(value).astype(np.int32)
    mask = mask.data.cpu().numpy()

    # Per-item sizes: sum the mask over one axis and read index 0 of the other.
    # NOTE(review): presumably the mask is 1 on a leading t_x-by-t_y rectangle
    # per batch item, so these sums are the valid lengths; confirm with callers.
    t_x_max = mask.sum(1)[:, 0].astype(np.int32)
    t_y_max = mask.sum(2)[:, 0].astype(np.int32)
    maximum_path_c(path, value, t_x_max, t_y_max)
    return torch.from_numpy(path).to(device=device, dtype=dtype)
23 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/matcha/utils/monotonic_align/core.pyx:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | 
 3 | cimport cython
 4 | cimport numpy as np
 5 | 
 6 | from cython.parallel import prange
 7 | 
 8 | 
# Computes one item's path in two passes over a t_x-by-t_y grid:
#   1. forward: `value` is overwritten in place with accumulated scores,
#      value[x, y] += max(value[x, y-1], value[x-1, y-1]);
#   2. backward: starting from row t_x - 1 at the last column, `path` is set
#      to 1 on one cell per column while walking back to column 0.
# Bounds checking and negative-index wraparound are disabled by the decorators.
@cython.boundscheck(False)
@cython.wraparound(False)
cdef void maximum_path_each(int[:,::1] path, float[:,::1] value, int t_x, int t_y, float max_neg_val) nogil:
  cdef int x
  cdef int y
  cdef float v_prev
  cdef float v_cur
  # NOTE(review): `tmp` is declared but never used in this function.
  cdef float tmp
  # Row at which the backward pass starts (last row of the valid region).
  cdef int index = t_x - 1

  # Forward pass, column by column.
  for y in range(t_y):
    # Only rows x with t_x + y - t_y <= x <= y (clipped to [0, t_x)) are visited.
    for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)):
      # v_cur: score of staying in the same row (predecessor [x, y-1]).
      # It is disabled with max_neg_val on the diagonal x == y.
      if x == y:
        v_cur = max_neg_val
      else:
        v_cur = value[x, y-1]
      # v_prev: score of arriving from the previous row (predecessor [x-1, y-1]).
      # Row 0 has no such predecessor; only the origin cell (0, 0) starts at 0.
      if x == 0:
        if y == 0:
          v_prev = 0.
        else:
          v_prev = max_neg_val
      else:
        v_prev = value[x-1, y-1]
      value[x, y] = max(v_cur, v_prev) + value[x, y]

  # Backward pass: mark one row per column, from the last column to the first.
  for y in range(t_y - 1, -1, -1):
    path[index, y] = 1
    # Move up a row when forced (index == y) or when the diagonal predecessor
    # has the larger accumulated score.
    # NOTE(review): if t_x > t_y, index is still > 0 at y == 0 and
    # value[index, y-1] reads column -1 with bounds checks off; presumably
    # callers guarantee t_x <= t_y. Confirm.
    if index != 0 and (index == y or value[index, y-1] < value[index-1, y-1]):
      index = index - 1
38 | 
39 | 
# Batch entry point: runs maximum_path_each on every item along the first axis
# of `values`, writing into the matching slice of `paths`.
# t_xs[i] and t_ys[i] are the sizes passed for item i; max_neg_val is forwarded
# unchanged (default -1e9).
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef void maximum_path_c(int[:,:,::1] paths, float[:,:,::1] values, int[::1] t_xs, int[::1] t_ys, float max_neg_val=-1e9) nogil:
  # Number of items in the batch.
  cdef int b = values.shape[0]

  cdef int i
  # prange with nogil=True iterates over the batch items without the GIL.
  for i in prange(b, nogil=True):
    maximum_path_each(paths[i], values[i], t_xs[i], t_ys[i], max_neg_val)
48 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/matcha/utils/monotonic_align/setup.py:
--------------------------------------------------------------------------------
1 | # from distutils.core import setup
2 | # from Cython.Build import cythonize
3 | # import numpy
4 | 
5 | # setup(name='monotonic_align',
6 | #       ext_modules=cythonize("core.pyx"),
7 | #       include_dirs=[numpy.get_include()])
8 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/matcha/utils/pylogger.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | 
 3 | from lightning.pytorch.utilities import rank_zero_only
 4 | 
 5 | 
 6 | def get_pylogger(name: str = __name__) -> logging.Logger:
 7 |     """Initializes a multi-GPU-friendly python command line logger.
 8 | 
 9 |     :param name: The name of the logger, defaults to ``__name__``.
10 | 
11 |     :return: A logger object.
12 |     """
13 |     logger = logging.getLogger(name)
14 | 
15 |     # this ensures all logging levels get marked with the rank zero decorator
16 |     # otherwise logs would get multiplied for each GPU process in multi-GPU setup
17 |     logging_levels = ("debug", "info", "warning", "error", "exception", "fatal", "critical")
18 |     for level in logging_levels:
19 |         setattr(logger, level, rank_zero_only(getattr(logger, level)))
20 | 
21 |     return logger
22 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/notebooks/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/CosyVoice/third_party/Matcha-TTS/notebooks/.gitkeep


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [build-system]
 2 | requires = ["setuptools", "wheel", "cython==0.29.35", "numpy==1.24.3", "packaging"]
 3 | 
 4 | [tool.black]
 5 | line-length = 120
 6 | target-version = ['py310']
 7 | exclude = '''
 8 | 
 9 | (
10 |   /(
11 |       \.eggs         # exclude a few common directories in the
12 |     | \.git          # root of the project
13 |     | \.hg
14 |     | \.mypy_cache
15 |     | \.tox
16 |     | \.venv
17 |     | _build
18 |     | buck-out
19 |     | build
20 |     | dist
21 |   )/
22 |   | foo.py           # also separately exclude a file named foo.py in
23 |                      # the root of the project
24 | )
25 | '''
26 | 
27 | [tool.pytest.ini_options]
28 | addopts = [
29 |   "--color=yes",
30 |   "--durations=0",
31 |   "--strict-markers",
32 |   "--doctest-modules",
33 | ]
34 | filterwarnings = [
35 |   "ignore::DeprecationWarning",
36 |   "ignore::UserWarning",
37 | ]
38 | log_cli = "True"
39 | markers = [
40 |   "slow: slow tests",
41 | ]
42 | minversion = "6.0"
43 | testpaths = "tests/"
44 | 
45 | [tool.coverage.report]
46 | exclude_lines = [
47 |     "pragma: nocover",
48 |     "raise NotImplementedError",
49 |     "raise NotImplementedError()",
50 |     "if __name__ == .__main__.:",
51 | ]
52 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/requirements.txt:
--------------------------------------------------------------------------------
 1 | # --------- pytorch --------- #
 2 | torch>=2.0.0
 3 | torchvision>=0.15.0
 4 | lightning>=2.0.0
 5 | torchmetrics>=0.11.4
 6 | 
 7 | # --------- hydra --------- #
 8 | hydra-core==1.3.2
 9 | hydra-colorlog==1.2.0
10 | hydra-optuna-sweeper==1.2.0
11 | 
12 | # --------- loggers --------- #
13 | # wandb
14 | # neptune-client
15 | # mlflow
16 | # comet-ml
17 | # aim>=3.16.2  # no lower than 3.16.2, see https://github.com/aimhubio/aim/issues/2550
18 | 
19 | # --------- others --------- #
20 | rootutils       # standardizing the project root setup
21 | pre-commit      # hooks for applying linters on commit
22 | rich            # beautiful text formatting in terminal
23 | pytest          # tests
24 | # sh            # for running bash commands in some tests (linux/macos only)
25 | phonemizer      # phonemization of text
26 | tensorboard
27 | librosa
28 | Cython
29 | numpy
30 | einops
31 | inflect
32 | Unidecode
33 | scipy
34 | torchaudio
35 | matplotlib
36 | pandas
37 | conformer==0.3.2
38 | diffusers==0.25.0
39 | notebook
40 | ipywidgets
41 | gradio==3.43.2
42 | gdown
43 | wget
44 | seaborn
45 | piper_phonemize
46 | 


--------------------------------------------------------------------------------
/tools/CosyVoice/third_party/Matcha-TTS/scripts/schedule.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Schedule execution of many runs
3 | # Run from root folder with: bash scripts/schedule.sh
4 | 
5 | python src/train.py trainer.max_epochs=5 logger=csv
6 | 
7 | python src/train.py trainer.max_epochs=10 logger=csv
8 | 


--------------------------------------------------------------------------------
/tools/DiffSinger/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | *.pyc
3 | __pycache__/
4 | *.sh
5 | local_tools/


--------------------------------------------------------------------------------
/tools/DiffSinger/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2021 Jinglin Liu
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.


--------------------------------------------------------------------------------
/tools/DiffSinger/configs/config_base.yaml:
--------------------------------------------------------------------------------
 1 | # task
 2 | binary_data_dir: ''
 3 | work_dir: '' # experiment directory.
 4 | infer: false # infer
 5 | seed: 1234
 6 | debug: false
 7 | save_codes:
 8 |   - configs
 9 |   - modules
10 |   - tasks
11 |   - utils
12 |   - usr
13 | 
14 | #############
15 | # dataset
16 | #############
17 | ds_workers: 1
18 | test_num: 100
19 | valid_num: 100
20 | endless_ds: false
21 | sort_by_len: true
22 | 
23 | #########
24 | # train and eval
25 | #########
26 | load_ckpt: ''
27 | save_ckpt: true
28 | save_best: false
29 | num_ckpt_keep: 3
30 | clip_grad_norm: 0
31 | accumulate_grad_batches: 1
32 | log_interval: 100
33 | num_sanity_val_steps: 5  # steps of validation at the beginning
34 | check_val_every_n_epoch: 10
35 | val_check_interval: 2000
36 | max_epochs: 1000
37 | max_updates: 160000
38 | max_tokens: 31250
39 | max_sentences: 100000
40 | max_eval_tokens: -1
41 | max_eval_sentences: -1
42 | test_input_dir: ''
43 | 


--------------------------------------------------------------------------------
/tools/DiffSinger/configs/singing/base.yaml:
--------------------------------------------------------------------------------
 1 | base_config:
 2 |   - configs/tts/base.yaml
 3 |   - configs/tts/base_zh.yaml
 4 | 
 5 | 
 6 | datasets: []
 7 | test_prefixes: []
 8 | test_num: 0
 9 | valid_num: 0
10 | 
11 | pre_align_cls: data_gen.singing.pre_align.SingingPreAlign
12 | binarizer_cls: data_gen.singing.binarize.SingingBinarizer
13 | pre_align_args:
14 |   use_tone: false # for ZH
15 |   forced_align: mfa
16 |   use_sox: true
17 | hop_size: 128            # Hop size.
18 | fft_size: 512           # FFT size.
19 | win_size: 512           # Window size.
20 | max_frames: 8000
21 | fmin: 50                 # Minimum freq in mel basis calculation.
22 | fmax: 11025               # Maximum frequency in mel basis calculation.
23 | pitch_type: frame
24 | 
25 | hidden_size: 256
26 | mel_loss: "ssim:0.5|l1:0.5"
27 | lambda_f0: 0.0
28 | lambda_uv: 0.0
29 | lambda_energy: 0.0
30 | lambda_ph_dur: 0.0
31 | lambda_sent_dur: 0.0
32 | lambda_word_dur: 0.0
33 | predictor_grad: 0.0
34 | use_spk_embed: true
35 | use_spk_id: false
36 | 
37 | max_tokens: 20000
38 | max_updates: 400000
39 | num_spk: 100
40 | save_f0: true
41 | use_gt_dur: true
42 | use_gt_f0: true
43 | 


--------------------------------------------------------------------------------
/tools/DiffSinger/configs/singing/fs2.yaml:
--------------------------------------------------------------------------------
1 | base_config:
2 |   - configs/tts/fs2.yaml
3 |   - configs/singing/base.yaml
4 | 


--------------------------------------------------------------------------------
/tools/DiffSinger/configs/tts/base_zh.yaml:
--------------------------------------------------------------------------------
1 | pre_align_args:
2 |   txt_processor: zh_g2pM
3 | binarizer_cls: data_gen.tts.binarizer_zh.ZhBinarizer


--------------------------------------------------------------------------------
/tools/DiffSinger/configs/tts/hifigan.yaml:
--------------------------------------------------------------------------------
 1 | base_config: configs/tts/pwg.yaml
 2 | task_cls: tasks.vocoder.hifigan.HifiGanTask
 3 | resblock: "1"
 4 | adam_b1: 0.8
 5 | adam_b2: 0.99
 6 | upsample_rates: [ 8,8,2,2 ]
 7 | upsample_kernel_sizes: [ 16,16,4,4 ]
 8 | upsample_initial_channel: 128
 9 | resblock_kernel_sizes: [ 3,7,11 ]
10 | resblock_dilation_sizes: [ [ 1,3,5 ], [ 1,3,5 ], [ 1,3,5 ] ]
11 | 
12 | lambda_mel: 45.0
13 | 
14 | max_samples: 8192
15 | max_sentences: 16
16 | 
17 | generator_params:
18 |   lr: 0.0002            # Generator's learning rate.
19 |   aux_context_window: 0 # Context window size for auxiliary feature.
20 | discriminator_optimizer_params:
21 |   lr: 0.0002            # Discriminator's learning rate.


--------------------------------------------------------------------------------
/tools/DiffSinger/configs/tts/lj/base_mel2wav.yaml:
--------------------------------------------------------------------------------
1 | raw_data_dir: 'data/raw/LJSpeech-1.1'
2 | processed_data_dir: 'data/processed/ljspeech'
3 | binary_data_dir: 'data/binary/ljspeech_wav'
4 | 


--------------------------------------------------------------------------------
/tools/DiffSinger/configs/tts/lj/base_text2mel.yaml:
--------------------------------------------------------------------------------
 1 | raw_data_dir: 'data/raw/LJSpeech-1.1'
 2 | processed_data_dir: 'data/processed/ljspeech'
 3 | binary_data_dir: 'data/binary/ljspeech'
 4 | pre_align_cls: data_gen.tts.lj.pre_align.LJPreAlign
 5 | 
 6 | pitch_type: cwt
 7 | mel_loss: l1
 8 | num_test_samples: 20
 9 | test_ids: [ 68, 70, 74, 87, 110, 172, 190, 215, 231, 294,
10 |             316, 324, 402, 422, 485, 500, 505, 508, 509, 519 ]
11 | use_energy_embed: false
12 | test_num: 523
13 | valid_num: 348


--------------------------------------------------------------------------------
/tools/DiffSinger/configs/tts/lj/fs2.yaml:
--------------------------------------------------------------------------------
1 | base_config:
2 |   - configs/tts/fs2.yaml
3 |   - configs/tts/lj/base_text2mel.yaml


--------------------------------------------------------------------------------
/tools/DiffSinger/configs/tts/lj/hifigan.yaml:
--------------------------------------------------------------------------------
1 | base_config:
2 |   - configs/tts/hifigan.yaml
3 |   - configs/tts/lj/base_mel2wav.yaml


--------------------------------------------------------------------------------
/tools/DiffSinger/configs/tts/lj/pwg.yaml:
--------------------------------------------------------------------------------
1 | base_config:
2 |   - configs/tts/pwg.yaml
3 |   - configs/tts/lj/base_mel2wav.yaml


--------------------------------------------------------------------------------
/tools/DiffSinger/data/processed/ljspeech/dict.txt:
--------------------------------------------------------------------------------
 1 | ! !
 2 | , ,
 3 | . .
 4 | ; ;
 5 | <BOS> <BOS>
 6 | <EOS> <EOS>
 7 | ? ?
 8 | AA0 AA0
 9 | AA1 AA1
10 | AA2 AA2
11 | AE0 AE0
12 | AE1 AE1
13 | AE2 AE2
14 | AH0 AH0
15 | AH1 AH1
16 | AH2 AH2
17 | AO0 AO0
18 | AO1 AO1
19 | AO2 AO2
20 | AW0 AW0
21 | AW1 AW1
22 | AW2 AW2
23 | AY0 AY0
24 | AY1 AY1
25 | AY2 AY2
26 | B B
27 | CH CH
28 | D D
29 | DH DH
30 | EH0 EH0
31 | EH1 EH1
32 | EH2 EH2
33 | ER0 ER0
34 | ER1 ER1
35 | ER2 ER2
36 | EY0 EY0
37 | EY1 EY1
38 | EY2 EY2
39 | F F
40 | G G
41 | HH HH
42 | IH0 IH0
43 | IH1 IH1
44 | IH2 IH2
45 | IY0 IY0
46 | IY1 IY1
47 | IY2 IY2
48 | JH JH
49 | K K
50 | L L
51 | M M
52 | N N
53 | NG NG
54 | OW0 OW0
55 | OW1 OW1
56 | OW2 OW2
57 | OY0 OY0
58 | OY1 OY1
59 | OY2 OY2
60 | P P
61 | R R
62 | S S
63 | SH SH
64 | T T
65 | TH TH
66 | UH0 UH0
67 | UH1 UH1
68 | UH2 UH2
69 | UW0 UW0
70 | UW1 UW1
71 | UW2 UW2
72 | V V
73 | W W
74 | Y Y
75 | Z Z
76 | ZH ZH
77 | | |
78 | 


--------------------------------------------------------------------------------
/tools/DiffSinger/data/processed/ljspeech/phone_set.json:
--------------------------------------------------------------------------------
1 | ["!", ",", ".", ";", "<BOS>", "<EOS>", "?", "AA0", "AA1", "AA2", "AE0", "AE1", "AE2", "AH0", "AH1", "AH2", "AO0", "AO1", "AO2", "AW0", "AW1", "AW2", "AY0", "AY1", "AY2", "B", "CH", "D", "DH", "EH0", "EH1", "EH2", "ER0", "ER1", "ER2", "EY0", "EY1", "EY2", "F", "G", "HH", "IH0", "IH1", "IH2", "IY0", "IY1", "IY2", "JH", "K", "L", "M", "N", "NG", "OW0", "OW1", "OW2", "OY0", "OY1", "OY2", "P", "R", "S", "SH", "T", "TH", "UH0", "UH1", "UH2", "UW0", "UW1", "UW2", "V", "W", "Y", "Z", "ZH", "|"]


--------------------------------------------------------------------------------
/tools/DiffSinger/data/有何不可.json:
--------------------------------------------------------------------------------
1 | {"text": "天空好想下雨，我好想住你隔壁。傻站在你家楼下，抬起头数乌云。如果场景里出现一架钢琴，我会唱歌给你听。哪怕（好）多盆水往下淋，夏天快要过去。请你少买冰淇淋，天凉就别穿短裙。别再那么淘气，如果有时不那么开心。我愿意将格洛米借给你，你其实明白我心意。为你唱这首歌没有什么风格，它仅仅代表着我想给你快乐。为你解冻冰河 为你做一只扑火的飞蛾，没有什么事情是不值得。为你唱这首歌没有什么风格，它仅仅代表着我希望你快乐。为你辗转反侧为你放弃世界有何不可，夏末秋凉里带一点温热有换季的颜色。天空好想下雨，我好想住你隔壁。傻站在你家楼下，抬起头数乌云。如果场景里出现一架钢琴，我会唱歌给你听。哪怕（好）多盆水往下淋，夏天快要过去。请你少买冰淇淋，天凉就别穿短裙。别再那么淘气，如果有时不那么开心。我愿意将格洛米借给你，你其实明白我心意。为你唱这首歌没有什么风格，它仅仅代表着我想给你快乐。为你解冻冰河为你做一只扑火的飞蛾，没有什么事情是不值得。为你唱这首歌没有什么风格，它仅仅代表着我希望你快乐。为你辗转反侧为你放弃世界有何不可，夏末秋凉里带一点温热。为你解冻冰河为你做一只扑火的飞蛾，没有什么事情是不值得。为你唱这首歌没有什么风格，它仅仅代表着我希望你快乐。为你辗转反侧为你放弃世界有何不可，夏末秋凉里带一点温热有换季的颜色。",
2 |   "notes": }


--------------------------------------------------------------------------------
/tools/DiffSinger/data_gen/tts/bin/binarize.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | os.environ["OMP_NUM_THREADS"] = "1"
 4 | 
 5 | import importlib
 6 | from utils.hparams import set_hparams, hparams
 7 | 
 8 | 
 9 | def binarize():
10 |     binarizer_cls = hparams.get("binarizer_cls", 'data_gen.tts.base_binarizer.BaseBinarizer')
11 |     pkg = ".".join(binarizer_cls.split(".")[:-1])
12 |     cls_name = binarizer_cls.split(".")[-1]
13 |     binarizer_cls = getattr(importlib.import_module(pkg), cls_name)
14 |     print("| Binarizer: ", binarizer_cls)
15 |     binarizer_cls().process()
16 | 
17 | 
if __name__ == '__main__':
    # Script entry point. set_hparams() runs first; presumably it fills the
    # shared `hparams` that binarize() reads (verify in utils.hparams).
    set_hparams()
    binarize()
21 | 


--------------------------------------------------------------------------------
/tools/DiffSinger/data_gen/tts/txt_processors/base_text_processor.py:
--------------------------------------------------------------------------------
class BaseTxtProcessor:
    """Base class for text processors; subclasses must implement ``process``."""

    @staticmethod
    def sp_phonemes():
        """Return the list of special phoneme symbols (only ``'|'`` here)."""
        special = ['|']
        return special

    @classmethod
    def process(cls, txt, pre_align_args):
        """Process ``txt``; not implemented in the base class."""
        raise NotImplementedError
9 | 


--------------------------------------------------------------------------------
/tools/DiffSinger/inference/svs/opencpop/map.py:
--------------------------------------------------------------------------------
def cpop_pinyin2ph_func():
    """Load the Opencpop pinyin-to-phoneme mapping table.

    Reads ``inference/svs/opencpop/cpop_pinyin2ph.txt`` (relative to the
    current working directory). Each line is split on ``|``; the first
    non-empty field is the pinyin key and the second is its phoneme string.
    Lines with fewer than two non-empty fields (for example blank lines) are
    skipped instead of raising ``IndexError``.

    :return: dict mapping pinyin to phoneme string, always containing the
        identity entries for ``'AP'`` and ``'SP'``.
    :raises OSError: if the mapping file cannot be opened.
    """
    # In the README file of opencpop dataset, they defined a "pinyin to phoneme mapping table"
    pinyin2phs = {'AP': 'AP', 'SP': 'SP'}
    # Explicit encoding keeps parsing independent of the platform's locale.
    with open('inference/svs/opencpop/cpop_pinyin2ph.txt', encoding='utf-8') as rf:
        for line in rf:
            elements = [x.strip() for x in line.split('|') if x.strip() != '']
            if len(elements) < 2:
                # Blank or malformed line: nothing to map.
                continue
            pinyin2phs[elements[0]] = elements[1]
    return pinyin2phs


--------------------------------------------------------------------------------
/tools/DiffSinger/modules/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/DiffSinger/modules/__init__.py


--------------------------------------------------------------------------------
/tools/DiffSinger/modules/parallel_wavegan/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/DiffSinger/modules/parallel_wavegan/__init__.py


--------------------------------------------------------------------------------
/tools/DiffSinger/modules/parallel_wavegan/layers/__init__.py:
--------------------------------------------------------------------------------
1 | from .causal_conv import *  # NOQA
2 | from .pqmf import *  # NOQA
3 | from .residual_block import *  # NOQA
4 | from modules.parallel_wavegan.layers.residual_stack import *  # NOQA
5 | from .upsample import *  # NOQA
6 | 


--------------------------------------------------------------------------------
/tools/DiffSinger/modules/parallel_wavegan/losses/__init__.py:
--------------------------------------------------------------------------------
1 | from .stft_loss import *  # NOQA
2 | 


--------------------------------------------------------------------------------
/tools/DiffSinger/modules/parallel_wavegan/models/__init__.py:
--------------------------------------------------------------------------------
1 | from .melgan import *  # NOQA
2 | from .parallel_wavegan import *  # NOQA
3 | 


--------------------------------------------------------------------------------
/tools/DiffSinger/modules/parallel_wavegan/optimizers/__init__.py:
--------------------------------------------------------------------------------
1 | from torch.optim import *  # NOQA
2 | from .radam import *  # NOQA
3 | 


--------------------------------------------------------------------------------
/tools/DiffSinger/modules/parallel_wavegan/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .utils import *  # NOQA
2 | 


--------------------------------------------------------------------------------
/tools/DiffSinger/requirements.txt:
--------------------------------------------------------------------------------
 1 | matplotlib
 2 | librosa==0.8.0
 3 | tqdm
 4 | pandas
 5 | numba==0.53.1
 6 | numpy==1.19.2
 7 | scipy==1.5.4
 8 | PyYAML==5.3.1
 9 | tensorboardX
10 | pyloudnorm
11 | setuptools>=41.0.0
12 | g2p_en
13 | resemblyzer
14 | webrtcvad
15 | tensorboard==2.6.0
16 | scikit-learn==0.24.1
17 | scikit-image==0.16.2
18 | textgrid
19 | jiwer
20 | pycwt
21 | PyWavelets
22 | praat-parselmouth==0.3.3
23 | jieba
24 | einops
25 | chardet
26 | pretty-midi==0.2.9
27 | pytorch-lightning==0.7.1
28 | h5py==3.1.0
29 | pypinyin==0.39.0
30 | g2pM==0.1.2.5


--------------------------------------------------------------------------------
/tools/DiffSinger/requirements_3090.txt:
--------------------------------------------------------------------------------
 1 | absl-py==0.15.0
 2 | appdirs==1.4.4
 3 | audioread==2.1.9
 4 | beautifulsoup4==4.10.0
 5 | certifi==2021.10.8
 6 | cffi==1.15.0
 7 | charset-normalizer==2.0.7
 8 | cycler==0.11.0
 9 | Cython==0.29.24
10 | decorator==4.4.2
11 | dlib==19.22.1
12 | einops==0.3.2
13 | future==0.18.2
14 | g2p-en==2.1.0
15 | google==3.0.0
16 | grpcio==1.42.0
17 | h5py==2.8.0
18 | horology==1.2.0
19 | idna==3.3
20 | imageio==2.10.1
21 | imageio-ffmpeg==0.4.5
22 | importlib-metadata==4.8.1
23 | joblib==1.1.0
24 | kiwisolver==1.3.2
25 | librosa==0.8.0
26 | llvmlite==0.31.0
27 | Markdown==3.3.4
28 | matplotlib==3.4.3
29 | miditoolkit==0.1.7
30 | moviepy==1.0.3
31 | numba==0.48.0
32 | numpy==1.20.0
33 | opencv-python==4.5.4.58
34 | packaging==21.2
35 | pandas==1.3.4
36 | Pillow==8.4.0
37 | pooch==1.5.2
38 | praat-parselmouth==0.3.3
39 | proglog==0.1.9
40 | protobuf==3.19.1
41 | pycparser==2.20
42 | pycwt==0.3.0a22
43 | pydub==0.25.1
44 | pyloudnorm==0.1.0
45 | pyparsing==2.4.7
46 | pypinyin==0.43.0
47 | python-dateutil==2.8.2
48 | pytorch-lightning==0.7.1
49 | pytorch-ssim==0.1
50 | pytz==2021.3
51 | pyworld==0.3.0
52 | PyYAML==6.0
53 | requests==2.26.0
54 | resampy==0.2.2
55 | Resemblyzer==0.1.1.dev0
56 | scikit-image==0.16.2
57 | scikit-learn==0.22
58 | scipy==1.3.0
59 | six==1.16.0
60 | sklearn==0.0
61 | SoundFile==0.10.3.post1
62 | soupsieve==2.3
63 | sympy==1.9
64 | tensorboard==1.15.0
65 | tensorboardX==2.4
66 | test-tube==0.7.5
67 | TextGrid==1.5
68 | torch @ https://download.pytorch.org/whl/nightly/cu113/torch-1.10.0.dev20210907%2Bcu113-cp37-cp37m-linux_x86_64.whl
69 | torchvision==0.9.1
70 | tqdm==4.62.3
71 | typing-extensions==3.10.0.2
72 | urllib3==1.26.7
73 | uuid==1.30
74 | webrtcvad==2.0.10
75 | Werkzeug==2.0.2
76 | zipp==3.6.0
77 | 


--------------------------------------------------------------------------------
/tools/DiffSinger/resources/apply_form.md:
--------------------------------------------------------------------------------
 1 | # The way to apply for PopCS
 2 | Thank you for your interest in our work. Please send an email to jinglinliu@zju.edu.cn with the following content:
 3 | 
 4 | "
 5 | 
 6 | name: ***
 7 | 
 8 | affiliations: *** (school or institution)
 9 | 
10 | research fields: ***
11 | 
12 | We want to apply for PopCS and agree to the dataset license: CC BY-NC-SA 4.0 (NonCommercial!).
13 | 
14 | We accept full responsibility for our use of the dataset and shall defend and indemnify the authors of DiffSinger, against any and all claims arising from our use of the dataset, including but not limited to our use of any copies of copyrighted audio files that we may create from the dataset.
15 | 
16 | We hereby represent that we are fully authorized to enter into this agreement on behalf of our employer.
17 | 
18 | We will cite your paper if this code or data is used. We will not distribute the download link to others without informing the authors of DiffSinger.
19 | 
20 | "
21 | 
22 | Then we will provide the download link to you. 
23 | 
24 | **Please note that, if you are using PopCS, it means that you have accepted the terms above.**
25 | 
26 | **Please use your Official Email Address (like xxx@zju.edu.cn)! Thank you!**


--------------------------------------------------------------------------------
/tools/DiffSinger/resources/diffspeech-fs2-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/DiffSinger/resources/diffspeech-fs2-1.png


--------------------------------------------------------------------------------
/tools/DiffSinger/resources/diffspeech-fs2-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/DiffSinger/resources/diffspeech-fs2-2.png


--------------------------------------------------------------------------------
/tools/DiffSinger/resources/diffspeech-fs2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/DiffSinger/resources/diffspeech-fs2.png


--------------------------------------------------------------------------------
/tools/DiffSinger/resources/model_a.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/DiffSinger/resources/model_a.png


--------------------------------------------------------------------------------
/tools/DiffSinger/resources/model_b.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/DiffSinger/resources/model_b.png


--------------------------------------------------------------------------------
/tools/DiffSinger/resources/tfb.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/DiffSinger/resources/tfb.png


--------------------------------------------------------------------------------
/tools/DiffSinger/tasks/run.py:
--------------------------------------------------------------------------------
 1 | import importlib
 2 | from utils.hparams import set_hparams, hparams
 3 | 
 4 | 
 5 | def run_task():
 6 |     assert hparams['task_cls'] != ''
 7 |     pkg = ".".join(hparams["task_cls"].split(".")[:-1])
 8 |     cls_name = hparams["task_cls"].split(".")[-1]
 9 |     task_cls = getattr(importlib.import_module(pkg), cls_name)
10 |     task_cls.start()
11 | 
12 | 
# Script entry point: set_hparams() runs before run_task(), which reads
# hparams['task_cls'].
# NOTE(review): presumably set_hparams() populates `hparams` from the
# command line / config files -- confirm in utils/hparams.py.
if __name__ == '__main__':
    set_hparams()
    run_task()
16 | 


--------------------------------------------------------------------------------
/tools/DiffSinger/usr/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/DiffSinger/usr/.gitkeep


--------------------------------------------------------------------------------
/tools/DiffSinger/usr/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/DiffSinger/usr/__init__.py


--------------------------------------------------------------------------------
/tools/DiffSinger/usr/configs/base.yaml:
--------------------------------------------------------------------------------
 1 | task_cls: usr.task.DiffFsTask
 2 | pitch_type: frame
 3 | timesteps: 100
 4 | dilation_cycle_length: 1
 5 | residual_layers: 20
 6 | residual_channels: 256
 7 | lr: 0.001
 8 | decay_steps: 50000
 9 | keep_bins: 80
10 | spec_min: [ ]
11 | spec_max: [ ]
12 | 
13 | content_cond_steps: [ ] # [ 0, 10000 ]
14 | spk_cond_steps: [ ] # [ 0, 10000 ]
15 | # train and eval
16 | fs2_ckpt: ''
17 | max_updates: 400000
18 | # max_updates: 200000
19 | use_gt_dur: true
20 | use_gt_f0: true
21 | gen_tgt_spk_id: -1
22 | max_sentences: 48
23 | num_sanity_val_steps: 1
24 | num_valid_plots: 1
25 | 


--------------------------------------------------------------------------------
/tools/DiffSinger/usr/configs/lj_ds_pndm.yaml:
--------------------------------------------------------------------------------
 1 | base_config:
 2 |   - ./lj_ds_beta6.yaml
 3 | 
 4 | fs2_ckpt: ''
 5 | gaussian_start: True
 6 | max_beta: 0.02
 7 | timesteps: 1000
 8 | K_step: 1000
 9 | pndm_speedup: 10
10 | 
11 | pitch_type: frame
12 | use_pitch_embed: false   #  using diffusion to model pitch curve
13 | lambda_f0: 0.
14 | lambda_uv: 0.
15 | #rel_pos: true
16 | 
17 | max_updates: 320000
18 | 


--------------------------------------------------------------------------------
/tools/DiffSinger/usr/configs/midi/cascade/opencs/aux_rel.yaml:
--------------------------------------------------------------------------------
 1 | base_config:
 2 |   - configs/singing/fs2.yaml
 3 |   - usr/configs/midi/cascade/opencs/opencpop_statis.yaml
 4 | 
 5 | audio_sample_rate: 24000
 6 | hop_size: 128            # Hop size.
 7 | fft_size: 512           # FFT size.
 8 | win_size: 512           # Window size.
 9 | fmin: 30
10 | fmax: 12000
11 | min_level_db: -120
12 | 
13 | binarization_args:
14 |   with_wav: true
15 |   with_spk_embed: false
16 |   with_align: true
17 | raw_data_dir: 'data/raw/opencpop/segments'
18 | processed_data_dir: 'xxx'
19 | binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer
20 | 
21 | 
22 | binary_data_dir: 'data/binary/opencpop-midi-dp'
23 | use_midi: true  #  for midi exp
24 | use_gt_f0: false  #  for midi exp
25 | use_gt_dur: false  # for further midi exp
26 | lambda_f0: 1.0
27 | lambda_uv: 1.0
28 | #lambda_energy: 0.1
29 | lambda_ph_dur: 1.0
30 | lambda_sent_dur: 1.0
31 | lambda_word_dur: 1.0
32 | predictor_grad: 0.1
33 | pe_enable: false
34 | pe_ckpt: ''
35 | 
36 | num_spk: 1
37 | test_prefixes: [
38 |     '2044',
39 |     '2086',
40 |     '2092',
41 |     '2093',
42 |     '2100',
43 | ]
44 | 
45 | task_cls: usr.diffsinger_task.AuxDecoderMIDITask
46 | #vocoder: usr.singingvocoder.highgan.HighGAN
47 | #vocoder_ckpt: checkpoints/h_2_model/checkpoint-530000steps.pkl
48 | vocoder: vocoders.hifigan.HifiGAN
49 | vocoder_ckpt: checkpoints/0109_hifigan_bigpopcs_hop128
50 | 
51 | use_nsf: true
52 | 
53 | # config for experiments
54 | max_frames: 5000
55 | max_tokens: 40000
56 | predictor_layers: 5
57 | rel_pos: true
58 | dur_predictor_layers: 5  # *
59 | 
60 | use_spk_embed: false
61 | num_valid_plots: 10
62 | max_updates: 160000
63 | save_gt: true


--------------------------------------------------------------------------------
/tools/DiffSinger/usr/configs/midi/cascade/opencs/ds60_rel.yaml:
--------------------------------------------------------------------------------
 1 | base_config:
 2 |   - usr/configs/popcs_ds_beta6.yaml
 3 |   - usr/configs/midi/cascade/opencs/opencpop_statis.yaml
 4 | 
 5 | binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer
 6 | binary_data_dir: 'data/binary/opencpop-midi-dp'
 7 | 
 8 | #switch_midi2f0_step: 174000
 9 | use_midi: true  #  for midi exp
10 | use_gt_f0: false  #  for midi exp
11 | use_gt_dur: false  # for further midi exp
12 | lambda_f0: 1.0
13 | lambda_uv: 1.0
14 | #lambda_energy: 0.1
15 | lambda_ph_dur: 1.0
16 | lambda_sent_dur: 1.0
17 | lambda_word_dur: 1.0
18 | predictor_grad: 0.1
19 | pe_enable: false
20 | pe_ckpt: ''
21 | 
22 | fs2_ckpt: 'checkpoints/0302_opencpop_fs_midi/model_ckpt_steps_160000.ckpt'  #
23 | #num_valid_plots: 0
24 | task_cls: usr.diffsinger_task.DiffSingerMIDITask
25 | 
26 | K_step: 60
27 | max_tokens: 40000
28 | predictor_layers: 5
29 | dilation_cycle_length: 4  # *
30 | rel_pos: true
31 | dur_predictor_layers: 5  # *
32 | max_updates: 160000
33 | gaussian_start: false
34 | 


--------------------------------------------------------------------------------
/tools/DiffSinger/usr/configs/midi/e2e/opencpop/ds1000.yaml:
--------------------------------------------------------------------------------
 1 | base_config:
 2 |   - usr/configs/popcs_ds_beta6.yaml
 3 |   - usr/configs/midi/cascade/opencs/opencpop_statis.yaml
 4 | 
 5 | binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer
 6 | binary_data_dir: 'data/binary/opencpop-midi-dp'
 7 | 
 8 | #switch_midi2f0_step: 174000
 9 | use_midi: true  #  for midi exp
10 | use_gt_dur: false  # for further midi exp
11 | lambda_ph_dur: 1.0
12 | lambda_sent_dur: 1.0
13 | lambda_word_dur: 1.0
14 | predictor_grad: 0.1
15 | dur_predictor_layers: 5  # *
16 | 
17 | 
18 | fs2_ckpt: ''  #
19 | #num_valid_plots: 0
20 | task_cls: usr.diffsinger_task.DiffSingerMIDITask
21 | 
22 | # for diffusion schedule
23 | timesteps: 1000
24 | K_step: 1000
25 | max_beta: 0.02
26 | max_tokens: 36000
27 | max_updates: 320000
28 | gaussian_start: True
29 | pndm_speedup: 40
30 | 
31 | use_pitch_embed: false
32 | use_gt_f0: false  #  for midi exp
33 | 
34 | lambda_f0: 0.
35 | lambda_uv: 0.
36 | dilation_cycle_length: 4  # *
37 | rel_pos: true
38 | predictor_layers: 5
39 | pe_enable: true
40 | pe_ckpt: 'checkpoints/0102_xiaoma_pe'
41 | 
42 | 
43 | 


--------------------------------------------------------------------------------
/tools/DiffSinger/usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml:
--------------------------------------------------------------------------------
 1 | base_config:
 2 |   - usr/configs/popcs_ds_beta6.yaml
 3 |   - usr/configs/midi/cascade/opencs/opencpop_statis.yaml
 4 | 
 5 | binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer
 6 | binary_data_dir: 'data/binary/opencpop-midi-dp'
 7 | 
 8 | #switch_midi2f0_step: 174000
 9 | use_midi: true  #  for midi exp
10 | use_gt_dur: false  # for further midi exp
11 | lambda_ph_dur: 1.0
12 | lambda_sent_dur: 1.0
13 | lambda_word_dur: 1.0
14 | predictor_grad: 0.1
15 | dur_predictor_layers: 5  # *
16 | 
17 | 
18 | fs2_ckpt: ''  #
19 | #num_valid_plots: 0
20 | task_cls: usr.diffsinger_task.DiffSingerMIDITask
21 | 
22 | K_step: 100
23 | max_tokens: 40000
24 | max_updates: 160000
25 | gaussian_start: True
26 | 
27 | use_pitch_embed: false
28 | use_gt_f0: false  #  for midi exp
29 | 
30 | lambda_f0: 0.
31 | lambda_uv: 0.
32 | dilation_cycle_length: 4  # *
33 | rel_pos: true
34 | predictor_layers: 5
35 | pe_enable: true
36 | pe_ckpt: 'checkpoints/0102_xiaoma_pe'
37 | 


--------------------------------------------------------------------------------
/tools/DiffSinger/usr/configs/midi/e2e/popcs/ds100_adj_rel.yaml:
--------------------------------------------------------------------------------
 1 | base_config:
 2 |   - usr/configs/popcs_ds_beta6.yaml
 3 |   - usr/configs/midi/cascade/popcs/popcs_statis.yaml
 4 | 
 5 | binarizer_cls: data_gen.singing.binarize.MidiSingingBinarizer
 6 | binary_data_dir: 'data/binary/popcs-midi-dp'
 7 | 
 8 | #switch_midi2f0_step: 174000
 9 | use_midi: true  #  for midi exp
10 | use_gt_dur: false  # for further midi exp
11 | lambda_ph_dur: 1.0
12 | lambda_sent_dur: 1.0
13 | lambda_word_dur: 1.0
14 | predictor_grad: 0.1
15 | dur_predictor_layers: 5  # *
16 | 
17 | 
18 | fs2_ckpt: ''  #
19 | #num_valid_plots: 0
20 | task_cls: usr.diffsinger_task.DiffSingerMIDITask
21 | 
22 | K_step: 100
23 | max_tokens: 40000
24 | max_updates: 160000
25 | gaussian_start: True
26 | 
27 | use_pitch_embed: false
28 | use_gt_f0: false  #  for midi exp
29 | 
30 | lambda_f0: 0.
31 | lambda_uv: 0.
32 | dilation_cycle_length: 4  # *
33 | rel_pos: true
34 | predictor_layers: 5
35 | pe_enable: true
36 | pe_ckpt: 'checkpoints/0102_xiaoma_pe'
37 | 


--------------------------------------------------------------------------------
/tools/DiffSinger/usr/configs/midi/pe.yaml:
--------------------------------------------------------------------------------
 1 | base_config:
 2 |   - configs/tts/lj/fs2.yaml
 3 | 
 4 | max_frames: 8000
 5 | audio_sample_rate: 24000
 6 | hop_size: 128            # Hop size.
 7 | fft_size: 512           # FFT size.
 8 | win_size: 512           # Window size.
 9 | fmin: 30
10 | fmax: 12000
11 | min_level_db: -120
12 | 
13 | binary_data_dir: 'xxx'
14 | 
15 | pitch_type: frame
16 | task_cls: tasks.tts.pe.PitchExtractionTask
17 | pitch_extractor_conv_layers: 2
18 | 
19 | 
20 | # config for experiments
21 | max_tokens: 20000
22 | use_spk_embed: false
23 | num_valid_plots: 10
24 | max_updates: 60000


--------------------------------------------------------------------------------
/tools/DiffSinger/usr/configs/popcs_ds_beta6_offline.yaml:
--------------------------------------------------------------------------------
 1 | base_config:
 2 |   - ./popcs_ds_beta6.yaml
 3 | 
 4 | fs2_ckpt: checkpoints/popcs_fs2_pmf0_1230/model_ckpt_steps_160000.ckpt  # to be infer
 5 | num_valid_plots: 0
 6 | task_cls: usr.diffsinger_task.DiffSingerOfflineTask
 7 | 
 8 | # tmp:
 9 | #pe_enable: true
10 | #pe_ckpt: ''
11 | vocoder: vocoders.hifigan.HifiGAN
12 | vocoder_ckpt: checkpoints/0109_hifigan_bigpopcs_hop128


--------------------------------------------------------------------------------
/tools/DiffSinger/usr/configs/popcs_fs2.yaml:
--------------------------------------------------------------------------------
 1 | base_config:
 2 |   - configs/singing/fs2.yaml
 3 | 
 4 | audio_sample_rate: 24000
 5 | hop_size: 128            # Hop size.
 6 | fft_size: 512           # FFT size.
 7 | win_size: 512           # Window size.
 8 | fmin: 30
 9 | fmax: 12000
10 | min_level_db: -120
11 | 
12 | binarization_args:
13 |   with_wav: true
14 |   with_spk_embed: false
15 |   with_align: true
16 | raw_data_dir: 'data/raw/popcs'
17 | processed_data_dir: 'data/processed/popcs'
18 | binary_data_dir: 'data/binary/popcs-pmf0'
19 | num_spk: 1
20 | datasets: [
21 |   'popcs',
22 | ]
23 | test_prefixes: [
24 |   'popcs-说散就散',
25 |   'popcs-隐形的翅膀',
26 | ]
27 | 
28 | task_cls: tasks.tts.fs2.FastSpeech2Task
29 | #vocoder: usr.singingvocoder.highgan.HighGAN
30 | #vocoder_ckpt: checkpoints/h_2_model/checkpoint-530000steps.pkl
31 | vocoder: vocoders.hifigan.HifiGAN
32 | vocoder_ckpt: checkpoints/0109_hifigan_bigpopcs_hop128
33 | use_nsf: true
34 | 
35 | # config for experiments
36 | max_tokens: 18000
37 | use_spk_embed: false
38 | num_valid_plots: 10
39 | max_updates: 160000
40 | save_gt: true
41 | 
42 | # tmp:
43 | #pe_enable: true
44 | #pe_ckpt: ''


--------------------------------------------------------------------------------
/tools/DiffSinger/utils/training_utils.py:
--------------------------------------------------------------------------------
 1 | from utils.hparams import hparams
 2 | 
 3 | 
 4 | class RSQRTSchedule(object):
 5 |     def __init__(self, optimizer):
 6 |         super().__init__()
 7 |         self.optimizer = optimizer
 8 |         self.constant_lr = hparams['lr']
 9 |         self.warmup_updates = hparams['warmup_updates']
10 |         self.hidden_size = hparams['hidden_size']
11 |         self.lr = hparams['lr']
12 |         for param_group in optimizer.param_groups:
13 |             param_group['lr'] = self.lr
14 |         self.step(0)
15 | 
16 |     def step(self, num_updates):
17 |         constant_lr = self.constant_lr
18 |         warmup = min(num_updates / self.warmup_updates, 1.0)
19 |         rsqrt_decay = max(self.warmup_updates, num_updates) ** -0.5
20 |         rsqrt_hidden = self.hidden_size ** -0.5
21 |         self.lr = max(constant_lr * warmup * rsqrt_decay * rsqrt_hidden, 1e-7)
22 |         for param_group in self.optimizer.param_groups:
23 |             param_group['lr'] = self.lr
24 |         return self.lr
25 | 
26 |     def get_lr(self):
27 |         return self.optimizer.param_groups[0]['lr']
28 | 


--------------------------------------------------------------------------------
/tools/DiffSinger/utils/tts_utils.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn.functional as F
 3 | from collections import defaultdict
 4 | 
 5 | 
 6 | def make_positions(tensor, padding_idx):
 7 |     """Replace non-padding symbols with their position numbers.
 8 |     Position numbers begin at padding_idx+1. Padding symbols are ignored.
 9 |     """
10 |     # The series of casts and type-conversions here are carefully
11 |     # balanced to both work with ONNX export and XLA. In particular XLA
12 |     # prefers ints, cumsum defaults to output longs, and ONNX doesn't know
13 |     # how to handle the dtype kwarg in cumsum.
14 |     mask = tensor.ne(padding_idx).int()
15 |     return (
16 |                    torch.cumsum(mask, dim=1).type_as(mask) * mask
17 |            ).long() + padding_idx
18 | 
19 | 
def softmax(x, dim):
    """Softmax of ``x`` over ``dim``, always computed and returned in float32."""
    probs = F.softmax(x, dim=dim, dtype=torch.float32)
    return probs
22 | 


--------------------------------------------------------------------------------
/tools/DiffSinger/vocoders/__init__.py:
--------------------------------------------------------------------------------
1 | from vocoders import hifigan
2 | 


--------------------------------------------------------------------------------
/tools/DiffSinger/vocoders/base_vocoder.py:
--------------------------------------------------------------------------------
 1 | import importlib
 2 | VOCODERS = {}
 3 | 
 4 | 
 5 | def register_vocoder(cls):
 6 |     VOCODERS[cls.__name__.lower()] = cls
 7 |     VOCODERS[cls.__name__] = cls
 8 |     return cls
 9 | 
10 | 
def get_vocoder_cls(hparams):
    """Resolve ``hparams['vocoder']`` to a vocoder class.

    A name present in ``VOCODERS`` is returned from the registry. Anything
    else is treated as a dotted path ``package.module.ClassName`` and imported.
    """
    name = hparams['vocoder']
    try:
        return VOCODERS[name]
    except KeyError:
        pass
    # Not registered: split on the last dot and import the module part.
    module_path, _, class_name = name.rpartition(".")
    return getattr(importlib.import_module(module_path), class_name)
20 | 
21 | 
class BaseVocoder:
    """Interface for vocoders; both methods raise ``NotImplementedError`` here."""

    def spec2wav(self, mel):
        """Convert a mel-spectrogram into a waveform.

        :param mel: [T, 80]
        :return: wav: [T']
        """
        raise NotImplementedError()

    @staticmethod
    def wav2spec(wav_fn):
        """Load an audio file and compute its mel-spectrogram.

        :param wav_fn: str
        :return: wav, mel: [T, 80]
        """
        raise NotImplementedError()
40 | 


--------------------------------------------------------------------------------
/tools/DiffSinger/vocoders/vocoder_utils.py:
--------------------------------------------------------------------------------
 1 | import librosa
 2 | 
 3 | from utils.hparams import hparams
 4 | import numpy as np
 5 | 
 6 | 
 7 | def denoise(wav, v=0.1):
 8 |     spec = librosa.stft(y=wav, n_fft=hparams['fft_size'], hop_length=hparams['hop_size'],
 9 |                         win_length=hparams['win_size'], pad_mode='constant')
10 |     spec_m = np.abs(spec)
11 |     spec_m = np.clip(spec_m - v, a_min=0, a_max=None)
12 |     spec_a = np.angle(spec)
13 | 
14 |     return librosa.istft(spec_m * np.exp(1j * spec_a), hop_length=hparams['hop_size'],
15 |                          win_length=hparams['win_size'])
16 | 


--------------------------------------------------------------------------------
/tools/ImageBind/.assets/bird_audio.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/ImageBind/.assets/bird_audio.wav


--------------------------------------------------------------------------------
/tools/ImageBind/.assets/bird_image.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/ImageBind/.assets/bird_image.jpg


--------------------------------------------------------------------------------
/tools/ImageBind/.assets/car_audio.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/ImageBind/.assets/car_audio.wav


--------------------------------------------------------------------------------
/tools/ImageBind/.assets/car_image.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/ImageBind/.assets/car_image.jpg


--------------------------------------------------------------------------------
/tools/ImageBind/.assets/dog_audio.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/ImageBind/.assets/dog_audio.wav


--------------------------------------------------------------------------------
/tools/ImageBind/.assets/dog_image.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/ImageBind/.assets/dog_image.jpg


--------------------------------------------------------------------------------
/tools/ImageBind/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Contributing to ImageBind
 2 | We want to make contributing to this project as easy and transparent as
 3 | possible.
 4 | 
 5 | ## Pull Requests
 6 | We actively welcome your pull requests.
 7 | 
 8 | 1. Fork the repo and create your branch from `main`.
 9 | 2. If you've added code that should be tested, add tests.
10 | 3. If you've changed APIs, update the documentation.
11 | 4. Ensure the test suite passes.
12 | 5. Make sure your code lints.
13 | 6. If you haven't already, complete the Contributor License Agreement ("CLA").
14 | 
15 | ## Contributor License Agreement ("CLA")
16 | In order to accept your pull request, we need you to submit a CLA. You only need
17 | to do this once to work on any of Meta's open source projects.
18 | 
19 | Complete your CLA here: <https://code.facebook.com/cla>
20 | 
21 | ## Issues
22 | We use GitHub issues to track public bugs. Please ensure your description is
23 | clear and has sufficient instructions to be able to reproduce the issue.
24 | 
25 | Meta has a [bounty program](https://www.facebook.com/whitehat/) for the safe
26 | disclosure of security bugs. In those cases, please go through the process
27 | outlined on that page and do not file a public issue.
28 | 
29 | ## License
30 | By contributing to ImageBind, you agree that your contributions will be licensed
31 | under the [LICENSE](LICENSE) file in the root directory of this source tree.
32 | 


--------------------------------------------------------------------------------
/tools/ImageBind/build/lib/imagebind/__init__.py:
--------------------------------------------------------------------------------
1 | from imagebind import data
2 | from imagebind.models import imagebind_model
3 | from imagebind.models.imagebind_model import ModalityType


--------------------------------------------------------------------------------
/tools/ImageBind/build/lib/imagebind/bpe/bpe_simple_vocab_16e6.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/ImageBind/build/lib/imagebind/bpe/bpe_simple_vocab_16e6.txt.gz


--------------------------------------------------------------------------------
/tools/ImageBind/build/lib/imagebind/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/ImageBind/build/lib/imagebind/models/__init__.py


--------------------------------------------------------------------------------
/tools/ImageBind/imagebind.egg-info/SOURCES.txt:
--------------------------------------------------------------------------------
 1 | LICENSE
 2 | README.md
 3 | setup.py
 4 | imagebind/__init__.py
 5 | imagebind/data.py
 6 | imagebind.egg-info/PKG-INFO
 7 | imagebind.egg-info/SOURCES.txt
 8 | imagebind.egg-info/dependency_links.txt
 9 | imagebind.egg-info/top_level.txt
10 | imagebind/bpe/bpe_simple_vocab_16e6.txt.gz
11 | imagebind/models/__init__.py
12 | imagebind/models/helpers.py
13 | imagebind/models/imagebind_model.py
14 | imagebind/models/multimodal_preprocessors.py
15 | imagebind/models/transformer.py


--------------------------------------------------------------------------------
/tools/ImageBind/imagebind.egg-info/dependency_links.txt:
--------------------------------------------------------------------------------
1 | https://download.pytorch.org/whl/cu113
2 | 


--------------------------------------------------------------------------------
/tools/ImageBind/imagebind.egg-info/top_level.txt:
--------------------------------------------------------------------------------
1 | imagebind
2 | 


--------------------------------------------------------------------------------
/tools/ImageBind/imagebind/__init__.py:
--------------------------------------------------------------------------------
1 | from imagebind import data
2 | from imagebind.models import imagebind_model
3 | from imagebind.models.imagebind_model import ModalityType


--------------------------------------------------------------------------------
/tools/ImageBind/imagebind/__pycache__/__init__.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/ImageBind/imagebind/__pycache__/__init__.cpython-310.pyc


--------------------------------------------------------------------------------
/tools/ImageBind/imagebind/__pycache__/data.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/ImageBind/imagebind/__pycache__/data.cpython-310.pyc


--------------------------------------------------------------------------------
/tools/ImageBind/imagebind/bpe/bpe_simple_vocab_16e6.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/ImageBind/imagebind/bpe/bpe_simple_vocab_16e6.txt.gz


--------------------------------------------------------------------------------
/tools/ImageBind/imagebind/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/ImageBind/imagebind/models/__init__.py


--------------------------------------------------------------------------------
/tools/ImageBind/imagebind/models/__pycache__/__init__.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/ImageBind/imagebind/models/__pycache__/__init__.cpython-310.pyc


--------------------------------------------------------------------------------
/tools/ImageBind/imagebind/models/__pycache__/helpers.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/ImageBind/imagebind/models/__pycache__/helpers.cpython-310.pyc


--------------------------------------------------------------------------------
/tools/ImageBind/imagebind/models/__pycache__/imagebind_model.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/ImageBind/imagebind/models/__pycache__/imagebind_model.cpython-310.pyc


--------------------------------------------------------------------------------
/tools/ImageBind/imagebind/models/__pycache__/multimodal_preprocessors.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/ImageBind/imagebind/models/__pycache__/multimodal_preprocessors.cpython-310.pyc


--------------------------------------------------------------------------------
/tools/ImageBind/imagebind/models/__pycache__/transformer.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/ImageBind/imagebind/models/__pycache__/transformer.cpython-310.pyc


--------------------------------------------------------------------------------
/tools/ImageBind/requirements.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/ImageBind/requirements.txt


--------------------------------------------------------------------------------
/tools/ImageBind/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, find_packages
 2 | 
 3 | with open('requirements.txt') as f:
 4 |     required = f.read().splitlines()
 5 | 
 6 | setup(
 7 |     name='imagebind',
 8 |     version='0.1.0',
 9 |     packages=find_packages(),
10 |     package_data={
11 |         'imagebind': ['bpe/bpe_simple_vocab_16e6.txt.gz'],
12 |     },
13 |     description='A brief description of the package',
14 |     long_description=open('README.md', encoding='utf-8').read(),
15 |     long_description_content_type="text/markdown",
16 |     url='https://github.com/facebookresearch/ImageBind',
17 |     classifiers=[
18 |         'Programming Language :: Python :: 3',
19 |         'License :: Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International',
20 |     ],
21 |     install_requires=required,
22 |     dependency_links=['https://download.pytorch.org/whl/cu113'],
23 | )
24 | 


--------------------------------------------------------------------------------
/tools/audio-preprocess/README.md:
--------------------------------------------------------------------------------
 1 | # Fish Audio Preprocessor
 2 | 
 3 | [![PyPI Version](https://img.shields.io/pypi/v/fish-audio-preprocess.svg)](https://pypi.python.org/pypi/fish-audio-preprocess)
 4 | 
 5 | [中文文档](README.zh.md)
 6 | 
 7 | This repo contains some scripts for audio processing. Main features include:
 8 | 
 9 | - [x] Video/audio to wav
10 | - [x] Audio vocal separation
11 | - [x] Automatic audio slicing
12 | - [x] Audio loudness matching
13 | - [x] Audio data statistics (supports determining audio length)
14 | - [x] Audio resampling
15 | - [x] Audio transcribe (.lab)
16 | - [x] Audio transcribe via FunASR (use `--model-type funasr` to enable; see the code for detailed usage)
17 | - [ ] Audio transcribe via WhisperX
18 | - [ ] Merge .lab files (example: `fap merge-lab ./dataset list.txt "{PATH}|spkname|JP|{TEXT}"`)
19 | 
20 | ([ ] indicates not completed, [x] indicates completed)
21 | 
22 | **This code has been tested on Ubuntu 22.04 / 20.04 + Python 3.10. If you encounter problems on other versions, feedback is welcome.**
23 | 
24 | ## Getting Started:
25 | 
26 | ```
27 | pip install -e .
28 | fap --help
29 | ```
30 | 
31 | ## Reference
32 | 
33 | - [Batch Whisper](https://github.com/Blair-Johnson/batch-whisper)
34 | 


--------------------------------------------------------------------------------
/tools/audio-preprocess/README.zh.md:
--------------------------------------------------------------------------------
 1 | # Fish Audio Preprocessor
 2 | 
 3 | [![PyPI Version](https://img.shields.io/pypi/v/fish-audio-preprocess.svg)](https://pypi.python.org/pypi/fish-audio-preprocess)
 4 | 
 5 | [English Document](README.md)
 6 | 
 7 | 这个 Repo 包含了一些用于处理音频的脚本. 主要包含以下功能:
 8 | 
 9 | - [x] 视频/音频转 wav
10 | - [x] 音频人声分离
11 | - [x] 音频自动切片
12 | - [x] 音频响度匹配
13 | - [x] 音频数据统计（支持判断音频长度）
14 | - [x] 音频重采样
15 | - [x] 音频打标 (.lab)
16 | - [x] 音频打标 FunASR（使用 `--model-type funasr` 开启, 详细使用方法可查看代码）
17 | - [ ] 音频打标 WhisperX
18 | - [ ] .lab 标注合并为 .list 文件 (示例: `fap merge-lab ./dataset list.txt "{PATH}|spkname|JP|{TEXT}"`)
19 | 
20 | ([ ] 表示未完成, [x] 表示已完成)
21 | 
22 | **本代码已在 Ubuntu 22.04 / 20.04 + Python 3.10 测试过, 如果在其他版本遇到问题, 欢迎反馈**
23 | 
24 | ## 上手指南:
25 | 
26 | ```
27 | pip install -e .
28 | fap --help
29 | ```
30 | 
31 | ## 引用
32 | 
33 | - [Batch Whisper](https://github.com/Blair-Johnson/batch-whisper)
34 | 


--------------------------------------------------------------------------------
/tools/audio-preprocess/fap-complete.zsh:
--------------------------------------------------------------------------------
 1 | #compdef fap
 2 | 
# Completion function for the `fap` command. It asks `fap` itself for
# candidates and hands them to the zsh completion system.
_fap_completion() {
    local -a completions
    local -a completions_with_descriptions
    local -a response
    # Bail out if `fap` is not a known command.
    (( ! $+commands[fap] )) && return 1

    # Run `fap` with the current words and cursor index passed through the
    # environment; (@f) splits its output into one array element per line.
    response=("${(@f)$(env COMP_WORDS="${words[*]}" COMP_CWORD=$((CURRENT-1)) _FAP_COMPLETE=zsh_complete fap)}")

    # The response is consumed three elements at a time: type, key, description.
    for type key descr in ${response}; do
        if [[ "$type" == "plain" ]]; then
            # A description of "_" means "no description".
            if [[ "$descr" == "_" ]]; then
                completions+=("$key")
            else
                completions_with_descriptions+=("$key":"$descr")
            fi
        elif [[ "$type" == "dir" ]]; then
            # Directory completion.
            _path_files -/
        elif [[ "$type" == "file" ]]; then
            # File completion.
            _path_files -f
        fi
    done

    # Offer described candidates via _describe ...
    if [ -n "$completions_with_descriptions" ]; then
        _describe -V unsorted completions_with_descriptions -U
    fi

    # ... and plain candidates via compadd.
    if [ -n "$completions" ]; then
        compadd -U -V unsorted -a completions
    fi
}

# Register the function as the completer for `fap`.
compdef _fap_completion fap;
35 | 
36 | 


--------------------------------------------------------------------------------
/tools/audio-preprocess/fish_audio_preprocess.egg-info/PKG-INFO:
--------------------------------------------------------------------------------
 1 | Metadata-Version: 2.4
 2 | Name: fish-audio-preprocess
 3 | Version: 0.2.8
 4 | Summary: Preprocess audio data
 5 | Author-email: Lengyue <lengyue@lengyue.me>
 6 | License: Apache
 7 | Requires-Python: >=3.9
 8 | Description-Content-Type: text/markdown
 9 | License-File: LICENSE
10 | Dynamic: license-file
11 | 
12 | # Fish Audio Preprocessor
13 | 
14 | [![PyPI Version](https://img.shields.io/pypi/v/fish-audio-preprocess.svg)](https://pypi.python.org/pypi/fish-audio-preprocess)
15 | 
16 | [中文文档](README.zh.md)
17 | 
18 | This repo contains some scripts for audio processing. Main features include:
19 | 
20 | - [x] Video/audio to wav
21 | - [x] Audio vocal separation
22 | - [x] Automatic audio slicing
23 | - [x] Audio loudness matching
24 | - [x] Audio data statistics (supports determining audio length)
25 | - [x] Audio resampling
26 | - [x] Audio transcribe (.lab)
27 | - [x] Audio transcribe via FunASR (use `--model-type funasr` to enable; see the code for detailed usage)
28 | - [ ] Audio transcribe via WhisperX
29 | - [ ] Merge .lab files (example: `fap merge-lab ./dataset list.txt "{PATH}|spkname|JP|{TEXT}"`)
30 | 
31 | ([ ] indicates not completed, [x] indicates completed)
32 | 
33 | **This code has been tested on Ubuntu 22.04 / 20.04 + Python 3.10. If you encounter problems on other versions, feedback is welcome.**
34 | 
35 | ## Getting Started:
36 | 
37 | ```
38 | pip install -e .
39 | fap --help
40 | ```
41 | 
42 | ## Reference
43 | 
44 | - [Batch Whisper](https://github.com/Blair-Johnson/batch-whisper)
45 | 


--------------------------------------------------------------------------------
/tools/audio-preprocess/fish_audio_preprocess.egg-info/dependency_links.txt:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/tools/audio-preprocess/fish_audio_preprocess.egg-info/entry_points.txt:
--------------------------------------------------------------------------------
1 | [console_scripts]
2 | fap = fish_audio_preprocess.cli.__main__:cli
3 | 


--------------------------------------------------------------------------------
/tools/audio-preprocess/fish_audio_preprocess.egg-info/top_level.txt:
--------------------------------------------------------------------------------
1 | fish_audio_preprocess
2 | 


--------------------------------------------------------------------------------
/tools/audio-preprocess/fish_audio_preprocess/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/audio-preprocess/fish_audio_preprocess/__init__.py


--------------------------------------------------------------------------------
/tools/audio-preprocess/fish_audio_preprocess/__pycache__/__init__.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/audio-preprocess/fish_audio_preprocess/__pycache__/__init__.cpython-310.pyc


--------------------------------------------------------------------------------
/tools/audio-preprocess/fish_audio_preprocess/cli/__main__.py:
--------------------------------------------------------------------------------
 1 | import click
 2 | import richuru
 3 | from loguru import logger
 4 | 
 5 | from fish_audio_preprocess.cli.merge_lab import merge_lab
 6 | 
 7 | from .convert_to_wav import to_wav
 8 | from .frequency import frequency
 9 | from .length import length
10 | from .loudness_norm import loudness_norm
11 | from .merge_short import merge_short
12 | from .resample import resample
13 | from .separate_audio import separate
14 | from .slice_audio import slice_audio, slice_audio_v2
15 | from .transcribe import transcribe
16 | 
17 | 
@click.group()
@click.option("--debug/--no-debug", default=False)
def cli(debug: bool):
    """An audio preprocessing CLI."""
    # NOTE(review): the docstring above is kept verbatim because click is
    # expected to show it as the group's help text — confirm before rewording.
    if not debug:
        return

    # Debug mode only: install richuru, then log that debug mode is active.
    richuru.install()
    logger.info("Debug mode is on")
26 | 
27 | 
# Register subcommands
cli.add_command(length)
cli.add_command(frequency)

cli.add_command(to_wav)
cli.add_command(separate)
cli.add_command(loudness_norm)
cli.add_command(slice_audio)
cli.add_command(slice_audio_v2)
cli.add_command(resample)
cli.add_command(transcribe)
cli.add_command(merge_short)
cli.add_command(merge_lab)


# When this module is executed directly, dispatch through the command group so
# that every registered subcommand is reachable. Calling a single subcommand
# here would bypass the group and its --debug option.
if __name__ == "__main__":
    cli()
45 | 


--------------------------------------------------------------------------------
/tools/audio-preprocess/fish_audio_preprocess/cli/__pycache__/__main__.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/audio-preprocess/fish_audio_preprocess/cli/__pycache__/__main__.cpython-310.pyc


--------------------------------------------------------------------------------
/tools/audio-preprocess/fish_audio_preprocess/cli/__pycache__/convert_to_wav.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/audio-preprocess/fish_audio_preprocess/cli/__pycache__/convert_to_wav.cpython-310.pyc


--------------------------------------------------------------------------------
/tools/audio-preprocess/fish_audio_preprocess/cli/__pycache__/frequency.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/audio-preprocess/fish_audio_preprocess/cli/__pycache__/frequency.cpython-310.pyc


--------------------------------------------------------------------------------
/tools/audio-preprocess/fish_audio_preprocess/cli/__pycache__/length.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/audio-preprocess/fish_audio_preprocess/cli/__pycache__/length.cpython-310.pyc


--------------------------------------------------------------------------------
/tools/audio-preprocess/fish_audio_preprocess/cli/__pycache__/loudness_norm.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/audio-preprocess/fish_audio_preprocess/cli/__pycache__/loudness_norm.cpython-310.pyc


--------------------------------------------------------------------------------
/tools/audio-preprocess/fish_audio_preprocess/cli/__pycache__/merge_lab.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/audio-preprocess/fish_audio_preprocess/cli/__pycache__/merge_lab.cpython-310.pyc


--------------------------------------------------------------------------------
/tools/audio-preprocess/fish_audio_preprocess/cli/__pycache__/merge_short.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/audio-preprocess/fish_audio_preprocess/cli/__pycache__/merge_short.cpython-310.pyc


--------------------------------------------------------------------------------
/tools/audio-preprocess/fish_audio_preprocess/cli/__pycache__/resample.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/audio-preprocess/fish_audio_preprocess/cli/__pycache__/resample.cpython-310.pyc


--------------------------------------------------------------------------------
/tools/audio-preprocess/fish_audio_preprocess/cli/__pycache__/separate_audio.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/audio-preprocess/fish_audio_preprocess/cli/__pycache__/separate_audio.cpython-310.pyc


--------------------------------------------------------------------------------
/tools/audio-preprocess/fish_audio_preprocess/cli/__pycache__/slice_audio.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/audio-preprocess/fish_audio_preprocess/cli/__pycache__/slice_audio.cpython-310.pyc


--------------------------------------------------------------------------------
/tools/audio-preprocess/fish_audio_preprocess/cli/__pycache__/transcribe.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/audio-preprocess/fish_audio_preprocess/cli/__pycache__/transcribe.cpython-310.pyc


--------------------------------------------------------------------------------
/tools/audio-preprocess/fish_audio_preprocess/utils/__pycache__/file.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/audio-preprocess/fish_audio_preprocess/utils/__pycache__/file.cpython-310.pyc


--------------------------------------------------------------------------------
/tools/audio-preprocess/fish_audio_preprocess/utils/__pycache__/loudness_norm.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/audio-preprocess/fish_audio_preprocess/utils/__pycache__/loudness_norm.cpython-310.pyc


--------------------------------------------------------------------------------
/tools/audio-preprocess/fish_audio_preprocess/utils/__pycache__/separate_audio.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/audio-preprocess/fish_audio_preprocess/utils/__pycache__/separate_audio.cpython-310.pyc


--------------------------------------------------------------------------------
/tools/audio-preprocess/fish_audio_preprocess/utils/__pycache__/slice_audio.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/audio-preprocess/fish_audio_preprocess/utils/__pycache__/slice_audio.cpython-310.pyc


--------------------------------------------------------------------------------
/tools/audio-preprocess/fish_audio_preprocess/utils/__pycache__/slice_audio_v2.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/audio-preprocess/fish_audio_preprocess/utils/__pycache__/slice_audio_v2.cpython-310.pyc


--------------------------------------------------------------------------------
/tools/audio-preprocess/fish_audio_preprocess/utils/__pycache__/transcribe.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/audio-preprocess/fish_audio_preprocess/utils/__pycache__/transcribe.cpython-310.pyc


--------------------------------------------------------------------------------
/tools/audio-preprocess/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [project]
 2 | authors = [
 3 |   {name = "Lengyue", email = "lengyue@lengyue.me"},
 4 | ]
 5 | dependencies = [
 6 | ]
 7 | description = "Preprocess audio data"
 8 | license = {text = "Apache"}
 9 | name = "fish-audio-preprocess"
10 | readme = "README.md"
11 | requires-python = ">=3.9"
12 | version = "0.2.8"
13 | 
14 | [project.scripts]
15 | fap = "fish_audio_preprocess.cli.__main__:cli"
16 | 
17 | [build-system]
18 | build-backend = "setuptools.build_meta"
19 | requires = ["setuptools", "setuptools-scm"]
20 | 
[tool.setuptools]
# NOTE(review): only the top-level package is listed here, while the `fap`
# entry point above points at `fish_audio_preprocess.cli.__main__`. Confirm
# that the subpackages are included in non-editable installs.
packages = ["fish_audio_preprocess"]
23 | 
24 | [tool.isort]
25 | profile = "black"
26 | 


--------------------------------------------------------------------------------
/tools/audio-preprocess/tools/lint.py:
--------------------------------------------------------------------------------
1 | import subprocess as sp
2 | 
# Run Black, then isort, over the package and this tools directory.
# Exit codes are not checked, so a failing formatter does not stop the script.
for formatter in ("black", "isort"):
    sp.run([formatter, "fish_audio_preprocess", "tools"])
8 | 


--------------------------------------------------------------------------------
/tools/fish-speech/.dockerignore:
--------------------------------------------------------------------------------
1 | .git
2 | .github
3 | results
4 | data
5 | *.filelist
6 | /data_server/target
7 | checkpoints
8 | 


--------------------------------------------------------------------------------
/tools/fish-speech/.gitignore:
--------------------------------------------------------------------------------
 1 | .DS_Store
 2 | .pgx.*
 3 | .pdm-python
 4 | /fish_speech.egg-info
 5 | __pycache__
 6 | /results
 7 | /data
 8 | /*.test.sh
 9 | *.filelist
10 | filelists
11 | /fish_speech/text/cmudict_cache.pickle
12 | /checkpoints
13 | /.vscode
14 | /data_server/target
15 | /*.npy
16 | /*.wav
17 | /*.mp3
18 | /*.lab
19 | /results
20 | /data
21 | /.idea
22 | ffmpeg.exe
23 | ffprobe.exe
24 | asr-label*
25 | /.cache
26 | /fishenv
27 | /.locale
28 | /demo-audios
29 | /references
30 | /example
31 | /faster_whisper
32 | /.gradio
33 | *log
34 | 


--------------------------------------------------------------------------------
/tools/fish-speech/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | ci:
 2 |   autoupdate_schedule: monthly
 3 | 
 4 | repos:
 5 |   - repo: https://github.com/pycqa/isort
 6 |     rev: 6.0.1
 7 |     hooks:
 8 |       - id: isort
 9 |         args: [--profile=black]
10 | 
11 |   - repo: https://github.com/psf/black
12 |     rev: 25.1.0
13 |     hooks:
14 |       - id: black
15 | 
16 |   - repo: https://github.com/pre-commit/pre-commit-hooks
17 |     rev: v5.0.0
18 |     hooks:
19 |       - id: end-of-file-fixer
20 |       - id: check-yaml
21 |       - id: check-json
22 |       - id: mixed-line-ending
23 |         args: ["--fix=lf"]
24 |       - id: check-added-large-files
25 |         args: ["--maxkb=5000"]
26 | 


--------------------------------------------------------------------------------
/tools/fish-speech/.project-root:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/fish-speech/.project-root


--------------------------------------------------------------------------------
/tools/fish-speech/.readthedocs.yaml:
--------------------------------------------------------------------------------
 1 | # Read the Docs configuration file for MkDocs projects
 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
 3 | 
 4 | # Required
 5 | version: 2
 6 | 
 7 | # Set the version of Python and other tools you might need
 8 | build:
 9 |   os: ubuntu-22.04
10 |   tools:
11 |     python: "3.12"
12 | 
13 | mkdocs:
14 |   configuration: mkdocs.yml
15 | 
16 | # Optionally declare the Python requirements required to build your docs
17 | python:
18 |   install:
19 |   - requirements: docs/requirements.txt
20 | 


--------------------------------------------------------------------------------
/tools/fish-speech/API_FLAGS.txt:
--------------------------------------------------------------------------------
1 | # --infer
2 | --api
3 | --listen 0.0.0.0:8080 \
4 | --llama-checkpoint-path "checkpoints/fish-speech-1.5" \
5 | --decoder-checkpoint-path "checkpoints/fish-speech-1.5/firefly-gan-vq-fsq-8x1024-21hz-generator.pth" \
6 | --decoder-config-name firefly_gan_vq
7 | 


--------------------------------------------------------------------------------
/tools/fish-speech/docker-compose.dev.yml:
--------------------------------------------------------------------------------
 1 | version: '3.8'
 2 | 
 3 | services:
 4 |   fish-speech:
 5 |     build:
 6 |       context: .
 7 |       dockerfile: dockerfile.dev
 8 |     container_name: fish-speech
 9 |     volumes:
10 |       - ./:/exp
11 |     deploy:
12 |       resources:
13 |         reservations:
14 |           devices:
15 |             - driver: nvidia
16 |               count: all
17 |               capabilities: [gpu]
18 |     command: tail -f /dev/null
19 | 


--------------------------------------------------------------------------------
/tools/fish-speech/dockerfile:
--------------------------------------------------------------------------------
 1 | FROM python:3.12-slim-bookworm AS stage-1
 2 | ARG TARGETARCH
 3 | 
 4 | ARG HUGGINGFACE_MODEL=fish-speech-1.5
 5 | ARG HF_ENDPOINT=https://huggingface.co
 6 | 
 7 | WORKDIR /opt/fish-speech
 8 | 
 9 | RUN set -ex \
10 |     && pip install huggingface_hub \
11 |     && HF_ENDPOINT=${HF_ENDPOINT} huggingface-cli download --resume-download fishaudio/${HUGGINGFACE_MODEL} --local-dir checkpoints/${HUGGINGFACE_MODEL}
12 | 
13 | FROM python:3.12-slim-bookworm
14 | ARG TARGETARCH
15 | 
16 | ARG DEPENDENCIES="  \
17 |     ca-certificates \
18 |     libsox-dev \
19 |     build-essential \
20 |     cmake \
21 |     libasound-dev \
22 |     portaudio19-dev \
23 |     libportaudio2 \
24 |     libportaudiocpp0 \
25 |     ffmpeg"
26 | 
27 | RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
28 |     --mount=type=cache,target=/var/lib/apt,sharing=locked \
29 |     set -ex \
30 |     && rm -f /etc/apt/apt.conf.d/docker-clean \
31 |     && echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' >/etc/apt/apt.conf.d/keep-cache \
32 |     && apt-get update \
33 |     && apt-get -y install --no-install-recommends ${DEPENDENCIES} \
34 |     && echo "no" | dpkg-reconfigure dash
35 | 
36 | WORKDIR /opt/fish-speech
37 | 
38 | COPY . .
39 | 
40 | RUN --mount=type=cache,target=/root/.cache,sharing=locked \
41 |     set -ex \
42 |     && pip install -e .[stable]
43 | 
44 | COPY --from=stage-1 /opt/fish-speech/checkpoints /opt/fish-speech/checkpoints
45 | 
46 | ENV GRADIO_SERVER_NAME="0.0.0.0"
47 | 
48 | EXPOSE 7860
49 | 
50 | CMD ["./entrypoint.sh"]
51 | 


--------------------------------------------------------------------------------
/tools/fish-speech/dockerfile.dev:
--------------------------------------------------------------------------------
 1 | ARG VERSION=dev
 2 | ARG BASE_IMAGE=ghcr.io/fishaudio/fish-speech:${VERSION}
 3 | 
 4 | FROM ${BASE_IMAGE}
 5 | 
 6 | ARG TOOLS="               \
 7 |         git               \
 8 |         curl              \
 9 |         build-essential   \
10 |         ffmpeg            \
11 |         libsm6            \
12 |         libxext6          \
13 |         libjpeg-dev       \
14 |         zlib1g-dev        \
15 |         aria2             \
16 |         zsh               \
17 |         openssh-server    \
18 |         sudo              \
19 |         protobuf-compiler \
20 |         libasound-dev     \
21 |         portaudio19-dev   \
22 |         libportaudio2     \
23 |         libportaudiocpp0  \
24 |         cmake"
25 | 
26 | RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
27 |     --mount=type=cache,target=/var/lib/apt,sharing=locked \
28 |     set -ex \
29 |     && apt-get update \
30 |     && apt-get -y install --no-install-recommends ${TOOLS}
31 | 
32 | # Install oh-my-zsh so your terminal looks nice
33 | RUN sh -c "$(curl https://raw.githubusercontent.com/robbyrussell/oh-my-zsh/master/tools/install.sh)" "" --unattended
34 | 
35 | # Set zsh as default shell
36 | RUN chsh -s /usr/bin/zsh
37 | ENV SHELL=/usr/bin/zsh
38 | 


--------------------------------------------------------------------------------
/tools/fish-speech/docs/CNAME:
--------------------------------------------------------------------------------
1 | speech.fish.audio
2 | 


--------------------------------------------------------------------------------
/tools/fish-speech/docs/assets/figs/VS_1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/fish-speech/docs/assets/figs/VS_1.jpg


--------------------------------------------------------------------------------
/tools/fish-speech/docs/assets/figs/VS_1_pt-BR.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/fish-speech/docs/assets/figs/VS_1_pt-BR.png


--------------------------------------------------------------------------------
/tools/fish-speech/docs/assets/figs/agent_gradio.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/fish-speech/docs/assets/figs/agent_gradio.png


--------------------------------------------------------------------------------
/tools/fish-speech/docs/assets/figs/diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/fish-speech/docs/assets/figs/diagram.png


--------------------------------------------------------------------------------
/tools/fish-speech/docs/assets/figs/diagrama.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/fish-speech/docs/assets/figs/diagrama.png


--------------------------------------------------------------------------------
/tools/fish-speech/docs/assets/figs/logo-circle.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/fish-speech/docs/assets/figs/logo-circle.png


--------------------------------------------------------------------------------
/tools/fish-speech/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | mkdocs-material
2 | mkdocs-static-i18n[material]
3 | mkdocs[i18n]
4 | 


--------------------------------------------------------------------------------
/tools/fish-speech/docs/stylesheets/extra.css:
--------------------------------------------------------------------------------
1 | .md-grid {
2 |   max-width: 1440px; 
3 | }
4 | 


--------------------------------------------------------------------------------
/tools/fish-speech/entrypoint.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Entrypoint script: starts tools/run_webui.py, adding "--device cpu" when
# CUDA_ENABLED is set to anything other than "true".

# CUDA_ENABLED defaults to "true" when unset or empty.
CUDA_ENABLED=${CUDA_ENABLED:-true}
DEVICE=""

if [ "${CUDA_ENABLED}" != "true" ]; then
    DEVICE="--device cpu"
fi

# ${DEVICE} is left unquoted so it expands to two words, or to nothing at all.
# exec replaces this shell with the python process.
exec python tools/run_webui.py ${DEVICE}
11 | 


--------------------------------------------------------------------------------
/tools/fish-speech/fish_speech/callbacks/__init__.py:
--------------------------------------------------------------------------------
1 | from .grad_norm import GradNormMonitor
2 | 
3 | __all__ = ["GradNormMonitor"]
4 | 


--------------------------------------------------------------------------------
/tools/fish-speech/fish_speech/configs/firefly_gan_vq.yaml:
--------------------------------------------------------------------------------
 1 | _target_: fish_speech.models.vqgan.modules.firefly.FireflyArchitecture
 2 | spec_transform:
 3 |   _target_: fish_speech.utils.spectrogram.LogMelSpectrogram
 4 |   sample_rate: 44100
 5 |   n_mels: 160
 6 |   n_fft: 2048
 7 |   hop_length: 512
 8 |   win_length: 2048
 9 | backbone:
10 |   _target_: fish_speech.models.vqgan.modules.firefly.ConvNeXtEncoder
11 |   input_channels: 160
12 |   depths: [3, 3, 9, 3]
13 |   dims: [128, 256, 384, 512]
14 |   drop_path_rate: 0.2
15 |   kernel_size: 7
16 | head:
17 |   _target_: fish_speech.models.vqgan.modules.firefly.HiFiGANGenerator
18 |   hop_length: 512
19 |   upsample_rates: [8, 8, 2, 2, 2]  # aka. strides
20 |   upsample_kernel_sizes: [16, 16, 4, 4, 4]
21 |   resblock_kernel_sizes: [3, 7, 11]
22 |   resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
23 |   num_mels: 512
24 |   upsample_initial_channel: 512
25 |   pre_conv_kernel_size: 13
26 |   post_conv_kernel_size: 13
27 | quantizer:
28 |   _target_: fish_speech.models.vqgan.modules.fsq.DownsampleFiniteScalarQuantize
29 |   input_dim: 512
30 |   n_groups: 8
31 |   n_codebooks: 1
32 |   levels: [8, 5, 5, 5]
33 |   downsample_factor: [2, 2]
34 | 


--------------------------------------------------------------------------------
/tools/fish-speech/fish_speech/configs/lora/r_8_alpha_16.yaml:
--------------------------------------------------------------------------------
1 | _target_: fish_speech.models.text2semantic.lora.LoraConfig
2 | r: 8
3 | lora_alpha: 16
4 | lora_dropout: 0.01
5 | 


--------------------------------------------------------------------------------
/tools/fish-speech/fish_speech/datasets/protos/text-data.proto:
--------------------------------------------------------------------------------
 1 | syntax = "proto3";
 2 | 
 3 | package text_data;
 4 | 
 5 | message Semantics {
 6 |     repeated uint32 values = 1;
 7 | }
 8 | 
 9 | message Sentence {
10 |     repeated string texts = 1;
11 |     repeated Semantics semantics = 3;
12 | }
13 | 
14 | message TextData {
15 |     string source = 1;
16 |     string name = 2;
17 |     repeated Sentence sentences = 4;
18 | }
19 | 
20 | message SampledData {
21 |     string source = 1;
22 |     string name = 2;
23 |     repeated Sentence samples = 3;
24 | }
25 | 


--------------------------------------------------------------------------------
/tools/fish-speech/fish_speech/datasets/protos/text_data_stream.py:
--------------------------------------------------------------------------------
 1 | import struct
 2 | 
 3 | from .text_data_pb2 import TextData
 4 | 
 5 | 
 6 | def read_pb_stream(f):
 7 |     while True:
 8 |         buf = f.read(4)
 9 |         if len(buf) == 0:
10 |             break
11 |         size = struct.unpack("I", buf)[0]
12 |         buf = f.read(size)
13 |         text_data = TextData()
14 |         text_data.ParseFromString(buf)
15 |         yield text_data
16 | 
17 | 
def write_pb_stream(f, text_data):
    """Append one length-prefixed record for ``text_data`` to ``f``.

    The length prefix (``struct`` format ``"I"``) is written first, then the
    bytes returned by ``text_data.SerializeToString()``.
    """
    payload = text_data.SerializeToString()
    for part in (struct.pack("I", len(payload)), payload):
        f.write(part)
22 | 
23 | 
def pack_pb_stream(text_data):
    """Return ``text_data`` as one record: a ``"I"`` length prefix plus payload."""
    payload = text_data.SerializeToString()
    prefix = struct.pack("I", len(payload))
    return prefix + payload
27 | 
28 | 
def split_pb_stream(f):
    """Yield each raw record (length prefix plus payload) from a stream.

    Records are not parsed; every yielded ``bytes`` value is the 4-byte
    ``struct`` ``"I"`` length header followed by the payload it announces.

    Args:
        f: Binary file-like object positioned at the start of a record.

    Yields:
        bytes: One complete record per iteration, in stream order.

    Raises:
        EOFError: If the stream ends in the middle of a record's payload.
    """
    while True:
        head = f.read(4)
        if len(head) == 0:
            # Clean end of stream: no further record starts here.
            break
        size = struct.unpack("I", head)[0]
        buf = f.read(size)
        if len(buf) != size:
            # Yielding a short record would pass a header that no longer
            # matches its payload to the consumer.
            raise EOFError(
                f"Truncated record: expected {size} bytes, got {len(buf)}"
            )
        yield head + buf
37 | 


--------------------------------------------------------------------------------
/tools/fish-speech/fish_speech/i18n/README.md:
--------------------------------------------------------------------------------
 1 | ## i18n Folder Attribution
 2 | 
 3 | The `i18n` folder within the `fish_speech` directory contains files initially sourced from the RVC project. In compliance with the MIT license under which these files were released, we acknowledge the original authors and sources below:
 4 | 
 5 | ### fish_speech/i18n/core.py
 6 | 
 7 | **Related code from RVC:**
 8 | [https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/83d6a64e675d9bbd6e92ee450c5f807ed2bb54d8/i18n/i18n.py](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/83d6a64e675d9bbd6e92ee450c5f807ed2bb54d8/i18n/i18n.py)
 9 | 
10 | **Initial commit:**
11 | add localization(添加本地化) [RVC-Project/Retrieval-based-Voice-Conversion-WebUI#35](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/pull/35)
12 | 
13 | **Initial author:**
14 | [@L4Ph](https://github.com/L4Ph)
15 | 
16 | ### fish_speech/i18n/scan.py
17 | 
18 | **Related code from RVC:**
19 | [https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/83d6a64e675d9bbd6e92ee450c5f807ed2bb54d8/i18n/scan_i18n.py](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/83d6a64e675d9bbd6e92ee450c5f807ed2bb54d8/i18n/scan_i18n.py)
20 | 
21 | **Initial commit:**
22 | File for detecting i18n missing keys [RVC-Project/Retrieval-based-Voice-Conversion-WebUI#1058](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/pull/1058)
23 | 
24 | **Initial author:**
25 | [@towzeur](https://github.com/towzeur)
26 | 
27 | We appreciate the contributions of the RVC project and its authors.
28 | 


--------------------------------------------------------------------------------
/tools/fish-speech/fish_speech/i18n/__init__.py:
--------------------------------------------------------------------------------
1 | from .core import i18n
2 | 
3 | __all__ = ["i18n"]
4 | 


--------------------------------------------------------------------------------
/tools/fish-speech/fish_speech/i18n/core.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import locale
 3 | from pathlib import Path
 4 | 
 5 | I18N_FILE_PATH = Path(__file__).parent / "locale"
 6 | DEFAULT_LANGUAGE = "en_US"
 7 | 
 8 | 
 9 | def load_language_list(language):
10 |     with open(I18N_FILE_PATH / f"{language}.json", "r", encoding="utf-8") as f:
11 |         language_list = json.load(f)
12 | 
13 |     return language_list
14 | 
15 | 
class I18nAuto:
    """Callable translator whose language is resolved once, at construction.

    The language is read from a ``.locale`` file in the current working
    directory when that file exists, otherwise taken from
    ``locale.getdefaultlocale()``. If no ``<language>.json`` exists under
    ``I18N_FILE_PATH``, ``DEFAULT_LANGUAGE`` is used instead.
    """

    def __init__(self):
        override = Path(".locale")

        if not override.exists():
            # getlocale can't identify the system's language ((None, None))
            # NOTE(review): getdefaultlocale() is deprecated since Python 3.11;
            # a replacement should be planned.
            chosen = locale.getdefaultlocale()[0]
        else:
            with open(override, "r", encoding="utf-8") as f:
                chosen = f.read().strip()

        # Fall back to the default when there is no translation file for the choice.
        if not (I18N_FILE_PATH / f"{chosen}.json").exists():
            chosen = DEFAULT_LANGUAGE

        self.language = chosen
        self.language_map = load_language_list(chosen)

    def __call__(self, key):
        # Keys without a translation are returned unchanged.
        return self.language_map.get(key, key)

    def __repr__(self):
        return "Use Language: " + self.language
38 | 
39 | 
40 | i18n = I18nAuto()
41 | 


--------------------------------------------------------------------------------
/tools/fish-speech/fish_speech/inference_engine/utils.py:
--------------------------------------------------------------------------------
 1 | import io
 2 | import wave
 3 | from dataclasses import dataclass
 4 | from typing import Literal, Optional, Tuple
 5 | 
 6 | import numpy as np
 7 | 
 8 | 
@dataclass
class InferenceResult:
    """A single item emitted while running inference."""

    # Kind of item; restricted to one of the four literal values below.
    code: Literal["header", "segment", "error", "final"]
    # Optional (int, ndarray) pair — presumably (sample_rate, samples);
    # TODO confirm against the code that constructs these results.
    audio: Optional[Tuple[int, np.ndarray]]
    # Optional exception object — presumably populated when code == "error".
    error: Optional[Exception]
14 | 
15 | 
def wav_chunk_header(
    sample_rate: int = 44100, bit_depth: int = 16, channels: int = 1
) -> bytes:
    """Return the bytes of a WAV header for an audio stream with no frames.

    Args:
        sample_rate: Frame rate written to the header.
        bit_depth: Bits per sample; converted to a byte width with ``// 8``.
        channels: Number of audio channels.

    Returns:
        The header bytes produced by the ``wave`` module when a writer with
        these parameters is closed without writing any frames.
    """
    with io.BytesIO() as stream:
        # Closing the writer is what flushes the header into the stream.
        with wave.open(stream, "wb") as writer:
            writer.setnchannels(channels)
            writer.setsampwidth(bit_depth // 8)
            writer.setframerate(sample_rate)

        return stream.getvalue()
30 | 


--------------------------------------------------------------------------------
/tools/fish-speech/fish_speech/models/text2semantic/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/fish-speech/fish_speech/models/text2semantic/__init__.py


--------------------------------------------------------------------------------
/tools/fish-speech/fish_speech/models/vqgan/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/fish-speech/fish_speech/models/vqgan/__init__.py


--------------------------------------------------------------------------------
/tools/fish-speech/fish_speech/scheduler.py:
--------------------------------------------------------------------------------
 1 | import math
 2 | 
 3 | 
 4 | def get_cosine_schedule_with_warmup_lr_lambda(
 5 |     current_step: int,
 6 |     *,
 7 |     num_warmup_steps: int | float,
 8 |     num_training_steps: int,
 9 |     num_cycles: float = 0.5,
10 |     final_lr_ratio: float = 0.0,
11 | ):
12 |     if 0 < num_warmup_steps < 1:  # float mode
13 |         num_warmup_steps = int(num_warmup_steps * num_training_steps)
14 | 
15 |     if current_step < num_warmup_steps:
16 |         return float(current_step) / float(max(1, num_warmup_steps))
17 | 
18 |     progress = float(current_step - num_warmup_steps) / float(
19 |         max(1, num_training_steps - num_warmup_steps)
20 |     )
21 | 
22 |     return max(
23 |         final_lr_ratio,
24 |         0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)),
25 |     )
26 | 
27 | 
28 | def get_constant_schedule_with_warmup_lr_lambda(
29 |     current_step: int,
30 |     *,
31 |     num_warmup_steps: int | float,
32 |     num_training_steps: int | None = None,
33 | ):
34 |     if 0 < num_warmup_steps < 1:  # float mode
35 |         num_warmup_steps = int(num_warmup_steps * num_training_steps)
36 | 
37 |     if current_step < num_warmup_steps:
38 |         return float(current_step) / float(max(1, num_warmup_steps))
39 | 
40 |     return 1.0
41 | 


--------------------------------------------------------------------------------
/tools/fish-speech/fish_speech/text/__init__.py:
--------------------------------------------------------------------------------
1 | from .clean import clean_text
2 | from .spliter import split_text
3 | 
4 | __all__ = ["clean_text", "split_text"]
5 | 


--------------------------------------------------------------------------------
/tools/fish-speech/fish_speech/text/clean.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | SYMBOLS_MAPPING = {
 4 |     "‘": "'",
 5 |     "’": "'",
 6 | }
 7 | 
 8 | REPLACE_SYMBOL_REGEX = re.compile(
 9 |     "|".join(re.escape(p) for p in SYMBOLS_MAPPING.keys())
10 | )
11 | 
12 | 
13 | EMOJI_REGEX = re.compile(
14 |     "["
15 |     "\U0001f600-\U0001f64f"  # emoticons
16 |     "\U0001f300-\U0001f5ff"  # symbols & pictographs
17 |     "\U0001f680-\U0001f6ff"  # transport & map symbols
18 |     "\U0001f1e0-\U0001f1ff"  # flags (iOS)
19 |     "]+",
20 |     flags=re.UNICODE,
21 | )
22 | 
23 | 
def clean_text(text):
    """Normalize ``text`` and return the result.

    Steps, in order:
      1. strip leading/trailing whitespace;
      2. replace every symbol listed in ``SYMBOLS_MAPPING`` with its mapped
         counterpart;
      3. remove characters matched by ``EMOJI_REGEX``;
      4. collapse each run of two or more commas into a single comma
         (periods are not touched by this step).
    """
    stripped = text.strip()

    normalized = REPLACE_SYMBOL_REGEX.sub(
        lambda match: SYMBOLS_MAPPING[match.group()], stripped
    )

    without_emoji = EMOJI_REGEX.sub(r"", normalized)

    # The pattern matches commas only, so the first matched character is
    # always "," and a literal replacement is equivalent.
    return re.sub(r"[,]{2,}", ",", without_emoji)
38 | 


--------------------------------------------------------------------------------
/tools/fish-speech/fish_speech/utils/__init__.py:
--------------------------------------------------------------------------------
 1 | from .braceexpand import braceexpand
 2 | from .context import autocast_exclude_mps
 3 | from .file import get_latest_checkpoint
 4 | from .instantiators import instantiate_callbacks, instantiate_loggers
 5 | from .logger import RankedLogger
 6 | from .logging_utils import log_hyperparameters
 7 | from .rich_utils import enforce_tags, print_config_tree
 8 | from .utils import extras, get_metric_value, set_seed, task_wrapper
 9 | 
10 | __all__ = [
11 |     "enforce_tags",
12 |     "extras",
13 |     "get_metric_value",
14 |     "RankedLogger",
15 |     "instantiate_callbacks",
16 |     "instantiate_loggers",
17 |     "log_hyperparameters",
18 |     "print_config_tree",
19 |     "task_wrapper",
20 |     "braceexpand",
21 |     "get_latest_checkpoint",
22 |     "autocast_exclude_mps",
23 |     "set_seed",
24 | ]
25 | 


--------------------------------------------------------------------------------
/tools/fish-speech/fish_speech/utils/context.py:
--------------------------------------------------------------------------------
 1 | from contextlib import nullcontext
 2 | 
 3 | import torch
 4 | 
 5 | 
 6 | def autocast_exclude_mps(
 7 |     device_type: str, dtype: torch.dtype
 8 | ) -> nullcontext | torch.autocast:
 9 |     return (
10 |         nullcontext()
11 |         if torch.backends.mps.is_available()
12 |         else torch.autocast(device_type, dtype)
13 |     )
14 | 


--------------------------------------------------------------------------------
/tools/fish-speech/fish_speech/utils/logging_utils.py:
--------------------------------------------------------------------------------
 1 | from lightning.pytorch.utilities import rank_zero_only
 2 | 
 3 | from fish_speech.utils import logger as log
 4 | 
 5 | 
 6 | @rank_zero_only
 7 | def log_hyperparameters(object_dict: dict) -> None:
 8 |     """Controls which config parts are saved by lightning loggers.
 9 | 
10 |     Additionally saves:
11 |     - Number of model parameters
12 |     """
13 | 
14 |     hparams = {}
15 | 
16 |     cfg = object_dict["cfg"]
17 |     model = object_dict["model"]
18 |     trainer = object_dict["trainer"]
19 | 
20 |     if not trainer.logger:
21 |         log.warning("Logger not found! Skipping hyperparameter logging...")
22 |         return
23 | 
24 |     hparams["model"] = cfg["model"]
25 | 
26 |     # save number of model parameters
27 |     hparams["model/params/total"] = sum(p.numel() for p in model.parameters())
28 |     hparams["model/params/trainable"] = sum(
29 |         p.numel() for p in model.parameters() if p.requires_grad
30 |     )
31 |     hparams["model/params/non_trainable"] = sum(
32 |         p.numel() for p in model.parameters() if not p.requires_grad
33 |     )
34 | 
35 |     hparams["data"] = cfg["data"]
36 |     hparams["trainer"] = cfg["trainer"]
37 | 
38 |     hparams["callbacks"] = cfg.get("callbacks")
39 |     hparams["extras"] = cfg.get("extras")
40 | 
41 |     hparams["task_name"] = cfg.get("task_name")
42 |     hparams["tags"] = cfg.get("tags")
43 |     hparams["ckpt_path"] = cfg.get("ckpt_path")
44 |     hparams["seed"] = cfg.get("seed")
45 | 
46 |     # send hparams to all loggers
47 |     for logger in trainer.loggers:
48 |         logger.log_hyperparams(hparams)
49 | 


--------------------------------------------------------------------------------
/tools/fish-speech/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [project]
 2 | name = "fish-speech"
 3 | version = "0.1.0"
 4 | authors = [
 5 |     {name = "Lengyue", email = "lengyue@lengyue.me"},
 6 | ]
 7 | description = "Fish Speech"
 8 | readme = "README.md"
 9 | requires-python = ">=3.10"
10 | keywords = ["TTS", "Speech"]
11 | license = {text = "CC BY-NC-SA 4.0"}
12 | classifiers = [
13 |     "Programming Language :: Python :: 3",
14 | ]
15 | dependencies = [
16 |     "numpy<=1.26.4",
17 |     "transformers>=4.45.2",
18 |     "datasets==2.18.0",
19 |     "lightning>=2.1.0",
20 |     "hydra-core>=1.3.2",
21 |     "tensorboard>=2.14.1",
22 |     "natsort>=8.4.0",
23 |     "einops>=0.7.0",
24 |     "librosa>=0.10.1",
25 |     "rich>=13.5.3",
26 |     "gradio>5.0.0",
27 |     "wandb>=0.15.11",
28 |     "grpcio>=1.58.0",
29 |     "kui>=1.6.0",
30 |     "uvicorn>=0.30.0",
31 |     "loguru>=0.6.0",
32 |     "loralib>=0.1.2",
33 |     "pyrootutils>=1.0.4",
34 |     "vector_quantize_pytorch==1.14.24",
35 |     "resampy>=0.4.3",
36 |     "einx[torch]==0.2.2",
37 |     "zstandard>=0.22.0",
38 |     "pydub",
39 |     "pyaudio",
40 |     "faster_whisper",
41 |     "modelscope==1.17.1",
42 |     "funasr==1.1.5",
43 |     "opencc-python-reimplemented==0.1.7",
44 |     "silero-vad",
45 |     "ormsgpack",
46 |     "tiktoken>=0.8.0",
47 |     "pydantic==2.9.2",
48 |     "cachetools",
49 | ]
50 | 
51 | [project.optional-dependencies]
52 | stable = [
53 |     "torch<=2.4.1",
54 |     "torchaudio",
55 | ]
56 | 
57 | [build-system]
58 | requires = ["setuptools", "setuptools-scm"]
59 | build-backend = "setuptools.build_meta"
60 | 
61 | [tool.setuptools]
62 | packages = ["fish_speech", "tools"]
63 | 


--------------------------------------------------------------------------------
/tools/fish-speech/pyrightconfig.json:
--------------------------------------------------------------------------------
1 | {
2 |     "exclude": [
3 |         "data",
4 |         "filelists"
5 |     ]
6 | }
7 | 


--------------------------------------------------------------------------------
/tools/fish-speech/temp/codes_0.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/fish-speech/temp/codes_0.npy


--------------------------------------------------------------------------------
/tools/fish-speech/tools/download_models.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | from huggingface_hub import hf_hub_download
 4 | 
 5 | 
 6 | # Download
 7 | def check_and_download_files(repo_id, file_list, local_dir):
 8 |     os.makedirs(local_dir, exist_ok=True)
 9 |     for file in file_list:
10 |         file_path = os.path.join(local_dir, file)
11 |         if not os.path.exists(file_path):
12 |             print(f"{file} 不存在，从 Hugging Face 仓库下载...")
13 |             hf_hub_download(
14 |                 repo_id=repo_id,
15 |                 filename=file,
16 |                 resume_download=True,
17 |                 local_dir=local_dir,
18 |                 local_dir_use_symlinks=False,
19 |             )
20 |         else:
21 |             print(f"{file} 已存在，跳过下载。")
22 | 
23 | 
# 1st set: weights, tokenizer and config files from the fish-speech-1.5
# repository, stored under ./checkpoints/fish-speech-1.5
repo_id_1 = "fishaudio/fish-speech-1.5"
local_dir_1 = "./checkpoints/fish-speech-1.5"
files_1 = [
    ".gitattributes",
    "model.pth",
    "README.md",
    "special_tokens.json",
    "tokenizer.tiktoken",
    "config.json",
    "firefly-gan-vq-fsq-8x1024-21hz-generator.pth",
]

# 3rd set (there is no 2nd set in this script): ffmpeg / ffprobe .exe
# binaries, stored in the current directory
repo_id_3 = "fishaudio/fish-speech-1"
local_dir_3 = "./"
files_3 = [
    "ffmpeg.exe",
    "ffprobe.exe",
]

# 4th set: the asr-label .exe binary, stored in the current directory
repo_id_4 = "SpicyqSama007/fish-speech-packed"
local_dir_4 = "./"
files_4 = [
    "asr-label-win-x64.exe",
]

# Runs at import/execution time: fetch whatever is missing from each set.
check_and_download_files(repo_id_1, files_1, local_dir_1)

check_and_download_files(repo_id_3, files_3, local_dir_3)
check_and_download_files(repo_id_4, files_4, local_dir_4)
56 | 


--------------------------------------------------------------------------------
/tools/fish-speech/tools/extract_model.py:
--------------------------------------------------------------------------------
 1 | import click
 2 | import torch
 3 | from loguru import logger
 4 | 
 5 | 
 6 | @click.command()
 7 | @click.argument("model_path")
 8 | @click.argument("output_path")
 9 | def main(model_path, output_path):
10 |     if model_path == output_path:
11 |         logger.error("Model path and output path are the same")
12 |         return
13 | 
14 |     logger.info(f"Loading model from {model_path}")
15 |     state_dict = torch.load(model_path, map_location="cpu")["state_dict"]
16 |     torch.save(state_dict, output_path)
17 |     logger.info(f"Model saved to {output_path}")
18 | 
19 | 
20 | if __name__ == "__main__":
21 |     main()
22 | 


--------------------------------------------------------------------------------
/tools/fish-speech/tools/server/exception_handler.py:
--------------------------------------------------------------------------------
 1 | import traceback
 2 | from http import HTTPStatus
 3 | 
 4 | from kui.asgi import HTTPException, JSONResponse
 5 | 
 6 | 
 7 | class ExceptionHandler:
 8 | 
 9 |     async def http_exception_handler(self, exc: HTTPException):
10 |         return JSONResponse(
11 |             dict(
12 |                 statusCode=exc.status_code,
13 |                 message=exc.content,
14 |                 error=HTTPStatus(exc.status_code).phrase,
15 |             ),
16 |             exc.status_code,
17 |             exc.headers,
18 |         )
19 | 
20 |     async def other_exception_handler(self, exc: Exception):
21 |         traceback.print_exc()
22 | 
23 |         status = HTTPStatus.INTERNAL_SERVER_ERROR
24 |         return JSONResponse(
25 |             dict(statusCode=status, message=str(exc), error=status.phrase),
26 |             status,
27 |         )
28 | 


--------------------------------------------------------------------------------
/tools/fish-speech/tools/server/inference.py:
--------------------------------------------------------------------------------
 1 | from http import HTTPStatus
 2 | 
 3 | import numpy as np
 4 | from kui.asgi import HTTPException
 5 | 
 6 | from fish_speech.inference_engine import TTSInferenceEngine
 7 | from fish_speech.utils.schema import ServeTTSRequest
 8 | 
AMPLITUDE = 32768  # 2**15; segment audio is multiplied by this before being cast to int16
10 | 
11 | 
12 | def inference_wrapper(req: ServeTTSRequest, engine: TTSInferenceEngine):
13 |     """
14 |     Wrapper for the inference function.
15 |     Used in the API server.
16 |     """
17 |     count = 0
18 |     for result in engine.inference(req):
19 |         match result.code:
20 |             case "header":
21 |                 if isinstance(result.audio, tuple):
22 |                     yield result.audio[1]
23 | 
24 |             case "error":
25 |                 raise HTTPException(
26 |                     HTTPStatus.INTERNAL_SERVER_ERROR,
27 |                     content=str(result.error),
28 |                 )
29 | 
30 |             case "segment":
31 |                 count += 1
32 |                 if isinstance(result.audio, tuple):
33 |                     yield (result.audio[1] * AMPLITUDE).astype(np.int16).tobytes()
34 | 
35 |             case "final":
36 |                 count += 1
37 |                 if isinstance(result.audio, tuple):
38 |                     yield result.audio[1]
39 |                 return None  # Stop the generator
40 | 
41 |     if count == 0:
42 |         raise HTTPException(
43 |             HTTPStatus.INTERNAL_SERVER_ERROR,
44 |             content="No audio generated, please check the input text.",
45 |         )
46 | 


--------------------------------------------------------------------------------
/tools/fish-speech/tools/webui/variables.py:
--------------------------------------------------------------------------------
 1 | from fish_speech.i18n import i18n
 2 | 
 3 | HEADER_MD = f"""# Fish Speech
 4 | 
 5 | {i18n("A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).")}  
 6 | 
 7 | {i18n("You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1.5).")}  
 8 | 
 9 | {i18n("Related code and weights are released under CC BY-NC-SA 4.0 License.")}  
10 | 
11 | {i18n("We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.")}  
12 | """
13 | 
14 | TEXTBOX_PLACEHOLDER = i18n("Put your text here.")
15 | 


--------------------------------------------------------------------------------
/tools/seed-vc/.gitignore:
--------------------------------------------------------------------------------
 1 | # general things to ignore
 2 | .DS_Store
 3 | build/
 4 | build_contrib/
 5 | dist/
 6 | .cache/
 7 | *.egg-info/
 8 | *.egg
 9 | *.py[cod]
10 | __pycache__/
11 | *.so
12 | *~
13 | 
14 | # IDE
15 | .vscode/
16 | 
17 | # misc
18 | checkpoints/
19 | test_waves/
20 | reconstructed/
21 | .python-version
22 | ruff.log
23 | /configs/inuse/
24 | 


--------------------------------------------------------------------------------
/tools/seed-vc/assets/real-time-demo.webm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/seed-vc/assets/real-time-demo.webm


--------------------------------------------------------------------------------
/tools/seed-vc/baselines/cosyvoice.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import torch
 3 | import sys
 4 | import librosa
 5 | sys.path.append('../CosyVoice')
 6 | import sys
 7 | sys.path.append("../CosyVoice/third_party/Matcha-TTS")
 8 | from cosyvoice.cli.cosyvoice import CosyVoice
 9 | from cosyvoice.utils.file_utils import load_wav
10 | import torchaudio
11 | # from modelscope import snapshot_download
12 | # snapshot_download('iic/CosyVoice-300M-25Hz', local_dir='pretrained_models/CosyVoice-300M-25Hz')
13 | cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-25Hz')
14 | device = "cuda:0" if torch.cuda.is_available() else "cpu"
15 | 
@torch.no_grad()
def convert(source_path, reference_path, output_path):
    """Run CosyVoice voice conversion and return 16 kHz waveforms.

    Args:
        source_path: Path of the audio to convert.
        reference_path: Path of the reference (prompt) audio.
        output_path: Accepted for signature parity with the other baselines;
            this function does not write anything to it.

    Returns:
        ``(prompt_speech_16k, output_wav_16k)`` — the loaded reference audio
        and the last chunk produced by ``inference_vc`` resampled from
        22050 Hz to 16000 Hz.

    Raises:
        RuntimeError: If ``inference_vc`` yields no output at all.
    """
    prompt_speech_16k = load_wav(reference_path, 16000)
    source_speech_16k = load_wav(source_path, 16000)

    # Keep only the last yielded chunk, as before; start from None so an empty
    # generator is reported explicitly instead of hitting an unbound local.
    output_wav_22k = None
    for chunk in cosyvoice.inference_vc(source_speech_16k, prompt_speech_16k, stream=False):
        output_wav_22k = chunk['tts_speech']
    if output_wav_22k is None:
        raise RuntimeError(
            f"CosyVoice inference_vc produced no output for {source_path!r}"
        )

    output_wav_16k = torchaudio.functional.resample(output_wav_22k, 22050, 16000)
    return prompt_speech_16k, output_wav_16k


--------------------------------------------------------------------------------
/tools/seed-vc/baselines/dnsmos/model_v8.onnx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/seed-vc/baselines/dnsmos/model_v8.onnx


--------------------------------------------------------------------------------
/tools/seed-vc/baselines/dnsmos/sig_bak_ovr.onnx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/seed-vc/baselines/dnsmos/sig_bak_ovr.onnx


--------------------------------------------------------------------------------
/tools/seed-vc/baselines/openvoice.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import torch
 3 | import sys
 4 | import librosa
 5 | sys.path.append('../OpenVoice')
 6 | from openvoice import se_extractor
 7 | from openvoice.api import ToneColorConverter
 8 | 
 9 | ckpt_converter = '../OpenVoice/checkpoints_v2/converter'
10 | device = "cuda:0" if torch.cuda.is_available() else "cpu"
11 | 
12 | tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
13 | tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')
14 | 
def convert(source_path, reference_path, output_path):
    """Run OpenVoice tone-color conversion and return 16 kHz waveforms.

    Extracts speaker embeddings for the reference and the source audio,
    writes the converted audio to ``output_path``, then reloads the reference
    and the converted file at 16 kHz.

    Returns:
        ``(ref_wav_16k, output_wav_16k)`` as tensors with a leading dimension
        of size 1 added by ``unsqueeze(0)``.
    """
    target_se, _ = se_extractor.get_se(reference_path, tone_color_converter, vad=False)
    source_se, _ = se_extractor.get_se(source_path, tone_color_converter, vad=False)

    tone_color_converter.convert(
        audio_src_path=source_path,
        src_se=source_se,
        tgt_se=target_se,
        output_path=output_path,
        message="@Myshell",
    )

    def _load_16k(path):
        # Load at 16 kHz and add a leading dimension of size 1.
        samples, _ = librosa.load(path, sr=16000)
        return torch.tensor(samples).unsqueeze(0)

    return _load_16k(reference_path), _load_16k(output_path)


--------------------------------------------------------------------------------
/tools/seed-vc/campplus_cn_common.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/seed-vc/campplus_cn_common.bin


--------------------------------------------------------------------------------
/tools/seed-vc/conda-nix-vc-py310.yaml:
--------------------------------------------------------------------------------
 1 | name: py310-nix-vc
 2 | channels:
 3 |   - pytorch-nightly
 4 |   - conda-forge
 5 |   - nvidia
 6 | dependencies:
 7 |   - python=3.10.14
 8 |   - pytorch-cuda=12.4
 9 |   - pytorch
10 |   - torchvision
11 |   - torchaudio
12 |   - pip
13 |   - pip:
14 |     - scipy
15 |     - huggingface-hub
16 |     - onnxruntime-gpu
17 |     - librosa
18 |     - munch
19 |     - einops
20 |     - openai-whisper
21 |     - ruff
22 |     - yapf
23 |     - isort
24 |     - ipython
25 |     - jedi-language-server
26 | 


--------------------------------------------------------------------------------
/tools/seed-vc/configs/config.json:
--------------------------------------------------------------------------------
1 | {"reference_audio_path": "D:/FAcodec/test_waves/kobe_0.wav", "sg_hostapi": "MME", "sg_wasapi_exclusive": false, "sg_input_device": "\u9ea6\u514b\u98ce (Razer BlackShark V2 HS 2.4", "sg_output_device": "\u626c\u58f0\u5668 (Razer BlackShark V2 HS 2.4", "sr_type": "sr_model", "diffusion_steps": 10.0, "inference_cfg_rate": 0.0, "max_prompt_length": 3.0, "block_time": 0.7, "crossfade_length": 0.04, "extra_time": 0.5, "extra_time_right": 0.02}


--------------------------------------------------------------------------------
/tools/seed-vc/configs/hifigan.yml:
--------------------------------------------------------------------------------
 1 | hift:
 2 |     in_channels: 80
 3 |     base_channels: 512
 4 |     nb_harmonics: 8
 5 |     sampling_rate: 22050
 6 |     nsf_alpha: 0.1
 7 |     nsf_sigma: 0.003
 8 |     nsf_voiced_threshold: 10
 9 |     upsample_rates: [8, 8]
10 |     upsample_kernel_sizes: [16, 16]
11 |     istft_params:
12 |         n_fft: 16
13 |         hop_len: 4
14 |     resblock_kernel_sizes: [3, 7, 11]
15 |     resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
16 |     source_resblock_kernel_sizes: [7, 11]
17 |     source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]]
18 |     lrelu_slope: 0.1
19 |     audio_limit: 0.99
20 | f0_predictor:
21 |     num_class: 1
22 |     in_channels: 80
23 |     cond_channels: 512
24 | 
25 | pretrained_model_path: "checkpoints/hift.pt"
26 | 


--------------------------------------------------------------------------------
/tools/seed-vc/dac/__init__.py:
--------------------------------------------------------------------------------
 1 | __version__ = "1.0.0"
 2 | 
 3 | # preserved here for legacy reasons
 4 | __model_version__ = "latest"
 5 | 
 6 | import audiotools
 7 | 
 8 | audiotools.ml.BaseModel.INTERN += ["dac.**"]
 9 | audiotools.ml.BaseModel.EXTERN += ["einops"]
10 | 
11 | 
12 | from . import nn
13 | from . import model
14 | from . import utils
15 | from .model import DAC
16 | from .model import DACFile
17 | 


--------------------------------------------------------------------------------
/tools/seed-vc/dac/__main__.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | 
 3 | import argbind
 4 | 
 5 | from dac.utils import download
 6 | from dac.utils.decode import decode
 7 | from dac.utils.encode import encode
 8 | 
 9 | STAGES = ["encode", "decode", "download"]
10 | 
11 | 
def run(stage: str):
    """Dispatch to the module-level callable named by ``stage``.

    Parameters
    ----------
    stage : str
        Name of the stage to run; must be one of ``STAGES``.

    Raises
    ------
    ValueError
        If ``stage`` is not listed in ``STAGES``.
    """
    if stage not in STAGES:
        raise ValueError(f"Unknown command: {stage}. Allowed commands are {STAGES}")

    # Every stage is invoked the same way, with no arguments.
    globals()[stage]()
29 | 
30 | 
31 | if __name__ == "__main__":
32 |     group = sys.argv.pop(1)
33 |     args = argbind.parse_args(group=group)
34 | 
35 |     with argbind.scope(args):
36 |         run(group)
37 | 


--------------------------------------------------------------------------------
/tools/seed-vc/dac/model/__init__.py:
--------------------------------------------------------------------------------
1 | from .base import CodecMixin
2 | from .base import DACFile
3 | from .dac import DAC
4 | from .discriminator import Discriminator
5 | 


--------------------------------------------------------------------------------
/tools/seed-vc/dac/nn/__init__.py:
--------------------------------------------------------------------------------
1 | from . import layers
2 | from . import loss
3 | from . import quantize
4 | 


--------------------------------------------------------------------------------
/tools/seed-vc/dac/nn/layers.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import torch
 3 | import torch.nn as nn
 4 | import torch.nn.functional as F
 5 | from einops import rearrange
 6 | from torch.nn.utils import weight_norm
 7 | 
 8 | 
 9 | def WNConv1d(*args, **kwargs):
10 |     return weight_norm(nn.Conv1d(*args, **kwargs))
11 | 
12 | 
def WNConvTranspose1d(*args, **kwargs):
    """Create an ``nn.ConvTranspose1d`` from the given arguments, wrapped in weight normalization."""
    deconv = nn.ConvTranspose1d(*args, **kwargs)
    return weight_norm(deconv)
15 | 
16 | 
17 | # Scripting this brings model speed up 1.4x
@torch.jit.script
def snake(x, alpha):
    """Apply ``x + sin(alpha * x)^2 / (alpha + 1e-9)`` and keep the input shape.

    The input is viewed as ``(shape[0], shape[1], -1)`` for the computation
    and reshaped back to its original shape before returning.
    """
    original_shape = x.shape
    flat = x.reshape(original_shape[0], original_shape[1], -1)
    # The small constant keeps the reciprocal finite when alpha is zero.
    inv_alpha = (alpha + 1e-9).reciprocal()
    periodic = torch.sin(alpha * flat).pow(2)
    out = flat + inv_alpha * periodic
    return out.reshape(original_shape)
25 | 
26 | 
class Snake1d(nn.Module):
    """Module wrapper around ``snake`` with a learnable ``alpha`` parameter.

    ``alpha`` has shape ``(1, channels, 1)`` and is initialized to ones.
    """

    def __init__(self, channels):
        super().__init__()
        initial_alpha = torch.ones(1, channels, 1)
        self.alpha = nn.Parameter(initial_alpha)

    def forward(self, x):
        """Apply ``snake`` to ``x`` using this module's ``alpha``."""
        return snake(x, self.alpha)
34 | 


--------------------------------------------------------------------------------
/tools/seed-vc/examples/reference/azuma_0.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/seed-vc/examples/reference/azuma_0.wav


--------------------------------------------------------------------------------
/tools/seed-vc/examples/reference/dingzhen_0.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/seed-vc/examples/reference/dingzhen_0.wav


--------------------------------------------------------------------------------
/tools/seed-vc/examples/reference/s1p1.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/seed-vc/examples/reference/s1p1.wav


--------------------------------------------------------------------------------
/tools/seed-vc/examples/reference/s1p2.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/seed-vc/examples/reference/s1p2.wav


--------------------------------------------------------------------------------
/tools/seed-vc/examples/reference/s2p1.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/seed-vc/examples/reference/s2p1.wav


--------------------------------------------------------------------------------
/tools/seed-vc/examples/reference/s2p2.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/seed-vc/examples/reference/s2p2.wav


--------------------------------------------------------------------------------
/tools/seed-vc/examples/reference/s3p1.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/seed-vc/examples/reference/s3p1.wav


--------------------------------------------------------------------------------
/tools/seed-vc/examples/reference/s3p2.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/seed-vc/examples/reference/s3p2.wav


--------------------------------------------------------------------------------
/tools/seed-vc/examples/reference/s4p1.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/seed-vc/examples/reference/s4p1.wav


--------------------------------------------------------------------------------
/tools/seed-vc/examples/reference/s4p2.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/seed-vc/examples/reference/s4p2.wav


--------------------------------------------------------------------------------
/tools/seed-vc/examples/reference/teio_0.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/seed-vc/examples/reference/teio_0.wav


--------------------------------------------------------------------------------
/tools/seed-vc/examples/reference/trump_0.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/seed-vc/examples/reference/trump_0.wav


--------------------------------------------------------------------------------
/tools/seed-vc/examples/source/TECHNOPOLIS - 2085 [vocals]_[cut_14sec].wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/seed-vc/examples/source/TECHNOPOLIS - 2085 [vocals]_[cut_14sec].wav


--------------------------------------------------------------------------------
/tools/seed-vc/examples/source/Wiz Khalifa,Charlie Puth - See You Again [vocals]_[cut_28sec].wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/seed-vc/examples/source/Wiz Khalifa,Charlie Puth - See You Again [vocals]_[cut_28sec].wav


--------------------------------------------------------------------------------
/tools/seed-vc/examples/source/glados_0.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/seed-vc/examples/source/glados_0.wav


--------------------------------------------------------------------------------
/tools/seed-vc/examples/source/jay_0.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/seed-vc/examples/source/jay_0.wav


--------------------------------------------------------------------------------
/tools/seed-vc/examples/source/source_s1.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/seed-vc/examples/source/source_s1.wav


--------------------------------------------------------------------------------
/tools/seed-vc/examples/source/source_s2.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/seed-vc/examples/source/source_s2.wav


--------------------------------------------------------------------------------
/tools/seed-vc/examples/source/source_s3.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/seed-vc/examples/source/source_s3.wav


--------------------------------------------------------------------------------
/tools/seed-vc/examples/source/source_s4.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/seed-vc/examples/source/source_s4.wav


--------------------------------------------------------------------------------
/tools/seed-vc/examples/source/yae_0.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/seed-vc/examples/source/yae_0.wav


--------------------------------------------------------------------------------
/tools/seed-vc/hf_utils.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from huggingface_hub import hf_hub_download
 3 | 
 4 | 
 5 | def load_custom_model_from_hf(repo_id, model_filename="pytorch_model.bin", config_filename="config.yml"):
 6 |     os.makedirs("./checkpoints", exist_ok=True)
 7 |     model_path = hf_hub_download(repo_id=repo_id, filename=model_filename, cache_dir="./checkpoints")
 8 |     if config_filename is None:
 9 |         return model_path
10 |     config_path = hf_hub_download(repo_id=repo_id, filename=config_filename, cache_dir="./checkpoints")
11 | 
12 |     return model_path, config_path


--------------------------------------------------------------------------------
/tools/seed-vc/modules/alias_free_torch/__init__.py:
--------------------------------------------------------------------------------
1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
2 | 
3 | from .filter import *
4 | from .resample import *
5 | from .act import *
6 | 


--------------------------------------------------------------------------------
/tools/seed-vc/modules/alias_free_torch/act.py:
--------------------------------------------------------------------------------
 1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
 2 | 
 3 | import torch.nn as nn
 4 | from .resample import UpSample1d, DownSample1d
 5 | 
 6 | 
 7 | class Activation1d(nn.Module):
 8 |     def __init__(
 9 |         self,
10 |         activation,
11 |         up_ratio: int = 2,
12 |         down_ratio: int = 2,
13 |         up_kernel_size: int = 12,
14 |         down_kernel_size: int = 12,
15 |     ):
16 |         super().__init__()
17 |         self.up_ratio = up_ratio
18 |         self.down_ratio = down_ratio
19 |         self.act = activation
20 |         self.upsample = UpSample1d(up_ratio, up_kernel_size)
21 |         self.downsample = DownSample1d(down_ratio, down_kernel_size)
22 | 
23 |     # x: [B,C,T]
24 |     def forward(self, x):
25 |         x = self.upsample(x)
26 |         x = self.act(x)
27 |         x = self.downsample(x)
28 | 
29 |         return x
30 | 


--------------------------------------------------------------------------------
/tools/seed-vc/modules/bigvgan/alias_free_activation/cuda/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/seed-vc/modules/bigvgan/alias_free_activation/cuda/__init__.py


--------------------------------------------------------------------------------
/tools/seed-vc/modules/bigvgan/alias_free_activation/cuda/anti_alias_activation.cpp:
--------------------------------------------------------------------------------
 1 | /* coding=utf-8
 2 |  * Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 |  #include <torch/extension.h>
18 | 
// Declaration of the forward entry point. No definition appears in this file;
// presumably it lives in the accompanying CUDA source -- TODO confirm.
// Takes the input tensor, the up/down filter tensors and the alpha/beta
// tensors by const reference and returns a tensor.
extern "C" torch::Tensor fwd_cuda(torch::Tensor const &input, torch::Tensor const &up_filter, torch::Tensor const &down_filter, torch::Tensor const &alpha, torch::Tensor const &beta);

// Defines the Python extension module named by TORCH_EXTENSION_NAME and
// exposes fwd_cuda to Python under the name "forward".
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &fwd_cuda, "Anti-Alias Activation forward (CUDA)");
}


--------------------------------------------------------------------------------
/tools/seed-vc/modules/bigvgan/alias_free_activation/cuda/compat.h:
--------------------------------------------------------------------------------
 1 | /* coding=utf-8
 2 |  * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | /*This code is copied from NVIDIA apex:
18 |  *     https://github.com/NVIDIA/apex
19 |  *     with minor changes. */
20 | 
/* Fall back to AT_CHECK when TORCH_CHECK has not been defined by an earlier header. */
#ifndef TORCH_CHECK
#define TORCH_CHECK AT_CHECK
#endif

/* DATA_PTR expands to `data_ptr` when VERSION_GE_1_3 is defined and to `data`
 * otherwise; presumably used as the tensor data accessor name -- TODO confirm
 * against the call sites. */
#ifdef VERSION_GE_1_3
#define DATA_PTR data_ptr
#else
#define DATA_PTR data
#endif
30 | 


--------------------------------------------------------------------------------
/tools/seed-vc/modules/bigvgan/alias_free_activation/torch/__init__.py:
--------------------------------------------------------------------------------
1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
2 | #   LICENSE is in incl_licenses directory.
3 | 
4 | from .filter import *
5 | from .resample import *
6 | from .act import *
7 | 


--------------------------------------------------------------------------------
/tools/seed-vc/modules/bigvgan/alias_free_activation/torch/act.py:
--------------------------------------------------------------------------------
 1 | # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
 2 | #   LICENSE is in incl_licenses directory.
 3 | 
 4 | import torch.nn as nn
 5 | from .resample import UpSample1d, DownSample1d
 6 | 
 7 | 
 8 | class Activation1d(nn.Module):
 9 |     def __init__(
10 |         self,
11 |         activation,
12 |         up_ratio: int = 2,
13 |         down_ratio: int = 2,
14 |         up_kernel_size: int = 12,
15 |         down_kernel_size: int = 12,
16 |     ):
17 |         super().__init__()
18 |         self.up_ratio = up_ratio
19 |         self.down_ratio = down_ratio
20 |         self.act = activation
21 |         self.upsample = UpSample1d(up_ratio, up_kernel_size)
22 |         self.downsample = DownSample1d(down_ratio, down_kernel_size)
23 | 
24 |     # x: [B,C,T]
25 |     def forward(self, x):
26 |         x = self.upsample(x)
27 |         x = self.act(x)
28 |         x = self.downsample(x)
29 | 
30 |         return x
31 | 


--------------------------------------------------------------------------------
/tools/seed-vc/modules/bigvgan/config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "resblock": "1",
 3 |     "num_gpus": 0,
 4 |     "batch_size": 32,
 5 |     "learning_rate": 0.0001,
 6 |     "adam_b1": 0.8,
 7 |     "adam_b2": 0.99,
 8 |     "lr_decay": 0.9999996,
 9 |     "seed": 1234,
10 | 
11 |     "upsample_rates": [4,4,2,2,2,2],
12 |     "upsample_kernel_sizes": [8,8,4,4,4,4],
13 |     "upsample_initial_channel": 1536,
14 |     "resblock_kernel_sizes": [3,7,11],
15 |     "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
16 | 
17 |     "use_tanh_at_final": false,
18 |     "use_bias_at_final": false,
19 | 
20 |     "activation": "snakebeta",
21 |     "snake_logscale": true,
22 | 
23 |     "use_cqtd_instead_of_mrd": true,
24 |     "cqtd_filters": 128,
25 |     "cqtd_max_filters": 1024,
26 |     "cqtd_filters_scale": 1,
27 |     "cqtd_dilations": [1, 2, 4],
28 |     "cqtd_hop_lengths": [512, 256, 256],
29 |     "cqtd_n_octaves": [9, 9, 9],
30 |     "cqtd_bins_per_octaves": [24, 36, 48],
31 | 
32 |     "mpd_reshapes": [2, 3, 5, 7, 11],
33 |     "use_spectral_norm": false,
34 |     "discriminator_channel_mult": 1,
35 |     
36 |     "use_multiscale_melloss": true,
37 |     "lambda_melloss": 15,
38 | 
39 |     "clip_grad_norm": 500,
40 | 
41 |     "segment_size": 65536,
42 |     "num_mels": 80,
43 |     "num_freq": 1025,
44 |     "n_fft": 1024,
45 |     "hop_size": 256,
46 |     "win_size": 1024,
47 | 
48 |     "sampling_rate": 22050,
49 | 
50 |     "fmin": 0,
51 |     "fmax": null,
52 |     "fmax_for_loss": null,
53 | 
54 |     "normalize_volume": true,
55 | 
56 |     "num_workers": 4,
57 | 
58 |     "dist_config": {
59 |         "dist_backend": "nccl",
60 |         "dist_url": "tcp://localhost:54321",
61 |         "world_size": 1
62 |     }
63 | }
64 | 


--------------------------------------------------------------------------------
/tools/seed-vc/modules/bigvgan/env.py:
--------------------------------------------------------------------------------
 1 | # Adapted from https://github.com/jik876/hifi-gan under the MIT license.
 2 | #   LICENSE is in incl_licenses directory.
 3 | 
 4 | import os
 5 | import shutil
 6 | 
 7 | 
 8 | class AttrDict(dict):
 9 |     def __init__(self, *args, **kwargs):
10 |         super(AttrDict, self).__init__(*args, **kwargs)
11 |         self.__dict__ = self
12 | 
13 | 
def build_env(config, config_name, path):
    """Copy a config file into ``path`` under the name ``config_name``.

    Args:
        config: Path of the source config file.
        config_name: File name to use for the copy inside ``path``.
        path: Destination directory; created if it does not exist.

    The copy is skipped when the source already is the destination. This is
    detected both by string equality and by resolving the two paths to the
    same file, so differently spelled paths to one file (``./cfg.json``,
    relative vs. absolute, symlinks) no longer make ``shutil.copyfile`` raise
    ``SameFileError``.

    Raises:
        FileNotFoundError: If ``config`` does not exist.
    """
    t_path = os.path.join(path, config_name)
    if config == t_path:
        return

    os.makedirs(path, exist_ok=True)
    # samefile() needs both paths to exist, so only consult it once the target
    # is known to be present.
    if os.path.exists(t_path) and os.path.samefile(config, t_path):
        return
    shutil.copyfile(config, t_path)


--------------------------------------------------------------------------------
/tools/seed-vc/modules/openvoice/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/seed-vc/modules/openvoice/__init__.py


--------------------------------------------------------------------------------
/tools/seed-vc/modules/openvoice/checkpoints_v2/converter/config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "_version_": "v2",
 3 |   "data": {
 4 |     "sampling_rate": 22050,
 5 |     "filter_length": 1024,
 6 |     "hop_length": 256,
 7 |     "win_length": 1024,
 8 |     "n_speakers": 0
 9 |   },
10 |   "model": {
11 |     "zero_g": true,
12 |     "inter_channels": 192,
13 |     "hidden_channels": 192,
14 |     "filter_channels": 768,
15 |     "n_heads": 2,
16 |     "n_layers": 6,
17 |     "kernel_size": 3,
18 |     "p_dropout": 0.1,
19 |     "resblock": "1",
20 |     "resblock_kernel_sizes": [
21 |       3,
22 |       7,
23 |       11
24 |     ],
25 |     "resblock_dilation_sizes": [
26 |       [
27 |         1,
28 |         3,
29 |         5
30 |       ],
31 |       [
32 |         1,
33 |         3,
34 |         5
35 |       ],
36 |       [
37 |         1,
38 |         3,
39 |         5
40 |       ]
41 |     ],
42 |     "upsample_rates": [
43 |       8,
44 |       8,
45 |       2,
46 |       2
47 |     ],
48 |     "upsample_initial_channel": 512,
49 |     "upsample_kernel_sizes": [
50 |       16,
51 |       16,
52 |       4,
53 |       4
54 |     ],
55 |     "gin_channels": 256
56 |   }
57 | }


--------------------------------------------------------------------------------
/tools/seed-vc/modules/vocos/__init__.py:
--------------------------------------------------------------------------------
1 | from .pretrained import Vocos
2 | 
3 | 
4 | __version__ = "0.1.0"
5 | 


--------------------------------------------------------------------------------
/tools/seed-vc/requirements-mac.txt:
--------------------------------------------------------------------------------
 1 | --extra-index-url https://download.pytorch.org/whl/cu121
 2 | torch --pre --extra-index-url https://download.pytorch.org/whl/nightly/cpu
 3 | torchvision --pre --extra-index-url https://download.pytorch.org/whl/nightly/cpu
 4 | torchaudio --pre --extra-index-url https://download.pytorch.org/whl/nightly/cpu
 5 | scipy==1.13.1
 6 | librosa==0.10.2
 7 | huggingface-hub==0.23.4
 8 | munch==4.0.0
 9 | einops==0.8.0
10 | descript-audio-codec==1.0.0
11 | gradio==4.44.0
12 | pydub==0.25.1
13 | resemblyzer
14 | jiwer==3.0.3
15 | transformers==4.46.3
16 | FreeSimpleGUI==5.1.1
17 | soundfile==0.12.1
18 | sounddevice==0.5.0
19 | modelscope==1.18.1
20 | funasr==1.1.5
21 | numpy==1.26.4
22 | pyyaml
23 | python-dotenv
24 | 


--------------------------------------------------------------------------------
/tools/seed-vc/requirements.txt:
--------------------------------------------------------------------------------
 1 | --extra-index-url https://download.pytorch.org/whl/cu121
 2 | torch==2.4.0
 3 | torchvision==0.19.0
 4 | torchaudio==2.4.0
 5 | scipy==1.13.1
 6 | librosa==0.10.2
 7 | huggingface-hub==0.23.4
 8 | munch==4.0.0
 9 | einops==0.8.0
10 | descript-audio-codec==1.0.0
11 | gradio==4.44.0
12 | pydub==0.25.1
13 | resemblyzer
14 | jiwer==3.0.3
15 | transformers==4.46.3
16 | FreeSimpleGUI==5.1.1
17 | soundfile==0.12.1
18 | sounddevice==0.5.0
19 | modelscope==1.18.1
20 | funasr==1.1.5
21 | numpy==1.26.4
22 | pyyaml
23 | python-dotenv
24 | 


--------------------------------------------------------------------------------
/tools/videorag/__init__.py:
--------------------------------------------------------------------------------
1 | from .videoragcontent import VideoRAG, QueryParam


--------------------------------------------------------------------------------
/tools/videorag/__pycache__/__init__.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/videorag/__pycache__/__init__.cpython-310.pyc


--------------------------------------------------------------------------------
/tools/videorag/__pycache__/_opcontent.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/videorag/__pycache__/_opcontent.cpython-310.pyc


--------------------------------------------------------------------------------
/tools/videorag/__pycache__/_utils.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/videorag/__pycache__/_utils.cpython-310.pyc


--------------------------------------------------------------------------------
/tools/videorag/__pycache__/base.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/videorag/__pycache__/base.cpython-310.pyc


--------------------------------------------------------------------------------
/tools/videorag/__pycache__/videoragcontent.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/videorag/__pycache__/videoragcontent.cpython-310.pyc


--------------------------------------------------------------------------------
/tools/videorag/_storage/__init__.py:
--------------------------------------------------------------------------------
1 | from .vdb_nanovectordb import NanoVectorDBVideoSegmentStorage
2 | from .kv_json import JsonKVStorage
3 | 


--------------------------------------------------------------------------------
/tools/videorag/_storage/__pycache__/__init__.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/videorag/_storage/__pycache__/__init__.cpython-310.pyc


--------------------------------------------------------------------------------
/tools/videorag/_storage/__pycache__/kv_json.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/videorag/_storage/__pycache__/kv_json.cpython-310.pyc


--------------------------------------------------------------------------------
/tools/videorag/_storage/__pycache__/vdb_nanovectordb.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/videorag/_storage/__pycache__/vdb_nanovectordb.cpython-310.pyc


--------------------------------------------------------------------------------
/tools/videorag/_storage/kv_json.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from dataclasses import dataclass
 3 | 
 4 | from .._utils import load_json, logger, write_json
 5 | from ..base import (
 6 |     BaseKVStorage,
 7 | )
 8 | 
 9 | 
@dataclass
class JsonKVStorage(BaseKVStorage):
    """Key-value storage kept in an in-memory dict and persisted as a JSON file.

    The file is ``kv_store_<namespace>.json`` inside the configured
    ``working_dir``. Data is loaded once in ``__post_init__`` and written back
    only when ``index_done_callback`` is awaited.
    """

    def __post_init__(self):
        # One backing file per namespace, located under the working directory.
        self._file_name = os.path.join(
            self.global_config["working_dir"], f"kv_store_{self.namespace}.json"
        )
        stored = load_json(self._file_name)
        # A missing file (or an empty/falsy payload) starts from an empty store.
        self._data = stored or {}
        logger.info(f"Load KV {self.namespace} with {len(self._data)} data")

    async def all_keys(self) -> list[str]:
        """Return every key currently held in memory."""
        return [key for key in self._data]

    async def index_done_callback(self):
        """Write the in-memory data to the backing JSON file."""
        write_json(self._data, self._file_name)

    async def get_by_id(self, id):
        """Return the value stored under ``id``, or ``None`` when absent."""
        return self._data.get(id)

    async def get_by_ids(self, ids, fields=None):
        """Return one entry per id, in order.

        Without ``fields`` each entry is the stored value or ``None``. With
        ``fields`` each entry is the stored record restricted to those keys;
        absent or falsy records yield ``None``.
        """
        if fields is None:
            return [self._data.get(key) for key in ids]

        selected = []
        for key in ids:
            record = self._data.get(key)
            if record:
                selected.append(
                    {name: value for name, value in record.items() if name in fields}
                )
            else:
                selected.append(None)
        return selected

    async def filter_keys(self, data: list[str]) -> set[str]:
        """Return the subset of ``data`` that is not yet stored."""
        return {candidate for candidate in data if candidate not in self._data}

    async def upsert(self, data: dict[str, dict]):
        """Merge ``data`` into the in-memory store, overwriting existing keys."""
        self._data.update(data)

    async def drop(self):
        """Discard all in-memory data (the file is not touched here)."""
        self._data = dict()
47 | 


--------------------------------------------------------------------------------
/tools/videorag/_utils.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | import html
 3 | import json
 4 | import logging
 5 | import os
 6 | import re
 7 | import numbers
 8 | from dataclasses import dataclass
 9 | from functools import wraps
10 | from hashlib import md5
11 | from typing import Any, Union
12 | 
13 | import numpy as np
14 | import tiktoken
15 | 
16 | logger = logging.getLogger("nano-graphrag")
17 | ENCODER = None
18 | 
19 | 
def always_get_an_event_loop() -> asyncio.AbstractEventLoop:
    """Return an event loop, creating and installing one if none is available.

    ``asyncio.get_event_loop()`` is tried first. If it raises ``RuntimeError``
    (for example in a sub-thread), a new loop is created, set as the current
    loop, and returned.
    """
    try:
        # Reuse whatever loop asyncio already hands out.
        return asyncio.get_event_loop()
    except RuntimeError:
        # If in a sub-thread, create a new event loop.
        logger.info("Creating a new event loop in a sub-thread.")

    fresh_loop = asyncio.new_event_loop()
    asyncio.set_event_loop(fresh_loop)
    return fresh_loop
30 | 
31 | 
32 | 
def write_json(json_obj, file_name):
    """Serialize ``json_obj`` to ``file_name`` as indented UTF-8 JSON.

    Args:
        json_obj: JSON-serializable object to write.
        file_name: Destination path; overwritten if it exists.

    Raises:
        TypeError / ValueError: If ``json_obj`` cannot be serialized. The
            destination file is left untouched in that case.
    """
    # Serialize before opening: opening with "w" truncates the file, so doing
    # it first would wipe the existing content whenever serialization fails.
    payload = json.dumps(json_obj, indent=2, ensure_ascii=False)
    with open(file_name, "w", encoding="utf-8") as f:
        f.write(payload)
36 | 
37 | 
def load_json(file_name):
    """Parse and return the JSON content of ``file_name``.

    Returns ``None`` when the path does not exist; the file is read as UTF-8.
    """
    if os.path.exists(file_name):
        with open(file_name, encoding="utf-8") as handle:
            return json.loads(handle.read())
    return None
43 | 
44 | 
45 | 
46 | 
47 | # Utils types -----------------------------------------------------------------------
@dataclass
class EmbeddingFunc:
    """Awaitable wrapper that bundles an async embedding function with size metadata.

    ``embedding_dim`` and ``max_token_size`` are stored as given; this class
    does not read them itself.
    """

    embedding_dim: int
    max_token_size: int
    # The wrapped function; it must return an awaitable because __call__ awaits it.
    func: callable

    async def __call__(self, *args, **kwargs) -> np.ndarray:
        """Await ``func`` with the given arguments and return its result."""
        embedded = await self.func(*args, **kwargs)
        return embedded
56 | 
57 | 
58 | 


--------------------------------------------------------------------------------
/tools/videorag/_videoutil/__init__.py:
--------------------------------------------------------------------------------
1 | from .split import split_video, saving_video_segments
2 | from .asr import speech_to_text
3 | from .caption import segment_caption, merge_segment_information
4 | from .feature import encode_video_segments, encode_string_query


--------------------------------------------------------------------------------
/tools/videorag/_videoutil/__pycache__/__init__.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/videorag/_videoutil/__pycache__/__init__.cpython-310.pyc


--------------------------------------------------------------------------------
/tools/videorag/_videoutil/__pycache__/asr.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/videorag/_videoutil/__pycache__/asr.cpython-310.pyc


--------------------------------------------------------------------------------
/tools/videorag/_videoutil/__pycache__/caption.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/videorag/_videoutil/__pycache__/caption.cpython-310.pyc


--------------------------------------------------------------------------------
/tools/videorag/_videoutil/__pycache__/feature.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/videorag/_videoutil/__pycache__/feature.cpython-310.pyc


--------------------------------------------------------------------------------
/tools/videorag/_videoutil/__pycache__/split.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HKUDS/VideoAgent/043f2d52e9b246682c7a69b08e0256492a12ae8f/tools/videorag/_videoutil/__pycache__/split.cpython-310.pyc


--------------------------------------------------------------------------------
/tools/videorag/_videoutil/feature.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import torch
 3 | import pickle
 4 | from tqdm import tqdm
 5 | from imagebind import data
 6 | from imagebind.models import imagebind_model
 7 | from imagebind.models.imagebind_model import ImageBindModel, ModalityType
 8 | 
 9 | 
def encode_video_segments(video_paths, embedder: ImageBindModel):
    """Embed video files with ``embedder`` and return the vision embeddings on CPU.

    Args:
        video_paths: Passed to ``data.load_and_transform_video_data``.
        embedder: Model used for encoding; inputs are loaded onto the device of
            its first parameter.

    Returns:
        The ``ModalityType.VISION`` output of the model, moved to CPU.
    """
    target_device = next(embedder.parameters()).device
    vision_batch = data.load_and_transform_video_data(video_paths, target_device)
    # Gradients are not tracked for the forward pass.
    with torch.no_grad():
        vision_embeddings = embedder({ModalityType.VISION: vision_batch})[ModalityType.VISION]
    return vision_embeddings.cpu()
19 | 
def encode_string_query(query:str, embedder: ImageBindModel):
    """Embed a single text query with ``embedder`` and return the result on CPU.

    Args:
        query: Query string; wrapped in a one-element list before loading.
        embedder: Model used for encoding; inputs are loaded onto the device of
            its first parameter.

    Returns:
        The ``ModalityType.TEXT`` output of the model, moved to CPU.
    """
    target_device = next(embedder.parameters()).device
    text_batch = data.load_and_transform_text([query], target_device)
    # Gradients are not tracked for the forward pass.
    with torch.no_grad():
        text_embeddings = embedder({ModalityType.TEXT: text_batch})[ModalityType.TEXT]
    return text_embeddings.cpu()


--------------------------------------------------------------------------------