├── .gitignore ├── LICENSE ├── README.md ├── __pycache__ └── models.cpython-310.pyc ├── audioldm ├── __init__.py ├── __main__.py ├── __pycache__ │ ├── __init__.cpython-310.pyc │ ├── ldm.cpython-310.pyc │ ├── pipeline.cpython-310.pyc │ └── utils.cpython-310.pyc ├── audio │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-310.pyc │ │ ├── audio_processing.cpython-310.pyc │ │ ├── stft.cpython-310.pyc │ │ └── tools.cpython-310.pyc │ ├── audio_processing.py │ ├── stft.py │ └── tools.py ├── clap │ ├── __init__.py │ ├── encoders.py │ ├── open_clip │ │ ├── __init__.py │ │ ├── bert.py │ │ ├── bpe_simple_vocab_16e6.txt.gz │ │ ├── factory.py │ │ ├── feature_fusion.py │ │ ├── htsat.py │ │ ├── linear_probe.py │ │ ├── loss.py │ │ ├── model.py │ │ ├── model_configs │ │ │ ├── HTSAT-base.json │ │ │ ├── HTSAT-large.json │ │ │ ├── HTSAT-tiny-win-1536.json │ │ │ ├── HTSAT-tiny.json │ │ │ ├── PANN-10.json │ │ │ ├── PANN-14-fmax-18k.json │ │ │ ├── PANN-14-fmax-8k-20s.json │ │ │ ├── PANN-14-tiny-transformer.json │ │ │ ├── PANN-14-win-1536.json │ │ │ ├── PANN-14.json │ │ │ ├── PANN-6.json │ │ │ ├── RN101-quickgelu.json │ │ │ ├── RN101.json │ │ │ ├── RN50-quickgelu.json │ │ │ ├── RN50.json │ │ │ ├── RN50x16.json │ │ │ ├── RN50x4.json │ │ │ ├── ViT-B-16.json │ │ │ ├── ViT-B-32-quickgelu.json │ │ │ ├── ViT-B-32.json │ │ │ └── ViT-L-14.json │ │ ├── openai.py │ │ ├── pann_model.py │ │ ├── pretrained.py │ │ ├── timm_model.py │ │ ├── tokenizer.py │ │ ├── transform.py │ │ ├── utils.py │ │ └── version.py │ └── training │ │ ├── __init__.py │ │ ├── audioset_textmap.npy │ │ ├── data.py │ │ ├── distributed.py │ │ ├── imagenet_zeroshot_data.py │ │ ├── infer_demo.py │ │ ├── logger.py │ │ ├── lp_main.py │ │ ├── lp_train.py │ │ ├── main.py │ │ ├── params.py │ │ ├── scheduler.py │ │ ├── train.py │ │ └── zero_shot.py ├── hifigan │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-310.pyc │ │ ├── models.cpython-310.pyc │ │ └── utilities.cpython-310.pyc │ ├── models.py │ └── utilities.py ├── latent_diffusion │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-310.pyc │ │ ├── attention.cpython-310.pyc │ │ ├── ddim.cpython-310.pyc │ │ ├── ddpm.cpython-310.pyc │ │ ├── ema.cpython-310.pyc │ │ └── util.cpython-310.pyc │ ├── attention.py │ ├── ddim.py │ ├── ddpm.py │ ├── ema.py │ ├── openaimodel.py │ └── util.py ├── ldm.py ├── pipeline.py ├── utils.py └── variational_autoencoder │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-310.pyc │ ├── autoencoder.cpython-310.pyc │ ├── distributions.cpython-310.pyc │ └── modules.cpython-310.pyc │ ├── autoencoder.py │ ├── distributions.py │ └── modules.py ├── configs ├── diffusion_model_config.json ├── diffusion_model_config_large.json ├── diffusion_model_config_large_2048.json ├── diffusion_model_config_pretrain.json ├── stable-diffusion-2-1.scheduler_48k.json ├── stable_diffusion_2.1.json └── stable_diffusion_sdxl_scheduler_config.json ├── data ├── -26aVYRtEAc_000030.mp4 ├── -BAKe6QGTUk_000030.mp4 ├── -yoaSondvkw_000071.mp4 ├── 0Bp8c3PfAAA_000053.mp4 └── 0DCit2EBtjs_000030.mp4 ├── inference_from_video.py ├── inference_from_video.sh ├── models.py ├── outputs └── vta-ldm-clip4clip-v-large │ └── 1720614438_vta-ldm-clip4clip-v-large_steps_300_guidance_3.0_sampleRate_16000_augment │ ├── -26aVYRtEAc_000030.wav │ ├── -BAKe6QGTUk_000030.wav │ ├── -yoaSondvkw_000071.wav │ ├── 0Bp8c3PfAAA_000053.wav │ └── 0DCit2EBtjs_000030.wav ├── requirements.txt ├── requirements_for_training.txt ├── tools ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-310.pyc │ ├── mix.cpython-310.pyc │ └── torch_tools.cpython-310.pyc ├── base_config.py ├── data_tools.py ├── get_audio_from_video.sh ├── merge_video_audio.sh ├── mix.py ├── show_audio_spec.ipynb └── torch_tools.py ├── train.py └── train.sh /.gitignore: -------------------------------------------------------------------------------- 1 | ckpt/ 2 | outputs/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/README.md -------------------------------------------------------------------------------- /__pycache__/models.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/__pycache__/models.cpython-310.pyc -------------------------------------------------------------------------------- /audioldm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/__init__.py -------------------------------------------------------------------------------- /audioldm/__main__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/__main__.py -------------------------------------------------------------------------------- /audioldm/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /audioldm/__pycache__/ldm.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/__pycache__/ldm.cpython-310.pyc -------------------------------------------------------------------------------- /audioldm/__pycache__/pipeline.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/__pycache__/pipeline.cpython-310.pyc -------------------------------------------------------------------------------- /audioldm/__pycache__/utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/__pycache__/utils.cpython-310.pyc -------------------------------------------------------------------------------- /audioldm/audio/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/audio/__init__.py -------------------------------------------------------------------------------- /audioldm/audio/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/audio/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /audioldm/audio/__pycache__/audio_processing.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/audio/__pycache__/audio_processing.cpython-310.pyc -------------------------------------------------------------------------------- /audioldm/audio/__pycache__/stft.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/audio/__pycache__/stft.cpython-310.pyc -------------------------------------------------------------------------------- /audioldm/audio/__pycache__/tools.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/audio/__pycache__/tools.cpython-310.pyc -------------------------------------------------------------------------------- /audioldm/audio/audio_processing.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/audio/audio_processing.py -------------------------------------------------------------------------------- /audioldm/audio/stft.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/audio/stft.py -------------------------------------------------------------------------------- /audioldm/audio/tools.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/audio/tools.py -------------------------------------------------------------------------------- /audioldm/clap/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /audioldm/clap/encoders.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/clap/encoders.py -------------------------------------------------------------------------------- /audioldm/clap/open_clip/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/clap/open_clip/__init__.py -------------------------------------------------------------------------------- /audioldm/clap/open_clip/bert.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/clap/open_clip/bert.py -------------------------------------------------------------------------------- /audioldm/clap/open_clip/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/clap/open_clip/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /audioldm/clap/open_clip/factory.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/clap/open_clip/factory.py -------------------------------------------------------------------------------- /audioldm/clap/open_clip/feature_fusion.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/clap/open_clip/feature_fusion.py -------------------------------------------------------------------------------- /audioldm/clap/open_clip/htsat.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/clap/open_clip/htsat.py -------------------------------------------------------------------------------- /audioldm/clap/open_clip/linear_probe.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/clap/open_clip/linear_probe.py -------------------------------------------------------------------------------- /audioldm/clap/open_clip/loss.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/clap/open_clip/loss.py -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/clap/open_clip/model.py -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/HTSAT-base.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/clap/open_clip/model_configs/HTSAT-base.json -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/HTSAT-large.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/clap/open_clip/model_configs/HTSAT-large.json -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/HTSAT-tiny-win-1536.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/clap/open_clip/model_configs/HTSAT-tiny-win-1536.json -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/HTSAT-tiny.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/clap/open_clip/model_configs/HTSAT-tiny.json -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/PANN-10.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/clap/open_clip/model_configs/PANN-10.json -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/PANN-14-fmax-18k.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/clap/open_clip/model_configs/PANN-14-fmax-18k.json -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/PANN-14-fmax-8k-20s.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/clap/open_clip/model_configs/PANN-14-fmax-8k-20s.json -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/PANN-14-tiny-transformer.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/clap/open_clip/model_configs/PANN-14-tiny-transformer.json -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/PANN-14-win-1536.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/clap/open_clip/model_configs/PANN-14-win-1536.json -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/PANN-14.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/clap/open_clip/model_configs/PANN-14.json -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/PANN-6.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/clap/open_clip/model_configs/PANN-6.json -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/RN101-quickgelu.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/clap/open_clip/model_configs/RN101-quickgelu.json -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/RN101.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/clap/open_clip/model_configs/RN101.json -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/RN50-quickgelu.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/clap/open_clip/model_configs/RN50-quickgelu.json -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/RN50.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/clap/open_clip/model_configs/RN50.json -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/RN50x16.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/clap/open_clip/model_configs/RN50x16.json -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/RN50x4.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/clap/open_clip/model_configs/RN50x4.json -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/ViT-B-16.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/clap/open_clip/model_configs/ViT-B-16.json -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/ViT-B-32-quickgelu.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/clap/open_clip/model_configs/ViT-B-32-quickgelu.json -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/ViT-B-32.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/clap/open_clip/model_configs/ViT-B-32.json -------------------------------------------------------------------------------- /audioldm/clap/open_clip/model_configs/ViT-L-14.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/clap/open_clip/model_configs/ViT-L-14.json -------------------------------------------------------------------------------- /audioldm/clap/open_clip/openai.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/clap/open_clip/openai.py -------------------------------------------------------------------------------- /audioldm/clap/open_clip/pann_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/clap/open_clip/pann_model.py -------------------------------------------------------------------------------- /audioldm/clap/open_clip/pretrained.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/clap/open_clip/pretrained.py -------------------------------------------------------------------------------- /audioldm/clap/open_clip/timm_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/clap/open_clip/timm_model.py -------------------------------------------------------------------------------- /audioldm/clap/open_clip/tokenizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/clap/open_clip/tokenizer.py -------------------------------------------------------------------------------- /audioldm/clap/open_clip/transform.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/clap/open_clip/transform.py -------------------------------------------------------------------------------- /audioldm/clap/open_clip/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/clap/open_clip/utils.py -------------------------------------------------------------------------------- /audioldm/clap/open_clip/version.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.2.1" 2 | -------------------------------------------------------------------------------- /audioldm/clap/training/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /audioldm/clap/training/audioset_textmap.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/clap/training/audioset_textmap.npy -------------------------------------------------------------------------------- /audioldm/clap/training/data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/clap/training/data.py -------------------------------------------------------------------------------- /audioldm/clap/training/distributed.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/clap/training/distributed.py -------------------------------------------------------------------------------- /audioldm/clap/training/imagenet_zeroshot_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/clap/training/imagenet_zeroshot_data.py -------------------------------------------------------------------------------- /audioldm/clap/training/infer_demo.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/clap/training/infer_demo.py -------------------------------------------------------------------------------- /audioldm/clap/training/logger.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/clap/training/logger.py -------------------------------------------------------------------------------- /audioldm/clap/training/lp_main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/clap/training/lp_main.py -------------------------------------------------------------------------------- /audioldm/clap/training/lp_train.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/clap/training/lp_train.py -------------------------------------------------------------------------------- /audioldm/clap/training/main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/clap/training/main.py -------------------------------------------------------------------------------- /audioldm/clap/training/params.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/clap/training/params.py -------------------------------------------------------------------------------- /audioldm/clap/training/scheduler.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/clap/training/scheduler.py -------------------------------------------------------------------------------- /audioldm/clap/training/train.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/clap/training/train.py -------------------------------------------------------------------------------- /audioldm/clap/training/zero_shot.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/clap/training/zero_shot.py -------------------------------------------------------------------------------- /audioldm/hifigan/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/hifigan/__init__.py -------------------------------------------------------------------------------- /audioldm/hifigan/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/hifigan/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /audioldm/hifigan/__pycache__/models.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/hifigan/__pycache__/models.cpython-310.pyc -------------------------------------------------------------------------------- /audioldm/hifigan/__pycache__/utilities.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/hifigan/__pycache__/utilities.cpython-310.pyc -------------------------------------------------------------------------------- /audioldm/hifigan/models.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/hifigan/models.py -------------------------------------------------------------------------------- /audioldm/hifigan/utilities.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/hifigan/utilities.py -------------------------------------------------------------------------------- /audioldm/latent_diffusion/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /audioldm/latent_diffusion/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/latent_diffusion/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /audioldm/latent_diffusion/__pycache__/attention.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/latent_diffusion/__pycache__/attention.cpython-310.pyc -------------------------------------------------------------------------------- /audioldm/latent_diffusion/__pycache__/ddim.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/latent_diffusion/__pycache__/ddim.cpython-310.pyc -------------------------------------------------------------------------------- /audioldm/latent_diffusion/__pycache__/ddpm.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/latent_diffusion/__pycache__/ddpm.cpython-310.pyc -------------------------------------------------------------------------------- /audioldm/latent_diffusion/__pycache__/ema.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/latent_diffusion/__pycache__/ema.cpython-310.pyc -------------------------------------------------------------------------------- /audioldm/latent_diffusion/__pycache__/util.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/latent_diffusion/__pycache__/util.cpython-310.pyc -------------------------------------------------------------------------------- /audioldm/latent_diffusion/attention.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/latent_diffusion/attention.py -------------------------------------------------------------------------------- /audioldm/latent_diffusion/ddim.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/latent_diffusion/ddim.py -------------------------------------------------------------------------------- /audioldm/latent_diffusion/ddpm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/latent_diffusion/ddpm.py -------------------------------------------------------------------------------- /audioldm/latent_diffusion/ema.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/latent_diffusion/ema.py -------------------------------------------------------------------------------- /audioldm/latent_diffusion/openaimodel.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/latent_diffusion/openaimodel.py -------------------------------------------------------------------------------- /audioldm/latent_diffusion/util.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/latent_diffusion/util.py -------------------------------------------------------------------------------- /audioldm/ldm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/ldm.py -------------------------------------------------------------------------------- /audioldm/pipeline.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/pipeline.py -------------------------------------------------------------------------------- /audioldm/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/utils.py -------------------------------------------------------------------------------- /audioldm/variational_autoencoder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/variational_autoencoder/__init__.py -------------------------------------------------------------------------------- /audioldm/variational_autoencoder/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/variational_autoencoder/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /audioldm/variational_autoencoder/__pycache__/autoencoder.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/variational_autoencoder/__pycache__/autoencoder.cpython-310.pyc -------------------------------------------------------------------------------- /audioldm/variational_autoencoder/__pycache__/distributions.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/variational_autoencoder/__pycache__/distributions.cpython-310.pyc -------------------------------------------------------------------------------- /audioldm/variational_autoencoder/__pycache__/modules.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/variational_autoencoder/__pycache__/modules.cpython-310.pyc -------------------------------------------------------------------------------- /audioldm/variational_autoencoder/autoencoder.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/variational_autoencoder/autoencoder.py -------------------------------------------------------------------------------- /audioldm/variational_autoencoder/distributions.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/variational_autoencoder/distributions.py -------------------------------------------------------------------------------- /audioldm/variational_autoencoder/modules.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/audioldm/variational_autoencoder/modules.py -------------------------------------------------------------------------------- /configs/diffusion_model_config.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/configs/diffusion_model_config.json -------------------------------------------------------------------------------- /configs/diffusion_model_config_large.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/configs/diffusion_model_config_large.json -------------------------------------------------------------------------------- /configs/diffusion_model_config_large_2048.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/configs/diffusion_model_config_large_2048.json -------------------------------------------------------------------------------- /configs/diffusion_model_config_pretrain.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/configs/diffusion_model_config_pretrain.json -------------------------------------------------------------------------------- /configs/stable-diffusion-2-1.scheduler_48k.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/configs/stable-diffusion-2-1.scheduler_48k.json -------------------------------------------------------------------------------- /configs/stable_diffusion_2.1.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/configs/stable_diffusion_2.1.json -------------------------------------------------------------------------------- /configs/stable_diffusion_sdxl_scheduler_config.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/configs/stable_diffusion_sdxl_scheduler_config.json -------------------------------------------------------------------------------- /data/-26aVYRtEAc_000030.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/data/-26aVYRtEAc_000030.mp4 -------------------------------------------------------------------------------- /data/-BAKe6QGTUk_000030.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/data/-BAKe6QGTUk_000030.mp4 -------------------------------------------------------------------------------- /data/-yoaSondvkw_000071.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/data/-yoaSondvkw_000071.mp4 -------------------------------------------------------------------------------- /data/0Bp8c3PfAAA_000053.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/data/0Bp8c3PfAAA_000053.mp4 -------------------------------------------------------------------------------- /data/0DCit2EBtjs_000030.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/data/0DCit2EBtjs_000030.mp4 -------------------------------------------------------------------------------- /inference_from_video.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/inference_from_video.py -------------------------------------------------------------------------------- /inference_from_video.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/inference_from_video.sh -------------------------------------------------------------------------------- /models.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/models.py -------------------------------------------------------------------------------- /outputs/vta-ldm-clip4clip-v-large/1720614438_vta-ldm-clip4clip-v-large_steps_300_guidance_3.0_sampleRate_16000_augment/-26aVYRtEAc_000030.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/outputs/vta-ldm-clip4clip-v-large/1720614438_vta-ldm-clip4clip-v-large_steps_300_guidance_3.0_sampleRate_16000_augment/-26aVYRtEAc_000030.wav -------------------------------------------------------------------------------- /outputs/vta-ldm-clip4clip-v-large/1720614438_vta-ldm-clip4clip-v-large_steps_300_guidance_3.0_sampleRate_16000_augment/-BAKe6QGTUk_000030.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/outputs/vta-ldm-clip4clip-v-large/1720614438_vta-ldm-clip4clip-v-large_steps_300_guidance_3.0_sampleRate_16000_augment/-BAKe6QGTUk_000030.wav -------------------------------------------------------------------------------- /outputs/vta-ldm-clip4clip-v-large/1720614438_vta-ldm-clip4clip-v-large_steps_300_guidance_3.0_sampleRate_16000_augment/-yoaSondvkw_000071.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/outputs/vta-ldm-clip4clip-v-large/1720614438_vta-ldm-clip4clip-v-large_steps_300_guidance_3.0_sampleRate_16000_augment/-yoaSondvkw_000071.wav -------------------------------------------------------------------------------- /outputs/vta-ldm-clip4clip-v-large/1720614438_vta-ldm-clip4clip-v-large_steps_300_guidance_3.0_sampleRate_16000_augment/0Bp8c3PfAAA_000053.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/outputs/vta-ldm-clip4clip-v-large/1720614438_vta-ldm-clip4clip-v-large_steps_300_guidance_3.0_sampleRate_16000_augment/0Bp8c3PfAAA_000053.wav -------------------------------------------------------------------------------- /outputs/vta-ldm-clip4clip-v-large/1720614438_vta-ldm-clip4clip-v-large_steps_300_guidance_3.0_sampleRate_16000_augment/0DCit2EBtjs_000030.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/outputs/vta-ldm-clip4clip-v-large/1720614438_vta-ldm-clip4clip-v-large_steps_300_guidance_3.0_sampleRate_16000_augment/0DCit2EBtjs_000030.wav -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/requirements.txt -------------------------------------------------------------------------------- /requirements_for_training.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/requirements_for_training.txt -------------------------------------------------------------------------------- /tools/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tools/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/tools/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /tools/__pycache__/mix.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/tools/__pycache__/mix.cpython-310.pyc -------------------------------------------------------------------------------- /tools/__pycache__/torch_tools.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/tools/__pycache__/torch_tools.cpython-310.pyc -------------------------------------------------------------------------------- /tools/base_config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/tools/base_config.py -------------------------------------------------------------------------------- /tools/data_tools.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/tools/data_tools.py -------------------------------------------------------------------------------- /tools/get_audio_from_video.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/tools/get_audio_from_video.sh -------------------------------------------------------------------------------- /tools/merge_video_audio.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/tools/merge_video_audio.sh -------------------------------------------------------------------------------- /tools/mix.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/tools/mix.py -------------------------------------------------------------------------------- /tools/show_audio_spec.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/tools/show_audio_spec.ipynb -------------------------------------------------------------------------------- /tools/torch_tools.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/tools/torch_tools.py -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/train.py -------------------------------------------------------------------------------- /train.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ariesssxu/vta-ldm/HEAD/train.sh --------------------------------------------------------------------------------