├── LICENSE.md
├── README.md
├── TANGO.pdf
├── Tango_Google_Colab_demo.ipynb
├── audioldm
├── __init__.py
├── __main__.py
├── audio
│ ├── __init__.py
│ ├── audio_processing.py
│ ├── stft.py
│ └── tools.py
├── clap
│ ├── __init__.py
│ ├── encoders.py
│ ├── open_clip
│ │ ├── __init__.py
│ │ ├── bert.py
│ │ ├── bpe_simple_vocab_16e6.txt.gz
│ │ ├── factory.py
│ │ ├── feature_fusion.py
│ │ ├── htsat.py
│ │ ├── linear_probe.py
│ │ ├── loss.py
│ │ ├── model.py
│ │ ├── model_configs
│ │ │ ├── HTSAT-base.json
│ │ │ ├── HTSAT-large.json
│ │ │ ├── HTSAT-tiny-win-1536.json
│ │ │ ├── HTSAT-tiny.json
│ │ │ ├── PANN-10.json
│ │ │ ├── PANN-14-fmax-18k.json
│ │ │ ├── PANN-14-fmax-8k-20s.json
│ │ │ ├── PANN-14-tiny-transformer.json
│ │ │ ├── PANN-14-win-1536.json
│ │ │ ├── PANN-14.json
│ │ │ ├── PANN-6.json
│ │ │ ├── RN101-quickgelu.json
│ │ │ ├── RN101.json
│ │ │ ├── RN50-quickgelu.json
│ │ │ ├── RN50.json
│ │ │ ├── RN50x16.json
│ │ │ ├── RN50x4.json
│ │ │ ├── ViT-B-16.json
│ │ │ ├── ViT-B-32-quickgelu.json
│ │ │ ├── ViT-B-32.json
│ │ │ └── ViT-L-14.json
│ │ ├── openai.py
│ │ ├── pann_model.py
│ │ ├── pretrained.py
│ │ ├── timm_model.py
│ │ ├── tokenizer.py
│ │ ├── transform.py
│ │ ├── utils.py
│ │ └── version.py
│ └── training
│ │ ├── __init__.py
│ │ ├── audioset_textmap.npy
│ │ ├── data.py
│ │ ├── distributed.py
│ │ ├── imagenet_zeroshot_data.py
│ │ ├── infer_demo.py
│ │ ├── logger.py
│ │ ├── lp_main.py
│ │ ├── lp_train.py
│ │ ├── main.py
│ │ ├── params.py
│ │ ├── scheduler.py
│ │ ├── train.py
│ │ └── zero_shot.py
├── hifigan
│ ├── __init__.py
│ ├── models.py
│ └── utilities.py
├── latent_diffusion
│ ├── __init__.py
│ ├── attention.py
│ ├── ddim.py
│ ├── ddpm.py
│ ├── ema.py
│ ├── openaimodel.py
│ └── util.py
├── ldm.py
├── pipeline.py
├── utils.py
└── variational_autoencoder
│ ├── __init__.py
│ ├── autoencoder.py
│ ├── distributions.py
│ └── modules.py
├── audioldm_eval
├── __init__.py
├── audio
│ ├── .ipynb_checkpoints
│ │ └── tools-checkpoint.py
│ ├── __init__.py
│ ├── audio_processing.py
│ ├── stft.py
│ └── tools.py
├── datasets
│ ├── .ipynb_checkpoints
│ │ └── load_mel-checkpoint.py
│ ├── __init__.py
│ ├── load_mel.py
│ └── transforms.py
├── eval.py
├── feature_extractors
│ ├── __init__.py
│ ├── inception3.py
│ ├── melception.py
│ ├── melception_audioset.py
│ └── panns
│ │ ├── __init__.py
│ │ ├── config.py
│ │ ├── evaluate.py
│ │ ├── finetune_template.py
│ │ ├── losses.py
│ │ ├── main.py
│ │ ├── models.py
│ │ ├── pytorch_utils.py
│ │ └── utilities.py
└── metrics
│ ├── .ipynb_checkpoints
│ ├── fad-checkpoint.py
│ ├── fid-checkpoint.py
│ ├── isc-checkpoint.py
│ ├── kid-checkpoint.py
│ ├── kl-checkpoint.py
│ └── ndb-checkpoint.py
│ ├── __init__.py
│ ├── fad.py
│ ├── fid.py
│ ├── gs
│ ├── __init__.py
│ ├── geom_score.py
│ ├── top_utils.py
│ └── utils.py
│ ├── isc.py
│ ├── kid.py
│ ├── kl.py
│ ├── ndb.py
│ └── validate.py
├── cog.yaml
├── configs
├── diffusion_model_config.json
├── diffusion_model_xl_config.json
└── stable_diffusion_2.1.json
├── data
├── test_audiocaps_subset.json
├── test_audiocaps_subset_with_labels.json
├── train_audiocaps.json
└── valid_audiocaps.json
├── img
├── tango-neurips.png
├── tango.png
├── tango2-framework.png
├── tango2-teaser.png
└── tango2.png
├── inference.py
├── inference.sh
├── inference_hf.py
├── models.py
├── mustango
├── README.md
├── audioldm
│ ├── __init__.py
│ ├── __main__.py
│ ├── audio
│ │ ├── __init__.py
│ │ ├── audio_processing.py
│ │ ├── stft.py
│ │ └── tools.py
│ ├── clap
│ │ ├── __init__.py
│ │ ├── encoders.py
│ │ ├── open_clip
│ │ │ ├── __init__.py
│ │ │ ├── bert.py
│ │ │ ├── bpe_simple_vocab_16e6.txt.gz
│ │ │ ├── factory.py
│ │ │ ├── feature_fusion.py
│ │ │ ├── htsat.py
│ │ │ ├── linear_probe.py
│ │ │ ├── loss.py
│ │ │ ├── model.py
│ │ │ ├── model_configs
│ │ │ │ ├── HTSAT-base.json
│ │ │ │ ├── HTSAT-large.json
│ │ │ │ ├── HTSAT-tiny-win-1536.json
│ │ │ │ ├── HTSAT-tiny.json
│ │ │ │ ├── PANN-10.json
│ │ │ │ ├── PANN-14-fmax-18k.json
│ │ │ │ ├── PANN-14-fmax-8k-20s.json
│ │ │ │ ├── PANN-14-tiny-transformer.json
│ │ │ │ ├── PANN-14-win-1536.json
│ │ │ │ ├── PANN-14.json
│ │ │ │ ├── PANN-6.json
│ │ │ │ ├── RN101-quickgelu.json
│ │ │ │ ├── RN101.json
│ │ │ │ ├── RN50-quickgelu.json
│ │ │ │ ├── RN50.json
│ │ │ │ ├── RN50x16.json
│ │ │ │ ├── RN50x4.json
│ │ │ │ ├── ViT-B-16.json
│ │ │ │ ├── ViT-B-32-quickgelu.json
│ │ │ │ ├── ViT-B-32.json
│ │ │ │ └── ViT-L-14.json
│ │ │ ├── openai.py
│ │ │ ├── pann_model.py
│ │ │ ├── pretrained.py
│ │ │ ├── timm_model.py
│ │ │ ├── tokenizer.py
│ │ │ ├── transform.py
│ │ │ ├── utils.py
│ │ │ └── version.py
│ │ └── training
│ │ │ ├── __init__.py
│ │ │ ├── audioset_textmap.npy
│ │ │ ├── data.py
│ │ │ ├── distributed.py
│ │ │ ├── imagenet_zeroshot_data.py
│ │ │ ├── infer_demo.py
│ │ │ ├── logger.py
│ │ │ ├── lp_main.py
│ │ │ ├── lp_train.py
│ │ │ ├── main.py
│ │ │ ├── params.py
│ │ │ ├── scheduler.py
│ │ │ ├── train.py
│ │ │ └── zero_shot.py
│ ├── hifigan
│ │ ├── __init__.py
│ │ ├── models.py
│ │ └── utilities.py
│ ├── latent_diffusion
│ │ ├── __init__.py
│ │ ├── attention.py
│ │ ├── ddim.py
│ │ ├── ddpm.py
│ │ ├── ema.py
│ │ ├── openaimodel.py
│ │ └── util.py
│ ├── ldm.py
│ ├── pipeline.py
│ ├── utils.py
│ └── variational_autoencoder
│ │ ├── __init__.py
│ │ ├── autoencoder.py
│ │ ├── distributions.py
│ │ └── modules.py
├── configs
│ ├── main_config.json
│ ├── music_diffusion_model_config.json
│ ├── stft_config.json
│ └── vae_config.json
├── diffusers
│ ├── CITATION.cff
│ ├── CODE_OF_CONDUCT.md
│ ├── CONTRIBUTING.md
│ ├── LICENSE
│ ├── MANIFEST.in
│ ├── Makefile
│ ├── PHILOSOPHY.md
│ ├── README.md
│ ├── _typos.toml
│ ├── docker
│ │ ├── diffusers-flax-cpu
│ │ │ └── Dockerfile
│ │ ├── diffusers-flax-tpu
│ │ │ └── Dockerfile
│ │ ├── diffusers-onnxruntime-cpu
│ │ │ └── Dockerfile
│ │ ├── diffusers-onnxruntime-cuda
│ │ │ └── Dockerfile
│ │ ├── diffusers-pytorch-cpu
│ │ │ └── Dockerfile
│ │ └── diffusers-pytorch-cuda
│ │ │ └── Dockerfile
│ ├── docs
│ │ ├── README.md
│ │ ├── TRANSLATING.md
│ │ └── source
│ │ │ ├── _config.py
│ │ │ ├── en
│ │ │ ├── _toctree.yml
│ │ │ ├── api
│ │ │ │ ├── configuration.mdx
│ │ │ │ ├── diffusion_pipeline.mdx
│ │ │ │ ├── experimental
│ │ │ │ │ └── rl.mdx
│ │ │ │ ├── loaders.mdx
│ │ │ │ ├── logging.mdx
│ │ │ │ ├── models.mdx
│ │ │ │ ├── outputs.mdx
│ │ │ │ ├── pipelines
│ │ │ │ │ ├── alt_diffusion.mdx
│ │ │ │ │ ├── audio_diffusion.mdx
│ │ │ │ │ ├── audioldm.mdx
│ │ │ │ │ ├── cycle_diffusion.mdx
│ │ │ │ │ ├── dance_diffusion.mdx
│ │ │ │ │ ├── ddim.mdx
│ │ │ │ │ ├── ddpm.mdx
│ │ │ │ │ ├── dit.mdx
│ │ │ │ │ ├── latent_diffusion.mdx
│ │ │ │ │ ├── latent_diffusion_uncond.mdx
│ │ │ │ │ ├── overview.mdx
│ │ │ │ │ ├── paint_by_example.mdx
│ │ │ │ │ ├── pndm.mdx
│ │ │ │ │ ├── repaint.mdx
│ │ │ │ │ ├── score_sde_ve.mdx
│ │ │ │ │ ├── semantic_stable_diffusion.mdx
│ │ │ │ │ ├── spectrogram_diffusion.mdx
│ │ │ │ │ ├── stable_diffusion
│ │ │ │ │ │ ├── attend_and_excite.mdx
│ │ │ │ │ │ ├── controlnet.mdx
│ │ │ │ │ │ ├── depth2img.mdx
│ │ │ │ │ │ ├── image_variation.mdx
│ │ │ │ │ │ ├── img2img.mdx
│ │ │ │ │ │ ├── inpaint.mdx
│ │ │ │ │ │ ├── latent_upscale.mdx
│ │ │ │ │ │ ├── model_editing.mdx
│ │ │ │ │ │ ├── overview.mdx
│ │ │ │ │ │ ├── panorama.mdx
│ │ │ │ │ │ ├── pix2pix.mdx
│ │ │ │ │ │ ├── pix2pix_zero.mdx
│ │ │ │ │ │ ├── self_attention_guidance.mdx
│ │ │ │ │ │ ├── text2img.mdx
│ │ │ │ │ │ └── upscale.mdx
│ │ │ │ │ ├── stable_diffusion_2.mdx
│ │ │ │ │ ├── stable_diffusion_safe.mdx
│ │ │ │ │ ├── stable_unclip.mdx
│ │ │ │ │ ├── stochastic_karras_ve.mdx
│ │ │ │ │ ├── text_to_video.mdx
│ │ │ │ │ ├── unclip.mdx
│ │ │ │ │ ├── versatile_diffusion.mdx
│ │ │ │ │ └── vq_diffusion.mdx
│ │ │ │ └── schedulers
│ │ │ │ │ ├── ddim.mdx
│ │ │ │ │ ├── ddim_inverse.mdx
│ │ │ │ │ ├── ddpm.mdx
│ │ │ │ │ ├── deis.mdx
│ │ │ │ │ ├── dpm_discrete.mdx
│ │ │ │ │ ├── dpm_discrete_ancestral.mdx
│ │ │ │ │ ├── euler.mdx
│ │ │ │ │ ├── euler_ancestral.mdx
│ │ │ │ │ ├── heun.mdx
│ │ │ │ │ ├── ipndm.mdx
│ │ │ │ │ ├── lms_discrete.mdx
│ │ │ │ │ ├── multistep_dpm_solver.mdx
│ │ │ │ │ ├── overview.mdx
│ │ │ │ │ ├── pndm.mdx
│ │ │ │ │ ├── repaint.mdx
│ │ │ │ │ ├── score_sde_ve.mdx
│ │ │ │ │ ├── score_sde_vp.mdx
│ │ │ │ │ ├── singlestep_dpm_solver.mdx
│ │ │ │ │ ├── stochastic_karras_ve.mdx
│ │ │ │ │ ├── unipc.mdx
│ │ │ │ │ └── vq_diffusion.mdx
│ │ │ ├── conceptual
│ │ │ │ ├── contribution.mdx
│ │ │ │ ├── ethical_guidelines.mdx
│ │ │ │ ├── evaluation.mdx
│ │ │ │ └── philosophy.mdx
│ │ │ ├── imgs
│ │ │ │ ├── access_request.png
│ │ │ │ └── diffusers_library.jpg
│ │ │ ├── index.mdx
│ │ │ ├── installation.mdx
│ │ │ ├── optimization
│ │ │ │ ├── fp16.mdx
│ │ │ │ ├── habana.mdx
│ │ │ │ ├── mps.mdx
│ │ │ │ ├── onnx.mdx
│ │ │ │ ├── open_vino.mdx
│ │ │ │ ├── opt_overview.mdx
│ │ │ │ ├── torch2.0.mdx
│ │ │ │ └── xformers.mdx
│ │ │ ├── quicktour.mdx
│ │ │ ├── stable_diffusion.mdx
│ │ │ ├── training
│ │ │ │ ├── controlnet.mdx
│ │ │ │ ├── dreambooth.mdx
│ │ │ │ ├── instructpix2pix.mdx
│ │ │ │ ├── lora.mdx
│ │ │ │ ├── overview.mdx
│ │ │ │ ├── text2image.mdx
│ │ │ │ ├── text_inversion.mdx
│ │ │ │ └── unconditional_training.mdx
│ │ │ ├── tutorials
│ │ │ │ ├── basic_training.mdx
│ │ │ │ └── tutorial_overview.mdx
│ │ │ └── using-diffusers
│ │ │ │ ├── audio.mdx
│ │ │ │ ├── conditional_image_generation.mdx
│ │ │ │ ├── contribute_pipeline.mdx
│ │ │ │ ├── controlling_generation.mdx
│ │ │ │ ├── custom_pipeline_examples.mdx
│ │ │ │ ├── custom_pipeline_overview.mdx
│ │ │ │ ├── depth2img.mdx
│ │ │ │ ├── img2img.mdx
│ │ │ │ ├── inpaint.mdx
│ │ │ │ ├── kerascv.mdx
│ │ │ │ ├── loading.mdx
│ │ │ │ ├── loading_overview.mdx
│ │ │ │ ├── other-modalities.mdx
│ │ │ │ ├── pipeline_overview.mdx
│ │ │ │ ├── reproducibility.mdx
│ │ │ │ ├── reusing_seeds.mdx
│ │ │ │ ├── rl.mdx
│ │ │ │ ├── schedulers.mdx
│ │ │ │ ├── stable_diffusion_jax_how_to.mdx
│ │ │ │ ├── unconditional_image_generation.mdx
│ │ │ │ ├── using_safetensors
│ │ │ │ ├── using_safetensors.mdx
│ │ │ │ ├── weighted_prompts.mdx
│ │ │ │ └── write_own_pipeline.mdx
│ │ │ ├── ko
│ │ │ ├── _toctree.yml
│ │ │ ├── in_translation.mdx
│ │ │ ├── index.mdx
│ │ │ ├── installation.mdx
│ │ │ └── quicktour.mdx
│ │ │ └── zh
│ │ │ ├── _toctree.yml
│ │ │ ├── index.mdx
│ │ │ ├── installation.mdx
│ │ │ └── quicktour.mdx
│ ├── examples
│ │ ├── README.md
│ │ ├── community
│ │ │ ├── README.md
│ │ │ ├── bit_diffusion.py
│ │ │ ├── checkpoint_merger.py
│ │ │ ├── clip_guided_stable_diffusion.py
│ │ │ ├── clip_guided_stable_diffusion_img2img.py
│ │ │ ├── composable_stable_diffusion.py
│ │ │ ├── ddim_noise_comparative_analysis.py
│ │ │ ├── imagic_stable_diffusion.py
│ │ │ ├── img2img_inpainting.py
│ │ │ ├── interpolate_stable_diffusion.py
│ │ │ ├── lpw_stable_diffusion.py
│ │ │ ├── lpw_stable_diffusion_onnx.py
│ │ │ ├── magic_mix.py
│ │ │ ├── multilingual_stable_diffusion.py
│ │ │ ├── one_step_unet.py
│ │ │ ├── sd_text2img_k_diffusion.py
│ │ │ ├── seed_resize_stable_diffusion.py
│ │ │ ├── speech_to_image_diffusion.py
│ │ │ ├── stable_diffusion_comparison.py
│ │ │ ├── stable_diffusion_controlnet_img2img.py
│ │ │ ├── stable_diffusion_controlnet_inpaint.py
│ │ │ ├── stable_diffusion_controlnet_inpaint_img2img.py
│ │ │ ├── stable_diffusion_mega.py
│ │ │ ├── stable_unclip.py
│ │ │ ├── text_inpainting.py
│ │ │ ├── tiled_upscaling.py
│ │ │ ├── unclip_image_interpolation.py
│ │ │ ├── unclip_text_interpolation.py
│ │ │ └── wildcard_stable_diffusion.py
│ │ ├── conftest.py
│ │ ├── controlnet
│ │ │ ├── README.md
│ │ │ ├── requirements.txt
│ │ │ ├── requirements_flax.txt
│ │ │ ├── train_controlnet.py
│ │ │ └── train_controlnet_flax.py
│ │ ├── dreambooth
│ │ │ ├── README.md
│ │ │ ├── requirements.txt
│ │ │ ├── requirements_flax.txt
│ │ │ ├── train_dreambooth.py
│ │ │ ├── train_dreambooth_flax.py
│ │ │ └── train_dreambooth_lora.py
│ │ ├── inference
│ │ │ ├── README.md
│ │ │ ├── image_to_image.py
│ │ │ └── inpainting.py
│ │ ├── instruct_pix2pix
│ │ │ ├── README.md
│ │ │ ├── requirements.txt
│ │ │ └── train_instruct_pix2pix.py
│ │ ├── research_projects
│ │ │ ├── README.md
│ │ │ ├── colossalai
│ │ │ │ ├── README.md
│ │ │ │ ├── inference.py
│ │ │ │ ├── requirement.txt
│ │ │ │ └── train_dreambooth_colossalai.py
│ │ │ ├── dreambooth_inpaint
│ │ │ │ ├── README.md
│ │ │ │ ├── requirements.txt
│ │ │ │ ├── train_dreambooth_inpaint.py
│ │ │ │ └── train_dreambooth_inpaint_lora.py
│ │ │ ├── intel_opts
│ │ │ │ ├── README.md
│ │ │ │ ├── inference_bf16.py
│ │ │ │ └── textual_inversion
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── requirements.txt
│ │ │ │ │ └── textual_inversion_bf16.py
│ │ │ ├── lora
│ │ │ │ ├── README.md
│ │ │ │ ├── requirements.txt
│ │ │ │ └── train_text_to_image_lora.py
│ │ │ ├── mulit_token_textual_inversion
│ │ │ │ ├── README.md
│ │ │ │ ├── multi_token_clip.py
│ │ │ │ ├── requirements.txt
│ │ │ │ ├── requirements_flax.txt
│ │ │ │ ├── textual_inversion.py
│ │ │ │ └── textual_inversion_flax.py
│ │ │ ├── multi_subject_dreambooth
│ │ │ │ ├── README.md
│ │ │ │ ├── requirements.txt
│ │ │ │ └── train_multi_subject_dreambooth.py
│ │ │ └── onnxruntime
│ │ │ │ ├── README.md
│ │ │ │ ├── text_to_image
│ │ │ │ ├── README.md
│ │ │ │ ├── requirements.txt
│ │ │ │ └── train_text_to_image.py
│ │ │ │ ├── textual_inversion
│ │ │ │ ├── README.md
│ │ │ │ ├── requirements.txt
│ │ │ │ └── textual_inversion.py
│ │ │ │ └── unconditional_image_generation
│ │ │ │ ├── README.md
│ │ │ │ ├── requirements.txt
│ │ │ │ └── train_unconditional.py
│ │ ├── rl
│ │ │ ├── README.md
│ │ │ └── run_diffuser_locomotion.py
│ │ ├── test_examples.py
│ │ ├── text_to_image
│ │ │ ├── README.md
│ │ │ ├── requirements.txt
│ │ │ ├── requirements_flax.txt
│ │ │ ├── train_text_to_image.py
│ │ │ ├── train_text_to_image_flax.py
│ │ │ └── train_text_to_image_lora.py
│ │ ├── textual_inversion
│ │ │ ├── README.md
│ │ │ ├── requirements.txt
│ │ │ ├── requirements_flax.txt
│ │ │ ├── textual_inversion.py
│ │ │ └── textual_inversion_flax.py
│ │ └── unconditional_image_generation
│ │ │ ├── README.md
│ │ │ ├── requirements.txt
│ │ │ └── train_unconditional.py
│ ├── pyproject.toml
│ ├── scripts
│ │ ├── __init__.py
│ │ ├── change_naming_configs_and_checkpoints.py
│ │ ├── conversion_ldm_uncond.py
│ │ ├── convert_dance_diffusion_to_diffusers.py
│ │ ├── convert_ddpm_original_checkpoint_to_diffusers.py
│ │ ├── convert_diffusers_to_original_stable_diffusion.py
│ │ ├── convert_dit_to_diffusers.py
│ │ ├── convert_k_upscaler_to_diffusers.py
│ │ ├── convert_kakao_brain_unclip_to_diffusers.py
│ │ ├── convert_ldm_original_checkpoint_to_diffusers.py
│ │ ├── convert_lora_safetensor_to_diffusers.py
│ │ ├── convert_models_diffuser_to_diffusers.py
│ │ ├── convert_ms_text_to_video_to_diffusers.py
│ │ ├── convert_music_spectrogram_to_diffusers.py
│ │ ├── convert_ncsnpp_original_checkpoint_to_diffusers.py
│ │ ├── convert_original_audioldm_to_diffusers.py
│ │ ├── convert_original_controlnet_to_diffusers.py
│ │ ├── convert_original_stable_diffusion_to_diffusers.py
│ │ ├── convert_stable_diffusion_checkpoint_to_onnx.py
│ │ ├── convert_unclip_txt2img_to_image_variation.py
│ │ ├── convert_vae_diff_to_onnx.py
│ │ ├── convert_vae_pt_to_diffusers.py
│ │ ├── convert_versatile_diffusion_to_diffusers.py
│ │ ├── convert_vq_diffusion_to_diffusers.py
│ │ └── generate_logits.py
│ ├── setup.cfg
│ ├── setup.py
│ ├── src
│ │ ├── diffusers.egg-info
│ │ │ ├── PKG-INFO
│ │ │ ├── SOURCES.txt
│ │ │ ├── dependency_links.txt
│ │ │ ├── entry_points.txt
│ │ │ ├── requires.txt
│ │ │ └── top_level.txt
│ │ └── diffusers
│ │ │ ├── __init__.py
│ │ │ ├── commands
│ │ │ ├── __init__.py
│ │ │ ├── diffusers_cli.py
│ │ │ └── env.py
│ │ │ ├── configuration_utils.py
│ │ │ ├── dependency_versions_check.py
│ │ │ ├── dependency_versions_table.py
│ │ │ ├── experimental
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ └── rl
│ │ │ │ ├── __init__.py
│ │ │ │ └── value_guided_sampling.py
│ │ │ ├── image_processor.py
│ │ │ ├── loaders.py
│ │ │ ├── models
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── attention.py
│ │ │ ├── attention_flax.py
│ │ │ ├── attention_processor.py
│ │ │ ├── autoencoder_kl.py
│ │ │ ├── controlnet.py
│ │ │ ├── controlnet_flax.py
│ │ │ ├── cross_attention.py
│ │ │ ├── dual_transformer_2d.py
│ │ │ ├── embeddings.py
│ │ │ ├── embeddings_flax.py
│ │ │ ├── modeling_flax_pytorch_utils.py
│ │ │ ├── modeling_flax_utils.py
│ │ │ ├── modeling_pytorch_flax_utils.py
│ │ │ ├── modeling_utils.py
│ │ │ ├── prior_transformer.py
│ │ │ ├── resnet.py
│ │ │ ├── resnet_flax.py
│ │ │ ├── t5_film_transformer.py
│ │ │ ├── transformer_2d.py
│ │ │ ├── transformer_temporal.py
│ │ │ ├── unet_1d.py
│ │ │ ├── unet_1d_blocks.py
│ │ │ ├── unet_2d.py
│ │ │ ├── unet_2d_blocks.py
│ │ │ ├── unet_2d_blocks_flax.py
│ │ │ ├── unet_2d_condition.py
│ │ │ ├── unet_2d_condition_flax.py
│ │ │ ├── unet_2d_condition_music.py
│ │ │ ├── unet_3d_blocks.py
│ │ │ ├── unet_3d_condition.py
│ │ │ ├── vae.py
│ │ │ ├── vae_flax.py
│ │ │ └── vq_model.py
│ │ │ ├── optimization.py
│ │ │ ├── pipeline_utils.py
│ │ │ ├── pipelines
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── alt_diffusion
│ │ │ │ ├── __init__.py
│ │ │ │ ├── modeling_roberta_series.py
│ │ │ │ ├── pipeline_alt_diffusion.py
│ │ │ │ └── pipeline_alt_diffusion_img2img.py
│ │ │ ├── audio_diffusion
│ │ │ │ ├── __init__.py
│ │ │ │ ├── mel.py
│ │ │ │ └── pipeline_audio_diffusion.py
│ │ │ ├── audioldm
│ │ │ │ ├── __init__.py
│ │ │ │ └── pipeline_audioldm.py
│ │ │ ├── dance_diffusion
│ │ │ │ ├── __init__.py
│ │ │ │ └── pipeline_dance_diffusion.py
│ │ │ ├── ddim
│ │ │ │ ├── __init__.py
│ │ │ │ └── pipeline_ddim.py
│ │ │ ├── ddpm
│ │ │ │ ├── __init__.py
│ │ │ │ └── pipeline_ddpm.py
│ │ │ ├── dit
│ │ │ │ ├── __init__.py
│ │ │ │ └── pipeline_dit.py
│ │ │ ├── latent_diffusion
│ │ │ │ ├── __init__.py
│ │ │ │ ├── pipeline_latent_diffusion.py
│ │ │ │ └── pipeline_latent_diffusion_superresolution.py
│ │ │ ├── latent_diffusion_uncond
│ │ │ │ ├── __init__.py
│ │ │ │ └── pipeline_latent_diffusion_uncond.py
│ │ │ ├── onnx_utils.py
│ │ │ ├── paint_by_example
│ │ │ │ ├── __init__.py
│ │ │ │ ├── image_encoder.py
│ │ │ │ └── pipeline_paint_by_example.py
│ │ │ ├── pipeline_flax_utils.py
│ │ │ ├── pipeline_utils.py
│ │ │ ├── pndm
│ │ │ │ ├── __init__.py
│ │ │ │ └── pipeline_pndm.py
│ │ │ ├── repaint
│ │ │ │ ├── __init__.py
│ │ │ │ └── pipeline_repaint.py
│ │ │ ├── score_sde_ve
│ │ │ │ ├── __init__.py
│ │ │ │ └── pipeline_score_sde_ve.py
│ │ │ ├── semantic_stable_diffusion
│ │ │ │ ├── __init__.py
│ │ │ │ └── pipeline_semantic_stable_diffusion.py
│ │ │ ├── spectrogram_diffusion
│ │ │ │ ├── __init__.py
│ │ │ │ ├── continous_encoder.py
│ │ │ │ ├── midi_utils.py
│ │ │ │ ├── notes_encoder.py
│ │ │ │ └── pipeline_spectrogram_diffusion.py
│ │ │ ├── stable_diffusion
│ │ │ │ ├── README.md
│ │ │ │ ├── __init__.py
│ │ │ │ ├── convert_from_ckpt.py
│ │ │ │ ├── pipeline_cycle_diffusion.py
│ │ │ │ ├── pipeline_flax_stable_diffusion.py
│ │ │ │ ├── pipeline_flax_stable_diffusion_controlnet.py
│ │ │ │ ├── pipeline_flax_stable_diffusion_img2img.py
│ │ │ │ ├── pipeline_flax_stable_diffusion_inpaint.py
│ │ │ │ ├── pipeline_onnx_stable_diffusion.py
│ │ │ │ ├── pipeline_onnx_stable_diffusion_img2img.py
│ │ │ │ ├── pipeline_onnx_stable_diffusion_inpaint.py
│ │ │ │ ├── pipeline_onnx_stable_diffusion_inpaint_legacy.py
│ │ │ │ ├── pipeline_onnx_stable_diffusion_upscale.py
│ │ │ │ ├── pipeline_stable_diffusion.py
│ │ │ │ ├── pipeline_stable_diffusion_attend_and_excite.py
│ │ │ │ ├── pipeline_stable_diffusion_controlnet.py
│ │ │ │ ├── pipeline_stable_diffusion_depth2img.py
│ │ │ │ ├── pipeline_stable_diffusion_image_variation.py
│ │ │ │ ├── pipeline_stable_diffusion_img2img.py
│ │ │ │ ├── pipeline_stable_diffusion_inpaint.py
│ │ │ │ ├── pipeline_stable_diffusion_inpaint_legacy.py
│ │ │ │ ├── pipeline_stable_diffusion_instruct_pix2pix.py
│ │ │ │ ├── pipeline_stable_diffusion_k_diffusion.py
│ │ │ │ ├── pipeline_stable_diffusion_latent_upscale.py
│ │ │ │ ├── pipeline_stable_diffusion_model_editing.py
│ │ │ │ ├── pipeline_stable_diffusion_panorama.py
│ │ │ │ ├── pipeline_stable_diffusion_pix2pix_zero.py
│ │ │ │ ├── pipeline_stable_diffusion_sag.py
│ │ │ │ ├── pipeline_stable_diffusion_upscale.py
│ │ │ │ ├── pipeline_stable_unclip.py
│ │ │ │ ├── pipeline_stable_unclip_img2img.py
│ │ │ │ ├── safety_checker.py
│ │ │ │ ├── safety_checker_flax.py
│ │ │ │ └── stable_unclip_image_normalizer.py
│ │ │ ├── stable_diffusion_safe
│ │ │ │ ├── __init__.py
│ │ │ │ ├── pipeline_stable_diffusion_safe.py
│ │ │ │ └── safety_checker.py
│ │ │ ├── stochastic_karras_ve
│ │ │ │ ├── __init__.py
│ │ │ │ └── pipeline_stochastic_karras_ve.py
│ │ │ ├── text_to_video_synthesis
│ │ │ │ ├── __init__.py
│ │ │ │ └── pipeline_text_to_video_synth.py
│ │ │ ├── unclip
│ │ │ │ ├── __init__.py
│ │ │ │ ├── pipeline_unclip.py
│ │ │ │ ├── pipeline_unclip_image_variation.py
│ │ │ │ └── text_proj.py
│ │ │ ├── versatile_diffusion
│ │ │ │ ├── __init__.py
│ │ │ │ ├── modeling_text_unet.py
│ │ │ │ ├── pipeline_versatile_diffusion.py
│ │ │ │ ├── pipeline_versatile_diffusion_dual_guided.py
│ │ │ │ ├── pipeline_versatile_diffusion_image_variation.py
│ │ │ │ └── pipeline_versatile_diffusion_text_to_image.py
│ │ │ └── vq_diffusion
│ │ │ │ ├── __init__.py
│ │ │ │ └── pipeline_vq_diffusion.py
│ │ │ ├── schedulers
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── scheduling_ddim.py
│ │ │ ├── scheduling_ddim_flax.py
│ │ │ ├── scheduling_ddim_inverse.py
│ │ │ ├── scheduling_ddpm.py
│ │ │ ├── scheduling_ddpm_flax.py
│ │ │ ├── scheduling_deis_multistep.py
│ │ │ ├── scheduling_dpmsolver_multistep.py
│ │ │ ├── scheduling_dpmsolver_multistep_flax.py
│ │ │ ├── scheduling_dpmsolver_singlestep.py
│ │ │ ├── scheduling_euler_ancestral_discrete.py
│ │ │ ├── scheduling_euler_discrete.py
│ │ │ ├── scheduling_heun_discrete.py
│ │ │ ├── scheduling_ipndm.py
│ │ │ ├── scheduling_k_dpm_2_ancestral_discrete.py
│ │ │ ├── scheduling_k_dpm_2_discrete.py
│ │ │ ├── scheduling_karras_ve.py
│ │ │ ├── scheduling_karras_ve_flax.py
│ │ │ ├── scheduling_lms_discrete.py
│ │ │ ├── scheduling_lms_discrete_flax.py
│ │ │ ├── scheduling_pndm.py
│ │ │ ├── scheduling_pndm_flax.py
│ │ │ ├── scheduling_repaint.py
│ │ │ ├── scheduling_sde_ve.py
│ │ │ ├── scheduling_sde_ve_flax.py
│ │ │ ├── scheduling_sde_vp.py
│ │ │ ├── scheduling_unclip.py
│ │ │ ├── scheduling_unipc_multistep.py
│ │ │ ├── scheduling_utils.py
│ │ │ ├── scheduling_utils_flax.py
│ │ │ └── scheduling_vq_diffusion.py
│ │ │ ├── training_utils.py
│ │ │ └── utils
│ │ │ ├── __init__.py
│ │ │ ├── accelerate_utils.py
│ │ │ ├── constants.py
│ │ │ ├── deprecation_utils.py
│ │ │ ├── doc_utils.py
│ │ │ ├── dummy_flax_and_transformers_objects.py
│ │ │ ├── dummy_flax_objects.py
│ │ │ ├── dummy_note_seq_objects.py
│ │ │ ├── dummy_onnx_objects.py
│ │ │ ├── dummy_pt_objects.py
│ │ │ ├── dummy_torch_and_librosa_objects.py
│ │ │ ├── dummy_torch_and_scipy_objects.py
│ │ │ ├── dummy_torch_and_transformers_and_k_diffusion_objects.py
│ │ │ ├── dummy_torch_and_transformers_and_onnx_objects.py
│ │ │ ├── dummy_torch_and_transformers_objects.py
│ │ │ ├── dummy_transformers_and_torch_and_note_seq_objects.py
│ │ │ ├── dynamic_modules_utils.py
│ │ │ ├── hub_utils.py
│ │ │ ├── import_utils.py
│ │ │ ├── logging.py
│ │ │ ├── model_card_template.md
│ │ │ ├── outputs.py
│ │ │ ├── pil_utils.py
│ │ │ ├── testing_utils.py
│ │ │ └── torch_utils.py
│ ├── tests
│ │ ├── __init__.py
│ │ ├── conftest.py
│ │ ├── fixtures
│ │ │ ├── custom_pipeline
│ │ │ │ ├── pipeline.py
│ │ │ │ └── what_ever.py
│ │ │ └── elise_format0.mid
│ │ ├── models
│ │ │ ├── __init__.py
│ │ │ ├── test_models_unet_1d.py
│ │ │ ├── test_models_unet_2d.py
│ │ │ ├── test_models_unet_2d_condition.py
│ │ │ ├── test_models_unet_2d_flax.py
│ │ │ ├── test_models_unet_3d_condition.py
│ │ │ ├── test_models_vae.py
│ │ │ ├── test_models_vae_flax.py
│ │ │ └── test_models_vq.py
│ │ ├── pipeline_params.py
│ │ ├── pipelines
│ │ │ ├── __init__.py
│ │ │ ├── altdiffusion
│ │ │ │ ├── __init__.py
│ │ │ │ ├── test_alt_diffusion.py
│ │ │ │ └── test_alt_diffusion_img2img.py
│ │ │ ├── audio_diffusion
│ │ │ │ ├── __init__.py
│ │ │ │ └── test_audio_diffusion.py
│ │ │ ├── audioldm
│ │ │ │ ├── __init__.py
│ │ │ │ └── test_audioldm.py
│ │ │ ├── dance_diffusion
│ │ │ │ ├── __init__.py
│ │ │ │ └── test_dance_diffusion.py
│ │ │ ├── ddim
│ │ │ │ ├── __init__.py
│ │ │ │ └── test_ddim.py
│ │ │ ├── ddpm
│ │ │ │ ├── __init__.py
│ │ │ │ └── test_ddpm.py
│ │ │ ├── dit
│ │ │ │ ├── __init__.py
│ │ │ │ └── test_dit.py
│ │ │ ├── karras_ve
│ │ │ │ ├── __init__.py
│ │ │ │ └── test_karras_ve.py
│ │ │ ├── latent_diffusion
│ │ │ │ ├── __init__.py
│ │ │ │ ├── test_latent_diffusion.py
│ │ │ │ ├── test_latent_diffusion_superresolution.py
│ │ │ │ └── test_latent_diffusion_uncond.py
│ │ │ ├── paint_by_example
│ │ │ │ ├── __init__.py
│ │ │ │ └── test_paint_by_example.py
│ │ │ ├── pndm
│ │ │ │ ├── __init__.py
│ │ │ │ └── test_pndm.py
│ │ │ ├── repaint
│ │ │ │ ├── __init__.py
│ │ │ │ └── test_repaint.py
│ │ │ ├── score_sde_ve
│ │ │ │ ├── __init__.py
│ │ │ │ └── test_score_sde_ve.py
│ │ │ ├── semantic_stable_diffusion
│ │ │ │ ├── __init__.py
│ │ │ │ └── test_semantic_diffusion.py
│ │ │ ├── spectrogram_diffusion
│ │ │ │ ├── __init__.py
│ │ │ │ └── test_spectrogram_diffusion.py
│ │ │ ├── stable_diffusion
│ │ │ │ ├── __init__.py
│ │ │ │ ├── test_cycle_diffusion.py
│ │ │ │ ├── test_onnx_stable_diffusion.py
│ │ │ │ ├── test_onnx_stable_diffusion_img2img.py
│ │ │ │ ├── test_onnx_stable_diffusion_inpaint.py
│ │ │ │ ├── test_onnx_stable_diffusion_inpaint_legacy.py
│ │ │ │ ├── test_onnx_stable_diffusion_upscale.py
│ │ │ │ ├── test_stable_diffusion.py
│ │ │ │ ├── test_stable_diffusion_controlnet.py
│ │ │ │ ├── test_stable_diffusion_flax_controlnet.py
│ │ │ │ ├── test_stable_diffusion_image_variation.py
│ │ │ │ ├── test_stable_diffusion_img2img.py
│ │ │ │ ├── test_stable_diffusion_inpaint.py
│ │ │ │ ├── test_stable_diffusion_inpaint_legacy.py
│ │ │ │ ├── test_stable_diffusion_instruction_pix2pix.py
│ │ │ │ ├── test_stable_diffusion_k_diffusion.py
│ │ │ │ ├── test_stable_diffusion_model_editing.py
│ │ │ │ ├── test_stable_diffusion_panorama.py
│ │ │ │ ├── test_stable_diffusion_pix2pix_zero.py
│ │ │ │ └── test_stable_diffusion_sag.py
│ │ │ ├── stable_diffusion_2
│ │ │ │ ├── __init__.py
│ │ │ │ ├── test_stable_diffusion.py
│ │ │ │ ├── test_stable_diffusion_attend_and_excite.py
│ │ │ │ ├── test_stable_diffusion_depth.py
│ │ │ │ ├── test_stable_diffusion_flax.py
│ │ │ │ ├── test_stable_diffusion_flax_inpaint.py
│ │ │ │ ├── test_stable_diffusion_inpaint.py
│ │ │ │ ├── test_stable_diffusion_latent_upscale.py
│ │ │ │ ├── test_stable_diffusion_upscale.py
│ │ │ │ └── test_stable_diffusion_v_pred.py
│ │ │ ├── stable_diffusion_safe
│ │ │ │ ├── __init__.py
│ │ │ │ └── test_safe_diffusion.py
│ │ │ ├── stable_unclip
│ │ │ │ ├── __init__.py
│ │ │ │ ├── test_stable_unclip.py
│ │ │ │ └── test_stable_unclip_img2img.py
│ │ │ ├── test_pipeline_utils.py
│ │ │ ├── text_to_video
│ │ │ │ ├── __init__.py
│ │ │ │ └── test_text_to_video.py
│ │ │ ├── unclip
│ │ │ │ ├── __init__.py
│ │ │ │ ├── test_unclip.py
│ │ │ │ └── test_unclip_image_variation.py
│ │ │ ├── versatile_diffusion
│ │ │ │ ├── __init__.py
│ │ │ │ ├── test_versatile_diffusion_dual_guided.py
│ │ │ │ ├── test_versatile_diffusion_image_variation.py
│ │ │ │ ├── test_versatile_diffusion_mega.py
│ │ │ │ └── test_versatile_diffusion_text_to_image.py
│ │ │ └── vq_diffusion
│ │ │ │ ├── __init__.py
│ │ │ │ └── test_vq_diffusion.py
│ │ ├── repo_utils
│ │ │ ├── test_check_copies.py
│ │ │ └── test_check_dummies.py
│ │ ├── schedulers
│ │ │ ├── __init__.py
│ │ │ ├── test_scheduler_ddim.py
│ │ │ ├── test_scheduler_ddpm.py
│ │ │ ├── test_scheduler_deis.py
│ │ │ ├── test_scheduler_dpm_multi.py
│ │ │ ├── test_scheduler_dpm_single.py
│ │ │ ├── test_scheduler_euler.py
│ │ │ ├── test_scheduler_euler_ancestral.py
│ │ │ ├── test_scheduler_flax.py
│ │ │ ├── test_scheduler_heun.py
│ │ │ ├── test_scheduler_ipndm.py
│ │ │ ├── test_scheduler_kdpm2_ancestral.py
│ │ │ ├── test_scheduler_kdpm2_discrete.py
│ │ │ ├── test_scheduler_lms.py
│ │ │ ├── test_scheduler_pndm.py
│ │ │ ├── test_scheduler_score_sde_ve.py
│ │ │ ├── test_scheduler_unclip.py
│ │ │ ├── test_scheduler_unipc.py
│ │ │ ├── test_scheduler_vq_diffusion.py
│ │ │ └── test_schedulers.py
│ │ ├── test_config.py
│ │ ├── test_ema.py
│ │ ├── test_hub_utils.py
│ │ ├── test_image_processor.py
│ │ ├── test_layers_utils.py
│ │ ├── test_modeling_common.py
│ │ ├── test_modeling_common_flax.py
│ │ ├── test_outputs.py
│ │ ├── test_pipelines.py
│ │ ├── test_pipelines_common.py
│ │ ├── test_pipelines_flax.py
│ │ ├── test_pipelines_onnx_common.py
│ │ ├── test_training.py
│ │ ├── test_unet_2d_blocks.py
│ │ ├── test_unet_blocks_common.py
│ │ └── test_utils.py
│ └── utils
│ │ ├── check_config_docstrings.py
│ │ ├── check_copies.py
│ │ ├── check_doc_toc.py
│ │ ├── check_dummies.py
│ │ ├── check_inits.py
│ │ ├── check_repo.py
│ │ ├── check_table.py
│ │ ├── custom_init_isort.py
│ │ ├── get_modified_files.py
│ │ ├── overwrite_expected_slice.py
│ │ ├── print_env.py
│ │ ├── release.py
│ │ └── stale.py
├── layers
│ └── layers.py
├── modelling_deberta_v2.py
├── models.py
├── mustango.jpg
├── mustango.py
├── requirements.txt
└── tools
│ ├── __init__.py
│ ├── mix.py
│ └── torch_tools.py
├── predict.py
├── requirements.txt
├── samples
└── README.md
├── setup.py
├── tango.py
├── tango2
├── audioldm
│ ├── __init__.py
│ ├── __main__.py
│ ├── audio
│ │ ├── __init__.py
│ │ ├── audio_processing.py
│ │ ├── stft.py
│ │ └── tools.py
│ ├── hifigan
│ │ ├── __init__.py
│ │ ├── models.py
│ │ └── utilities.py
│ ├── latent_diffusion
│ │ ├── __init__.py
│ │ ├── attention.py
│ │ ├── ddim.py
│ │ ├── ddpm.py
│ │ ├── ema.py
│ │ ├── openaimodel.py
│ │ └── util.py
│ ├── ldm.py
│ ├── pipeline.py
│ ├── utils.py
│ └── variational_autoencoder
│ │ ├── __init__.py
│ │ ├── autoencoder.py
│ │ ├── distributions.py
│ │ └── modules.py
├── audioldm_eval
│ ├── __init__.py
│ ├── audio
│ │ ├── __init__.py
│ │ ├── audio_processing.py
│ │ ├── stft.py
│ │ └── tools.py
│ ├── datasets
│ │ ├── __init__.py
│ │ ├── load_mel.py
│ │ └── transforms.py
│ ├── eval.py
│ ├── feature_extractors
│ │ ├── __init__.py
│ │ ├── inception3.py
│ │ ├── melception.py
│ │ ├── melception_audioset.py
│ │ └── panns
│ │ │ ├── __init__.py
│ │ │ ├── config.py
│ │ │ ├── evaluate.py
│ │ │ ├── finetune_template.py
│ │ │ ├── losses.py
│ │ │ ├── main.py
│ │ │ ├── models.py
│ │ │ ├── pytorch_utils.py
│ │ │ └── utilities.py
│ └── metrics
│ │ ├── __init__.py
│ │ ├── fad.py
│ │ ├── fid.py
│ │ ├── gs
│ │ ├── __init__.py
│ │ ├── geom_score.py
│ │ ├── top_utils.py
│ │ └── utils.py
│ │ ├── isc.py
│ │ ├── kid.py
│ │ ├── kl.py
│ │ ├── ndb.py
│ │ └── validate.py
├── configs
│ ├── diffusion_model_config.json
│ ├── diffusion_model_xl_config.json
│ └── stable_diffusion_2.1.json
├── inference.py
├── models.py
├── requirements.txt
├── tango.py
├── tango2-train.py
└── tools
│ ├── __init__.py
│ ├── mix.py
│ └── torch_tools.py
├── tools
├── __init__.py
├── mix.py
└── torch_tools.py
├── train.py
└── train.sh
/TANGO.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/declare-lab/tango/310e68504c3f91200313ccc3b94bf19e0941e339/TANGO.pdf
--------------------------------------------------------------------------------
/audioldm/__init__.py:
--------------------------------------------------------------------------------
1 | from .ldm import LatentDiffusion
2 | from .utils import seed_everything, save_wave, get_time, get_duration
3 | from .pipeline import *
4 |
5 |
6 |
7 |
8 |
9 |
--------------------------------------------------------------------------------
/audioldm/audio/__init__.py:
--------------------------------------------------------------------------------
1 | from .tools import wav_to_fbank, read_wav_file
2 | from .stft import TacotronSTFT
3 |
--------------------------------------------------------------------------------
/audioldm/clap/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/declare-lab/tango/310e68504c3f91200313ccc3b94bf19e0941e339/audioldm/clap/__init__.py
--------------------------------------------------------------------------------
/audioldm/clap/open_clip/__init__.py:
--------------------------------------------------------------------------------
1 | from .factory import (
2 | list_models,
3 | create_model,
4 | create_model_and_transforms,
5 | add_model_config,
6 | )
7 | from .loss import ClipLoss, gather_features, LPLoss, lp_gather_features, LPMetrics
8 | from .model import (
9 | CLAP,
10 | CLAPTextCfg,
11 | CLAPVisionCfg,
12 | CLAPAudioCfp,
13 | convert_weights_to_fp16,
14 | trace_model,
15 | )
16 | from .openai import load_openai_model, list_openai_models
17 | from .pretrained import (
18 | list_pretrained,
19 | list_pretrained_tag_models,
20 | list_pretrained_model_tags,
21 | get_pretrained_url,
22 | download_pretrained,
23 | )
24 | from .tokenizer import SimpleTokenizer, tokenize
25 | from .transform import image_transform
26 |
--------------------------------------------------------------------------------
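Note on the factory exports above: list_models() enumerates the JSON configs registered under model_configs/, and create_model_and_transforms() builds a CLAP model from one of them. A minimal discovery sketch, assuming list_models() takes no arguments as in upstream open_clip:

    from audioldm.clap.open_clip import list_models

    # Prints the registered config names, e.g. "HTSAT-tiny", "PANN-14", "ViT-B-32".
    print(list_models())
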
/audioldm/clap/open_clip/bert.py:
--------------------------------------------------------------------------------
1 | from transformers import BertTokenizer, BertModel
2 |
3 | tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
4 | model = BertModel.from_pretrained("bert-base-uncased")
5 | text = "Replace me by any text you'd like."
6 |
7 |
8 | def bert_embeddings(text):
9 | # text = "Replace me by any text you'd like."
10 | encoded_input = tokenizer(text, return_tensors="pt")
11 | output = model(**encoded_input)
12 | return output
13 |
14 |
15 | from transformers import RobertaTokenizer, RobertaModel
16 |
17 | tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
18 | model = RobertaModel.from_pretrained("roberta-base")
19 | text = "Replace me by any text you'd like."
20 |
21 |
22 | def Roberta_embeddings(text):
23 | # text = "Replace me by any text you'd like."
24 | encoded_input = tokenizer(text, return_tensors="pt")
25 | output = model(**encoded_input)
26 | return output
27 |
28 |
29 | from transformers import BartTokenizer, BartModel
30 |
31 | tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
32 | model = BartModel.from_pretrained("facebook/bart-base")
33 | text = "Replace me by any text you'd like."
34 |
35 |
36 | def bart_embeddings(text):
37 | # text = "Replace me by any text you'd like."
38 | encoded_input = tokenizer(text, return_tensors="pt")
39 | output = model(**encoded_input)
40 | return output
41 |
--------------------------------------------------------------------------------
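A short usage sketch for the embedding helpers defined above; the caption is illustrative, and last_hidden_state is the standard transformers output field holding per-token embeddings:

    from audioldm.clap.open_clip.bert import bert_embeddings

    output = bert_embeddings("a dog barking in the distance")
    token_embeddings = output.last_hidden_state  # shape (1, num_tokens, 768) for bert-base-uncased
    print(token_embeddings.shape)
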
/audioldm/clap/open_clip/bpe_simple_vocab_16e6.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/declare-lab/tango/310e68504c3f91200313ccc3b94bf19e0941e339/audioldm/clap/open_clip/bpe_simple_vocab_16e6.txt.gz
--------------------------------------------------------------------------------
/audioldm/clap/open_clip/model_configs/HTSAT-base.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "audio_cfg": {
4 | "audio_length": 1024,
5 | "clip_samples": 480000,
6 | "mel_bins": 64,
7 | "sample_rate": 48000,
8 | "window_size": 1024,
9 | "hop_size": 480,
10 | "fmin": 50,
11 | "fmax": 14000,
12 | "class_num": 527,
13 | "model_type": "HTSAT",
14 | "model_name": "base"
15 | },
16 | "text_cfg": {
17 | "context_length": 77,
18 | "vocab_size": 49408,
19 | "width": 512,
20 | "heads": 8,
21 | "layers": 12
22 | }
23 | }
--------------------------------------------------------------------------------
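The model_configs JSONs in this directory (HTSAT-base.json above and the ones that follow) all share the same two-part layout: audio_cfg describes the audio tower (sample rate, mel bins, STFT window/hop, model type and size) and text_cfg describes the text tower. A minimal sketch of inspecting one of them with the standard library, with the path given relative to the repository root:

    import json

    with open("audioldm/clap/open_clip/model_configs/HTSAT-base.json") as f:
        cfg = json.load(f)

    print(cfg["embed_dim"])                   # 1024
    print(cfg["audio_cfg"]["sample_rate"])    # 48000
    print(cfg["text_cfg"]["context_length"])  # 77
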
/audioldm/clap/open_clip/model_configs/HTSAT-large.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 2048,
3 | "audio_cfg": {
4 | "audio_length": 1024,
5 | "clip_samples": 480000,
6 | "mel_bins": 64,
7 | "sample_rate": 48000,
8 | "window_size": 1024,
9 | "hop_size": 480,
10 | "fmin": 50,
11 | "fmax": 14000,
12 | "class_num": 527,
13 | "model_type": "HTSAT",
14 | "model_name": "large"
15 | },
16 | "text_cfg": {
17 | "context_length": 77,
18 | "vocab_size": 49408,
19 | "width": 512,
20 | "heads": 8,
21 | "layers": 12
22 | }
23 | }
--------------------------------------------------------------------------------
/audioldm/clap/open_clip/model_configs/HTSAT-tiny-win-1536.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "audio_cfg": {
4 | "audio_length": 1024,
5 | "clip_samples": 480000,
6 | "mel_bins": 64,
7 | "sample_rate": 48000,
8 | "window_size": 1536,
9 | "hop_size": 480,
10 | "fmin": 50,
11 | "fmax": 14000,
12 | "class_num": 527,
13 | "model_type": "HTSAT",
14 | "model_name": "tiny"
15 | },
16 | "text_cfg": {
17 | "context_length": 77,
18 | "vocab_size": 49408,
19 | "width": 512,
20 | "heads": 8,
21 | "layers": 12
22 | }
23 | }
--------------------------------------------------------------------------------
/audioldm/clap/open_clip/model_configs/HTSAT-tiny.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "audio_cfg": {
4 | "audio_length": 1024,
5 | "clip_samples": 480000,
6 | "mel_bins": 64,
7 | "sample_rate": 48000,
8 | "window_size": 1024,
9 | "hop_size": 480,
10 | "fmin": 50,
11 | "fmax": 14000,
12 | "class_num": 527,
13 | "model_type": "HTSAT",
14 | "model_name": "tiny"
15 | },
16 | "text_cfg": {
17 | "context_length": 77,
18 | "vocab_size": 49408,
19 | "width": 512,
20 | "heads": 8,
21 | "layers": 12
22 | }
23 | }
--------------------------------------------------------------------------------
/audioldm/clap/open_clip/model_configs/PANN-10.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "audio_cfg": {
4 | "audio_length": 1024,
5 | "clip_samples": 480000,
6 | "mel_bins": 64,
7 | "sample_rate": 48000,
8 | "window_size": 1024,
9 | "hop_size": 480,
10 | "fmin": 50,
11 | "fmax": 14000,
12 | "class_num": 527,
13 | "model_type": "PANN",
14 | "model_name": "Cnn10"
15 | },
16 | "text_cfg": {
17 | "context_length": 77,
18 | "vocab_size": 49408,
19 | "width": 512,
20 | "heads": 8,
21 | "layers": 12
22 | }
23 | }
--------------------------------------------------------------------------------
/audioldm/clap/open_clip/model_configs/PANN-14-fmax-18k.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 2048,
3 | "audio_cfg": {
4 | "audio_length": 1024,
5 | "clip_samples": 480000,
6 | "mel_bins": 64,
7 | "sample_rate": 48000,
8 | "window_size": 1024,
9 | "hop_size": 480,
10 | "fmin": 50,
11 | "fmax": 18000,
12 | "class_num": 527,
13 | "model_type": "PANN",
14 | "model_name": "Cnn14"
15 | },
16 | "text_cfg": {
17 | "context_length": 77,
18 | "vocab_size": 49408,
19 | "width": 512,
20 | "heads": 8,
21 | "layers": 12
22 | }
23 | }
--------------------------------------------------------------------------------
/audioldm/clap/open_clip/model_configs/PANN-14-fmax-8k-20s.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 2048,
3 | "audio_cfg": {
4 | "audio_length": 1024,
5 | "clip_samples": 960000,
6 | "mel_bins": 64,
7 | "sample_rate": 48000,
8 | "window_size": 1024,
9 | "hop_size": 360,
10 | "fmin": 50,
11 | "fmax": 8000,
12 | "class_num": 527,
13 | "model_type": "PANN",
14 | "model_name": "Cnn14"
15 | },
16 | "text_cfg": {
17 | "context_length": 77,
18 | "vocab_size": 49408,
19 | "width": 512,
20 | "heads": 8,
21 | "layers": 12
22 | }
23 | }
--------------------------------------------------------------------------------
/audioldm/clap/open_clip/model_configs/PANN-14-tiny-transformer.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 2048,
3 | "audio_cfg": {
4 | "audio_length": 1024,
5 | "clip_samples": 480000,
6 | "mel_bins": 64,
7 | "sample_rate": 48000,
8 | "window_size": 1024,
9 | "hop_size": 480,
10 | "fmin": 50,
11 | "fmax": 14000,
12 | "class_num": 527,
13 | "model_type": "PANN",
14 | "model_name": "Cnn14"
15 | },
16 | "text_cfg": {
17 | "context_length": 77,
18 | "vocab_size": 49408,
19 | "width": 512,
20 | "heads": 8,
21 | "layers": 4
22 | }
23 | }
--------------------------------------------------------------------------------
/audioldm/clap/open_clip/model_configs/PANN-14-win-1536.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 2048,
3 | "audio_cfg": {
4 | "audio_length": 1024,
5 | "clip_samples": 480000,
6 | "mel_bins": 64,
7 | "sample_rate": 48000,
8 | "window_size": 1536,
9 | "hop_size": 480,
10 | "fmin": 50,
11 | "fmax": 14000,
12 | "class_num": 527,
13 | "model_type": "PANN",
14 | "model_name": "Cnn14"
15 | },
16 | "text_cfg": {
17 | "context_length": 77,
18 | "vocab_size": 49408,
19 | "width": 512,
20 | "heads": 8,
21 | "layers": 12
22 | }
23 | }
--------------------------------------------------------------------------------
/audioldm/clap/open_clip/model_configs/PANN-14.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 2048,
3 | "audio_cfg": {
4 | "audio_length": 1024,
5 | "clip_samples": 480000,
6 | "mel_bins": 64,
7 | "sample_rate": 48000,
8 | "window_size": 1024,
9 | "hop_size": 480,
10 | "fmin": 50,
11 | "fmax": 14000,
12 | "class_num": 527,
13 | "model_type": "PANN",
14 | "model_name": "Cnn14"
15 | },
16 | "text_cfg": {
17 | "context_length": 77,
18 | "vocab_size": 49408,
19 | "width": 512,
20 | "heads": 8,
21 | "layers": 12
22 | }
23 | }
--------------------------------------------------------------------------------
/audioldm/clap/open_clip/model_configs/PANN-6.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "audio_cfg": {
4 | "audio_length": 1024,
5 | "clip_samples": 480000,
6 | "mel_bins": 64,
7 | "sample_rate": 48000,
8 | "window_size": 1024,
9 | "hop_size": 480,
10 | "fmin": 50,
11 | "fmax": 14000,
12 | "class_num": 527,
13 | "model_type": "PANN",
14 | "model_name": "Cnn6"
15 | },
16 | "text_cfg": {
17 | "context_length": 77,
18 | "vocab_size": 49408,
19 | "width": 512,
20 | "heads": 8,
21 | "layers": 12
22 | }
23 | }
--------------------------------------------------------------------------------
/audioldm/clap/open_clip/model_configs/RN101-quickgelu.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "quick_gelu": true,
4 | "vision_cfg": {
5 | "image_size": 224,
6 | "layers": [
7 | 3,
8 | 4,
9 | 23,
10 | 3
11 | ],
12 | "width": 64,
13 | "patch_size": null
14 | },
15 | "text_cfg": {
16 | "context_length": 77,
17 | "vocab_size": 49408,
18 | "width": 512,
19 | "heads": 8,
20 | "layers": 12
21 | }
22 | }
--------------------------------------------------------------------------------
/audioldm/clap/open_clip/model_configs/RN101.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": [
6 | 3,
7 | 4,
8 | 23,
9 | 3
10 | ],
11 | "width": 64,
12 | "patch_size": null
13 | },
14 | "text_cfg": {
15 | "context_length": 77,
16 | "vocab_size": 49408,
17 | "width": 512,
18 | "heads": 8,
19 | "layers": 12
20 | }
21 | }
--------------------------------------------------------------------------------
/audioldm/clap/open_clip/model_configs/RN50-quickgelu.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "quick_gelu": true,
4 | "vision_cfg": {
5 | "image_size": 224,
6 | "layers": [
7 | 3,
8 | 4,
9 | 6,
10 | 3
11 | ],
12 | "width": 64,
13 | "patch_size": null
14 | },
15 | "text_cfg": {
16 | "context_length": 77,
17 | "vocab_size": 49408,
18 | "width": 512,
19 | "heads": 8,
20 | "layers": 12
21 | }
22 | }
23 |
--------------------------------------------------------------------------------
/audioldm/clap/open_clip/model_configs/RN50.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": [
6 | 3,
7 | 4,
8 | 6,
9 | 3
10 | ],
11 | "width": 64,
12 | "patch_size": null
13 | },
14 | "text_cfg": {
15 | "context_length": 77,
16 | "vocab_size": 49408,
17 | "width": 512,
18 | "heads": 8,
19 | "layers": 12
20 | }
21 | }
--------------------------------------------------------------------------------
/audioldm/clap/open_clip/model_configs/RN50x16.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "vision_cfg": {
4 | "image_size": 384,
5 | "layers": [
6 | 6,
7 | 8,
8 | 18,
9 | 8
10 | ],
11 | "width": 96,
12 | "patch_size": null
13 | },
14 | "text_cfg": {
15 | "context_length": 77,
16 | "vocab_size": 49408,
17 | "width": 768,
18 | "heads": 12,
19 | "layers": 12
20 | }
21 | }
--------------------------------------------------------------------------------
/audioldm/clap/open_clip/model_configs/RN50x4.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 640,
3 | "vision_cfg": {
4 | "image_size": 288,
5 | "layers": [
6 | 4,
7 | 6,
8 | 10,
9 | 6
10 | ],
11 | "width": 80,
12 | "patch_size": null
13 | },
14 | "text_cfg": {
15 | "context_length": 77,
16 | "vocab_size": 49408,
17 | "width": 640,
18 | "heads": 10,
19 | "layers": 12
20 | }
21 | }
--------------------------------------------------------------------------------
/audioldm/clap/open_clip/model_configs/ViT-B-16.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 12,
6 | "width": 768,
7 | "patch_size": 16
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 512,
13 | "heads": 8,
14 | "layers": 12
15 | }
16 | }
--------------------------------------------------------------------------------
/audioldm/clap/open_clip/model_configs/ViT-B-32-quickgelu.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "quick_gelu": true,
4 | "vision_cfg": {
5 | "image_size": 224,
6 | "layers": 12,
7 | "width": 768,
8 | "patch_size": 32
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 512,
14 | "heads": 8,
15 | "layers": 12
16 | }
17 | }
--------------------------------------------------------------------------------
/audioldm/clap/open_clip/model_configs/ViT-B-32.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 12,
6 | "width": 768,
7 | "patch_size": 32
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 512,
13 | "heads": 8,
14 | "layers": 12
15 | }
16 | }
--------------------------------------------------------------------------------
/audioldm/clap/open_clip/model_configs/ViT-L-14.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 24,
6 | "width": 1024,
7 | "patch_size": 14
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 768,
13 | "heads": 12,
14 | "layers": 12
15 | }
16 | }
--------------------------------------------------------------------------------
/audioldm/clap/open_clip/transform.py:
--------------------------------------------------------------------------------
1 | from torchvision.transforms import (
2 | Normalize,
3 | Compose,
4 | RandomResizedCrop,
5 | InterpolationMode,
6 | ToTensor,
7 | Resize,
8 | CenterCrop,
9 | )
10 |
11 |
12 | def _convert_to_rgb(image):
13 | return image.convert("RGB")
14 |
15 |
16 | def image_transform(
17 | image_size: int,
18 | is_train: bool,
19 | mean=(0.48145466, 0.4578275, 0.40821073),
20 | std=(0.26862954, 0.26130258, 0.27577711),
21 | ):
22 | normalize = Normalize(mean=mean, std=std)
23 | if is_train:
24 | return Compose(
25 | [
26 | RandomResizedCrop(
27 | image_size,
28 | scale=(0.9, 1.0),
29 | interpolation=InterpolationMode.BICUBIC,
30 | ),
31 | _convert_to_rgb,
32 | ToTensor(),
33 | normalize,
34 | ]
35 | )
36 | else:
37 | return Compose(
38 | [
39 | Resize(image_size, interpolation=InterpolationMode.BICUBIC),
40 | CenterCrop(image_size),
41 | _convert_to_rgb,
42 | ToTensor(),
43 | normalize,
44 | ]
45 | )
46 |
--------------------------------------------------------------------------------
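A usage sketch for image_transform above, applying the evaluation-time transform to a PIL image (PIL and the file path are assumptions of this example, not part of the module):

    from PIL import Image
    from audioldm.clap.open_clip.transform import image_transform

    preprocess = image_transform(image_size=224, is_train=False)
    img = Image.open("example.jpg")  # any image; converted to RGB inside the pipeline
    tensor = preprocess(img)         # torch.Tensor of shape (3, 224, 224), normalized
    print(tensor.shape)
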
/audioldm/clap/open_clip/version.py:
--------------------------------------------------------------------------------
1 | __version__ = "0.2.1"
2 |
--------------------------------------------------------------------------------
/audioldm/clap/training/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/declare-lab/tango/310e68504c3f91200313ccc3b94bf19e0941e339/audioldm/clap/training/__init__.py
--------------------------------------------------------------------------------
/audioldm/clap/training/audioset_textmap.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/declare-lab/tango/310e68504c3f91200313ccc3b94bf19e0941e339/audioldm/clap/training/audioset_textmap.npy
--------------------------------------------------------------------------------
/audioldm/clap/training/logger.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 |
4 | def setup_logging(log_file, level, include_host=False):
5 | if include_host:
6 | import socket
7 |
8 | hostname = socket.gethostname()
9 | formatter = logging.Formatter(
10 | f"%(asctime)s | {hostname} | %(levelname)s | %(message)s",
11 | datefmt="%Y-%m-%d,%H:%M:%S",
12 | )
13 | else:
14 | formatter = logging.Formatter(
15 | "%(asctime)s | %(levelname)s | %(message)s", datefmt="%Y-%m-%d,%H:%M:%S"
16 | )
17 |
18 | logging.root.setLevel(level)
19 | loggers = [logging.getLogger(name) for name in logging.root.manager.loggerDict]
20 | for logger in loggers:
21 | logger.setLevel(level)
22 |
23 | stream_handler = logging.StreamHandler()
24 | stream_handler.setFormatter(formatter)
25 | logging.root.addHandler(stream_handler)
26 |
27 | if log_file:
28 | file_handler = logging.FileHandler(filename=log_file)
29 | file_handler.setFormatter(formatter)
30 | logging.root.addHandler(file_handler)
31 |
--------------------------------------------------------------------------------
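setup_logging above configures the root logger, so ordinary logging calls pick up the chosen format afterwards. A minimal sketch (the log file name is illustrative):

    import logging
    from audioldm.clap.training.logger import setup_logging

    setup_logging(log_file="train.log", level=logging.INFO)
    logging.info("starting training")  # emitted to the console and appended to train.log
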
/audioldm/clap/training/scheduler.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | def assign_learning_rate(optimizer, new_lr):
5 | for param_group in optimizer.param_groups:
6 | param_group["lr"] = new_lr
7 |
8 |
9 | def _warmup_lr(base_lr, warmup_length, step):
10 | return base_lr * (step + 1) / warmup_length
11 |
12 |
13 | def cosine_lr(optimizer, base_lr, warmup_length, steps):
14 | def _lr_adjuster(step):
15 | if step < warmup_length:
16 | lr = _warmup_lr(base_lr, warmup_length, step)
17 | else:
18 | e = step - warmup_length
19 | es = steps - warmup_length
20 | lr = 0.5 * (1 + np.cos(np.pi * e / es)) * base_lr
21 | assign_learning_rate(optimizer, lr)
22 | return lr
23 |
24 | return _lr_adjuster
25 |
--------------------------------------------------------------------------------
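cosine_lr above returns a closure that is called once per optimization step with the global step index: it applies a linear warmup for warmup_length steps, then a cosine decay toward zero over the remaining steps. A minimal training-loop sketch (the model and step counts are placeholders):

    import torch
    from audioldm.clap.training.scheduler import cosine_lr

    model = torch.nn.Linear(10, 1)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

    total_steps, warmup_steps = 1000, 100
    scheduler = cosine_lr(optimizer, base_lr=1e-4, warmup_length=warmup_steps, steps=total_steps)

    for step in range(total_steps):
        lr = scheduler(step)  # sets param_group["lr"] and returns the value
        # ... forward pass, loss.backward(), optimizer.step() ...
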
/audioldm/hifigan/__init__.py:
--------------------------------------------------------------------------------
1 | from .models import Generator
2 |
3 |
4 | class AttrDict(dict):
5 | def __init__(self, *args, **kwargs):
6 | super(AttrDict, self).__init__(*args, **kwargs)
7 | self.__dict__ = self
8 |
--------------------------------------------------------------------------------
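AttrDict above simply mirrors dictionary keys as attributes, which is how the HiFi-GAN generator config is usually passed around. A tiny sketch (the keys shown are illustrative):

    from audioldm.hifigan import AttrDict

    config = AttrDict({"sampling_rate": 16000, "upsample_rates": [8, 8, 2, 2]})
    print(config.sampling_rate)  # 16000, equivalent to config["sampling_rate"]
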
/audioldm/latent_diffusion/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/declare-lab/tango/310e68504c3f91200313ccc3b94bf19e0941e339/audioldm/latent_diffusion/__init__.py
--------------------------------------------------------------------------------
/audioldm/variational_autoencoder/__init__.py:
--------------------------------------------------------------------------------
1 | from .autoencoder import AutoencoderKL
--------------------------------------------------------------------------------
/audioldm_eval/__init__.py:
--------------------------------------------------------------------------------
1 | from .metrics.fid import calculate_fid
2 | from .metrics.isc import calculate_isc
3 | from .metrics.kid import calculate_kid
4 | from .metrics.kl import calculate_kl
5 | from .eval import EvaluationHelper
6 |
--------------------------------------------------------------------------------
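A heavily hedged usage sketch for EvaluationHelper exported above, assuming (as in the upstream audioldm_eval project) that the constructor takes a sampling rate and a device and that main() compares a directory of generated audio against a reference directory; check eval.py for the actual signatures before relying on this:

    import torch
    from audioldm_eval import EvaluationHelper

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    evaluator = EvaluationHelper(16000, device)                      # assumed signature
    metrics = evaluator.main("generated_wavs/", "reference_wavs/")   # assumed signature
    print(metrics)
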
/audioldm_eval/audio/__init__.py:
--------------------------------------------------------------------------------
1 | # import audio.tools
2 | # import audio.stft
3 | # import audio.audio_processing
4 | from .stft import *
5 | from .audio_processing import *
6 | from .tools import *
7 |
--------------------------------------------------------------------------------
/audioldm_eval/datasets/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/declare-lab/tango/310e68504c3f91200313ccc3b94bf19e0941e339/audioldm_eval/datasets/__init__.py
--------------------------------------------------------------------------------
/audioldm_eval/datasets/transforms.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from specvqgan.modules.losses.vggishish.transforms import Crop
3 |
4 |
5 | class FromMinusOneOneToZeroOne(object):
6 | """Actually, it doesnot do [-1, 1] --> [0, 1] as promised. It would, if inputs would be in [-1, 1]
7 | but reconstructed specs are not."""
8 |
9 | def __call__(self, item):
10 | item["image"] = (item["image"] + 1) / 2
11 | return item
12 |
13 |
14 | class CropNoDict(Crop):
15 | def __init__(self, cropped_shape, random_crop=None):
16 | super().__init__(cropped_shape=cropped_shape, random_crop=random_crop)
17 |
18 | def __call__(self, x):
19 | # albumentations expects an ndarray of shape (H, W, ...) but we have a tensor of shape (B, H, W).
20 | # We assume the batch dim (B) is our "channel" dim and permute it to the end.
21 | # Finally, we convert the result back to a torch.Tensor.
22 | x = self.preprocessor(image=x.permute(1, 2, 0).numpy())["image"].transpose(
23 | 2, 0, 1
24 | )
25 | return torch.from_numpy(x)
26 |
27 |
28 | class GetInputFromBatchByKey(object): # get image from item dict
29 | def __init__(self, input_key):
30 | self.input_key = input_key
31 |
32 | def __call__(self, item):
33 | return item[self.input_key]
34 |
35 |
36 | class ToFloat32(object):
37 | def __call__(self, item):
38 | return item.float()
39 |
--------------------------------------------------------------------------------
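The small transforms above are plain callables, so they compose with torchvision's Compose; CropNoDict is omitted here because it depends on specvqgan. A sketch on a fabricated dataset item (keys and shapes are illustrative):

    import torch
    from torchvision.transforms import Compose
    from audioldm_eval.datasets.transforms import (
        FromMinusOneOneToZeroOne,
        GetInputFromBatchByKey,
        ToFloat32,
    )

    pipeline = Compose([
        FromMinusOneOneToZeroOne(),        # rescales item["image"] in place
        GetInputFromBatchByKey("image"),   # dict -> tensor
        ToFloat32(),
    ])

    item = {"image": torch.randn(4, 80, 848)}  # (B, H, W) spectrogram batch
    x = pipeline(item)
    print(x.dtype, x.shape)                    # torch.float32, torch.Size([4, 80, 848])
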
/audioldm_eval/feature_extractors/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/declare-lab/tango/310e68504c3f91200313ccc3b94bf19e0941e339/audioldm_eval/feature_extractors/__init__.py
--------------------------------------------------------------------------------
/audioldm_eval/feature_extractors/panns/__init__.py:
--------------------------------------------------------------------------------
1 | from .models import Cnn14, Cnn14_16k
2 |
--------------------------------------------------------------------------------
/audioldm_eval/feature_extractors/panns/evaluate.py:
--------------------------------------------------------------------------------
1 | from sklearn import metrics
2 |
3 | from pytorch_utils import forward
4 |
5 |
6 | class Evaluator(object):
7 | def __init__(self, model):
8 | """Evaluator.
9 |
10 | Args:
11 | model: object
12 | """
13 | self.model = model
14 |
15 | def evaluate(self, data_loader):
16 | """Forward evaluation data and calculate statistics.
17 |
18 | Args:
19 | data_loader: object
20 |
21 | Returns:
22 | statistics: dict,
23 | {'average_precision': (classes_num,), 'auc': (classes_num,)}
24 | """
25 |
26 | # Forward
27 | output_dict = forward(
28 | model=self.model, generator=data_loader, return_target=True
29 | )
30 |
31 | clipwise_output = output_dict["clipwise_output"] # (audios_num, classes_num)
32 | target = output_dict["target"] # (audios_num, classes_num)
33 |
34 | average_precision = metrics.average_precision_score(
35 | target, clipwise_output, average=None
36 | )
37 |
38 | auc = metrics.roc_auc_score(target, clipwise_output, average=None)
39 |
40 | statistics = {"average_precision": average_precision, "auc": auc}
41 |
42 | return statistics
43 |
--------------------------------------------------------------------------------
/audioldm_eval/feature_extractors/panns/losses.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 |
4 |
5 | def clip_bce(output_dict, target_dict):
6 | """Binary crossentropy loss."""
7 | return F.binary_cross_entropy(output_dict["clipwise_output"], target_dict["target"])
8 |
9 |
10 | def get_loss_func(loss_type):
11 | if loss_type == "clip_bce":
12 | return clip_bce
13 |
--------------------------------------------------------------------------------
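A small usage sketch with dummy tensors: get_loss_func returns the callable, which expects the model's clipwise sigmoid outputs and multi-hot targets:

import torch

loss_fn = get_loss_func("clip_bce")
output_dict = {"clipwise_output": torch.rand(4, 527)}            # sigmoid probabilities in [0, 1]
target_dict = {"target": torch.randint(0, 2, (4, 527)).float()}  # multi-hot labels
loss = loss_fn(output_dict, target_dict)
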
/audioldm_eval/metrics/.ipynb_checkpoints/isc-checkpoint.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 |
4 |
5 | def calculate_isc(featuresdict, feat_layer_name, rng_seed, samples_shuffle, splits):
6 | # print("Computing Inception Score")
7 |
8 | features = featuresdict[feat_layer_name]
9 |
10 | assert torch.is_tensor(features) and features.dim() == 2
11 | N, C = features.shape
12 | if samples_shuffle:
13 | rng = np.random.RandomState(rng_seed)
14 | features = features[rng.permutation(N), :]
15 | features = features.double()
16 |
17 | p = features.softmax(dim=1)
18 | log_p = features.log_softmax(dim=1)
19 |
20 | scores = []
21 | for i in range(splits):
22 |         p_chunk = p[(i * N // splits) : ((i + 1) * N // splits), :]  # predicted probabilities for this split
23 |         log_p_chunk = log_p[(i * N // splits) : ((i + 1) * N // splits), :]  # log-probabilities for this split
24 |         q_chunk = p_chunk.mean(dim=0, keepdim=True)  # mean (marginal) probability over the split
25 |         kl = p_chunk * (log_p_chunk - q_chunk.log())  # per-sample KL(p || q)
26 | kl = kl.sum(dim=1).mean().exp().item()
27 | scores.append(kl)
28 | # print("scores",scores)
29 | return {
30 | "inception_score_mean": float(np.mean(scores)),
31 | "inception_score_std": float(np.std(scores)),
32 | }
33 |
--------------------------------------------------------------------------------
/audioldm_eval/metrics/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/declare-lab/tango/310e68504c3f91200313ccc3b94bf19e0941e339/audioldm_eval/metrics/__init__.py
--------------------------------------------------------------------------------
/audioldm_eval/metrics/gs/__init__.py:
--------------------------------------------------------------------------------
1 | from .geom_score import *
2 | from .top_utils import *
3 | from .utils import *
4 |
--------------------------------------------------------------------------------
/audioldm_eval/metrics/gs/top_utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | def circle(N=5000):
5 | phi = 2 * np.pi * np.random.rand(N)
6 | x = [[np.sin(phi0), np.cos(phi0)] for phi0 in phi]
7 | x = np.array(x)
8 | x = x + 0.05 * np.random.randn(N, 2)
9 | return x
10 |
11 |
12 | def filled_circle(N=5000):
13 | ans = []
14 | while len(ans) < N:
15 | x = np.random.rand(2) * 2.0 - 1.0
16 | if np.linalg.norm(x) < 1:
17 | ans.append(x)
18 | return np.array(ans) + 0.05 * np.random.randn(N, 2)
19 |
20 |
21 | def circle_quorter(N=5000):
22 | phi = np.pi * np.random.rand(N) + np.pi / 2
23 | x = [[np.sin(phi0), np.cos(phi0)] for phi0 in phi]
24 | x = np.array(x)
25 | x = x + 0.05 * np.random.randn(N, 2)
26 | return x
27 |
28 |
29 | def circle_thin(N=5000):
30 | phi = np.random.randn(N)
31 | x = [[np.sin(phi0), np.cos(phi0)] for phi0 in phi]
32 | x = np.array(x)
33 | x = x + 0.05 * np.random.randn(N, 2)
34 | return x
35 |
36 |
37 | def planar(N=5000, zdim=32, dim=784):
38 | A = np.random.rand(N, zdim)
39 | z = np.random.rand(zdim, dim)
40 | return np.dot(A, z)
41 |
--------------------------------------------------------------------------------
/audioldm_eval/metrics/isc.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 |
4 |
5 | def calculate_isc(featuresdict, feat_layer_name, rng_seed, samples_shuffle, splits):
6 | # print("Computing Inception Score")
7 |
8 | features = featuresdict[feat_layer_name]
9 |
10 | assert torch.is_tensor(features) and features.dim() == 2
11 | N, C = features.shape
12 | if samples_shuffle:
13 | rng = np.random.RandomState(rng_seed)
14 | features = features[rng.permutation(N), :]
15 | features = features.double()
16 |
17 | p = features.softmax(dim=1)
18 | log_p = features.log_softmax(dim=1)
19 |
20 | scores = []
21 | for i in range(splits):
22 |         p_chunk = p[(i * N // splits) : ((i + 1) * N // splits), :]  # predicted probabilities for this split
23 |         log_p_chunk = log_p[(i * N // splits) : ((i + 1) * N // splits), :]  # log-probabilities for this split
24 |         q_chunk = p_chunk.mean(dim=0, keepdim=True)  # mean (marginal) probability over the split
25 |         kl = p_chunk * (log_p_chunk - q_chunk.log())  # per-sample KL(p || q)
26 | kl = kl.sum(dim=1).mean().exp().item()
27 | scores.append(kl)
28 | # print("scores",scores)
29 | return {
30 | "inception_score_mean": float(np.mean(scores)),
31 | "inception_score_std": float(np.std(scores)),
32 | }
33 |
--------------------------------------------------------------------------------
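Usage sketch with hypothetical feature names and shapes (in practice featuresdict comes from the feature extractor):

import torch

featuresdict = {"logits_unbiased": torch.randn(1000, 527)}  # hypothetical extractor logits
result = calculate_isc(
    featuresdict,
    feat_layer_name="logits_unbiased",
    rng_seed=2020,
    samples_shuffle=True,
    splits=10,
)
print(result["inception_score_mean"], result["inception_score_std"])
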
/audioldm_eval/metrics/validate.py:
--------------------------------------------------------------------------------
1 | import numpy
2 | from numpy import cov
3 | from numpy import trace
4 | from numpy import iscomplexobj
5 | from numpy.random import random
6 | from scipy.linalg import sqrtm
7 |
8 |
9 | def calculate_fid(act1, act2):
10 | # calculate mean and covariance statistics
11 | mu1, sigma1 = act1.mean(axis=0), cov(act1, rowvar=False)
12 | mu2, sigma2 = act2.mean(axis=0), cov(act2, rowvar=False)
13 | print("mu1 ", mu1.shape)
14 | print("mu2 ", mu2.shape)
15 | print("sigma1 ", sigma1.shape)
16 | print("sigma2 ", sigma2.shape)
17 | # calculate sum squared difference between means
18 |     ssdiff = numpy.sum((mu1 - mu2) ** 2.0)
19 |
20 | # calculate sqrt of product between cov
21 | covmean = sqrtm(sigma1.dot(sigma2))
22 |
23 | # check and correct imaginary numbers from sqrt
24 | if iscomplexobj(covmean):
25 | covmean = covmean.real
26 | # calculate score
27 | fid = ssdiff + trace(sigma1 + sigma2 - 2.0 * covmean)
28 | return fid
29 |
30 |
31 | act1 = random(2048 * 2)
32 | act1 = act1.reshape((2, 2048))
33 | act2 = random(2048 * 2)
34 | act2 = act2.reshape((2, 2048))
35 | fid = calculate_fid(act1, act1)
36 | print("FID (same): %.3f" % fid)
37 | fid = calculate_fid(act1, act2)
38 | print("FID (different): %.3f" % fid)
39 |
--------------------------------------------------------------------------------
/cog.yaml:
--------------------------------------------------------------------------------
1 | # Configuration for Cog ⚙️
2 | # Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md
3 |
4 | build:
5 | gpu: true
6 | cuda: "12.1"
7 | python_version: "3.11"
8 | python_packages:
9 | - "torch==2.1.1"
10 | - "torchaudio==2.1.2"
11 | - "torchvision==0.16.2"
12 | - "transformers==4.31.0"
13 | - "accelerate==0.21.0"
14 | - "datasets==2.1.0"
15 | - "einops==0.6.1"
16 | - "librosa==0.9.2"
17 | - "progressbar33==2.4"
18 | - "scikit_image==0.19.3"
19 | - "scikit_learn==1.2.2"
20 | - "scipy===1.13.0"
21 | - "torchlibrosa==0.1.0"
22 | - "diffusers==0.20.2"
23 | run:
24 | - curl -o /usr/local/bin/pget -L "https://github.com/replicate/pget/releases/download/v0.6.0/pget_linux_x86_64" && chmod +x /usr/local/bin/pget
25 | predict: "predict.py:Predictor"
26 |
--------------------------------------------------------------------------------
/configs/diffusion_model_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "_class_name": "UNet2DConditionModel",
3 | "_diffusers_version": "0.10.0.dev0",
4 | "act_fn": "silu",
5 | "attention_head_dim": [
6 | 5,
7 | 10,
8 | 20,
9 | 20
10 | ],
11 | "block_out_channels": [
12 | 320,
13 | 640,
14 | 1280,
15 | 1280
16 | ],
17 | "center_input_sample": false,
18 | "cross_attention_dim": 1024,
19 | "down_block_types": [
20 | "CrossAttnDownBlock2D",
21 | "CrossAttnDownBlock2D",
22 | "CrossAttnDownBlock2D",
23 | "DownBlock2D"
24 | ],
25 | "downsample_padding": 1,
26 | "dual_cross_attention": false,
27 | "flip_sin_to_cos": true,
28 | "freq_shift": 0,
29 | "in_channels": 8,
30 | "layers_per_block": 2,
31 | "mid_block_scale_factor": 1,
32 | "norm_eps": 1e-05,
33 | "norm_num_groups": 32,
34 | "num_class_embeds": null,
35 | "only_cross_attention": false,
36 | "out_channels": 8,
37 | "sample_size": [32, 2],
38 | "up_block_types": [
39 | "UpBlock2D",
40 | "CrossAttnUpBlock2D",
41 | "CrossAttnUpBlock2D",
42 | "CrossAttnUpBlock2D"
43 | ],
44 | "use_linear_projection": true,
45 | "upcast_attention": true
46 | }
47 |
--------------------------------------------------------------------------------
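A sketch of how this JSON is typically consumed, assuming the diffusers ConfigMixin API (UNet2DConditionModel.from_config accepts a plain dict and ignores the private "_" keys):

import json
from diffusers import UNet2DConditionModel

with open("configs/diffusion_model_config.json") as f:
    config = json.load(f)

unet = UNet2DConditionModel.from_config(config)  # randomly initialised UNet with this architecture
print(sum(p.numel() for p in unet.parameters()) / 1e6, "M parameters")
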
/configs/diffusion_model_xl_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "_class_name": "UNet2DConditionModel",
3 | "_diffusers_version": "0.10.0.dev0",
4 | "act_fn": "silu",
5 | "attention_head_dim": [
6 | 5,
7 | 10,
8 | 20,
9 | 20
10 | ],
11 | "block_out_channels": [
12 | 320,
13 | 640,
14 | 1280,
15 | 1280
16 | ],
17 | "center_input_sample": false,
18 | "cross_attention_dim": 2048,
19 | "down_block_types": [
20 | "CrossAttnDownBlock2D",
21 | "CrossAttnDownBlock2D",
22 | "CrossAttnDownBlock2D",
23 | "DownBlock2D"
24 | ],
25 | "downsample_padding": 1,
26 | "dual_cross_attention": false,
27 | "flip_sin_to_cos": true,
28 | "freq_shift": 0,
29 | "in_channels": 8,
30 | "layers_per_block": 2,
31 | "mid_block_scale_factor": 1,
32 | "norm_eps": 1e-05,
33 | "norm_num_groups": 32,
34 | "num_class_embeds": null,
35 | "only_cross_attention": false,
36 | "out_channels": 8,
37 | "sample_size": [32, 2],
38 | "up_block_types": [
39 | "UpBlock2D",
40 | "CrossAttnUpBlock2D",
41 | "CrossAttnUpBlock2D",
42 | "CrossAttnUpBlock2D"
43 | ],
44 | "use_linear_projection": true,
45 | "upcast_attention": true
46 | }
47 |
--------------------------------------------------------------------------------
/configs/stable_diffusion_2.1.json:
--------------------------------------------------------------------------------
1 | {
2 | "_class_name": "UNet2DConditionModel",
3 | "_diffusers_version": "0.10.0.dev0",
4 | "act_fn": "silu",
5 | "attention_head_dim": [
6 | 5,
7 | 10,
8 | 20,
9 | 20
10 | ],
11 | "block_out_channels": [
12 | 320,
13 | 640,
14 | 1280,
15 | 1280
16 | ],
17 | "center_input_sample": false,
18 | "cross_attention_dim": 1024,
19 | "down_block_types": [
20 | "CrossAttnDownBlock2D",
21 | "CrossAttnDownBlock2D",
22 | "CrossAttnDownBlock2D",
23 | "DownBlock2D"
24 | ],
25 | "downsample_padding": 1,
26 | "dual_cross_attention": false,
27 | "flip_sin_to_cos": true,
28 | "freq_shift": 0,
29 | "in_channels": 4,
30 | "layers_per_block": 2,
31 | "mid_block_scale_factor": 1,
32 | "norm_eps": 1e-05,
33 | "norm_num_groups": 32,
34 | "num_class_embeds": null,
35 | "only_cross_attention": false,
36 | "out_channels": 4,
37 | "sample_size": 96,
38 | "up_block_types": [
39 | "UpBlock2D",
40 | "CrossAttnUpBlock2D",
41 | "CrossAttnUpBlock2D",
42 | "CrossAttnUpBlock2D"
43 | ],
44 | "use_linear_projection": true,
45 | "upcast_attention": true
46 | }
47 |
--------------------------------------------------------------------------------
/img/tango-neurips.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/declare-lab/tango/310e68504c3f91200313ccc3b94bf19e0941e339/img/tango-neurips.png
--------------------------------------------------------------------------------
/img/tango.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/declare-lab/tango/310e68504c3f91200313ccc3b94bf19e0941e339/img/tango.png
--------------------------------------------------------------------------------
/img/tango2-framework.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/declare-lab/tango/310e68504c3f91200313ccc3b94bf19e0941e339/img/tango2-framework.png
--------------------------------------------------------------------------------
/img/tango2-teaser.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/declare-lab/tango/310e68504c3f91200313ccc3b94bf19e0941e339/img/tango2-teaser.png
--------------------------------------------------------------------------------
/img/tango2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/declare-lab/tango/310e68504c3f91200313ccc3b94bf19e0941e339/img/tango2.png
--------------------------------------------------------------------------------
/inference.sh:
--------------------------------------------------------------------------------
1 | CUDA_VISIBLE_DEVICES=0 python inference.py --original_args="saved/1681728144/summary.jsonl" \
2 | --model="saved/1681728144/epoch_39/pytorch_model_2.bin" --num_steps 200 --guidance 3 --num_samples 1
--------------------------------------------------------------------------------
/mustango/audioldm/__init__.py:
--------------------------------------------------------------------------------
1 | from .ldm import LatentDiffusion
2 | from .utils import seed_everything, save_wave, get_time, get_duration
3 | from .pipeline import *
4 |
5 |
6 |
7 |
8 |
9 |
--------------------------------------------------------------------------------
/mustango/audioldm/audio/__init__.py:
--------------------------------------------------------------------------------
1 | from .tools import wav_to_fbank, read_wav_file
2 | from .stft import TacotronSTFT
3 |
--------------------------------------------------------------------------------
/mustango/audioldm/clap/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/declare-lab/tango/310e68504c3f91200313ccc3b94bf19e0941e339/mustango/audioldm/clap/__init__.py
--------------------------------------------------------------------------------
/mustango/audioldm/clap/open_clip/__init__.py:
--------------------------------------------------------------------------------
1 | from .factory import (
2 | list_models,
3 | create_model,
4 | create_model_and_transforms,
5 | add_model_config,
6 | )
7 | from .loss import ClipLoss, gather_features, LPLoss, lp_gather_features, LPMetrics
8 | from .model import (
9 | CLAP,
10 | CLAPTextCfg,
11 | CLAPVisionCfg,
12 | CLAPAudioCfp,
13 | convert_weights_to_fp16,
14 | trace_model,
15 | )
16 | from .openai import load_openai_model, list_openai_models
17 | from .pretrained import (
18 | list_pretrained,
19 | list_pretrained_tag_models,
20 | list_pretrained_model_tags,
21 | get_pretrained_url,
22 | download_pretrained,
23 | )
24 | from .tokenizer import SimpleTokenizer, tokenize
25 | from .transform import image_transform
26 |
--------------------------------------------------------------------------------
/mustango/audioldm/clap/open_clip/bert.py:
--------------------------------------------------------------------------------
1 | from transformers import BertTokenizer, BertModel
2 |
3 | tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
4 | model = BertModel.from_pretrained("bert-base-uncased")
5 | text = "Replace me by any text you'd like."
6 |
7 |
8 | def bert_embeddings(text):
9 | # text = "Replace me by any text you'd like."
10 | encoded_input = tokenizer(text, return_tensors="pt")
11 | output = model(**encoded_input)
12 | return output
13 |
14 |
15 | from transformers import RobertaTokenizer, RobertaModel
16 |
17 | tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
18 | model = RobertaModel.from_pretrained("roberta-base")
19 | text = "Replace me by any text you'd like."
20 |
21 |
22 | def Roberta_embeddings(text):
23 | # text = "Replace me by any text you'd like."
24 | encoded_input = tokenizer(text, return_tensors="pt")
25 | output = model(**encoded_input)
26 | return output
27 |
28 |
29 | from transformers import BartTokenizer, BartModel
30 |
31 | tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
32 | model = BartModel.from_pretrained("facebook/bart-base")
33 | text = "Replace me by any text you'd like."
34 |
35 |
36 | def bart_embeddings(text):
37 | # text = "Replace me by any text you'd like."
38 | encoded_input = tokenizer(text, return_tensors="pt")
39 | output = model(**encoded_input)
40 | return output
41 |
--------------------------------------------------------------------------------
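Usage sketch: each helper returns the standard transformers model output for a single sentence:

out = bert_embeddings("a dog barking in the distance")
print(out.last_hidden_state.shape)  # (1, seq_len, 768) for bert-base-uncased
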
/mustango/audioldm/clap/open_clip/bpe_simple_vocab_16e6.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/declare-lab/tango/310e68504c3f91200313ccc3b94bf19e0941e339/mustango/audioldm/clap/open_clip/bpe_simple_vocab_16e6.txt.gz
--------------------------------------------------------------------------------
/mustango/audioldm/clap/open_clip/model_configs/HTSAT-base.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "audio_cfg": {
4 | "audio_length": 1024,
5 | "clip_samples": 480000,
6 | "mel_bins": 64,
7 | "sample_rate": 48000,
8 | "window_size": 1024,
9 | "hop_size": 480,
10 | "fmin": 50,
11 | "fmax": 14000,
12 | "class_num": 527,
13 | "model_type": "HTSAT",
14 | "model_name": "base"
15 | },
16 | "text_cfg": {
17 | "context_length": 77,
18 | "vocab_size": 49408,
19 | "width": 512,
20 | "heads": 8,
21 | "layers": 12
22 | }
23 | }
--------------------------------------------------------------------------------
/mustango/audioldm/clap/open_clip/model_configs/HTSAT-large.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 2048,
3 | "audio_cfg": {
4 | "audio_length": 1024,
5 | "clip_samples": 480000,
6 | "mel_bins": 64,
7 | "sample_rate": 48000,
8 | "window_size": 1024,
9 | "hop_size": 480,
10 | "fmin": 50,
11 | "fmax": 14000,
12 | "class_num": 527,
13 | "model_type": "HTSAT",
14 | "model_name": "large"
15 | },
16 | "text_cfg": {
17 | "context_length": 77,
18 | "vocab_size": 49408,
19 | "width": 512,
20 | "heads": 8,
21 | "layers": 12
22 | }
23 | }
--------------------------------------------------------------------------------
/mustango/audioldm/clap/open_clip/model_configs/HTSAT-tiny-win-1536.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "audio_cfg": {
4 | "audio_length": 1024,
5 | "clip_samples": 480000,
6 | "mel_bins": 64,
7 | "sample_rate": 48000,
8 | "window_size": 1536,
9 | "hop_size": 480,
10 | "fmin": 50,
11 | "fmax": 14000,
12 | "class_num": 527,
13 | "model_type": "HTSAT",
14 | "model_name": "tiny"
15 | },
16 | "text_cfg": {
17 | "context_length": 77,
18 | "vocab_size": 49408,
19 | "width": 512,
20 | "heads": 8,
21 | "layers": 12
22 | }
23 | }
--------------------------------------------------------------------------------
/mustango/audioldm/clap/open_clip/model_configs/HTSAT-tiny.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "audio_cfg": {
4 | "audio_length": 1024,
5 | "clip_samples": 480000,
6 | "mel_bins": 64,
7 | "sample_rate": 48000,
8 | "window_size": 1024,
9 | "hop_size": 480,
10 | "fmin": 50,
11 | "fmax": 14000,
12 | "class_num": 527,
13 | "model_type": "HTSAT",
14 | "model_name": "tiny"
15 | },
16 | "text_cfg": {
17 | "context_length": 77,
18 | "vocab_size": 49408,
19 | "width": 512,
20 | "heads": 8,
21 | "layers": 12
22 | }
23 | }
--------------------------------------------------------------------------------
/mustango/audioldm/clap/open_clip/model_configs/PANN-10.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "audio_cfg": {
4 | "audio_length": 1024,
5 | "clip_samples": 480000,
6 | "mel_bins": 64,
7 | "sample_rate": 48000,
8 | "window_size": 1024,
9 | "hop_size": 480,
10 | "fmin": 50,
11 | "fmax": 14000,
12 | "class_num": 527,
13 | "model_type": "PANN",
14 | "model_name": "Cnn10"
15 | },
16 | "text_cfg": {
17 | "context_length": 77,
18 | "vocab_size": 49408,
19 | "width": 512,
20 | "heads": 8,
21 | "layers": 12
22 | }
23 | }
--------------------------------------------------------------------------------
/mustango/audioldm/clap/open_clip/model_configs/PANN-14-fmax-18k.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 2048,
3 | "audio_cfg": {
4 | "audio_length": 1024,
5 | "clip_samples": 480000,
6 | "mel_bins": 64,
7 | "sample_rate": 48000,
8 | "window_size": 1024,
9 | "hop_size": 480,
10 | "fmin": 50,
11 | "fmax": 18000,
12 | "class_num": 527,
13 | "model_type": "PANN",
14 | "model_name": "Cnn14"
15 | },
16 | "text_cfg": {
17 | "context_length": 77,
18 | "vocab_size": 49408,
19 | "width": 512,
20 | "heads": 8,
21 | "layers": 12
22 | }
23 | }
--------------------------------------------------------------------------------
/mustango/audioldm/clap/open_clip/model_configs/PANN-14-fmax-8k-20s.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 2048,
3 | "audio_cfg": {
4 | "audio_length": 1024,
5 | "clip_samples": 960000,
6 | "mel_bins": 64,
7 | "sample_rate": 48000,
8 | "window_size": 1024,
9 | "hop_size": 360,
10 | "fmin": 50,
11 | "fmax": 8000,
12 | "class_num": 527,
13 | "model_type": "PANN",
14 | "model_name": "Cnn14"
15 | },
16 | "text_cfg": {
17 | "context_length": 77,
18 | "vocab_size": 49408,
19 | "width": 512,
20 | "heads": 8,
21 | "layers": 12
22 | }
23 | }
--------------------------------------------------------------------------------
/mustango/audioldm/clap/open_clip/model_configs/PANN-14-tiny-transformer.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 2048,
3 | "audio_cfg": {
4 | "audio_length": 1024,
5 | "clip_samples": 480000,
6 | "mel_bins": 64,
7 | "sample_rate": 48000,
8 | "window_size": 1024,
9 | "hop_size": 480,
10 | "fmin": 50,
11 | "fmax": 14000,
12 | "class_num": 527,
13 | "model_type": "PANN",
14 | "model_name": "Cnn14"
15 | },
16 | "text_cfg": {
17 | "context_length": 77,
18 | "vocab_size": 49408,
19 | "width": 512,
20 | "heads": 8,
21 | "layers": 4
22 | }
23 | }
--------------------------------------------------------------------------------
/mustango/audioldm/clap/open_clip/model_configs/PANN-14-win-1536.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 2048,
3 | "audio_cfg": {
4 | "audio_length": 1024,
5 | "clip_samples": 480000,
6 | "mel_bins": 64,
7 | "sample_rate": 48000,
8 | "window_size": 1536,
9 | "hop_size": 480,
10 | "fmin": 50,
11 | "fmax": 14000,
12 | "class_num": 527,
13 | "model_type": "PANN",
14 | "model_name": "Cnn14"
15 | },
16 | "text_cfg": {
17 | "context_length": 77,
18 | "vocab_size": 49408,
19 | "width": 512,
20 | "heads": 8,
21 | "layers": 12
22 | }
23 | }
--------------------------------------------------------------------------------
/mustango/audioldm/clap/open_clip/model_configs/PANN-14.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 2048,
3 | "audio_cfg": {
4 | "audio_length": 1024,
5 | "clip_samples": 480000,
6 | "mel_bins": 64,
7 | "sample_rate": 48000,
8 | "window_size": 1024,
9 | "hop_size": 480,
10 | "fmin": 50,
11 | "fmax": 14000,
12 | "class_num": 527,
13 | "model_type": "PANN",
14 | "model_name": "Cnn14"
15 | },
16 | "text_cfg": {
17 | "context_length": 77,
18 | "vocab_size": 49408,
19 | "width": 512,
20 | "heads": 8,
21 | "layers": 12
22 | }
23 | }
--------------------------------------------------------------------------------
/mustango/audioldm/clap/open_clip/model_configs/PANN-6.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "audio_cfg": {
4 | "audio_length": 1024,
5 | "clip_samples": 480000,
6 | "mel_bins": 64,
7 | "sample_rate": 48000,
8 | "window_size": 1024,
9 | "hop_size": 480,
10 | "fmin": 50,
11 | "fmax": 14000,
12 | "class_num": 527,
13 | "model_type": "PANN",
14 | "model_name": "Cnn6"
15 | },
16 | "text_cfg": {
17 | "context_length": 77,
18 | "vocab_size": 49408,
19 | "width": 512,
20 | "heads": 8,
21 | "layers": 12
22 | }
23 | }
--------------------------------------------------------------------------------
/mustango/audioldm/clap/open_clip/model_configs/RN101-quickgelu.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "quick_gelu": true,
4 | "vision_cfg": {
5 | "image_size": 224,
6 | "layers": [
7 | 3,
8 | 4,
9 | 23,
10 | 3
11 | ],
12 | "width": 64,
13 | "patch_size": null
14 | },
15 | "text_cfg": {
16 | "context_length": 77,
17 | "vocab_size": 49408,
18 | "width": 512,
19 | "heads": 8,
20 | "layers": 12
21 | }
22 | }
--------------------------------------------------------------------------------
/mustango/audioldm/clap/open_clip/model_configs/RN101.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": [
6 | 3,
7 | 4,
8 | 23,
9 | 3
10 | ],
11 | "width": 64,
12 | "patch_size": null
13 | },
14 | "text_cfg": {
15 | "context_length": 77,
16 | "vocab_size": 49408,
17 | "width": 512,
18 | "heads": 8,
19 | "layers": 12
20 | }
21 | }
--------------------------------------------------------------------------------
/mustango/audioldm/clap/open_clip/model_configs/RN50-quickgelu.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "quick_gelu": true,
4 | "vision_cfg": {
5 | "image_size": 224,
6 | "layers": [
7 | 3,
8 | 4,
9 | 6,
10 | 3
11 | ],
12 | "width": 64,
13 | "patch_size": null
14 | },
15 | "text_cfg": {
16 | "context_length": 77,
17 | "vocab_size": 49408,
18 | "width": 512,
19 | "heads": 8,
20 | "layers": 12
21 | }
22 | }
23 |
--------------------------------------------------------------------------------
/mustango/audioldm/clap/open_clip/model_configs/RN50.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": [
6 | 3,
7 | 4,
8 | 6,
9 | 3
10 | ],
11 | "width": 64,
12 | "patch_size": null
13 | },
14 | "text_cfg": {
15 | "context_length": 77,
16 | "vocab_size": 49408,
17 | "width": 512,
18 | "heads": 8,
19 | "layers": 12
20 | }
21 | }
--------------------------------------------------------------------------------
/mustango/audioldm/clap/open_clip/model_configs/RN50x16.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "vision_cfg": {
4 | "image_size": 384,
5 | "layers": [
6 | 6,
7 | 8,
8 | 18,
9 | 8
10 | ],
11 | "width": 96,
12 | "patch_size": null
13 | },
14 | "text_cfg": {
15 | "context_length": 77,
16 | "vocab_size": 49408,
17 | "width": 768,
18 | "heads": 12,
19 | "layers": 12
20 | }
21 | }
--------------------------------------------------------------------------------
/mustango/audioldm/clap/open_clip/model_configs/RN50x4.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 640,
3 | "vision_cfg": {
4 | "image_size": 288,
5 | "layers": [
6 | 4,
7 | 6,
8 | 10,
9 | 6
10 | ],
11 | "width": 80,
12 | "patch_size": null
13 | },
14 | "text_cfg": {
15 | "context_length": 77,
16 | "vocab_size": 49408,
17 | "width": 640,
18 | "heads": 10,
19 | "layers": 12
20 | }
21 | }
--------------------------------------------------------------------------------
/mustango/audioldm/clap/open_clip/model_configs/ViT-B-16.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 12,
6 | "width": 768,
7 | "patch_size": 16
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 512,
13 | "heads": 8,
14 | "layers": 12
15 | }
16 | }
--------------------------------------------------------------------------------
/mustango/audioldm/clap/open_clip/model_configs/ViT-B-32-quickgelu.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "quick_gelu": true,
4 | "vision_cfg": {
5 | "image_size": 224,
6 | "layers": 12,
7 | "width": 768,
8 | "patch_size": 32
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 512,
14 | "heads": 8,
15 | "layers": 12
16 | }
17 | }
--------------------------------------------------------------------------------
/mustango/audioldm/clap/open_clip/model_configs/ViT-B-32.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 12,
6 | "width": 768,
7 | "patch_size": 32
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 512,
13 | "heads": 8,
14 | "layers": 12
15 | }
16 | }
--------------------------------------------------------------------------------
/mustango/audioldm/clap/open_clip/model_configs/ViT-L-14.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 24,
6 | "width": 1024,
7 | "patch_size": 14
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 768,
13 | "heads": 12,
14 | "layers": 12
15 | }
16 | }
--------------------------------------------------------------------------------
/mustango/audioldm/clap/open_clip/transform.py:
--------------------------------------------------------------------------------
1 | from torchvision.transforms import (
2 | Normalize,
3 | Compose,
4 | RandomResizedCrop,
5 | InterpolationMode,
6 | ToTensor,
7 | Resize,
8 | CenterCrop,
9 | )
10 |
11 |
12 | def _convert_to_rgb(image):
13 | return image.convert("RGB")
14 |
15 |
16 | def image_transform(
17 | image_size: int,
18 | is_train: bool,
19 | mean=(0.48145466, 0.4578275, 0.40821073),
20 | std=(0.26862954, 0.26130258, 0.27577711),
21 | ):
22 | normalize = Normalize(mean=mean, std=std)
23 | if is_train:
24 | return Compose(
25 | [
26 | RandomResizedCrop(
27 | image_size,
28 | scale=(0.9, 1.0),
29 | interpolation=InterpolationMode.BICUBIC,
30 | ),
31 | _convert_to_rgb,
32 | ToTensor(),
33 | normalize,
34 | ]
35 | )
36 | else:
37 | return Compose(
38 | [
39 | Resize(image_size, interpolation=InterpolationMode.BICUBIC),
40 | CenterCrop(image_size),
41 | _convert_to_rgb,
42 | ToTensor(),
43 | normalize,
44 | ]
45 | )
46 |
--------------------------------------------------------------------------------
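Usage sketch with a hypothetical blank image; the eval-time branch resizes, center-crops, converts to RGB, and normalises:

from PIL import Image

preprocess = image_transform(image_size=224, is_train=False)
img = Image.new("RGB", (640, 480))
x = preprocess(img)  # torch.Tensor of shape (3, 224, 224)
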
/mustango/audioldm/clap/open_clip/version.py:
--------------------------------------------------------------------------------
1 | __version__ = "0.2.1"
2 |
--------------------------------------------------------------------------------
/mustango/audioldm/clap/training/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/declare-lab/tango/310e68504c3f91200313ccc3b94bf19e0941e339/mustango/audioldm/clap/training/__init__.py
--------------------------------------------------------------------------------
/mustango/audioldm/clap/training/audioset_textmap.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/declare-lab/tango/310e68504c3f91200313ccc3b94bf19e0941e339/mustango/audioldm/clap/training/audioset_textmap.npy
--------------------------------------------------------------------------------
/mustango/audioldm/clap/training/logger.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 |
4 | def setup_logging(log_file, level, include_host=False):
5 | if include_host:
6 | import socket
7 |
8 | hostname = socket.gethostname()
9 | formatter = logging.Formatter(
10 | f"%(asctime)s | {hostname} | %(levelname)s | %(message)s",
11 | datefmt="%Y-%m-%d,%H:%M:%S",
12 | )
13 | else:
14 | formatter = logging.Formatter(
15 | "%(asctime)s | %(levelname)s | %(message)s", datefmt="%Y-%m-%d,%H:%M:%S"
16 | )
17 |
18 | logging.root.setLevel(level)
19 | loggers = [logging.getLogger(name) for name in logging.root.manager.loggerDict]
20 | for logger in loggers:
21 | logger.setLevel(level)
22 |
23 | stream_handler = logging.StreamHandler()
24 | stream_handler.setFormatter(formatter)
25 | logging.root.addHandler(stream_handler)
26 |
27 | if log_file:
28 | file_handler = logging.FileHandler(filename=log_file)
29 | file_handler.setFormatter(formatter)
30 | logging.root.addHandler(file_handler)
31 |
--------------------------------------------------------------------------------
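Usage sketch (the log file path is hypothetical):

import logging

setup_logging(log_file="train.log", level=logging.INFO, include_host=False)
logging.info("logging initialised")
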
/mustango/audioldm/clap/training/scheduler.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | def assign_learning_rate(optimizer, new_lr):
5 | for param_group in optimizer.param_groups:
6 | param_group["lr"] = new_lr
7 |
8 |
9 | def _warmup_lr(base_lr, warmup_length, step):
10 | return base_lr * (step + 1) / warmup_length
11 |
12 |
13 | def cosine_lr(optimizer, base_lr, warmup_length, steps):
14 | def _lr_adjuster(step):
15 | if step < warmup_length:
16 | lr = _warmup_lr(base_lr, warmup_length, step)
17 | else:
18 | e = step - warmup_length
19 | es = steps - warmup_length
20 | lr = 0.5 * (1 + np.cos(np.pi * e / es)) * base_lr
21 | assign_learning_rate(optimizer, lr)
22 | return lr
23 |
24 | return _lr_adjuster
25 |
--------------------------------------------------------------------------------
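Usage sketch with a dummy model and optimizer: cosine_lr returns a closure that is called once per optimisation step and mutates the optimizer's learning rate in place:

import torch

model = torch.nn.Linear(10, 2)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
scheduler = cosine_lr(optimizer, base_lr=1e-4, warmup_length=1_000, steps=100_000)

for step in range(100_000):
    lr = scheduler(step)  # linear warmup for 1k steps, then cosine decay towards 0
    # ... forward / backward / optimizer.step() ...
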
/mustango/audioldm/hifigan/__init__.py:
--------------------------------------------------------------------------------
1 | from .models import Generator
2 |
3 |
4 | class AttrDict(dict):
5 | def __init__(self, *args, **kwargs):
6 | super(AttrDict, self).__init__(*args, **kwargs)
7 | self.__dict__ = self
8 |
--------------------------------------------------------------------------------
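Usage sketch: HiFi-GAN loads its generator config as a plain dict, and AttrDict exposes the keys as attributes (the values below are illustrative):

h = AttrDict({"upsample_rates": [8, 8, 2, 2], "resblock_kernel_sizes": [3, 7, 11]})
print(h.upsample_rates)     # attribute access...
print(h["upsample_rates"])  # ...and dict access refer to the same entry
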
/mustango/audioldm/latent_diffusion/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/declare-lab/tango/310e68504c3f91200313ccc3b94bf19e0941e339/mustango/audioldm/latent_diffusion/__init__.py
--------------------------------------------------------------------------------
/mustango/audioldm/variational_autoencoder/__init__.py:
--------------------------------------------------------------------------------
1 | from .autoencoder import AutoencoderKL
--------------------------------------------------------------------------------
/mustango/configs/main_config.json:
--------------------------------------------------------------------------------
1 | {"text_encoder_name": "google/flan-t5-large", "scheduler_name": "stabilityai/stable-diffusion-2-1", "unet_model_name": null, "unet_model_config_path": "configs/music_diffusion_model_config.json", "snr_gamma": 5.0}
--------------------------------------------------------------------------------
/mustango/configs/music_diffusion_model_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "_class_name": "UNet2DConditionModel",
3 | "_diffusers_version": "0.10.0.dev0",
4 | "act_fn": "silu",
5 | "attention_head_dim": [
6 | 5,
7 | 10,
8 | 20,
9 | 20
10 | ],
11 | "block_out_channels": [
12 | 320,
13 | 640,
14 | 1280,
15 | 1280
16 | ],
17 | "center_input_sample": false,
18 | "cross_attention_dim": 1024,
19 | "down_block_types": [
20 | "CrossAttnDownBlock2DMusic",
21 | "CrossAttnDownBlock2DMusic",
22 | "CrossAttnDownBlock2DMusic",
23 | "DownBlock2D"
24 | ],
25 | "downsample_padding": 1,
26 | "dual_cross_attention": false,
27 | "flip_sin_to_cos": true,
28 | "freq_shift": 0,
29 | "in_channels": 8,
30 | "layers_per_block": 2,
31 | "mid_block_type": "UNetMidBlock2DCrossAttnMusic",
32 | "mid_block_scale_factor": 1,
33 | "norm_eps": 1e-05,
34 | "norm_num_groups": 32,
35 | "num_class_embeds": null,
36 | "only_cross_attention": false,
37 | "out_channels": 8,
38 | "sample_size": [32, 2],
39 | "up_block_types": [
40 | "UpBlock2D",
41 | "CrossAttnUpBlock2DMusic",
42 | "CrossAttnUpBlock2DMusic",
43 | "CrossAttnUpBlock2DMusic"
44 | ],
45 | "use_linear_projection": true,
46 | "upcast_attention": true
47 | }
48 |
--------------------------------------------------------------------------------
/mustango/configs/stft_config.json:
--------------------------------------------------------------------------------
1 | {"filter_length": 1024, "hop_length": 160, "win_length": 1024, "n_mel_channels": 64, "sampling_rate": 16000, "mel_fmin": 0, "mel_fmax": 8000}
--------------------------------------------------------------------------------
/mustango/configs/vae_config.json:
--------------------------------------------------------------------------------
1 | {"image_key": "fbank", "subband": 1, "embed_dim": 8, "time_shuffle": 1, "ddconfig": {"double_z": true, "z_channels": 8, "resolution": 256, "downsample_time": false, "in_channels": 1, "out_ch": 1, "ch": 128, "ch_mult": [1, 2, 4], "num_res_blocks": 2, "attn_resolutions": [], "dropout": 0.0}, "scale_factor": 0.9227914214134216}
--------------------------------------------------------------------------------
/mustango/diffusers/CITATION.cff:
--------------------------------------------------------------------------------
1 | cff-version: 1.2.0
2 | title: 'Diffusers: State-of-the-art diffusion models'
3 | message: >-
4 | If you use this software, please cite it using the
5 | metadata from this file.
6 | type: software
7 | authors:
8 | - given-names: Patrick
9 | family-names: von Platen
10 | - given-names: Suraj
11 | family-names: Patil
12 | - given-names: Anton
13 | family-names: Lozhkov
14 | - given-names: Pedro
15 | family-names: Cuenca
16 | - given-names: Nathan
17 | family-names: Lambert
18 | - given-names: Kashif
19 | family-names: Rasul
20 | - given-names: Mishig
21 | family-names: Davaadorj
22 | - given-names: Thomas
23 | family-names: Wolf
24 | repository-code: 'https://github.com/huggingface/diffusers'
25 | abstract: >-
26 | Diffusers provides pretrained diffusion models across
27 | multiple modalities, such as vision and audio, and serves
28 | as a modular toolbox for inference and training of
29 | diffusion models.
30 | keywords:
31 | - deep-learning
32 | - pytorch
33 | - image-generation
34 | - diffusion
35 | - text2image
36 | - image2image
37 | - score-based-generative-modeling
38 | - stable-diffusion
39 | license: Apache-2.0
40 | version: 0.12.1
41 |
--------------------------------------------------------------------------------
/mustango/diffusers/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include LICENSE
2 | include src/diffusers/utils/model_card_template.md
3 |
--------------------------------------------------------------------------------
/mustango/diffusers/_typos.toml:
--------------------------------------------------------------------------------
1 | # Files for typos
2 | # Instruction: https://github.com/marketplace/actions/typos-action#getting-started
3 |
4 | [default.extend-identifiers]
5 |
6 | [default.extend-words]
7 | NIN="NIN" # NIN is used in scripts/convert_ncsnpp_original_checkpoint_to_diffusers.py
8 | nd="np" # nd may be np (numpy)
9 | parms="parms" # parms is used in scripts/convert_original_stable_diffusion_to_diffusers.py
10 |
11 |
12 | [files]
13 | extend-exclude = ["_typos.toml"]
14 |
--------------------------------------------------------------------------------
/mustango/diffusers/docker/diffusers-flax-cpu/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM ubuntu:20.04
2 | LABEL maintainer="Hugging Face"
3 | LABEL repository="diffusers"
4 |
5 | ENV DEBIAN_FRONTEND=noninteractive
6 |
7 | RUN apt update && \
8 | apt install -y bash \
9 | build-essential \
10 | git \
11 | git-lfs \
12 | curl \
13 | ca-certificates \
14 | libsndfile1-dev \
15 | python3.8 \
16 | python3-pip \
17 | python3.8-venv && \
18 | rm -rf /var/lib/apt/lists
19 |
20 | # make sure to use venv
21 | RUN python3 -m venv /opt/venv
22 | ENV PATH="/opt/venv/bin:$PATH"
23 |
24 | # pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
25 | # follow the instructions here: https://cloud.google.com/tpu/docs/run-in-container#train_a_jax_model_in_a_docker_container
26 | RUN python3 -m pip install --no-cache-dir --upgrade pip && \
27 | python3 -m pip install --upgrade --no-cache-dir \
28 | clu \
29 | "jax[cpu]>=0.2.16,!=0.3.2" \
30 | "flax>=0.4.1" \
31 | "jaxlib>=0.1.65" && \
32 | python3 -m pip install --no-cache-dir \
33 | accelerate \
34 | datasets \
35 | hf-doc-builder \
36 | huggingface-hub \
37 | Jinja2 \
38 | librosa \
39 | numpy \
40 | scipy \
41 | tensorboard \
42 | transformers
43 |
44 | CMD ["/bin/bash"]
--------------------------------------------------------------------------------
/mustango/diffusers/docker/diffusers-flax-tpu/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM ubuntu:20.04
2 | LABEL maintainer="Hugging Face"
3 | LABEL repository="diffusers"
4 |
5 | ENV DEBIAN_FRONTEND=noninteractive
6 |
7 | RUN apt update && \
8 | apt install -y bash \
9 | build-essential \
10 | git \
11 | git-lfs \
12 | curl \
13 | ca-certificates \
14 | libsndfile1-dev \
15 | python3.8 \
16 | python3-pip \
17 | python3.8-venv && \
18 | rm -rf /var/lib/apt/lists
19 |
20 | # make sure to use venv
21 | RUN python3 -m venv /opt/venv
22 | ENV PATH="/opt/venv/bin:$PATH"
23 |
24 | # pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
25 | # follow the instructions here: https://cloud.google.com/tpu/docs/run-in-container#train_a_jax_model_in_a_docker_container
26 | RUN python3 -m pip install --no-cache-dir --upgrade pip && \
27 | python3 -m pip install --no-cache-dir \
28 | "jax[tpu]>=0.2.16,!=0.3.2" \
29 | -f https://storage.googleapis.com/jax-releases/libtpu_releases.html && \
30 | python3 -m pip install --upgrade --no-cache-dir \
31 | clu \
32 | "flax>=0.4.1" \
33 | "jaxlib>=0.1.65" && \
34 | python3 -m pip install --no-cache-dir \
35 | accelerate \
36 | datasets \
37 | hf-doc-builder \
38 | huggingface-hub \
39 | Jinja2 \
40 | librosa \
41 | numpy \
42 | scipy \
43 | tensorboard \
44 | transformers
45 |
46 | CMD ["/bin/bash"]
--------------------------------------------------------------------------------
/mustango/diffusers/docker/diffusers-onnxruntime-cpu/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM ubuntu:20.04
2 | LABEL maintainer="Hugging Face"
3 | LABEL repository="diffusers"
4 |
5 | ENV DEBIAN_FRONTEND=noninteractive
6 |
7 | RUN apt update && \
8 | apt install -y bash \
9 | build-essential \
10 | git \
11 | git-lfs \
12 | curl \
13 | ca-certificates \
14 | libsndfile1-dev \
15 | python3.8 \
16 | python3-pip \
17 | python3.8-venv && \
18 | rm -rf /var/lib/apt/lists
19 |
20 | # make sure to use venv
21 | RUN python3 -m venv /opt/venv
22 | ENV PATH="/opt/venv/bin:$PATH"
23 |
24 | # pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
25 | RUN python3 -m pip install --no-cache-dir --upgrade pip && \
26 | python3 -m pip install --no-cache-dir \
27 | torch \
28 | torchvision \
29 | torchaudio \
30 | onnxruntime \
31 | --extra-index-url https://download.pytorch.org/whl/cpu && \
32 | python3 -m pip install --no-cache-dir \
33 | accelerate \
34 | datasets \
35 | hf-doc-builder \
36 | huggingface-hub \
37 | Jinja2 \
38 | librosa \
39 | numpy \
40 | scipy \
41 | tensorboard \
42 | transformers
43 |
44 | CMD ["/bin/bash"]
--------------------------------------------------------------------------------
/mustango/diffusers/docker/diffusers-onnxruntime-cuda/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM nvidia/cuda:11.6.2-cudnn8-devel-ubuntu20.04
2 | LABEL maintainer="Hugging Face"
3 | LABEL repository="diffusers"
4 |
5 | ENV DEBIAN_FRONTEND=noninteractive
6 |
7 | RUN apt update && \
8 | apt install -y bash \
9 | build-essential \
10 | git \
11 | git-lfs \
12 | curl \
13 | ca-certificates \
14 | libsndfile1-dev \
15 | python3.8 \
16 | python3-pip \
17 | python3.8-venv && \
18 | rm -rf /var/lib/apt/lists
19 |
20 | # make sure to use venv
21 | RUN python3 -m venv /opt/venv
22 | ENV PATH="/opt/venv/bin:$PATH"
23 |
24 | # pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
25 | RUN python3 -m pip install --no-cache-dir --upgrade pip && \
26 | python3 -m pip install --no-cache-dir \
27 | torch \
28 | torchvision \
29 | torchaudio \
30 | "onnxruntime-gpu>=1.13.1" \
31 | --extra-index-url https://download.pytorch.org/whl/cu117 && \
32 | python3 -m pip install --no-cache-dir \
33 | accelerate \
34 | datasets \
35 | hf-doc-builder \
36 | huggingface-hub \
37 | Jinja2 \
38 | librosa \
39 | numpy \
40 | scipy \
41 | tensorboard \
42 | transformers
43 |
44 | CMD ["/bin/bash"]
--------------------------------------------------------------------------------
/mustango/diffusers/docker/diffusers-pytorch-cpu/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM ubuntu:20.04
2 | LABEL maintainer="Hugging Face"
3 | LABEL repository="diffusers"
4 |
5 | ENV DEBIAN_FRONTEND=noninteractive
6 |
7 | RUN apt update && \
8 | apt install -y bash \
9 | build-essential \
10 | git \
11 | git-lfs \
12 | curl \
13 | ca-certificates \
14 | libsndfile1-dev \
15 | python3.8 \
16 | python3-pip \
17 | python3.8-venv && \
18 | rm -rf /var/lib/apt/lists
19 |
20 | # make sure to use venv
21 | RUN python3 -m venv /opt/venv
22 | ENV PATH="/opt/venv/bin:$PATH"
23 |
24 | # pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
25 | RUN python3 -m pip install --no-cache-dir --upgrade pip && \
26 | python3 -m pip install --no-cache-dir \
27 | torch \
28 | torchvision \
29 | torchaudio \
30 | --extra-index-url https://download.pytorch.org/whl/cpu && \
31 | python3 -m pip install --no-cache-dir \
32 | accelerate \
33 | datasets \
34 | hf-doc-builder \
35 | huggingface-hub \
36 | Jinja2 \
37 | librosa \
38 | numpy \
39 | scipy \
40 | tensorboard \
41 | transformers
42 |
43 | CMD ["/bin/bash"]
--------------------------------------------------------------------------------
/mustango/diffusers/docker/diffusers-pytorch-cuda/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM nvidia/cuda:11.7.1-cudnn8-runtime-ubuntu20.04
2 | LABEL maintainer="Hugging Face"
3 | LABEL repository="diffusers"
4 |
5 | ENV DEBIAN_FRONTEND=noninteractive
6 |
7 | RUN apt update && \
8 | apt install -y bash \
9 | build-essential \
10 | git \
11 | git-lfs \
12 | curl \
13 | ca-certificates \
14 | libsndfile1-dev \
15 | python3.8 \
16 | python3-pip \
17 | python3.8-venv && \
18 | rm -rf /var/lib/apt/lists
19 |
20 | # make sure to use venv
21 | RUN python3 -m venv /opt/venv
22 | ENV PATH="/opt/venv/bin:$PATH"
23 |
24 | # pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
25 | RUN python3 -m pip install --no-cache-dir --upgrade pip && \
26 | python3 -m pip install --no-cache-dir \
27 | torch \
28 | torchvision \
29 |         torchaudio --extra-index-url https://download.pytorch.org/whl/cu117 && \
30 |     python3 -m pip install --no-cache-dir \
31 | accelerate \
32 | datasets \
33 | hf-doc-builder \
34 | huggingface-hub \
35 | Jinja2 \
36 | librosa \
37 | numpy \
38 | scipy \
39 | tensorboard \
40 | transformers
41 |
42 | CMD ["/bin/bash"]
43 |
--------------------------------------------------------------------------------
/mustango/diffusers/docs/source/_config.py:
--------------------------------------------------------------------------------
1 | # docstyle-ignore
2 | INSTALL_CONTENT = """
3 | # Diffusers installation
4 | ! pip install diffusers transformers datasets accelerate
5 | # To install from source instead of the last release, comment the command above and uncomment the following one.
6 | # ! pip install git+https://github.com/huggingface/diffusers.git
7 | """
8 |
9 | notebook_first_cells = [{"type": "code", "content": INSTALL_CONTENT}]
--------------------------------------------------------------------------------
/mustango/diffusers/docs/source/en/api/configuration.mdx:
--------------------------------------------------------------------------------
1 |
12 |
13 | # Configuration
14 |
15 | Schedulers from [`~schedulers.scheduling_utils.SchedulerMixin`] and models from [`ModelMixin`] inherit from [`ConfigMixin`] which conveniently takes care of storing all the parameters that are
16 | passed to their respective `__init__` methods in a JSON-configuration file.
17 |
18 | ## ConfigMixin
19 |
20 | [[autodoc]] ConfigMixin
21 | - load_config
22 | - from_config
23 | - save_config
24 | - to_json_file
25 | - to_json_string
26 |
--------------------------------------------------------------------------------
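For illustration, a round-trip sketch using DDIMScheduler (any ConfigMixin subclass works; the local directory name is hypothetical):

from diffusers import DDIMScheduler

scheduler = DDIMScheduler(num_train_timesteps=1000)
scheduler.save_config("ddim_config")                 # writes ddim_config/scheduler_config.json
config = DDIMScheduler.load_config("ddim_config")
restored = DDIMScheduler.from_config(config)
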
/mustango/diffusers/docs/source/en/api/experimental/rl.mdx:
--------------------------------------------------------------------------------
1 |
12 |
13 | # TODO
14 |
15 | Coming soon!
--------------------------------------------------------------------------------
/mustango/diffusers/docs/source/en/api/loaders.mdx:
--------------------------------------------------------------------------------
1 |
12 |
13 | # Loaders
14 |
15 | There are many ways to train adapter neural networks for diffusion models, such as
16 | - [Textual Inversion](./training/text_inversion.mdx)
17 | - [LoRA](https://github.com/cloneofsimo/lora)
18 | - [Hypernetworks](https://arxiv.org/abs/1609.09106)
19 |
20 | Such adapter neural networks often only consist of a fraction of the number of weights compared
21 | to the pretrained model and as such are very portable. The Diffusers library offers an easy-to-use
22 | API to load such adapter neural networks via the [`loaders.py` module](https://github.com/huggingface/diffusers/blob/main/src/diffusers/loaders.py).
23 |
24 | **Note**: This module is still highly experimental and prone to future changes.
25 |
26 | ## LoaderMixins
27 |
28 | ### UNet2DConditionLoadersMixin
29 |
30 | [[autodoc]] loaders.UNet2DConditionLoadersMixin
31 |
--------------------------------------------------------------------------------
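A sketch of loading LoRA attention-processor weights via the mixin; the checkpoint path "sd-pokemon-lora" is hypothetical and is assumed to contain weights saved with save_attn_procs (or a compatible Hub repo):

import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")
pipe.unet.load_attn_procs("sd-pokemon-lora")  # UNet2DConditionLoadersMixin method
image = pipe("a pokemon with blue eyes").images[0]
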
/mustango/diffusers/docs/source/en/api/pipelines/dance_diffusion.mdx:
--------------------------------------------------------------------------------
1 |
12 |
13 | # Dance Diffusion
14 |
15 | ## Overview
16 |
17 | [Dance Diffusion](https://github.com/Harmonai-org/sample-generator) by Zach Evans.
18 |
19 | Dance Diffusion is the first in a suite of generative audio tools for producers and musicians to be released by Harmonai.
20 | For more info or to get involved in the development of these tools, please visit https://harmonai.org and fill out the form on the front page.
21 |
22 | The original codebase of this implementation can be found [here](https://github.com/Harmonai-org/sample-generator).
23 |
24 | ## Available Pipelines:
25 |
26 | | Pipeline | Tasks | Colab
27 | |---|---|:---:|
28 | | [pipeline_dance_diffusion.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py) | *Unconditional Audio Generation* | - |
29 |
30 |
31 | ## DanceDiffusionPipeline
32 | [[autodoc]] DanceDiffusionPipeline
33 | - all
34 | - __call__
35 |
--------------------------------------------------------------------------------
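A usage sketch, assuming the public harmonai/maestro-150k checkpoint on the Hub:

import scipy.io.wavfile
import torch
from diffusers import DanceDiffusionPipeline

pipe = DanceDiffusionPipeline.from_pretrained("harmonai/maestro-150k", torch_dtype=torch.float16)
pipe = pipe.to("cuda")

audio = pipe(audio_length_in_s=4.0).audios[0]  # numpy array of shape (channels, samples)
scipy.io.wavfile.write("dance.wav", pipe.unet.config.sample_rate, audio.transpose())
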
/mustango/diffusers/docs/source/en/api/pipelines/stable_diffusion/depth2img.mdx:
--------------------------------------------------------------------------------
1 |
12 |
13 | # Depth-to-Image Generation
14 |
15 | ## StableDiffusionDepth2ImgPipeline
16 |
17 | The depth-guided stable diffusion model was created by the researchers and engineers from [CompVis](https://github.com/CompVis), [Stability AI](https://stability.ai/), and [LAION](https://laion.ai/), as part of Stable Diffusion 2.0. It uses [MiDaS](https://github.com/isl-org/MiDaS) to infer depth based on an image.
18 |
19 | [`StableDiffusionDepth2ImgPipeline`] lets you pass a text prompt and an initial image to condition the generation of new images as well as a `depth_map` to preserve the images’ structure.
20 |
21 | The original codebase can be found here:
22 | - *Stable Diffusion v2*: [Stability-AI/stablediffusion](https://github.com/Stability-AI/stablediffusion#depth-conditional-stable-diffusion)
23 |
24 | Available Checkpoints are:
25 | - *stable-diffusion-2-depth*: [stabilityai/stable-diffusion-2-depth](https://huggingface.co/stabilityai/stable-diffusion-2-depth)
26 |
27 | [[autodoc]] StableDiffusionDepth2ImgPipeline
28 | - all
29 | - __call__
30 | - enable_attention_slicing
31 | - disable_attention_slicing
32 | - enable_xformers_memory_efficient_attention
33 | - disable_xformers_memory_efficient_attention
--------------------------------------------------------------------------------
/mustango/diffusers/docs/source/en/api/pipelines/stable_diffusion/image_variation.mdx:
--------------------------------------------------------------------------------
1 |
12 |
13 | # Image Variation
14 |
15 | ## StableDiffusionImageVariationPipeline
16 |
17 | [`StableDiffusionImageVariationPipeline`] lets you generate variations from an input image using Stable Diffusion. It uses a fine-tuned version of Stable Diffusion model, trained by [Justin Pinkney](https://www.justinpinkney.com/) (@Buntworthy) at [Lambda](https://lambdalabs.com/).
18 |
19 | The original codebase can be found here:
20 | [Stable Diffusion Image Variations](https://github.com/LambdaLabsML/lambda-diffusers#stable-diffusion-image-variations)
21 |
22 | Available Checkpoints are:
23 | - *sd-image-variations-diffusers*: [lambdalabs/sd-image-variations-diffusers](https://huggingface.co/lambdalabs/sd-image-variations-diffusers)
24 |
25 | [[autodoc]] StableDiffusionImageVariationPipeline
26 | - all
27 | - __call__
28 | - enable_attention_slicing
29 | - disable_attention_slicing
30 | - enable_xformers_memory_efficient_attention
31 | - disable_xformers_memory_efficient_attention
32 |
--------------------------------------------------------------------------------
/mustango/diffusers/docs/source/en/api/pipelines/stable_diffusion/latent_upscale.mdx:
--------------------------------------------------------------------------------
1 |
12 |
13 | # Stable Diffusion Latent Upscaler
14 |
15 | ## StableDiffusionLatentUpscalePipeline
16 |
17 | The Stable Diffusion Latent Upscaler model was created by [Katherine Crowson](https://github.com/crowsonkb/k-diffusion) in collaboration with [Stability AI](https://stability.ai/). It can be used on top of any [`StableDiffusionUpscalePipeline`] checkpoint to enhance its output image resolution by a factor of 2.
18 |
19 | A notebook that demonstrates the original implementation can be found here:
20 | - [Stable Diffusion Upscaler Demo](https://colab.research.google.com/drive/1o1qYJcFeywzCIdkfKJy7cTpgZTCM2EI4)
21 |
22 | Available Checkpoints are:
23 | - *stabilityai/latent-upscaler*: [stabilityai/sd-x2-latent-upscaler](https://huggingface.co/stabilityai/sd-x2-latent-upscaler)
24 |
25 |
26 | [[autodoc]] StableDiffusionLatentUpscalePipeline
27 | - all
28 | - __call__
29 | - enable_sequential_cpu_offload
30 | - enable_attention_slicing
31 | - disable_attention_slicing
32 | - enable_xformers_memory_efficient_attention
33 | - disable_xformers_memory_efficient_attention
--------------------------------------------------------------------------------
/mustango/diffusers/docs/source/en/api/pipelines/stable_diffusion/upscale.mdx:
--------------------------------------------------------------------------------
1 |
12 |
13 | # Super-Resolution
14 |
15 | ## StableDiffusionUpscalePipeline
16 |
17 | The upscaler diffusion model was created by the researchers and engineers from [CompVis](https://github.com/CompVis), [Stability AI](https://stability.ai/), and [LAION](https://laion.ai/), as part of Stable Diffusion 2.0. [`StableDiffusionUpscalePipeline`] can be used to enhance the resolution of input images by a factor of 4.
18 |
19 | The original codebase can be found here:
20 | - *Stable Diffusion v2*: [Stability-AI/stablediffusion](https://github.com/Stability-AI/stablediffusion#image-upscaling-with-stable-diffusion)
21 |
22 | Available Checkpoints are:
23 | - *stabilityai/stable-diffusion-x4-upscaler (x4 resolution)*: [stable-diffusion-x4-upscaler](https://huggingface.co/stabilityai/stable-diffusion-x4-upscaler)
24 |
25 |
26 | [[autodoc]] StableDiffusionUpscalePipeline
27 | - all
28 | - __call__
29 | - enable_attention_slicing
30 | - disable_attention_slicing
31 | - enable_xformers_memory_efficient_attention
32 | - disable_xformers_memory_efficient_attention
--------------------------------------------------------------------------------
/mustango/diffusers/docs/source/en/api/schedulers/ddim_inverse.mdx:
--------------------------------------------------------------------------------
1 |
12 |
13 | # Inverse Denoising Diffusion Implicit Models (DDIMInverse)
14 |
15 | ## Overview
16 |
17 | This scheduler is the inverted scheduler of [Denoising Diffusion Implicit Models](https://arxiv.org/abs/2010.02502) (DDIM) by Jiaming Song, Chenlin Meng and Stefano Ermon.
18 | The implementation is mostly based on the DDIM inversion definition of [Null-text Inversion for Editing Real Images using Guided Diffusion Models](https://arxiv.org/pdf/2211.09794.pdf)
19 |
20 | ## DDIMInverseScheduler
21 | [[autodoc]] DDIMInverseScheduler
22 |
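A minimal sketch of building the inverse scheduler from an existing DDIM configuration (the checkpoint id is illustrative):

```python
from diffusers import DDIMInverseScheduler, DDIMScheduler

# forward DDIM scheduler loaded from a pretrained pipeline repository
scheduler = DDIMScheduler.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="scheduler")

# the inverse scheduler shares the same configuration and runs the deterministic
# DDIM process backwards (image -> noise), the building block of DDIM inversion
inverse_scheduler = DDIMInverseScheduler.from_config(scheduler.config)
```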
--------------------------------------------------------------------------------
/mustango/diffusers/docs/source/en/api/schedulers/deis.mdx:
--------------------------------------------------------------------------------
1 |
12 |
13 | # DEIS
14 |
15 | Fast Sampling of Diffusion Models with Exponential Integrator.
16 |
17 | ## Overview
18 |
19 | Original paper can be found [here](https://arxiv.org/abs/2204.13902). The original implementation can be found [here](https://github.com/qsh-zh/deis).
20 |
21 | ## DEISMultistepScheduler
22 | [[autodoc]] DEISMultistepScheduler
23 |
--------------------------------------------------------------------------------
/mustango/diffusers/docs/source/en/api/schedulers/dpm_discrete.mdx:
--------------------------------------------------------------------------------
1 |
12 |
13 | # DPM Discrete Scheduler inspired by the Karras et al. paper
14 |
15 | ## Overview
16 |
17 | Inspired by [Karras et al.](https://arxiv.org/abs/2206.00364). Scheduler ported from @crowsonkb's https://github.com/crowsonkb/k-diffusion library.
18 |
19 | All credit for making this scheduler work goes to [Katherine Crowson](https://github.com/crowsonkb/).
20 |
21 | ## KDPM2DiscreteScheduler
22 | [[autodoc]] KDPM2DiscreteScheduler
--------------------------------------------------------------------------------
/mustango/diffusers/docs/source/en/api/schedulers/dpm_discrete_ancestral.mdx:
--------------------------------------------------------------------------------
1 |
12 |
13 | # DPM Discrete Scheduler with ancestral sampling inspired by the Karras et al. paper
14 |
15 | ## Overview
16 |
17 | Inspired by [Karras et al.](https://arxiv.org/abs/2206.00364). Scheduler ported from @crowsonkb's https://github.com/crowsonkb/k-diffusion library.
18 |
19 | All credit for making this scheduler work goes to [Katherine Crowson](https://github.com/crowsonkb/).
20 |
21 | ## KDPM2AncestralDiscreteScheduler
22 | [[autodoc]] KDPM2AncestralDiscreteScheduler
--------------------------------------------------------------------------------
/mustango/diffusers/docs/source/en/api/schedulers/euler.mdx:
--------------------------------------------------------------------------------
1 |
12 |
13 | # Euler scheduler
14 |
15 | ## Overview
16 |
17 | Euler scheduler (Algorithm 2) from the paper [Elucidating the Design Space of Diffusion-Based Generative Models](https://arxiv.org/abs/2206.00364) by Karras et al. (2022). Based on the original [k-diffusion](https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L51) implementation by Katherine Crowson.
18 | A fast scheduler that can often generate good outputs in 20-30 steps.
19 |
20 | ## EulerDiscreteScheduler
21 | [[autodoc]] EulerDiscreteScheduler
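A minimal sketch of swapping this scheduler into an existing pipeline (the checkpoint id is illustrative):

```python
from diffusers import DiffusionPipeline, EulerDiscreteScheduler

pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")

# reuse the checkpoint's scheduler configuration so the noise schedule matches
pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config)
pipe = pipe.to("cuda")

# 20-30 steps are usually enough with this scheduler
image = pipe("a fantasy landscape", num_inference_steps=30).images[0]
```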
--------------------------------------------------------------------------------
/mustango/diffusers/docs/source/en/api/schedulers/euler_ancestral.mdx:
--------------------------------------------------------------------------------
1 |
12 |
13 | # Euler Ancestral scheduler
14 |
15 | ## Overview
16 |
17 | Ancestral sampling with Euler method steps. Based on the original [k-diffusion](https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L72) implementation by Katherine Crowson.
18 | A fast scheduler that can often generate good outputs in 20-30 steps.
19 |
20 | ## EulerAncestralDiscreteScheduler
21 | [[autodoc]] EulerAncestralDiscreteScheduler
22 |
--------------------------------------------------------------------------------
/mustango/diffusers/docs/source/en/api/schedulers/heun.mdx:
--------------------------------------------------------------------------------
1 |
12 |
13 | # Heun scheduler inspired by the Karras et al. paper
14 |
15 | ## Overview
16 |
17 | Algorithm 1 of [Karras et al.](https://arxiv.org/abs/2206.00364).
18 | Scheduler ported from @crowsonkb's https://github.com/crowsonkb/k-diffusion library.
19 |
20 | All credit for making this scheduler work goes to [Katherine Crowson](https://github.com/crowsonkb/).
21 |
22 | ## HeunDiscreteScheduler
23 | [[autodoc]] HeunDiscreteScheduler
--------------------------------------------------------------------------------
/mustango/diffusers/docs/source/en/api/schedulers/ipndm.mdx:
--------------------------------------------------------------------------------
1 |
12 |
13 | # Improved pseudo numerical methods for diffusion models (iPNDM)
14 |
15 | ## Overview
16 |
17 | Original implementation can be found [here](https://github.com/crowsonkb/v-diffusion-pytorch/blob/987f8985e38208345c1959b0ea767a625831cc9b/diffusion/sampling.py#L296).
18 |
19 | ## IPNDMScheduler
20 | [[autodoc]] IPNDMScheduler
--------------------------------------------------------------------------------
/mustango/diffusers/docs/source/en/api/schedulers/lms_discrete.mdx:
--------------------------------------------------------------------------------
1 |
12 |
13 | # Linear multistep scheduler for discrete beta schedules
14 |
15 | ## Overview
16 |
17 | The original paper can be found [here](https://arxiv.org/abs/2206.00364).
18 |
19 | ## LMSDiscreteScheduler
20 | [[autodoc]] LMSDiscreteScheduler
--------------------------------------------------------------------------------
/mustango/diffusers/docs/source/en/api/schedulers/multistep_dpm_solver.mdx:
--------------------------------------------------------------------------------
1 |
12 |
13 | # Multistep DPM-Solver
14 |
15 | ## Overview
16 |
17 | Original paper can be found [here](https://arxiv.org/abs/2206.00927) and the [improved version](https://arxiv.org/abs/2211.01095). The original implementation can be found [here](https://github.com/LuChengTHU/dpm-solver).
18 |
19 | ## DPMSolverMultistepScheduler
20 | [[autodoc]] DPMSolverMultistepScheduler
--------------------------------------------------------------------------------
/mustango/diffusers/docs/source/en/api/schedulers/pndm.mdx:
--------------------------------------------------------------------------------
1 |
12 |
13 | # Pseudo numerical methods for diffusion models (PNDM)
14 |
15 | ## Overview
16 |
17 | Original implementation can be found [here](https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L181).
18 |
19 | ## PNDMScheduler
20 | [[autodoc]] PNDMScheduler
--------------------------------------------------------------------------------
/mustango/diffusers/docs/source/en/api/schedulers/repaint.mdx:
--------------------------------------------------------------------------------
1 |
12 |
13 | # RePaint scheduler
14 |
15 | ## Overview
16 |
17 | DDPM-based inpainting scheduler for unsupervised inpainting with extreme masks.
18 | Intended for use with [`RePaintPipeline`].
19 | Based on the paper [RePaint: Inpainting using Denoising Diffusion Probabilistic Models](https://arxiv.org/abs/2201.09865)
20 | and the original implementation by Andreas Lugmayr et al.: https://github.com/andreas128/RePaint
21 |
22 | ## RePaintScheduler
23 | [[autodoc]] RePaintScheduler
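A minimal sketch of pairing the scheduler with [`RePaintPipeline`] (the checkpoint id and image paths are illustrative):

```python
import torch
from PIL import Image

from diffusers import RePaintPipeline, RePaintScheduler

scheduler = RePaintScheduler.from_pretrained("google/ddpm-ema-celebahq-256")
pipe = RePaintPipeline.from_pretrained("google/ddpm-ema-celebahq-256", scheduler=scheduler).to("cuda")

# the original image and the mask must match the model's sample size
original_image = Image.open("celeba_hq_256.png").convert("RGB")
mask_image = Image.open("mask_256.png").convert("RGB")

generator = torch.Generator(device="cuda").manual_seed(0)
output = pipe(
    image=original_image,
    mask_image=mask_image,    # masked-out regions are regenerated by the model
    num_inference_steps=250,
    eta=0.0,
    jump_length=10,
    jump_n_sample=10,
    generator=generator,
)
inpainted_image = output.images[0]
```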
--------------------------------------------------------------------------------
/mustango/diffusers/docs/source/en/api/schedulers/score_sde_ve.mdx:
--------------------------------------------------------------------------------
1 |
12 |
13 | # Variance Exploding Stochastic Differential Equation (VE-SDE) scheduler
14 |
15 | ## Overview
16 |
17 | Original paper can be found [here](https://arxiv.org/abs/2011.13456).
18 |
19 | ## ScoreSdeVeScheduler
20 | [[autodoc]] ScoreSdeVeScheduler
21 |
--------------------------------------------------------------------------------
/mustango/diffusers/docs/source/en/api/schedulers/score_sde_vp.mdx:
--------------------------------------------------------------------------------
1 |
12 |
13 | # Variance Preserving Stochastic Differential Equation (VP-SDE) scheduler
14 |
15 | ## Overview
16 |
17 | Original paper can be found [here](https://arxiv.org/abs/2011.13456).
18 |
19 |
20 |
21 | Score SDE-VP is under construction.
22 |
23 |
24 |
25 | ## ScoreSdeVpScheduler
26 | [[autodoc]] schedulers.scheduling_sde_vp.ScoreSdeVpScheduler
27 |
--------------------------------------------------------------------------------
/mustango/diffusers/docs/source/en/api/schedulers/singlestep_dpm_solver.mdx:
--------------------------------------------------------------------------------
1 |
12 |
13 | # Singlestep DPM-Solver
14 |
15 | ## Overview
16 |
17 | Original paper can be found [here](https://arxiv.org/abs/2206.00927) and the [improved version](https://arxiv.org/abs/2211.01095). The original implementation can be found [here](https://github.com/LuChengTHU/dpm-solver).
18 |
19 | ## DPMSolverSinglestepScheduler
20 | [[autodoc]] DPMSolverSinglestepScheduler
--------------------------------------------------------------------------------
/mustango/diffusers/docs/source/en/api/schedulers/stochastic_karras_ve.mdx:
--------------------------------------------------------------------------------
1 |
12 |
13 | # Variance exploding, stochastic sampling from Karras et al.
14 |
15 | ## Overview
16 |
17 | Original paper can be found [here](https://arxiv.org/abs/2206.00364).
18 |
19 | ## KarrasVeScheduler
20 | [[autodoc]] KarrasVeScheduler
--------------------------------------------------------------------------------
/mustango/diffusers/docs/source/en/api/schedulers/unipc.mdx:
--------------------------------------------------------------------------------
1 |
12 |
13 | # UniPC
14 |
15 | ## Overview
16 |
17 | UniPC is a training-free framework designed for the fast sampling of diffusion models, which consists of a corrector (UniC) and a predictor (UniP) that share a unified analytical form and support arbitrary orders.
18 |
19 | For more details about the method, please refer to the [paper](https://arxiv.org/abs/2302.04867) and the [code](https://github.com/wl-zhao/UniPC).
20 |
21 | Like DEIS, UniPC builds on the exponential-integrator formulation for fast sampling of diffusion models.
22 |
23 | ## UniPCMultistepScheduler
24 | [[autodoc]] UniPCMultistepScheduler
25 |
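A minimal sketch of swapping UniPC into an existing pipeline (the checkpoint id is illustrative):

```python
from diffusers import DiffusionPipeline, UniPCMultistepScheduler

pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")

# reuse the checkpoint's scheduler configuration for the predictor-corrector sampler
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
pipe = pipe.to("cuda")

# UniPC typically needs far fewer steps than ancestral samplers
image = pipe("a photo of an astronaut riding a horse", num_inference_steps=20).images[0]
```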
--------------------------------------------------------------------------------
/mustango/diffusers/docs/source/en/api/schedulers/vq_diffusion.mdx:
--------------------------------------------------------------------------------
1 |
12 |
13 | # VQDiffusionScheduler
14 |
15 | ## Overview
16 |
17 | The original paper can be found [here](https://arxiv.org/abs/2111.14822).
18 |
19 | ## VQDiffusionScheduler
20 | [[autodoc]] VQDiffusionScheduler
--------------------------------------------------------------------------------
/mustango/diffusers/docs/source/en/imgs/access_request.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/declare-lab/tango/310e68504c3f91200313ccc3b94bf19e0941e339/mustango/diffusers/docs/source/en/imgs/access_request.png
--------------------------------------------------------------------------------
/mustango/diffusers/docs/source/en/imgs/diffusers_library.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/declare-lab/tango/310e68504c3f91200313ccc3b94bf19e0941e339/mustango/diffusers/docs/source/en/imgs/diffusers_library.jpg
--------------------------------------------------------------------------------
/mustango/diffusers/docs/source/en/optimization/opt_overview.mdx:
--------------------------------------------------------------------------------
1 |
12 |
13 | # Overview
14 |
15 | Generating high-quality outputs is computationally intensive, especially during each iterative step where you go from a noisy output to a less noisy output. One of 🧨 Diffusers' goals is to make this technology widely accessible to everyone, which includes enabling fast inference on consumer and specialized hardware.
16 |
17 | This section will cover tips and tricks - like half-precision weights and sliced attention - for optimizing inference speed and reducing memory consumption. You can also learn how to speed up your PyTorch code with [`torch.compile`](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) or [ONNX Runtime](https://onnxruntime.ai/docs/), and enable memory-efficient attention with [xFormers](https://facebookresearch.github.io/xformers/). There are also guides for running inference on specific hardware like Apple Silicon, and Intel or Habana processors.
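As a rough sketch of what these optimizations look like in practice (the checkpoint id is illustrative, and xFormers is an optional dependency):

```python
import torch

from diffusers import DiffusionPipeline

# half-precision weights roughly halve memory use on CUDA devices
pipe = DiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

# sliced attention trades a little speed for a much lower peak memory footprint
pipe.enable_attention_slicing()

# memory-efficient attention, only if the optional xformers package is installed
# pipe.enable_xformers_memory_efficient_attention()

image = pipe("a photo of an astronaut riding a horse").images[0]
```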
--------------------------------------------------------------------------------
/mustango/diffusers/docs/source/en/tutorials/tutorial_overview.mdx:
--------------------------------------------------------------------------------
1 |
12 |
13 | # Overview
14 |
15 | Welcome to 🧨 Diffusers! If you're new to diffusion models and generative AI, and want to learn more, then you've come to the right place. These beginner-friendly tutorials are designed to provide a gentle introduction to diffusion models and help you understand the library fundamentals - the core components and how 🧨 Diffusers is meant to be used.
16 |
17 | You'll learn how to use a pipeline for inference to rapidly generate things, and then deconstruct that pipeline to really understand how to use the library as a modular toolbox for building your own diffusion systems. In the next lesson, you'll learn how to train your own diffusion model to generate what you want.
18 |
19 | After completing the tutorials, you'll have gained the necessary skills to start exploring the library on your own and see how to use it for your own projects and applications.
20 |
21 | Feel free to join our community on [Discord](https://discord.com/invite/JfAtkvEtRb) or the [forums](https://discuss.huggingface.co/c/discussion-related-to-httpsgithubcomhuggingfacediffusers/63) to connect and collaborate with other users and developers!
22 |
23 | Let's start diffusing! 🧨
--------------------------------------------------------------------------------
/mustango/diffusers/docs/source/en/using-diffusers/audio.mdx:
--------------------------------------------------------------------------------
1 |
12 |
13 | # Using Diffusers for audio
14 |
15 | [`DanceDiffusionPipeline`] and [`AudioDiffusionPipeline`] can be used to generate
16 | audio rapidly! More coming soon!
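A minimal sketch with [`DanceDiffusionPipeline`] (the checkpoint id is illustrative):

```python
from diffusers import DanceDiffusionPipeline

pipe = DanceDiffusionPipeline.from_pretrained("harmonai/maestro-150k").to("cuda")

# generate ~4 seconds of audio; the output is a batch of raw waveforms as numpy arrays
output = pipe(audio_length_in_s=4.0, num_inference_steps=100)
audio = output.audios[0]
```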
--------------------------------------------------------------------------------
/mustango/diffusers/docs/source/en/using-diffusers/loading_overview.mdx:
--------------------------------------------------------------------------------
1 |
12 |
13 | # Overview
14 |
15 | 🧨 Diffusers offers many pipelines, models, and schedulers for generative tasks. To make loading these components as simple as possible, we provide a single and unified method - `from_pretrained()` - that loads any of these components from either the Hugging Face [Hub](https://huggingface.co/models?library=diffusers&sort=downloads) or your local machine. Whenever you load a pipeline or model, the latest files are automatically downloaded and cached so you can quickly reuse them next time without redownloading the files.
16 |
17 | This section will show you everything you need to know about loading pipelines, how to load different components in a pipeline, how to load checkpoint variants, and how to load community pipelines. You'll also learn how to load schedulers and compare the speed and quality trade-offs of using different schedulers. Finally, you'll see how to convert and load KerasCV checkpoints so you can use them in PyTorch with 🧨 Diffusers.
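For example, a full pipeline, a single model, and a scheduler are all loaded with the same pattern (the checkpoint id is illustrative):

```python
from diffusers import DDIMScheduler, DiffusionPipeline, UNet2DConditionModel

repo_id = "runwayml/stable-diffusion-v1-5"

# a complete pipeline with all of its components
pipe = DiffusionPipeline.from_pretrained(repo_id)

# individual components, loaded from their subfolders in the same repository
unet = UNet2DConditionModel.from_pretrained(repo_id, subfolder="unet")
scheduler = DDIMScheduler.from_pretrained(repo_id, subfolder="scheduler")
```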
--------------------------------------------------------------------------------
/mustango/diffusers/docs/source/en/using-diffusers/other-modalities.mdx:
--------------------------------------------------------------------------------
1 |
12 |
13 | # Using Diffusers with other modalities
14 |
15 | Diffusers is in the process of expanding to modalities other than images.
16 |
17 | Example type | Colab | Pipeline |
18 | :-------------------------:|:-------------------------:|:-------------------------:|
19 | [Molecule conformation](https://www.nature.com/subjects/molecular-conformation#:~:text=Definition,to%20changes%20in%20their%20environment.) generation | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/geodiff_molecule_conformation.ipynb) | ❌
20 |
21 | More coming soon!
--------------------------------------------------------------------------------
/mustango/diffusers/docs/source/en/using-diffusers/pipeline_overview.mdx:
--------------------------------------------------------------------------------
1 |
12 |
13 | # Overview
14 |
15 | A pipeline is an end-to-end class that provides a quick and easy way to use a diffusion system for inference by bundling independently trained models and schedulers together. Certain combinations of models and schedulers define specific pipeline types, like [`StableDiffusionPipeline`] or [`StableDiffusionControlNetPipeline`], with specific capabilities. All pipeline types inherit from the base [`DiffusionPipeline`] class; pass it any checkpoint, and it'll automatically detect the pipeline type and load the necessary components.
16 |
17 | This section introduces you to some of the tasks supported by our pipelines, such as unconditional image generation and different techniques and variations of text-to-image generation. You'll also learn how to gain more control over the generation process by setting a seed for reproducibility and weighting prompts to adjust the influence certain words in the prompt have over the output. Finally, you'll see how you can create a community pipeline for a custom task like generating images from speech.
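For instance, passing a seeded `torch.Generator` to a pipeline makes the output reproducible (the checkpoint id is illustrative):

```python
import torch

from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5").to("cuda")

# the same seed produces the same image on repeated calls
generator = torch.Generator(device="cuda").manual_seed(42)
image = pipe("a photo of a corgi wearing a top hat", generator=generator).images[0]
```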
--------------------------------------------------------------------------------
/mustango/diffusers/docs/source/en/using-diffusers/rl.mdx:
--------------------------------------------------------------------------------
1 |
12 |
13 | # Using Diffusers for reinforcement learning
14 |
15 | Support for one RL model and related pipelines is included in the `experimental` source of diffusers.
16 | More models and examples coming soon!
17 |
18 | # Diffuser Value-guided Planning
19 |
20 | You can run the model from [*Planning with Diffusion for Flexible Behavior Synthesis*](https://arxiv.org/abs/2205.09991) with Diffusers.
21 | The script is located in the [RL Examples](https://github.com/huggingface/diffusers/tree/main/examples/rl) folder.
22 |
23 | Or, run this example in Colab [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/reinforcement_learning_with_diffusers.ipynb)
24 |
25 | [[autodoc]] diffusers.experimental.ValueGuidedRLPipeline
--------------------------------------------------------------------------------
/mustango/diffusers/docs/source/en/using-diffusers/using_safetensors:
--------------------------------------------------------------------------------
1 | # What is safetensors ?
2 |
3 | [safetensors](https://github.com/huggingface/safetensors) is a different format
4 | from the classic `.bin` files used by PyTorch, which rely on pickle.
5 |
6 | Pickle is notoriously unsafe: a malicious file can execute arbitrary code when it is loaded.
7 | The Hub itself tries to prevent such issues, but it's not a silver bullet.
8 |
9 | The first and foremost goal of `safetensors` is to make loading machine learning models *safe*,
10 | in the sense that loading a file can never take over your computer.
11 |
12 | # Why use safetensors ?
13 |
14 | **Safety** is one reason: you might be using a model that isn't well known, and
15 | you're not sure about the source of the file.
16 |
17 | A secondary reason is **loading speed**. safetensors can load models much faster
18 | than regular pickle files. If you spend a lot of time switching models, this can be
19 | a huge time saver.
20 |
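A small sketch of the format itself, using the `safetensors` library directly (the tensors here are made up):

```python
import torch
from safetensors.torch import load_file, save_file

# round-trip a state dict through the safetensors format;
# unlike pickle-based `.bin` files, loading cannot execute arbitrary code
state_dict = {"weight": torch.randn(4, 4), "bias": torch.zeros(4)}
save_file(state_dict, "model.safetensors")
restored = load_file("model.safetensors")
```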
--------------------------------------------------------------------------------
/mustango/diffusers/docs/source/ko/in_translation.mdx:
--------------------------------------------------------------------------------
1 |
12 |
13 | # Translation in progress
14 |
15 | We are working hard on the translation. Please wait just a little longer.
16 | Thank you!
--------------------------------------------------------------------------------
/mustango/diffusers/examples/community/one_step_unet.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import torch
3 |
4 | from diffusers import DiffusionPipeline
5 |
6 |
7 | class UnetSchedulerOneForwardPipeline(DiffusionPipeline):
8 | def __init__(self, unet, scheduler):
9 | super().__init__()
10 |
11 | self.register_modules(unet=unet, scheduler=scheduler)
12 |
13 | def __call__(self):
14 | image = torch.randn(
15 | (1, self.unet.in_channels, self.unet.sample_size, self.unet.sample_size),
16 | )
17 | timestep = 1
18 |
19 | model_output = self.unet(image, timestep).sample
20 | scheduler_output = self.scheduler.step(model_output, timestep, image).prev_sample
21 |
22 | result = scheduler_output - scheduler_output + torch.ones_like(scheduler_output)
23 |
24 | return result
25 |
--------------------------------------------------------------------------------
/mustango/diffusers/examples/conftest.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # tests directory-specific settings - this file is run automatically
16 | # by pytest before any tests are run
17 |
18 | import sys
19 | import warnings
20 | from os.path import abspath, dirname, join
21 |
22 |
23 | # allow having multiple repository checkouts and not needing to remember to rerun
24 | # 'pip install -e .[dev]' when switching between checkouts and running tests.
25 | git_repo_path = abspath(join(dirname(dirname(dirname(__file__))), "src"))
26 | sys.path.insert(1, git_repo_path)
27 |
28 |
29 | # silence FutureWarning warnings in tests since often we can't act on them until
30 | # they become normal warnings - i.e. the tests still need to test the current functionality
31 | warnings.simplefilter(action="ignore", category=FutureWarning)
32 |
33 |
34 | def pytest_addoption(parser):
35 | from diffusers.utils.testing_utils import pytest_addoption_shared
36 |
37 | pytest_addoption_shared(parser)
38 |
39 |
40 | def pytest_terminal_summary(terminalreporter):
41 | from diffusers.utils.testing_utils import pytest_terminal_summary_main
42 |
43 | make_reports = terminalreporter.config.getoption("--make-reports")
44 | if make_reports:
45 | pytest_terminal_summary_main(terminalreporter, id=make_reports)
46 |
--------------------------------------------------------------------------------
/mustango/diffusers/examples/controlnet/requirements.txt:
--------------------------------------------------------------------------------
1 | accelerate
2 | torchvision
3 | transformers>=4.25.1
4 | ftfy
5 | tensorboard
6 | datasets
7 |
--------------------------------------------------------------------------------
/mustango/diffusers/examples/controlnet/requirements_flax.txt:
--------------------------------------------------------------------------------
1 | transformers>=4.25.1
2 | datasets
3 | flax
4 | optax
5 | torch
6 | torchvision
7 | ftfy
8 | tensorboard
9 | Jinja2
10 |
--------------------------------------------------------------------------------
/mustango/diffusers/examples/dreambooth/requirements.txt:
--------------------------------------------------------------------------------
1 | accelerate
2 | torchvision
3 | transformers>=4.25.1
4 | ftfy
5 | tensorboard
6 | Jinja2
7 |
--------------------------------------------------------------------------------
/mustango/diffusers/examples/dreambooth/requirements_flax.txt:
--------------------------------------------------------------------------------
1 | transformers>=4.25.1
2 | flax
3 | optax
4 | torch
5 | torchvision
6 | ftfy
7 | tensorboard
8 | Jinja2
9 |
--------------------------------------------------------------------------------
/mustango/diffusers/examples/inference/README.md:
--------------------------------------------------------------------------------
1 | # Inference Examples
2 |
3 | **The inference examples folder is deprecated and will be removed in a future version**.
4 | **Officially supported inference examples can be found in the [Pipelines folder](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines)**.
5 |
6 | - For `Image-to-Image text-guided generation with Stable Diffusion`, please have a look at the official [Pipeline examples](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines#examples)
7 | - For `In-painting using Stable Diffusion`, please have a look at the official [Pipeline examples](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines#examples)
8 | - For `Tweak prompts reusing seeds and latents`, please have a look at the official [Pipeline examples](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines#examples)
9 |
--------------------------------------------------------------------------------
/mustango/diffusers/examples/inference/image_to_image.py:
--------------------------------------------------------------------------------
1 | import warnings
2 |
3 | from diffusers import StableDiffusionImg2ImgPipeline # noqa F401
4 |
5 |
6 | warnings.warn(
7 | "The `image_to_image.py` script is outdated. Please use directly `from diffusers import"
8 | " StableDiffusionImg2ImgPipeline` instead."
9 | )
10 |
--------------------------------------------------------------------------------
/mustango/diffusers/examples/inference/inpainting.py:
--------------------------------------------------------------------------------
1 | import warnings
2 |
3 | from diffusers import StableDiffusionInpaintPipeline as StableDiffusionInpaintPipeline # noqa F401
4 |
5 |
6 | warnings.warn(
7 | "The `inpainting.py` script is outdated. Please use directly `from diffusers import"
8 | " StableDiffusionInpaintPipeline` instead."
9 | )
10 |
--------------------------------------------------------------------------------
/mustango/diffusers/examples/instruct_pix2pix/requirements.txt:
--------------------------------------------------------------------------------
1 | accelerate
2 | torchvision
3 | transformers>=4.25.1
4 | datasets
5 | ftfy
6 | tensorboard
--------------------------------------------------------------------------------
/mustango/diffusers/examples/research_projects/README.md:
--------------------------------------------------------------------------------
1 | # Research projects
2 |
3 | This folder contains various research projects using 🧨 Diffusers.
4 | They are not really maintained by the core maintainers of this library and often require a specific version of Diffusers that is indicated in the requirements file of each folder.
5 | Updating them to the most recent version of the library will require some work.
6 |
7 | To use any of them, just run the command
8 |
9 | ```
10 | pip install -r requirements.txt
11 | ```
12 | inside the folder of your choice.
13 |
14 | If you need help with any of those, please open an issue where you directly ping the author(s), as indicated at the top of the README of each folder.
15 |
--------------------------------------------------------------------------------
/mustango/diffusers/examples/research_projects/colossalai/inference.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | from diffusers import StableDiffusionPipeline
4 |
5 |
6 | model_id = "path-to-your-trained-model"
7 | pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")
8 |
9 | prompt = "A photo of sks dog in a bucket"
10 | image = pipe(prompt, num_inference_steps=50, guidance_scale=7.5).images[0]
11 |
12 | image.save("dog-bucket.png")
13 |
--------------------------------------------------------------------------------
/mustango/diffusers/examples/research_projects/colossalai/requirement.txt:
--------------------------------------------------------------------------------
1 | diffusers
2 | torch
3 | torchvision
4 | ftfy
5 | tensorboard
6 | Jinja2
7 | transformers
--------------------------------------------------------------------------------
/mustango/diffusers/examples/research_projects/dreambooth_inpaint/requirements.txt:
--------------------------------------------------------------------------------
1 | diffusers==0.9.0
2 | accelerate
3 | torchvision
4 | transformers>=4.21.0
5 | ftfy
6 | tensorboard
7 | Jinja2
8 |
--------------------------------------------------------------------------------
/mustango/diffusers/examples/research_projects/intel_opts/textual_inversion/requirements.txt:
--------------------------------------------------------------------------------
1 | accelerate
2 | torchvision
3 | transformers>=4.21.0
4 | ftfy
5 | tensorboard
6 | Jinja2
7 | intel_extension_for_pytorch>=1.13
8 |
--------------------------------------------------------------------------------
/mustango/diffusers/examples/research_projects/lora/requirements.txt:
--------------------------------------------------------------------------------
1 | accelerate
2 | torchvision
3 | transformers>=4.25.1
4 | datasets
5 | ftfy
6 | tensorboard
7 | Jinja2
8 | git+https://github.com/huggingface/peft.git
--------------------------------------------------------------------------------
/mustango/diffusers/examples/research_projects/mulit_token_textual_inversion/requirements.txt:
--------------------------------------------------------------------------------
1 | accelerate
2 | torchvision
3 | transformers>=4.25.1
4 | ftfy
5 | tensorboard
6 | Jinja2
7 |
--------------------------------------------------------------------------------
/mustango/diffusers/examples/research_projects/mulit_token_textual_inversion/requirements_flax.txt:
--------------------------------------------------------------------------------
1 | transformers>=4.25.1
2 | flax
3 | optax
4 | torch
5 | torchvision
6 | ftfy
7 | tensorboard
8 | Jinja2
9 |
--------------------------------------------------------------------------------
/mustango/diffusers/examples/research_projects/multi_subject_dreambooth/requirements.txt:
--------------------------------------------------------------------------------
1 | accelerate
2 | torchvision
3 | transformers>=4.25.1
4 | ftfy
5 | tensorboard
6 | Jinja2
--------------------------------------------------------------------------------
/mustango/diffusers/examples/research_projects/onnxruntime/README.md:
--------------------------------------------------------------------------------
1 | ## Diffusers examples with ONNXRuntime optimizations
2 |
3 | **This research project is not actively maintained by the diffusers team. For any questions or comments, please contact Prathik Rao (prathikr), Sunghoon Choi (hanbitmyths), Ashwini Khade (askhade), or Peng Wang (pengwa) on GitHub.**
4 |
5 | This aims to provide diffusers examples with ONNXRuntime optimizations for training/fine-tuning unconditional image generation, text to image, and textual inversion. Please see individual directories for more details on how to run each task using ONNXRuntime.
--------------------------------------------------------------------------------
/mustango/diffusers/examples/research_projects/onnxruntime/text_to_image/requirements.txt:
--------------------------------------------------------------------------------
1 | accelerate
2 | torchvision
3 | transformers>=4.25.1
4 | datasets
5 | ftfy
6 | tensorboard
7 | modelcards
8 |
--------------------------------------------------------------------------------
/mustango/diffusers/examples/research_projects/onnxruntime/textual_inversion/requirements.txt:
--------------------------------------------------------------------------------
1 | accelerate
2 | torchvision
3 | transformers>=4.25.1
4 | ftfy
5 | tensorboard
6 | modelcards
7 |
--------------------------------------------------------------------------------
/mustango/diffusers/examples/research_projects/onnxruntime/unconditional_image_generation/README.md:
--------------------------------------------------------------------------------
1 | ## Training examples
2 |
3 | Creating a training image set is [described in a different document](https://huggingface.co/docs/datasets/image_process#image-datasets).
4 |
5 | ### Installing the dependencies
6 |
7 | Before running the scripts, make sure to install the library's training dependencies:
8 |
9 | **Important**
10 |
11 | To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment:
12 | ```bash
13 | git clone https://github.com/huggingface/diffusers
14 | cd diffusers
15 | pip install .
16 | ```
17 |
18 | Then cd into the example folder and run
19 | ```bash
20 | pip install -r requirements.txt
21 | ```
22 |
23 |
24 | And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:
25 |
26 | ```bash
27 | accelerate config
28 | ```
29 |
30 | #### Use ONNXRuntime to accelerate training
31 |
32 | To leverage ONNX Runtime to accelerate training, please use `train_unconditional_ort.py`.
33 |
34 | The command to train a DDPM UNet model on the Oxford Flowers dataset with onnxruntime:
35 |
36 | ```bash
37 | accelerate launch train_unconditional_ort.py \
38 | --dataset_name="huggan/flowers-102-categories" \
39 | --resolution=64 --center_crop --random_flip \
40 | --output_dir="ddpm-ema-flowers-64" \
41 | --use_ema \
42 | --train_batch_size=16 \
43 | --num_epochs=1 \
44 | --gradient_accumulation_steps=1 \
45 | --learning_rate=1e-4 \
46 | --lr_warmup_steps=500 \
47 | --mixed_precision=fp16
48 | ```
49 |
50 | Please contact Prathik Rao (prathikr), Sunghoon Choi (hanbitmyths), Ashwini Khade (askhade), or Peng Wang (pengwa) on github with any questions.
51 |
--------------------------------------------------------------------------------
/mustango/diffusers/examples/research_projects/onnxruntime/unconditional_image_generation/requirements.txt:
--------------------------------------------------------------------------------
1 | accelerate
2 | torchvision
3 | datasets
4 |
--------------------------------------------------------------------------------
/mustango/diffusers/examples/rl/README.md:
--------------------------------------------------------------------------------
1 | # Overview
2 |
3 | These examples show how to run [Diffuser](https://arxiv.org/abs/2205.09991) in Diffusers.
4 | There are two ways to use the script `run_diffuser_locomotion.py`.
5 |
6 | The key option is the variable `n_guide_steps`:
7 | When `n_guide_steps=0`, the trajectories are sampled from the diffusion model, but not fine-tuned to maximize reward in the environment.
8 | By default, `n_guide_steps=2` to match the original implementation.
9 |
10 |
11 | You will need some RL specific requirements to run the examples:
12 |
13 | ```
14 | pip install -f https://download.pytorch.org/whl/torch_stable.html \
15 | free-mujoco-py \
16 | einops \
17 | gym==0.24.1 \
18 | protobuf==3.20.1 \
19 | git+https://github.com/rail-berkeley/d4rl.git \
20 | mediapy \
21 | Pillow==9.0.0
22 | ```
23 |
--------------------------------------------------------------------------------
/mustango/diffusers/examples/rl/run_diffuser_locomotion.py:
--------------------------------------------------------------------------------
1 | import d4rl # noqa
2 | import gym
3 | import tqdm
4 | from diffusers.experimental import ValueGuidedRLPipeline
5 |
6 |
7 | config = {
8 | "n_samples": 64,
9 | "horizon": 32,
10 | "num_inference_steps": 20,
11 | "n_guide_steps": 2, # can set to 0 for faster sampling, does not use value network
12 | "scale_grad_by_std": True,
13 | "scale": 0.1,
14 | "eta": 0.0,
15 | "t_grad_cutoff": 2,
16 | "device": "cpu",
17 | }
18 |
19 |
20 | if __name__ == "__main__":
21 | env_name = "hopper-medium-v2"
22 | env = gym.make(env_name)
23 |
24 | pipeline = ValueGuidedRLPipeline.from_pretrained(
25 | "bglick13/hopper-medium-v2-value-function-hor32",
26 | env=env,
27 | )
28 |
29 | env.seed(0)
30 | obs = env.reset()
31 | total_reward = 0
32 | total_score = 0
33 | T = 1000
34 | rollout = [obs.copy()]
35 | try:
36 | for t in tqdm.tqdm(range(T)):
37 | # call the policy
38 | denorm_actions = pipeline(obs, planning_horizon=32)
39 |
40 | # execute action in environment
41 | next_observation, reward, terminal, _ = env.step(denorm_actions)
42 | score = env.get_normalized_score(total_reward)
43 |
44 | # update return
45 | total_reward += reward
46 | total_score += score
47 | print(
48 | f"Step: {t}, Reward: {reward}, Total Reward: {total_reward}, Score: {score}, Total Score:"
49 | f" {total_score}"
50 | )
51 |
52 | # save observations for rendering
53 | rollout.append(next_observation.copy())
54 |
55 | obs = next_observation
56 | except KeyboardInterrupt:
57 | pass
58 |
59 | print(f"Total reward: {total_reward}")
60 |
--------------------------------------------------------------------------------
/mustango/diffusers/examples/text_to_image/requirements.txt:
--------------------------------------------------------------------------------
1 | accelerate
2 | torchvision
3 | transformers>=4.25.1
4 | datasets
5 | ftfy
6 | tensorboard
7 | Jinja2
8 |
--------------------------------------------------------------------------------
/mustango/diffusers/examples/text_to_image/requirements_flax.txt:
--------------------------------------------------------------------------------
1 | transformers>=4.25.1
2 | datasets
3 | flax
4 | optax
5 | torch
6 | torchvision
7 | ftfy
8 | tensorboard
9 | Jinja2
10 |
--------------------------------------------------------------------------------
/mustango/diffusers/examples/textual_inversion/requirements.txt:
--------------------------------------------------------------------------------
1 | accelerate
2 | torchvision
3 | transformers>=4.25.1
4 | ftfy
5 | tensorboard
6 | Jinja2
7 |
--------------------------------------------------------------------------------
/mustango/diffusers/examples/textual_inversion/requirements_flax.txt:
--------------------------------------------------------------------------------
1 | transformers>=4.25.1
2 | flax
3 | optax
4 | torch
5 | torchvision
6 | ftfy
7 | tensorboard
8 | Jinja2
9 |
--------------------------------------------------------------------------------
/mustango/diffusers/examples/unconditional_image_generation/requirements.txt:
--------------------------------------------------------------------------------
1 | accelerate
2 | torchvision
3 | datasets
4 |
--------------------------------------------------------------------------------
/mustango/diffusers/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.black]
2 | line-length = 119
3 | target-version = ['py37']
4 |
5 | [tool.ruff]
6 | # Never enforce `E501` (line length violations).
7 | ignore = ["C901", "E501", "E741", "W605"]
8 | select = ["C", "E", "F", "I", "W"]
9 | line-length = 119
10 |
11 | # Ignore import violations in all `__init__.py` files.
12 | [tool.ruff.per-file-ignores]
13 | "__init__.py" = ["E402", "F401", "F403", "F811"]
14 | "src/diffusers/utils/dummy_*.py" = ["F401"]
15 |
16 | [tool.ruff.isort]
17 | lines-after-imports = 2
18 | known-first-party = ["diffusers"]
19 |
--------------------------------------------------------------------------------
/mustango/diffusers/scripts/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/declare-lab/tango/310e68504c3f91200313ccc3b94bf19e0941e339/mustango/diffusers/scripts/__init__.py
--------------------------------------------------------------------------------
/mustango/diffusers/scripts/convert_unclip_txt2img_to_image_variation.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
4 |
5 | from diffusers import UnCLIPImageVariationPipeline, UnCLIPPipeline
6 |
7 |
8 | if __name__ == "__main__":
9 | parser = argparse.ArgumentParser()
10 |
11 | parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.")
12 |
13 | parser.add_argument(
14 | "--txt2img_unclip",
15 | default="kakaobrain/karlo-v1-alpha",
16 | type=str,
17 | required=False,
18 | help="The pretrained txt2img unclip.",
19 | )
20 |
21 | args = parser.parse_args()
22 |
23 | txt2img = UnCLIPPipeline.from_pretrained(args.txt2img_unclip)
24 |
25 | feature_extractor = CLIPImageProcessor()
26 | image_encoder = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-large-patch14")
27 |
28 | img2img = UnCLIPImageVariationPipeline(
29 | decoder=txt2img.decoder,
30 | text_encoder=txt2img.text_encoder,
31 | tokenizer=txt2img.tokenizer,
32 | text_proj=txt2img.text_proj,
33 | feature_extractor=feature_extractor,
34 | image_encoder=image_encoder,
35 | super_res_first=txt2img.super_res_first,
36 | super_res_last=txt2img.super_res_last,
37 | decoder_scheduler=txt2img.decoder_scheduler,
38 | super_res_scheduler=txt2img.super_res_scheduler,
39 | )
40 |
41 | img2img.save_pretrained(args.dump_path)
42 |
--------------------------------------------------------------------------------
/mustango/diffusers/setup.cfg:
--------------------------------------------------------------------------------
1 | [isort]
2 | default_section = FIRSTPARTY
3 | ensure_newline_before_comments = True
4 | force_grid_wrap = 0
5 | include_trailing_comma = True
6 | known_first_party = accelerate
7 | known_third_party =
8 | numpy
9 | torch
10 | torch_xla
11 |
12 | line_length = 119
13 | lines_after_imports = 2
14 | multi_line_output = 3
15 | use_parentheses = True
16 |
17 | [flake8]
18 | ignore = E203, E722, E501, E741, W503, W605
19 | max-line-length = 119
20 | per-file-ignores = __init__.py:F401
21 |
--------------------------------------------------------------------------------
/mustango/diffusers/src/diffusers.egg-info/dependency_links.txt:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/mustango/diffusers/src/diffusers.egg-info/entry_points.txt:
--------------------------------------------------------------------------------
1 | [console_scripts]
2 | diffusers-cli = diffusers.commands.diffusers_cli:main
3 |
--------------------------------------------------------------------------------
/mustango/diffusers/src/diffusers.egg-info/requires.txt:
--------------------------------------------------------------------------------
1 | importlib_metadata
2 | filelock
3 | huggingface-hub>=0.13.2
4 | numpy
5 | regex!=2019.12.17
6 | requests
7 | Pillow
8 |
9 | [dev]
10 | black~=23.1
11 | isort>=5.5.4
12 | ruff>=0.0.241
13 | hf-doc-builder>=0.3.0
14 | compel==0.1.8
15 | datasets
16 | Jinja2
17 | k-diffusion>=0.0.12
18 | librosa
19 | note-seq
20 | parameterized
21 | pytest
22 | pytest-timeout
23 | pytest-xdist
24 | requests-mock==1.10.0
25 | safetensors
26 | sentencepiece!=0.1.92,>=0.1.91
27 | scipy
28 | torchvision
29 | transformers>=4.25.1
30 | accelerate>=0.11.0
31 | protobuf<4,>=3.20.3
32 | tensorboard
33 | torch>=1.4
34 | jax!=0.3.2,>=0.2.8
35 | jaxlib>=0.1.65
36 | flax>=0.4.1
37 |
38 | [docs]
39 | hf-doc-builder>=0.3.0
40 |
41 | [flax]
42 | jax!=0.3.2,>=0.2.8
43 | jaxlib>=0.1.65
44 | flax>=0.4.1
45 |
46 | [quality]
47 | black~=23.1
48 | isort>=5.5.4
49 | ruff>=0.0.241
50 | hf-doc-builder>=0.3.0
51 |
52 | [test]
53 | compel==0.1.8
54 | datasets
55 | Jinja2
56 | k-diffusion>=0.0.12
57 | librosa
58 | note-seq
59 | parameterized
60 | pytest
61 | pytest-timeout
62 | pytest-xdist
63 | requests-mock==1.10.0
64 | safetensors
65 | sentencepiece!=0.1.92,>=0.1.91
66 | scipy
67 | torchvision
68 | transformers>=4.25.1
69 |
70 | [torch]
71 | torch>=1.4
72 | accelerate>=0.11.0
73 |
74 | [training]
75 | accelerate>=0.11.0
76 | datasets
77 | protobuf<4,>=3.20.3
78 | tensorboard
79 | Jinja2
80 |
--------------------------------------------------------------------------------
/mustango/diffusers/src/diffusers.egg-info/top_level.txt:
--------------------------------------------------------------------------------
1 | diffusers
2 |
--------------------------------------------------------------------------------
/mustango/diffusers/src/diffusers/commands/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from abc import ABC, abstractmethod
16 | from argparse import ArgumentParser
17 |
18 |
19 | class BaseDiffusersCLICommand(ABC):
20 | @staticmethod
21 | @abstractmethod
22 | def register_subcommand(parser: ArgumentParser):
23 | raise NotImplementedError()
24 |
25 | @abstractmethod
26 | def run(self):
27 | raise NotImplementedError()
28 |
--------------------------------------------------------------------------------
/mustango/diffusers/src/diffusers/commands/diffusers_cli.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Copyright 2023 The HuggingFace Team. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | from argparse import ArgumentParser
17 |
18 | from .env import EnvironmentCommand
19 |
20 |
21 | def main():
22 | parser = ArgumentParser("Diffusers CLI tool", usage="diffusers-cli []")
23 | commands_parser = parser.add_subparsers(help="diffusers-cli command helpers")
24 |
25 | # Register commands
26 | EnvironmentCommand.register_subcommand(commands_parser)
27 |
28 | # Let's go
29 | args = parser.parse_args()
30 |
31 | if not hasattr(args, "func"):
32 | parser.print_help()
33 | exit(1)
34 |
35 | # Run
36 | service = args.func(args)
37 | service.run()
38 |
39 |
40 | if __name__ == "__main__":
41 | main()
42 |
--------------------------------------------------------------------------------
/mustango/diffusers/src/diffusers/dependency_versions_table.py:
--------------------------------------------------------------------------------
1 | # THIS FILE HAS BEEN AUTOGENERATED. To update:
2 | # 1. modify the `_deps` dict in setup.py
3 | # 2. run `make deps_table_update`
4 | deps = {
5 | "Pillow": "Pillow",
6 | "accelerate": "accelerate>=0.11.0",
7 | "compel": "compel==0.1.8",
8 | "black": "black~=23.1",
9 | "datasets": "datasets",
10 | "filelock": "filelock",
11 | "flax": "flax>=0.4.1",
12 | "hf-doc-builder": "hf-doc-builder>=0.3.0",
13 | "huggingface-hub": "huggingface-hub>=0.13.2",
14 | "requests-mock": "requests-mock==1.10.0",
15 | "importlib_metadata": "importlib_metadata",
16 | "isort": "isort>=5.5.4",
17 | "jax": "jax>=0.2.8,!=0.3.2",
18 | "jaxlib": "jaxlib>=0.1.65",
19 | "Jinja2": "Jinja2",
20 | "k-diffusion": "k-diffusion>=0.0.12",
21 | "librosa": "librosa",
22 | "note-seq": "note-seq",
23 | "numpy": "numpy",
24 | "parameterized": "parameterized",
25 | "protobuf": "protobuf>=3.20.3,<4",
26 | "pytest": "pytest",
27 | "pytest-timeout": "pytest-timeout",
28 | "pytest-xdist": "pytest-xdist",
29 | "ruff": "ruff>=0.0.241",
30 | "safetensors": "safetensors",
31 | "sentencepiece": "sentencepiece>=0.1.91,!=0.1.92",
32 | "scipy": "scipy",
33 | "regex": "regex!=2019.12.17",
34 | "requests": "requests",
35 | "tensorboard": "tensorboard",
36 | "torch": "torch>=1.4",
37 | "torchvision": "torchvision",
38 | "transformers": "transformers>=4.25.1",
39 | }
40 |
--------------------------------------------------------------------------------
/mustango/diffusers/src/diffusers/experimental/README.md:
--------------------------------------------------------------------------------
1 | # 🧨 Diffusers Experimental
2 |
3 | We are adding experimental code to support novel applications and usages of the Diffusers library.
4 | Currently, the following experiments are supported:
5 | * Reinforcement learning via an implementation of the [Diffuser](https://arxiv.org/abs/2205.09991) model.
--------------------------------------------------------------------------------
/mustango/diffusers/src/diffusers/experimental/__init__.py:
--------------------------------------------------------------------------------
1 | from .rl import ValueGuidedRLPipeline
2 |
--------------------------------------------------------------------------------
/mustango/diffusers/src/diffusers/experimental/rl/__init__.py:
--------------------------------------------------------------------------------
1 | from .value_guided_sampling import ValueGuidedRLPipeline
2 |
--------------------------------------------------------------------------------
/mustango/diffusers/src/diffusers/models/README.md:
--------------------------------------------------------------------------------
1 | # Models
2 |
3 | For more detail on the models, please refer to the [docs](https://huggingface.co/docs/diffusers/api/models).
--------------------------------------------------------------------------------
/mustango/diffusers/src/diffusers/models/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from ..utils import is_flax_available, is_torch_available
16 |
17 |
18 | if is_torch_available():
19 | from .autoencoder_kl import AutoencoderKL
20 | from .controlnet import ControlNetModel
21 | from .dual_transformer_2d import DualTransformer2DModel
22 | from .modeling_utils import ModelMixin
23 | from .prior_transformer import PriorTransformer
24 | from .t5_film_transformer import T5FilmDecoder
25 | from .transformer_2d import Transformer2DModel
26 | from .unet_1d import UNet1DModel
27 | from .unet_2d import UNet2DModel
28 | from .unet_2d_condition import UNet2DConditionModel
29 | from .unet_2d_condition_music import UNet2DConditionModelMusic
30 | from .unet_3d_condition import UNet3DConditionModel
31 | from .vq_model import VQModel
32 |
33 | if is_flax_available():
34 | from .controlnet_flax import FlaxControlNetModel
35 | from .unet_2d_condition_flax import FlaxUNet2DConditionModel
36 | from .vae_flax import FlaxAutoencoderKL
37 |
--------------------------------------------------------------------------------
/mustango/diffusers/src/diffusers/pipeline_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 |
14 | # limitations under the License.
15 |
16 | # NOTE: This file is deprecated and will be removed in a future version.
18 | # It only exists so that temporarily `from diffusers.pipelines import DiffusionPipeline` works
18 |
19 | from .pipelines import DiffusionPipeline, ImagePipelineOutput # noqa: F401
20 |
--------------------------------------------------------------------------------
/mustango/diffusers/src/diffusers/pipelines/alt_diffusion/__init__.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 | from typing import List, Optional, Union
3 |
4 | import numpy as np
5 | import PIL
6 | from PIL import Image
7 |
8 | from ...utils import BaseOutput, is_torch_available, is_transformers_available
9 |
10 |
11 | @dataclass
12 | # Copied from diffusers.pipelines.stable_diffusion.__init__.StableDiffusionPipelineOutput with Stable->Alt
13 | class AltDiffusionPipelineOutput(BaseOutput):
14 | """
15 | Output class for Alt Diffusion pipelines.
16 |
17 | Args:
18 | images (`List[PIL.Image.Image]` or `np.ndarray`)
19 | List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width,
20 | num_channels)`. PIL images or numpy array represent the denoised images of the diffusion pipeline.
21 | nsfw_content_detected (`List[bool]`)
22 | List of flags denoting whether the corresponding generated image likely represents "not-safe-for-work"
23 | (nsfw) content, or `None` if safety checking could not be performed.
24 | """
25 |
26 | images: Union[List[PIL.Image.Image], np.ndarray]
27 | nsfw_content_detected: Optional[List[bool]]
28 |
29 |
30 | if is_transformers_available() and is_torch_available():
31 | from .modeling_roberta_series import RobertaSeriesModelWithTransformation
32 | from .pipeline_alt_diffusion import AltDiffusionPipeline
33 | from .pipeline_alt_diffusion_img2img import AltDiffusionImg2ImgPipeline
34 |
--------------------------------------------------------------------------------
/mustango/diffusers/src/diffusers/pipelines/audio_diffusion/__init__.py:
--------------------------------------------------------------------------------
1 | from .mel import Mel
2 | from .pipeline_audio_diffusion import AudioDiffusionPipeline
3 |
--------------------------------------------------------------------------------
/mustango/diffusers/src/diffusers/pipelines/audioldm/__init__.py:
--------------------------------------------------------------------------------
1 | from ...utils import (
2 | OptionalDependencyNotAvailable,
3 | is_torch_available,
4 | is_transformers_available,
5 | is_transformers_version,
6 | )
7 |
8 | # from .pipeline_audioldm import AudioLDMPipeline
9 |
10 | try:
11 | if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.27.0")):
12 | raise OptionalDependencyNotAvailable()
13 | except OptionalDependencyNotAvailable:
14 | from ...utils.dummy_torch_and_transformers_objects import (
15 | AudioLDMPipeline,
16 | )
17 | else:
18 | from .pipeline_audioldm import AudioLDMPipeline
19 |
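
For context, here is a minimal usage sketch of the pipeline guarded above. It assumes torch, a CUDA device, and transformers>=4.27.0 are installed; the checkpoint id is only an illustrative example, not something pinned by this repository.

```python
# Minimal sketch (assumptions noted above): load AudioLDM and synthesize a clip.
import torch
from diffusers import AudioLDMPipeline

pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm", torch_dtype=torch.float16)
pipe = pipe.to("cuda")

audio = pipe(
    "a hammer repeatedly striking a wooden plank",
    num_inference_steps=10,
    audio_length_in_s=5.0,
).audios[0]
print(audio.shape)  # 1-D numpy waveform
```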
--------------------------------------------------------------------------------
/mustango/diffusers/src/diffusers/pipelines/dance_diffusion/__init__.py:
--------------------------------------------------------------------------------
1 | from .pipeline_dance_diffusion import DanceDiffusionPipeline
2 |
--------------------------------------------------------------------------------
/mustango/diffusers/src/diffusers/pipelines/ddim/__init__.py:
--------------------------------------------------------------------------------
1 | from .pipeline_ddim import DDIMPipeline
2 |
--------------------------------------------------------------------------------
/mustango/diffusers/src/diffusers/pipelines/ddpm/__init__.py:
--------------------------------------------------------------------------------
1 | from .pipeline_ddpm import DDPMPipeline
2 |
--------------------------------------------------------------------------------
/mustango/diffusers/src/diffusers/pipelines/dit/__init__.py:
--------------------------------------------------------------------------------
1 | from .pipeline_dit import DiTPipeline
2 |
--------------------------------------------------------------------------------
/mustango/diffusers/src/diffusers/pipelines/latent_diffusion/__init__.py:
--------------------------------------------------------------------------------
1 | from ...utils import is_transformers_available
2 | from .pipeline_latent_diffusion_superresolution import LDMSuperResolutionPipeline
3 |
4 |
5 | if is_transformers_available():
6 | from .pipeline_latent_diffusion import LDMBertModel, LDMTextToImagePipeline
7 |
--------------------------------------------------------------------------------
/mustango/diffusers/src/diffusers/pipelines/latent_diffusion_uncond/__init__.py:
--------------------------------------------------------------------------------
1 | from .pipeline_latent_diffusion_uncond import LDMPipeline
2 |
--------------------------------------------------------------------------------
/mustango/diffusers/src/diffusers/pipelines/paint_by_example/__init__.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 | from typing import List, Optional, Union
3 |
4 | import numpy as np
5 | import PIL
6 | from PIL import Image
7 |
8 | from ...utils import is_torch_available, is_transformers_available
9 |
10 |
11 | if is_transformers_available() and is_torch_available():
12 | from .image_encoder import PaintByExampleImageEncoder
13 | from .pipeline_paint_by_example import PaintByExamplePipeline
14 |
--------------------------------------------------------------------------------
/mustango/diffusers/src/diffusers/pipelines/pndm/__init__.py:
--------------------------------------------------------------------------------
1 | from .pipeline_pndm import PNDMPipeline
2 |
--------------------------------------------------------------------------------
/mustango/diffusers/src/diffusers/pipelines/repaint/__init__.py:
--------------------------------------------------------------------------------
1 | from .pipeline_repaint import RePaintPipeline
2 |
--------------------------------------------------------------------------------
/mustango/diffusers/src/diffusers/pipelines/score_sde_ve/__init__.py:
--------------------------------------------------------------------------------
1 | from .pipeline_score_sde_ve import ScoreSdeVePipeline
2 |
--------------------------------------------------------------------------------
/mustango/diffusers/src/diffusers/pipelines/semantic_stable_diffusion/__init__.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 | from enum import Enum
3 | from typing import List, Optional, Union
4 |
5 | import numpy as np
6 | import PIL
7 | from PIL import Image
8 |
9 | from ...utils import BaseOutput, is_torch_available, is_transformers_available
10 |
11 |
12 | @dataclass
13 | class SemanticStableDiffusionPipelineOutput(BaseOutput):
14 | """
15 |     Output class for Semantic Stable Diffusion pipelines.
16 |
17 | Args:
18 | images (`List[PIL.Image.Image]` or `np.ndarray`)
19 | List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width,
20 |             num_channels)`. PIL images or a NumPy array representing the denoised images of the diffusion pipeline.
21 | nsfw_content_detected (`List[bool]`)
22 | List of flags denoting whether the corresponding generated image likely represents "not-safe-for-work"
23 | (nsfw) content, or `None` if safety checking could not be performed.
24 | """
25 |
26 | images: Union[List[PIL.Image.Image], np.ndarray]
27 | nsfw_content_detected: Optional[List[bool]]
28 |
29 |
30 | if is_transformers_available() and is_torch_available():
31 | from .pipeline_semantic_stable_diffusion import SemanticStableDiffusionPipeline
32 |
--------------------------------------------------------------------------------
/mustango/diffusers/src/diffusers/pipelines/spectrogram_diffusion/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | from ...utils import is_note_seq_available, is_transformers_available, is_torch_available
3 | from ...utils import OptionalDependencyNotAvailable
4 |
5 |
6 | try:
7 | if not (is_transformers_available() and is_torch_available()):
8 | raise OptionalDependencyNotAvailable()
9 | except OptionalDependencyNotAvailable:
10 | from ...utils.dummy_torch_and_transformers_objects import * # noqa F403
11 | else:
12 | from .notes_encoder import SpectrogramNotesEncoder
13 | from .continous_encoder import SpectrogramContEncoder
14 | from .pipeline_spectrogram_diffusion import (
15 | SpectrogramContEncoder,
16 | SpectrogramDiffusionPipeline,
17 | T5FilmDecoder,
18 | )
19 |
20 | try:
21 | if not (is_transformers_available() and is_torch_available() and is_note_seq_available()):
22 | raise OptionalDependencyNotAvailable()
23 | except OptionalDependencyNotAvailable:
24 | from ...utils.dummy_transformers_and_torch_and_note_seq_objects import * # noqa F403
25 | else:
26 | from .midi_utils import MidiProcessor
27 |
--------------------------------------------------------------------------------
/mustango/diffusers/src/diffusers/pipelines/stochastic_karras_ve/__init__.py:
--------------------------------------------------------------------------------
1 | from .pipeline_stochastic_karras_ve import KarrasVePipeline
2 |
--------------------------------------------------------------------------------
/mustango/diffusers/src/diffusers/pipelines/text_to_video_synthesis/__init__.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 | from typing import List, Optional, Union
3 |
4 | import numpy as np
5 | import torch
6 |
7 | from ...utils import BaseOutput, OptionalDependencyNotAvailable, is_torch_available, is_transformers_available
8 |
9 |
10 | @dataclass
11 | class TextToVideoSDPipelineOutput(BaseOutput):
12 | """
13 | Output class for text to video pipelines.
14 |
15 | Args:
16 | frames (`List[np.ndarray]` or `torch.FloatTensor`)
17 | List of denoised frames (essentially images) as NumPy arrays of shape `(height, width, num_channels)` or as
18 |             a `torch` tensor. A NumPy array represents the denoised images of the diffusion pipeline. The length of the list
19 | denotes the video length i.e., the number of frames.
20 | """
21 |
22 | frames: Union[List[np.ndarray], torch.FloatTensor]
23 |
24 |
25 | try:
26 | if not (is_transformers_available() and is_torch_available()):
27 | raise OptionalDependencyNotAvailable()
28 | except OptionalDependencyNotAvailable:
29 | from ...utils.dummy_torch_and_transformers_objects import * # noqa F403
30 | else:
31 | from .pipeline_text_to_video_synth import TextToVideoSDPipeline # noqa: F401
32 |
--------------------------------------------------------------------------------
/mustango/diffusers/src/diffusers/pipelines/unclip/__init__.py:
--------------------------------------------------------------------------------
1 | from ...utils import (
2 | OptionalDependencyNotAvailable,
3 | is_torch_available,
4 | is_transformers_available,
5 | is_transformers_version,
6 | )
7 |
8 |
9 | try:
10 | if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.25.0")):
11 | raise OptionalDependencyNotAvailable()
12 | except OptionalDependencyNotAvailable:
13 | from ...utils.dummy_torch_and_transformers_objects import UnCLIPImageVariationPipeline, UnCLIPPipeline
14 | else:
15 | from .pipeline_unclip import UnCLIPPipeline
16 | from .pipeline_unclip_image_variation import UnCLIPImageVariationPipeline
17 | from .text_proj import UnCLIPTextProjModel
18 |
--------------------------------------------------------------------------------
/mustango/diffusers/src/diffusers/pipelines/versatile_diffusion/__init__.py:
--------------------------------------------------------------------------------
1 | from ...utils import (
2 | OptionalDependencyNotAvailable,
3 | is_torch_available,
4 | is_transformers_available,
5 | is_transformers_version,
6 | )
7 |
8 |
9 | try:
10 | if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.25.0")):
11 | raise OptionalDependencyNotAvailable()
12 | except OptionalDependencyNotAvailable:
13 | from ...utils.dummy_torch_and_transformers_objects import (
14 | VersatileDiffusionDualGuidedPipeline,
15 | VersatileDiffusionImageVariationPipeline,
16 | VersatileDiffusionPipeline,
17 | VersatileDiffusionTextToImagePipeline,
18 | )
19 | else:
20 | from .modeling_text_unet import UNetFlatConditionModel
21 | from .pipeline_versatile_diffusion import VersatileDiffusionPipeline
22 | from .pipeline_versatile_diffusion_dual_guided import VersatileDiffusionDualGuidedPipeline
23 | from .pipeline_versatile_diffusion_image_variation import VersatileDiffusionImageVariationPipeline
24 | from .pipeline_versatile_diffusion_text_to_image import VersatileDiffusionTextToImagePipeline
25 |
--------------------------------------------------------------------------------
/mustango/diffusers/src/diffusers/pipelines/vq_diffusion/__init__.py:
--------------------------------------------------------------------------------
1 | from ...utils import is_torch_available, is_transformers_available
2 |
3 |
4 | if is_transformers_available() and is_torch_available():
5 | from .pipeline_vq_diffusion import LearnedClassifierFreeSamplingEmbeddings, VQDiffusionPipeline
6 |
--------------------------------------------------------------------------------
/mustango/diffusers/src/diffusers/schedulers/README.md:
--------------------------------------------------------------------------------
1 | # Schedulers
2 |
3 | For more information on the schedulers, please refer to the [docs](https://huggingface.co/docs/diffusers/api/schedulers/overview).
--------------------------------------------------------------------------------
/mustango/diffusers/src/diffusers/utils/constants.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 The HuggingFace Inc. team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import os
15 |
16 | from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE, hf_cache_home
17 |
18 |
19 | default_cache_path = HUGGINGFACE_HUB_CACHE
20 |
21 |
22 | CONFIG_NAME = "config.json"
23 | WEIGHTS_NAME = "diffusion_pytorch_model.bin"
24 | FLAX_WEIGHTS_NAME = "diffusion_flax_model.msgpack"
25 | ONNX_WEIGHTS_NAME = "model.onnx"
26 | SAFETENSORS_WEIGHTS_NAME = "diffusion_pytorch_model.safetensors"
27 | ONNX_EXTERNAL_WEIGHTS_NAME = "weights.pb"
28 | HUGGINGFACE_CO_RESOLVE_ENDPOINT = "https://huggingface.co"
29 | DIFFUSERS_CACHE = default_cache_path
30 | DIFFUSERS_DYNAMIC_MODULE_NAME = "diffusers_modules"
31 | HF_MODULES_CACHE = os.getenv("HF_MODULES_CACHE", os.path.join(hf_cache_home, "modules"))
32 | DEPRECATED_REVISION_ARGS = ["fp16", "non-ema"]
33 |
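
As a small illustration of how these constants resolve at runtime (a sketch: the printed paths depend on the user's HF_HOME / HF_MODULES_CACHE environment, and the import path assumes this vendored diffusers package is on `sys.path`):

```python
# Print the cache locations and filenames derived above; paths are machine-dependent.
from diffusers.utils.constants import DIFFUSERS_CACHE, HF_MODULES_CACHE, WEIGHTS_NAME

print(DIFFUSERS_CACHE)   # typically ~/.cache/huggingface/hub
print(HF_MODULES_CACHE)  # typically ~/.cache/huggingface/modules
print(WEIGHTS_NAME)      # "diffusion_pytorch_model.bin"
```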
--------------------------------------------------------------------------------
/mustango/diffusers/src/diffusers/utils/doc_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """
15 | Doc utilities: Utilities related to documentation
16 | """
17 | import re
18 |
19 |
20 | def replace_example_docstring(example_docstring):
21 | def docstring_decorator(fn):
22 | func_doc = fn.__doc__
23 | lines = func_doc.split("\n")
24 | i = 0
25 | while i < len(lines) and re.search(r"^\s*Examples?:\s*$", lines[i]) is None:
26 | i += 1
27 | if i < len(lines):
28 | lines[i] = example_docstring
29 | func_doc = "\n".join(lines)
30 | else:
31 | raise ValueError(
32 | f"The function {fn} should have an empty 'Examples:' in its docstring as placeholder, "
33 | f"current docstring is:\n{func_doc}"
34 | )
35 | fn.__doc__ = func_doc
36 | return fn
37 |
38 | return docstring_decorator
39 |
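
To make the decorator's behavior concrete, here is a small usage sketch; the function and example text below are hypothetical, only `replace_example_docstring` comes from the module above.

```python
from diffusers.utils.doc_utils import replace_example_docstring

# Hypothetical example block that will be spliced into the docstring.
EXAMPLE_DOC_STRING = """
    Examples:
        >>> result = generate()
        >>> print(result)
"""


@replace_example_docstring(EXAMPLE_DOC_STRING)
def generate():
    """
    Generate a sample.

    Examples:
    """
    return 42


# The bare "Examples:" placeholder line has been replaced by EXAMPLE_DOC_STRING.
print(generate.__doc__)
```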
--------------------------------------------------------------------------------
/mustango/diffusers/src/diffusers/utils/dummy_note_seq_objects.py:
--------------------------------------------------------------------------------
1 | # This file is autogenerated by the command `make fix-copies`, do not edit.
2 | from ..utils import DummyObject, requires_backends
3 |
4 |
5 | class MidiProcessor(metaclass=DummyObject):
6 | _backends = ["note_seq"]
7 |
8 | def __init__(self, *args, **kwargs):
9 | requires_backends(self, ["note_seq"])
10 |
11 | @classmethod
12 | def from_config(cls, *args, **kwargs):
13 | requires_backends(cls, ["note_seq"])
14 |
15 | @classmethod
16 | def from_pretrained(cls, *args, **kwargs):
17 | requires_backends(cls, ["note_seq"])
18 |
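
To illustrate what these autogenerated dummies do (a sketch, assuming `note_seq` is not installed in the environment): constructing the placeholder raises an ImportError that tells the user which backend to install.

```python
from diffusers.utils.dummy_note_seq_objects import MidiProcessor

try:
    MidiProcessor()
except ImportError as err:
    # requires_backends raises with an installation hint for the missing backend.
    print(err)
```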
--------------------------------------------------------------------------------
/mustango/diffusers/src/diffusers/utils/dummy_onnx_objects.py:
--------------------------------------------------------------------------------
1 | # This file is autogenerated by the command `make fix-copies`, do not edit.
2 | from ..utils import DummyObject, requires_backends
3 |
4 |
5 | class OnnxRuntimeModel(metaclass=DummyObject):
6 | _backends = ["onnx"]
7 |
8 | def __init__(self, *args, **kwargs):
9 | requires_backends(self, ["onnx"])
10 |
11 | @classmethod
12 | def from_config(cls, *args, **kwargs):
13 | requires_backends(cls, ["onnx"])
14 |
15 | @classmethod
16 | def from_pretrained(cls, *args, **kwargs):
17 | requires_backends(cls, ["onnx"])
18 |
--------------------------------------------------------------------------------
/mustango/diffusers/src/diffusers/utils/dummy_torch_and_librosa_objects.py:
--------------------------------------------------------------------------------
1 | # This file is autogenerated by the command `make fix-copies`, do not edit.
2 | from ..utils import DummyObject, requires_backends
3 |
4 |
5 | class AudioDiffusionPipeline(metaclass=DummyObject):
6 | _backends = ["torch", "librosa"]
7 |
8 | def __init__(self, *args, **kwargs):
9 | requires_backends(self, ["torch", "librosa"])
10 |
11 | @classmethod
12 | def from_config(cls, *args, **kwargs):
13 | requires_backends(cls, ["torch", "librosa"])
14 |
15 | @classmethod
16 | def from_pretrained(cls, *args, **kwargs):
17 | requires_backends(cls, ["torch", "librosa"])
18 |
19 |
20 | class Mel(metaclass=DummyObject):
21 | _backends = ["torch", "librosa"]
22 |
23 | def __init__(self, *args, **kwargs):
24 | requires_backends(self, ["torch", "librosa"])
25 |
26 | @classmethod
27 | def from_config(cls, *args, **kwargs):
28 | requires_backends(cls, ["torch", "librosa"])
29 |
30 | @classmethod
31 | def from_pretrained(cls, *args, **kwargs):
32 | requires_backends(cls, ["torch", "librosa"])
33 |
--------------------------------------------------------------------------------
/mustango/diffusers/src/diffusers/utils/dummy_torch_and_scipy_objects.py:
--------------------------------------------------------------------------------
1 | # This file is autogenerated by the command `make fix-copies`, do not edit.
2 | from ..utils import DummyObject, requires_backends
3 |
4 |
5 | class LMSDiscreteScheduler(metaclass=DummyObject):
6 | _backends = ["torch", "scipy"]
7 |
8 | def __init__(self, *args, **kwargs):
9 | requires_backends(self, ["torch", "scipy"])
10 |
11 | @classmethod
12 | def from_config(cls, *args, **kwargs):
13 | requires_backends(cls, ["torch", "scipy"])
14 |
15 | @classmethod
16 | def from_pretrained(cls, *args, **kwargs):
17 | requires_backends(cls, ["torch", "scipy"])
18 |
--------------------------------------------------------------------------------
/mustango/diffusers/src/diffusers/utils/dummy_torch_and_transformers_and_k_diffusion_objects.py:
--------------------------------------------------------------------------------
1 | # This file is autogenerated by the command `make fix-copies`, do not edit.
2 | from ..utils import DummyObject, requires_backends
3 |
4 |
5 | class StableDiffusionKDiffusionPipeline(metaclass=DummyObject):
6 | _backends = ["torch", "transformers", "k_diffusion"]
7 |
8 | def __init__(self, *args, **kwargs):
9 | requires_backends(self, ["torch", "transformers", "k_diffusion"])
10 |
11 | @classmethod
12 | def from_config(cls, *args, **kwargs):
13 | requires_backends(cls, ["torch", "transformers", "k_diffusion"])
14 |
15 | @classmethod
16 | def from_pretrained(cls, *args, **kwargs):
17 | requires_backends(cls, ["torch", "transformers", "k_diffusion"])
18 |
--------------------------------------------------------------------------------
/mustango/diffusers/src/diffusers/utils/dummy_transformers_and_torch_and_note_seq_objects.py:
--------------------------------------------------------------------------------
1 | # This file is autogenerated by the command `make fix-copies`, do not edit.
2 | from ..utils import DummyObject, requires_backends
3 |
4 |
5 | class SpectrogramDiffusionPipeline(metaclass=DummyObject):
6 | _backends = ["transformers", "torch", "note_seq"]
7 |
8 | def __init__(self, *args, **kwargs):
9 | requires_backends(self, ["transformers", "torch", "note_seq"])
10 |
11 | @classmethod
12 | def from_config(cls, *args, **kwargs):
13 | requires_backends(cls, ["transformers", "torch", "note_seq"])
14 |
15 | @classmethod
16 | def from_pretrained(cls, *args, **kwargs):
17 | requires_backends(cls, ["transformers", "torch", "note_seq"])
18 |
--------------------------------------------------------------------------------
/mustango/diffusers/src/diffusers/utils/model_card_template.md:
--------------------------------------------------------------------------------
1 | ---
2 | {{ card_data }}
3 | ---
4 |
5 |
7 |
8 | # {{ model_name | default("Diffusion Model") }}
9 |
10 | ## Model description
11 |
12 | This diffusion model is trained with the [🤗 Diffusers](https://github.com/huggingface/diffusers) library
13 | on the `{{ dataset_name }}` dataset.
14 |
15 | ## Intended uses & limitations
16 |
17 | #### How to use
18 |
19 | ```python
20 | # TODO: add an example code snippet for running this diffusion pipeline
21 | ```
22 |
23 | #### Limitations and bias
24 |
25 | [TODO: provide examples of latent issues and potential remediations]
26 |
27 | ## Training data
28 |
29 | [TODO: describe the data used to train the model]
30 |
31 | ### Training hyperparameters
32 |
33 | The following hyperparameters were used during training:
34 | - learning_rate: {{ learning_rate }}
35 | - train_batch_size: {{ train_batch_size }}
36 | - eval_batch_size: {{ eval_batch_size }}
37 | - gradient_accumulation_steps: {{ gradient_accumulation_steps }}
38 | - optimizer: AdamW with betas=({{ adam_beta1 }}, {{ adam_beta2 }}), weight_decay={{ adam_weight_decay }} and epsilon={{ adam_epsilon }}
39 | - lr_scheduler: {{ lr_scheduler }}
40 | - lr_warmup_steps: {{ lr_warmup_steps }}
41 | - ema_inv_gamma: {{ ema_inv_gamma }}
42 | - ema_power: {{ ema_power }}
43 | - ema_max_decay: {{ ema_max_decay }}
44 | - mixed_precision: {{ mixed_precision }}
45 |
46 | ### Training results
47 |
48 | 📈 [TensorBoard logs](https://huggingface.co/{{ repo_name }}/tensorboard?#scalars)
49 |
50 |
51 |
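
A hypothetical rendering sketch for this Jinja template (assumes `jinja2` is installed and that the template path is resolved relative to this utils directory; unspecified placeholders render as empty strings with Jinja's default `Undefined`):

```python
from jinja2 import Template

with open("model_card_template.md") as f:
    template = Template(f.read())

card = template.render(
    card_data="license: apache-2.0",
    model_name="my-ddpm",
    dataset_name="cifar10",
    learning_rate=1e-4,
)
print(card.splitlines()[0])  # "---" front-matter delimiter
```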
--------------------------------------------------------------------------------
/mustango/diffusers/src/diffusers/utils/pil_utils.py:
--------------------------------------------------------------------------------
1 | import PIL.Image
2 | import PIL.ImageOps
3 | from packaging import version
4 |
5 |
6 | if version.parse(version.parse(PIL.__version__).base_version) >= version.parse("9.1.0"):
7 | PIL_INTERPOLATION = {
8 | "linear": PIL.Image.Resampling.BILINEAR,
9 | "bilinear": PIL.Image.Resampling.BILINEAR,
10 | "bicubic": PIL.Image.Resampling.BICUBIC,
11 | "lanczos": PIL.Image.Resampling.LANCZOS,
12 | "nearest": PIL.Image.Resampling.NEAREST,
13 | }
14 | else:
15 | PIL_INTERPOLATION = {
16 | "linear": PIL.Image.LINEAR,
17 | "bilinear": PIL.Image.BILINEAR,
18 | "bicubic": PIL.Image.BICUBIC,
19 | "lanczos": PIL.Image.LANCZOS,
20 | "nearest": PIL.Image.NEAREST,
21 | }
22 |
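
A small usage sketch of the version-aware mapping above (the image and sizes are arbitrary placeholders; the import targets the module shown here):

```python
import PIL.Image
from diffusers.utils.pil_utils import PIL_INTERPOLATION

img = PIL.Image.new("RGB", (512, 512))
resized = img.resize((256, 256), resample=PIL_INTERPOLATION["lanczos"])
print(resized.size)  # (256, 256)
```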
--------------------------------------------------------------------------------
/mustango/diffusers/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/declare-lab/tango/310e68504c3f91200313ccc3b94bf19e0941e339/mustango/diffusers/tests/__init__.py
--------------------------------------------------------------------------------
/mustango/diffusers/tests/conftest.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # tests directory-specific settings - this file is run automatically
16 | # by pytest before any tests are run
17 |
18 | import sys
19 | import warnings
20 | from os.path import abspath, dirname, join
21 |
22 |
23 | # allow having multiple repository checkouts and not needing to remember to rerun
24 | # 'pip install -e .[dev]' when switching between checkouts and running tests.
25 | git_repo_path = abspath(join(dirname(dirname(__file__)), "src"))
26 | sys.path.insert(1, git_repo_path)
27 |
28 | # silence FutureWarning warnings in tests since often we can't act on them until
29 | # they become normal warnings - i.e. the tests still need to test the current functionality
30 | warnings.simplefilter(action="ignore", category=FutureWarning)
31 |
32 |
33 | def pytest_addoption(parser):
34 | from diffusers.utils.testing_utils import pytest_addoption_shared
35 |
36 | pytest_addoption_shared(parser)
37 |
38 |
39 | def pytest_terminal_summary(terminalreporter):
40 | from diffusers.utils.testing_utils import pytest_terminal_summary_main
41 |
42 | make_reports = terminalreporter.config.getoption("--make-reports")
43 | if make_reports:
44 | pytest_terminal_summary_main(terminalreporter, id=make_reports)
45 |
--------------------------------------------------------------------------------
/mustango/diffusers/tests/fixtures/elise_format0.mid:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/declare-lab/tango/310e68504c3f91200313ccc3b94bf19e0941e339/mustango/diffusers/tests/fixtures/elise_format0.mid
--------------------------------------------------------------------------------
/mustango/diffusers/tests/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/declare-lab/tango/310e68504c3f91200313ccc3b94bf19e0941e339/mustango/diffusers/tests/models/__init__.py
--------------------------------------------------------------------------------
/mustango/diffusers/tests/models/test_models_vae_flax.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from diffusers import FlaxAutoencoderKL
4 | from diffusers.utils import is_flax_available
5 | from diffusers.utils.testing_utils import require_flax
6 |
7 | from ..test_modeling_common_flax import FlaxModelTesterMixin
8 |
9 |
10 | if is_flax_available():
11 | import jax
12 |
13 |
14 | @require_flax
15 | class FlaxAutoencoderKLTests(FlaxModelTesterMixin, unittest.TestCase):
16 | model_class = FlaxAutoencoderKL
17 |
18 | @property
19 | def dummy_input(self):
20 | batch_size = 4
21 | num_channels = 3
22 | sizes = (32, 32)
23 |
24 | prng_key = jax.random.PRNGKey(0)
25 | image = jax.random.uniform(prng_key, ((batch_size, num_channels) + sizes))
26 |
27 | return {"sample": image, "prng_key": prng_key}
28 |
29 | def prepare_init_args_and_inputs_for_common(self):
30 | init_dict = {
31 | "block_out_channels": [32, 64],
32 | "in_channels": 3,
33 | "out_channels": 3,
34 | "down_block_types": ["DownEncoderBlock2D", "DownEncoderBlock2D"],
35 | "up_block_types": ["UpDecoderBlock2D", "UpDecoderBlock2D"],
36 | "latent_channels": 4,
37 | }
38 | inputs_dict = self.dummy_input
39 | return init_dict, inputs_dict
40 |
--------------------------------------------------------------------------------
/mustango/diffusers/tests/pipelines/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/declare-lab/tango/310e68504c3f91200313ccc3b94bf19e0941e339/mustango/diffusers/tests/pipelines/__init__.py
--------------------------------------------------------------------------------
/mustango/diffusers/tests/pipelines/altdiffusion/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/declare-lab/tango/310e68504c3f91200313ccc3b94bf19e0941e339/mustango/diffusers/tests/pipelines/altdiffusion/__init__.py
--------------------------------------------------------------------------------
/mustango/diffusers/tests/pipelines/audio_diffusion/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/declare-lab/tango/310e68504c3f91200313ccc3b94bf19e0941e339/mustango/diffusers/tests/pipelines/audio_diffusion/__init__.py
--------------------------------------------------------------------------------
/mustango/diffusers/tests/pipelines/audioldm/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/declare-lab/tango/310e68504c3f91200313ccc3b94bf19e0941e339/mustango/diffusers/tests/pipelines/audioldm/__init__.py
--------------------------------------------------------------------------------
/mustango/diffusers/tests/pipelines/dance_diffusion/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/declare-lab/tango/310e68504c3f91200313ccc3b94bf19e0941e339/mustango/diffusers/tests/pipelines/dance_diffusion/__init__.py
--------------------------------------------------------------------------------
/mustango/diffusers/tests/pipelines/ddim/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/declare-lab/tango/310e68504c3f91200313ccc3b94bf19e0941e339/mustango/diffusers/tests/pipelines/ddim/__init__.py
--------------------------------------------------------------------------------
/mustango/diffusers/tests/pipelines/ddpm/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/declare-lab/tango/310e68504c3f91200313ccc3b94bf19e0941e339/mustango/diffusers/tests/pipelines/ddpm/__init__.py
--------------------------------------------------------------------------------
/mustango/diffusers/tests/pipelines/dit/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/declare-lab/tango/310e68504c3f91200313ccc3b94bf19e0941e339/mustango/diffusers/tests/pipelines/dit/__init__.py
--------------------------------------------------------------------------------
/mustango/diffusers/tests/pipelines/karras_ve/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/declare-lab/tango/310e68504c3f91200313ccc3b94bf19e0941e339/mustango/diffusers/tests/pipelines/karras_ve/__init__.py
--------------------------------------------------------------------------------
/mustango/diffusers/tests/pipelines/latent_diffusion/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/declare-lab/tango/310e68504c3f91200313ccc3b94bf19e0941e339/mustango/diffusers/tests/pipelines/latent_diffusion/__init__.py
--------------------------------------------------------------------------------
/mustango/diffusers/tests/pipelines/paint_by_example/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/declare-lab/tango/310e68504c3f91200313ccc3b94bf19e0941e339/mustango/diffusers/tests/pipelines/paint_by_example/__init__.py
--------------------------------------------------------------------------------
/mustango/diffusers/tests/pipelines/pndm/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/declare-lab/tango/310e68504c3f91200313ccc3b94bf19e0941e339/mustango/diffusers/tests/pipelines/pndm/__init__.py
--------------------------------------------------------------------------------
/mustango/diffusers/tests/pipelines/repaint/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/declare-lab/tango/310e68504c3f91200313ccc3b94bf19e0941e339/mustango/diffusers/tests/pipelines/repaint/__init__.py
--------------------------------------------------------------------------------
/mustango/diffusers/tests/pipelines/score_sde_ve/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/declare-lab/tango/310e68504c3f91200313ccc3b94bf19e0941e339/mustango/diffusers/tests/pipelines/score_sde_ve/__init__.py
--------------------------------------------------------------------------------
/mustango/diffusers/tests/pipelines/semantic_stable_diffusion/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/declare-lab/tango/310e68504c3f91200313ccc3b94bf19e0941e339/mustango/diffusers/tests/pipelines/semantic_stable_diffusion/__init__.py
--------------------------------------------------------------------------------
/mustango/diffusers/tests/pipelines/spectrogram_diffusion/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/declare-lab/tango/310e68504c3f91200313ccc3b94bf19e0941e339/mustango/diffusers/tests/pipelines/spectrogram_diffusion/__init__.py
--------------------------------------------------------------------------------
/mustango/diffusers/tests/pipelines/stable_diffusion/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/declare-lab/tango/310e68504c3f91200313ccc3b94bf19e0941e339/mustango/diffusers/tests/pipelines/stable_diffusion/__init__.py
--------------------------------------------------------------------------------
/mustango/diffusers/tests/pipelines/stable_diffusion_2/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/declare-lab/tango/310e68504c3f91200313ccc3b94bf19e0941e339/mustango/diffusers/tests/pipelines/stable_diffusion_2/__init__.py
--------------------------------------------------------------------------------
/mustango/diffusers/tests/pipelines/stable_diffusion_safe/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/declare-lab/tango/310e68504c3f91200313ccc3b94bf19e0941e339/mustango/diffusers/tests/pipelines/stable_diffusion_safe/__init__.py
--------------------------------------------------------------------------------
/mustango/diffusers/tests/pipelines/stable_unclip/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/declare-lab/tango/310e68504c3f91200313ccc3b94bf19e0941e339/mustango/diffusers/tests/pipelines/stable_unclip/__init__.py
--------------------------------------------------------------------------------
/mustango/diffusers/tests/pipelines/text_to_video/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/declare-lab/tango/310e68504c3f91200313ccc3b94bf19e0941e339/mustango/diffusers/tests/pipelines/text_to_video/__init__.py
--------------------------------------------------------------------------------
/mustango/diffusers/tests/pipelines/unclip/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/declare-lab/tango/310e68504c3f91200313ccc3b94bf19e0941e339/mustango/diffusers/tests/pipelines/unclip/__init__.py
--------------------------------------------------------------------------------
/mustango/diffusers/tests/pipelines/versatile_diffusion/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/declare-lab/tango/310e68504c3f91200313ccc3b94bf19e0941e339/mustango/diffusers/tests/pipelines/versatile_diffusion/__init__.py
--------------------------------------------------------------------------------
/mustango/diffusers/tests/pipelines/vq_diffusion/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/declare-lab/tango/310e68504c3f91200313ccc3b94bf19e0941e339/mustango/diffusers/tests/pipelines/vq_diffusion/__init__.py
--------------------------------------------------------------------------------
/mustango/diffusers/tests/schedulers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/declare-lab/tango/310e68504c3f91200313ccc3b94bf19e0941e339/mustango/diffusers/tests/schedulers/__init__.py
--------------------------------------------------------------------------------
/mustango/diffusers/tests/schedulers/test_scheduler_vq_diffusion.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 |
4 | from diffusers import VQDiffusionScheduler
5 |
6 | from .test_schedulers import SchedulerCommonTest
7 |
8 |
9 | class VQDiffusionSchedulerTest(SchedulerCommonTest):
10 | scheduler_classes = (VQDiffusionScheduler,)
11 |
12 | def get_scheduler_config(self, **kwargs):
13 | config = {
14 | "num_vec_classes": 4097,
15 | "num_train_timesteps": 100,
16 | }
17 |
18 | config.update(**kwargs)
19 | return config
20 |
21 | def dummy_sample(self, num_vec_classes):
22 | batch_size = 4
23 | height = 8
24 | width = 8
25 |
26 | sample = torch.randint(0, num_vec_classes, (batch_size, height * width))
27 |
28 | return sample
29 |
30 | @property
31 | def dummy_sample_deter(self):
32 | assert False
33 |
34 | def dummy_model(self, num_vec_classes):
35 | def model(sample, t, *args):
36 | batch_size, num_latent_pixels = sample.shape
37 | logits = torch.rand((batch_size, num_vec_classes - 1, num_latent_pixels))
38 | return_value = F.log_softmax(logits.double(), dim=1).float()
39 | return return_value
40 |
41 | return model
42 |
43 | def test_timesteps(self):
44 | for timesteps in [2, 5, 100, 1000]:
45 | self.check_over_configs(num_train_timesteps=timesteps)
46 |
47 | def test_num_vec_classes(self):
48 | for num_vec_classes in [5, 100, 1000, 4000]:
49 | self.check_over_configs(num_vec_classes=num_vec_classes)
50 |
51 | def test_time_indices(self):
52 | for t in [0, 50, 99]:
53 | self.check_over_forward(time_step=t)
54 |
55 | def test_add_noise_device(self):
56 | pass
57 |
--------------------------------------------------------------------------------
/mustango/diffusers/tests/test_pipelines_onnx_common.py:
--------------------------------------------------------------------------------
1 | from diffusers.utils.testing_utils import require_onnxruntime
2 |
3 |
4 | @require_onnxruntime
5 | class OnnxPipelineTesterMixin:
6 | """
7 | This mixin is designed to be used with unittest.TestCase classes.
8 | It provides a set of common tests for each ONNXRuntime pipeline, e.g. saving and loading the pipeline,
9 | equivalence of dict and tuple outputs, etc.
10 | """
11 |
12 | pass
13 |
--------------------------------------------------------------------------------
/mustango/diffusers/utils/get_modified_files.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2023 The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | # this script reports modified .py files under the desired list of top-level sub-dirs passed as a list of arguments, e.g.:
17 | # python ./utils/get_modified_files.py utils src tests examples
18 | #
19 | # it uses git to find the forking point and which files were modified - i.e. files not under git won't be considered
20 | # since the output of this script is fed into Makefile commands it doesn't print a newline after the results
21 |
22 | import re
23 | import subprocess
24 | import sys
25 |
26 |
27 | fork_point_sha = subprocess.check_output("git merge-base main HEAD".split()).decode("utf-8")
28 | modified_files = subprocess.check_output(f"git diff --name-only {fork_point_sha}".split()).decode("utf-8").split()
29 |
30 | joined_dirs = "|".join(sys.argv[1:])
31 | regex = re.compile(rf"^({joined_dirs}).*?\.py$")
32 |
33 | relevant_modified_files = [x for x in modified_files if regex.match(x)]
34 | print(" ".join(relevant_modified_files), end="")
35 |
--------------------------------------------------------------------------------
/mustango/diffusers/utils/print_env.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | # coding=utf-8
4 | # Copyright 2023 The HuggingFace Inc. team.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | # this script dumps information about the environment
19 |
20 | import os
21 | import platform
22 | import sys
23 |
24 |
25 | os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
26 |
27 | print("Python version:", sys.version)
28 |
29 | print("OS platform:", platform.platform())
30 | print("OS architecture:", platform.machine())
31 |
32 | try:
33 | import torch
34 |
35 | print("Torch version:", torch.__version__)
36 | print("Cuda available:", torch.cuda.is_available())
37 | print("Cuda version:", torch.version.cuda)
38 | print("CuDNN version:", torch.backends.cudnn.version())
39 | print("Number of GPUs available:", torch.cuda.device_count())
40 | except ImportError:
41 | print("Torch version:", None)
42 |
43 | try:
44 | import transformers
45 |
46 | print("transformers version:", transformers.__version__)
47 | except ImportError:
48 | print("transformers version:", None)
49 |
--------------------------------------------------------------------------------
/mustango/mustango.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/declare-lab/tango/310e68504c3f91200313ccc3b94bf19e0941e339/mustango/mustango.jpg
--------------------------------------------------------------------------------
/mustango/requirements.txt:
--------------------------------------------------------------------------------
1 | torch==2.0.1
2 | torchaudio==2.0.2
3 | torchvision==0.15.2
4 | transformers==4.31.0
5 | accelerate==0.21.0
6 | datasets==2.1.0
7 | einops==0.6.1
8 | h5py==3.8.0
9 | huggingface_hub==0.19.4
10 | importlib_metadata==6.3.0
11 | librosa==0.9.2
12 | matplotlib==3.5.2
13 | numpy==1.23.0
14 | omegaconf==2.3.0
15 | packaging==23.1
16 | pandas==1.4.1
17 | progressbar33==2.4
18 | protobuf==3.20.*
19 | resampy==0.4.2
20 | safetensors==0.3.2
21 | sentencepiece==0.1.99
22 | scikit_image==0.19.3
23 | scikit_learn==1.2.2
24 | scipy==1.8.0
25 | soundfile==0.12.1
26 | ssr_eval==0.0.6
27 | torchlibrosa==0.1.0
28 | tqdm==4.63.1
29 | wandb==0.12.14
30 | ipython==8.12.0
31 | gradio==4.3.0
32 | wavio==0.0.7
--------------------------------------------------------------------------------
/mustango/tools/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/declare-lab/tango/310e68504c3f91200313ccc3b94bf19e0941e339/mustango/tools/__init__.py
--------------------------------------------------------------------------------
/mustango/tools/mix.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | def a_weight(fs, n_fft, min_db=-80.0):
5 | freq = np.linspace(0, fs // 2, n_fft // 2 + 1)
6 | freq_sq = np.power(freq, 2)
7 | freq_sq[0] = 1.0
8 | weight = 2.0 + 20.0 * (2 * np.log10(12194) + 2 * np.log10(freq_sq)
9 | - np.log10(freq_sq + 12194 ** 2)
10 | - np.log10(freq_sq + 20.6 ** 2)
11 | - 0.5 * np.log10(freq_sq + 107.7 ** 2)
12 | - 0.5 * np.log10(freq_sq + 737.9 ** 2))
13 | weight = np.maximum(weight, min_db)
14 |
15 | return weight
16 |
17 |
18 | def compute_gain(sound, fs, min_db=-80.0, mode="A_weighting"):
19 | if fs == 16000:
20 | n_fft = 2048
21 | elif fs == 44100:
22 | n_fft = 4096
23 | else:
24 | raise Exception("Invalid fs {}".format(fs))
25 | stride = n_fft // 2
26 |
27 | gain = []
28 | for i in range(0, len(sound) - n_fft + 1, stride):
29 | if mode == "RMSE":
30 | g = np.mean(sound[i: i + n_fft] ** 2)
31 | elif mode == "A_weighting":
32 | spec = np.fft.rfft(np.hanning(n_fft + 1)[:-1] * sound[i: i + n_fft])
33 | power_spec = np.abs(spec) ** 2
34 | a_weighted_spec = power_spec * np.power(10, a_weight(fs, n_fft) / 10)
35 | g = np.sum(a_weighted_spec)
36 | else:
37 | raise Exception("Invalid mode {}".format(mode))
38 | gain.append(g)
39 |
40 | gain = np.array(gain)
41 | gain = np.maximum(gain, np.power(10, min_db / 10))
42 | gain_db = 10 * np.log10(gain)
43 | return gain_db
44 |
45 |
46 | def mix(sound1, sound2, r, fs):
47 | gain1 = np.max(compute_gain(sound1, fs)) # Decibel
48 | gain2 = np.max(compute_gain(sound2, fs))
49 | t = 1.0 / (1 + np.power(10, (gain1 - gain2) / 20.) * (1 - r) / r)
50 | sound = ((sound1 * t + sound2 * (1 - t)) / np.sqrt(t ** 2 + (1 - t) ** 2))
51 | return sound
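
A usage sketch of `mix` (the signals below are random placeholders; it assumes this module is importable as `tools.mix` and that both clips share the 16 kHz sample rate handled by `compute_gain`):

```python
import numpy as np
from tools.mix import mix

fs = 16000
sound1 = np.random.randn(2 * fs).astype(np.float32)  # 2 s stand-in clip
sound2 = np.random.randn(2 * fs).astype(np.float32)

# Blend the two clips with mixing ratio r, balancing them by A-weighted gain.
mixed = mix(sound1, sound2, r=0.5, fs=fs)
print(mixed.shape)  # (32000,)
```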
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | torch==1.13.1
2 | torchaudio==0.13.1
3 | torchvision==0.14.1
4 | transformers==4.27.0
5 | accelerate==0.18.0
6 | datasets==2.1.0
7 | diffusers==0.18.2
8 | einops==0.6.1
9 | h5py==3.8.0
10 | huggingface_hub==0.13.3
11 | importlib_metadata==6.3.0
12 | librosa==0.9.2
13 | matplotlib==3.5.2
14 | numpy==1.23.0
15 | omegaconf==2.3.0
16 | packaging==23.1
17 | pandas==1.4.1
18 | progressbar33==2.4
19 | protobuf==3.20.*
20 | resampy==0.4.2
21 | scikit_image==0.19.3
22 | scikit_learn==1.2.2
23 | scipy==1.8.0
24 | soundfile==0.12.1
25 | ssr_eval==0.0.6
26 | torchlibrosa==0.1.0
27 | tqdm==4.63.1
28 | wandb==0.12.14
29 | ipython==8.12.0
30 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from setuptools import setup
4 |
5 | requirement_path = "requirements.txt"
6 | install_requires = []
7 | if os.path.isfile(requirement_path):
8 |     with open(requirement_path) as f:
9 |         install_requires = f.read().splitlines()
10 | setup(name="mypackage", install_requires=install_requires)  # [...] other setup() metadata unchanged
--------------------------------------------------------------------------------
/tango2/audioldm/__init__.py:
--------------------------------------------------------------------------------
1 | from .ldm import LatentDiffusion
2 | from .utils import seed_everything, save_wave, get_time, get_duration
3 | from .pipeline import *
4 |
5 |
6 |
7 |
8 |
9 |
--------------------------------------------------------------------------------
/tango2/audioldm/audio/__init__.py:
--------------------------------------------------------------------------------
1 | from .tools import wav_to_fbank, read_wav_file
2 | from .stft import TacotronSTFT
3 |
--------------------------------------------------------------------------------
/tango2/audioldm/hifigan/__init__.py:
--------------------------------------------------------------------------------
1 | from .models import Generator
2 |
3 |
4 | class AttrDict(dict):
5 | def __init__(self, *args, **kwargs):
6 | super(AttrDict, self).__init__(*args, **kwargs)
7 | self.__dict__ = self
8 |
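
A tiny usage sketch of `AttrDict` (the config keys below are illustrative, not the actual HiFi-GAN config):

```python
cfg = AttrDict({"sampling_rate": 16000, "upsample_rates": [8, 8, 2, 2]})

# Keys are reachable both as items and as attributes.
assert cfg["sampling_rate"] == cfg.sampling_rate == 16000
cfg.resblock = "1"  # attribute assignment also updates the underlying dict
assert cfg["resblock"] == "1"
```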
--------------------------------------------------------------------------------
/tango2/audioldm/latent_diffusion/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/declare-lab/tango/310e68504c3f91200313ccc3b94bf19e0941e339/tango2/audioldm/latent_diffusion/__init__.py
--------------------------------------------------------------------------------
/tango2/audioldm/variational_autoencoder/__init__.py:
--------------------------------------------------------------------------------
1 | from .autoencoder import AutoencoderKL
--------------------------------------------------------------------------------
/tango2/audioldm_eval/__init__.py:
--------------------------------------------------------------------------------
1 | from .metrics.fid import calculate_fid
2 | from .metrics.isc import calculate_isc
3 | from .metrics.kid import calculate_kid
4 | from .metrics.kl import calculate_kl
5 | from .eval import EvaluationHelper
6 |
--------------------------------------------------------------------------------
/tango2/audioldm_eval/audio/__init__.py:
--------------------------------------------------------------------------------
1 | # import audio.tools
2 | # import audio.stft
3 | # import audio.audio_processing
4 | from .stft import *
5 | from .audio_processing import *
6 | from .tools import *
7 |
--------------------------------------------------------------------------------
/tango2/audioldm_eval/datasets/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/declare-lab/tango/310e68504c3f91200313ccc3b94bf19e0941e339/tango2/audioldm_eval/datasets/__init__.py
--------------------------------------------------------------------------------
/tango2/audioldm_eval/datasets/transforms.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from specvqgan.modules.losses.vggishish.transforms import Crop
3 |
4 |
5 | class FromMinusOneOneToZeroOne(object):
6 | """Actually, it doesnot do [-1, 1] --> [0, 1] as promised. It would, if inputs would be in [-1, 1]
7 | but reconstructed specs are not."""
8 |
9 | def __call__(self, item):
10 | item["image"] = (item["image"] + 1) / 2
11 | return item
12 |
13 |
14 | class CropNoDict(Crop):
15 | def __init__(self, cropped_shape, random_crop=None):
16 | super().__init__(cropped_shape=cropped_shape, random_crop=random_crop)
17 |
18 | def __call__(self, x):
19 |         # albumentations expects an ndarray of size (H, W, ...) but we have a tensor of size (B, H, W).
20 |         # We treat the batch dim (B) as our "channel" dim and permute it to the end.
21 |         # Finally, we convert the result back to a torch.Tensor.
22 | x = self.preprocessor(image=x.permute(1, 2, 0).numpy())["image"].transpose(
23 | 2, 0, 1
24 | )
25 | return torch.from_numpy(x)
26 |
27 |
28 | class GetInputFromBatchByKey(object): # get image from item dict
29 | def __init__(self, input_key):
30 | self.input_key = input_key
31 |
32 | def __call__(self, item):
33 | return item[self.input_key]
34 |
35 |
36 | class ToFloat32(object):
37 | def __call__(self, item):
38 | return item.float()
39 |
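
A hypothetical composition of these transforms with `torchvision.transforms.Compose` (assumes torchvision is installed and the classes above are in scope; `CropNoDict` is left out because it pulls in the external `specvqgan` dependency):

```python
import torch
from torchvision import transforms as T

pipeline = T.Compose([
    FromMinusOneOneToZeroOne(),       # dict -> dict, rescales item["image"] to [0, 1]
    GetInputFromBatchByKey("image"),  # dict -> tensor
    ToFloat32(),                      # tensor -> float32 tensor
])

batch = {"image": torch.rand(1, 80, 512) * 2 - 1}  # fake spectrogram in [-1, 1]
spec = pipeline(batch)
print(spec.dtype, float(spec.min()) >= 0.0)
```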
--------------------------------------------------------------------------------
/tango2/audioldm_eval/feature_extractors/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/declare-lab/tango/310e68504c3f91200313ccc3b94bf19e0941e339/tango2/audioldm_eval/feature_extractors/__init__.py
--------------------------------------------------------------------------------
/tango2/audioldm_eval/feature_extractors/panns/__init__.py:
--------------------------------------------------------------------------------
1 | from .models import Cnn14, Cnn14_16k
2 |
--------------------------------------------------------------------------------
/tango2/audioldm_eval/feature_extractors/panns/evaluate.py:
--------------------------------------------------------------------------------
1 | from sklearn import metrics
2 |
3 | from pytorch_utils import forward
4 |
5 |
6 | class Evaluator(object):
7 | def __init__(self, model):
8 | """Evaluator.
9 |
10 | Args:
11 | model: object
12 | """
13 | self.model = model
14 |
15 | def evaluate(self, data_loader):
16 | """Forward evaluation data and calculate statistics.
17 |
18 | Args:
19 | data_loader: object
20 |
21 | Returns:
22 | statistics: dict,
23 | {'average_precision': (classes_num,), 'auc': (classes_num,)}
24 | """
25 |
26 | # Forward
27 | output_dict = forward(
28 | model=self.model, generator=data_loader, return_target=True
29 | )
30 |
31 | clipwise_output = output_dict["clipwise_output"] # (audios_num, classes_num)
32 | target = output_dict["target"] # (audios_num, classes_num)
33 |
34 | average_precision = metrics.average_precision_score(
35 | target, clipwise_output, average=None
36 | )
37 |
38 | auc = metrics.roc_auc_score(target, clipwise_output, average=None)
39 |
40 | statistics = {"average_precision": average_precision, "auc": auc}
41 |
42 | return statistics
43 |
--------------------------------------------------------------------------------
/tango2/audioldm_eval/feature_extractors/panns/losses.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 |
4 |
5 | def clip_bce(output_dict, target_dict):
6 | """Binary crossentropy loss."""
7 | return F.binary_cross_entropy(output_dict["clipwise_output"], target_dict["target"])
8 |
9 |
10 | def get_loss_func(loss_type):
11 | if loss_type == "clip_bce":
12 | return clip_bce
13 |
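
A quick sanity-check sketch with dummy tensors (shapes are illustrative and `get_loss_func` is assumed to be in scope); both entries are expected to hold probabilities/targets of shape `(batch, classes)`:

```python
import torch

loss_fn = get_loss_func("clip_bce")
output_dict = {"clipwise_output": torch.rand(4, 527)}             # model probabilities
target_dict = {"target": torch.randint(0, 2, (4, 527)).float()}   # multi-hot labels
print(loss_fn(output_dict, target_dict))  # scalar BCE loss
```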
--------------------------------------------------------------------------------
/tango2/audioldm_eval/metrics/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/declare-lab/tango/310e68504c3f91200313ccc3b94bf19e0941e339/tango2/audioldm_eval/metrics/__init__.py
--------------------------------------------------------------------------------
/tango2/audioldm_eval/metrics/gs/__init__.py:
--------------------------------------------------------------------------------
1 | from .geom_score import *
2 | from .top_utils import *
3 | from .utils import *
4 |
--------------------------------------------------------------------------------
/tango2/audioldm_eval/metrics/gs/top_utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | def circle(N=5000):
5 | phi = 2 * np.pi * np.random.rand(N)
6 | x = [[np.sin(phi0), np.cos(phi0)] for phi0 in phi]
7 | x = np.array(x)
8 | x = x + 0.05 * np.random.randn(N, 2)
9 | return x
10 |
11 |
12 | def filled_circle(N=5000):
13 | ans = []
14 | while len(ans) < N:
15 | x = np.random.rand(2) * 2.0 - 1.0
16 | if np.linalg.norm(x) < 1:
17 | ans.append(x)
18 | return np.array(ans) + 0.05 * np.random.randn(N, 2)
19 |
20 |
21 | def circle_quorter(N=5000):
22 | phi = np.pi * np.random.rand(N) + np.pi / 2
23 | x = [[np.sin(phi0), np.cos(phi0)] for phi0 in phi]
24 | x = np.array(x)
25 | x = x + 0.05 * np.random.randn(N, 2)
26 | return x
27 |
28 |
29 | def circle_thin(N=5000):
30 | phi = np.random.randn(N)
31 | x = [[np.sin(phi0), np.cos(phi0)] for phi0 in phi]
32 | x = np.array(x)
33 | x = x + 0.05 * np.random.randn(N, 2)
34 | return x
35 |
36 |
37 | def planar(N=5000, zdim=32, dim=784):
38 | A = np.random.rand(N, zdim)
39 | z = np.random.rand(zdim, dim)
40 | return np.dot(A, z)
41 |
--------------------------------------------------------------------------------
/tango2/audioldm_eval/metrics/isc.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 |
4 |
5 | def calculate_isc(featuresdict, feat_layer_name, rng_seed, samples_shuffle, splits):
6 | # print("Computing Inception Score")
7 |
8 | features = featuresdict[feat_layer_name]
9 |
10 | assert torch.is_tensor(features) and features.dim() == 2
11 | N, C = features.shape
12 | if samples_shuffle:
13 | rng = np.random.RandomState(rng_seed)
14 | features = features[rng.permutation(N), :]
15 | features = features.double()
16 |
17 | p = features.softmax(dim=1)
18 | log_p = features.log_softmax(dim=1)
19 |
20 | scores = []
21 | for i in range(splits):
22 |         p_chunk = p[(i * N // splits) : ((i + 1) * N // splits), :]  # predicted probabilities for this split
23 |         log_p_chunk = log_p[(i * N // splits) : ((i + 1) * N // splits), :]  # corresponding log-probabilities
24 |         q_chunk = p_chunk.mean(dim=0, keepdim=True)  # marginal class distribution over the split
25 |         kl = p_chunk * (log_p_chunk - q_chunk.log())  # per-sample KL(p(y|x) || p(y))
26 | kl = kl.sum(dim=1).mean().exp().item()
27 | scores.append(kl)
28 | # print("scores",scores)
29 | return {
30 | "inception_score_mean": float(np.mean(scores)),
31 | "inception_score_std": float(np.std(scores)),
32 | }
33 |
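
For reference, the per-split quantity accumulated in the loop above is the standard Inception Score; in formula form:

```latex
\mathrm{IS} = \exp\Big( \mathbb{E}_{x}\, D_{\mathrm{KL}}\big( p(y \mid x) \,\|\, p(y) \big) \Big),
\qquad p(y) = \mathbb{E}_{x}\, p(y \mid x)
```

The returned dictionary reports the mean and standard deviation of this value over the splits.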
--------------------------------------------------------------------------------
/tango2/audioldm_eval/metrics/validate.py:
--------------------------------------------------------------------------------
1 | import numpy
2 | from numpy import cov
3 | from numpy import trace
4 | from numpy import iscomplexobj
5 | from numpy.random import random
6 | from scipy.linalg import sqrtm
7 |
8 |
9 | def calculate_fid(act1, act2):
10 | # calculate mean and covariance statistics
11 | mu1, sigma1 = act1.mean(axis=0), cov(act1, rowvar=False)
12 | mu2, sigma2 = act2.mean(axis=0), cov(act2, rowvar=False)
13 | print("mu1 ", mu1.shape)
14 | print("mu2 ", mu2.shape)
15 | print("sigma1 ", sigma1.shape)
16 | print("sigma2 ", sigma2.shape)
17 | # calculate sum squared difference between means
18 |     ssdiff = numpy.sum((mu1 - mu2) ** 2.0)
19 |
20 | # calculate sqrt of product between cov
21 | covmean = sqrtm(sigma1.dot(sigma2))
22 |
23 | # check and correct imaginary numbers from sqrt
24 | if iscomplexobj(covmean):
25 | covmean = covmean.real
26 | # calculate score
27 | fid = ssdiff + trace(sigma1 + sigma2 - 2.0 * covmean)
28 | return fid
29 |
30 |
31 | act1 = random(2048 * 2)
32 | act1 = act1.reshape((2, 2048))
33 | act2 = random(2048 * 2)
34 | act2 = act2.reshape((2, 2048))
35 | fid = calculate_fid(act1, act1)
36 | print("FID (same): %.3f" % fid)
37 | fid = calculate_fid(act1, act2)
38 | print("FID (different): %.3f" % fid)
39 |
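Note: the quantity computed above is the Fréchet distance between Gaussians fitted to the two activation sets, FID = ||mu1 - mu2||^2 + Tr(sigma1 + sigma2 - 2 * sqrtm(sigma1 @ sigma2)), with the mean difference squared elementwise on line 18. A quick sanity check on synthetic activations (everything below is an illustrative sketch, not part of the repository):

import numpy as np

rng = np.random.default_rng(0)
act_a = rng.normal(loc=0.0, scale=1.0, size=(5000, 8))
act_b = rng.normal(loc=1.0, scale=1.0, size=(5000, 8))
# With (near-)identical covariances the trace term is negligible, so the second
# score is roughly ||mu_a - mu_b||^2 = 8 * 1.0^2 = 8.
print(calculate_fid(act_a, act_a))   # close to 0
print(calculate_fid(act_a, act_b))   # close to 8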
--------------------------------------------------------------------------------
/tango2/configs/diffusion_model_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "_class_name": "UNet2DConditionModel",
3 | "_diffusers_version": "0.10.0.dev0",
4 | "act_fn": "silu",
5 | "attention_head_dim": [
6 | 5,
7 | 10,
8 | 20,
9 | 20
10 | ],
11 | "block_out_channels": [
12 | 320,
13 | 640,
14 | 1280,
15 | 1280
16 | ],
17 | "center_input_sample": false,
18 | "cross_attention_dim": 1024,
19 | "down_block_types": [
20 | "CrossAttnDownBlock2D",
21 | "CrossAttnDownBlock2D",
22 | "CrossAttnDownBlock2D",
23 | "DownBlock2D"
24 | ],
25 | "downsample_padding": 1,
26 | "dual_cross_attention": false,
27 | "flip_sin_to_cos": true,
28 | "freq_shift": 0,
29 | "in_channels": 8,
30 | "layers_per_block": 2,
31 | "mid_block_scale_factor": 1,
32 | "norm_eps": 1e-05,
33 | "norm_num_groups": 32,
34 | "num_class_embeds": null,
35 | "only_cross_attention": false,
36 | "out_channels": 8,
37 | "sample_size": [32, 2],
38 | "up_block_types": [
39 | "UpBlock2D",
40 | "CrossAttnUpBlock2D",
41 | "CrossAttnUpBlock2D",
42 | "CrossAttnUpBlock2D"
43 | ],
44 | "use_linear_projection": true,
45 | "upcast_attention": true
46 | }
47 |
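Note: this JSON is a diffusers UNet2DConditionModel configuration; its cross_attention_dim of 1024 lines up with the hidden size of the flan-t5-large text encoder named in train.sh below. A minimal sketch of turning the file into a randomly initialised UNet (the path and variable names are illustrative, and whether models.py does exactly this is not shown here):

from diffusers import UNet2DConditionModel

config = UNet2DConditionModel.load_config("configs/diffusion_model_config.json")
unet = UNet2DConditionModel.from_config(config)
print(unet.config.in_channels, unet.config.cross_attention_dim)   # 8, 1024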
--------------------------------------------------------------------------------
/tango2/configs/diffusion_model_xl_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "_class_name": "UNet2DConditionModel",
3 | "_diffusers_version": "0.10.0.dev0",
4 | "act_fn": "silu",
5 | "attention_head_dim": [
6 | 5,
7 | 10,
8 | 20,
9 | 20
10 | ],
11 | "block_out_channels": [
12 | 320,
13 | 640,
14 | 1280,
15 | 1280
16 | ],
17 | "center_input_sample": false,
18 | "cross_attention_dim": 2048,
19 | "down_block_types": [
20 | "CrossAttnDownBlock2D",
21 | "CrossAttnDownBlock2D",
22 | "CrossAttnDownBlock2D",
23 | "DownBlock2D"
24 | ],
25 | "downsample_padding": 1,
26 | "dual_cross_attention": false,
27 | "flip_sin_to_cos": true,
28 | "freq_shift": 0,
29 | "in_channels": 8,
30 | "layers_per_block": 2,
31 | "mid_block_scale_factor": 1,
32 | "norm_eps": 1e-05,
33 | "norm_num_groups": 32,
34 | "num_class_embeds": null,
35 | "only_cross_attention": false,
36 | "out_channels": 8,
37 | "sample_size": [32, 2],
38 | "up_block_types": [
39 | "UpBlock2D",
40 | "CrossAttnUpBlock2D",
41 | "CrossAttnUpBlock2D",
42 | "CrossAttnUpBlock2D"
43 | ],
44 | "use_linear_projection": true,
45 | "upcast_attention": true
46 | }
47 |
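Note: the only substantive change from the previous config is cross_attention_dim of 2048, presumably so the cross-attention layers match the hidden size of a larger text encoder such as flan-t5-xl. A quick way to check that assumption against the model configs on the Hub:

from transformers import AutoConfig

print(AutoConfig.from_pretrained("google/flan-t5-large").d_model)   # expected 1024
print(AutoConfig.from_pretrained("google/flan-t5-xl").d_model)      # expected 2048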
--------------------------------------------------------------------------------
/tango2/configs/stable_diffusion_2.1.json:
--------------------------------------------------------------------------------
1 | {
2 | "_class_name": "UNet2DConditionModel",
3 | "_diffusers_version": "0.10.0.dev0",
4 | "act_fn": "silu",
5 | "attention_head_dim": [
6 | 5,
7 | 10,
8 | 20,
9 | 20
10 | ],
11 | "block_out_channels": [
12 | 320,
13 | 640,
14 | 1280,
15 | 1280
16 | ],
17 | "center_input_sample": false,
18 | "cross_attention_dim": 1024,
19 | "down_block_types": [
20 | "CrossAttnDownBlock2D",
21 | "CrossAttnDownBlock2D",
22 | "CrossAttnDownBlock2D",
23 | "DownBlock2D"
24 | ],
25 | "downsample_padding": 1,
26 | "dual_cross_attention": false,
27 | "flip_sin_to_cos": true,
28 | "freq_shift": 0,
29 | "in_channels": 4,
30 | "layers_per_block": 2,
31 | "mid_block_scale_factor": 1,
32 | "norm_eps": 1e-05,
33 | "norm_num_groups": 32,
34 | "num_class_embeds": null,
35 | "only_cross_attention": false,
36 | "out_channels": 4,
37 | "sample_size": 96,
38 | "up_block_types": [
39 | "UpBlock2D",
40 | "CrossAttnUpBlock2D",
41 | "CrossAttnUpBlock2D",
42 | "CrossAttnUpBlock2D"
43 | ],
44 | "use_linear_projection": true,
45 | "upcast_attention": true
46 | }
47 |
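Note: this appears to be the stock Stable Diffusion 2.1 UNet configuration kept alongside the audio configs; it differs mainly in its latent shape. A small sketch that surfaces the differing fields (run from the repository root):

import json

with open("configs/diffusion_model_config.json") as f:
    audio_cfg = json.load(f)
with open("configs/stable_diffusion_2.1.json") as f:
    sd_cfg = json.load(f)

diff = {k: (audio_cfg[k], sd_cfg.get(k)) for k in audio_cfg if audio_cfg[k] != sd_cfg.get(k)}
print(diff)   # expect in_channels/out_channels (8 vs 4) and sample_size ([32, 2] vs 96)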
--------------------------------------------------------------------------------
/tango2/requirements.txt:
--------------------------------------------------------------------------------
1 | torch==1.13.1
2 | torchaudio==0.13.1
3 | torchvision==0.14.1
4 | transformers==4.27.0
5 | accelerate==0.18.0
6 | datasets==2.1.0
7 | diffusers==0.18.2
8 | einops==0.6.1
9 | h5py==3.8.0
10 | huggingface_hub==0.13.3
11 | importlib_metadata==6.3.0
12 | librosa==0.9.2
13 | matplotlib==3.5.2
14 | numpy==1.23.0
15 | omegaconf==2.3.0
16 | packaging==23.1
17 | pandas==1.4.1
18 | progressbar33==2.4
19 | protobuf==3.20.*
20 | resampy==0.4.2
21 | scikit_image==0.19.3
22 | scikit_learn==1.2.2
23 | scipy==1.8.0
24 | soundfile==0.12.1
25 | ssr_eval==0.0.6
26 | torchlibrosa==0.1.0
27 | tqdm==4.63.1
28 | wandb==0.12.14
29 | ipython==8.12.0
30 |
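Note: the pins above fix a torch 1.13 / diffusers 0.18 era environment. A small sketch for checking an installed environment against a few of the pins (the package names are taken from the list above):

from importlib.metadata import version

for pkg in ("torch", "transformers", "diffusers", "accelerate"):
    print(pkg, version(pkg))   # expect 1.13.1, 4.27.0, 0.18.2, 0.18.0 per the pins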
--------------------------------------------------------------------------------
/tango2/tools/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/declare-lab/tango/310e68504c3f91200313ccc3b94bf19e0941e339/tango2/tools/__init__.py
--------------------------------------------------------------------------------
/tango2/tools/mix.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | def a_weight(fs, n_fft, min_db=-80.0):
5 | freq = np.linspace(0, fs // 2, n_fft // 2 + 1)
6 | freq_sq = np.power(freq, 2)
7 | freq_sq[0] = 1.0
8 | weight = 2.0 + 20.0 * (2 * np.log10(12194) + 2 * np.log10(freq_sq)
9 | - np.log10(freq_sq + 12194 ** 2)
10 | - np.log10(freq_sq + 20.6 ** 2)
11 | - 0.5 * np.log10(freq_sq + 107.7 ** 2)
12 | - 0.5 * np.log10(freq_sq + 737.9 ** 2))
13 | weight = np.maximum(weight, min_db)
14 |
15 | return weight
16 |
17 |
18 | def compute_gain(sound, fs, min_db=-80.0, mode="A_weighting"):
19 | if fs == 16000:
20 | n_fft = 2048
21 | elif fs == 44100:
22 | n_fft = 4096
23 | else:
24 | raise Exception("Invalid fs {}".format(fs))
25 | stride = n_fft // 2
26 |
27 | gain = []
28 | for i in range(0, len(sound) - n_fft + 1, stride):
29 | if mode == "RMSE":
30 | g = np.mean(sound[i: i + n_fft] ** 2)
31 | elif mode == "A_weighting":
32 | spec = np.fft.rfft(np.hanning(n_fft + 1)[:-1] * sound[i: i + n_fft])
33 | power_spec = np.abs(spec) ** 2
34 | a_weighted_spec = power_spec * np.power(10, a_weight(fs, n_fft) / 10)
35 | g = np.sum(a_weighted_spec)
36 | else:
37 | raise Exception("Invalid mode {}".format(mode))
38 | gain.append(g)
39 |
40 | gain = np.array(gain)
41 | gain = np.maximum(gain, np.power(10, min_db / 10))
42 | gain_db = 10 * np.log10(gain)
43 | return gain_db
44 |
45 |
46 | def mix(sound1, sound2, r, fs):
47 | gain1 = np.max(compute_gain(sound1, fs)) # Decibel
48 | gain2 = np.max(compute_gain(sound2, fs))
49 | t = 1.0 / (1 + np.power(10, (gain1 - gain2) / 20.) * (1 - r) / r)
50 | sound = ((sound1 * t + sound2 * (1 - t)) / np.sqrt(t ** 2 + (1 - t) ** 2))
51 | return sound
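Note: mix() blends two waveforms so that, after A-weighted level matching, sound1 dominates the mix roughly in proportion r; this is presumably the waveform-level augmentation behind the --augment flag in train.sh. A minimal usage sketch at 16 kHz (the signal contents are illustrative):

import numpy as np

fs = 16000
t = np.arange(2 * fs) / fs                       # two seconds of audio
sound1 = 0.5 * np.sin(2 * np.pi * 440.0 * t)     # 440 Hz tone
sound2 = 0.1 * np.random.randn(len(t))           # white noise
mixed = mix(sound1, sound2, r=0.7, fs=fs)
print(mixed.shape, float(np.abs(mixed).max()))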
--------------------------------------------------------------------------------
/tools/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/declare-lab/tango/310e68504c3f91200313ccc3b94bf19e0941e339/tools/__init__.py
--------------------------------------------------------------------------------
/tools/mix.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | def a_weight(fs, n_fft, min_db=-80.0):
5 | freq = np.linspace(0, fs // 2, n_fft // 2 + 1)
6 | freq_sq = np.power(freq, 2)
7 | freq_sq[0] = 1.0
8 | weight = 2.0 + 20.0 * (2 * np.log10(12194) + 2 * np.log10(freq_sq)
9 | - np.log10(freq_sq + 12194 ** 2)
10 | - np.log10(freq_sq + 20.6 ** 2)
11 | - 0.5 * np.log10(freq_sq + 107.7 ** 2)
12 | - 0.5 * np.log10(freq_sq + 737.9 ** 2))
13 | weight = np.maximum(weight, min_db)
14 |
15 | return weight
16 |
17 |
18 | def compute_gain(sound, fs, min_db=-80.0, mode="A_weighting"):
19 | if fs == 16000:
20 | n_fft = 2048
21 | elif fs == 44100:
22 | n_fft = 4096
23 | else:
24 | raise Exception("Invalid fs {}".format(fs))
25 | stride = n_fft // 2
26 |
27 | gain = []
28 | for i in range(0, len(sound) - n_fft + 1, stride):
29 | if mode == "RMSE":
30 | g = np.mean(sound[i: i + n_fft] ** 2)
31 | elif mode == "A_weighting":
32 | spec = np.fft.rfft(np.hanning(n_fft + 1)[:-1] * sound[i: i + n_fft])
33 | power_spec = np.abs(spec) ** 2
34 | a_weighted_spec = power_spec * np.power(10, a_weight(fs, n_fft) / 10)
35 | g = np.sum(a_weighted_spec)
36 | else:
37 | raise Exception("Invalid mode {}".format(mode))
38 | gain.append(g)
39 |
40 | gain = np.array(gain)
41 | gain = np.maximum(gain, np.power(10, min_db / 10))
42 | gain_db = 10 * np.log10(gain)
43 | return gain_db
44 |
45 |
46 | def mix(sound1, sound2, r, fs):
47 | gain1 = np.max(compute_gain(sound1, fs)) # Decibel
48 | gain2 = np.max(compute_gain(sound2, fs))
49 | t = 1.0 / (1 + np.power(10, (gain1 - gain2) / 20.) * (1 - r) / r)
50 | sound = ((sound1 * t + sound2 * (1 - t)) / np.sqrt(t ** 2 + (1 - t) ** 2))
51 | return sound
--------------------------------------------------------------------------------
/train.sh:
--------------------------------------------------------------------------------
1 | # Train the LDM from scratch with a flan-t5-large text encoder
2 | accelerate launch train.py \
3 | --train_file="data/train_audiocaps.json" --validation_file="data/valid_audiocaps.json" --test_file="data/test_audiocaps_subset.json" \
4 | --text_encoder_name="google/flan-t5-large" --scheduler_name="stabilityai/stable-diffusion-2-1" \
5 | --unet_model_config="configs/diffusion_model_config.json" --freeze_text_encoder \
6 | --gradient_accumulation_steps 4 --per_device_train_batch_size=2 --per_device_eval_batch_size=2 --augment \
7 | --learning_rate=3e-5 --num_train_epochs 40 --snr_gamma 5 \
8 | --text_column captions --audio_column location --checkpointing_steps="best"
9 |
10 | # Continue training the LDM from our checkpoint using the --hf_model argument
11 | accelerate launch train.py \
12 | --train_file="data/train_audiocaps.json" --validation_file="data/valid_audiocaps.json" --test_file="data/test_audiocaps_subset.json" \
13 | --hf_model "declare-lab/tango" --unet_model_config="configs/diffusion_model_config.json" --freeze_text_encoder \
14 | --gradient_accumulation_steps 4 --per_device_train_batch_size=2 --per_device_eval_batch_size=2 --augment \
15 | --learning_rate=3e-5 --num_train_epochs 40 --snr_gamma 5 \
16 | --text_column captions --audio_column location --checkpointing_steps="best"
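Note on the batch arithmetic: with per_device_train_batch_size=2 and gradient_accumulation_steps=4, each optimizer step sees an effective batch of 8 examples per process, multiplied by however many processes accelerate launch starts. A trivial worked example (the process count is illustrative):

per_device_batch = 2
grad_accum_steps = 4
num_processes = 4   # depends on the accelerate configuration
print(per_device_batch * grad_accum_steps * num_processes)   # 32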
--------------------------------------------------------------------------------