├── .gitignore
├── LICENSE
├── README.md
├── docs
└── Data.md
├── imgs
├── demo1.png
├── demo2.png
├── github_poster1.png
├── github_poster2.png
├── logo.pdf
├── logo.png
└── method.png
├── llmga
├── __init__.py
├── diffusers
│ ├── .gitignore
│ ├── CITATION.cff
│ ├── CODE_OF_CONDUCT.md
│ ├── CONTRIBUTING.md
│ ├── LICENSE
│ ├── MANIFEST.in
│ ├── Makefile
│ ├── PHILOSOPHY.md
│ ├── README.md
│ ├── _typos.toml
│ ├── docs
│ │ ├── README.md
│ │ ├── TRANSLATING.md
│ │ └── source
│ │ │ ├── _config.py
│ │ │ ├── en
│ │ │ ├── _toctree.yml
│ │ │ ├── api
│ │ │ │ ├── attnprocessor.md
│ │ │ │ ├── configuration.md
│ │ │ │ ├── diffusion_pipeline.md
│ │ │ │ ├── image_processor.md
│ │ │ │ ├── loaders.md
│ │ │ │ ├── logging.md
│ │ │ │ ├── models
│ │ │ │ │ ├── asymmetricautoencoderkl.md
│ │ │ │ │ ├── autoencoder_tiny.md
│ │ │ │ │ ├── autoencoderkl.md
│ │ │ │ │ ├── controlnet.md
│ │ │ │ │ ├── overview.md
│ │ │ │ │ ├── prior_transformer.md
│ │ │ │ │ ├── transformer2d.md
│ │ │ │ │ ├── transformer_temporal.md
│ │ │ │ │ ├── unet.md
│ │ │ │ │ ├── unet2d-cond.md
│ │ │ │ │ ├── unet2d.md
│ │ │ │ │ ├── unet3d-cond.md
│ │ │ │ │ └── vq.md
│ │ │ │ ├── outputs.md
│ │ │ │ ├── pipelines
│ │ │ │ │ ├── alt_diffusion.md
│ │ │ │ │ ├── attend_and_excite.md
│ │ │ │ │ ├── audio_diffusion.md
│ │ │ │ │ ├── audioldm.md
│ │ │ │ │ ├── audioldm2.md
│ │ │ │ │ ├── auto_pipeline.md
│ │ │ │ │ ├── blip_diffusion.md
│ │ │ │ │ ├── consistency_models.md
│ │ │ │ │ ├── controlnet.md
│ │ │ │ │ ├── controlnet_sdxl.md
│ │ │ │ │ ├── cycle_diffusion.md
│ │ │ │ │ ├── dance_diffusion.md
│ │ │ │ │ ├── ddim.md
│ │ │ │ │ ├── ddpm.md
│ │ │ │ │ ├── deepfloyd_if.md
│ │ │ │ │ ├── diffedit.md
│ │ │ │ │ ├── dit.md
│ │ │ │ │ ├── kandinsky.md
│ │ │ │ │ ├── kandinsky_v22.md
│ │ │ │ │ ├── latent_diffusion.md
│ │ │ │ │ ├── latent_diffusion_uncond.md
│ │ │ │ │ ├── model_editing.md
│ │ │ │ │ ├── musicldm.md
│ │ │ │ │ ├── overview.md
│ │ │ │ │ ├── paint_by_example.md
│ │ │ │ │ ├── panorama.md
│ │ │ │ │ ├── paradigms.md
│ │ │ │ │ ├── pix2pix.md
│ │ │ │ │ ├── pix2pix_zero.md
│ │ │ │ │ ├── pndm.md
│ │ │ │ │ ├── repaint.md
│ │ │ │ │ ├── score_sde_ve.md
│ │ │ │ │ ├── self_attention_guidance.md
│ │ │ │ │ ├── semantic_stable_diffusion.md
│ │ │ │ │ ├── shap_e.md
│ │ │ │ │ ├── spectrogram_diffusion.md
│ │ │ │ │ ├── stable_diffusion
│ │ │ │ │ │ ├── adapter.md
│ │ │ │ │ │ ├── depth2img.md
│ │ │ │ │ │ ├── gligen.md
│ │ │ │ │ │ ├── image_variation.md
│ │ │ │ │ │ ├── img2img.md
│ │ │ │ │ │ ├── inpaint.md
│ │ │ │ │ │ ├── latent_upscale.md
│ │ │ │ │ │ ├── ldm3d_diffusion.md
│ │ │ │ │ │ ├── overview.md
│ │ │ │ │ │ ├── stable_diffusion_2.md
│ │ │ │ │ │ ├── stable_diffusion_safe.md
│ │ │ │ │ │ ├── stable_diffusion_xl.md
│ │ │ │ │ │ ├── text2img.md
│ │ │ │ │ │ └── upscale.md
│ │ │ │ │ ├── stable_unclip.md
│ │ │ │ │ ├── stochastic_karras_ve.md
│ │ │ │ │ ├── text_to_video.md
│ │ │ │ │ ├── text_to_video_zero.md
│ │ │ │ │ ├── unclip.md
│ │ │ │ │ ├── unidiffuser.md
│ │ │ │ │ ├── value_guided_sampling.md
│ │ │ │ │ ├── versatile_diffusion.md
│ │ │ │ │ ├── vq_diffusion.md
│ │ │ │ │ └── wuerstchen.md
│ │ │ │ ├── schedulers
│ │ │ │ │ ├── cm_stochastic_iterative.md
│ │ │ │ │ ├── ddim.md
│ │ │ │ │ ├── ddim_inverse.md
│ │ │ │ │ ├── ddpm.md
│ │ │ │ │ ├── deis.md
│ │ │ │ │ ├── dpm_discrete.md
│ │ │ │ │ ├── dpm_discrete_ancestral.md
│ │ │ │ │ ├── dpm_sde.md
│ │ │ │ │ ├── euler.md
│ │ │ │ │ ├── euler_ancestral.md
│ │ │ │ │ ├── heun.md
│ │ │ │ │ ├── ipndm.md
│ │ │ │ │ ├── lms_discrete.md
│ │ │ │ │ ├── multistep_dpm_solver.md
│ │ │ │ │ ├── multistep_dpm_solver_inverse.md
│ │ │ │ │ ├── overview.md
│ │ │ │ │ ├── pndm.md
│ │ │ │ │ ├── repaint.md
│ │ │ │ │ ├── score_sde_ve.md
│ │ │ │ │ ├── score_sde_vp.md
│ │ │ │ │ ├── singlestep_dpm_solver.md
│ │ │ │ │ ├── stochastic_karras_ve.md
│ │ │ │ │ ├── unipc.md
│ │ │ │ │ └── vq_diffusion.md
│ │ │ │ └── utilities.md
│ │ │ ├── conceptual
│ │ │ │ ├── contribution.md
│ │ │ │ ├── ethical_guidelines.md
│ │ │ │ ├── evaluation.md
│ │ │ │ └── philosophy.md
│ │ │ ├── imgs
│ │ │ │ ├── access_request.png
│ │ │ │ └── diffusers_library.jpg
│ │ │ ├── index.md
│ │ │ ├── installation.md
│ │ │ ├── optimization
│ │ │ │ ├── coreml.md
│ │ │ │ ├── fp16.md
│ │ │ │ ├── habana.md
│ │ │ │ ├── memory.md
│ │ │ │ ├── mps.md
│ │ │ │ ├── onnx.md
│ │ │ │ ├── open_vino.md
│ │ │ │ ├── opt_overview.md
│ │ │ │ ├── tome.md
│ │ │ │ ├── torch2.0.md
│ │ │ │ └── xformers.md
│ │ │ ├── quicktour.md
│ │ │ ├── stable_diffusion.md
│ │ │ ├── training
│ │ │ │ ├── adapt_a_model.md
│ │ │ │ ├── controlnet.md
│ │ │ │ ├── create_dataset.md
│ │ │ │ ├── custom_diffusion.md
│ │ │ │ ├── ddpo.md
│ │ │ │ ├── distributed_inference.md
│ │ │ │ ├── dreambooth.md
│ │ │ │ ├── instructpix2pix.md
│ │ │ │ ├── lora.md
│ │ │ │ ├── overview.md
│ │ │ │ ├── t2i_adapters.md
│ │ │ │ ├── text2image.md
│ │ │ │ ├── text_inversion.md
│ │ │ │ └── unconditional_training.md
│ │ │ ├── tutorials
│ │ │ │ ├── autopipeline.md
│ │ │ │ ├── basic_training.md
│ │ │ │ └── tutorial_overview.md
│ │ │ └── using-diffusers
│ │ │ │ ├── conditional_image_generation.md
│ │ │ │ ├── contribute_pipeline.md
│ │ │ │ ├── control_brightness.md
│ │ │ │ ├── controlling_generation.md
│ │ │ │ ├── controlnet.md
│ │ │ │ ├── custom_pipeline_examples.md
│ │ │ │ ├── custom_pipeline_overview.md
│ │ │ │ ├── depth2img.md
│ │ │ │ ├── diffedit.md
│ │ │ │ ├── distilled_sd.md
│ │ │ │ ├── freeu.md
│ │ │ │ ├── img2img.md
│ │ │ │ ├── inpaint.md
│ │ │ │ ├── loading.md
│ │ │ │ ├── loading_overview.md
│ │ │ │ ├── other-formats.md
│ │ │ │ ├── other-modalities.md
│ │ │ │ ├── pipeline_overview.md
│ │ │ │ ├── push_to_hub.md
│ │ │ │ ├── reproducibility.md
│ │ │ │ ├── reusing_seeds.md
│ │ │ │ ├── schedulers.md
│ │ │ │ ├── sdxl.md
│ │ │ │ ├── shap-e.md
│ │ │ │ ├── stable_diffusion_jax_how_to.md
│ │ │ │ ├── textual_inversion_inference.md
│ │ │ │ ├── unconditional_image_generation.md
│ │ │ │ ├── using_safetensors.md
│ │ │ │ ├── weighted_prompts.md
│ │ │ │ └── write_own_pipeline.md
│ │ │ ├── ko
│ │ │ ├── _toctree.yml
│ │ │ ├── api
│ │ │ │ └── pipelines
│ │ │ │ │ └── stable_diffusion
│ │ │ │ │ └── stable_diffusion_xl.md
│ │ │ ├── in_translation.md
│ │ │ ├── index.md
│ │ │ ├── installation.md
│ │ │ ├── optimization
│ │ │ │ ├── coreml.md
│ │ │ │ ├── fp16.md
│ │ │ │ ├── habana.md
│ │ │ │ ├── mps.md
│ │ │ │ ├── onnx.md
│ │ │ │ ├── open_vino.md
│ │ │ │ ├── opt_overview.md
│ │ │ │ ├── tome.md
│ │ │ │ ├── torch2.0.md
│ │ │ │ └── xformers.md
│ │ │ ├── quicktour.md
│ │ │ ├── stable_diffusion.md
│ │ │ ├── training
│ │ │ │ ├── adapt_a_model.md
│ │ │ │ ├── controlnet.md
│ │ │ │ ├── create_dataset.md
│ │ │ │ ├── custom_diffusion.md
│ │ │ │ ├── distributed_inference.md
│ │ │ │ ├── dreambooth.md
│ │ │ │ ├── instructpix2pix.md
│ │ │ │ ├── lora.md
│ │ │ │ ├── overview.md
│ │ │ │ ├── text2image.md
│ │ │ │ ├── text_inversion.md
│ │ │ │ └── unconditional_training.md
│ │ │ ├── tutorials
│ │ │ │ ├── basic_training.md
│ │ │ │ └── tutorial_overview.md
│ │ │ └── using-diffusers
│ │ │ │ ├── conditional_image_generation.md
│ │ │ │ ├── contribute_pipeline.md
│ │ │ │ ├── control_brightness.md
│ │ │ │ ├── controlling_generation.md
│ │ │ │ ├── custom_pipeline_examples.md
│ │ │ │ ├── custom_pipeline_overview.md
│ │ │ │ ├── depth2img.md
│ │ │ │ ├── img2img.md
│ │ │ │ ├── inpaint.md
│ │ │ │ ├── loading.md
│ │ │ │ ├── loading_overview.md
│ │ │ │ ├── other-formats.md
│ │ │ │ ├── pipeline_overview.md
│ │ │ │ ├── reproducibility.md
│ │ │ │ ├── reusing_seeds.md
│ │ │ │ ├── schedulers.md
│ │ │ │ ├── stable_diffusion_jax_how_to.md
│ │ │ │ ├── textual_inversion_inference.md
│ │ │ │ ├── unconditional_image_generation.md
│ │ │ │ ├── using_safetensors.md
│ │ │ │ ├── weighted_prompts.md
│ │ │ │ └── write_own_pipeline.md
│ │ │ └── zh
│ │ │ ├── _toctree.yml
│ │ │ ├── index.md
│ │ │ ├── installation.md
│ │ │ ├── quicktour.md
│ │ │ └── stable_diffusion.md
│ ├── examples
│ │ ├── README.md
│ │ ├── community
│ │ │ ├── README.md
│ │ │ ├── bit_diffusion.py
│ │ │ ├── checkpoint_merger.py
│ │ │ ├── clip_guided_images_mixing_stable_diffusion.py
│ │ │ ├── clip_guided_stable_diffusion.py
│ │ │ ├── clip_guided_stable_diffusion_img2img.py
│ │ │ ├── composable_stable_diffusion.py
│ │ │ ├── ddim_noise_comparative_analysis.py
│ │ │ ├── edict_pipeline.py
│ │ │ ├── iadb.py
│ │ │ ├── imagic_stable_diffusion.py
│ │ │ ├── img2img_inpainting.py
│ │ │ ├── interpolate_stable_diffusion.py
│ │ │ ├── lpw_stable_diffusion.py
│ │ │ ├── lpw_stable_diffusion_onnx.py
│ │ │ ├── lpw_stable_diffusion_xl.py
│ │ │ ├── magic_mix.py
│ │ │ ├── masked_stable_diffusion_img2img.py
│ │ │ ├── mixture_canvas.py
│ │ │ ├── mixture_tiling.py
│ │ │ ├── multilingual_stable_diffusion.py
│ │ │ ├── one_step_unet.py
│ │ │ ├── pipeline_fabric.py
│ │ │ ├── pipeline_prompt2prompt.py
│ │ │ ├── pipeline_zero1to3.py
│ │ │ ├── run_onnx_controlnet.py
│ │ │ ├── run_tensorrt_controlnet.py
│ │ │ ├── sd_text2img_k_diffusion.py
│ │ │ ├── seed_resize_stable_diffusion.py
│ │ │ ├── speech_to_image_diffusion.py
│ │ │ ├── stable_diffusion_comparison.py
│ │ │ ├── stable_diffusion_controlnet_img2img.py
│ │ │ ├── stable_diffusion_controlnet_inpaint.py
│ │ │ ├── stable_diffusion_controlnet_inpaint_img2img.py
│ │ │ ├── stable_diffusion_controlnet_reference.py
│ │ │ ├── stable_diffusion_ipex.py
│ │ │ ├── stable_diffusion_mega.py
│ │ │ ├── stable_diffusion_reference.py
│ │ │ ├── stable_diffusion_repaint.py
│ │ │ ├── stable_diffusion_tensorrt_img2img.py
│ │ │ ├── stable_diffusion_tensorrt_inpaint.py
│ │ │ ├── stable_diffusion_tensorrt_txt2img.py
│ │ │ ├── stable_diffusion_xl_reference.py
│ │ │ ├── stable_unclip.py
│ │ │ ├── text_inpainting.py
│ │ │ ├── tiled_upscaling.py
│ │ │ ├── unclip_image_interpolation.py
│ │ │ ├── unclip_text_interpolation.py
│ │ │ └── wildcard_stable_diffusion.py
│ │ ├── conftest.py
│ │ ├── controlnet
│ │ │ ├── README.md
│ │ │ ├── README_sdxl.md
│ │ │ ├── requirements.txt
│ │ │ ├── requirements_flax.txt
│ │ │ ├── requirements_sdxl.txt
│ │ │ ├── train_controlnet.py
│ │ │ ├── train_controlnet_flax.py
│ │ │ └── train_controlnet_sdxl.py
│ │ ├── custom_diffusion
│ │ │ ├── README.md
│ │ │ ├── requirements.txt
│ │ │ ├── retrieve.py
│ │ │ └── train_custom_diffusion.py
│ │ ├── dreambooth
│ │ │ ├── README.md
│ │ │ ├── README_sdxl.md
│ │ │ ├── requirements.txt
│ │ │ ├── requirements_flax.txt
│ │ │ ├── requirements_sdxl.txt
│ │ │ ├── train_dreambooth.py
│ │ │ ├── train_dreambooth_flax.py
│ │ │ ├── train_dreambooth_lora.py
│ │ │ └── train_dreambooth_lora_sdxl.py
│ │ ├── inference
│ │ │ ├── README.md
│ │ │ ├── image_to_image.py
│ │ │ └── inpainting.py
│ │ ├── instruct_pix2pix
│ │ │ ├── README.md
│ │ │ ├── README_sdxl.md
│ │ │ ├── requirements.txt
│ │ │ ├── train_instruct_pix2pix.py
│ │ │ └── train_instruct_pix2pix_sdxl.py
│ │ ├── kandinsky2_2
│ │ │ └── text_to_image
│ │ │ │ ├── README.md
│ │ │ │ ├── requirements.txt
│ │ │ │ ├── train_text_to_image_decoder.py
│ │ │ │ ├── train_text_to_image_lora_decoder.py
│ │ │ │ ├── train_text_to_image_lora_prior.py
│ │ │ │ └── train_text_to_image_prior.py
│ │ ├── reinforcement_learning
│ │ │ ├── README.md
│ │ │ └── run_diffuser_locomotion.py
│ │ ├── research_projects
│ │ │ ├── README.md
│ │ │ ├── colossalai
│ │ │ │ ├── README.md
│ │ │ │ ├── inference.py
│ │ │ │ ├── requirement.txt
│ │ │ │ └── train_dreambooth_colossalai.py
│ │ │ ├── controlnet
│ │ │ │ └── train_controlnet_webdataset.py
│ │ │ ├── dreambooth_inpaint
│ │ │ │ ├── README.md
│ │ │ │ ├── requirements.txt
│ │ │ │ ├── train_dreambooth_inpaint.py
│ │ │ │ └── train_dreambooth_inpaint_lora.py
│ │ │ ├── intel_opts
│ │ │ │ ├── README.md
│ │ │ │ ├── inference_bf16.py
│ │ │ │ ├── textual_inversion
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── requirements.txt
│ │ │ │ │ └── textual_inversion_bf16.py
│ │ │ │ └── textual_inversion_dfq
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── requirements.txt
│ │ │ │ │ ├── text2images.py
│ │ │ │ │ └── textual_inversion.py
│ │ │ ├── lora
│ │ │ │ ├── README.md
│ │ │ │ ├── requirements.txt
│ │ │ │ └── train_text_to_image_lora.py
│ │ │ ├── mulit_token_textual_inversion
│ │ │ │ ├── README.md
│ │ │ │ ├── multi_token_clip.py
│ │ │ │ ├── requirements.txt
│ │ │ │ ├── requirements_flax.txt
│ │ │ │ ├── textual_inversion.py
│ │ │ │ └── textual_inversion_flax.py
│ │ │ ├── multi_subject_dreambooth
│ │ │ │ ├── README.md
│ │ │ │ ├── requirements.txt
│ │ │ │ └── train_multi_subject_dreambooth.py
│ │ │ ├── onnxruntime
│ │ │ │ ├── README.md
│ │ │ │ ├── text_to_image
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── requirements.txt
│ │ │ │ │ └── train_text_to_image.py
│ │ │ │ ├── textual_inversion
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── requirements.txt
│ │ │ │ │ └── textual_inversion.py
│ │ │ │ └── unconditional_image_generation
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── requirements.txt
│ │ │ │ │ └── train_unconditional.py
│ │ │ ├── rdm
│ │ │ │ ├── README.md
│ │ │ │ ├── pipeline_rdm.py
│ │ │ │ └── retriever.py
│ │ │ └── sdxl_flax
│ │ │ │ ├── README.md
│ │ │ │ ├── sdxl_single.py
│ │ │ │ └── sdxl_single_aot.py
│ │ ├── t2i_adapter
│ │ │ ├── README.md
│ │ │ ├── README_sdxl.md
│ │ │ ├── requirements.txt
│ │ │ └── train_t2i_adapter_sdxl.py
│ │ ├── test_examples.py
│ │ ├── text_to_image
│ │ │ ├── README.md
│ │ │ ├── README_sdxl.md
│ │ │ ├── requirements.txt
│ │ │ ├── requirements_flax.txt
│ │ │ ├── requirements_sdxl.txt
│ │ │ ├── train_text_to_image.py
│ │ │ ├── train_text_to_image_flax.py
│ │ │ ├── train_text_to_image_lora.py
│ │ │ ├── train_text_to_image_lora_sdxl.py
│ │ │ └── train_text_to_image_sdxl.py
│ │ ├── textual_inversion
│ │ │ ├── README.md
│ │ │ ├── requirements.txt
│ │ │ ├── requirements_flax.txt
│ │ │ ├── textual_inversion.py
│ │ │ └── textual_inversion_flax.py
│ │ └── unconditional_image_generation
│ │ │ ├── README.md
│ │ │ ├── requirements.txt
│ │ │ └── train_unconditional.py
│ ├── my_datasets
│ │ ├── dataset_inpainting.py
│ │ ├── dataset_inpainting_sdxl.py
│ │ ├── dataset_text2img.py
│ │ └── dataset_text2img_sdxl.py
│ ├── my_utils
│ │ └── util.py
│ ├── pip.sh
│ ├── pipeline_semantic_stable_diffusion_img2img_solver_lpw.py
│ ├── pipeline_semantic_stable_diffusion_img2img_solver_lpw_mask.py
│ ├── pipeline_stable_diffusion_inpaint_lpw.py
│ ├── pipeline_stable_diffusion_xl_inpaint_lpw.py
│ ├── pipeline_stable_diffusion_xl_lpw.py
│ ├── pyproject.toml
│ ├── scheduling_dpmsolver_multistep_inject.py
│ ├── scripts
│ │ ├── __init__.py
│ │ ├── change_naming_configs_and_checkpoints.py
│ │ ├── conversion_ldm_uncond.py
│ │ ├── convert_asymmetric_vqgan_to_diffusers.py
│ │ ├── convert_blipdiffusion_to_diffusers.py
│ │ ├── convert_consistency_to_diffusers.py
│ │ ├── convert_dance_diffusion_to_diffusers.py
│ │ ├── convert_ddpm_original_checkpoint_to_diffusers.py
│ │ ├── convert_diffusers_to_original_sdxl.py
│ │ ├── convert_diffusers_to_original_stable_diffusion.py
│ │ ├── convert_dit_to_diffusers.py
│ │ ├── convert_gligen_to_diffusers.py
│ │ ├── convert_if.py
│ │ ├── convert_k_upscaler_to_diffusers.py
│ │ ├── convert_kakao_brain_unclip_to_diffusers.py
│ │ ├── convert_kandinsky_to_diffusers.py
│ │ ├── convert_ldm_original_checkpoint_to_diffusers.py
│ │ ├── convert_lora_safetensor_to_diffusers.py
│ │ ├── convert_models_diffuser_to_diffusers.py
│ │ ├── convert_ms_text_to_video_to_diffusers.py
│ │ ├── convert_music_spectrogram_to_diffusers.py
│ │ ├── convert_ncsnpp_original_checkpoint_to_diffusers.py
│ │ ├── convert_original_audioldm2_to_diffusers.py
│ │ ├── convert_original_audioldm_to_diffusers.py
│ │ ├── convert_original_controlnet_to_diffusers.py
│ │ ├── convert_original_musicldm_to_diffusers.py
│ │ ├── convert_original_stable_diffusion_to_diffusers.py
│ │ ├── convert_original_t2i_adapter.py
│ │ ├── convert_shap_e_to_diffusers.py
│ │ ├── convert_stable_diffusion_checkpoint_to_onnx.py
│ │ ├── convert_stable_diffusion_controlnet_to_onnx.py
│ │ ├── convert_stable_diffusion_controlnet_to_tensorrt.py
│ │ ├── convert_tiny_autoencoder_to_diffusers.py
│ │ ├── convert_unclip_txt2img_to_image_variation.py
│ │ ├── convert_unidiffuser_to_diffusers.py
│ │ ├── convert_vae_diff_to_onnx.py
│ │ ├── convert_vae_pt_to_diffusers.py
│ │ ├── convert_versatile_diffusion_to_diffusers.py
│ │ ├── convert_vq_diffusion_to_diffusers.py
│ │ ├── convert_wuerstchen.py
│ │ ├── convert_zero123_to_diffusers.py
│ │ └── generate_logits.py
│ ├── setup.cfg
│ ├── setup.py
│ ├── src
│ │ ├── __init__.py
│ │ └── diffusers
│ │ │ ├── __init__.py
│ │ │ ├── commands
│ │ │ ├── __init__.py
│ │ │ ├── diffusers_cli.py
│ │ │ ├── env.py
│ │ │ └── fp16_safetensors.py
│ │ │ ├── configuration_utils.py
│ │ │ ├── dependency_versions_check.py
│ │ │ ├── dependency_versions_table.py
│ │ │ ├── experimental
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ └── rl
│ │ │ │ ├── __init__.py
│ │ │ │ └── value_guided_sampling.py
│ │ │ ├── image_processor.py
│ │ │ ├── loaders.py
│ │ │ ├── models
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── activations.py
│ │ │ ├── adapter.py
│ │ │ ├── attention.py
│ │ │ ├── attention_flax.py
│ │ │ ├── attention_processor.py
│ │ │ ├── autoencoder_asym_kl.py
│ │ │ ├── autoencoder_kl.py
│ │ │ ├── autoencoder_tiny.py
│ │ │ ├── controlnet.py
│ │ │ ├── controlnet_flax.py
│ │ │ ├── dual_transformer_2d.py
│ │ │ ├── embeddings.py
│ │ │ ├── embeddings_flax.py
│ │ │ ├── lora.py
│ │ │ ├── modeling_flax_pytorch_utils.py
│ │ │ ├── modeling_flax_utils.py
│ │ │ ├── modeling_pytorch_flax_utils.py
│ │ │ ├── modeling_utils.py
│ │ │ ├── prior_transformer.py
│ │ │ ├── resnet.py
│ │ │ ├── resnet_flax.py
│ │ │ ├── t5_film_transformer.py
│ │ │ ├── transformer_2d.py
│ │ │ ├── transformer_temporal.py
│ │ │ ├── unet_1d.py
│ │ │ ├── unet_1d_blocks.py
│ │ │ ├── unet_2d.py
│ │ │ ├── unet_2d_blocks.py
│ │ │ ├── unet_2d_blocks_flax.py
│ │ │ ├── unet_2d_condition.py
│ │ │ ├── unet_2d_condition_flax.py
│ │ │ ├── unet_3d_blocks.py
│ │ │ ├── unet_3d_condition.py
│ │ │ ├── vae.py
│ │ │ ├── vae_flax.py
│ │ │ └── vq_model.py
│ │ │ ├── optimization.py
│ │ │ ├── pipelines
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── alt_diffusion
│ │ │ │ ├── __init__.py
│ │ │ │ ├── modeling_roberta_series.py
│ │ │ │ ├── pipeline_alt_diffusion.py
│ │ │ │ ├── pipeline_alt_diffusion_img2img.py
│ │ │ │ └── pipeline_output.py
│ │ │ ├── audio_diffusion
│ │ │ │ ├── __init__.py
│ │ │ │ ├── mel.py
│ │ │ │ └── pipeline_audio_diffusion.py
│ │ │ ├── audioldm
│ │ │ │ ├── __init__.py
│ │ │ │ └── pipeline_audioldm.py
│ │ │ ├── audioldm2
│ │ │ │ ├── __init__.py
│ │ │ │ ├── modeling_audioldm2.py
│ │ │ │ └── pipeline_audioldm2.py
│ │ │ ├── auto_pipeline.py
│ │ │ ├── blip_diffusion
│ │ │ │ ├── __init__.py
│ │ │ │ ├── blip_image_processing.py
│ │ │ │ ├── modeling_blip2.py
│ │ │ │ ├── modeling_ctx_clip.py
│ │ │ │ └── pipeline_blip_diffusion.py
│ │ │ ├── consistency_models
│ │ │ │ ├── __init__.py
│ │ │ │ └── pipeline_consistency_models.py
│ │ │ ├── controlnet
│ │ │ │ ├── __init__.py
│ │ │ │ ├── multicontrolnet.py
│ │ │ │ ├── pipeline_controlnet.py
│ │ │ │ ├── pipeline_controlnet_blip_diffusion.py
│ │ │ │ ├── pipeline_controlnet_img2img.py
│ │ │ │ ├── pipeline_controlnet_inpaint.py
│ │ │ │ ├── pipeline_controlnet_inpaint_sd_xl.py
│ │ │ │ ├── pipeline_controlnet_sd_xl.py
│ │ │ │ ├── pipeline_controlnet_sd_xl_img2img.py
│ │ │ │ └── pipeline_flax_controlnet.py
│ │ │ ├── dance_diffusion
│ │ │ │ ├── __init__.py
│ │ │ │ └── pipeline_dance_diffusion.py
│ │ │ ├── ddim
│ │ │ │ ├── __init__.py
│ │ │ │ └── pipeline_ddim.py
│ │ │ ├── ddpm
│ │ │ │ ├── __init__.py
│ │ │ │ └── pipeline_ddpm.py
│ │ │ ├── deepfloyd_if
│ │ │ │ ├── __init__.py
│ │ │ │ ├── pipeline_if.py
│ │ │ │ ├── pipeline_if_img2img.py
│ │ │ │ ├── pipeline_if_img2img_superresolution.py
│ │ │ │ ├── pipeline_if_inpainting.py
│ │ │ │ ├── pipeline_if_inpainting_superresolution.py
│ │ │ │ ├── pipeline_if_superresolution.py
│ │ │ │ ├── pipeline_output.py
│ │ │ │ ├── safety_checker.py
│ │ │ │ ├── timesteps.py
│ │ │ │ └── watermark.py
│ │ │ ├── dit
│ │ │ │ ├── __init__.py
│ │ │ │ └── pipeline_dit.py
│ │ │ ├── kandinsky
│ │ │ │ ├── __init__.py
│ │ │ │ ├── pipeline_kandinsky.py
│ │ │ │ ├── pipeline_kandinsky_combined.py
│ │ │ │ ├── pipeline_kandinsky_img2img.py
│ │ │ │ ├── pipeline_kandinsky_inpaint.py
│ │ │ │ ├── pipeline_kandinsky_prior.py
│ │ │ │ └── text_encoder.py
│ │ │ ├── kandinsky2_2
│ │ │ │ ├── __init__.py
│ │ │ │ ├── pipeline_kandinsky2_2.py
│ │ │ │ ├── pipeline_kandinsky2_2_combined.py
│ │ │ │ ├── pipeline_kandinsky2_2_controlnet.py
│ │ │ │ ├── pipeline_kandinsky2_2_controlnet_img2img.py
│ │ │ │ ├── pipeline_kandinsky2_2_img2img.py
│ │ │ │ ├── pipeline_kandinsky2_2_inpainting.py
│ │ │ │ ├── pipeline_kandinsky2_2_prior.py
│ │ │ │ └── pipeline_kandinsky2_2_prior_emb2emb.py
│ │ │ ├── latent_diffusion
│ │ │ │ ├── __init__.py
│ │ │ │ ├── pipeline_latent_diffusion.py
│ │ │ │ └── pipeline_latent_diffusion_superresolution.py
│ │ │ ├── latent_diffusion_uncond
│ │ │ │ ├── __init__.py
│ │ │ │ └── pipeline_latent_diffusion_uncond.py
│ │ │ ├── musicldm
│ │ │ │ ├── __init__.py
│ │ │ │ └── pipeline_musicldm.py
│ │ │ ├── onnx_utils.py
│ │ │ ├── paint_by_example
│ │ │ │ ├── __init__.py
│ │ │ │ ├── image_encoder.py
│ │ │ │ └── pipeline_paint_by_example.py
│ │ │ ├── pipeline_flax_utils.py
│ │ │ ├── pipeline_utils.py
│ │ │ ├── pndm
│ │ │ │ ├── __init__.py
│ │ │ │ └── pipeline_pndm.py
│ │ │ ├── repaint
│ │ │ │ ├── __init__.py
│ │ │ │ └── pipeline_repaint.py
│ │ │ ├── score_sde_ve
│ │ │ │ ├── __init__.py
│ │ │ │ └── pipeline_score_sde_ve.py
│ │ │ ├── semantic_stable_diffusion
│ │ │ │ ├── __init__.py
│ │ │ │ ├── pipeline_output.py
│ │ │ │ └── pipeline_semantic_stable_diffusion.py
│ │ │ ├── shap_e
│ │ │ │ ├── __init__.py
│ │ │ │ ├── camera.py
│ │ │ │ ├── pipeline_shap_e.py
│ │ │ │ ├── pipeline_shap_e_img2img.py
│ │ │ │ └── renderer.py
│ │ │ ├── spectrogram_diffusion
│ │ │ │ ├── __init__.py
│ │ │ │ ├── continous_encoder.py
│ │ │ │ ├── midi_utils.py
│ │ │ │ ├── notes_encoder.py
│ │ │ │ └── pipeline_spectrogram_diffusion.py
│ │ │ ├── stable_diffusion
│ │ │ │ ├── README.md
│ │ │ │ ├── __init__.py
│ │ │ │ ├── clip_image_project_model.py
│ │ │ │ ├── convert_from_ckpt.py
│ │ │ │ ├── pipeline_cycle_diffusion.py
│ │ │ │ ├── pipeline_flax_stable_diffusion.py
│ │ │ │ ├── pipeline_flax_stable_diffusion_img2img.py
│ │ │ │ ├── pipeline_flax_stable_diffusion_inpaint.py
│ │ │ │ ├── pipeline_onnx_stable_diffusion.py
│ │ │ │ ├── pipeline_onnx_stable_diffusion_img2img.py
│ │ │ │ ├── pipeline_onnx_stable_diffusion_inpaint.py
│ │ │ │ ├── pipeline_onnx_stable_diffusion_inpaint_legacy.py
│ │ │ │ ├── pipeline_onnx_stable_diffusion_upscale.py
│ │ │ │ ├── pipeline_output.py
│ │ │ │ ├── pipeline_stable_diffusion.py
│ │ │ │ ├── pipeline_stable_diffusion_attend_and_excite.py
│ │ │ │ ├── pipeline_stable_diffusion_depth2img.py
│ │ │ │ ├── pipeline_stable_diffusion_diffedit.py
│ │ │ │ ├── pipeline_stable_diffusion_gligen.py
│ │ │ │ ├── pipeline_stable_diffusion_gligen_text_image.py
│ │ │ │ ├── pipeline_stable_diffusion_image_variation.py
│ │ │ │ ├── pipeline_stable_diffusion_img2img.py
│ │ │ │ ├── pipeline_stable_diffusion_inpaint.py
│ │ │ │ ├── pipeline_stable_diffusion_inpaint_legacy.py
│ │ │ │ ├── pipeline_stable_diffusion_instruct_pix2pix.py
│ │ │ │ ├── pipeline_stable_diffusion_k_diffusion.py
│ │ │ │ ├── pipeline_stable_diffusion_latent_upscale.py
│ │ │ │ ├── pipeline_stable_diffusion_ldm3d.py
│ │ │ │ ├── pipeline_stable_diffusion_model_editing.py
│ │ │ │ ├── pipeline_stable_diffusion_panorama.py
│ │ │ │ ├── pipeline_stable_diffusion_paradigms.py
│ │ │ │ ├── pipeline_stable_diffusion_pix2pix_zero.py
│ │ │ │ ├── pipeline_stable_diffusion_sag.py
│ │ │ │ ├── pipeline_stable_diffusion_upscale.py
│ │ │ │ ├── pipeline_stable_unclip.py
│ │ │ │ ├── pipeline_stable_unclip_img2img.py
│ │ │ │ ├── safety_checker.py
│ │ │ │ ├── safety_checker_flax.py
│ │ │ │ └── stable_unclip_image_normalizer.py
│ │ │ ├── stable_diffusion_safe
│ │ │ │ ├── __init__.py
│ │ │ │ ├── pipeline_output.py
│ │ │ │ ├── pipeline_stable_diffusion_safe.py
│ │ │ │ └── safety_checker.py
│ │ │ ├── stable_diffusion_xl
│ │ │ │ ├── __init__.py
│ │ │ │ ├── pipeline_flax_stable_diffusion_xl.py
│ │ │ │ ├── pipeline_output.py
│ │ │ │ ├── pipeline_stable_diffusion_xl.py
│ │ │ │ ├── pipeline_stable_diffusion_xl_img2img.py
│ │ │ │ ├── pipeline_stable_diffusion_xl_inpaint.py
│ │ │ │ ├── pipeline_stable_diffusion_xl_instruct_pix2pix.py
│ │ │ │ └── watermark.py
│ │ │ ├── stochastic_karras_ve
│ │ │ │ ├── __init__.py
│ │ │ │ └── pipeline_stochastic_karras_ve.py
│ │ │ ├── t2i_adapter
│ │ │ │ ├── __init__.py
│ │ │ │ ├── pipeline_stable_diffusion_adapter.py
│ │ │ │ └── pipeline_stable_diffusion_xl_adapter.py
│ │ │ ├── text_to_video_synthesis
│ │ │ │ ├── __init__.py
│ │ │ │ ├── pipeline_output.py
│ │ │ │ ├── pipeline_text_to_video_synth.py
│ │ │ │ ├── pipeline_text_to_video_synth_img2img.py
│ │ │ │ └── pipeline_text_to_video_zero.py
│ │ │ ├── unclip
│ │ │ │ ├── __init__.py
│ │ │ │ ├── pipeline_unclip.py
│ │ │ │ ├── pipeline_unclip_image_variation.py
│ │ │ │ └── text_proj.py
│ │ │ ├── unidiffuser
│ │ │ │ ├── __init__.py
│ │ │ │ ├── modeling_text_decoder.py
│ │ │ │ ├── modeling_uvit.py
│ │ │ │ └── pipeline_unidiffuser.py
│ │ │ ├── versatile_diffusion
│ │ │ │ ├── __init__.py
│ │ │ │ ├── modeling_text_unet.py
│ │ │ │ ├── pipeline_versatile_diffusion.py
│ │ │ │ ├── pipeline_versatile_diffusion_dual_guided.py
│ │ │ │ ├── pipeline_versatile_diffusion_image_variation.py
│ │ │ │ └── pipeline_versatile_diffusion_text_to_image.py
│ │ │ ├── vq_diffusion
│ │ │ │ ├── __init__.py
│ │ │ │ └── pipeline_vq_diffusion.py
│ │ │ └── wuerstchen
│ │ │ │ ├── __init__.py
│ │ │ │ ├── modeling_paella_vq_model.py
│ │ │ │ ├── modeling_wuerstchen_common.py
│ │ │ │ ├── modeling_wuerstchen_diffnext.py
│ │ │ │ ├── modeling_wuerstchen_prior.py
│ │ │ │ ├── pipeline_wuerstchen.py
│ │ │ │ ├── pipeline_wuerstchen_combined.py
│ │ │ │ └── pipeline_wuerstchen_prior.py
│ │ │ ├── py.typed
│ │ │ ├── schedulers
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── scheduling_consistency_models.py
│ │ │ ├── scheduling_ddim.py
│ │ │ ├── scheduling_ddim_flax.py
│ │ │ ├── scheduling_ddim_inverse.py
│ │ │ ├── scheduling_ddim_parallel.py
│ │ │ ├── scheduling_ddpm.py
│ │ │ ├── scheduling_ddpm_flax.py
│ │ │ ├── scheduling_ddpm_parallel.py
│ │ │ ├── scheduling_ddpm_wuerstchen.py
│ │ │ ├── scheduling_deis_multistep.py
│ │ │ ├── scheduling_dpmsolver_multistep.py
│ │ │ ├── scheduling_dpmsolver_multistep_flax.py
│ │ │ ├── scheduling_dpmsolver_multistep_inverse.py
│ │ │ ├── scheduling_dpmsolver_sde.py
│ │ │ ├── scheduling_dpmsolver_singlestep.py
│ │ │ ├── scheduling_euler_ancestral_discrete.py
│ │ │ ├── scheduling_euler_discrete.py
│ │ │ ├── scheduling_euler_discrete_flax.py
│ │ │ ├── scheduling_heun_discrete.py
│ │ │ ├── scheduling_ipndm.py
│ │ │ ├── scheduling_k_dpm_2_ancestral_discrete.py
│ │ │ ├── scheduling_k_dpm_2_discrete.py
│ │ │ ├── scheduling_karras_ve.py
│ │ │ ├── scheduling_karras_ve_flax.py
│ │ │ ├── scheduling_lms_discrete.py
│ │ │ ├── scheduling_lms_discrete_flax.py
│ │ │ ├── scheduling_pndm.py
│ │ │ ├── scheduling_pndm_flax.py
│ │ │ ├── scheduling_repaint.py
│ │ │ ├── scheduling_sde_ve.py
│ │ │ ├── scheduling_sde_ve_flax.py
│ │ │ ├── scheduling_sde_vp.py
│ │ │ ├── scheduling_unclip.py
│ │ │ ├── scheduling_unipc_multistep.py
│ │ │ ├── scheduling_utils.py
│ │ │ ├── scheduling_utils_flax.py
│ │ │ └── scheduling_vq_diffusion.py
│ │ │ ├── training_utils.py
│ │ │ └── utils
│ │ │ ├── __init__.py
│ │ │ ├── accelerate_utils.py
│ │ │ ├── constants.py
│ │ │ ├── deprecation_utils.py
│ │ │ ├── doc_utils.py
│ │ │ ├── dummy_flax_and_transformers_objects.py
│ │ │ ├── dummy_flax_objects.py
│ │ │ ├── dummy_note_seq_objects.py
│ │ │ ├── dummy_onnx_objects.py
│ │ │ ├── dummy_pt_objects.py
│ │ │ ├── dummy_torch_and_librosa_objects.py
│ │ │ ├── dummy_torch_and_scipy_objects.py
│ │ │ ├── dummy_torch_and_torchsde_objects.py
│ │ │ ├── dummy_torch_and_transformers_and_k_diffusion_objects.py
│ │ │ ├── dummy_torch_and_transformers_and_onnx_objects.py
│ │ │ ├── dummy_torch_and_transformers_objects.py
│ │ │ ├── dummy_transformers_and_torch_and_note_seq_objects.py
│ │ │ ├── dynamic_modules_utils.py
│ │ │ ├── export_utils.py
│ │ │ ├── hub_utils.py
│ │ │ ├── import_utils.py
│ │ │ ├── loading_utils.py
│ │ │ ├── logging.py
│ │ │ ├── model_card_template.md
│ │ │ ├── outputs.py
│ │ │ ├── peft_utils.py
│ │ │ ├── pil_utils.py
│ │ │ ├── state_dict_utils.py
│ │ │ ├── testing_utils.py
│ │ │ └── torch_utils.py
│ ├── tests
│ │ ├── __init__.py
│ │ ├── conftest.py
│ │ ├── fixtures
│ │ │ ├── custom_pipeline
│ │ │ │ ├── pipeline.py
│ │ │ │ └── what_ever.py
│ │ │ └── elise_format0.mid
│ │ ├── lora
│ │ │ ├── test_lora_layers_old_backend.py
│ │ │ └── test_lora_layers_peft.py
│ │ ├── models
│ │ │ ├── __init__.py
│ │ │ ├── test_activations.py
│ │ │ ├── test_attention_processor.py
│ │ │ ├── test_layers_utils.py
│ │ │ ├── test_modeling_common.py
│ │ │ ├── test_modeling_common_flax.py
│ │ │ ├── test_models_prior.py
│ │ │ ├── test_models_unet_1d.py
│ │ │ ├── test_models_unet_2d.py
│ │ │ ├── test_models_unet_2d_condition.py
│ │ │ ├── test_models_unet_2d_flax.py
│ │ │ ├── test_models_unet_3d_condition.py
│ │ │ ├── test_models_vae.py
│ │ │ ├── test_models_vae_flax.py
│ │ │ ├── test_models_vq.py
│ │ │ ├── test_unet_2d_blocks.py
│ │ │ └── test_unet_blocks_common.py
│ │ ├── others
│ │ │ ├── test_check_copies.py
│ │ │ ├── test_check_dummies.py
│ │ │ ├── test_config.py
│ │ │ ├── test_dependencies.py
│ │ │ ├── test_ema.py
│ │ │ ├── test_hub_utils.py
│ │ │ ├── test_image_processor.py
│ │ │ ├── test_outputs.py
│ │ │ ├── test_training.py
│ │ │ └── test_utils.py
│ │ ├── pipelines
│ │ │ ├── __init__.py
│ │ │ ├── altdiffusion
│ │ │ │ ├── __init__.py
│ │ │ │ ├── test_alt_diffusion.py
│ │ │ │ └── test_alt_diffusion_img2img.py
│ │ │ ├── audio_diffusion
│ │ │ │ ├── __init__.py
│ │ │ │ └── test_audio_diffusion.py
│ │ │ ├── audioldm
│ │ │ │ ├── __init__.py
│ │ │ │ └── test_audioldm.py
│ │ │ ├── audioldm2
│ │ │ │ ├── __init__.py
│ │ │ │ └── test_audioldm2.py
│ │ │ ├── blipdiffusion
│ │ │ │ ├── __init__.py
│ │ │ │ └── test_blipdiffusion.py
│ │ │ ├── consistency_models
│ │ │ │ ├── __init__.py
│ │ │ │ └── test_consistency_models.py
│ │ │ ├── controlnet
│ │ │ │ ├── __init__.py
│ │ │ │ ├── test_controlnet.py
│ │ │ │ ├── test_controlnet_blip_diffusion.py
│ │ │ │ ├── test_controlnet_img2img.py
│ │ │ │ ├── test_controlnet_inpaint.py
│ │ │ │ ├── test_controlnet_inpaint_sdxl.py
│ │ │ │ ├── test_controlnet_sdxl.py
│ │ │ │ ├── test_controlnet_sdxl_img2img.py
│ │ │ │ └── test_flax_controlnet.py
│ │ │ ├── dance_diffusion
│ │ │ │ ├── __init__.py
│ │ │ │ └── test_dance_diffusion.py
│ │ │ ├── ddim
│ │ │ │ ├── __init__.py
│ │ │ │ └── test_ddim.py
│ │ │ ├── ddpm
│ │ │ │ ├── __init__.py
│ │ │ │ └── test_ddpm.py
│ │ │ ├── deepfloyd_if
│ │ │ │ ├── __init__.py
│ │ │ │ ├── test_if.py
│ │ │ │ ├── test_if_img2img.py
│ │ │ │ ├── test_if_img2img_superresolution.py
│ │ │ │ ├── test_if_inpainting.py
│ │ │ │ ├── test_if_inpainting_superresolution.py
│ │ │ │ └── test_if_superresolution.py
│ │ │ ├── dit
│ │ │ │ ├── __init__.py
│ │ │ │ └── test_dit.py
│ │ │ ├── kandinsky
│ │ │ │ ├── __init__.py
│ │ │ │ ├── test_kandinsky.py
│ │ │ │ ├── test_kandinsky_combined.py
│ │ │ │ ├── test_kandinsky_img2img.py
│ │ │ │ ├── test_kandinsky_inpaint.py
│ │ │ │ └── test_kandinsky_prior.py
│ │ │ ├── kandinsky2_2
│ │ │ │ ├── __init__.py
│ │ │ │ ├── test_kandinsky.py
│ │ │ │ ├── test_kandinsky_combined.py
│ │ │ │ ├── test_kandinsky_controlnet.py
│ │ │ │ ├── test_kandinsky_controlnet_img2img.py
│ │ │ │ ├── test_kandinsky_img2img.py
│ │ │ │ ├── test_kandinsky_inpaint.py
│ │ │ │ ├── test_kandinsky_prior.py
│ │ │ │ └── test_kandinsky_prior_emb2emb.py
│ │ │ ├── karras_ve
│ │ │ │ ├── __init__.py
│ │ │ │ └── test_karras_ve.py
│ │ │ ├── latent_diffusion
│ │ │ │ ├── __init__.py
│ │ │ │ ├── test_latent_diffusion.py
│ │ │ │ ├── test_latent_diffusion_superresolution.py
│ │ │ │ └── test_latent_diffusion_uncond.py
│ │ │ ├── musicldm
│ │ │ │ ├── __init__.py
│ │ │ │ └── test_musicldm.py
│ │ │ ├── paint_by_example
│ │ │ │ ├── __init__.py
│ │ │ │ └── test_paint_by_example.py
│ │ │ ├── pipeline_params.py
│ │ │ ├── pndm
│ │ │ │ ├── __init__.py
│ │ │ │ └── test_pndm.py
│ │ │ ├── repaint
│ │ │ │ ├── __init__.py
│ │ │ │ └── test_repaint.py
│ │ │ ├── score_sde_ve
│ │ │ │ ├── __init__.py
│ │ │ │ └── test_score_sde_ve.py
│ │ │ ├── semantic_stable_diffusion
│ │ │ │ ├── __init__.py
│ │ │ │ └── test_semantic_diffusion.py
│ │ │ ├── shap_e
│ │ │ │ ├── __init__.py
│ │ │ │ ├── test_shap_e.py
│ │ │ │ └── test_shap_e_img2img.py
│ │ │ ├── spectrogram_diffusion
│ │ │ │ ├── __init__.py
│ │ │ │ └── test_spectrogram_diffusion.py
│ │ │ ├── stable_diffusion
│ │ │ │ ├── __init__.py
│ │ │ │ ├── test_cycle_diffusion.py
│ │ │ │ ├── test_onnx_stable_diffusion.py
│ │ │ │ ├── test_onnx_stable_diffusion_img2img.py
│ │ │ │ ├── test_onnx_stable_diffusion_inpaint.py
│ │ │ │ ├── test_onnx_stable_diffusion_inpaint_legacy.py
│ │ │ │ ├── test_onnx_stable_diffusion_upscale.py
│ │ │ │ ├── test_stable_diffusion.py
│ │ │ │ ├── test_stable_diffusion_adapter.py
│ │ │ │ ├── test_stable_diffusion_gligen.py
│ │ │ │ ├── test_stable_diffusion_gligen_text_image.py
│ │ │ │ ├── test_stable_diffusion_image_variation.py
│ │ │ │ ├── test_stable_diffusion_img2img.py
│ │ │ │ ├── test_stable_diffusion_inpaint.py
│ │ │ │ ├── test_stable_diffusion_inpaint_legacy.py
│ │ │ │ ├── test_stable_diffusion_instruction_pix2pix.py
│ │ │ │ ├── test_stable_diffusion_k_diffusion.py
│ │ │ │ ├── test_stable_diffusion_ldm3d.py
│ │ │ │ ├── test_stable_diffusion_model_editing.py
│ │ │ │ ├── test_stable_diffusion_panorama.py
│ │ │ │ ├── test_stable_diffusion_paradigms.py
│ │ │ │ ├── test_stable_diffusion_pix2pix_zero.py
│ │ │ │ └── test_stable_diffusion_sag.py
│ │ │ ├── stable_diffusion_2
│ │ │ │ ├── __init__.py
│ │ │ │ ├── test_stable_diffusion.py
│ │ │ │ ├── test_stable_diffusion_attend_and_excite.py
│ │ │ │ ├── test_stable_diffusion_depth.py
│ │ │ │ ├── test_stable_diffusion_diffedit.py
│ │ │ │ ├── test_stable_diffusion_flax.py
│ │ │ │ ├── test_stable_diffusion_flax_inpaint.py
│ │ │ │ ├── test_stable_diffusion_inpaint.py
│ │ │ │ ├── test_stable_diffusion_latent_upscale.py
│ │ │ │ ├── test_stable_diffusion_upscale.py
│ │ │ │ └── test_stable_diffusion_v_pred.py
│ │ │ ├── stable_diffusion_safe
│ │ │ │ ├── __init__.py
│ │ │ │ └── test_safe_diffusion.py
│ │ │ ├── stable_diffusion_xl
│ │ │ │ ├── __init__.py
│ │ │ │ ├── test_stable_diffusion_xl.py
│ │ │ │ ├── test_stable_diffusion_xl_adapter.py
│ │ │ │ ├── test_stable_diffusion_xl_img2img.py
│ │ │ │ ├── test_stable_diffusion_xl_inpaint.py
│ │ │ │ └── test_stable_diffusion_xl_instruction_pix2pix.py
│ │ │ ├── stable_unclip
│ │ │ │ ├── __init__.py
│ │ │ │ ├── test_stable_unclip.py
│ │ │ │ └── test_stable_unclip_img2img.py
│ │ │ ├── test_pipeline_utils.py
│ │ │ ├── test_pipelines.py
│ │ │ ├── test_pipelines_auto.py
│ │ │ ├── test_pipelines_combined.py
│ │ │ ├── test_pipelines_common.py
│ │ │ ├── test_pipelines_flax.py
│ │ │ ├── test_pipelines_onnx_common.py
│ │ │ ├── text_to_video_synthesis
│ │ │ │ ├── __init__.py
│ │ │ │ ├── test_text_to_video.py
│ │ │ │ ├── test_text_to_video_zero.py
│ │ │ │ └── test_video_to_video.py
│ │ │ ├── unclip
│ │ │ │ ├── __init__.py
│ │ │ │ ├── test_unclip.py
│ │ │ │ └── test_unclip_image_variation.py
│ │ │ ├── unidiffuser
│ │ │ │ ├── __init__.py
│ │ │ │ └── test_unidiffuser.py
│ │ │ ├── versatile_diffusion
│ │ │ │ ├── __init__.py
│ │ │ │ ├── test_versatile_diffusion_dual_guided.py
│ │ │ │ ├── test_versatile_diffusion_image_variation.py
│ │ │ │ ├── test_versatile_diffusion_mega.py
│ │ │ │ └── test_versatile_diffusion_text_to_image.py
│ │ │ ├── vq_diffusion
│ │ │ │ ├── __init__.py
│ │ │ │ └── test_vq_diffusion.py
│ │ │ └── wuerstchen
│ │ │ │ ├── __init__.py
│ │ │ │ ├── test_wuerstchen_combined.py
│ │ │ │ ├── test_wuerstchen_decoder.py
│ │ │ │ └── test_wuerstchen_prior.py
│ │ └── schedulers
│ │ │ ├── __init__.py
│ │ │ ├── test_scheduler_consistency_model.py
│ │ │ ├── test_scheduler_ddim.py
│ │ │ ├── test_scheduler_ddim_inverse.py
│ │ │ ├── test_scheduler_ddim_parallel.py
│ │ │ ├── test_scheduler_ddpm.py
│ │ │ ├── test_scheduler_ddpm_parallel.py
│ │ │ ├── test_scheduler_deis.py
│ │ │ ├── test_scheduler_dpm_multi.py
│ │ │ ├── test_scheduler_dpm_multi_inverse.py
│ │ │ ├── test_scheduler_dpm_sde.py
│ │ │ ├── test_scheduler_dpm_single.py
│ │ │ ├── test_scheduler_euler.py
│ │ │ ├── test_scheduler_euler_ancestral.py
│ │ │ ├── test_scheduler_flax.py
│ │ │ ├── test_scheduler_heun.py
│ │ │ ├── test_scheduler_ipndm.py
│ │ │ ├── test_scheduler_kdpm2_ancestral.py
│ │ │ ├── test_scheduler_kdpm2_discrete.py
│ │ │ ├── test_scheduler_lms.py
│ │ │ ├── test_scheduler_pndm.py
│ │ │ ├── test_scheduler_score_sde_ve.py
│ │ │ ├── test_scheduler_unclip.py
│ │ │ ├── test_scheduler_unipc.py
│ │ │ ├── test_scheduler_vq_diffusion.py
│ │ │ └── test_schedulers.py
│ ├── train_text_to_image.py
│ ├── train_text_to_image_inpaint.py
│ ├── train_text_to_image_sdxl.py
│ ├── train_text_to_image_sdxl_inpainting.py
│ └── utils
│ │ ├── check_config_docstrings.py
│ │ ├── check_copies.py
│ │ ├── check_doc_toc.py
│ │ ├── check_dummies.py
│ │ ├── check_inits.py
│ │ ├── check_repo.py
│ │ ├── check_table.py
│ │ ├── custom_init_isort.py
│ │ ├── fetch_torch_cuda_pipeline_test_matrix.py
│ │ ├── get_modified_files.py
│ │ ├── overwrite_expected_slice.py
│ │ ├── print_env.py
│ │ ├── release.py
│ │ └── stale.py
├── llava
│ ├── __init__.py
│ ├── constants.py
│ ├── conversation.py
│ ├── datasets
│ │ ├── edit_dataset.py
│ │ ├── inpainting_dataset.py
│ │ ├── sg_dataset.py
│ │ ├── text_dataset.py
│ │ ├── utils.py
│ │ └── vqa_dataset.py
│ ├── eval
│ │ ├── eval_gpt_review.py
│ │ ├── eval_gpt_review_bench.py
│ │ ├── eval_gpt_review_visual.py
│ │ ├── eval_pope.py
│ │ ├── eval_science_qa.py
│ │ ├── eval_science_qa_gpt4.py
│ │ ├── eval_science_qa_gpt4_requery.py
│ │ ├── eval_textvqa.py
│ │ ├── generate_webpage_data_from_table.py
│ │ ├── m4c_evaluator.py
│ │ ├── model_qa.py
│ │ ├── model_vqa.py
│ │ ├── model_vqa_loader.py
│ │ ├── model_vqa_mmbench.py
│ │ ├── model_vqa_science.py
│ │ ├── qa_baseline_gpt35.py
│ │ ├── run_llava.py
│ │ ├── summarize_gpt_review.py
│ │ └── webpage
│ │ │ ├── figures
│ │ │ ├── alpaca.png
│ │ │ ├── bard.jpg
│ │ │ ├── chatgpt.svg
│ │ │ ├── llama.jpg
│ │ │ ├── swords_FILL0_wght300_GRAD0_opsz48.svg
│ │ │ └── vicuna.jpeg
│ │ │ ├── index.html
│ │ │ ├── script.js
│ │ │ └── styles.css
│ ├── masks
│ │ ├── __init__.py
│ │ ├── countless2d.py
│ │ ├── make_mask.py
│ │ └── segmask.py
│ ├── mm_utils.py
│ ├── model
│ │ ├── __init__.py
│ │ ├── apply_delta.py
│ │ ├── builder.py
│ │ ├── consolidate.py
│ │ ├── language_model
│ │ │ ├── llava_gemma.py
│ │ │ ├── llava_llama.py
│ │ │ ├── llava_mistral.py
│ │ │ ├── llava_mpt.py
│ │ │ ├── llava_phi3.py
│ │ │ └── llava_qwen2.py
│ │ ├── llava_arch.py
│ │ ├── make_delta.py
│ │ ├── multimodal_encoder
│ │ │ ├── builder.py
│ │ │ └── clip_encoder.py
│ │ ├── multimodal_projector
│ │ │ └── builder.py
│ │ └── utils.py
│ ├── prompt_temp.py
│ ├── prompt_zh_temp.py
│ ├── train
│ │ ├── llama_flash_attn_monkey_patch.py
│ │ ├── llama_xformers_attn_monkey_patch.py
│ │ ├── llava_trainer.py
│ │ ├── pretrain.py
│ │ ├── pretrain_mem.py
│ │ ├── train.py
│ │ ├── train_mem.py
│ │ └── train_xformers.py
│ └── utils.py
└── serve
│ ├── __init__.py
│ ├── cli-sd15-editing.py
│ ├── cli-sd15-inpainting.py
│ ├── cli-sd15.py
│ ├── cli-sdxl-inpainting.py
│ ├── cli-sdxl.py
│ ├── cli.py
│ ├── cli2-sd15.py
│ ├── cli2-sdxl.py
│ ├── cli2.py
│ ├── controller.py
│ ├── examples
│ ├── aes.png
│ ├── jiateng.png
│ └── snow_scene.jpeg
│ ├── gradio_t2i_server.py
│ ├── gradio_web_server.py
│ ├── model_worker.py
│ ├── register_worker.py
│ └── test_message.py
├── pip.sh
├── pyproject.toml
├── requirements.txt
└── scripts
├── pretrain.sh
├── pretrain_gemma.sh
├── pretrain_llama3.sh
├── pretrain_mistral.sh
├── pretrain_phi3.sh
├── pretrain_qwen2-05b.sh
├── pretrain_qwen2-15b.sh
├── pretrain_qwen2-7b.sh
├── pretrain_vicuna_13b.sh
├── pretrain_vicuna_7b.sh
├── run_gradio_t2i.sh
├── test-2.sh
├── test-llmga-sd15-editing.sh
├── test-llmga-sd15-inpainting.sh
├── test-llmga-sd15-t2i.sh
├── test-llmga-sdxl-inpainting.sh
├── test-llmga-sdxl-t2i.sh
├── test.sh
├── test2-llmga-sd15-t2i.sh
├── test2-llmga-sdxl-t2i.sh
├── train_llmga_s1_05b_qwen2.sh
├── train_llmga_s1_13b_vicuna.sh
├── train_llmga_s1_15b_qwen2.sh
├── train_llmga_s1_2b_gemma.sh
├── train_llmga_s1_3b_phi3.sh
├── train_llmga_s1_7b_mistral.sh
├── train_llmga_s1_7b_qwen2.sh
├── train_llmga_s1_7b_vicuna.sh
├── train_llmga_s1_8b_llama3.sh
├── train_llmga_s2_sd15_inpaint.sh
├── train_llmga_s2_sd15_t2i.sh
├── train_llmga_s2_sdxl_inpaint.sh
├── train_llmga_s2_sdxl_t2i.sh
├── v1_5
├── eval
│ ├── gqa.sh
│ ├── llavabench.sh
│ ├── mmbench.sh
│ ├── mmbench_cn.sh
│ ├── mme.sh
│ ├── mmvet.sh
│ ├── pope.sh
│ ├── qbench.sh
│ ├── qbench_zh.sh
│ ├── seed.sh
│ ├── sqa.sh
│ ├── textvqa.sh
│ ├── vizwiz.sh
│ └── vqav2.sh
├── finetune.sh
├── finetune_lora.sh
├── finetune_task.sh
├── finetune_task_lora.sh
└── pretrain.sh
├── zero2.json
├── zero3.json
└── zero3_offload.json
/.gitignore:
--------------------------------------------------------------------------------
1 | # Python
2 | __pycache__
3 | *.pyc
4 | *.egg-info
5 | dist
6 |
7 | # Log
8 | *.log
9 | *.log.*
10 | *.json
11 | *.jsonl
12 |
13 | # Data
14 | !**/alpaca-data-conversation.json
15 |
16 | # Editor
17 | .idea
18 | *.swp
19 |
20 | # Other
21 | .DS_Store
22 | wandb
23 | output
24 |
25 | checkpoints
26 | ckpts*
27 |
--------------------------------------------------------------------------------
/docs/Data.md:
--------------------------------------------------------------------------------
1 | ## Data
2 |
3 | | Data file name | Size |
4 | | --- | ---: |
5 | | [llava_instruct_150k.json](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K/blob/main/llava_instruct_150k.json) | 229 MB |
6 |
7 |
--------------------------------------------------------------------------------
/imgs/demo1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dvlab-research/LLMGA/7e32eb6858c2fda168e921c12fa9622d9af9ce42/imgs/demo1.png
--------------------------------------------------------------------------------
/imgs/demo2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dvlab-research/LLMGA/7e32eb6858c2fda168e921c12fa9622d9af9ce42/imgs/demo2.png
--------------------------------------------------------------------------------
/imgs/github_poster1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dvlab-research/LLMGA/7e32eb6858c2fda168e921c12fa9622d9af9ce42/imgs/github_poster1.png
--------------------------------------------------------------------------------
/imgs/github_poster2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dvlab-research/LLMGA/7e32eb6858c2fda168e921c12fa9622d9af9ce42/imgs/github_poster2.png
--------------------------------------------------------------------------------
/imgs/logo.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dvlab-research/LLMGA/7e32eb6858c2fda168e921c12fa9622d9af9ce42/imgs/logo.pdf
--------------------------------------------------------------------------------
/imgs/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dvlab-research/LLMGA/7e32eb6858c2fda168e921c12fa9622d9af9ce42/imgs/logo.png
--------------------------------------------------------------------------------
/imgs/method.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dvlab-research/LLMGA/7e32eb6858c2fda168e921c12fa9622d9af9ce42/imgs/method.png
--------------------------------------------------------------------------------
/llmga/__init__.py:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/llmga/diffusers/CITATION.cff:
--------------------------------------------------------------------------------
1 | cff-version: 1.2.0
2 | title: 'Diffusers: State-of-the-art diffusion models'
3 | message: >-
4 | If you use this software, please cite it using the
5 | metadata from this file.
6 | type: software
7 | authors:
8 | - given-names: Patrick
9 | family-names: von Platen
10 | - given-names: Suraj
11 | family-names: Patil
12 | - given-names: Anton
13 | family-names: Lozhkov
14 | - given-names: Pedro
15 | family-names: Cuenca
16 | - given-names: Nathan
17 | family-names: Lambert
18 | - given-names: Kashif
19 | family-names: Rasul
20 | - given-names: Mishig
21 | family-names: Davaadorj
22 | - given-names: Thomas
23 | family-names: Wolf
24 | repository-code: 'https://github.com/huggingface/diffusers'
25 | abstract: >-
26 | Diffusers provides pretrained diffusion models across
27 | multiple modalities, such as vision and audio, and serves
28 | as a modular toolbox for inference and training of
29 | diffusion models.
30 | keywords:
31 | - deep-learning
32 | - pytorch
33 | - image-generation
34 | - diffusion
35 | - text2image
36 | - image2image
37 | - score-based-generative-modeling
38 | - stable-diffusion
39 | license: Apache-2.0
40 | version: 0.12.1
41 |
--------------------------------------------------------------------------------
/llmga/diffusers/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include LICENSE
2 | include src/diffusers/utils/model_card_template.md
3 |
--------------------------------------------------------------------------------
/llmga/diffusers/_typos.toml:
--------------------------------------------------------------------------------
1 | # Files for typos
2 | # Instruction: https://github.com/marketplace/actions/typos-action#getting-started
3 |
4 | [default.extend-identifiers]
5 |
6 | [default.extend-words]
7 | NIN="NIN" # NIN is used in scripts/convert_ncsnpp_original_checkpoint_to_diffusers.py
8 | nd="np" # nd may be np (numpy)
9 | parms="parms" # parms is used in scripts/convert_original_stable_diffusion_to_diffusers.py
10 |
11 |
12 | [files]
13 | extend-exclude = ["_typos.toml"]
14 |
--------------------------------------------------------------------------------
/llmga/diffusers/docs/source/_config.py:
--------------------------------------------------------------------------------
1 | # docstyle-ignore
2 | INSTALL_CONTENT = """
3 | # Diffusers installation
4 | ! pip install diffusers transformers datasets accelerate
5 | # To install from source instead of the last release, comment the command above and uncomment the following one.
6 | # ! pip install git+https://github.com/huggingface/diffusers.git
7 | """
8 |
9 | notebook_first_cells = [{"type": "code", "content": INSTALL_CONTENT}]
10 |
--------------------------------------------------------------------------------
/llmga/diffusers/docs/source/en/api/attnprocessor.md:
--------------------------------------------------------------------------------
1 | # Attention Processor
2 |
3 | An attention processor is a class for applying different types of attention mechanisms.
4 |
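A minimal sketch of swapping processors on a pipeline's UNet, assuming an example Stable Diffusion checkpoint and the PyTorch 2.0 scaled-dot-product processor:

```python
import torch
from diffusers import StableDiffusionPipeline
from diffusers.models.attention_processor import AttnProcessor2_0

# Example checkpoint only; any Stable Diffusion pipeline is handled the same way.
pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

# Replace every attention processor in the UNet with the scaled-dot-product variant.
pipe.unet.set_attn_processor(AttnProcessor2_0())

# Inspect which processor class is attached to each attention module.
print({name: type(proc).__name__ for name, proc in pipe.unet.attn_processors.items()})
```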
5 | ## AttnProcessor
6 | [[autodoc]] models.attention_processor.AttnProcessor
7 |
8 | ## AttnProcessor2_0
9 | [[autodoc]] models.attention_processor.AttnProcessor2_0
10 |
11 | ## LoRAAttnProcessor
12 | [[autodoc]] models.attention_processor.LoRAAttnProcessor
13 |
14 | ## LoRAAttnProcessor2_0
15 | [[autodoc]] models.attention_processor.LoRAAttnProcessor2_0
16 |
17 | ## CustomDiffusionAttnProcessor
18 | [[autodoc]] models.attention_processor.CustomDiffusionAttnProcessor
19 |
20 | ## CustomDiffusionAttnProcessor2_0
21 | [[autodoc]] models.attention_processor.CustomDiffusionAttnProcessor2_0
22 |
23 | ## AttnAddedKVProcessor
24 | [[autodoc]] models.attention_processor.AttnAddedKVProcessor
25 |
26 | ## AttnAddedKVProcessor2_0
27 | [[autodoc]] models.attention_processor.AttnAddedKVProcessor2_0
28 |
29 | ## LoRAAttnAddedKVProcessor
30 | [[autodoc]] models.attention_processor.LoRAAttnAddedKVProcessor
31 |
32 | ## XFormersAttnProcessor
33 | [[autodoc]] models.attention_processor.XFormersAttnProcessor
34 |
35 | ## LoRAXFormersAttnProcessor
36 | [[autodoc]] models.attention_processor.LoRAXFormersAttnProcessor
37 |
38 | ## CustomDiffusionXFormersAttnProcessor
39 | [[autodoc]] models.attention_processor.CustomDiffusionXFormersAttnProcessor
40 |
41 | ## SlicedAttnProcessor
42 | [[autodoc]] models.attention_processor.SlicedAttnProcessor
43 |
44 | ## SlicedAttnAddedKVProcessor
45 | [[autodoc]] models.attention_processor.SlicedAttnAddedKVProcessor
46 |
--------------------------------------------------------------------------------
/llmga/diffusers/docs/source/en/api/configuration.md:
--------------------------------------------------------------------------------
1 |
12 |
13 | # Configuration
14 |
15 | Schedulers from [`~schedulers.scheduling_utils.SchedulerMixin`] and models from [`ModelMixin`] inherit from [`ConfigMixin`], which stores all the parameters passed to their respective `__init__` methods in a JSON configuration file.
16 |
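As a short sketch of that mechanism (assuming an example checkpoint), a scheduler's configuration can be inspected, saved as JSON, and used to rebuild an identical object:

```python
from diffusers import DDIMScheduler

# Load only the scheduler component of a pipeline checkpoint (example id).
scheduler = DDIMScheduler.from_pretrained(
    "runwayml/stable-diffusion-v1-5", subfolder="scheduler"
)

# `config` holds the __init__ arguments that ConfigMixin recorded,
# e.g. num_train_timesteps, beta_schedule, clip_sample, ...
print(scheduler.config)

# Write the JSON configuration to disk and rebuild the scheduler from it.
scheduler.save_config("ddim-config")
reloaded = DDIMScheduler.from_config(DDIMScheduler.load_config("ddim-config"))
```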
17 |
18 |
19 | To use private or [gated](https://huggingface.co/docs/hub/models-gated#gated-models) models, log in with `huggingface-cli login`.
20 |
21 |
22 |
23 | ## ConfigMixin
24 |
25 | [[autodoc]] ConfigMixin
26 | - load_config
27 | - from_config
28 | - save_config
29 | - to_json_file
30 | - to_json_string
31 |
--------------------------------------------------------------------------------
/llmga/diffusers/docs/source/en/api/diffusion_pipeline.md:
--------------------------------------------------------------------------------
1 |
12 |
13 | # Pipelines
14 |
15 | The [`DiffusionPipeline`] is the quickest way to load any pretrained diffusion pipeline from the [Hub](https://huggingface.co/models?library=diffusers) for inference.
16 |
17 |
18 |
19 | You shouldn't use the [`DiffusionPipeline`] class for training or finetuning a diffusion model. Individual
20 | components (for example, [`UNet2DModel`] and [`UNet2DConditionModel`]) of diffusion pipelines are usually trained individually, so we suggest directly working with them instead.
21 |
22 |
23 |
24 | The pipeline type (for example, [`StableDiffusionPipeline`]) of any diffusion pipeline loaded with [`~DiffusionPipeline.from_pretrained`] is automatically
25 | detected, and the pipeline components are loaded and passed to the `__init__` function of the pipeline.
26 |
27 | Any pipeline object can be saved locally with [`~DiffusionPipeline.save_pretrained`].
28 |
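A minimal sketch of this flow, assuming an example Stable Diffusion checkpoint:

```python
from diffusers import DiffusionPipeline

# The concrete pipeline class is inferred from the checkpoint's model_index.json.
pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
print(type(pipe).__name__)     # StableDiffusionPipeline
print(list(pipe.components))   # vae, text_encoder, tokenizer, unet, scheduler, ...

# Save every component locally; the folder can later be reloaded with from_pretrained.
pipe.save_pretrained("./stable-diffusion-v1-5-local")
```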
29 | ## DiffusionPipeline
30 |
31 | [[autodoc]] DiffusionPipeline
32 | - all
33 | - __call__
34 | - device
35 | - to
36 | - components
37 |
--------------------------------------------------------------------------------
/llmga/diffusers/docs/source/en/api/models/autoencoder_tiny.md:
--------------------------------------------------------------------------------
1 | # Tiny AutoEncoder
2 |
3 | Tiny AutoEncoder for Stable Diffusion (TAESD) was introduced in [madebyollin/taesd](https://github.com/madebyollin/taesd) by Ollin Boer Bohan. It is a tiny distilled version of Stable Diffusion's VAE that can decode the latents of a [`StableDiffusionPipeline`] or [`StableDiffusionXLPipeline`] almost instantly.
4 |
5 | To use with Stable Diffusion v-2.1:
6 |
7 | ```python
8 | import torch
9 | from diffusers import DiffusionPipeline, AutoencoderTiny
10 |
11 | pipe = DiffusionPipeline.from_pretrained(
12 | "stabilityai/stable-diffusion-2-1-base", torch_dtype=torch.float16
13 | )
14 | pipe.vae = AutoencoderTiny.from_pretrained("madebyollin/taesd", torch_dtype=torch.float16)
15 | pipe = pipe.to("cuda")
16 |
17 | prompt = "slice of delicious New York-style berry cheesecake"
18 | image = pipe(prompt, num_inference_steps=25).images[0]
19 | image.save("cheesecake.png")
20 | ```
21 |
22 | To use with Stable Diffusion XL 1.0:
23 |
24 | ```python
25 | import torch
26 | from diffusers import DiffusionPipeline, AutoencoderTiny
27 |
28 | pipe = DiffusionPipeline.from_pretrained(
29 | "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
30 | )
31 | pipe.vae = AutoencoderTiny.from_pretrained("madebyollin/taesdxl", torch_dtype=torch.float16)
32 | pipe = pipe.to("cuda")
33 |
34 | prompt = "slice of delicious New York-style berry cheesecake"
35 | image = pipe(prompt, num_inference_steps=25).images[0]
36 | image.save("cheesecake_sdxl.png")
37 | ```
38 |
39 | ## AutoencoderTiny
40 |
41 | [[autodoc]] AutoencoderTiny
42 |
43 | ## AutoencoderTinyOutput
44 |
45 | [[autodoc]] models.autoencoder_tiny.AutoencoderTinyOutput
--------------------------------------------------------------------------------
/llmga/diffusers/docs/source/en/api/models/overview.md:
--------------------------------------------------------------------------------
1 | # Models
2 |
3 | 🤗 Diffusers provides pretrained models for popular algorithms and modules to create custom diffusion systems. The primary function of models is to denoise an input sample as modeled by the distribution \\(p_{\theta}(x_{t-1}|x_{t})\\).
4 |
5 | All models are built from the base [`ModelMixin`] class, which is a [`torch.nn.Module`](https://pytorch.org/docs/stable/generated/torch.nn.Module.html) providing basic functionality for saving and loading models, locally and from the Hugging Face Hub.
6 |
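A short sketch of that save/load functionality, assuming an example checkpoint that stores its UNet under the `unet` subfolder:

```python
from diffusers import UNet2DConditionModel

# from_pretrained and save_pretrained are provided by ModelMixin.
unet = UNet2DConditionModel.from_pretrained(
    "runwayml/stable-diffusion-v1-5", subfolder="unet"
)
print(unet.config.in_channels, unet.config.sample_size)
unet.save_pretrained("./unet-local")
```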
7 | ## ModelMixin
8 | [[autodoc]] ModelMixin
9 |
10 | ## FlaxModelMixin
11 |
12 | [[autodoc]] FlaxModelMixin
13 |
14 | ## PushToHubMixin
15 |
16 | [[autodoc]] utils.PushToHubMixin
--------------------------------------------------------------------------------
/llmga/diffusers/docs/source/en/api/models/prior_transformer.md:
--------------------------------------------------------------------------------
1 | # Prior Transformer
2 |
3 | The Prior Transformer was originally introduced in [Hierarchical Text-Conditional Image Generation with CLIP Latents
4 | ](https://huggingface.co/papers/2204.06125) by Ramesh et al. It is used to predict CLIP image embeddings from CLIP text embeddings; image embeddings are predicted through a denoising diffusion process.
5 |
6 | The abstract from the paper is:
7 |
8 | *Contrastive models like CLIP have been shown to learn robust representations of images that capture both semantics and style. To leverage these representations for image generation, we propose a two-stage model: a prior that generates a CLIP image embedding given a text caption, and a decoder that generates an image conditioned on the image embedding. We show that explicitly generating image representations improves image diversity with minimal loss in photorealism and caption similarity. Our decoders conditioned on image representations can also produce variations of an image that preserve both its semantics and style, while varying the non-essential details absent from the image representation. Moreover, the joint embedding space of CLIP enables language-guided image manipulations in a zero-shot fashion. We use diffusion models for the decoder and experiment with both autoregressive and diffusion models for the prior, finding that the latter are computationally more efficient and produce higher-quality samples.*
9 |
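As a hedged sketch of where this model appears in practice, the Kandinsky prior pipeline wraps a [`PriorTransformer`] to turn a text prompt into CLIP image embeddings (the checkpoint id and output field names below are those used by that pipeline):

```python
import torch
from diffusers import KandinskyPriorPipeline

pipe_prior = KandinskyPriorPipeline.from_pretrained(
    "kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16
).to("cuda")
print(type(pipe_prior.prior).__name__)   # PriorTransformer

# Predict CLIP image embeddings from text embeddings via the denoising prior.
out = pipe_prior("a photograph of a red panda", num_inference_steps=25)
image_embeds = out.image_embeds
negative_image_embeds = out.negative_image_embeds
print(image_embeds.shape)
```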
10 | ## PriorTransformer
11 |
12 | [[autodoc]] PriorTransformer
13 |
14 | ## PriorTransformerOutput
15 |
16 | [[autodoc]] models.prior_transformer.PriorTransformerOutput
--------------------------------------------------------------------------------
/llmga/diffusers/docs/source/en/api/models/transformer2d.md:
--------------------------------------------------------------------------------
1 | # Transformer2D
2 |
3 | A Transformer model for image-like data from [CompVis](https://huggingface.co/CompVis) that is based on the [Vision Transformer](https://huggingface.co/papers/2010.11929) introduced by Dosovitskiy et al. The [`Transformer2DModel`] accepts discrete (classes of vector embeddings) or continuous (actual embeddings) inputs.
4 |
5 | When the input is **continuous**:
6 |
7 | 1. Project the input and reshape it to `(batch_size, sequence_length, feature_dimension)`.
8 | 2. Apply the Transformer blocks in the standard way.
9 | 3. Reshape to image.
10 |
11 | When the input is **discrete**:
12 |
13 |
14 |
15 | It is assumed one of the input classes is the masked latent pixel. The predicted classes of the unnoised image don't contain a prediction for the masked pixel because the unnoised image cannot be masked.
16 |
17 |
18 |
19 | 1. Convert input (classes of latent pixels) to embeddings and apply positional embeddings.
20 | 2. Apply the Transformer blocks in the standard way.
21 | 3. Predict classes of unnoised image.
22 |
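A hedged sketch of the continuous path, assuming the class-conditional DiT checkpoint below (the id, latent size, and class label are only illustrative):

```python
import torch
from diffusers import Transformer2DModel

# DiT stores a Transformer2DModel under its "transformer" subfolder.
model = Transformer2DModel.from_pretrained("facebook/DiT-XL-2-256", subfolder="transformer")

latents = torch.randn(1, model.config.in_channels, 32, 32)   # continuous latent input
timestep = torch.tensor([10])
class_labels = torch.tensor([207])                           # ImageNet class id, required by this checkpoint

with torch.no_grad():
    out = model(latents, timestep=timestep, class_labels=class_labels).sample
print(out.shape)   # same spatial size as the input, with the model's output channels
```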
23 | ## Transformer2DModel
24 |
25 | [[autodoc]] Transformer2DModel
26 |
27 | ## Transformer2DModelOutput
28 |
29 | [[autodoc]] models.transformer_2d.Transformer2DModelOutput
30 |
--------------------------------------------------------------------------------
/llmga/diffusers/docs/source/en/api/models/transformer_temporal.md:
--------------------------------------------------------------------------------
1 | # Transformer Temporal
2 |
3 | A Transformer model for video-like data.
4 |
5 | ## TransformerTemporalModel
6 |
7 | [[autodoc]] models.transformer_temporal.TransformerTemporalModel
8 |
9 | ## TransformerTemporalModelOutput
10 |
11 | [[autodoc]] models.transformer_temporal.TransformerTemporalModelOutput
--------------------------------------------------------------------------------
/llmga/diffusers/docs/source/en/api/models/unet.md:
--------------------------------------------------------------------------------
1 | # UNet1DModel
2 |
3 | The [UNet](https://huggingface.co/papers/1505.04597) model was originally introduced by Ronneberger et al. for biomedical image segmentation, but it is also commonly used in 🤗 Diffusers because it outputs images that are the same size as the input. It is one of the most important components of a diffusion system because it facilitates the actual diffusion process. There are several variants of the UNet model in 🤗 Diffusers, depending on its number of dimensions and whether it is a conditional model or not. This is a 1D UNet model.
4 |
5 | The abstract from the paper is:
6 |
7 | *There is large consent that successful training of deep networks requires many thousand annotated training samples. In this paper, we present a network and training strategy that relies on the strong use of data augmentation to use the available annotated samples more efficiently. The architecture consists of a contracting path to capture context and a symmetric expanding path that enables precise localization. We show that such a network can be trained end-to-end from very few images and outperforms the prior best method (a sliding-window convolutional network) on the ISBI challenge for segmentation of neuronal structures in electron microscopic stacks. Using the same network trained on transmitted light microscopy images (phase contrast and DIC) we won the ISBI cell tracking challenge 2015 in these categories by a large margin. Moreover, the network is fast. Segmentation of a 512x512 image takes less than a second on a recent GPU. The full implementation (based on Caffe) and the trained networks are available at http://lmb.informatik.uni-freiburg.de/people/ronneber/u-net.*
8 |
9 | ## UNet1DModel
10 | [[autodoc]] UNet1DModel
11 |
12 | ## UNet1DOutput
13 | [[autodoc]] models.unet_1d.UNet1DOutput
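14 |
15 | As a small sketch, a randomly initialized 1D UNet can be created from the class defaults and inspected (this is an untrained model, not a released checkpoint):
16 |
17 | ```python
18 | from diffusers import UNet1DModel
19 |
20 | # Randomly initialized model using the class defaults (not a trained checkpoint).
21 | model = UNet1DModel()
22 | print(model.config.in_channels, model.config.out_channels)  # channel layout of the 1D samples
23 | print(f"{sum(p.numel() for p in model.parameters()) / 1e6:.1f}M parameters")
24 | ```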
--------------------------------------------------------------------------------
/llmga/diffusers/docs/source/en/api/models/vq.md:
--------------------------------------------------------------------------------
1 | # VQModel
2 |
3 | The VQ-VAE model was introduced in [Neural Discrete Representation Learning](https://huggingface.co/papers/1711.00937) by Aaron van den Oord, Oriol Vinyals and Koray Kavukcuoglu. The model is used in 🤗 Diffusers to decode latent representations into images. Unlike [`AutoencoderKL`], the [`VQModel`] works in a quantized latent space.
4 |
5 | The abstract from the paper is:
6 |
7 | *Learning useful representations without supervision remains a key challenge in machine learning. In this paper, we propose a simple yet powerful generative model that learns such discrete representations. Our model, the Vector Quantised-Variational AutoEncoder (VQ-VAE), differs from VAEs in two key ways: the encoder network outputs discrete, rather than continuous, codes; and the prior is learnt rather than static. In order to learn a discrete latent representation, we incorporate ideas from vector quantisation (VQ). Using the VQ method allows the model to circumvent issues of "posterior collapse" -- where the latents are ignored when they are paired with a powerful autoregressive decoder -- typically observed in the VAE framework. Pairing these representations with an autoregressive prior, the model can generate high quality images, videos, and speech as well as doing high quality speaker conversion and unsupervised learning of phonemes, providing further evidence of the utility of the learnt representations.*
8 |
9 | ## VQModel
10 |
11 | [[autodoc]] VQModel
12 |
13 | ## VQEncoderOutput
14 |
15 | [[autodoc]] models.vq_model.VQEncoderOutput
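16 |
17 | A minimal round-trip sketch with a tiny, randomly initialized model (the configuration below is a toy example, not a trained checkpoint):
18 |
19 | ```python
20 | import torch
21 | from diffusers import VQModel
22 |
23 | # Tiny toy configuration (illustrative only).
24 | model = VQModel(
25 |     in_channels=3,
26 |     out_channels=3,
27 |     block_out_channels=(32,),
28 |     norm_num_groups=16,
29 |     latent_channels=3,
30 |     num_vq_embeddings=128,
31 |     sample_size=32,
32 | )
33 |
34 | image = torch.randn(1, 3, 32, 32)
35 | # forward() encodes, quantizes the latents against the codebook, and decodes back to pixel space.
36 | reconstruction = model(image).sample
37 | print(reconstruction.shape)  # torch.Size([1, 3, 32, 32])
38 | ```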
--------------------------------------------------------------------------------
/llmga/diffusers/docs/source/en/api/pipelines/audio_diffusion.md:
--------------------------------------------------------------------------------
1 |
12 |
13 | # Audio Diffusion
14 |
15 | [Audio Diffusion](https://github.com/teticio/audio-diffusion) is by Robert Dargavel Smith, and it leverages the recent advances in image generation from diffusion models by converting audio samples to and from Mel spectrogram images.
16 |
17 | The original codebase, training scripts and example notebooks can be found at [teticio/audio-diffusion](https://github.com/teticio/audio-diffusion).
18 |
19 |
20 |
21 | Make sure to check out the Schedulers [guide](/using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](/using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines.
22 |
23 |
24 |
25 | ## AudioDiffusionPipeline
26 | [[autodoc]] AudioDiffusionPipeline
27 | - all
28 | - __call__
29 |
30 | ## AudioPipelineOutput
31 | [[autodoc]] pipelines.AudioPipelineOutput
32 |
33 | ## ImagePipelineOutput
34 | [[autodoc]] pipelines.ImagePipelineOutput
35 |
36 | ## Mel
37 | [[autodoc]] Mel
38 |
--------------------------------------------------------------------------------
/llmga/diffusers/docs/source/en/api/pipelines/dance_diffusion.md:
--------------------------------------------------------------------------------
1 |
12 |
13 | # Dance Diffusion
14 |
15 | [Dance Diffusion](https://github.com/Harmonai-org/sample-generator) is by Zach Evans.
16 |
17 | Dance Diffusion is the first in a suite of generative audio tools for producers and musicians released by [Harmonai](https://github.com/Harmonai-org).
18 |
19 | The original codebase of this implementation can be found at [Harmonai-org](https://github.com/Harmonai-org/sample-generator).
20 |
21 |
22 |
23 | Make sure to check out the Schedulers [guide](/using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](/using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines.
24 |
25 |
26 |
27 | ## DanceDiffusionPipeline
28 | [[autodoc]] DanceDiffusionPipeline
29 | - all
30 | - __call__
31 |
32 | ## AudioPipelineOutput
33 | [[autodoc]] pipelines.AudioPipelineOutput
--------------------------------------------------------------------------------
/llmga/diffusers/docs/source/en/api/schedulers/ddim_inverse.md:
--------------------------------------------------------------------------------
1 |
12 |
13 | # DDIMInverseScheduler
14 |
15 | `DDIMInverseScheduler` is the inverted scheduler from [Denoising Diffusion Implicit Models](https://huggingface.co/papers/2010.02502) (DDIM) by Jiaming Song, Chenlin Meng and Stefano Ermon.
16 | The implementation is mostly based on the DDIM inversion definition from [Null-text Inversion for Editing Real Images using Guided Diffusion Models](https://huggingface.co/papers/2211.09794.pdf).
17 |
18 | ## DDIMInverseScheduler
19 | [[autodoc]] DDIMInverseScheduler
20 |
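21 | As a small sketch (the Stable Diffusion checkpoint is only an example), the inverse scheduler is usually built from the config of an existing pipeline's scheduler so that inversion and sampling share the same noise schedule:
22 |
23 | ```python
24 | from diffusers import DDIMInverseScheduler, DDIMScheduler, StableDiffusionPipeline
25 |
26 | pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
27 |
28 | # Use a matching forward/inverse scheduler pair built from the same config.
29 | pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
30 | inverse_scheduler = DDIMInverseScheduler.from_config(pipe.scheduler.config)
31 |
32 | inverse_scheduler.set_timesteps(50)
33 | print(inverse_scheduler.timesteps[:5])  # ascending timesteps: inversion runs from the clean image towards noise
34 | ```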
--------------------------------------------------------------------------------
/llmga/diffusers/docs/source/en/api/schedulers/dpm_discrete.md:
--------------------------------------------------------------------------------
1 |
12 |
13 | # KDPM2DiscreteScheduler
14 |
15 | The `KDPM2DiscreteScheduler` is inspired by the [Elucidating the Design Space of Diffusion-Based Generative Models](https://huggingface.co/papers/2206.00364) paper, and the scheduler is ported from and created by [Katherine Crowson](https://github.com/crowsonkb/).
16 |
17 | The original codebase can be found at [crowsonkb/k-diffusion](https://github.com/crowsonkb/k-diffusion).
18 |
19 | ## KDPM2DiscreteScheduler
20 | [[autodoc]] KDPM2DiscreteScheduler
21 |
22 | ## SchedulerOutput
23 | [[autodoc]] schedulers.scheduling_utils.SchedulerOutput
--------------------------------------------------------------------------------
/llmga/diffusers/docs/source/en/api/schedulers/dpm_discrete_ancestral.md:
--------------------------------------------------------------------------------
1 |
12 |
13 | # KDPM2AncestralDiscreteScheduler
14 |
15 | The `KDPM2DiscreteScheduler` with ancestral sampling is inspired by the [Elucidating the Design Space of Diffusion-Based Generative Models](https://huggingface.co/papers/2206.00364) paper, and the scheduler is ported from and created by [Katherine Crowson](https://github.com/crowsonkb/).
16 |
17 | The original codebase can be found at [crowsonkb/k-diffusion](https://github.com/crowsonkb/k-diffusion).
18 |
19 | ## KDPM2AncestralDiscreteScheduler
20 | [[autodoc]] KDPM2AncestralDiscreteScheduler
21 |
22 | ## SchedulerOutput
23 | [[autodoc]] schedulers.scheduling_utils.SchedulerOutput
--------------------------------------------------------------------------------
/llmga/diffusers/docs/source/en/api/schedulers/dpm_sde.md:
--------------------------------------------------------------------------------
1 |
12 |
13 | # DPMSolverSDEScheduler
14 |
15 | The `DPMSolverSDEScheduler` is inspired by the stochastic sampler from the [Elucidating the Design Space of Diffusion-Based Generative Models](https://huggingface.co/papers/2206.00364) paper, and the scheduler is ported from and created by [Katherine Crowson](https://github.com/crowsonkb/).
16 |
17 | ## DPMSolverSDEScheduler
18 | [[autodoc]] DPMSolverSDEScheduler
19 |
20 | ## SchedulerOutput
21 | [[autodoc]] schedulers.scheduling_utils.SchedulerOutput
--------------------------------------------------------------------------------
/llmga/diffusers/docs/source/en/api/schedulers/euler.md:
--------------------------------------------------------------------------------
1 |
12 |
13 | # EulerDiscreteScheduler
14 |
15 | The Euler scheduler (Algorithm 2) is from the [Elucidating the Design Space of Diffusion-Based Generative Models](https://huggingface.co/papers/2206.00364) paper by Karras et al. This is a fast scheduler which can often generate good outputs in 20-30 steps. The scheduler is based on the original [k-diffusion](https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L51) implementation by [Katherine Crowson](https://github.com/crowsonkb/).
16 |
17 |
18 | ## EulerDiscreteScheduler
19 | [[autodoc]] EulerDiscreteScheduler
20 |
21 | ## EulerDiscreteSchedulerOutput
22 | [[autodoc]] schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput
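23 |
24 | As a short sketch (the Stable Diffusion checkpoint is used purely for illustration), the scheduler is typically swapped into an existing pipeline via `from_config`, and a low step count reflects the speed noted above:
25 |
26 | ```python
27 | from diffusers import EulerDiscreteScheduler, StableDiffusionPipeline
28 |
29 | pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
30 |
31 | # Reuse the pipeline's noise-schedule config so only the sampling algorithm changes.
32 | pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config)
33 |
34 | image = pipe("an astronaut riding a horse on mars", num_inference_steps=25).images[0]
35 | ```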
--------------------------------------------------------------------------------
/llmga/diffusers/docs/source/en/api/schedulers/euler_ancestral.md:
--------------------------------------------------------------------------------
1 |
12 |
13 | # EulerAncestralDiscreteScheduler
14 |
15 | A scheduler that uses ancestral sampling with Euler method steps. This is a fast scheduler which can often generate good outputs in 20-30 steps. The scheduler is based on the original [k-diffusion](https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L72) implementation by [Katherine Crowson](https://github.com/crowsonkb/).
16 |
17 | ## EulerAncestralDiscreteScheduler
18 | [[autodoc]] EulerAncestralDiscreteScheduler
19 |
20 | ## EulerAncestralDiscreteSchedulerOutput
21 | [[autodoc]] schedulers.scheduling_euler_ancestral_discrete.EulerAncestralDiscreteSchedulerOutput
--------------------------------------------------------------------------------
/llmga/diffusers/docs/source/en/api/schedulers/heun.md:
--------------------------------------------------------------------------------
1 |
12 |
13 | # HeunDiscreteScheduler
14 |
15 | The Heun scheduler (Algorithm 1) is from the [Elucidating the Design Space of Diffusion-Based Generative Models](https://huggingface.co/papers/2206.00364) paper by Karras et al. The scheduler is ported from the [k-diffusion](https://github.com/crowsonkb/k-diffusion) library and created by [Katherine Crowson](https://github.com/crowsonkb/).
16 |
17 | ## HeunDiscreteScheduler
18 | [[autodoc]] HeunDiscreteScheduler
19 |
20 | ## SchedulerOutput
21 | [[autodoc]] schedulers.scheduling_utils.SchedulerOutput
--------------------------------------------------------------------------------
/llmga/diffusers/docs/source/en/api/schedulers/ipndm.md:
--------------------------------------------------------------------------------
1 |
12 |
13 | # IPNDMScheduler
14 |
15 | `IPNDMScheduler` is a fourth-order Improved Pseudo Linear Multistep scheduler. The original implementation can be found at [crowsonkb/v-diffusion-pytorch](https://github.com/crowsonkb/v-diffusion-pytorch/blob/987f8985e38208345c1959b0ea767a625831cc9b/diffusion/sampling.py#L296).
16 |
17 | ## IPNDMScheduler
18 | [[autodoc]] IPNDMScheduler
19 |
20 | ## SchedulerOutput
21 | [[autodoc]] schedulers.scheduling_utils.SchedulerOutput
--------------------------------------------------------------------------------
/llmga/diffusers/docs/source/en/api/schedulers/lms_discrete.md:
--------------------------------------------------------------------------------
1 |
12 |
13 | # LMSDiscreteScheduler
14 |
15 | `LMSDiscreteScheduler` is a linear multistep scheduler for discrete beta schedules. The scheduler is ported from and created by [Katherine Crowson](https://github.com/crowsonkb/), and the original implementation can be found at [crowsonkb/k-diffusion](https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L181).
16 |
17 | ## LMSDiscreteScheduler
18 | [[autodoc]] LMSDiscreteScheduler
19 |
20 | ## LMSDiscreteSchedulerOutput
21 | [[autodoc]] schedulers.scheduling_lms_discrete.LMSDiscreteSchedulerOutput
--------------------------------------------------------------------------------
/llmga/diffusers/docs/source/en/api/schedulers/pndm.md:
--------------------------------------------------------------------------------
1 |
12 |
13 | # PNDMScheduler
14 |
15 | `PNDMScheduler`, or pseudo numerical methods for diffusion models, uses more advanced ODE integration techniques such as the Runge-Kutta and linear multistep methods. The original implementation can be found at [crowsonkb/k-diffusion](https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L181).
16 |
17 | ## PNDMScheduler
18 | [[autodoc]] PNDMScheduler
19 |
20 | ## SchedulerOutput
21 | [[autodoc]] schedulers.scheduling_utils.SchedulerOutput
--------------------------------------------------------------------------------
/llmga/diffusers/docs/source/en/api/schedulers/stochastic_karras_ve.md:
--------------------------------------------------------------------------------
1 |
12 |
13 | # KarrasVeScheduler
14 |
15 | `KarrasVeScheduler` is a stochastic sampler tailored to variance-expanding (VE) models. It is based on the [Elucidating the Design Space of Diffusion-Based Generative Models](https://huggingface.co/papers/2206.00364) and [Score-based generative modeling through stochastic differential equations](https://huggingface.co/papers/2011.13456) papers.
16 |
17 | ## KarrasVeScheduler
18 | [[autodoc]] KarrasVeScheduler
19 |
20 | ## KarrasVeOutput
21 | [[autodoc]] schedulers.scheduling_karras_ve.KarrasVeOutput
--------------------------------------------------------------------------------
/llmga/diffusers/docs/source/en/api/utilities.md:
--------------------------------------------------------------------------------
1 | # Utilities
2 |
3 | Utility and helper functions for working with 🤗 Diffusers.
4 |
5 | ## numpy_to_pil
6 |
7 | [[autodoc]] utils.numpy_to_pil
8 |
9 | ## pt_to_pil
10 |
11 | [[autodoc]] utils.pt_to_pil
12 |
13 | ## load_image
14 |
15 | [[autodoc]] utils.load_image
16 |
17 | ## export_to_gif
18 |
19 | [[autodoc]] utils.export_to_gif
20 |
21 | ## export_to_video
22 |
23 | [[autodoc]] utils.export_to_video
24 |
25 | ## make_image_grid
26 |
27 | [[autodoc]] utils.pil_utils.make_image_grid
28 |
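29 | A small sketch combining two of these helpers (the input path `input.png` is a placeholder for any local image file or URL):
30 |
31 | ```python
32 | from diffusers.utils import load_image
33 | from diffusers.utils.pil_utils import make_image_grid
34 |
35 | # load_image accepts a local path, a URL, or an existing PIL image.
36 | image = load_image("input.png")
37 |
38 | # Tile the same image into a 1x2 grid for a quick side-by-side view.
39 | grid = make_image_grid([image, image], rows=1, cols=2)
40 | grid.save("grid.png")
41 | ```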
--------------------------------------------------------------------------------
/llmga/diffusers/docs/source/en/imgs/access_request.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dvlab-research/LLMGA/7e32eb6858c2fda168e921c12fa9622d9af9ce42/llmga/diffusers/docs/source/en/imgs/access_request.png
--------------------------------------------------------------------------------
/llmga/diffusers/docs/source/en/imgs/diffusers_library.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dvlab-research/LLMGA/7e32eb6858c2fda168e921c12fa9622d9af9ce42/llmga/diffusers/docs/source/en/imgs/diffusers_library.jpg
--------------------------------------------------------------------------------
/llmga/diffusers/docs/source/en/optimization/opt_overview.md:
--------------------------------------------------------------------------------
1 |
12 |
13 | # Overview
14 |
15 | Generating high-quality outputs is computationally intensive, especially during each iterative step where you go from a noisy output to a less noisy output. One of 🤗 Diffusers' goals is to make this technology widely accessible to everyone, which includes enabling fast inference on consumer and specialized hardware.
16 |
17 | This section will cover tips and tricks - like half-precision weights and sliced attention - for optimizing inference speed and reducing memory consumption. You'll also learn how to speed up your PyTorch code with [`torch.compile`](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) or [ONNX Runtime](https://onnxruntime.ai/docs/), and enable memory-efficient attention with [xFormers](https://facebookresearch.github.io/xformers/). There are also guides for running inference on specific hardware like Apple Silicon, and Intel or Habana processors.
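18 |
19 | For instance, two of the most common optimizations - half-precision weights and `torch.compile` - can be combined in a few lines (a minimal sketch; the checkpoint name is just an example):
20 |
21 | ```python
22 | import torch
23 | from diffusers import DiffusionPipeline
24 |
25 | # Load the weights in half precision and move the pipeline to the GPU (example checkpoint).
26 | pipe = DiffusionPipeline.from_pretrained(
27 |     "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
28 | ).to("cuda")
29 |
30 | # Compile the UNet, the most compute-heavy component, to speed up repeated inference.
31 | pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
32 |
33 | image = pipe("a photo of an astronaut riding a horse on mars").images[0]
34 | ```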
--------------------------------------------------------------------------------
/llmga/diffusers/docs/source/en/training/ddpo.md:
--------------------------------------------------------------------------------
1 |
12 |
13 | # Reinforcement learning training with DDPO
14 |
15 | You can fine-tune Stable Diffusion on a reward function via reinforcement learning with the 🤗 TRL library and 🤗 Diffusers. This is done with the Denoising Diffusion Policy Optimization (DDPO) algorithm introduced by Black et al. in [Training Diffusion Models with Reinforcement Learning](https://arxiv.org/abs/2305.13301), which is implemented in 🤗 TRL with the [`~trl.DDPOTrainer`].
16 |
17 | For more information, check out the [`~trl.DDPOTrainer`] API reference and the [Finetune Stable Diffusion Models with DDPO via TRL](https://huggingface.co/blog/trl-ddpo) blog post.
--------------------------------------------------------------------------------
/llmga/diffusers/docs/source/en/using-diffusers/loading_overview.md:
--------------------------------------------------------------------------------
1 |
12 |
13 | # Overview
14 |
15 | 🧨 Diffusers offers many pipelines, models, and schedulers for generative tasks. To make loading these components as simple as possible, we provide a single and unified method - `from_pretrained()` - that loads any of these components from either the Hugging Face [Hub](https://huggingface.co/models?library=diffusers&sort=downloads) or your local machine. Whenever you load a pipeline or model, the latest files are automatically downloaded and cached so you can quickly reuse them next time without redownloading the files.
16 |
17 | This section will show you everything you need to know about loading pipelines, how to load different components in a pipeline, how to load checkpoint variants, and how to load community pipelines. You'll also learn how to load schedulers and compare the speed and quality trade-offs of using different schedulers. Finally, you'll see how to convert and load KerasCV checkpoints so you can use them in PyTorch with 🧨 Diffusers.
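18 |
19 | As a quick sketch (the checkpoint name below is only an example), the same `from_pretrained()` method covers pipelines, individual models, and schedulers alike:
20 |
21 | ```python
22 | from diffusers import DDIMScheduler, DiffusionPipeline, UNet2DConditionModel
23 |
24 | repo_id = "runwayml/stable-diffusion-v1-5"  # example checkpoint
25 |
26 | pipeline = DiffusionPipeline.from_pretrained(repo_id)                      # a full pipeline
27 | unet = UNet2DConditionModel.from_pretrained(repo_id, subfolder="unet")     # a single model
28 | scheduler = DDIMScheduler.from_pretrained(repo_id, subfolder="scheduler")  # a scheduler
29 | ```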
--------------------------------------------------------------------------------
/llmga/diffusers/docs/source/en/using-diffusers/other-modalities.md:
--------------------------------------------------------------------------------
1 |
12 |
13 | # Using Diffusers with other modalities
14 |
15 | Diffusers is in the process of expanding to modalities other than images.
16 |
17 | Example type | Colab | Pipeline |
18 | :-------------------------:|:-------------------------:|:-------------------------:|
19 | [Molecule conformation](https://www.nature.com/subjects/molecular-conformation#:~:text=Definition,to%20changes%20in%20their%20environment.) generation | [Open In Colab](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/geodiff_molecule_conformation.ipynb) | ❌
20 |
21 | More coming soon!
--------------------------------------------------------------------------------
/llmga/diffusers/docs/source/en/using-diffusers/pipeline_overview.md:
--------------------------------------------------------------------------------
1 |
12 |
13 | # Overview
14 |
15 | A pipeline is an end-to-end class that provides a quick and easy way to use a diffusion system for inference by bundling independently trained models and schedulers together. Certain combinations of models and schedulers define specific pipeline types, like [`StableDiffusionXLPipeline`] or [`StableDiffusionControlNetPipeline`], with specific capabilities. All pipeline types inherit from the base [`DiffusionPipeline`] class; pass it any checkpoint, and it'll automatically detect the pipeline type and load the necessary components.
16 |
17 | This section introduces you to some of the more complex pipelines like Stable Diffusion XL, ControlNet, and DiffEdit, which require additional inputs. You'll also learn how to use a distilled version of the Stable Diffusion model to speed up inference, how to control randomness on your hardware when generating images, and how to create a community pipeline for a custom task like generating images from speech.
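18 |
19 | As a small illustration (the SDXL checkpoint name is only an example), the base class picks the concrete pipeline type from the checkpoint's configuration:
20 |
21 | ```python
22 | from diffusers import DiffusionPipeline
23 |
24 | # The base class inspects the checkpoint and returns the matching pipeline type.
25 | pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0")
26 | print(type(pipe).__name__)  # StableDiffusionXLPipeline
27 | ```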
--------------------------------------------------------------------------------
/llmga/diffusers/docs/source/ko/in_translation.md:
--------------------------------------------------------------------------------
1 |
12 |
13 | # Translation in progress
14 |
15 | We are working hard on the translation. Please hold on just a little longer.
16 | Thank you!
--------------------------------------------------------------------------------
/llmga/diffusers/docs/source/ko/optimization/open_vino.md:
--------------------------------------------------------------------------------
1 |
12 |
13 | # How to use OpenVINO for inference
14 |
15 | 🤗 [Optimum](https://github.com/huggingface/optimum-intel) provides Stable Diffusion pipelines compatible with OpenVINO.
16 | You can now easily run inference with OpenVINO Runtime on a variety of Intel processors (see the full list of supported devices [here](https://docs.openvino.ai/latest/openvino_docs_OV_UG_supported_plugins_Supported_Devices.html)).
17 |
18 | ## Installation
19 |
20 | Install 🤗 Optimum with the following command:
21 |
22 | ```
23 | pip install optimum["openvino"]
24 | ```
25 |
26 | ## Stable Diffusion inference
27 |
28 | To load an OpenVINO model and run inference with the OpenVINO Runtime, replace `StableDiffusionPipeline` with `OVStableDiffusionPipeline`. If you want to load a PyTorch model and convert it to the OpenVINO format on the fly, set `export=True`.
29 |
30 | ```python
31 | from optimum.intel.openvino import OVStableDiffusionPipeline
32 |
33 | model_id = "runwayml/stable-diffusion-v1-5"
34 | pipe = OVStableDiffusionPipeline.from_pretrained(model_id, export=True)
35 | prompt = "a photo of an astronaut riding a horse on mars"
36 | images = pipe(prompt).images[0]
37 | ```
38 |
39 | You can find more examples (such as static reshaping and model compilation) in the [Optimum documentation](https://huggingface.co/docs/optimum/intel/inference#export-and-inference-of-stable-diffusion-models).
40 |
--------------------------------------------------------------------------------
/llmga/diffusers/docs/source/ko/optimization/opt_overview.md:
--------------------------------------------------------------------------------
1 |
12 |
13 | # Overview
14 |
15 | Each iterative step that turns a noisier output into a less noisy one, producing the output of a high-quality generative model, requires a lot of computation. One of 🧨 Diffusers' goals is to make this technology widely accessible to everyone, which includes enabling fast inference on consumer and specialized hardware.
16 |
17 | This section covers tips and tricks such as half-precision weights and sliced attention for optimizing inference speed and reducing memory consumption. You'll also learn how to speed up your PyTorch code with [`torch.compile`](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) or [ONNX Runtime](https://onnxruntime.ai/docs/), and how to enable memory-efficient attention with [xFormers](https://facebookresearch.github.io/xformers/). There are also guides for running inference on specific hardware such as Apple Silicon, Intel, or Habana processors.
--------------------------------------------------------------------------------
/llmga/diffusers/docs/source/ko/optimization/xformers.md:
--------------------------------------------------------------------------------
1 |
12 |
13 | # Installing xFormers
14 |
15 | We recommend using [xFormers](https://github.com/facebookresearch/xformers) for both inference and training.
16 | In our own tests, the optimizations performed in the attention blocks gave both faster speed and lower memory consumption.
17 |
18 | Starting with xFormers version `0.0.16`, released in January 2023, it can be installed easily with a pre-built pip wheel:
19 |
20 | ```bash
21 | pip install xformers
22 | ```
23 |
24 |
25 |
26 | The xFormers pip package requires the latest version of PyTorch (1.13.1 for xFormers 0.0.16). If you need to use an older version of PyTorch, we recommend installing xFormers from source following the [project instructions](https://github.com/facebookresearch/xformers#installing-xformers).
27 |
28 |
29 |
30 | Once xFormers is installed, you can use `enable_xformers_memory_efficient_attention()` as described [here](fp16#memory-efficient-attention) to speed up inference and reduce memory consumption.
31 |
32 |
33 |
34 | According to [this issue](https://github.com/huggingface/diffusers/issues/2234#issuecomment-1416931212), xFormers `v0.0.16` cannot be used for training (fine-tuning or Dreambooth) on GPU. If you run into this problem, install a development version as described in that comment.
35 |
36 |
37 |
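38 | As a short sketch (the checkpoint name is only an example), enabling it on a loaded pipeline is a single call:
39 |
40 | ```python
41 | import torch
42 | from diffusers import DiffusionPipeline
43 |
44 | pipe = DiffusionPipeline.from_pretrained(
45 |     "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
46 | ).to("cuda")
47 |
48 | # Requires xFormers to be installed; attention now runs with the memory-efficient kernels.
49 | pipe.enable_xformers_memory_efficient_attention()
50 | ```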
--------------------------------------------------------------------------------
/llmga/diffusers/docs/source/ko/tutorials/tutorial_overview.md:
--------------------------------------------------------------------------------
1 |
12 |
13 | # Overview
14 |
15 | Welcome to 🧨 Diffusers! If you're new to diffusion models and generative AI and want to learn more, you've come to the right place. These tutorials give you a gentle introduction to diffusion models and are designed to help you understand the library fundamentals - the core components and how 🧨 Diffusers is meant to be used.
16 |
17 | Through these tutorials you'll learn how to use an inference pipeline to generate things quickly, and how to take that pipeline apart so you can use the library as a modular toolbox to build your own diffusion system. In the next unit, you'll learn how to train your own diffusion model to generate what you want.
18 |
19 | After completing the tutorials, you'll have the skills to explore the library on your own and apply it to your own projects and applications.
20 |
21 | Feel free to join our community on [Discord](https://discord.com/invite/JfAtkvEtRb) or the [forums](https://discuss.huggingface.co/c/discussion-related-to-httpsgithubcomhuggingfacediffusers/63) to connect and collaborate with other users and developers!
22 |
23 | Now let's start diffusing! 🧨
--------------------------------------------------------------------------------
/llmga/diffusers/docs/source/ko/using-diffusers/loading_overview.md:
--------------------------------------------------------------------------------
1 |
12 |
13 | # Overview
14 |
15 | 🧨 Diffusers offers many pipelines, models, and schedulers for generative tasks. To make loading these components as simple as possible, we provide a single unified method - `from_pretrained()` - that loads any of these components from either the Hugging Face [Hub](https://huggingface.co/models?library=diffusers&sort=downloads) or your local machine. Whenever you load a pipeline or model, the latest files are automatically downloaded and cached so you can quickly reuse them next time without downloading the files again.
16 |
17 | This section covers everything you need to know about loading pipelines, how to load different components in a pipeline, how to load checkpoint variants, and how to load community pipelines. It also covers how to load schedulers and compare the speed and quality trade-offs of using different schedulers. Finally, you'll see how to convert and load KerasCV checkpoints so you can use them in PyTorch with 🧨 Diffusers.
18 |
19 |
--------------------------------------------------------------------------------
/llmga/diffusers/docs/source/ko/using-diffusers/pipeline_overview.md:
--------------------------------------------------------------------------------
1 |
12 |
13 | # Overview
14 |
15 | A pipeline is an end-to-end class that provides a quick and easy way to use a diffusion system for inference by bundling independently trained models and schedulers together. Certain combinations of models and schedulers define specific pipeline types, like [`StableDiffusionPipeline`] or [`StableDiffusionControlNetPipeline`], with specific capabilities. All pipeline types inherit from the base [`DiffusionPipeline`] class; pass it any checkpoint and it automatically detects the pipeline type and loads the necessary components.
16 |
17 | This section introduces the tasks supported by pipelines, such as unconditional image generation and the different techniques and variations of text-to-image generation. You'll also learn how to get more control over the generation process by setting a seed for reproducibility and weighting prompts to adjust how much influence certain words in the prompt have on the output. Finally, you'll see how to create a community pipeline for a custom task like generating images from speech.
18 |
--------------------------------------------------------------------------------
/llmga/diffusers/docs/source/zh/_toctree.yml:
--------------------------------------------------------------------------------
1 | - sections:
2 | - local: index
3 | title: 🧨 Diffusers
4 | - local: quicktour
5 | title: 快速入门
6 | - local: stable_diffusion
7 | title: 有效和高效的扩散
8 | - local: installation
9 | title: 安装
10 | title: 开始
11 |
--------------------------------------------------------------------------------
/llmga/diffusers/examples/community/one_step_unet.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import torch
3 |
4 | from diffusers import DiffusionPipeline
5 |
6 |
7 | class UnetSchedulerOneForwardPipeline(DiffusionPipeline):
8 | def __init__(self, unet, scheduler):
9 | super().__init__()
10 |
11 | self.register_modules(unet=unet, scheduler=scheduler)
12 |
13 | def __call__(self):
14 | image = torch.randn(
15 | (1, self.unet.config.in_channels, self.unet.config.sample_size, self.unet.config.sample_size),
16 | )
17 | timestep = 1
18 |
19 | model_output = self.unet(image, timestep).sample
20 | scheduler_output = self.scheduler.step(model_output, timestep, image).prev_sample
21 |
22 | result = scheduler_output - scheduler_output + torch.ones_like(scheduler_output)
23 |
24 | return result
25 |
--------------------------------------------------------------------------------
/llmga/diffusers/examples/controlnet/requirements.txt:
--------------------------------------------------------------------------------
1 | accelerate>=0.16.0
2 | torchvision
3 | transformers>=4.25.1
4 | ftfy
5 | tensorboard
6 | datasets
7 |
--------------------------------------------------------------------------------
/llmga/diffusers/examples/controlnet/requirements_flax.txt:
--------------------------------------------------------------------------------
1 | transformers>=4.25.1
2 | datasets
3 | flax
4 | optax
5 | torch
6 | torchvision
7 | ftfy
8 | tensorboard
9 | Jinja2
10 |
--------------------------------------------------------------------------------
/llmga/diffusers/examples/controlnet/requirements_sdxl.txt:
--------------------------------------------------------------------------------
1 | accelerate>=0.16.0
2 | torchvision
3 | transformers>=4.25.1
4 | ftfy
5 | tensorboard
6 | Jinja2
7 | datasets
8 | wandb
9 |
--------------------------------------------------------------------------------
/llmga/diffusers/examples/custom_diffusion/requirements.txt:
--------------------------------------------------------------------------------
1 | accelerate
2 | torchvision
3 | transformers>=4.25.1
4 | ftfy
5 | tensorboard
6 | Jinja2
7 |
--------------------------------------------------------------------------------
/llmga/diffusers/examples/dreambooth/requirements.txt:
--------------------------------------------------------------------------------
1 | accelerate>=0.16.0
2 | torchvision
3 | transformers>=4.25.1
4 | ftfy
5 | tensorboard
6 | Jinja2
7 |
--------------------------------------------------------------------------------
/llmga/diffusers/examples/dreambooth/requirements_flax.txt:
--------------------------------------------------------------------------------
1 | transformers>=4.25.1
2 | flax
3 | optax
4 | torch
5 | torchvision
6 | ftfy
7 | tensorboard
8 | Jinja2
9 |
--------------------------------------------------------------------------------
/llmga/diffusers/examples/dreambooth/requirements_sdxl.txt:
--------------------------------------------------------------------------------
1 | accelerate>=0.16.0
2 | torchvision
3 | transformers>=4.25.1
4 | ftfy
5 | tensorboard
6 | Jinja2
7 |
--------------------------------------------------------------------------------
/llmga/diffusers/examples/inference/README.md:
--------------------------------------------------------------------------------
1 | # Inference Examples
2 |
3 | **The inference examples folder is deprecated and will be removed in a future version**.
4 | **Officially supported inference examples can be found in the [Pipelines folder](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines)**.
5 |
6 | - For `Image-to-Image text-guided generation with Stable Diffusion`, please have a look at the official [Pipeline examples](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines#examples)
7 | - For `In-painting using Stable Diffusion`, please have a look at the official [Pipeline examples](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines#examples)
8 | - For `Tweak prompts reusing seeds and latents`, please have a look at the official [Pipeline examples](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines#examples)
9 |
--------------------------------------------------------------------------------
/llmga/diffusers/examples/inference/image_to_image.py:
--------------------------------------------------------------------------------
1 | import warnings
2 |
3 | from diffusers import StableDiffusionImg2ImgPipeline # noqa F401
4 |
5 |
6 | warnings.warn(
7 | "The `image_to_image.py` script is outdated. Please use directly `from diffusers import"
8 | " StableDiffusionImg2ImgPipeline` instead."
9 | )
10 |
--------------------------------------------------------------------------------
/llmga/diffusers/examples/inference/inpainting.py:
--------------------------------------------------------------------------------
1 | import warnings
2 |
3 | from diffusers import StableDiffusionInpaintPipeline as StableDiffusionInpaintPipeline # noqa F401
4 |
5 |
6 | warnings.warn(
7 | "The `inpainting.py` script is outdated. Please use directly `from diffusers import"
8 | " StableDiffusionInpaintPipeline` instead."
9 | )
10 |
--------------------------------------------------------------------------------
/llmga/diffusers/examples/instruct_pix2pix/requirements.txt:
--------------------------------------------------------------------------------
1 | accelerate>=0.16.0
2 | torchvision
3 | transformers>=4.25.1
4 | datasets
5 | ftfy
6 | tensorboard
--------------------------------------------------------------------------------
/llmga/diffusers/examples/kandinsky2_2/text_to_image/requirements.txt:
--------------------------------------------------------------------------------
1 | accelerate>=0.16.0
2 | torchvision
3 | transformers>=4.25.1
4 | datasets
5 | ftfy
6 | tensorboard
7 | Jinja2
8 |
--------------------------------------------------------------------------------
/llmga/diffusers/examples/reinforcement_learning/README.md:
--------------------------------------------------------------------------------
1 | # Overview
2 |
3 | These examples show how to run [Diffuser](https://arxiv.org/abs/2205.09991) in Diffusers.
4 | There are two ways to use the script `run_diffuser_locomotion.py`.
5 |
6 | The key option is the variable `n_guide_steps`.
7 | When `n_guide_steps=0`, the trajectories are sampled from the diffusion model, but not fine-tuned to maximize reward in the environment.
8 | By default, `n_guide_steps=2` to match the original implementation.
9 |
10 |
11 | You will need some RL specific requirements to run the examples:
12 |
13 | ```
14 | pip install -f https://download.pytorch.org/whl/torch_stable.html \
15 | free-mujoco-py \
16 | einops \
17 | gym==0.24.1 \
18 | protobuf==3.20.1 \
19 | git+https://github.com/rail-berkeley/d4rl.git \
20 | mediapy \
21 | Pillow==9.0.0
22 | ```
23 |
--------------------------------------------------------------------------------
/llmga/diffusers/examples/research_projects/README.md:
--------------------------------------------------------------------------------
1 | # Research projects
2 |
3 | This folder contains various research projects using 🧨 Diffusers.
4 | They are not really maintained by the core maintainers of this library and often require a specific version of Diffusers that is indicated in the requirements file of each folder.
5 | Updating them to the most recent version of the library will require some work.
6 |
7 | To use any of them, just run the command
8 |
9 | ```
10 | pip install -r requirements.txt
11 | ```
12 | inside the folder of your choice.
13 |
14 | If you need help with any of those, please open an issue where you directly ping the author(s), as indicated at the top of the README of each folder.
15 |
--------------------------------------------------------------------------------
/llmga/diffusers/examples/research_projects/colossalai/inference.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | from diffusers import StableDiffusionPipeline
4 |
5 |
6 | model_id = "path-to-your-trained-model"
7 | pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")
8 |
9 | prompt = "A photo of sks dog in a bucket"
10 | image = pipe(prompt, num_inference_steps=50, guidance_scale=7.5).images[0]
11 |
12 | image.save("dog-bucket.png")
13 |
--------------------------------------------------------------------------------
/llmga/diffusers/examples/research_projects/colossalai/requirement.txt:
--------------------------------------------------------------------------------
1 | diffusers
2 | torch
3 | torchvision
4 | ftfy
5 | tensorboard
6 | Jinja2
7 | transformers
--------------------------------------------------------------------------------
/llmga/diffusers/examples/research_projects/dreambooth_inpaint/requirements.txt:
--------------------------------------------------------------------------------
1 | diffusers==0.9.0
2 | accelerate>=0.16.0
3 | torchvision
4 | transformers>=4.21.0
5 | ftfy
6 | tensorboard
7 | Jinja2
8 |
--------------------------------------------------------------------------------
/llmga/diffusers/examples/research_projects/intel_opts/textual_inversion/requirements.txt:
--------------------------------------------------------------------------------
1 | accelerate>=0.16.0
2 | torchvision
3 | transformers>=4.21.0
4 | ftfy
5 | tensorboard
6 | Jinja2
7 | intel_extension_for_pytorch>=1.13
8 |
--------------------------------------------------------------------------------
/llmga/diffusers/examples/research_projects/intel_opts/textual_inversion_dfq/requirements.txt:
--------------------------------------------------------------------------------
1 | accelerate
2 | torchvision
3 | transformers>=4.25.0
4 | ftfy
5 | tensorboard
6 | modelcards
7 | neural-compressor
--------------------------------------------------------------------------------
/llmga/diffusers/examples/research_projects/lora/requirements.txt:
--------------------------------------------------------------------------------
1 | accelerate>=0.16.0
2 | torchvision
3 | transformers>=4.25.1
4 | datasets
5 | ftfy
6 | tensorboard
7 | Jinja2
8 | git+https://github.com/huggingface/peft.git
--------------------------------------------------------------------------------
/llmga/diffusers/examples/research_projects/mulit_token_textual_inversion/requirements.txt:
--------------------------------------------------------------------------------
1 | accelerate>=0.16.0
2 | torchvision
3 | transformers>=4.25.1
4 | ftfy
5 | tensorboard
6 | Jinja2
7 |
--------------------------------------------------------------------------------
/llmga/diffusers/examples/research_projects/mulit_token_textual_inversion/requirements_flax.txt:
--------------------------------------------------------------------------------
1 | transformers>=4.25.1
2 | flax
3 | optax
4 | torch
5 | torchvision
6 | ftfy
7 | tensorboard
8 | Jinja2
9 |
--------------------------------------------------------------------------------
/llmga/diffusers/examples/research_projects/multi_subject_dreambooth/requirements.txt:
--------------------------------------------------------------------------------
1 | accelerate>=0.16.0
2 | torchvision
3 | transformers>=4.25.1
4 | ftfy
5 | tensorboard
6 | Jinja2
--------------------------------------------------------------------------------
/llmga/diffusers/examples/research_projects/onnxruntime/README.md:
--------------------------------------------------------------------------------
1 | ## Diffusers examples with ONNXRuntime optimizations
2 |
3 | **This research project is not actively maintained by the diffusers team. For any questions or comments, please contact Prathik Rao (prathikr), Sunghoon Choi (hanbitmyths), Ashwini Khade (askhade), or Peng Wang (pengwa) on GitHub.**
4 |
5 | This project aims to provide diffusers examples with ONNXRuntime optimizations for training/fine-tuning unconditional image generation, text-to-image, and textual inversion. Please see the individual directories for more details on how to run each task using ONNXRuntime.
--------------------------------------------------------------------------------
/llmga/diffusers/examples/research_projects/onnxruntime/text_to_image/requirements.txt:
--------------------------------------------------------------------------------
1 | accelerate>=0.16.0
2 | torchvision
3 | transformers>=4.25.1
4 | datasets
5 | ftfy
6 | tensorboard
7 | modelcards
8 |
--------------------------------------------------------------------------------
/llmga/diffusers/examples/research_projects/onnxruntime/textual_inversion/requirements.txt:
--------------------------------------------------------------------------------
1 | accelerate>=0.16.0
2 | torchvision
3 | transformers>=4.25.1
4 | ftfy
5 | tensorboard
6 | modelcards
7 |
--------------------------------------------------------------------------------
/llmga/diffusers/examples/research_projects/onnxruntime/unconditional_image_generation/requirements.txt:
--------------------------------------------------------------------------------
1 | accelerate>=0.16.0
2 | torchvision
3 | datasets
4 | tensorboard
--------------------------------------------------------------------------------
/llmga/diffusers/examples/research_projects/rdm/README.md:
--------------------------------------------------------------------------------
1 | ## Diffusers examples with retrieval augmented diffusion models (RDM)
2 |
3 | **This research project is not actively maintained by the diffusers team. For any questions or comments, please contact Isamu Isozaki (isamu-isozaki) on GitHub.**
4 |
5 | The aim of this project is to provide retrieval augmented diffusion models to diffusers!
--------------------------------------------------------------------------------
/llmga/diffusers/examples/t2i_adapter/README.md:
--------------------------------------------------------------------------------
1 | We don't support training T2I-Adapters on Stable Diffusion yet. For training T2I-Adapters on Stable Diffusion XL, refer [here](./README_sdxl.md).
--------------------------------------------------------------------------------
/llmga/diffusers/examples/t2i_adapter/requirements.txt:
--------------------------------------------------------------------------------
1 | transformers>=4.25.1
2 | accelerate>=0.16.0
3 | safetensors
4 | datasets
5 | torchvision
6 | ftfy
7 | tensorboard
8 | wandb
--------------------------------------------------------------------------------
/llmga/diffusers/examples/text_to_image/requirements.txt:
--------------------------------------------------------------------------------
1 | accelerate>=0.16.0
2 | torchvision
3 | transformers>=4.25.1
4 | datasets
5 | ftfy
6 | tensorboard
7 | Jinja2
8 |
--------------------------------------------------------------------------------
/llmga/diffusers/examples/text_to_image/requirements_flax.txt:
--------------------------------------------------------------------------------
1 | transformers>=4.25.1
2 | datasets
3 | flax
4 | optax
5 | torch
6 | torchvision
7 | ftfy
8 | tensorboard
9 | Jinja2
10 |
--------------------------------------------------------------------------------
/llmga/diffusers/examples/text_to_image/requirements_sdxl.txt:
--------------------------------------------------------------------------------
1 | accelerate>=0.22.0
2 | torchvision
3 | transformers>=4.25.1
4 | ftfy
5 | tensorboard
6 | Jinja2
7 | datasets
8 |
--------------------------------------------------------------------------------
/llmga/diffusers/examples/textual_inversion/requirements.txt:
--------------------------------------------------------------------------------
1 | accelerate>=0.16.0
2 | torchvision
3 | transformers>=4.25.1
4 | ftfy
5 | tensorboard
6 | Jinja2
7 |
--------------------------------------------------------------------------------
/llmga/diffusers/examples/textual_inversion/requirements_flax.txt:
--------------------------------------------------------------------------------
1 | transformers>=4.25.1
2 | flax
3 | optax
4 | torch
5 | torchvision
6 | ftfy
7 | tensorboard
8 | Jinja2
9 |
--------------------------------------------------------------------------------
/llmga/diffusers/examples/unconditional_image_generation/requirements.txt:
--------------------------------------------------------------------------------
1 | accelerate>=0.16.0
2 | torchvision
3 | datasets
4 |
--------------------------------------------------------------------------------
/llmga/diffusers/pip.sh:
--------------------------------------------------------------------------------
1 | pip install datasets --index-url https://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn
2 | pip install . --index-url https://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn
3 | pip install albumentations --index-url https://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn
--------------------------------------------------------------------------------
/llmga/diffusers/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.black]
2 | line-length = 119
3 | target-version = ['py37']
4 |
5 | [tool.ruff]
6 | # Never enforce `E501` (line length violations).
7 | ignore = ["C901", "E501", "E741", "W605"]
8 | select = ["C", "E", "F", "I", "W"]
9 | line-length = 119
10 |
11 | # Ignore import violations in all `__init__.py` files.
12 | [tool.ruff.per-file-ignores]
13 | "__init__.py" = ["E402", "F401", "F403", "F811"]
14 | "src/diffusers/utils/dummy_*.py" = ["F401"]
15 |
16 | [tool.ruff.isort]
17 | lines-after-imports = 2
18 | known-first-party = ["diffusers"]
19 |
--------------------------------------------------------------------------------
/llmga/diffusers/scripts/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dvlab-research/LLMGA/7e32eb6858c2fda168e921c12fa9622d9af9ce42/llmga/diffusers/scripts/__init__.py
--------------------------------------------------------------------------------
/llmga/diffusers/scripts/convert_unclip_txt2img_to_image_variation.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
4 |
5 | from diffusers import UnCLIPImageVariationPipeline, UnCLIPPipeline
6 |
7 |
8 | if __name__ == "__main__":
9 | parser = argparse.ArgumentParser()
10 |
11 | parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.")
12 |
13 | parser.add_argument(
14 | "--txt2img_unclip",
15 | default="kakaobrain/karlo-v1-alpha",
16 | type=str,
17 | required=False,
18 | help="The pretrained txt2img unclip.",
19 | )
20 |
21 | args = parser.parse_args()
22 |
23 | txt2img = UnCLIPPipeline.from_pretrained(args.txt2img_unclip)
24 |
25 | feature_extractor = CLIPImageProcessor()
26 | image_encoder = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-large-patch14")
27 |
28 | img2img = UnCLIPImageVariationPipeline(
29 | decoder=txt2img.decoder,
30 | text_encoder=txt2img.text_encoder,
31 | tokenizer=txt2img.tokenizer,
32 | text_proj=txt2img.text_proj,
33 | feature_extractor=feature_extractor,
34 | image_encoder=image_encoder,
35 | super_res_first=txt2img.super_res_first,
36 | super_res_last=txt2img.super_res_last,
37 | decoder_scheduler=txt2img.decoder_scheduler,
38 | super_res_scheduler=txt2img.super_res_scheduler,
39 | )
40 |
41 | img2img.save_pretrained(args.dump_path)
42 |
--------------------------------------------------------------------------------
/llmga/diffusers/setup.cfg:
--------------------------------------------------------------------------------
1 | [isort]
2 | default_section = FIRSTPARTY
3 | ensure_newline_before_comments = True
4 | force_grid_wrap = 0
5 | include_trailing_comma = True
6 | known_first_party = accelerate
7 | known_third_party =
8 | numpy
9 | torch
10 | torch_xla
11 |
12 | line_length = 119
13 | lines_after_imports = 2
14 | multi_line_output = 3
15 | use_parentheses = True
16 |
17 | [flake8]
18 | ignore = E203, E722, E501, E741, W503, W605
19 | max-line-length = 119
20 | per-file-ignores = __init__.py:F401
21 |
--------------------------------------------------------------------------------
/llmga/diffusers/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dvlab-research/LLMGA/7e32eb6858c2fda168e921c12fa9622d9af9ce42/llmga/diffusers/src/__init__.py
--------------------------------------------------------------------------------
/llmga/diffusers/src/diffusers/commands/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from abc import ABC, abstractmethod
16 | from argparse import ArgumentParser
17 |
18 |
19 | class BaseDiffusersCLICommand(ABC):
20 | @staticmethod
21 | @abstractmethod
22 | def register_subcommand(parser: ArgumentParser):
23 | raise NotImplementedError()
24 |
25 | @abstractmethod
26 | def run(self):
27 | raise NotImplementedError()
28 |
--------------------------------------------------------------------------------
/llmga/diffusers/src/diffusers/commands/diffusers_cli.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Copyright 2023 The HuggingFace Team. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | from argparse import ArgumentParser
17 |
18 | from .env import EnvironmentCommand
19 | from .fp16_safetensors import FP16SafetensorsCommand
20 |
21 |
22 | def main():
23 | parser = ArgumentParser("Diffusers CLI tool", usage="diffusers-cli <command> [<args>]")
24 | commands_parser = parser.add_subparsers(help="diffusers-cli command helpers")
25 |
26 | # Register commands
27 | EnvironmentCommand.register_subcommand(commands_parser)
28 | FP16SafetensorsCommand.register_subcommand(commands_parser)
29 |
30 | # Let's go
31 | args = parser.parse_args()
32 |
33 | if not hasattr(args, "func"):
34 | parser.print_help()
35 | exit(1)
36 |
37 | # Run
38 | service = args.func(args)
39 | service.run()
40 |
41 |
42 | if __name__ == "__main__":
43 | main()
44 |
--------------------------------------------------------------------------------
/llmga/diffusers/src/diffusers/dependency_versions_table.py:
--------------------------------------------------------------------------------
1 | # THIS FILE HAS BEEN AUTOGENERATED. To update:
2 | # 1. modify the `_deps` dict in setup.py
3 | # 2. run `make deps_table_update`
4 | deps = {
5 | "Pillow": "Pillow",
6 | "accelerate": "accelerate>=0.11.0",
7 | "compel": "compel==0.1.8",
8 | "black": "black~=23.1",
9 | "datasets": "datasets",
10 | "filelock": "filelock",
11 | "flax": "flax>=0.4.1",
12 | "hf-doc-builder": "hf-doc-builder>=0.3.0",
13 | "huggingface-hub": "huggingface-hub>=0.13.2",
14 | "requests-mock": "requests-mock==1.10.0",
15 | "importlib_metadata": "importlib_metadata",
16 | "invisible-watermark": "invisible-watermark>=0.2.0",
17 | "isort": "isort>=5.5.4",
18 | "jax": "jax>=0.4.1",
19 | "jaxlib": "jaxlib>=0.4.1",
20 | "Jinja2": "Jinja2",
21 | "k-diffusion": "k-diffusion>=0.0.12",
22 | "torchsde": "torchsde",
23 | "note_seq": "note_seq",
24 | "librosa": "librosa",
25 | "numpy": "numpy",
26 | "omegaconf": "omegaconf",
27 | "parameterized": "parameterized",
28 | "protobuf": "protobuf>=3.20.3,<4",
29 | "pytest": "pytest",
30 | "pytest-timeout": "pytest-timeout",
31 | "pytest-xdist": "pytest-xdist",
32 | "ruff": "ruff==0.0.280",
33 | "safetensors": "safetensors>=0.3.1",
34 | "sentencepiece": "sentencepiece>=0.1.91,!=0.1.92",
35 | "scipy": "scipy",
36 | "onnx": "onnx",
37 | "regex": "regex!=2019.12.17",
38 | "requests": "requests",
39 | "tensorboard": "tensorboard",
40 | "torch": "torch>=1.4",
41 | "torchvision": "torchvision",
42 | "transformers": "transformers>=4.25.1",
43 | "urllib3": "urllib3<=2.0.0",
44 | }
45 |
--------------------------------------------------------------------------------
/llmga/diffusers/src/diffusers/experimental/README.md:
--------------------------------------------------------------------------------
1 | # 🧨 Diffusers Experimental
2 |
3 | We are adding experimental code to support novel applications and usages of the Diffusers library.
4 | Currently, the following experiments are supported:
5 | * Reinforcement learning via an implementation of the [Diffuser](https://arxiv.org/abs/2205.09991) model.
--------------------------------------------------------------------------------
/llmga/diffusers/src/diffusers/experimental/__init__.py:
--------------------------------------------------------------------------------
1 | from .rl import ValueGuidedRLPipeline
2 |
--------------------------------------------------------------------------------
/llmga/diffusers/src/diffusers/experimental/rl/__init__.py:
--------------------------------------------------------------------------------
1 | from .value_guided_sampling import ValueGuidedRLPipeline
2 |
--------------------------------------------------------------------------------
/llmga/diffusers/src/diffusers/models/README.md:
--------------------------------------------------------------------------------
1 | # Models
2 |
3 | For more detail on the models, please refer to the [docs](https://huggingface.co/docs/diffusers/api/models/overview).
--------------------------------------------------------------------------------
/llmga/diffusers/src/diffusers/models/activations.py:
--------------------------------------------------------------------------------
1 | from torch import nn
2 |
3 |
4 | def get_activation(act_fn: str) -> nn.Module:
5 | """Helper function to get activation function from string.
6 |
7 | Args:
8 | act_fn (str): Name of activation function.
9 |
10 | Returns:
11 | nn.Module: Activation function.
12 | """
13 | if act_fn in ["swish", "silu"]:
14 | return nn.SiLU()
15 | elif act_fn == "mish":
16 | return nn.Mish()
17 | elif act_fn == "gelu":
18 | return nn.GELU()
19 | elif act_fn == "relu":
20 | return nn.ReLU()
21 | else:
22 | raise ValueError(f"Unsupported activation function: {act_fn}")
23 |
--------------------------------------------------------------------------------
/llmga/diffusers/src/diffusers/pipelines/alt_diffusion/pipeline_output.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 | from typing import List, Optional, Union
3 |
4 | import numpy as np
5 | import PIL.Image
6 |
7 | from ...utils import (
8 | BaseOutput,
9 | )
10 |
11 |
12 | @dataclass
13 | # Copied from diffusers.pipelines.stable_diffusion.pipeline_output.StableDiffusionPipelineOutput with Stable->Alt
14 | class AltDiffusionPipelineOutput(BaseOutput):
15 | """
16 | Output class for Alt Diffusion pipelines.
17 |
18 | Args:
19 | images (`List[PIL.Image.Image]` or `np.ndarray`)
20 | List of denoised PIL images of length `batch_size` or NumPy array of shape `(batch_size, height, width,
21 | num_channels)`.
22 | nsfw_content_detected (`List[bool]`)
23 | List indicating whether the corresponding generated image contains "not-safe-for-work" (nsfw) content or
24 | `None` if safety checking could not be performed.
25 | """
26 |
27 | images: Union[List[PIL.Image.Image], np.ndarray]
28 | nsfw_content_detected: Optional[List[bool]]
29 |
--------------------------------------------------------------------------------
/llmga/diffusers/src/diffusers/pipelines/audio_diffusion/__init__.py:
--------------------------------------------------------------------------------
1 | from typing import TYPE_CHECKING
2 |
3 | from ...utils import DIFFUSERS_SLOW_IMPORT, _LazyModule
4 |
5 |
6 | _import_structure = {
7 | "mel": ["Mel"],
8 | "pipeline_audio_diffusion": ["AudioDiffusionPipeline"],
9 | }
10 |
11 | if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
12 | from .mel import Mel
13 | from .pipeline_audio_diffusion import AudioDiffusionPipeline
14 |
15 | else:
16 | import sys
17 |
18 | sys.modules[__name__] = _LazyModule(
19 | __name__,
20 | globals()["__file__"],
21 | _import_structure,
22 | module_spec=__spec__,
23 | )
24 |
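This file shows the lazy-import pattern repeated by most pipeline `__init__.py` modules below: declare `_import_structure`, then replace the module in `sys.modules` with a `_LazyModule` so heavy submodules are only imported on first attribute access. A toy, self-contained sketch of the idea (not the actual `_LazyModule` implementation):

```python
import importlib
import types


class ToyLazyModule(types.ModuleType):
    """Defer submodule imports until an attribute is actually requested."""

    def __init__(self, name, import_structure):
        super().__init__(name)
        # e.g. {"pipeline_audio_diffusion": ["AudioDiffusionPipeline"]}
        self._class_to_module = {
            cls: mod for mod, classes in import_structure.items() for cls in classes
        }

    def __getattr__(self, item):
        module_name = self._class_to_module.get(item)
        if module_name is None:
            raise AttributeError(f"module {self.__name__!r} has no attribute {item!r}")
        submodule = importlib.import_module(f"{self.__name__}.{module_name}")
        value = getattr(submodule, item)
        setattr(self, item, value)  # cache so later lookups skip __getattr__
        return value
```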
--------------------------------------------------------------------------------
/llmga/diffusers/src/diffusers/pipelines/audioldm/__init__.py:
--------------------------------------------------------------------------------
1 | from typing import TYPE_CHECKING
2 |
3 | from ...utils import (
4 | DIFFUSERS_SLOW_IMPORT,
5 | OptionalDependencyNotAvailable,
6 | _LazyModule,
7 | is_torch_available,
8 | is_transformers_available,
9 | is_transformers_version,
10 | )
11 |
12 |
13 | _dummy_objects = {}
14 | _import_structure = {}
15 |
16 | try:
17 | if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.27.0")):
18 | raise OptionalDependencyNotAvailable()
19 | except OptionalDependencyNotAvailable:
20 | from ...utils.dummy_torch_and_transformers_objects import (
21 | AudioLDMPipeline,
22 | )
23 |
24 | _dummy_objects.update({"AudioLDMPipeline": AudioLDMPipeline})
25 | else:
26 | _import_structure["pipeline_audioldm"] = ["AudioLDMPipeline"]
27 |
28 |
29 | if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
30 | try:
31 | if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.27.0")):
32 | raise OptionalDependencyNotAvailable()
33 | except OptionalDependencyNotAvailable:
34 | from ...utils.dummy_torch_and_transformers_objects import (
35 | AudioLDMPipeline,
36 | )
37 |
38 | else:
39 | from .pipeline_audioldm import AudioLDMPipeline
40 | else:
41 | import sys
42 |
43 | sys.modules[__name__] = _LazyModule(
44 | __name__,
45 | globals()["__file__"],
46 | _import_structure,
47 | module_spec=__spec__,
48 | )
49 |
50 | for name, value in _dummy_objects.items():
51 | setattr(sys.modules[__name__], name, value)
52 |
--------------------------------------------------------------------------------
/llmga/diffusers/src/diffusers/pipelines/blip_diffusion/__init__.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 | from typing import List, Optional, Union
3 |
4 | import numpy as np
5 | import PIL
6 | from PIL import Image
7 |
8 | from ...utils import OptionalDependencyNotAvailable, is_torch_available, is_transformers_available
9 |
10 |
11 | try:
12 | if not (is_transformers_available() and is_torch_available()):
13 | raise OptionalDependencyNotAvailable()
14 | except OptionalDependencyNotAvailable:
15 | from ...utils.dummy_torch_and_transformers_objects import ShapEPipeline
16 | else:
17 | from .blip_image_processing import BlipImageProcessor
18 | from .modeling_blip2 import Blip2QFormerModel
19 | from .modeling_ctx_clip import ContextCLIPTextModel
20 | from .pipeline_blip_diffusion import BlipDiffusionPipeline
21 |
--------------------------------------------------------------------------------
/llmga/diffusers/src/diffusers/pipelines/consistency_models/__init__.py:
--------------------------------------------------------------------------------
1 | from typing import TYPE_CHECKING
2 |
3 | from ...utils import (
4 | DIFFUSERS_SLOW_IMPORT,
5 | _LazyModule,
6 | )
7 |
8 |
9 | _import_structure = {"pipeline_consistency_models": ["ConsistencyModelPipeline"]}
10 |
11 | if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
12 | from .pipeline_consistency_models import ConsistencyModelPipeline
13 |
14 | else:
15 | import sys
16 |
17 | sys.modules[__name__] = _LazyModule(
18 | __name__,
19 | globals()["__file__"],
20 | _import_structure,
21 | module_spec=__spec__,
22 | )
23 |
--------------------------------------------------------------------------------
/llmga/diffusers/src/diffusers/pipelines/dance_diffusion/__init__.py:
--------------------------------------------------------------------------------
1 | from typing import TYPE_CHECKING
2 |
3 | from ...utils import DIFFUSERS_SLOW_IMPORT, _LazyModule
4 |
5 |
6 | _import_structure = {"pipeline_dance_diffusion": ["DanceDiffusionPipeline"]}
7 |
8 | if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
9 | from .pipeline_dance_diffusion import DanceDiffusionPipeline
10 | else:
11 | import sys
12 |
13 | sys.modules[__name__] = _LazyModule(
14 | __name__,
15 | globals()["__file__"],
16 | _import_structure,
17 | module_spec=__spec__,
18 | )
19 |
--------------------------------------------------------------------------------
/llmga/diffusers/src/diffusers/pipelines/ddim/__init__.py:
--------------------------------------------------------------------------------
1 | from typing import TYPE_CHECKING
2 |
3 | from ...utils import DIFFUSERS_SLOW_IMPORT, _LazyModule
4 |
5 |
6 | _import_structure = {"pipeline_ddim": ["DDIMPipeline"]}
7 |
8 | if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
9 | from .pipeline_ddim import DDIMPipeline
10 | else:
11 | import sys
12 |
13 | sys.modules[__name__] = _LazyModule(
14 | __name__,
15 | globals()["__file__"],
16 | _import_structure,
17 | module_spec=__spec__,
18 | )
19 |
--------------------------------------------------------------------------------
/llmga/diffusers/src/diffusers/pipelines/ddpm/__init__.py:
--------------------------------------------------------------------------------
1 | from typing import TYPE_CHECKING
2 |
3 | from ...utils import (
4 | DIFFUSERS_SLOW_IMPORT,
5 | _LazyModule,
6 | )
7 |
8 |
9 | _import_structure = {"pipeline_ddpm": ["DDPMPipeline"]}
10 |
11 | if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
12 | from .pipeline_ddpm import DDPMPipeline
13 |
14 | else:
15 | import sys
16 |
17 | sys.modules[__name__] = _LazyModule(
18 | __name__,
19 | globals()["__file__"],
20 | _import_structure,
21 | module_spec=__spec__,
22 | )
23 |
--------------------------------------------------------------------------------
/llmga/diffusers/src/diffusers/pipelines/deepfloyd_if/pipeline_output.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 | from typing import List, Optional, Union
3 |
4 | import numpy as np
5 | import PIL.Image
6 |
7 | from ...utils import BaseOutput
8 |
9 |
10 | @dataclass
11 | class IFPipelineOutput(BaseOutput):
12 | """
13 |     Output class for DeepFloyd IF pipelines.
14 |     Args:
15 | images (`List[PIL.Image.Image]` or `np.ndarray`)
16 | List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width,
17 |             num_channels)`. PIL images or NumPy arrays represent the denoised images of the diffusion pipeline.
18 | nsfw_detected (`List[bool]`)
19 | List of flags denoting whether the corresponding generated image likely represents "not-safe-for-work"
20 | (nsfw) content or a watermark. `None` if safety checking could not be performed.
21 | watermark_detected (`List[bool]`)
22 | List of flags denoting whether the corresponding generated image likely has a watermark. `None` if safety
23 | checking could not be performed.
24 | """
25 |
26 | images: Union[List[PIL.Image.Image], np.ndarray]
27 | nsfw_detected: Optional[List[bool]]
28 | watermark_detected: Optional[List[bool]]
29 |
--------------------------------------------------------------------------------
/llmga/diffusers/src/diffusers/pipelines/dit/__init__.py:
--------------------------------------------------------------------------------
1 | from typing import TYPE_CHECKING
2 |
3 | from ...utils import DIFFUSERS_SLOW_IMPORT, _LazyModule
4 |
5 |
6 | _import_structure = {"pipeline_dit": ["DiTPipeline"]}
7 |
8 | if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
9 | from .pipeline_dit import DiTPipeline
10 |
11 | else:
12 | import sys
13 |
14 | sys.modules[__name__] = _LazyModule(
15 | __name__,
16 | globals()["__file__"],
17 | _import_structure,
18 | module_spec=__spec__,
19 | )
20 |
--------------------------------------------------------------------------------
/llmga/diffusers/src/diffusers/pipelines/kandinsky/text_encoder.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from transformers import PreTrainedModel, XLMRobertaConfig, XLMRobertaModel
3 |
4 |
5 | class MCLIPConfig(XLMRobertaConfig):
6 | model_type = "M-CLIP"
7 |
8 | def __init__(self, transformerDimSize=1024, imageDimSize=768, **kwargs):
9 | self.transformerDimensions = transformerDimSize
10 | self.numDims = imageDimSize
11 | super().__init__(**kwargs)
12 |
13 |
14 | class MultilingualCLIP(PreTrainedModel):
15 | config_class = MCLIPConfig
16 |
17 | def __init__(self, config, *args, **kwargs):
18 | super().__init__(config, *args, **kwargs)
19 | self.transformer = XLMRobertaModel(config)
20 | self.LinearTransformation = torch.nn.Linear(
21 | in_features=config.transformerDimensions, out_features=config.numDims
22 | )
23 |
24 | def forward(self, input_ids, attention_mask):
25 | embs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)[0]
26 | embs2 = (embs * attention_mask.unsqueeze(2)).sum(dim=1) / attention_mask.sum(dim=1)[:, None]
27 | return self.LinearTransformation(embs2), embs
28 |
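A hedged usage sketch for the multilingual text encoder above. The tokenizer checkpoint name is illustrative (it requires a download), the model is randomly initialised here purely for shape checking, and in practice the pretrained weights arrive through the Kandinsky pipeline; `transformerDimSize` is set to 768 so it matches the default XLM-R hidden size:

```python
import torch
from transformers import XLMRobertaTokenizerFast

from diffusers.pipelines.kandinsky.text_encoder import MCLIPConfig, MultilingualCLIP

tokenizer = XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base")  # illustrative checkpoint
config = MCLIPConfig(transformerDimSize=768, imageDimSize=768, vocab_size=tokenizer.vocab_size)
model = MultilingualCLIP(config).eval()  # random weights, for shape checking only

inputs = tokenizer(["a photo of a cat"], return_tensors="pt")
with torch.no_grad():
    projected, hidden = model(inputs["input_ids"], inputs["attention_mask"])

print(projected.shape)  # (1, 768): mask-averaged, linearly projected text embedding
print(hidden.shape)     # (1, seq_len, 768): token-level hidden states
```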
--------------------------------------------------------------------------------
/llmga/diffusers/src/diffusers/pipelines/latent_diffusion/__init__.py:
--------------------------------------------------------------------------------
1 | from typing import TYPE_CHECKING
2 |
3 | from ...utils import (
4 | DIFFUSERS_SLOW_IMPORT,
5 | OptionalDependencyNotAvailable,
6 | _LazyModule,
7 | get_objects_from_module,
8 | is_torch_available,
9 | is_transformers_available,
10 | )
11 |
12 |
13 | _dummy_objects = {}
14 | _import_structure = {}
15 |
16 | try:
17 | if not (is_transformers_available() and is_torch_available()):
18 | raise OptionalDependencyNotAvailable()
19 | except OptionalDependencyNotAvailable:
20 | from ...utils import dummy_torch_and_transformers_objects # noqa F403
21 |
22 | _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
23 | else:
24 | _import_structure["pipeline_latent_diffusion"] = ["LDMBertModel", "LDMTextToImagePipeline"]
25 | _import_structure["pipeline_latent_diffusion_superresolution"] = ["LDMSuperResolutionPipeline"]
26 |
27 |
28 | if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
29 | try:
30 | if not (is_transformers_available() and is_torch_available()):
31 | raise OptionalDependencyNotAvailable()
32 |
33 | except OptionalDependencyNotAvailable:
34 | from ...utils.dummy_torch_and_transformers_objects import *
35 | else:
36 | from .pipeline_latent_diffusion import LDMBertModel, LDMTextToImagePipeline
37 | from .pipeline_latent_diffusion_superresolution import LDMSuperResolutionPipeline
38 |
39 | else:
40 | import sys
41 |
42 | sys.modules[__name__] = _LazyModule(
43 | __name__,
44 | globals()["__file__"],
45 | _import_structure,
46 | module_spec=__spec__,
47 | )
48 |
49 | for name, value in _dummy_objects.items():
50 | setattr(sys.modules[__name__], name, value)
51 |
--------------------------------------------------------------------------------
/llmga/diffusers/src/diffusers/pipelines/latent_diffusion_uncond/__init__.py:
--------------------------------------------------------------------------------
1 | from typing import TYPE_CHECKING
2 |
3 | from ...utils import DIFFUSERS_SLOW_IMPORT, _LazyModule
4 |
5 |
6 | _import_structure = {"pipeline_latent_diffusion_uncond": ["LDMPipeline"]}
7 |
8 | if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
9 | from .pipeline_latent_diffusion_uncond import LDMPipeline
10 | else:
11 | import sys
12 |
13 | sys.modules[__name__] = _LazyModule(
14 | __name__,
15 | globals()["__file__"],
16 | _import_structure,
17 | module_spec=__spec__,
18 | )
19 |
--------------------------------------------------------------------------------
/llmga/diffusers/src/diffusers/pipelines/musicldm/__init__.py:
--------------------------------------------------------------------------------
1 | from typing import TYPE_CHECKING
2 |
3 | from ...utils import (
4 | DIFFUSERS_SLOW_IMPORT,
5 | OptionalDependencyNotAvailable,
6 | _LazyModule,
7 | get_objects_from_module,
8 | is_torch_available,
9 | is_transformers_available,
10 | is_transformers_version,
11 | )
12 |
13 |
14 | _dummy_objects = {}
15 | _import_structure = {}
16 |
17 | try:
18 | if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.27.0")):
19 | raise OptionalDependencyNotAvailable()
20 | except OptionalDependencyNotAvailable:
21 | from ...utils import dummy_torch_and_transformers_objects # noqa F403
22 |
23 | _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
24 | else:
25 | _import_structure["pipeline_musicldm"] = ["MusicLDMPipeline"]
26 |
27 |
28 | if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
29 | try:
30 | if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.27.0")):
31 | raise OptionalDependencyNotAvailable()
32 |
33 | except OptionalDependencyNotAvailable:
34 | from ...utils.dummy_torch_and_transformers_objects import *
35 | else:
36 | from .pipeline_musicldm import MusicLDMPipeline
37 |
38 | else:
39 | import sys
40 |
41 | sys.modules[__name__] = _LazyModule(
42 | __name__,
43 | globals()["__file__"],
44 | _import_structure,
45 | module_spec=__spec__,
46 | )
47 |
48 | for name, value in _dummy_objects.items():
49 | setattr(sys.modules[__name__], name, value)
50 |
--------------------------------------------------------------------------------
/llmga/diffusers/src/diffusers/pipelines/pndm/__init__.py:
--------------------------------------------------------------------------------
1 | from typing import TYPE_CHECKING
2 |
3 | from ...utils import DIFFUSERS_SLOW_IMPORT, _LazyModule
4 |
5 |
6 | _import_structure = {"pipeline_pndm": ["PNDMPipeline"]}
7 |
8 | if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
9 | from .pipeline_pndm import PNDMPipeline
10 | else:
11 | import sys
12 |
13 | sys.modules[__name__] = _LazyModule(
14 | __name__,
15 | globals()["__file__"],
16 | _import_structure,
17 | module_spec=__spec__,
18 | )
19 |
--------------------------------------------------------------------------------
/llmga/diffusers/src/diffusers/pipelines/repaint/__init__.py:
--------------------------------------------------------------------------------
1 | from typing import TYPE_CHECKING
2 |
3 | from ...utils import DIFFUSERS_SLOW_IMPORT, _LazyModule
4 |
5 |
6 | _import_structure = {"pipeline_repaint": ["RePaintPipeline"]}
7 |
8 | if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
9 | from .pipeline_repaint import RePaintPipeline
10 |
11 | else:
12 | import sys
13 |
14 | sys.modules[__name__] = _LazyModule(
15 | __name__,
16 | globals()["__file__"],
17 | _import_structure,
18 | module_spec=__spec__,
19 | )
20 |
--------------------------------------------------------------------------------
/llmga/diffusers/src/diffusers/pipelines/score_sde_ve/__init__.py:
--------------------------------------------------------------------------------
1 | from typing import TYPE_CHECKING
2 |
3 | from ...utils import DIFFUSERS_SLOW_IMPORT, _LazyModule
4 |
5 |
6 | _import_structure = {"pipeline_score_sde_ve": ["ScoreSdeVePipeline"]}
7 |
8 | if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
9 | from .pipeline_score_sde_ve import ScoreSdeVePipeline
10 |
11 | else:
12 | import sys
13 |
14 | sys.modules[__name__] = _LazyModule(
15 | __name__,
16 | globals()["__file__"],
17 | _import_structure,
18 | module_spec=__spec__,
19 | )
20 |
--------------------------------------------------------------------------------
/llmga/diffusers/src/diffusers/pipelines/semantic_stable_diffusion/__init__.py:
--------------------------------------------------------------------------------
1 | from typing import TYPE_CHECKING
2 |
3 | from ...utils import (
4 | DIFFUSERS_SLOW_IMPORT,
5 | OptionalDependencyNotAvailable,
6 | _LazyModule,
7 | get_objects_from_module,
8 | is_torch_available,
9 | is_transformers_available,
10 | )
11 |
12 |
13 | _dummy_objects = {}
14 | _import_structure = {}
15 |
16 | try:
17 | if not (is_transformers_available() and is_torch_available()):
18 | raise OptionalDependencyNotAvailable()
19 | except OptionalDependencyNotAvailable:
20 | from ...utils import dummy_torch_and_transformers_objects # noqa F403
21 |
22 | _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
23 | else:
24 | _import_structure["pipeline_output"] = ["SemanticStableDiffusionPipelineOutput"]
25 | _import_structure["pipeline_semantic_stable_diffusion"] = ["SemanticStableDiffusionPipeline"]
26 |
27 |
28 | if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
29 | try:
30 | if not (is_transformers_available() and is_torch_available()):
31 | raise OptionalDependencyNotAvailable()
32 |
33 | except OptionalDependencyNotAvailable:
34 | from ...utils.dummy_torch_and_transformers_objects import *
35 | else:
36 | from .pipeline_semantic_stable_diffusion import SemanticStableDiffusionPipeline
37 |
38 | else:
39 | import sys
40 |
41 | sys.modules[__name__] = _LazyModule(
42 | __name__,
43 | globals()["__file__"],
44 | _import_structure,
45 | module_spec=__spec__,
46 | )
47 |
48 | for name, value in _dummy_objects.items():
49 | setattr(sys.modules[__name__], name, value)
50 |
--------------------------------------------------------------------------------
/llmga/diffusers/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_output.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 | from typing import List, Optional, Union
3 |
4 | import numpy as np
5 | import PIL.Image
6 |
7 | from ...utils import BaseOutput
8 |
9 |
10 | @dataclass
11 | class SemanticStableDiffusionPipelineOutput(BaseOutput):
12 | """
13 |     Output class for Semantic Stable Diffusion pipelines.
14 |
15 | Args:
16 | images (`List[PIL.Image.Image]` or `np.ndarray`)
17 | List of denoised PIL images of length `batch_size` or NumPy array of shape `(batch_size, height, width,
18 | num_channels)`.
19 | nsfw_content_detected (`List[bool]`)
20 | List indicating whether the corresponding generated image contains “not-safe-for-work” (nsfw) content or
21 | `None` if safety checking could not be performed.
22 | """
23 |
24 | images: Union[List[PIL.Image.Image], np.ndarray]
25 | nsfw_content_detected: Optional[List[bool]]
26 |
--------------------------------------------------------------------------------
/llmga/diffusers/src/diffusers/pipelines/stable_diffusion/clip_image_project_model.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 The GLIGEN Authors and HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from torch import nn
16 |
17 | from ...configuration_utils import ConfigMixin, register_to_config
18 | from ...models.modeling_utils import ModelMixin
19 |
20 |
21 | class CLIPImageProjection(ModelMixin, ConfigMixin):
22 | @register_to_config
23 | def __init__(self, hidden_size: int = 768):
24 | super().__init__()
25 | self.hidden_size = hidden_size
26 | self.project = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
27 |
28 | def forward(self, x):
29 | return self.project(x)
30 |
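A minimal sketch of the projection module above: it is a single bias-free linear layer applied to CLIP-sized embeddings (randomly initialised here; the GLIGEN pipeline loads it from a checkpoint):

```python
import torch
from diffusers.pipelines.stable_diffusion.clip_image_project_model import CLIPImageProjection

proj = CLIPImageProjection(hidden_size=768)
image_embeds = torch.randn(4, 768)   # e.g. a batch of CLIP image embeddings
print(proj(image_embeds).shape)      # torch.Size([4, 768])
```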
--------------------------------------------------------------------------------
/llmga/diffusers/src/diffusers/pipelines/stable_diffusion/pipeline_output.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 | from typing import List, Optional, Union
3 |
4 | import numpy as np
5 | import PIL.Image
6 |
7 | from ...utils import BaseOutput, is_flax_available
8 |
9 |
10 | @dataclass
11 | class StableDiffusionPipelineOutput(BaseOutput):
12 | """
13 | Output class for Stable Diffusion pipelines.
14 |
15 | Args:
16 | images (`List[PIL.Image.Image]` or `np.ndarray`)
17 | List of denoised PIL images of length `batch_size` or NumPy array of shape `(batch_size, height, width,
18 | num_channels)`.
19 | nsfw_content_detected (`List[bool]`)
20 | List indicating whether the corresponding generated image contains "not-safe-for-work" (nsfw) content or
21 | `None` if safety checking could not be performed.
22 | """
23 |
24 | images: Union[List[PIL.Image.Image], np.ndarray]
25 | nsfw_content_detected: Optional[List[bool]]
26 |
27 |
28 | if is_flax_available():
29 | import flax
30 |
31 | @flax.struct.dataclass
32 | class FlaxStableDiffusionPipelineOutput(BaseOutput):
33 | """
34 | Output class for Flax-based Stable Diffusion pipelines.
35 |
36 | Args:
37 | images (`np.ndarray`):
38 | Denoised images of array shape of `(batch_size, height, width, num_channels)`.
39 | nsfw_content_detected (`List[bool]`):
40 | List indicating whether the corresponding generated image contains "not-safe-for-work" (nsfw) content
41 | or `None` if safety checking could not be performed.
42 | """
43 |
44 | images: np.ndarray
45 | nsfw_content_detected: List[bool]
46 |
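Pipelines return this dataclass when `return_dict=True`. Because it inherits from `BaseOutput`, it can be read both by attribute and by key; a small sketch constructing one by hand with placeholder data:

```python
import numpy as np
from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput

images = np.zeros((2, 64, 64, 3), dtype=np.float32)  # placeholder "denoised" images
out = StableDiffusionPipelineOutput(images=images, nsfw_content_detected=[False, False])

print(out.images.shape)              # (2, 64, 64, 3)
print(out["nsfw_content_detected"])  # [False, False]
print(list(out.keys()))              # ['images', 'nsfw_content_detected']
```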
--------------------------------------------------------------------------------
/llmga/diffusers/src/diffusers/pipelines/stable_diffusion_safe/pipeline_output.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 | from typing import List, Optional, Union
3 |
4 | import numpy as np
5 | import PIL.Image
6 |
7 | from ...utils import (
8 | BaseOutput,
9 | )
10 |
11 |
12 | @dataclass
13 | class StableDiffusionSafePipelineOutput(BaseOutput):
14 | """
15 | Output class for Safe Stable Diffusion pipelines.
16 |
17 | Args:
18 | images (`List[PIL.Image.Image]` or `np.ndarray`)
19 | List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width,
20 |             num_channels)`. PIL images or NumPy arrays represent the denoised images of the diffusion pipeline.
21 | nsfw_content_detected (`List[bool]`)
22 | List of flags denoting whether the corresponding generated image likely represents "not-safe-for-work"
23 | (nsfw) content, or `None` if safety checking could not be performed.
24 |         unsafe_images (`List[PIL.Image.Image]` or `np.ndarray`)
25 |             List of denoised PIL images that were flagged by the safety checker and may contain "not-safe-for-work"
26 |             (nsfw) content, or `None` if no safety check was performed or no images were flagged.
27 | applied_safety_concept (`str`)
28 | The safety concept that was applied for safety guidance, or `None` if safety guidance was disabled
29 | """
30 |
31 | images: Union[List[PIL.Image.Image], np.ndarray]
32 | nsfw_content_detected: Optional[List[bool]]
33 | unsafe_images: Optional[Union[List[PIL.Image.Image], np.ndarray]]
34 | applied_safety_concept: Optional[str]
35 |
--------------------------------------------------------------------------------
/llmga/diffusers/src/diffusers/pipelines/stable_diffusion_xl/pipeline_output.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 | from typing import List, Union
3 |
4 | import numpy as np
5 | import PIL.Image
6 |
7 | from ...utils import BaseOutput, is_flax_available
8 |
9 |
10 | @dataclass
11 | class StableDiffusionXLPipelineOutput(BaseOutput):
12 | """
13 |     Output class for Stable Diffusion XL pipelines.
14 |
15 | Args:
16 | images (`List[PIL.Image.Image]` or `np.ndarray`)
17 | List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width,
18 |             num_channels)`. PIL images or NumPy arrays represent the denoised images of the diffusion pipeline.
19 | """
20 |
21 | images: Union[List[PIL.Image.Image], np.ndarray]
22 |
23 |
24 | if is_flax_available():
25 | import flax
26 |
27 | @flax.struct.dataclass
28 | class FlaxStableDiffusionXLPipelineOutput(BaseOutput):
29 | """
30 | Output class for Flax Stable Diffusion XL pipelines.
31 |
32 | Args:
33 | images (`np.ndarray`)
34 | Array of shape `(batch_size, height, width, num_channels)` with images from the diffusion pipeline.
35 | """
36 |
37 | images: np.ndarray
38 |
--------------------------------------------------------------------------------
/llmga/diffusers/src/diffusers/pipelines/stable_diffusion_xl/watermark.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 |
4 | from ...utils import is_invisible_watermark_available
5 |
6 |
7 | if is_invisible_watermark_available():
8 | from imwatermark import WatermarkEncoder
9 |
10 |
11 | # Copied from https://github.com/Stability-AI/generative-models/blob/613af104c6b85184091d42d374fef420eddb356d/scripts/demo/streamlit_helpers.py#L66
12 | WATERMARK_MESSAGE = 0b101100111110110010010000011110111011000110011110
13 | # bin(x)[2:] gives bits of x as str, use int to convert them to 0/1
14 | WATERMARK_BITS = [int(bit) for bit in bin(WATERMARK_MESSAGE)[2:]]
15 |
16 |
17 | class StableDiffusionXLWatermarker:
18 | def __init__(self):
19 | self.watermark = WATERMARK_BITS
20 | self.encoder = WatermarkEncoder()
21 |
22 | self.encoder.set_watermark("bits", self.watermark)
23 |
24 | def apply_watermark(self, images: torch.FloatTensor):
25 | # can't encode images that are smaller than 256
26 | if images.shape[-1] < 256:
27 | return images
28 |
29 | images = (255 * (images / 2 + 0.5)).cpu().permute(0, 2, 3, 1).float().numpy()
30 |
31 | images = [self.encoder.encode(image, "dwtDct") for image in images]
32 |
33 | images = torch.from_numpy(np.array(images)).permute(0, 3, 1, 2)
34 |
35 | images = torch.clamp(2 * (images / 255 - 0.5), min=-1.0, max=1.0)
36 | return images
37 |
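A hedged usage sketch for the watermarker above; it needs the optional `invisible-watermark` package, and `apply_watermark` expects NCHW tensors roughly in `[-1, 1]` with a width of at least 256 pixels (smaller images are returned unchanged):

```python
import torch
from diffusers.pipelines.stable_diffusion_xl.watermark import StableDiffusionXLWatermarker

watermarker = StableDiffusionXLWatermarker()
images = torch.rand(1, 3, 512, 512) * 2 - 1   # fake decoded images in [-1, 1]
watermarked = watermarker.apply_watermark(images)
print(watermarked.shape)                      # torch.Size([1, 3, 512, 512])
```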
--------------------------------------------------------------------------------
/llmga/diffusers/src/diffusers/pipelines/stochastic_karras_ve/__init__.py:
--------------------------------------------------------------------------------
1 | from typing import TYPE_CHECKING
2 |
3 | from ...utils import DIFFUSERS_SLOW_IMPORT, _LazyModule
4 |
5 |
6 | _import_structure = {"pipeline_stochastic_karras_ve": ["KarrasVePipeline"]}
7 |
8 | if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
9 | from .pipeline_stochastic_karras_ve import KarrasVePipeline
10 |
11 | else:
12 | import sys
13 |
14 | sys.modules[__name__] = _LazyModule(
15 | __name__,
16 | globals()["__file__"],
17 | _import_structure,
18 | module_spec=__spec__,
19 | )
20 |
--------------------------------------------------------------------------------
/llmga/diffusers/src/diffusers/pipelines/t2i_adapter/__init__.py:
--------------------------------------------------------------------------------
1 | from typing import TYPE_CHECKING
2 |
3 | from ...utils import (
4 | DIFFUSERS_SLOW_IMPORT,
5 | OptionalDependencyNotAvailable,
6 | _LazyModule,
7 | get_objects_from_module,
8 | is_torch_available,
9 | is_transformers_available,
10 | )
11 |
12 |
13 | _dummy_objects = {}
14 | _import_structure = {}
15 |
16 | try:
17 | if not (is_transformers_available() and is_torch_available()):
18 | raise OptionalDependencyNotAvailable()
19 | except OptionalDependencyNotAvailable:
20 | from ...utils import dummy_torch_and_transformers_objects # noqa F403
21 |
22 | _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
23 | else:
24 | _import_structure["pipeline_stable_diffusion_adapter"] = ["StableDiffusionAdapterPipeline"]
25 | _import_structure["pipeline_stable_diffusion_xl_adapter"] = ["StableDiffusionXLAdapterPipeline"]
26 |
27 |
28 | if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
29 | try:
30 | if not (is_transformers_available() and is_torch_available()):
31 | raise OptionalDependencyNotAvailable()
32 | except OptionalDependencyNotAvailable:
33 | from ...utils.dummy_torch_and_transformers_objects import * # noqa F403
34 | else:
35 | from .pipeline_stable_diffusion_adapter import StableDiffusionAdapterPipeline
36 | from .pipeline_stable_diffusion_xl_adapter import StableDiffusionXLAdapterPipeline
37 | else:
38 | import sys
39 |
40 | sys.modules[__name__] = _LazyModule(
41 | __name__,
42 | globals()["__file__"],
43 | _import_structure,
44 | module_spec=__spec__,
45 | )
46 | for name, value in _dummy_objects.items():
47 | setattr(sys.modules[__name__], name, value)
48 |
--------------------------------------------------------------------------------
/llmga/diffusers/src/diffusers/pipelines/text_to_video_synthesis/pipeline_output.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 | from typing import List, Union
3 |
4 | import numpy as np
5 | import torch
6 |
7 | from ...utils import (
8 | BaseOutput,
9 | )
10 |
11 |
12 | @dataclass
13 | class TextToVideoSDPipelineOutput(BaseOutput):
14 | """
15 | Output class for text-to-video pipelines.
16 |
17 | Args:
18 | frames (`List[np.ndarray]` or `torch.FloatTensor`)
19 | List of denoised frames (essentially images) as NumPy arrays of shape `(height, width, num_channels)` or as
20 | a `torch` tensor. The length of the list denotes the video length (the number of frames).
21 | """
22 |
23 | frames: Union[List[np.ndarray], torch.FloatTensor]
24 |
--------------------------------------------------------------------------------
/llmga/diffusers/src/diffusers/py.typed:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dvlab-research/LLMGA/7e32eb6858c2fda168e921c12fa9622d9af9ce42/llmga/diffusers/src/diffusers/py.typed
--------------------------------------------------------------------------------
/llmga/diffusers/src/diffusers/schedulers/README.md:
--------------------------------------------------------------------------------
1 | # Schedulers
2 |
3 | For more information on the schedulers, please refer to the [docs](https://huggingface.co/docs/diffusers/api/schedulers/overview).
--------------------------------------------------------------------------------
/llmga/diffusers/src/diffusers/utils/doc_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """
15 | Doc utilities: Utilities related to documentation
16 | """
17 | import re
18 |
19 |
20 | def replace_example_docstring(example_docstring):
21 | def docstring_decorator(fn):
22 | func_doc = fn.__doc__
23 | lines = func_doc.split("\n")
24 | i = 0
25 | while i < len(lines) and re.search(r"^\s*Examples?:\s*$", lines[i]) is None:
26 | i += 1
27 | if i < len(lines):
28 | lines[i] = example_docstring
29 | func_doc = "\n".join(lines)
30 | else:
31 | raise ValueError(
32 | f"The function {fn} should have an empty 'Examples:' in its docstring as placeholder, "
33 | f"current docstring is:\n{func_doc}"
34 | )
35 | fn.__doc__ = func_doc
36 | return fn
37 |
38 | return docstring_decorator
39 |
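A small sketch of how the decorator above is used: the decorated function carries a bare `Examples:` placeholder line in its docstring, and the decorator swaps that line for the supplied example text:

```python
from diffusers.utils.doc_utils import replace_example_docstring

EXAMPLE_DOC_STRING = """
    Examples:
        >>> print("hello")
"""


@replace_example_docstring(EXAMPLE_DOC_STRING)
def run():
    """A toy function.

    Examples:
    """


print(run.__doc__)  # the bare "Examples:" placeholder is replaced by EXAMPLE_DOC_STRING
```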
--------------------------------------------------------------------------------
/llmga/diffusers/src/diffusers/utils/dummy_note_seq_objects.py:
--------------------------------------------------------------------------------
1 | # This file is autogenerated by the command `make fix-copies`, do not edit.
2 | from ..utils import DummyObject, requires_backends
3 |
4 |
5 | class MidiProcessor(metaclass=DummyObject):
6 | _backends = ["note_seq"]
7 |
8 | def __init__(self, *args, **kwargs):
9 | requires_backends(self, ["note_seq"])
10 |
11 | @classmethod
12 | def from_config(cls, *args, **kwargs):
13 | requires_backends(cls, ["note_seq"])
14 |
15 | @classmethod
16 | def from_pretrained(cls, *args, **kwargs):
17 | requires_backends(cls, ["note_seq"])
18 |
--------------------------------------------------------------------------------
/llmga/diffusers/src/diffusers/utils/dummy_onnx_objects.py:
--------------------------------------------------------------------------------
1 | # This file is autogenerated by the command `make fix-copies`, do not edit.
2 | from ..utils import DummyObject, requires_backends
3 |
4 |
5 | class OnnxRuntimeModel(metaclass=DummyObject):
6 | _backends = ["onnx"]
7 |
8 | def __init__(self, *args, **kwargs):
9 | requires_backends(self, ["onnx"])
10 |
11 | @classmethod
12 | def from_config(cls, *args, **kwargs):
13 | requires_backends(cls, ["onnx"])
14 |
15 | @classmethod
16 | def from_pretrained(cls, *args, **kwargs):
17 | requires_backends(cls, ["onnx"])
18 |
--------------------------------------------------------------------------------
/llmga/diffusers/src/diffusers/utils/dummy_torch_and_librosa_objects.py:
--------------------------------------------------------------------------------
1 | # This file is autogenerated by the command `make fix-copies`, do not edit.
2 | from ..utils import DummyObject, requires_backends
3 |
4 |
5 | class AudioDiffusionPipeline(metaclass=DummyObject):
6 | _backends = ["torch", "librosa"]
7 |
8 | def __init__(self, *args, **kwargs):
9 | requires_backends(self, ["torch", "librosa"])
10 |
11 | @classmethod
12 | def from_config(cls, *args, **kwargs):
13 | requires_backends(cls, ["torch", "librosa"])
14 |
15 | @classmethod
16 | def from_pretrained(cls, *args, **kwargs):
17 | requires_backends(cls, ["torch", "librosa"])
18 |
19 |
20 | class Mel(metaclass=DummyObject):
21 | _backends = ["torch", "librosa"]
22 |
23 | def __init__(self, *args, **kwargs):
24 | requires_backends(self, ["torch", "librosa"])
25 |
26 | @classmethod
27 | def from_config(cls, *args, **kwargs):
28 | requires_backends(cls, ["torch", "librosa"])
29 |
30 | @classmethod
31 | def from_pretrained(cls, *args, **kwargs):
32 | requires_backends(cls, ["torch", "librosa"])
33 |
--------------------------------------------------------------------------------
/llmga/diffusers/src/diffusers/utils/dummy_torch_and_scipy_objects.py:
--------------------------------------------------------------------------------
1 | # This file is autogenerated by the command `make fix-copies`, do not edit.
2 | from ..utils import DummyObject, requires_backends
3 |
4 |
5 | class LMSDiscreteScheduler(metaclass=DummyObject):
6 | _backends = ["torch", "scipy"]
7 |
8 | def __init__(self, *args, **kwargs):
9 | requires_backends(self, ["torch", "scipy"])
10 |
11 | @classmethod
12 | def from_config(cls, *args, **kwargs):
13 | requires_backends(cls, ["torch", "scipy"])
14 |
15 | @classmethod
16 | def from_pretrained(cls, *args, **kwargs):
17 | requires_backends(cls, ["torch", "scipy"])
18 |
--------------------------------------------------------------------------------
/llmga/diffusers/src/diffusers/utils/dummy_torch_and_torchsde_objects.py:
--------------------------------------------------------------------------------
1 | # This file is autogenerated by the command `make fix-copies`, do not edit.
2 | from ..utils import DummyObject, requires_backends
3 |
4 |
5 | class DPMSolverSDEScheduler(metaclass=DummyObject):
6 | _backends = ["torch", "torchsde"]
7 |
8 | def __init__(self, *args, **kwargs):
9 | requires_backends(self, ["torch", "torchsde"])
10 |
11 | @classmethod
12 | def from_config(cls, *args, **kwargs):
13 | requires_backends(cls, ["torch", "torchsde"])
14 |
15 | @classmethod
16 | def from_pretrained(cls, *args, **kwargs):
17 | requires_backends(cls, ["torch", "torchsde"])
18 |
--------------------------------------------------------------------------------
/llmga/diffusers/src/diffusers/utils/dummy_torch_and_transformers_and_k_diffusion_objects.py:
--------------------------------------------------------------------------------
1 | # This file is autogenerated by the command `make fix-copies`, do not edit.
2 | from ..utils import DummyObject, requires_backends
3 |
4 |
5 | class StableDiffusionKDiffusionPipeline(metaclass=DummyObject):
6 | _backends = ["torch", "transformers", "k_diffusion"]
7 |
8 | def __init__(self, *args, **kwargs):
9 | requires_backends(self, ["torch", "transformers", "k_diffusion"])
10 |
11 | @classmethod
12 | def from_config(cls, *args, **kwargs):
13 | requires_backends(cls, ["torch", "transformers", "k_diffusion"])
14 |
15 | @classmethod
16 | def from_pretrained(cls, *args, **kwargs):
17 | requires_backends(cls, ["torch", "transformers", "k_diffusion"])
18 |
--------------------------------------------------------------------------------
/llmga/diffusers/src/diffusers/utils/dummy_transformers_and_torch_and_note_seq_objects.py:
--------------------------------------------------------------------------------
1 | # This file is autogenerated by the command `make fix-copies`, do not edit.
2 | from ..utils import DummyObject, requires_backends
3 |
4 |
5 | class SpectrogramDiffusionPipeline(metaclass=DummyObject):
6 | _backends = ["transformers", "torch", "note_seq"]
7 |
8 | def __init__(self, *args, **kwargs):
9 | requires_backends(self, ["transformers", "torch", "note_seq"])
10 |
11 | @classmethod
12 | def from_config(cls, *args, **kwargs):
13 | requires_backends(cls, ["transformers", "torch", "note_seq"])
14 |
15 | @classmethod
16 | def from_pretrained(cls, *args, **kwargs):
17 | requires_backends(cls, ["transformers", "torch", "note_seq"])
18 |
--------------------------------------------------------------------------------
/llmga/diffusers/src/diffusers/utils/loading_utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | from typing import Union
3 |
4 | import PIL.Image
5 | import PIL.ImageOps
6 | import requests
7 |
8 |
9 | def load_image(image: Union[str, PIL.Image.Image]) -> PIL.Image.Image:
10 | """
11 | Loads `image` to a PIL Image.
12 |
13 | Args:
14 | image (`str` or `PIL.Image.Image`):
15 | The image to convert to the PIL Image format.
16 | Returns:
17 | `PIL.Image.Image`:
18 | A PIL Image.
19 | """
20 | if isinstance(image, str):
21 | if image.startswith("http://") or image.startswith("https://"):
22 | image = PIL.Image.open(requests.get(image, stream=True).raw)
23 | elif os.path.isfile(image):
24 | image = PIL.Image.open(image)
25 | else:
26 | raise ValueError(
27 | f"Incorrect path or url, URLs must start with `http://` or `https://`, and {image} is not a valid path"
28 | )
29 | elif isinstance(image, PIL.Image.Image):
30 |         pass  # already a PIL.Image.Image instance
31 | else:
32 | raise ValueError(
33 | "Incorrect format used for image. Should be an url linking to an image, a local path, or a PIL image."
34 | )
35 | image = PIL.ImageOps.exif_transpose(image)
36 | image = image.convert("RGB")
37 | return image
38 |
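A short, self-contained usage sketch for `load_image`; it also accepts an `http(s)` URL pointing at an image:

```python
import PIL.Image

from diffusers.utils.loading_utils import load_image

# Write a tiny throwaway image so the snippet runs without external files.
PIL.Image.new("RGBA", (8, 8), (255, 0, 0, 255)).save("tiny.png")

img = load_image("tiny.png")
print(img.size, img.mode)  # (8, 8) 'RGB'; the helper always converts to RGB
```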
--------------------------------------------------------------------------------
/llmga/diffusers/src/diffusers/utils/model_card_template.md:
--------------------------------------------------------------------------------
1 | ---
2 | {{ card_data }}
3 | ---
4 |
5 |
7 |
8 | # {{ model_name | default("Diffusion Model") }}
9 |
10 | ## Model description
11 |
12 | This diffusion model is trained with the [🤗 Diffusers](https://github.com/huggingface/diffusers) library
13 | on the `{{ dataset_name }}` dataset.
14 |
15 | ## Intended uses & limitations
16 |
17 | #### How to use
18 |
19 | ```python
20 | # TODO: add an example code snippet for running this diffusion pipeline
21 | ```
22 |
23 | #### Limitations and bias
24 |
25 | [TODO: provide examples of latent issues and potential remediations]
26 |
27 | ## Training data
28 |
29 | [TODO: describe the data used to train the model]
30 |
31 | ### Training hyperparameters
32 |
33 | The following hyperparameters were used during training:
34 | - learning_rate: {{ learning_rate }}
35 | - train_batch_size: {{ train_batch_size }}
36 | - eval_batch_size: {{ eval_batch_size }}
37 | - gradient_accumulation_steps: {{ gradient_accumulation_steps }}
38 | - optimizer: AdamW with betas=({{ adam_beta1 }}, {{ adam_beta2 }}), weight_decay={{ adam_weight_decay }} and epsilon={{ adam_epsilon }}
39 | - lr_scheduler: {{ lr_scheduler }}
40 | - lr_warmup_steps: {{ lr_warmup_steps }}
41 | - ema_inv_gamma: {{ ema_inv_gamma }}
42 | - ema_power: {{ ema_power }}
43 | - ema_max_decay: {{ ema_max_decay }}
44 | - mixed_precision: {{ mixed_precision }}
45 |
46 | ### Training results
47 |
48 | 📈 [TensorBoard logs](https://huggingface.co/{{ repo_name }}/tensorboard?#scalars)
49 |
50 |
51 |
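The file above is a Jinja2 template (note the `{{ ... }}` placeholders and the `default` filter); training scripts render it to produce a model card. A hedged sketch of that rendering, assuming the template has been saved locally as `model_card_template.md` and using placeholder hyperparameter values:

```python
from pathlib import Path

from jinja2 import Template

template = Template(Path("model_card_template.md").read_text())
card = template.render(
    card_data="license: apache-2.0",
    model_name="my-ddpm-model",      # placeholder
    dataset_name="my-dataset",       # placeholder
    learning_rate=1e-4,
    train_batch_size=16,
    eval_batch_size=16,
    gradient_accumulation_steps=1,
    adam_beta1=0.95, adam_beta2=0.999, adam_weight_decay=1e-6, adam_epsilon=1e-8,
    lr_scheduler="cosine", lr_warmup_steps=500,
    ema_inv_gamma=1.0, ema_power=0.75, ema_max_decay=0.9999,
    mixed_precision="fp16",
    repo_name="user/my-ddpm-model",  # placeholder
)
print(card.splitlines()[0])  # "---", the start of the YAML front matter
```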
--------------------------------------------------------------------------------
/llmga/diffusers/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dvlab-research/LLMGA/7e32eb6858c2fda168e921c12fa9622d9af9ce42/llmga/diffusers/tests/__init__.py
--------------------------------------------------------------------------------
/llmga/diffusers/tests/fixtures/elise_format0.mid:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dvlab-research/LLMGA/7e32eb6858c2fda168e921c12fa9622d9af9ce42/llmga/diffusers/tests/fixtures/elise_format0.mid
--------------------------------------------------------------------------------
/llmga/diffusers/tests/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dvlab-research/LLMGA/7e32eb6858c2fda168e921c12fa9622d9af9ce42/llmga/diffusers/tests/models/__init__.py
--------------------------------------------------------------------------------
/llmga/diffusers/tests/models/test_models_vae_flax.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from diffusers import FlaxAutoencoderKL
4 | from diffusers.utils import is_flax_available
5 | from diffusers.utils.testing_utils import require_flax
6 |
7 | from .test_modeling_common_flax import FlaxModelTesterMixin
8 |
9 |
10 | if is_flax_available():
11 | import jax
12 |
13 |
14 | @require_flax
15 | class FlaxAutoencoderKLTests(FlaxModelTesterMixin, unittest.TestCase):
16 | model_class = FlaxAutoencoderKL
17 |
18 | @property
19 | def dummy_input(self):
20 | batch_size = 4
21 | num_channels = 3
22 | sizes = (32, 32)
23 |
24 | prng_key = jax.random.PRNGKey(0)
25 | image = jax.random.uniform(prng_key, ((batch_size, num_channels) + sizes))
26 |
27 | return {"sample": image, "prng_key": prng_key}
28 |
29 | def prepare_init_args_and_inputs_for_common(self):
30 | init_dict = {
31 | "block_out_channels": [32, 64],
32 | "in_channels": 3,
33 | "out_channels": 3,
34 | "down_block_types": ["DownEncoderBlock2D", "DownEncoderBlock2D"],
35 | "up_block_types": ["UpDecoderBlock2D", "UpDecoderBlock2D"],
36 | "latent_channels": 4,
37 | }
38 | inputs_dict = self.dummy_input
39 | return init_dict, inputs_dict
40 |
--------------------------------------------------------------------------------
/llmga/diffusers/tests/pipelines/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dvlab-research/LLMGA/7e32eb6858c2fda168e921c12fa9622d9af9ce42/llmga/diffusers/tests/pipelines/__init__.py
--------------------------------------------------------------------------------
/llmga/diffusers/tests/pipelines/altdiffusion/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dvlab-research/LLMGA/7e32eb6858c2fda168e921c12fa9622d9af9ce42/llmga/diffusers/tests/pipelines/altdiffusion/__init__.py
--------------------------------------------------------------------------------
/llmga/diffusers/tests/pipelines/audio_diffusion/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dvlab-research/LLMGA/7e32eb6858c2fda168e921c12fa9622d9af9ce42/llmga/diffusers/tests/pipelines/audio_diffusion/__init__.py
--------------------------------------------------------------------------------
/llmga/diffusers/tests/pipelines/audioldm/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dvlab-research/LLMGA/7e32eb6858c2fda168e921c12fa9622d9af9ce42/llmga/diffusers/tests/pipelines/audioldm/__init__.py
--------------------------------------------------------------------------------
/llmga/diffusers/tests/pipelines/audioldm2/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dvlab-research/LLMGA/7e32eb6858c2fda168e921c12fa9622d9af9ce42/llmga/diffusers/tests/pipelines/audioldm2/__init__.py
--------------------------------------------------------------------------------
/llmga/diffusers/tests/pipelines/blipdiffusion/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dvlab-research/LLMGA/7e32eb6858c2fda168e921c12fa9622d9af9ce42/llmga/diffusers/tests/pipelines/blipdiffusion/__init__.py
--------------------------------------------------------------------------------
/llmga/diffusers/tests/pipelines/consistency_models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dvlab-research/LLMGA/7e32eb6858c2fda168e921c12fa9622d9af9ce42/llmga/diffusers/tests/pipelines/consistency_models/__init__.py
--------------------------------------------------------------------------------
/llmga/diffusers/tests/pipelines/controlnet/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dvlab-research/LLMGA/7e32eb6858c2fda168e921c12fa9622d9af9ce42/llmga/diffusers/tests/pipelines/controlnet/__init__.py
--------------------------------------------------------------------------------
/llmga/diffusers/tests/pipelines/dance_diffusion/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dvlab-research/LLMGA/7e32eb6858c2fda168e921c12fa9622d9af9ce42/llmga/diffusers/tests/pipelines/dance_diffusion/__init__.py
--------------------------------------------------------------------------------
/llmga/diffusers/tests/pipelines/ddim/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dvlab-research/LLMGA/7e32eb6858c2fda168e921c12fa9622d9af9ce42/llmga/diffusers/tests/pipelines/ddim/__init__.py
--------------------------------------------------------------------------------
/llmga/diffusers/tests/pipelines/ddpm/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dvlab-research/LLMGA/7e32eb6858c2fda168e921c12fa9622d9af9ce42/llmga/diffusers/tests/pipelines/ddpm/__init__.py
--------------------------------------------------------------------------------
/llmga/diffusers/tests/pipelines/dit/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dvlab-research/LLMGA/7e32eb6858c2fda168e921c12fa9622d9af9ce42/llmga/diffusers/tests/pipelines/dit/__init__.py
--------------------------------------------------------------------------------
/llmga/diffusers/tests/pipelines/kandinsky/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dvlab-research/LLMGA/7e32eb6858c2fda168e921c12fa9622d9af9ce42/llmga/diffusers/tests/pipelines/kandinsky/__init__.py
--------------------------------------------------------------------------------
/llmga/diffusers/tests/pipelines/kandinsky2_2/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dvlab-research/LLMGA/7e32eb6858c2fda168e921c12fa9622d9af9ce42/llmga/diffusers/tests/pipelines/kandinsky2_2/__init__.py
--------------------------------------------------------------------------------
/llmga/diffusers/tests/pipelines/karras_ve/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dvlab-research/LLMGA/7e32eb6858c2fda168e921c12fa9622d9af9ce42/llmga/diffusers/tests/pipelines/karras_ve/__init__.py
--------------------------------------------------------------------------------
/llmga/diffusers/tests/pipelines/latent_diffusion/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dvlab-research/LLMGA/7e32eb6858c2fda168e921c12fa9622d9af9ce42/llmga/diffusers/tests/pipelines/latent_diffusion/__init__.py
--------------------------------------------------------------------------------
/llmga/diffusers/tests/pipelines/musicldm/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dvlab-research/LLMGA/7e32eb6858c2fda168e921c12fa9622d9af9ce42/llmga/diffusers/tests/pipelines/musicldm/__init__.py
--------------------------------------------------------------------------------
/llmga/diffusers/tests/pipelines/paint_by_example/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dvlab-research/LLMGA/7e32eb6858c2fda168e921c12fa9622d9af9ce42/llmga/diffusers/tests/pipelines/paint_by_example/__init__.py
--------------------------------------------------------------------------------
/llmga/diffusers/tests/pipelines/pndm/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dvlab-research/LLMGA/7e32eb6858c2fda168e921c12fa9622d9af9ce42/llmga/diffusers/tests/pipelines/pndm/__init__.py
--------------------------------------------------------------------------------
/llmga/diffusers/tests/pipelines/repaint/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dvlab-research/LLMGA/7e32eb6858c2fda168e921c12fa9622d9af9ce42/llmga/diffusers/tests/pipelines/repaint/__init__.py
--------------------------------------------------------------------------------
/llmga/diffusers/tests/pipelines/score_sde_ve/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dvlab-research/LLMGA/7e32eb6858c2fda168e921c12fa9622d9af9ce42/llmga/diffusers/tests/pipelines/score_sde_ve/__init__.py
--------------------------------------------------------------------------------
/llmga/diffusers/tests/pipelines/semantic_stable_diffusion/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dvlab-research/LLMGA/7e32eb6858c2fda168e921c12fa9622d9af9ce42/llmga/diffusers/tests/pipelines/semantic_stable_diffusion/__init__.py
--------------------------------------------------------------------------------
/llmga/diffusers/tests/pipelines/shap_e/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dvlab-research/LLMGA/7e32eb6858c2fda168e921c12fa9622d9af9ce42/llmga/diffusers/tests/pipelines/shap_e/__init__.py
--------------------------------------------------------------------------------
/llmga/diffusers/tests/pipelines/spectrogram_diffusion/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dvlab-research/LLMGA/7e32eb6858c2fda168e921c12fa9622d9af9ce42/llmga/diffusers/tests/pipelines/spectrogram_diffusion/__init__.py
--------------------------------------------------------------------------------
/llmga/diffusers/tests/pipelines/stable_diffusion/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dvlab-research/LLMGA/7e32eb6858c2fda168e921c12fa9622d9af9ce42/llmga/diffusers/tests/pipelines/stable_diffusion/__init__.py
--------------------------------------------------------------------------------
/llmga/diffusers/tests/pipelines/stable_diffusion_2/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dvlab-research/LLMGA/7e32eb6858c2fda168e921c12fa9622d9af9ce42/llmga/diffusers/tests/pipelines/stable_diffusion_2/__init__.py
--------------------------------------------------------------------------------
/llmga/diffusers/tests/pipelines/stable_diffusion_safe/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dvlab-research/LLMGA/7e32eb6858c2fda168e921c12fa9622d9af9ce42/llmga/diffusers/tests/pipelines/stable_diffusion_safe/__init__.py
--------------------------------------------------------------------------------
/llmga/diffusers/tests/pipelines/stable_diffusion_xl/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dvlab-research/LLMGA/7e32eb6858c2fda168e921c12fa9622d9af9ce42/llmga/diffusers/tests/pipelines/stable_diffusion_xl/__init__.py
--------------------------------------------------------------------------------
/llmga/diffusers/tests/pipelines/stable_unclip/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dvlab-research/LLMGA/7e32eb6858c2fda168e921c12fa9622d9af9ce42/llmga/diffusers/tests/pipelines/stable_unclip/__init__.py
--------------------------------------------------------------------------------
/llmga/diffusers/tests/pipelines/test_pipelines_onnx_common.py:
--------------------------------------------------------------------------------
1 | from diffusers.utils.testing_utils import require_onnxruntime
2 |
3 |
4 | @require_onnxruntime
5 | class OnnxPipelineTesterMixin:
6 | """
7 | This mixin is designed to be used with unittest.TestCase classes.
8 | It provides a set of common tests for each ONNXRuntime pipeline, e.g. saving and loading the pipeline,
9 | equivalence of dict and tuple outputs, etc.
10 | """
11 |
12 | pass
13 |
--------------------------------------------------------------------------------
/llmga/diffusers/tests/pipelines/text_to_video_synthesis/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dvlab-research/LLMGA/7e32eb6858c2fda168e921c12fa9622d9af9ce42/llmga/diffusers/tests/pipelines/text_to_video_synthesis/__init__.py
--------------------------------------------------------------------------------
/llmga/diffusers/tests/pipelines/unclip/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dvlab-research/LLMGA/7e32eb6858c2fda168e921c12fa9622d9af9ce42/llmga/diffusers/tests/pipelines/unclip/__init__.py
--------------------------------------------------------------------------------
/llmga/diffusers/tests/pipelines/unidiffuser/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dvlab-research/LLMGA/7e32eb6858c2fda168e921c12fa9622d9af9ce42/llmga/diffusers/tests/pipelines/unidiffuser/__init__.py
--------------------------------------------------------------------------------
/llmga/diffusers/tests/pipelines/versatile_diffusion/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dvlab-research/LLMGA/7e32eb6858c2fda168e921c12fa9622d9af9ce42/llmga/diffusers/tests/pipelines/versatile_diffusion/__init__.py
--------------------------------------------------------------------------------
/llmga/diffusers/tests/pipelines/vq_diffusion/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dvlab-research/LLMGA/7e32eb6858c2fda168e921c12fa9622d9af9ce42/llmga/diffusers/tests/pipelines/vq_diffusion/__init__.py
--------------------------------------------------------------------------------
/llmga/diffusers/tests/pipelines/wuerstchen/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dvlab-research/LLMGA/7e32eb6858c2fda168e921c12fa9622d9af9ce42/llmga/diffusers/tests/pipelines/wuerstchen/__init__.py
--------------------------------------------------------------------------------
/llmga/diffusers/tests/schedulers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dvlab-research/LLMGA/7e32eb6858c2fda168e921c12fa9622d9af9ce42/llmga/diffusers/tests/schedulers/__init__.py
--------------------------------------------------------------------------------
/llmga/diffusers/utils/get_modified_files.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2023 The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | # this script reports modified .py files under the desired list of top-level sub-dirs passed as a list of arguments, e.g.:
17 | # python ./utils/get_modified_files.py utils src tests examples
18 | #
19 | # it uses git to find the forking point and which files were modified - i.e. files not under git won't be considered
20 | # since the output of this script is fed into Makefile commands it doesn't print a newline after the results
21 |
22 | import re
23 | import subprocess
24 | import sys
25 |
26 |
27 | fork_point_sha = subprocess.check_output("git merge-base main HEAD".split()).decode("utf-8")
28 | modified_files = subprocess.check_output(f"git diff --name-only {fork_point_sha}".split()).decode("utf-8").split()
29 |
30 | joined_dirs = "|".join(sys.argv[1:])
31 | regex = re.compile(rf"^({joined_dirs}).*?\.py$")
32 |
33 | relevant_modified_files = [x for x in modified_files if regex.match(x)]
34 | print(" ".join(relevant_modified_files), end="")
35 |
--------------------------------------------------------------------------------
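
To make the filtering step concrete, here is a stand-alone sketch of the same regex logic applied to hand-written example paths (the file names are made up for illustration; only files ending in .py under the listed top-level directories survive):

    import re

    # Example top-level dirs, as they would arrive via sys.argv[1:].
    joined_dirs = "|".join(["utils", "src", "tests", "examples"])
    regex = re.compile(rf"^({joined_dirs}).*?\.py$")

    candidates = [
        "src/diffusers/pipelines/pipeline_utils.py",   # kept: under src/ and ends in .py
        "docs/source/en/index.md",                     # dropped: not a listed dir, not .py
        "tests/pipelines/test_pipelines_common.py",    # kept
        "setup.py",                                    # dropped: not under a listed dir
    ]
    print(" ".join(p for p in candidates if regex.match(p)), end="")
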
/llmga/diffusers/utils/print_env.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | # coding=utf-8
4 | # Copyright 2023 The HuggingFace Inc. team.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | # this script dumps information about the environment
19 |
20 | import os
21 | import platform
22 | import sys
23 |
24 |
25 | os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
26 |
27 | print("Python version:", sys.version)
28 |
29 | print("OS platform:", platform.platform())
30 | print("OS architecture:", platform.machine())
31 |
32 | try:
33 | import torch
34 |
35 | print("Torch version:", torch.__version__)
36 | print("Cuda available:", torch.cuda.is_available())
37 | print("Cuda version:", torch.version.cuda)
38 | print("CuDNN version:", torch.backends.cudnn.version())
39 | print("Number of GPUs available:", torch.cuda.device_count())
40 | except ImportError:
41 | print("Torch version:", None)
42 |
43 | try:
44 | import transformers
45 |
46 | print("transformers version:", transformers.__version__)
47 | except ImportError:
48 | print("transformers version:", None)
49 |
--------------------------------------------------------------------------------
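
The same guarded-import pattern extends naturally to other packages used in this repository; the following lines are a sketch of additional reports, not part of the script above:

    # Sketch: report more package versions with the same try/except guard.
    try:
        import diffusers

        print("diffusers version:", diffusers.__version__)
    except ImportError:
        print("diffusers version:", None)

    try:
        import accelerate

        print("accelerate version:", accelerate.__version__)
    except ImportError:
        print("accelerate version:", None)
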
/llmga/llava/__init__.py:
--------------------------------------------------------------------------------
1 | from .model import LlavaLlamaForCausalLM
2 |
--------------------------------------------------------------------------------
/llmga/llava/constants.py:
--------------------------------------------------------------------------------
1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30
2 | WORKER_HEART_BEAT_INTERVAL = 15
3 |
4 | LOGDIR = "."
5 |
6 | # Model Constants
7 | IGNORE_INDEX = -100
8 | IMAGE_TOKEN_INDEX = -200
9 | DEFAULT_IMAGE_TOKEN = "<image>"
10 | DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
11 | DEFAULT_IM_START_TOKEN = "<im_start>"
12 | DEFAULT_IM_END_TOKEN = "<im_end>"
13 | IMAGE_PLACEHOLDER = "<image-placeholder>"
14 | DEFAULT_OUTPUT_START_TOKEN = ""
15 | DEFAULT_OUTPUT_END_TOKEN = ""
16 | DEFAULT_EDIT_START_TOKEN = ""
17 | DEFAULT_EDIT_END_TOKEN = ""
18 |
--------------------------------------------------------------------------------
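
These sentinel values follow the usual LLaVA conventions: IMAGE_TOKEN_INDEX marks where projected image features are spliced into the token sequence, and IGNORE_INDEX (the default ignore value of PyTorch's cross-entropy loss) masks positions that should not be supervised. A minimal, repository-independent sketch of how such sentinels are typically used; the helper below is illustrative, not the project's actual preprocessing code:

    IGNORE_INDEX = -100
    IMAGE_TOKEN_INDEX = -200

    def build_inputs(prompt_ids, answer_ids):
        # Place the image sentinel ahead of the prompt tokens; the model later
        # replaces it with projected vision features.
        input_ids = [IMAGE_TOKEN_INDEX] + prompt_ids + answer_ids
        # Supervise only the answer tokens; everything else is masked with -100
        # so the loss ignores it.
        labels = [IGNORE_INDEX] * (1 + len(prompt_ids)) + answer_ids
        return input_ids, labels

    ids, labels = build_inputs([101, 102, 103], [201, 202])
    print(ids)     # [-200, 101, 102, 103, 201, 202]
    print(labels)  # [-100, -100, -100, -100, 201, 202]
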
/llmga/llava/eval/webpage/figures/alpaca.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dvlab-research/LLMGA/7e32eb6858c2fda168e921c12fa9622d9af9ce42/llmga/llava/eval/webpage/figures/alpaca.png
--------------------------------------------------------------------------------
/llmga/llava/eval/webpage/figures/bard.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dvlab-research/LLMGA/7e32eb6858c2fda168e921c12fa9622d9af9ce42/llmga/llava/eval/webpage/figures/bard.jpg
--------------------------------------------------------------------------------
/llmga/llava/eval/webpage/figures/chatgpt.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/llmga/llava/eval/webpage/figures/llama.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dvlab-research/LLMGA/7e32eb6858c2fda168e921c12fa9622d9af9ce42/llmga/llava/eval/webpage/figures/llama.jpg
--------------------------------------------------------------------------------
/llmga/llava/eval/webpage/figures/swords_FILL0_wght300_GRAD0_opsz48.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/llmga/llava/eval/webpage/figures/vicuna.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dvlab-research/LLMGA/7e32eb6858c2fda168e921c12fa9622d9af9ce42/llmga/llava/eval/webpage/figures/vicuna.jpeg
--------------------------------------------------------------------------------
/llmga/llava/masks/__init__.py:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/llmga/llava/model/__init__.py:
--------------------------------------------------------------------------------
1 | try:
2 | from .language_model.llava_llama import LlavaLlamaForCausalLM, LlavaConfig
3 | from .language_model.llava_mpt import LlavaMptForCausalLM, LlavaMptConfig
4 | from .language_model.llava_mistral import LlavaMistralForCausalLM, LlavaMistralConfig
5 | from .language_model.llava_gemma import LlavaGemmaForCausalLM, LlavaGemmaConfig
6 | from .language_model.llava_phi3 import LlavaPhi3ForCausalLM, LlavaPhi3Config
7 | from .language_model.llava_qwen2 import LlavaQwen2ForCausalLM, LlavaQwen2Config
8 | except:
9 | pass
10 |
11 |
--------------------------------------------------------------------------------
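
The bare try/except keeps the package importable when some optional language-model backends (and their dependencies) are missing, at the cost of silently hiding any import error. Shown only as an illustrative alternative, not the repository's code, one could guard each backend separately and record what failed; the imported classes would still need to be re-exported to mirror the original behaviour:

    import importlib
    import warnings

    _OPTIONAL_BACKENDS = [
        "llava_llama",
        "llava_mpt",
        "llava_mistral",
        "llava_gemma",
        "llava_phi3",
        "llava_qwen2",
    ]

    # When placed in this package's __init__, __name__ is the package name, so
    # relative imports resolve against it.
    for _name in _OPTIONAL_BACKENDS:
        try:
            importlib.import_module(f".language_model.{_name}", package=__name__)
        except ImportError as exc:
            warnings.warn(f"Skipping optional backend {_name}: {exc}")
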
/llmga/llava/model/consolidate.py:
--------------------------------------------------------------------------------
1 | """
2 | Usage:
3 | python3 -m llmga.llava.model.consolidate --src ~/model_weights/llava-7b --dst ~/model_weights/llava-7b_consolidate
4 | """
5 | import argparse
6 |
7 | import torch
8 | from transformers import AutoTokenizer, AutoModelForCausalLM
9 | from llmga.llava.model import *
10 | from llmga.llava.model.utils import auto_upgrade
11 |
12 |
13 | def consolidate_ckpt(src_path, dst_path):
14 | print("Loading model")
15 | auto_upgrade(src_path)
16 | src_model = AutoModelForCausalLM.from_pretrained(src_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)
17 | src_tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False)
18 | src_model.save_pretrained(dst_path)
19 | src_tokenizer.save_pretrained(dst_path)
20 |
21 |
22 | if __name__ == "__main__":
23 | parser = argparse.ArgumentParser()
24 | parser.add_argument("--src", type=str, required=True)
25 | parser.add_argument("--dst", type=str, required=True)
26 |
27 | args = parser.parse_args()
28 |
29 | consolidate_ckpt(args.src, args.dst)
30 |
--------------------------------------------------------------------------------
/llmga/llava/model/multimodal_encoder/builder.py:
--------------------------------------------------------------------------------
1 | import os
2 | from .clip_encoder import CLIPVisionTower
3 |
4 |
5 | def build_vision_tower(vision_tower_cfg, **kwargs):
6 | vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None))
7 | is_absolute_path_exists = os.path.exists(vision_tower)
8 | if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion"):
9 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)
10 |
11 | raise ValueError(f'Unknown vision tower: {vision_tower}')
12 |
--------------------------------------------------------------------------------
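
build_vision_tower only reads attributes off the config object it is given, so any attribute container works. A hypothetical call is sketched below; the extra select_* attributes and the delay_load keyword follow the upstream LLaVA CLIPVisionTower and are assumptions about clip_encoder.py, which is not shown here:

    from types import SimpleNamespace

    from llmga.llava.model.multimodal_encoder.builder import build_vision_tower

    # Hypothetical config: the builder accepts any object exposing the
    # attributes it looks up with getattr.
    cfg = SimpleNamespace(
        mm_vision_tower="openai/clip-vit-large-patch14-336",
        mm_vision_select_layer=-2,
        mm_vision_select_feature="patch",
    )
    vision_tower = build_vision_tower(cfg, delay_load=True)
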
/llmga/llava/model/multimodal_projector/builder.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import re
4 |
5 |
6 | class IdentityMap(nn.Module):
7 | def __init__(self):
8 | super().__init__()
9 |
10 | def forward(self, x, *args, **kwargs):
11 | return x
12 |
13 | @property
14 | def config(self):
15 | return {"mm_projector_type": 'identity'}
16 |
17 |
18 | class SimpleResBlock(nn.Module):
19 | def __init__(self, channels):
20 | super().__init__()
21 | self.pre_norm = nn.LayerNorm(channels)
22 |
23 | self.proj = nn.Sequential(
24 | nn.Linear(channels, channels),
25 | nn.GELU(),
26 | nn.Linear(channels, channels)
27 | )
28 | def forward(self, x):
29 | x = self.pre_norm(x)
30 | return x + self.proj(x)
31 |
32 |
33 | def build_vision_projector(config, delay_load=False, **kwargs):
34 | projector_type = getattr(config, 'mm_projector_type', 'linear')
35 |
36 | if projector_type == 'linear':
37 | return nn.Linear(config.mm_hidden_size, config.hidden_size)
38 |
39 | mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type)
40 | if mlp_gelu_match:
41 | mlp_depth = int(mlp_gelu_match.group(1))
42 | modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)]
43 | for _ in range(1, mlp_depth):
44 | modules.append(nn.GELU())
45 | modules.append(nn.Linear(config.hidden_size, config.hidden_size))
46 | return nn.Sequential(*modules)
47 |
48 | if projector_type == 'identity':
49 | return IdentityMap()
50 |
51 | raise ValueError(f'Unknown projector type: {projector_type}')
52 |
--------------------------------------------------------------------------------
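
For example, mm_projector_type="mlp2x_gelu" produces Linear(mm_hidden_size, hidden_size) -> GELU -> Linear(hidden_size, hidden_size). A quick sketch with stand-in dimensions (1024 is the CLIP ViT-L/14 feature width; 2048 is an arbitrary language-model hidden size):

    from types import SimpleNamespace

    import torch

    from llmga.llava.model.multimodal_projector.builder import build_vision_projector

    cfg = SimpleNamespace(mm_projector_type="mlp2x_gelu", mm_hidden_size=1024, hidden_size=2048)
    projector = build_vision_projector(cfg)

    features = torch.randn(1, 576, 1024)   # (batch, image patches, CLIP dim)
    print(projector(features).shape)       # torch.Size([1, 576, 2048])
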
/llmga/llava/model/utils.py:
--------------------------------------------------------------------------------
1 | from transformers import AutoConfig
2 |
3 |
4 | def auto_upgrade(config):
5 | cfg = AutoConfig.from_pretrained(config)
6 | if 'llava' in config and 'llava' not in cfg.model_type:
7 | assert cfg.model_type == 'llama'
8 | print("You are using newer LLaVA code base, while the checkpoint of v0 is from older code base.")
9 | print("You must upgrade the checkpoint to the new code base (this can be done automatically).")
10 | confirm = input("Please confirm that you want to upgrade the checkpoint. [Y/N]")
11 | if confirm.lower() in ["y", "yes"]:
12 | print("Upgrading checkpoint...")
13 | assert len(cfg.architectures) == 1
14 | setattr(cfg.__class__, "model_type", "llava")
15 | cfg.architectures[0] = 'LlavaLlamaForCausalLM'
16 | cfg.save_pretrained(config)
17 | print("Checkpoint upgraded.")
18 | else:
19 | print("Checkpoint upgrade aborted.")
20 | exit(1)
21 |
--------------------------------------------------------------------------------
/llmga/llava/train/pretrain_mem.py:
--------------------------------------------------------------------------------
1 | from llmga.llava.train.pretrain import train
2 | import os
3 | # os.environ["WANDB_DISABLED"]="true"
4 | if __name__ == "__main__":
5 | train(attn_implementation="flash_attention_2")
6 |
--------------------------------------------------------------------------------
/llmga/llava/train/train_mem.py:
--------------------------------------------------------------------------------
1 | from llmga.llava.train.train import train
2 | import os
3 | os.environ["WANDB_DISABLED"]="true"
4 | if __name__ == "__main__":
5 | train(attn_implementation="flash_attention_2")
6 |
--------------------------------------------------------------------------------
/llmga/llava/train/train_xformers.py:
--------------------------------------------------------------------------------
1 | # Make it more memory efficient by monkey patching the LLaMA model with xformers attention.
2 |
3 | # Need to call this before importing transformers.
4 | from llmga.llava.train.llama_xformers_attn_monkey_patch import (
5 | replace_llama_attn_with_xformers_attn,
6 | )
7 |
8 | replace_llama_attn_with_xformers_attn()
9 |
10 | from llmga.llava.train.train import train
11 |
12 | if __name__ == "__main__":
13 | train()
14 |
--------------------------------------------------------------------------------
/llmga/serve/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dvlab-research/LLMGA/7e32eb6858c2fda168e921c12fa9622d9af9ce42/llmga/serve/__init__.py
--------------------------------------------------------------------------------
/llmga/serve/examples/aes.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dvlab-research/LLMGA/7e32eb6858c2fda168e921c12fa9622d9af9ce42/llmga/serve/examples/aes.png
--------------------------------------------------------------------------------
/llmga/serve/examples/jiateng.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dvlab-research/LLMGA/7e32eb6858c2fda168e921c12fa9622d9af9ce42/llmga/serve/examples/jiateng.png
--------------------------------------------------------------------------------
/llmga/serve/examples/snow_scene.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dvlab-research/LLMGA/7e32eb6858c2fda168e921c12fa9622d9af9ce42/llmga/serve/examples/snow_scene.jpeg
--------------------------------------------------------------------------------
/llmga/serve/register_worker.py:
--------------------------------------------------------------------------------
1 | """
2 | Manually register workers.
3 |
4 | Usage:
5 | python3 -m fastchat.serve.register_worker --controller http://localhost:21001 --worker-name http://localhost:21002
6 | """
7 |
8 | import argparse
9 |
10 | import requests
11 |
12 | if __name__ == "__main__":
13 | parser = argparse.ArgumentParser()
14 | parser.add_argument("--controller-address", type=str)
15 | parser.add_argument("--worker-name", type=str)
16 | parser.add_argument("--check-heart-beat", action="store_true")
17 | args = parser.parse_args()
18 |
19 | url = args.controller_address + "/register_worker"
20 | data = {
21 | "worker_name": args.worker_name,
22 | "check_heart_beat": args.check_heart_beat,
23 | "worker_status": None,
24 | }
25 | r = requests.post(url, json=data)
26 | assert r.status_code == 200
27 |
--------------------------------------------------------------------------------
/pip.sh:
--------------------------------------------------------------------------------
1 | sudo apt-get update
2 | sudo apt install tmux
3 | sudo apt install libgl1-mesa-glx
4 | pip install --upgrade pip # enable PEP 660 support
5 | pip install -e ".[train]"
6 | pip install -r requirements.txt
7 | pip install flash-attn --no-build-isolation
8 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools>=61.0"]
3 | build-backend = "setuptools.build_meta"
4 |
5 | [project]
6 | name = "llmga"
7 | version = "1.0.0"
8 | description = "LLMGA: Multimodal Large Language Model-based Generation Assistant."
9 | readme = "README.md"
10 | requires-python = ">=3.8"
11 | classifiers = [
12 | "Programming Language :: Python :: 3",
13 | "License :: OSI Approved :: Apache Software License",
14 | ]
15 | dependencies = [
16 | "transformers==4.37.2", "tokenizers==0.15.1", "sentencepiece==0.1.99", "shortuuid",
17 | "accelerate==0.21.0", "peft", "bitsandbytes",
18 | "pydantic", "markdown2[all]", "numpy", "scikit-learn==1.2.2",
19 | "gradio==4.16.0", "gradio_client==0.8.1",
20 | "requests", "httpx==0.24.0", "uvicorn", "fastapi",
21 | "einops==0.6.1", "einops-exts==0.0.4", "timm==0.6.13",
22 | ]
23 |
24 |
25 | [tool.setuptools.packages.find]
26 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"]
27 |
28 | [tool.wheel]
29 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"]
30 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | transformers==4.37.2
2 | sentencepiece
3 | accelerate
4 | bitsandbytes
5 | pydantic
6 | lmdb
--------------------------------------------------------------------------------
/scripts/pretrain.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Uncomment and set the following variables correspondingly to run this script:
4 |
5 | # MODEL_VERSION=vicuna-v1-3-7b
6 | # MODEL_VERSION=llama-2-7b-chat
7 |
8 | ########### DO NOT CHANGE ###########
9 | ########### USE THIS FOR BOTH ###########
10 | PROMPT_VERSION=plain
11 | ########### DO NOT CHANGE ###########
12 |
13 | deepspeed llava/train/train_mem.py \
14 | --deepspeed ./scripts/zero2.json \
15 | --model_name_or_path ./checkpoints/$MODEL_VERSION \
16 | --version $PROMPT_VERSION \
17 | --data_path /path/to/pretrain_data.json \
18 | --image_folder /path/to/images \
19 | --vision_tower openai/clip-vit-large-patch14 \
20 | --tune_mm_mlp_adapter True \
21 | --mm_vision_select_layer -2 \
22 | --mm_use_im_start_end False \
23 | --mm_use_im_patch_token False \
24 | --bf16 True \
25 | --output_dir ./checkpoints/llava-$MODEL_VERSION-pretrain \
26 | --num_train_epochs 1 \
27 | --per_device_train_batch_size 16 \
28 | --per_device_eval_batch_size 4 \
29 | --gradient_accumulation_steps 1 \
30 | --evaluation_strategy "no" \
31 | --save_strategy "steps" \
32 | --save_steps 24000 \
33 | --save_total_limit 1 \
34 | --learning_rate 2e-3 \
35 | --weight_decay 0. \
36 | --warmup_ratio 0.03 \
37 | --lr_scheduler_type "cosine" \
38 | --logging_steps 1 \
39 | --tf32 True \
40 | --model_max_length 2048 \
41 | --gradient_checkpointing True \
42 | --dataloader_num_workers 4 \
43 | --lazy_preprocess True \
44 | --report_to wandb
45 |
--------------------------------------------------------------------------------
/scripts/pretrain_gemma.sh:
--------------------------------------------------------------------------------
1 |
2 |
3 | sudo mkdir ./checkpoints
4 | sudo chmod -R 777 ./checkpoints
5 |
6 |
7 | deepspeed --master_port=7001 llmga/llava/train/pretrain_mem.py \
8 | --deepspeed ./scripts/zero3.json \
9 | --model_name_or_path ./base_models/gemma-2b-it \
10 | --version gemma \
11 | --data_path ./data/llava_pretrain/images/blip_laion_cc_sbu_558k.json \
12 | --image_folder ./data/llava_pretrain/images \
13 | --vision_tower openai/clip-vit-large-patch14-336 \
14 | --mm_projector_type mlp2x_gelu \
15 | --tune_mm_mlp_adapter True \
16 | --mm_vision_select_layer -2 \
17 | --mm_use_im_start_end False \
18 | --mm_use_im_patch_token False \
19 | --bf16 True \
20 | --output_dir ./checkpoints/llmga-gemma-pretrain \
21 | --num_train_epochs 1 \
22 | --per_device_train_batch_size 8 \
23 | --per_device_eval_batch_size 4 \
24 | --gradient_accumulation_steps 4 \
25 | --evaluation_strategy "no" \
26 | --save_strategy "steps" \
27 | --save_steps 24000 \
28 | --save_total_limit 1 \
29 | --learning_rate 1e-3 \
30 | --weight_decay 0. \
31 | --warmup_ratio 0.03 \
32 | --lr_scheduler_type "cosine" \
33 | --logging_steps 1 \
34 | --tf32 True \
35 | --model_max_length 2048 \
36 | --gradient_checkpointing True \
37 | --dataloader_num_workers 4 \
38 | --lazy_preprocess True \
39 | --image_aspect_ratio "resizesquare" \
40 |
41 |
--------------------------------------------------------------------------------
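
This pretraining stage only tunes the projector (--tune_mm_mlp_adapter True); its output directory is what the stage-1 finetuning scripts later point --pretrain_mm_mlp_adapter at (./checkpoints/llmga-gemma-pretrain/mm_projector.bin in scripts/train_llmga_s1_2b_gemma.sh). A quick way to sanity-check that artifact before launching finetuning, assuming it is a torch-saved state dict of projector weights (illustrative snippet, not part of the repository):

    import torch

    # Path taken from scripts/train_llmga_s1_2b_gemma.sh.
    state = torch.load("./checkpoints/llmga-gemma-pretrain/mm_projector.bin", map_location="cpu")
    for name, tensor in state.items():
        print(name, tuple(tensor.shape))
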
/scripts/pretrain_llama3.sh:
--------------------------------------------------------------------------------
1 |
2 |
3 | sudo mkdir ./checkpoints
4 | sudo chmod -R 777 ./checkpoints
5 |
6 | deepspeed --master_port=7001 llmga/llava/train/pretrain_mem.py \
7 | --deepspeed ./scripts/zero3.json \
8 | --model_name_or_path ./base_models/Meta-Llama-3-8B-Instruct \
9 | --version llama_3 \
10 | --data_path ./data/llava_pretrain/images/blip_laion_cc_sbu_558k.json \
11 | --image_folder ./data/llava_pretrain/images \
12 | --vision_tower openai/clip-vit-large-patch14-336 \
13 | --mm_projector_type mlp2x_gelu \
14 | --tune_mm_mlp_adapter True \
15 | --mm_vision_select_layer -2 \
16 | --mm_use_im_start_end False \
17 | --mm_use_im_patch_token False \
18 | --bf16 True \
19 | --output_dir ./checkpoints/llmga-llama3-8b-pretrain \
20 | --num_train_epochs 1 \
21 | --per_device_train_batch_size 8 \
22 | --per_device_eval_batch_size 4 \
23 | --gradient_accumulation_steps 4 \
24 | --evaluation_strategy "no" \
25 | --save_strategy "steps" \
26 | --save_steps 24000 \
27 | --save_total_limit 1 \
28 | --learning_rate 1e-3 \
29 | --weight_decay 0. \
30 | --warmup_ratio 0.03 \
31 | --lr_scheduler_type "cosine" \
32 | --logging_steps 1 \
33 | --tf32 True \
34 | --model_max_length 2048 \
35 | --gradient_checkpointing True \
36 | --dataloader_num_workers 4 \
37 | --lazy_preprocess True \
38 | --image_aspect_ratio "resizesquare" \
39 |
40 |
--------------------------------------------------------------------------------
/scripts/pretrain_mistral.sh:
--------------------------------------------------------------------------------
1 |
2 | sudo mkdir ./checkpoints
3 | sudo chmod -R 777 ./checkpoints
4 |
5 | deepspeed --master_port=7001 llmga/llava/train/pretrain_mem.py \
6 | --deepspeed ./scripts/zero3.json \
7 | --model_name_or_path ./base_models/Mistral-7B-Instruct-v0.2 \
8 | --version mistral_instruct \
9 | --data_path ./data/llava_pretrain/images/blip_laion_cc_sbu_558k.json \
10 | --image_folder ./data/llava_pretrain/images \
11 | --vision_tower openai/clip-vit-large-patch14-336 \
12 | --mm_projector_type mlp2x_gelu \
13 | --tune_mm_mlp_adapter True \
14 | --mm_vision_select_layer -2 \
15 | --mm_use_im_start_end False \
16 | --mm_use_im_patch_token False \
17 | --bf16 True \
18 | --output_dir ./checkpoints/llmga-mistral-pretrain \
19 | --num_train_epochs 1 \
20 | --per_device_train_batch_size 8 \
21 | --per_device_eval_batch_size 4 \
22 | --gradient_accumulation_steps 4 \
23 | --evaluation_strategy "no" \
24 | --save_strategy "steps" \
25 | --save_steps 24000 \
26 | --save_total_limit 1 \
27 | --learning_rate 1e-3 \
28 | --weight_decay 0. \
29 | --warmup_ratio 0.03 \
30 | --lr_scheduler_type "cosine" \
31 | --logging_steps 1 \
32 | --tf32 True \
33 | --model_max_length 2048 \
34 | --gradient_checkpointing True \
35 | --dataloader_num_workers 4 \
36 | --lazy_preprocess True \
37 | --image_aspect_ratio "resizesquare" \
38 | # --report_to wandb
39 |
40 |
41 |
42 |
--------------------------------------------------------------------------------
/scripts/pretrain_phi3.sh:
--------------------------------------------------------------------------------
1 |
2 |
3 | sudo mkdir ./checkpoints
4 | sudo chmod -R 777 ./checkpoints
5 |
6 | deepspeed --master_port=7001 llmga/llava/train/pretrain_mem.py \
7 | --deepspeed ./scripts/zero3.json \
8 | --model_name_or_path ./base_models/Phi-3-mini-128k-instruct \
9 | --version phi_3 \
10 | --data_path ./data/llava_pretrain/images/blip_laion_cc_sbu_558k.json \
11 | --image_folder ./data/llava_pretrain/images \
12 | --vision_tower openai/clip-vit-large-patch14-336 \
13 | --mm_projector_type mlp2x_gelu \
14 | --tune_mm_mlp_adapter True \
15 | --mm_vision_select_layer -2 \
16 | --mm_use_im_start_end False \
17 | --mm_use_im_patch_token False \
18 | --bf16 True \
19 | --output_dir ./checkpoints/llmga-Phi-3-mini-128k-pretrain \
20 | --num_train_epochs 1 \
21 | --per_device_train_batch_size 8 \
22 | --per_device_eval_batch_size 4 \
23 | --gradient_accumulation_steps 4 \
24 | --evaluation_strategy "no" \
25 | --save_strategy "steps" \
26 | --save_steps 24000 \
27 | --save_total_limit 1 \
28 | --learning_rate 1e-3 \
29 | --weight_decay 0. \
30 | --warmup_ratio 0.03 \
31 | --lr_scheduler_type "cosine" \
32 | --logging_steps 1 \
33 | --tf32 True \
34 | --model_max_length 2048 \
35 | --gradient_checkpointing True \
36 | --dataloader_num_workers 4 \
37 | --lazy_preprocess True \
38 | --image_aspect_ratio "resizesquare" \
39 | # --report_to wandb
40 |
41 |
42 |
43 |
--------------------------------------------------------------------------------
/scripts/pretrain_qwen2-05b.sh:
--------------------------------------------------------------------------------
1 |
2 | sudo mkdir ./checkpoints
3 | sudo chmod -R 777 ./checkpoints
4 |
5 | deepspeed --master_port=7001 llmga/llava/train/pretrain_mem.py \
6 | --deepspeed ./scripts/zero3.json \
7 | --model_name_or_path ./base_models/Qwen2-0.5B-Instruct \
8 | --version qwen_2 \
9 | --data_path ./data/llava_pretrain/images/blip_laion_cc_sbu_558k.json \
10 | --image_folder ./data/llava_pretrain/images \
11 | --vision_tower openai/clip-vit-large-patch14-336 \
12 | --mm_projector_type mlp2x_gelu \
13 | --tune_mm_mlp_adapter True \
14 | --mm_vision_select_layer -2 \
15 | --mm_use_im_start_end False \
16 | --mm_use_im_patch_token False \
17 | --bf16 True \
18 | --output_dir ./checkpoints/llmga-Qwen2-0.5B-pretrain \
19 | --num_train_epochs 1 \
20 | --per_device_train_batch_size 8 \
21 | --per_device_eval_batch_size 4 \
22 | --gradient_accumulation_steps 4 \
23 | --evaluation_strategy "no" \
24 | --save_strategy "steps" \
25 | --save_steps 24000 \
26 | --save_total_limit 1 \
27 | --learning_rate 1e-3 \
28 | --weight_decay 0. \
29 | --warmup_ratio 0.03 \
30 | --lr_scheduler_type "cosine" \
31 | --logging_steps 1 \
32 | --tf32 True \
33 | --model_max_length 2048 \
34 | --gradient_checkpointing True \
35 | --dataloader_num_workers 4 \
36 | --lazy_preprocess True \
37 | --image_aspect_ratio "resizesquare" \
38 |
39 |
40 |
41 |
42 |
--------------------------------------------------------------------------------
/scripts/pretrain_qwen2-15b.sh:
--------------------------------------------------------------------------------
1 |
2 | sudo mkdir ./checkpoints
3 | sudo chmod -R 777 ./checkpoints
4 |
5 | deepspeed --master_port=7001 llmga/llava/train/pretrain_mem.py \
6 | --deepspeed ./scripts/zero3.json \
7 | --model_name_or_path ./base_models/Qwen2-1.5B-Instruct \
8 | --version qwen_2 \
9 | --data_path ./data/llava_pretrain/images/blip_laion_cc_sbu_558k.json \
10 | --image_folder ./data/llava_pretrain/images \
11 | --vision_tower openai/clip-vit-large-patch14-336 \
12 | --mm_projector_type mlp2x_gelu \
13 | --tune_mm_mlp_adapter True \
14 | --mm_vision_select_layer -2 \
15 | --mm_use_im_start_end False \
16 | --mm_use_im_patch_token False \
17 | --bf16 True \
18 | --output_dir ./checkpoints/llmga-Qwen2-1.5B-pretrain \
19 | --num_train_epochs 1 \
20 | --per_device_train_batch_size 8 \
21 | --per_device_eval_batch_size 4 \
22 | --gradient_accumulation_steps 4 \
23 | --evaluation_strategy "no" \
24 | --save_strategy "steps" \
25 | --save_steps 24000 \
26 | --save_total_limit 1 \
27 | --learning_rate 1e-3 \
28 | --weight_decay 0. \
29 | --warmup_ratio 0.03 \
30 | --lr_scheduler_type "cosine" \
31 | --logging_steps 1 \
32 | --tf32 True \
33 | --model_max_length 2048 \
34 | --gradient_checkpointing True \
35 | --dataloader_num_workers 4 \
36 | --lazy_preprocess True \
37 | --image_aspect_ratio "resizesquare" \
38 |
39 |
40 |
41 |
42 |
--------------------------------------------------------------------------------
/scripts/pretrain_qwen2-7b.sh:
--------------------------------------------------------------------------------
1 |
2 | sudo mkdir ./checkpoints
3 | sudo chmod -R 777 ./checkpoints
4 |
5 | deepspeed --master_port=7001 llmga/llava/train/pretrain_mem.py \
6 | --deepspeed ./scripts/zero3.json \
7 | --model_name_or_path ./base_models/Qwen2-7B-Instruct \
8 | --version qwen_2 \
9 | --data_path ./data/llava_pretrain/images/blip_laion_cc_sbu_558k.json \
10 | --image_folder ./data/llava_pretrain/images \
11 | --vision_tower openai/clip-vit-large-patch14-336 \
12 | --mm_projector_type mlp2x_gelu \
13 | --tune_mm_mlp_adapter True \
14 | --mm_vision_select_layer -2 \
15 | --mm_use_im_start_end False \
16 | --mm_use_im_patch_token False \
17 | --bf16 True \
18 | --output_dir ./checkpoints/llmga-Qwen2-7B-pretrain \
19 | --num_train_epochs 1 \
20 | --per_device_train_batch_size 8 \
21 | --per_device_eval_batch_size 4 \
22 | --gradient_accumulation_steps 4 \
23 | --evaluation_strategy "no" \
24 | --save_strategy "steps" \
25 | --save_steps 24000 \
26 | --save_total_limit 1 \
27 | --learning_rate 1e-3 \
28 | --weight_decay 0. \
29 | --warmup_ratio 0.03 \
30 | --lr_scheduler_type "cosine" \
31 | --logging_steps 1 \
32 | --tf32 True \
33 | --model_max_length 2048 \
34 | --gradient_checkpointing True \
35 | --dataloader_num_workers 4 \
36 | --lazy_preprocess True \
37 | --image_aspect_ratio "resizesquare" \
38 | --report_to wandb
39 |
40 |
41 |
42 |
--------------------------------------------------------------------------------
/scripts/pretrain_vicuna_13b.sh:
--------------------------------------------------------------------------------
1 |
2 | sudo mkdir ./checkpoints
3 | sudo chmod -R 777 ./checkpoints
4 |
5 | deepspeed --master_port=7001 llmga/llava/train/pretrain_mem.py \
6 | --deepspeed ./scripts/zero3.json \
7 | --model_name_or_path ./base_models/vicuna-13b-v1.5 \
8 | --version v1 \
9 | --data_path ./data/llava_pretrain/images/blip_laion_cc_sbu_558k.json \
10 | --image_folder ./data/llava_pretrain/images \
11 | --vision_tower openai/clip-vit-large-patch14-336 \
12 | --mm_projector_type mlp2x_gelu \
13 | --tune_mm_mlp_adapter True \
14 | --mm_vision_select_layer -2 \
15 | --mm_use_im_start_end False \
16 | --mm_use_im_patch_token False \
17 | --bf16 True \
18 | --output_dir ./checkpoints/llmga-vicuna-13b-v1.5-pretrain \
19 | --num_train_epochs 1 \
20 | --per_device_train_batch_size 8 \
21 | --per_device_eval_batch_size 4 \
22 | --gradient_accumulation_steps 4 \
23 | --evaluation_strategy "no" \
24 | --save_strategy "steps" \
25 | --save_steps 24000 \
26 | --save_total_limit 1 \
27 | --learning_rate 1e-3 \
28 | --weight_decay 0. \
29 | --warmup_ratio 0.03 \
30 | --lr_scheduler_type "cosine" \
31 | --logging_steps 1 \
32 | --tf32 True \
33 | --model_max_length 2048 \
34 | --gradient_checkpointing True \
35 | --dataloader_num_workers 4 \
36 | --lazy_preprocess True \
37 | --image_aspect_ratio "resizesquare" \
38 |
--------------------------------------------------------------------------------
/scripts/pretrain_vicuna_7b.sh:
--------------------------------------------------------------------------------
1 |
2 | sudo mkdir ./checkpoints
3 | sudo chmod -R 777 ./checkpoints
4 |
5 | deepspeed --master_port=7001 llmga/llava/train/pretrain_mem.py \
6 | --deepspeed ./scripts/zero3.json \
7 | --model_name_or_path ./base_models/vicuna-7b-v1.5 \
8 | --version v1 \
9 | --data_path ./data/llava_pretrain/images/blip_laion_cc_sbu_558k.json \
10 | --image_folder ./data/llava_pretrain/images \
11 | --vision_tower openai/clip-vit-large-patch14-336 \
12 | --mm_projector_type mlp2x_gelu \
13 | --tune_mm_mlp_adapter True \
14 | --mm_vision_select_layer -2 \
15 | --mm_use_im_start_end False \
16 | --mm_use_im_patch_token False \
17 | --bf16 True \
18 | --output_dir ./checkpoints/llmga-vicuna-7b-v1.5-pretrain \
19 | --num_train_epochs 1 \
20 | --per_device_train_batch_size 8 \
21 | --per_device_eval_batch_size 4 \
22 | --gradient_accumulation_steps 4 \
23 | --evaluation_strategy "no" \
24 | --save_strategy "steps" \
25 | --save_steps 24000 \
26 | --save_total_limit 1 \
27 | --learning_rate 1e-3 \
28 | --weight_decay 0. \
29 | --warmup_ratio 0.03 \
30 | --lr_scheduler_type "cosine" \
31 | --logging_steps 1 \
32 | --tf32 True \
33 | --model_max_length 2048 \
34 | --gradient_checkpointing True \
35 | --dataloader_num_workers 4 \
36 | --lazy_preprocess True \
37 | --image_aspect_ratio "resizesquare" \
38 |
--------------------------------------------------------------------------------
/scripts/run_gradio_t2i.sh:
--------------------------------------------------------------------------------
1 |
2 | python3 llmga/serve/gradio_t2i_server.py \
3 | --model_path /mnt/bn/wyt-large-dataset/xiabin-model/llmga/checkpoints/llmga-vicuna-7b-v1.5-full-finetune \
4 | --sdmodel_id /mnt/bn/wyt-large-dataset/model/SDXL \
5 | --lora /mnt/bn/wyt-large-dataset/model/hyper-sd/Hyper-SDXL-1step-Unet.safetensors \
6 | --model-list-mode reload \
7 | --port 8334 \
8 | --load-4bit \
--------------------------------------------------------------------------------
/scripts/test-2.sh:
--------------------------------------------------------------------------------
1 |
2 | CUDA_VISIBLE_DEVICES=0 python3 -m llmga.serve.cli2 \
3 | --model-path /mnt/bn/wyt-large-dataset/xiabin-model/llmga/checkpoints/llmga-mistral_instruct-full-finetune \
4 | # --load-4bit
5 |
--------------------------------------------------------------------------------
/scripts/test-llmga-sd15-editing.sh:
--------------------------------------------------------------------------------
1 |
2 | sudo chmod -R 777 /mnt/bn/xiabinpaintv2/CVPR2024/code-final/LLMGA-v1/res
3 |
4 |
5 | CUDA_VISIBLE_DEVICES=1 python3 -m llmga.serve.cli-sd15-editing \
6 | --model-path /mnt/bn/xiabinpaintv2/CVPR2024/res/LLMGA1.5-v73/llmga-vicuna-7b-v1.5-full-finetune \
7 | --image-file /mnt/bn/xiabinpaintv2/CVPR2024/code-final/LLMGA-v1/000000003613_ori.png \
8 | --save_path /mnt/bn/xiabinpaintv2/CVPR2024/code-final/LLMGA-v1/res \
9 | --sd_model_id /mnt/bn/inpainting-bytenas-lq/xiabin/new-SD-model/sd15-t2i-outputs-05v10
10 | # --load-4bit
11 |
12 |
13 |
--------------------------------------------------------------------------------
/scripts/test-llmga-sd15-inpainting.sh:
--------------------------------------------------------------------------------
1 | python3 -m llmga.serve.cli-sd15-inpainting \
2 | --model-path ./checkpoints/Inference/llmga-llama-2-7b-chat-full-finetune \
3 | --sdmodel_id ./checkpoints/Inference/llmga-sd15-inpainting \
4 | --save_path ./res/inpainting/llmga7b-sd15 \
5 | --image-file /PATHtoIMG \
6 | --mask-file /PATHtomask
7 |
8 |
--------------------------------------------------------------------------------
/scripts/test-llmga-sd15-t2i.sh:
--------------------------------------------------------------------------------
1 | python3 -m llmga.serve.cli-sd15 \
2 | --model-path ./checkpoints/Inference/llmga-llama-2-7b-chat-full-finetune \
3 | --sdmodel_id ./checkpoints/Inference/llmga-sd15-t2i \
4 | --save_path ./res/t2i/llmga7b-sd15 \
5 | --image-file /PATHtoIMG
6 |
--------------------------------------------------------------------------------
/scripts/test-llmga-sdxl-inpainting.sh:
--------------------------------------------------------------------------------
1 | python3 -m llmga.serve.cli-sdxl-inpainting \
2 | --model-path ./checkpoints/Inference/llmga-llama-2-7b-chat-full-finetune \
3 | --sdmodel_id ./checkpoints/Inference/llmga-sdxl-inpainting \
4 | --save_path ./res/inpainting/llmga7b-sdxl \
5 | --image-file /PATHtoIMG \
6 | --mask-file /PATHtomask
7 |
8 |
9 |
--------------------------------------------------------------------------------
/scripts/test-llmga-sdxl-t2i.sh:
--------------------------------------------------------------------------------
1 | python3 -m llmga.serve.cli-sdxl \
2 | --model-path ./checkpoints/Inference/llmga-llama-2-7b-chat-full-finetune \
3 | --sdmodel_id ./checkpoints/Inference/llmga-sdxl-t2i \
4 | --save_path ./res/t2i/llmga7b-sdxl \
5 | --image-file /PATHtoIMG
6 |
7 |
8 |
9 |
--------------------------------------------------------------------------------
/scripts/test.sh:
--------------------------------------------------------------------------------
1 |
2 | export http_proxy=http://sys-proxy-rd-relay.byted.org:8118 https_proxy=http://sys-proxy-rd-relay.byted.org:8118 no_proxy=code.byted.org
3 |
4 | CUDA_VISIBLE_DEVICES=0 python3 -m llmga.serve.cli \
5 | --model-path /mnt/bn/wyt-large-dataset/xiabin-model/llmga/checkpoints/llmga-mistral_instruct-full-finetune \
6 | --image-file ./llmga/serve/examples/jiateng.png \
7 | # --load-4bit
8 |
--------------------------------------------------------------------------------
/scripts/test2-llmga-sd15-t2i.sh:
--------------------------------------------------------------------------------
1 | python3 -m llmga.serve.cli2-sd15 \
2 | --model-path ./checkpoints/Inference/llmga-llama-2-7b-chat-full-finetune \
3 | --sdmodel_id ./checkpoints/Inference/llmga-sd15-t2i \
4 | --save_path ./res/t2i/llmga7b-sd15 \
5 |
6 |
--------------------------------------------------------------------------------
/scripts/test2-llmga-sdxl-t2i.sh:
--------------------------------------------------------------------------------
1 | python3 -m llmga.serve.cli2-sdxl \
2 | --model-path ./checkpoints/Inference/llmga-llama-2-7b-chat-full-finetune \
3 | --sdmodel_id ./checkpoints/Inference/llmga-sdxl-t2i \
4 | --save_path ./res/t2i/llmga7b-sdxl \
5 |
6 |
--------------------------------------------------------------------------------
/scripts/train_llmga_s1_05b_qwen2.sh:
--------------------------------------------------------------------------------
1 | MODEL_VERSION="Qwen2-0.5B"
2 |
3 | deepspeed --master_port=7001 llmga/llava/train/train_mem.py \
4 | --deepspeed ./scripts/zero3.json \
5 | --model_name_or_path ./base_models/LLM/Qwen2-0.5B-Instruct \
6 | --version qwen_2 \
7 | --data_path ./data/jsons/llava_v1_5_mix665k.json \
8 | --data_path2 ./data/jsons/llmga-data \
9 | --data_path3 ./data/jsons/text-data \
10 | --image_folder ./data/llava-imgs \
11 | --image_folder2 ./data/llmga-imgs \
12 | --vision_tower openai/clip-vit-large-patch14-336 \
13 | --pretrain_mm_mlp_adapter ./checkpoints/llmga-Qwen2-0.5B-pretrain/mm_projector.bin \
14 | --mm_projector_type mlp2x_gelu \
15 | --mm_vision_select_layer -2 \
16 | --mm_use_output_start_end False \
17 | --mm_use_im_start_end False \
18 | --mm_use_im_patch_token False \
19 | --bf16 True \
20 | --output_dir ./checkpoints/llmga-$MODEL_VERSION-full-finetune \
21 | --num_train_epochs 1 \
22 | --per_device_train_batch_size 4 \
23 | --per_device_eval_batch_size 4 \
24 | --gradient_accumulation_steps 4 \
25 | --evaluation_strategy "no" \
26 | --save_strategy "steps" \
27 | --save_steps 50000 \
28 | --save_total_limit 1 \
29 | --learning_rate 2e-5 \
30 | --weight_decay 0. \
31 | --warmup_ratio 0.03 \
32 | --lr_scheduler_type "cosine" \
33 | --logging_steps 1 \
34 | --tf32 True \
35 | --model_max_length 2048 \
36 | --gradient_checkpointing True \
37 | --dataloader_num_workers 4 \
38 | --lazy_preprocess True \
39 | --image_aspect_ratio "resizesquare" \
--------------------------------------------------------------------------------
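
The --data_path file follows the standard LLaVA conversation format: a JSON list of samples, each with an optional image and a list of human/gpt turns. The record below is a made-up illustration of that layout (field values invented); LLMGA's own --data_path2/--data_path3 files may extend this schema:

    # Illustrative record in the LLaVA-style instruction format.
    sample = {
        "id": "000000123456",
        "image": "coco/train2017/000000123456.jpg",
        "conversations": [
            {"from": "human", "value": "<image>\nDescribe this picture."},
            {"from": "gpt", "value": "A snowy street with parked cars."},
        ],
    }
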
/scripts/train_llmga_s1_13b_vicuna.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | MODEL_VERSION="vicuna-13b-v1.5"
3 |
4 |
5 | deepspeed --master_port=7001 llmga/llava/train/train_mem.py \
6 | --deepspeed ./scripts/zero3.json \
7 | --model_name_or_path ./base_models/vicuna-13b-v1.5 \
8 | --version v1 \
9 | --data_path ./data/jsons/llava_v1_5_mix665k.json \
10 | --data_path2 ./data/jsons/llmga-data \
11 | --data_path3 ./data/jsons/text-data \
12 | --image_folder ./data/llava-imgs \
13 | --image_folder2 ./data/llmga-imgs \
14 | --vision_tower openai/clip-vit-large-patch14-336 \
15 | --pretrain_mm_mlp_adapter ./checkpoints/llmga-vicuna-13b-v1.5-pretrain/mm_projector.bin \
16 | --mm_projector_type mlp2x_gelu \
17 | --mm_vision_select_layer -2 \
18 | --mm_use_output_start_end False \
19 | --mm_use_im_start_end False \
20 | --mm_use_im_patch_token False \
21 | --bf16 True \
22 | --output_dir ./checkpoints/llmga-$MODEL_VERSION-full-finetune \
23 | --num_train_epochs 1 \
24 | --per_device_train_batch_size 4 \
25 | --per_device_eval_batch_size 4 \
26 | --gradient_accumulation_steps 4 \
27 | --evaluation_strategy "no" \
28 | --save_strategy "steps" \
29 | --save_steps 50000 \
30 | --save_total_limit 1 \
31 | --learning_rate 2e-5 \
32 | --weight_decay 0. \
33 | --warmup_ratio 0.03 \
34 | --lr_scheduler_type "cosine" \
35 | --logging_steps 1 \
36 | --tf32 True \
37 | --model_max_length 2048 \
38 | --gradient_checkpointing True \
39 | --dataloader_num_workers 4 \
40 | --lazy_preprocess True \
41 | --image_aspect_ratio "resizesquare" \
42 |
--------------------------------------------------------------------------------
/scripts/train_llmga_s1_15b_qwen2.sh:
--------------------------------------------------------------------------------
1 |
2 | MODEL_VERSION="Qwen2-1.5B"
3 |
4 | deepspeed --master_port=7001 llmga/llava/train/train_mem.py \
5 | --deepspeed ./scripts/zero3.json \
6 | --model_name_or_path ./base_models/Qwen2-1.5B-Instruct \
7 | --version qwen_2 \
8 | --data_path ./data/jsons/llava_v1_5_mix665k.json \
9 | --data_path2 ./data/jsons/llmga-data \
10 | --data_path3 ./data/jsons/text-data \
11 | --image_folder ./data/llava-imgs \
12 | --image_folder2 ./data/llmga-imgs \
13 | --vision_tower openai/clip-vit-large-patch14-336 \
14 | --pretrain_mm_mlp_adapter ./checkpoints/llmga-Qwen2-1.5B-pretrain/mm_projector.bin \
15 | --mm_projector_type mlp2x_gelu \
16 | --mm_vision_select_layer -2 \
17 | --mm_use_output_start_end False \
18 | --mm_use_im_start_end False \
19 | --mm_use_im_patch_token False \
20 | --bf16 True \
21 | --output_dir ./checkpoints/llmga-$MODEL_VERSION-full-finetune \
22 | --num_train_epochs 1 \
23 | --per_device_train_batch_size 4 \
24 | --per_device_eval_batch_size 4 \
25 | --gradient_accumulation_steps 4 \
26 | --evaluation_strategy "no" \
27 | --save_strategy "steps" \
28 | --save_steps 50000 \
29 | --save_total_limit 1 \
30 | --learning_rate 2e-5 \
31 | --weight_decay 0. \
32 | --warmup_ratio 0.03 \
33 | --lr_scheduler_type "cosine" \
34 | --logging_steps 1 \
35 | --tf32 True \
36 | --model_max_length 2048 \
37 | --gradient_checkpointing True \
38 | --dataloader_num_workers 4 \
39 | --lazy_preprocess True \
40 | --image_aspect_ratio "resizesquare" \
--------------------------------------------------------------------------------
/scripts/train_llmga_s1_2b_gemma.sh:
--------------------------------------------------------------------------------
1 |
2 | MODEL_VERSION="gemma-2b-it"
3 |
4 | deepspeed --master_port=7001 llmga/llava/train/train_mem.py \
5 | --deepspeed ./scripts/zero3.json \
6 | --model_name_or_path ./base_models/gemma-2b-it \
7 | --version gemma \
8 | --data_path ./data/jsons/llava_v1_5_mix665k.json \
9 | --data_path2 ./data/jsons/llmga-data \
10 | --data_path3 ./data/jsons/text-data \
11 | --image_folder ./data/llava-imgs \
12 | --image_folder2 ./data/llmga-imgs \
13 | --vision_tower openai/clip-vit-large-patch14-336 \
14 | --pretrain_mm_mlp_adapter ./checkpoints/llmga-gemma-pretrain/mm_projector.bin \
15 | --mm_projector_type mlp2x_gelu \
16 | --mm_vision_select_layer -2 \
17 | --mm_use_output_start_end False \
18 | --mm_use_im_start_end False \
19 | --mm_use_im_patch_token False \
20 | --bf16 True \
21 | --output_dir ./checkpoints/llmga-$MODEL_VERSION-full-finetune \
22 | --num_train_epochs 1 \
23 | --per_device_train_batch_size 2 \
24 | --per_device_eval_batch_size 4 \
25 | --gradient_accumulation_steps 8 \
26 | --evaluation_strategy "no" \
27 | --save_strategy "steps" \
28 | --save_steps 50000 \
29 | --save_total_limit 1 \
30 | --learning_rate 2e-5 \
31 | --weight_decay 0. \
32 | --warmup_ratio 0.03 \
33 | --lr_scheduler_type "cosine" \
34 | --logging_steps 1 \
35 | --tf32 True \
36 | --model_max_length 2048 \
37 | --gradient_checkpointing True \
38 | --dataloader_num_workers 4 \
39 | --lazy_preprocess True \
40 | --image_aspect_ratio "resizesquare" \
--------------------------------------------------------------------------------
/scripts/train_llmga_s1_3b_phi3.sh:
--------------------------------------------------------------------------------
1 |
2 | MODEL_VERSION="Phi-3-mini-128k"
3 |
4 | deepspeed --master_port=7001 llmga/llava/train/train_mem.py \
5 | --deepspeed ./scripts/zero3.json \
6 | --model_name_or_path ./base_models/Phi-3-mini-128k-instruct \
7 | --version phi_3 \
8 | --data_path ./data/jsons/llava_v1_5_mix665k.json \
9 | --data_path2 ./data/jsons/llmga-data \
10 | --data_path3 ./data/jsons/text-data \
11 | --image_folder ./data/llava-imgs \
12 | --image_folder2 ./data/llmga-imgs \
13 | --vision_tower openai/clip-vit-large-patch14-336 \
14 |     --pretrain_mm_mlp_adapter ./checkpoints/llmga-Phi-3-mini-128k-pretrain/mm_projector.bin \
15 | --mm_projector_type mlp2x_gelu \
16 | --mm_vision_select_layer -2 \
17 | --mm_use_output_start_end False \
18 | --mm_use_im_start_end False \
19 | --mm_use_im_patch_token False \
20 | --bf16 True \
21 | --output_dir ./checkpoints/llmga-$MODEL_VERSION-full-finetune \
22 | --num_train_epochs 1 \
23 | --per_device_train_batch_size 4 \
24 | --per_device_eval_batch_size 4 \
25 | --gradient_accumulation_steps 4 \
26 | --evaluation_strategy "no" \
27 | --save_strategy "steps" \
28 | --save_steps 50000 \
29 | --save_total_limit 1 \
30 | --learning_rate 2e-5 \
31 | --weight_decay 0. \
32 | --warmup_ratio 0.03 \
33 | --lr_scheduler_type "cosine" \
34 | --logging_steps 1 \
35 | --tf32 True \
36 | --model_max_length 2048 \
37 | --gradient_checkpointing True \
38 | --dataloader_num_workers 4 \
39 | --lazy_preprocess True \
40 | --image_aspect_ratio "resizesquare" \
--------------------------------------------------------------------------------
/scripts/train_llmga_s1_7b_mistral.sh:
--------------------------------------------------------------------------------
1 |
2 | MODEL_VERSION="mistral_instruct"
3 |
4 | deepspeed --master_port=7001 llmga/llava/train/train_mem.py \
5 | --deepspeed ./scripts/zero3.json \
6 | --model_name_or_path ./base_models/Mistral-7B-Instruct-v0.2 \
7 | --version mistral_instruct \
8 | --data_path ./data/jsons/llava_v1_5_mix665k.json \
9 | --data_path2 ./data/jsons/llmga-data \
10 | --data_path3 ./data/jsons/text-data \
11 | --image_folder ./data/llava-imgs \
12 | --image_folder2 ./data/llmga-imgs \
13 | --vision_tower openai/clip-vit-large-patch14-336 \
14 | --pretrain_mm_mlp_adapter ./checkpoints/llmga-mistral-pretrain/mm_projector.bin \
15 | --mm_projector_type mlp2x_gelu \
16 | --mm_vision_select_layer -2 \
17 | --mm_use_output_start_end False \
18 | --mm_use_im_start_end False \
19 | --mm_use_im_patch_token False \
20 | --bf16 True \
21 | --output_dir ./checkpoints/llmga-$MODEL_VERSION-full-finetune \
22 | --num_train_epochs 1 \
23 | --per_device_train_batch_size 4 \
24 | --per_device_eval_batch_size 4 \
25 | --gradient_accumulation_steps 4 \
26 | --evaluation_strategy "no" \
27 | --save_strategy "steps" \
28 | --save_steps 50000 \
29 | --save_total_limit 1 \
30 | --learning_rate 2e-5 \
31 | --weight_decay 0. \
32 | --warmup_ratio 0.03 \
33 | --lr_scheduler_type "cosine" \
34 | --logging_steps 1 \
35 | --tf32 True \
36 | --model_max_length 2048 \
37 | --gradient_checkpointing True \
38 | --dataloader_num_workers 4 \
39 | --lazy_preprocess True \
40 | --image_aspect_ratio "resizesquare" \
--------------------------------------------------------------------------------
/scripts/train_llmga_s1_7b_qwen2.sh:
--------------------------------------------------------------------------------
1 | MODEL_VERSION="Qwen2-7B"
2 |
3 | deepspeed --master_port=7001 llmga/llava/train/train_mem.py \
4 | --deepspeed ./scripts/zero3.json \
5 | --model_name_or_path ./base_models/Qwen2-7B-Instruct \
6 | --version qwen_2 \
7 | --data_path ./data/jsons/llava_v1_5_mix665k.json \
8 | --data_path2 ./data/jsons/llmga-data \
9 | --data_path3 ./data/jsons/text-data \
10 | --image_folder ./data/llava-imgs \
11 | --image_folder2 ./data/llmga-imgs \
12 | --vision_tower openai/clip-vit-large-patch14-336 \
13 | --pretrain_mm_mlp_adapter ./checkpoints/llmga-Qwen2-7B-pretrain/mm_projector.bin \
14 | --mm_projector_type mlp2x_gelu \
15 | --mm_vision_select_layer -2 \
16 | --mm_use_output_start_end False \
17 | --mm_use_im_start_end False \
18 | --mm_use_im_patch_token False \
19 | --bf16 True \
20 | --output_dir ./checkpoints/llmga-$MODEL_VERSION-full-finetune \
21 | --num_train_epochs 1 \
22 | --per_device_train_batch_size 2 \
23 | --per_device_eval_batch_size 4 \
24 | --gradient_accumulation_steps 8 \
25 | --evaluation_strategy "no" \
26 | --save_strategy "steps" \
27 | --save_steps 50000 \
28 | --save_total_limit 1 \
29 | --learning_rate 2e-5 \
30 | --weight_decay 0. \
31 | --warmup_ratio 0.03 \
32 | --lr_scheduler_type "cosine" \
33 | --logging_steps 1 \
34 | --tf32 True \
35 | --model_max_length 2048 \
36 | --gradient_checkpointing True \
37 | --dataloader_num_workers 4 \
38 | --lazy_preprocess True \
39 | --image_aspect_ratio "resizesquare" \
--------------------------------------------------------------------------------
/scripts/train_llmga_s1_7b_vicuna.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | MODEL_VERSION="vicuna-7b-v1.5"
4 |
5 | deepspeed --master_port=7001 llmga/llava/train/train_mem.py \
6 | --deepspeed ./scripts/zero3.json \
7 | --model_name_or_path ./base_models/vicuna-7b-v1.5 \
8 | --version v1 \
9 | --data_path ./data/jsons/llava_v1_5_mix665k.json \
10 | --data_path2 ./data/jsons/llmga-data \
11 | --data_path3 ./data/jsons/text-data \
12 | --image_folder ./data/llava-imgs \
13 | --image_folder2 ./data/llmga-imgs \
14 | --vision_tower openai/clip-vit-large-patch14-336 \
15 | --pretrain_mm_mlp_adapter ./checkpoints/llmga-vicuna-7b-v1.5-pretrain/mm_projector.bin \
16 | --mm_projector_type mlp2x_gelu \
17 | --mm_vision_select_layer -2 \
18 | --mm_use_output_start_end False \
19 | --mm_use_im_start_end False \
20 | --mm_use_im_patch_token False \
21 | --bf16 True \
22 | --output_dir ./checkpoints/llmga-$MODEL_VERSION-full-finetune \
23 | --num_train_epochs 1 \
24 | --per_device_train_batch_size 4 \
25 | --per_device_eval_batch_size 4 \
26 | --gradient_accumulation_steps 4 \
27 | --evaluation_strategy "no" \
28 | --save_strategy "steps" \
29 | --save_steps 50000 \
30 | --save_total_limit 1 \
31 | --learning_rate 2e-5 \
32 | --weight_decay 0. \
33 | --warmup_ratio 0.03 \
34 | --lr_scheduler_type "cosine" \
35 | --logging_steps 1 \
36 | --tf32 True \
37 | --model_max_length 2048 \
38 | --gradient_checkpointing True \
39 | --dataloader_num_workers 4 \
40 | --lazy_preprocess True \
41 | --image_aspect_ratio "resizesquare" \
42 | # --report_to wandb
43 |
--------------------------------------------------------------------------------
/scripts/train_llmga_s1_8b_llama3.sh:
--------------------------------------------------------------------------------
1 |
2 | MODEL_VERSION="llama3-8b-it"
3 |
4 | deepspeed --master_port=7001 llmga/llava/train/train_mem.py \
5 | --deepspeed ./scripts/zero3.json \
6 | --model_name_or_path ./base_models/Meta-Llama-3-8B-Instruct \
7 | --version llama_3 \
8 | --data_path ./data/jsons/llava_v1_5_mix665k.json \
9 | --data_path2 ./data/jsons/llmga-data \
10 | --data_path3 ./data/jsons/text-data \
11 | --image_folder ./data/llava-imgs \
12 | --image_folder2 ./data/llmga-imgs \
13 | --vision_tower openai/clip-vit-large-patch14-336 \
14 | --pretrain_mm_mlp_adapter ./checkpoints/llmga-llama3-8b-pretrain/mm_projector.bin \
15 | --mm_projector_type mlp2x_gelu \
16 | --mm_vision_select_layer -2 \
17 | --mm_use_output_start_end False \
18 | --mm_use_im_start_end False \
19 | --mm_use_im_patch_token False \
20 | --bf16 True \
21 | --output_dir /mnt/bn/xiabinpaintv2/CVPR2024/res/LLMGA1.5-v101/llmga-$MODEL_VERSION-full-finetune \
22 | --num_train_epochs 1 \
23 | --per_device_train_batch_size 2 \
24 | --per_device_eval_batch_size 4 \
25 | --gradient_accumulation_steps 8 \
26 | --evaluation_strategy "no" \
27 | --save_strategy "steps" \
28 | --save_steps 50000 \
29 | --save_total_limit 1 \
30 | --learning_rate 2e-5 \
31 | --weight_decay 0. \
32 | --warmup_ratio 0.03 \
33 | --lr_scheduler_type "cosine" \
34 | --logging_steps 1 \
35 | --tf32 True \
36 | --model_max_length 2048 \
37 | --gradient_checkpointing True \
38 | --dataloader_num_workers 4 \
39 | --lazy_preprocess True \
40 | --image_aspect_ratio "resizesquare" \
--------------------------------------------------------------------------------
/scripts/train_llmga_s2_sd15_inpaint.sh:
--------------------------------------------------------------------------------
1 | export MODEL_NAME="runwayml/stable-diffusion-inpainting"
2 | accelerate launch --main_process_port 1234 --mixed_precision "bf16" --multi_gpu llmga/diffusers/train_text_to_image_inpaint.py \
3 | --pretrained_model_name_or_path=$MODEL_NAME \
4 | --train_data_dir="./data" \
5 | --use_ema \
6 | --allow_tf32 \
7 | --resolution=512 --center_crop --random_flip \
8 | --train_batch_size=32 \
9 | --gradient_accumulation_steps=1 \
10 | --gradient_checkpointing \
11 | --max_train_steps=50000 \
12 | --learning_rate=2e-05 \
13 | --max_grad_norm=1 \
14 | --lr_scheduler="constant_with_warmup" \
15 | --lr_warmup_steps=1000 \
16 | --checkpointing_steps 2000 \
17 | --checkpoints_total_limit 2 \
18 | --lr_warmup_steps=0 \
19 | --output_dir="./work_dirs/llmga-sd15-inpainting" \
20 | --blank_mask_prob 0 \
21 |
22 |
--------------------------------------------------------------------------------
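
Note that --lr_warmup_steps appears twice in this command (first 1000, then 0). With a standard argparse option the last occurrence wins, so the effective warmup here is 0 steps; the same duplication appears in the other stage-2 scripts below. A two-line check of that argparse behaviour:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--lr_warmup_steps", type=int, default=500)
    args = parser.parse_args(["--lr_warmup_steps=1000", "--lr_warmup_steps=0"])
    print(args.lr_warmup_steps)  # 0 -- the later value overrides the earlier one
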
/scripts/train_llmga_s2_sd15_t2i.sh:
--------------------------------------------------------------------------------
1 | export MODEL_NAME="runwayml/stable-diffusion-v1-5"
2 | accelerate launch --main_process_port 1234 --mixed_precision "bf16" --multi_gpu llmga/diffusers/train_text_to_image.py \
3 | --pretrained_model_name_or_path=$MODEL_NAME \
4 | --train_data_dir="./data" \
5 | --use_ema \
6 | --allow_tf32 \
7 | --resolution=512 --center_crop --random_flip \
8 | --train_batch_size=32 \
9 | --gradient_accumulation_steps=1 \
10 | --gradient_checkpointing \
11 | --max_train_steps=50000 \
12 | --learning_rate=2e-05 \
13 | --max_grad_norm=1 \
14 | --lr_scheduler="constant_with_warmup" \
15 | --lr_warmup_steps=1000 \
16 | --checkpointing_steps 2000 \
17 | --checkpoints_total_limit 2 \
18 |   --output_dir="./work_dirs/llmga-sd15-t2i"
19 |
--------------------------------------------------------------------------------
/scripts/train_llmga_s2_sdxl_inpaint.sh:
--------------------------------------------------------------------------------
1 | export MODEL_NAME="diffusers/stable-diffusion-xl-1.0-inpainting-0.1" #"stabilityai/stable-diffusion-xl-base-1.0"
2 |
3 | accelerate launch --main_process_port 1234 --mixed_precision "bf16" --multi_gpu llmga/diffusers/train_text_to_image_sdxl_inpainting.py \
4 | --pretrained_model_name_or_path=$MODEL_NAME \
5 | --train_data_dir="./data" \
6 | --use_ema \
7 | --allow_tf32 \
8 | --resolution=1024 --center_crop --random_flip \
9 | --train_batch_size=4 \
10 | --gradient_accumulation_steps=4 \
11 | --gradient_checkpointing \
12 | --max_train_steps=50000 \
13 | --learning_rate=2e-05 \
14 | --max_grad_norm=1 \
15 | --lr_scheduler="constant_with_warmup" \
16 | --lr_warmup_steps=1000 \
17 | --checkpointing_steps 2000 \
18 | --checkpoints_total_limit 2 \
19 |   --output_dir="./work_dirs/llmga-sdxl-inpainting"
20 |
--------------------------------------------------------------------------------
/scripts/train_llmga_s2_sdxl_t2i.sh:
--------------------------------------------------------------------------------
1 | export MODEL_NAME="stabilityai/stable-diffusion-xl-base-1.0"
2 | accelerate launch --main_process_port 1234 --mixed_precision "bf16" --multi_gpu llmga/diffusers/train_text_to_image_sdxl.py \
3 | --pretrained_model_name_or_path=$MODEL_NAME \
4 | --train_data_dir="./data" \
5 | --use_ema \
6 | --allow_tf32 \
7 | --resolution=1024 --center_crop --random_flip \
8 | --train_batch_size=4 \
9 | --gradient_accumulation_steps=4 \
10 | --gradient_checkpointing \
11 | --max_train_steps=50000 \
12 | --learning_rate=2e-05 \
13 | --max_grad_norm=1 \
14 | --lr_scheduler="constant_with_warmup" \
15 | --lr_warmup_steps=1000 \
16 | --checkpointing_steps 2000 \
17 | --checkpoints_total_limit 2 \
18 |   --output_dir="./work_dirs/llmga-sdxl-t2i"
19 |
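
The text-to-image checkpoints produced by the two t2i scripts can be loaded the same way. A minimal sketch for the SDXL run above, assuming the full pipeline was saved to the output_dir:

    import torch
    from diffusers import StableDiffusionXLPipeline

    # load the fine-tuned SDXL text-to-image pipeline from --output_dir
    pipe = StableDiffusionXLPipeline.from_pretrained(
        "./work_dirs/llmga-sdxl-t2i", torch_dtype=torch.float16
    ).to("cuda")

    image = pipe("a cozy cabin in a snowy forest, golden hour").images[0]
    image.save("sample.png")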
--------------------------------------------------------------------------------
/scripts/v1_5/eval/gqa.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
4 | IFS=',' read -ra GPULIST <<< "$gpu_list"
5 |
6 | CHUNKS=${#GPULIST[@]}
7 |
8 | CKPT="llava-v1.5-13b"
9 | SPLIT="llava_gqa_testdev_balanced"
10 | GQADIR="./playground/data/eval/gqa/data"
11 |
12 | for IDX in $(seq 0 $((CHUNKS-1))); do
13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \
14 | --model-path liuhaotian/llava-v1.5-13b \
15 | --question-file ./playground/data/eval/gqa/$SPLIT.jsonl \
16 | --image-folder ./playground/data/eval/gqa/data/images \
17 | --answers-file ./playground/data/eval/gqa/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \
18 | --num-chunks $CHUNKS \
19 | --chunk-idx $IDX \
20 | --temperature 0 \
21 | --conv-mode vicuna_v1 &
22 | done
23 |
24 | wait
25 |
26 | output_file=./playground/data/eval/gqa/answers/$SPLIT/$CKPT/merge.jsonl
27 |
28 | # Clear out the output file if it exists.
29 | > "$output_file"
30 |
31 | # Loop through the indices and concatenate each file.
32 | for IDX in $(seq 0 $((CHUNKS-1))); do
33 | cat ./playground/data/eval/gqa/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file"
34 | done
35 |
36 | python scripts/convert_gqa_for_eval.py --src $output_file --dst $GQADIR/testdev_balanced_predictions.json
37 |
38 | cd $GQADIR
39 | python eval/eval.py --tier testdev_balanced
40 |
--------------------------------------------------------------------------------
/scripts/v1_5/eval/llavabench.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | python -m llava.eval.model_vqa \
4 | --model-path liuhaotian/llava-v1.5-13b \
5 | --question-file ./playground/data/eval/llava-bench-in-the-wild/questions.jsonl \
6 | --image-folder ./playground/data/eval/llava-bench-in-the-wild/images \
7 | --answers-file ./playground/data/eval/llava-bench-in-the-wild/answers/llava-v1.5-13b.jsonl \
8 | --temperature 0 \
9 | --conv-mode vicuna_v1
10 |
11 | mkdir -p playground/data/eval/llava-bench-in-the-wild/reviews
12 |
13 | python llava/eval/eval_gpt_review_bench.py \
14 | --question playground/data/eval/llava-bench-in-the-wild/questions.jsonl \
15 | --context playground/data/eval/llava-bench-in-the-wild/context.jsonl \
16 | --rule llava/eval/table/rule.json \
17 | --answer-list \
18 | playground/data/eval/llava-bench-in-the-wild/answers_gpt4.jsonl \
19 | playground/data/eval/llava-bench-in-the-wild/answers/llava-v1.5-13b.jsonl \
20 | --output \
21 | playground/data/eval/llava-bench-in-the-wild/reviews/llava-v1.5-13b.jsonl
22 |
23 | python llava/eval/summarize_gpt_review.py -f playground/data/eval/llava-bench-in-the-wild/reviews/llava-v1.5-13b.jsonl
24 |
--------------------------------------------------------------------------------
/scripts/v1_5/eval/mmbench.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | sudo chmod -R 777 ./playground/data/eval
3 | SPLIT="mmbench_dev_20230712"
4 |
5 | # python3 -m llmga.llava.eval.model_vqa_mmbench \
6 | # --model-path /mnt/bn/xiabinpaintv2/CVPR2024/res/LLMGA1.5-v94/llmga-vicuna-7b-v1.5-full-finetune \
7 | # --question-file ./playground/data/eval/mmbench/$SPLIT.tsv \
8 | # --answers-file ./playground/data/eval/mmbench/answers/$SPLIT/llmga-7b.jsonl \
9 | # --single-pred-prompt \
10 | # --temperature 0 \
11 | # --conv-mode vicuna_v1
12 |
13 | # mkdir -p playground/data/eval/mmbench/answers_upload/$SPLIT
14 |
15 | python3 scripts/convert_mmbench_for_submission.py \
16 | --annotation-file ./playground/data/eval/mmbench/$SPLIT.tsv \
17 | --result-dir ./playground/data/eval/mmbench/answers/$SPLIT \
18 | --upload-dir ./playground/data/eval/mmbench/answers_upload/$SPLIT \
19 | --experiment llmga-7b
20 |
--------------------------------------------------------------------------------
/scripts/v1_5/eval/mmbench_cn.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | SPLIT="mmbench_dev_cn_20231003"
4 |
5 | python -m llava.eval.model_vqa_mmbench \
6 | --model-path liuhaotian/llava-v1.5-13b \
7 | --question-file ./playground/data/eval/mmbench_cn/$SPLIT.tsv \
8 | --answers-file ./playground/data/eval/mmbench_cn/answers/$SPLIT/llava-v1.5-13b.jsonl \
9 | --lang cn \
10 | --single-pred-prompt \
11 | --temperature 0 \
12 | --conv-mode vicuna_v1
13 |
14 | mkdir -p playground/data/eval/mmbench/answers_upload/$SPLIT
15 |
16 | python scripts/convert_mmbench_for_submission.py \
17 | --annotation-file ./playground/data/eval/mmbench_cn/$SPLIT.tsv \
18 | --result-dir ./playground/data/eval/mmbench_cn/answers/$SPLIT \
19 | --upload-dir ./playground/data/eval/mmbench_cn/answers_upload/$SPLIT \
20 | --experiment llava-v1.5-13b
21 |
--------------------------------------------------------------------------------
/scripts/v1_5/eval/mme.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | python -m llava.eval.model_vqa_loader \
4 | --model-path liuhaotian/llava-v1.5-13b \
5 | --question-file ./playground/data/eval/MME/llava_mme.jsonl \
6 | --image-folder ./playground/data/eval/MME/MME_Benchmark_release_version \
7 | --answers-file ./playground/data/eval/MME/answers/llava-v1.5-13b.jsonl \
8 | --temperature 0 \
9 | --conv-mode vicuna_v1
10 |
11 | cd ./playground/data/eval/MME
12 |
13 | python convert_answer_to_mme.py --experiment llava-v1.5-13b
14 |
15 | cd eval_tool
16 |
17 | python calculation.py --results_dir answers/llava-v1.5-13b
18 |
--------------------------------------------------------------------------------
/scripts/v1_5/eval/mmvet.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | python -m llava.eval.model_vqa \
4 | --model-path liuhaotian/llava-v1.5-13b \
5 | --question-file ./playground/data/eval/mm-vet/llava-mm-vet.jsonl \
6 | --image-folder ./playground/data/eval/mm-vet/images \
7 | --answers-file ./playground/data/eval/mm-vet/answers/llava-v1.5-13b.jsonl \
8 | --temperature 0 \
9 | --conv-mode vicuna_v1
10 |
11 | mkdir -p ./playground/data/eval/mm-vet/results
12 |
13 | python scripts/convert_mmvet_for_eval.py \
14 |     --src ./playground/data/eval/mm-vet/answers/llava-v1.5-13b.jsonl \
15 |     --dst ./playground/data/eval/mm-vet/results/llava-v1.5-13b.json
16 |
17 |
--------------------------------------------------------------------------------
/scripts/v1_5/eval/pope.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | sudo chmod -R 777 ./playground/data/eval
4 | python3 -m llmga.llava.eval.model_vqa_loader \
5 | --model-path /mnt/bn/xiabinpaintv2/CVPR2024/res/LLMGA1.5-v94/llmga-vicuna-7b-v1.5-full-finetune \
6 | --question-file ./playground/data/eval/pope/llava_pope_test.jsonl \
7 | --image-folder /mnt/bn/xiabinpaintv2/CVPR2024/rebuttal/eval_dataset/coco2014/val2014 \
8 | --answers-file ./playground/data/eval/pope/answers/llmga-7b.jsonl \
9 | --temperature 0 \
10 | --conv-mode vicuna_v1
11 |
12 | python3 llmga/llava/eval/eval_pope.py \
13 | --annotation-dir ./playground/data/eval/pope/coco \
14 | --question-file ./playground/data/eval/pope/llava_pope_test.jsonl \
15 | --result-file ./playground/data/eval/pope/answers/llmga-7b.jsonl
16 |
--------------------------------------------------------------------------------
/scripts/v1_5/eval/qbench.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | if [ "$1" = "dev" ]; then
4 | echo "Evaluating in 'dev' split."
5 | elif [ "$1" = "test" ]; then
6 | echo "Evaluating in 'test' split."
7 | else
8 | echo "Unknown split, please choose between 'dev' and 'test'."
9 | exit 1
10 | fi
11 |
12 | python -m llava.eval.model_vqa_qbench \
13 | --model-path liuhaotian/llava-v1.5-13b \
14 | --image-folder ./playground/data/eval/qbench/images_llvisionqa/ \
15 | --questions-file ./playground/data/eval/qbench/llvisionqa_$1.json \
16 | --answers-file ./playground/data/eval/qbench/llvisionqa_$1_answers.jsonl \
17 | --conv-mode llava_v1 \
18 | --lang en
19 |
--------------------------------------------------------------------------------
/scripts/v1_5/eval/qbench_zh.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | if [ "$1" = "dev" ]; then
4 | ZH_SPLIT="验证集"
5 | echo "Evaluating in 'dev' split."
6 | elif [ "$1" = "test" ]; then
7 | ZH_SPLIT="测试集"
8 | echo "Evaluating in 'test' split."
9 | else
10 | echo "Unknown split, please choose between 'dev' and 'test'."
11 | exit 1
12 | fi
13 |
14 | python -m llava.eval.model_vqa_qbench \
15 | --model-path liuhaotian/llava-v1.5-13b \
16 | --image-folder ./playground/data/eval/qbench/images_llvisionqa/ \
17 | --questions-file ./playground/data/eval/qbench/质衡-问答-$ZH_SPLIT.json \
18 | --answers-file ./playground/data/eval/qbench/llvisionqa_zh_$1_answers.jsonl \
19 | --conv-mode llava_v1 \
20 | --lang zh
21 |
--------------------------------------------------------------------------------
/scripts/v1_5/eval/seed.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
4 | IFS=',' read -ra GPULIST <<< "$gpu_list"
5 |
6 | CHUNKS=${#GPULIST[@]}
7 |
8 | CKPT="llava-v1.5-13b"
9 |
10 | for IDX in $(seq 0 $((CHUNKS-1))); do
11 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \
12 | --model-path liuhaotian/llava-v1.5-13b \
13 | --question-file ./playground/data/eval/seed_bench/llava-seed-bench.jsonl \
14 | --image-folder ./playground/data/eval/seed_bench \
15 | --answers-file ./playground/data/eval/seed_bench/answers/$CKPT/${CHUNKS}_${IDX}.jsonl \
16 | --num-chunks $CHUNKS \
17 | --chunk-idx $IDX \
18 | --temperature 0 \
19 | --conv-mode vicuna_v1 &
20 | done
21 |
22 | wait
23 |
24 | output_file=./playground/data/eval/seed_bench/answers/$CKPT/merge.jsonl
25 |
26 | # Clear out the output file if it exists.
27 | > "$output_file"
28 |
29 | # Loop through the indices and concatenate each file.
30 | for IDX in $(seq 0 $((CHUNKS-1))); do
31 | cat ./playground/data/eval/seed_bench/answers/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file"
32 | done
33 |
34 | # Evaluate
35 | python scripts/convert_seed_for_submission.py \
36 | --annotation-file ./playground/data/eval/seed_bench/SEED-Bench.json \
37 | --result-file $output_file \
38 | --result-upload-file ./playground/data/eval/seed_bench/answers_upload/llava-v1.5-13b.jsonl
39 |
40 |
--------------------------------------------------------------------------------
/scripts/v1_5/eval/sqa.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | python -m llava.eval.model_vqa_science \
4 | --model-path liuhaotian/llava-v1.5-13b \
5 | --question-file ./playground/data/eval/scienceqa/llava_test_CQM-A.json \
6 | --image-folder ./playground/data/eval/scienceqa/images/test \
7 | --answers-file ./playground/data/eval/scienceqa/answers/llava-v1.5-13b.jsonl \
8 | --single-pred-prompt \
9 | --temperature 0 \
10 | --conv-mode vicuna_v1
11 |
12 | python llava/eval/eval_science_qa.py \
13 | --base-dir ./playground/data/eval/scienceqa \
14 | --result-file ./playground/data/eval/scienceqa/answers/llava-v1.5-13b.jsonl \
15 | --output-file ./playground/data/eval/scienceqa/answers/llava-v1.5-13b_output.jsonl \
16 | --output-result ./playground/data/eval/scienceqa/answers/llava-v1.5-13b_result.json
17 |
--------------------------------------------------------------------------------
/scripts/v1_5/eval/textvqa.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | python -m llava.eval.model_vqa_loader \
4 | --model-path liuhaotian/llava-v1.5-13b \
5 | --question-file ./playground/data/eval/textvqa/llava_textvqa_val_v051_ocr.jsonl \
6 | --image-folder ./playground/data/eval/textvqa/train_images \
7 | --answers-file ./playground/data/eval/textvqa/answers/llava-v1.5-13b.jsonl \
8 | --temperature 0 \
9 | --conv-mode vicuna_v1
10 |
11 | python -m llava.eval.eval_textvqa \
12 | --annotation-file ./playground/data/eval/textvqa/TextVQA_0.5.1_val.json \
13 | --result-file ./playground/data/eval/textvqa/answers/llava-v1.5-13b.jsonl
14 |
--------------------------------------------------------------------------------
/scripts/v1_5/eval/vizwiz.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | python -m llava.eval.model_vqa_loader \
4 | --model-path liuhaotian/llava-v1.5-13b \
5 | --question-file ./playground/data/eval/vizwiz/llava_test.jsonl \
6 | --image-folder ./playground/data/eval/vizwiz/test \
7 | --answers-file ./playground/data/eval/vizwiz/answers/llava-v1.5-13b.jsonl \
8 | --temperature 0 \
9 | --conv-mode vicuna_v1
10 |
11 | python scripts/convert_vizwiz_for_submission.py \
12 | --annotation-file ./playground/data/eval/vizwiz/llava_test.jsonl \
13 | --result-file ./playground/data/eval/vizwiz/answers/llava-v1.5-13b.jsonl \
14 | --result-upload-file ./playground/data/eval/vizwiz/answers_upload/llava-v1.5-13b.json
15 |
--------------------------------------------------------------------------------
/scripts/v1_5/eval/vqav2.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | sudo chmod -R 777 ./playground/data/eval
4 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
5 | IFS=',' read -ra GPULIST <<< "$gpu_list"
6 |
7 | CHUNKS=${#GPULIST[@]}
8 |
9 | CKPT="llmga-7b"
10 | SPLIT="llava_vqav2_mscoco_test-dev2015"
11 |
12 | # for IDX in $(seq 0 $((CHUNKS-1))); do
13 | # CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python3 -m llmga.llava.eval.model_vqa_loader \
14 | # --model-path /mnt/bn/xiabinpaintv2/CVPR2024/res/LLMGA1.5-v94/llmga-vicuna-7b-v1.5-full-finetune \
15 | # --question-file ./playground/data/eval/vqav2/$SPLIT.jsonl \
16 | # --image-folder /mnt/bn/xiabinpaintv2/CVPR2024/rebuttal/eval_dataset/vqav2/test2015 \
17 | # --answers-file ./playground/data/eval/vqav2/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \
18 | # --num-chunks $CHUNKS \
19 | # --chunk-idx $IDX \
20 | # --temperature 0 \
21 | # --conv-mode vicuna_v1 &
22 | # done
23 |
24 | # wait
25 |
26 | output_file=./playground/data/eval/vqav2/answers/$SPLIT/$CKPT/merge.jsonl
27 |
28 | # Clear out the output file if it exists.
29 | > "$output_file"
30 |
31 | # Loop through the indices and concatenate each file.
32 | for IDX in $(seq 0 $((CHUNKS-1))); do
33 | cat ./playground/data/eval/vqav2/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file"
34 | done
35 |
36 | python3 scripts/convert_vqav2_for_submission.py --split $SPLIT --ckpt $CKPT
37 |
38 |
--------------------------------------------------------------------------------
/scripts/v1_5/finetune.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | deepspeed llava/train/train_mem.py \
4 | --deepspeed ./scripts/zero3.json \
5 | --model_name_or_path lmsys/vicuna-13b-v1.5 \
6 | --version v1 \
7 | --data_path ./playground/data/llava_v1_5_mix665k.json \
8 | --image_folder ./playground/data \
9 | --vision_tower openai/clip-vit-large-patch14-336 \
10 | --pretrain_mm_mlp_adapter ./checkpoints/llava-v1.5-13b-pretrain/mm_projector.bin \
11 | --mm_projector_type mlp2x_gelu \
12 | --mm_vision_select_layer -2 \
13 | --mm_use_im_start_end False \
14 | --mm_use_im_patch_token False \
15 | --image_aspect_ratio pad \
16 | --group_by_modality_length True \
17 | --bf16 True \
18 | --output_dir ./checkpoints/llava-v1.5-13b \
19 | --num_train_epochs 1 \
20 | --per_device_train_batch_size 16 \
21 | --per_device_eval_batch_size 4 \
22 | --gradient_accumulation_steps 1 \
23 | --evaluation_strategy "no" \
24 | --save_strategy "steps" \
25 | --save_steps 50000 \
26 | --save_total_limit 1 \
27 | --learning_rate 2e-5 \
28 | --weight_decay 0. \
29 | --warmup_ratio 0.03 \
30 | --lr_scheduler_type "cosine" \
31 | --logging_steps 1 \
32 | --tf32 True \
33 | --model_max_length 2048 \
34 | --gradient_checkpointing True \
35 | --dataloader_num_workers 4 \
36 | --lazy_preprocess True \
37 | --report_to wandb
38 |
--------------------------------------------------------------------------------
/scripts/v1_5/finetune_lora.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | deepspeed llava/train/train_mem.py \
4 | --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \
5 | --deepspeed ./scripts/zero3.json \
6 | --model_name_or_path lmsys/vicuna-13b-v1.5 \
7 | --version v1 \
8 | --data_path ./playground/data/llava_v1_5_mix665k.json \
9 | --image_folder ./playground/data \
10 | --vision_tower openai/clip-vit-large-patch14-336 \
11 | --pretrain_mm_mlp_adapter ./checkpoints/llava-v1.5-13b-pretrain/mm_projector.bin \
12 | --mm_projector_type mlp2x_gelu \
13 | --mm_vision_select_layer -2 \
14 | --mm_use_im_start_end False \
15 | --mm_use_im_patch_token False \
16 | --image_aspect_ratio pad \
17 | --group_by_modality_length True \
18 | --bf16 True \
19 | --output_dir ./checkpoints/llava-v1.5-13b-lora \
20 | --num_train_epochs 1 \
21 | --per_device_train_batch_size 16 \
22 | --per_device_eval_batch_size 4 \
23 | --gradient_accumulation_steps 1 \
24 | --evaluation_strategy "no" \
25 | --save_strategy "steps" \
26 | --save_steps 50000 \
27 | --save_total_limit 1 \
28 | --learning_rate 2e-4 \
29 | --weight_decay 0. \
30 | --warmup_ratio 0.03 \
31 | --lr_scheduler_type "cosine" \
32 | --logging_steps 1 \
33 | --tf32 True \
34 | --model_max_length 2048 \
35 | --gradient_checkpointing True \
36 | --dataloader_num_workers 4 \
37 | --lazy_preprocess True \
38 | --report_to wandb
39 |
--------------------------------------------------------------------------------
/scripts/v1_5/finetune_task.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | deepspeed llava/train/train_mem.py \
4 | --deepspeed ./scripts/zero3.json \
5 | --model_name_or_path liuhaotian/llava-v1.5-13b \
6 | --version v1 \
7 | --data_path ./playground/data/llava_v1_5_mix665k.json \
8 | --image_folder ./playground/data \
9 | --vision_tower openai/clip-vit-large-patch14-336 \
10 | --mm_projector_type mlp2x_gelu \
11 | --mm_vision_select_layer -2 \
12 | --mm_use_im_start_end False \
13 | --mm_use_im_patch_token False \
14 | --image_aspect_ratio pad \
15 | --group_by_modality_length True \
16 | --bf16 True \
17 | --output_dir ./checkpoints/llava-v1.5-13b-task \
18 | --num_train_epochs 1 \
19 | --per_device_train_batch_size 16 \
20 | --per_device_eval_batch_size 4 \
21 | --gradient_accumulation_steps 1 \
22 | --evaluation_strategy "no" \
23 | --save_strategy "steps" \
24 | --save_steps 50000 \
25 | --save_total_limit 1 \
26 | --learning_rate 2e-5 \
27 | --weight_decay 0. \
28 | --warmup_ratio 0.03 \
29 | --lr_scheduler_type "cosine" \
30 | --logging_steps 1 \
31 | --tf32 True \
32 | --model_max_length 2048 \
33 | --gradient_checkpointing True \
34 | --dataloader_num_workers 4 \
35 | --lazy_preprocess True \
36 | --report_to wandb
37 |
--------------------------------------------------------------------------------
/scripts/v1_5/finetune_task_lora.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | deepspeed llava/train/train_mem.py \
4 | --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \
5 | --deepspeed ./scripts/zero3.json \
6 | --model_name_or_path liuhaotian/llava-v1.5-13b \
7 | --version v1 \
8 | --data_path ./playground/data/llava_v1_5_mix665k.json \
9 | --image_folder ./playground/data \
10 | --vision_tower openai/clip-vit-large-patch14-336 \
11 | --mm_projector_type mlp2x_gelu \
12 | --mm_vision_select_layer -2 \
13 | --mm_use_im_start_end False \
14 | --mm_use_im_patch_token False \
15 | --image_aspect_ratio pad \
16 | --group_by_modality_length True \
17 | --bf16 True \
18 | --output_dir ./checkpoints/llava-v1.5-13b-task-lora \
19 | --num_train_epochs 1 \
20 | --per_device_train_batch_size 16 \
21 | --per_device_eval_batch_size 4 \
22 | --gradient_accumulation_steps 1 \
23 | --evaluation_strategy "no" \
24 | --save_strategy "steps" \
25 | --save_steps 50000 \
26 | --save_total_limit 1 \
27 | --learning_rate 2e-4 \
28 | --weight_decay 0. \
29 | --warmup_ratio 0.03 \
30 | --lr_scheduler_type "cosine" \
31 | --logging_steps 1 \
32 | --tf32 True \
33 | --model_max_length 2048 \
34 | --gradient_checkpointing True \
35 | --dataloader_num_workers 4 \
36 | --lazy_preprocess True \
37 | --report_to wandb
38 |
--------------------------------------------------------------------------------
/scripts/v1_5/pretrain.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | deepspeed llava/train/train_mem.py \
4 | --deepspeed ./scripts/zero2.json \
5 | --model_name_or_path lmsys/vicuna-13b-v1.5 \
6 | --version plain \
7 | --data_path ./playground/data/LLaVA-Pretrain/blip_laion_cc_sbu_558k.json \
8 | --image_folder ./playground/data/LLaVA-Pretrain/images \
9 | --vision_tower openai/clip-vit-large-patch14-336 \
10 | --mm_projector_type mlp2x_gelu \
11 | --tune_mm_mlp_adapter True \
12 | --mm_vision_select_layer -2 \
13 | --mm_use_im_start_end False \
14 | --mm_use_im_patch_token False \
15 | --bf16 True \
16 | --output_dir ./checkpoints/llava-v1.5-13b-pretrain \
17 | --num_train_epochs 1 \
18 | --per_device_train_batch_size 32 \
19 | --per_device_eval_batch_size 4 \
20 | --gradient_accumulation_steps 1 \
21 | --evaluation_strategy "no" \
22 | --save_strategy "steps" \
23 | --save_steps 24000 \
24 | --save_total_limit 1 \
25 | --learning_rate 1e-3 \
26 | --weight_decay 0. \
27 | --warmup_ratio 0.03 \
28 | --lr_scheduler_type "cosine" \
29 | --logging_steps 1 \
30 | --tf32 True \
31 | --model_max_length 2048 \
32 | --gradient_checkpointing True \
33 | --dataloader_num_workers 4 \
34 | --lazy_preprocess True \
35 | --report_to wandb
36 |
--------------------------------------------------------------------------------
/scripts/zero2.json:
--------------------------------------------------------------------------------
1 | {
2 | "fp16": {
3 | "enabled": "auto",
4 | "loss_scale": 0,
5 | "loss_scale_window": 1000,
6 | "initial_scale_power": 16,
7 | "hysteresis": 2,
8 | "min_loss_scale": 1
9 | },
10 | "bf16": {
11 | "enabled": "auto"
12 | },
13 | "train_micro_batch_size_per_gpu": "auto",
14 | "train_batch_size": "auto",
15 | "gradient_accumulation_steps": "auto",
16 | "zero_optimization": {
17 | "stage": 2,
18 | "overlap_comm": true,
19 | "contiguous_gradients": true,
20 | "sub_group_size": 1e9,
21 | "reduce_bucket_size": "auto"
22 | }
23 | }
--------------------------------------------------------------------------------
/scripts/zero3.json:
--------------------------------------------------------------------------------
1 | {
2 | "fp16": {
3 | "enabled": "auto",
4 | "loss_scale": 0,
5 | "loss_scale_window": 1000,
6 | "initial_scale_power": 16,
7 | "hysteresis": 2,
8 | "min_loss_scale": 1
9 | },
10 | "bf16": {
11 | "enabled": "auto"
12 | },
13 | "train_micro_batch_size_per_gpu": "auto",
14 | "train_batch_size": "auto",
15 | "gradient_accumulation_steps": "auto",
16 | "zero_optimization": {
17 | "stage": 3,
18 | "overlap_comm": true,
19 | "contiguous_gradients": true,
20 | "sub_group_size": 1e9,
21 | "reduce_bucket_size": "auto",
22 | "stage3_prefetch_bucket_size": "auto",
23 | "stage3_param_persistence_threshold": "auto",
24 | "stage3_max_live_parameters": 1e9,
25 | "stage3_max_reuse_distance": 1e9,
26 | "stage3_gather_16bit_weights_on_model_save": true
27 | }
28 | }
--------------------------------------------------------------------------------
/scripts/zero3_offload.json:
--------------------------------------------------------------------------------
1 | {
2 | "fp16": {
3 | "enabled": "auto",
4 | "loss_scale": 0,
5 | "loss_scale_window": 1000,
6 | "initial_scale_power": 16,
7 | "hysteresis": 2,
8 | "min_loss_scale": 1
9 | },
10 | "bf16": {
11 | "enabled": "auto"
12 | },
13 | "optimizer": {
14 | "type": "AdamW",
15 | "params": {
16 | "lr": "auto",
17 | "betas": "auto",
18 | "eps": "auto",
19 | "weight_decay": "auto"
20 | }
21 | },
22 | "scheduler": {
23 | "type": "WarmupLR",
24 | "params": {
25 | "warmup_min_lr": "auto",
26 | "warmup_max_lr": "auto",
27 | "warmup_num_steps": "auto"
28 | }
29 | },
30 | "zero_optimization": {
31 | "stage": 3,
32 | "offload_optimizer": {
33 | "device": "cpu",
34 | "pin_memory": true
35 | },
36 | "offload_param": {
37 | "device": "cpu",
38 | "pin_memory": true
39 | },
40 | "overlap_comm": true,
41 | "contiguous_gradients": true,
42 | "sub_group_size": 1e9,
43 | "reduce_bucket_size": "auto",
44 | "stage3_prefetch_bucket_size": "auto",
45 | "stage3_param_persistence_threshold": "auto",
46 | "stage3_max_live_parameters": 1e9,
47 | "stage3_max_reuse_distance": 1e9,
48 | "gather_16bit_weights_on_model_save": true
49 | },
50 | "gradient_accumulation_steps": "auto",
51 | "gradient_clipping": "auto",
52 | "train_batch_size": "auto",
53 | "train_micro_batch_size_per_gpu": "auto",
54 | "steps_per_print": 1e5,
55 | "wall_clock_breakdown": false
56 | }
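
The "auto" entries in these DeepSpeed configs are placeholders: when a training script passes the JSON file via --deepspeed, the HuggingFace Trainer fills them in from its own arguments at launch time. A minimal sketch of that wiring (output_dir is a placeholder):

    from transformers import TrainingArguments

    # these values flow into the "auto" fields of the DeepSpeed config:
    #   per_device_train_batch_size  -> train_micro_batch_size_per_gpu
    #   gradient_accumulation_steps  -> gradient_accumulation_steps
    #   learning_rate / warmup_ratio -> optimizer and scheduler params
    #   bf16                         -> bf16.enabled
    args = TrainingArguments(
        output_dir="./checkpoints/example",        # placeholder
        per_device_train_batch_size=16,
        gradient_accumulation_steps=1,
        learning_rate=2e-5,
        warmup_ratio=0.03,
        bf16=True,
        deepspeed="./scripts/zero3_offload.json",
    )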
--------------------------------------------------------------------------------