├── .gitignore ├── 0507_freecond_bays.ipynb ├── CI_visualization.ipynb ├── FreeCondDemo_video.mp4 ├── README.md ├── __init__.py ├── brushnet └── diffusers │ ├── __init__.py │ ├── commands │ ├── __init__.py │ ├── diffusers_cli.py │ ├── env.py │ └── fp16_safetensors.py │ ├── configuration_utils.py │ ├── dependency_versions_check.py │ ├── dependency_versions_table.py │ ├── experimental │ ├── README.md │ ├── __init__.py │ └── rl │ │ ├── __init__.py │ │ └── value_guided_sampling.py │ ├── image_processor.py │ ├── loaders │ ├── __init__.py │ ├── autoencoder.py │ ├── controlnet.py │ ├── ip_adapter.py │ ├── lora.py │ ├── lora_conversion_utils.py │ ├── peft.py │ ├── single_file.py │ ├── single_file_utils.py │ ├── textual_inversion.py │ ├── unet.py │ └── utils.py │ ├── models │ ├── README.md │ ├── __init__.py │ ├── activations.py │ ├── adapter.py │ ├── attention.py │ ├── attention_flax.py │ ├── attention_processor.py │ ├── autoencoders │ │ ├── __init__.py │ │ ├── autoencoder_asym_kl.py │ │ ├── autoencoder_kl.py │ │ ├── autoencoder_kl_temporal_decoder.py │ │ ├── autoencoder_tiny.py │ │ ├── consistency_decoder_vae.py │ │ └── vae.py │ ├── brushnet.py │ ├── controlnet.py │ ├── controlnet_flax.py │ ├── downsampling.py │ ├── dual_transformer_2d.py │ ├── embeddings.py │ ├── embeddings_flax.py │ ├── lora.py │ ├── modeling_flax_pytorch_utils.py │ ├── modeling_flax_utils.py │ ├── modeling_outputs.py │ ├── modeling_pytorch_flax_utils.py │ ├── modeling_utils.py │ ├── normalization.py │ ├── prior_transformer.py │ ├── resnet.py │ ├── resnet_flax.py │ ├── t5_film_transformer.py │ ├── transformer_2d.py │ ├── transformer_temporal.py │ ├── transformers │ │ ├── __init__.py │ │ ├── dual_transformer_2d.py │ │ ├── prior_transformer.py │ │ ├── t5_film_transformer.py │ │ ├── transformer_2d.py │ │ └── transformer_temporal.py │ ├── unet_1d.py │ ├── unet_1d_blocks.py │ ├── unet_2d.py │ ├── unet_2d_blocks.py │ ├── unet_2d_condition.py │ ├── unets │ │ ├── __init__.py │ │ ├── unet_1d.py │ │ ├── unet_1d_blocks.py │ │ ├── unet_2d.py │ │ ├── unet_2d_blocks.py │ │ ├── unet_2d_blocks_flax.py │ │ ├── unet_2d_condition.py │ │ ├── unet_2d_condition_flax.py │ │ ├── unet_3d_blocks.py │ │ ├── unet_3d_condition.py │ │ ├── unet_i2vgen_xl.py │ │ ├── unet_kandinsky3.py │ │ ├── unet_motion_model.py │ │ ├── unet_spatio_temporal_condition.py │ │ ├── unet_stable_cascade.py │ │ └── uvit_2d.py │ ├── upsampling.py │ ├── vae_flax.py │ └── vq_model.py │ ├── optimization.py │ ├── pipelines │ ├── README.md │ ├── __init__.py │ ├── amused │ │ ├── __init__.py │ │ ├── pipeline_amused.py │ │ ├── pipeline_amused_img2img.py │ │ └── pipeline_amused_inpaint.py │ ├── animatediff │ │ ├── __init__.py │ │ ├── pipeline_animatediff.py │ │ ├── pipeline_animatediff_video2video.py │ │ └── pipeline_output.py │ ├── audioldm │ │ ├── __init__.py │ │ └── pipeline_audioldm.py │ ├── audioldm2 │ │ ├── __init__.py │ │ ├── modeling_audioldm2.py │ │ └── pipeline_audioldm2.py │ ├── auto_pipeline.py │ ├── blip_diffusion │ │ ├── __init__.py │ │ ├── blip_image_processing.py │ │ ├── modeling_blip2.py │ │ ├── modeling_ctx_clip.py │ │ └── pipeline_blip_diffusion.py │ ├── brushnet │ │ ├── __init__.py │ │ ├── pipeline_brushnet.py │ │ └── pipeline_brushnet_sd_xl.py │ ├── consistency_models │ │ ├── __init__.py │ │ └── pipeline_consistency_models.py │ ├── controlnet │ │ ├── __init__.py │ │ ├── multicontrolnet.py │ │ ├── pipeline_controlnet.py │ │ ├── pipeline_controlnet_blip_diffusion.py │ │ ├── pipeline_controlnet_img2img.py │ │ ├── pipeline_controlnet_inpaint.py │ │ ├── 
pipeline_controlnet_inpaint_sd_xl.py │ │ ├── pipeline_controlnet_sd_xl.py │ │ ├── pipeline_controlnet_sd_xl_img2img.py │ │ └── pipeline_flax_controlnet.py │ ├── dance_diffusion │ │ ├── __init__.py │ │ └── pipeline_dance_diffusion.py │ ├── ddim │ │ ├── __init__.py │ │ └── pipeline_ddim.py │ ├── ddpm │ │ ├── __init__.py │ │ └── pipeline_ddpm.py │ ├── deepfloyd_if │ │ ├── __init__.py │ │ ├── pipeline_if.py │ │ ├── pipeline_if_img2img.py │ │ ├── pipeline_if_img2img_superresolution.py │ │ ├── pipeline_if_inpainting.py │ │ ├── pipeline_if_inpainting_superresolution.py │ │ ├── pipeline_if_superresolution.py │ │ ├── pipeline_output.py │ │ ├── safety_checker.py │ │ ├── timesteps.py │ │ └── watermark.py │ ├── deprecated │ │ ├── README.md │ │ ├── __init__.py │ │ ├── alt_diffusion │ │ │ ├── __init__.py │ │ │ ├── modeling_roberta_series.py │ │ │ ├── pipeline_alt_diffusion.py │ │ │ ├── pipeline_alt_diffusion_img2img.py │ │ │ └── pipeline_output.py │ │ ├── audio_diffusion │ │ │ ├── __init__.py │ │ │ ├── mel.py │ │ │ └── pipeline_audio_diffusion.py │ │ ├── latent_diffusion_uncond │ │ │ ├── __init__.py │ │ │ └── pipeline_latent_diffusion_uncond.py │ │ ├── pndm │ │ │ ├── __init__.py │ │ │ └── pipeline_pndm.py │ │ ├── repaint │ │ │ ├── __init__.py │ │ │ └── pipeline_repaint.py │ │ ├── score_sde_ve │ │ │ ├── __init__.py │ │ │ └── pipeline_score_sde_ve.py │ │ ├── spectrogram_diffusion │ │ │ ├── __init__.py │ │ │ ├── continuous_encoder.py │ │ │ ├── midi_utils.py │ │ │ ├── notes_encoder.py │ │ │ └── pipeline_spectrogram_diffusion.py │ │ ├── stable_diffusion_variants │ │ │ ├── __init__.py │ │ │ ├── pipeline_cycle_diffusion.py │ │ │ ├── pipeline_onnx_stable_diffusion_inpaint_legacy.py │ │ │ ├── pipeline_stable_diffusion_inpaint_legacy.py │ │ │ ├── pipeline_stable_diffusion_model_editing.py │ │ │ ├── pipeline_stable_diffusion_paradigms.py │ │ │ └── pipeline_stable_diffusion_pix2pix_zero.py │ │ ├── stochastic_karras_ve │ │ │ ├── __init__.py │ │ │ └── pipeline_stochastic_karras_ve.py │ │ ├── versatile_diffusion │ │ │ ├── __init__.py │ │ │ ├── modeling_text_unet.py │ │ │ ├── pipeline_versatile_diffusion.py │ │ │ ├── pipeline_versatile_diffusion_dual_guided.py │ │ │ ├── pipeline_versatile_diffusion_image_variation.py │ │ │ └── pipeline_versatile_diffusion_text_to_image.py │ │ └── vq_diffusion │ │ │ ├── __init__.py │ │ │ └── pipeline_vq_diffusion.py │ ├── dit │ │ ├── __init__.py │ │ └── pipeline_dit.py │ ├── free_init_utils.py │ ├── i2vgen_xl │ │ ├── __init__.py │ │ └── pipeline_i2vgen_xl.py │ ├── kandinsky │ │ ├── __init__.py │ │ ├── pipeline_kandinsky.py │ │ ├── pipeline_kandinsky_combined.py │ │ ├── pipeline_kandinsky_img2img.py │ │ ├── pipeline_kandinsky_inpaint.py │ │ ├── pipeline_kandinsky_prior.py │ │ └── text_encoder.py │ ├── kandinsky2_2 │ │ ├── __init__.py │ │ ├── pipeline_kandinsky2_2.py │ │ ├── pipeline_kandinsky2_2_combined.py │ │ ├── pipeline_kandinsky2_2_controlnet.py │ │ ├── pipeline_kandinsky2_2_controlnet_img2img.py │ │ ├── pipeline_kandinsky2_2_img2img.py │ │ ├── pipeline_kandinsky2_2_inpainting.py │ │ ├── pipeline_kandinsky2_2_prior.py │ │ └── pipeline_kandinsky2_2_prior_emb2emb.py │ ├── kandinsky3 │ │ ├── __init__.py │ │ ├── convert_kandinsky3_unet.py │ │ ├── pipeline_kandinsky3.py │ │ └── pipeline_kandinsky3_img2img.py │ ├── latent_consistency_models │ │ ├── __init__.py │ │ ├── pipeline_latent_consistency_img2img.py │ │ └── pipeline_latent_consistency_text2img.py │ ├── latent_diffusion │ │ ├── __init__.py │ │ ├── pipeline_latent_diffusion.py │ │ └── pipeline_latent_diffusion_superresolution.py │ 
├── musicldm │ │ ├── __init__.py │ │ └── pipeline_musicldm.py │ ├── onnx_utils.py │ ├── paint_by_example │ │ ├── __init__.py │ │ ├── image_encoder.py │ │ └── pipeline_paint_by_example.py │ ├── pia │ │ ├── __init__.py │ │ └── pipeline_pia.py │ ├── pipeline_flax_utils.py │ ├── pipeline_loading_utils.py │ ├── pipeline_utils.py │ ├── pixart_alpha │ │ ├── __init__.py │ │ └── pipeline_pixart_alpha.py │ ├── semantic_stable_diffusion │ │ ├── __init__.py │ │ ├── pipeline_output.py │ │ └── pipeline_semantic_stable_diffusion.py │ ├── shap_e │ │ ├── __init__.py │ │ ├── camera.py │ │ ├── pipeline_shap_e.py │ │ ├── pipeline_shap_e_img2img.py │ │ └── renderer.py │ ├── stable_cascade │ │ ├── __init__.py │ │ ├── pipeline_stable_cascade.py │ │ ├── pipeline_stable_cascade_combined.py │ │ └── pipeline_stable_cascade_prior.py │ ├── stable_diffusion │ │ ├── README.md │ │ ├── __init__.py │ │ ├── clip_image_project_model.py │ │ ├── convert_from_ckpt.py │ │ ├── pipeline_flax_stable_diffusion.py │ │ ├── pipeline_flax_stable_diffusion_img2img.py │ │ ├── pipeline_flax_stable_diffusion_inpaint.py │ │ ├── pipeline_onnx_stable_diffusion.py │ │ ├── pipeline_onnx_stable_diffusion_img2img.py │ │ ├── pipeline_onnx_stable_diffusion_inpaint.py │ │ ├── pipeline_onnx_stable_diffusion_upscale.py │ │ ├── pipeline_output.py │ │ ├── pipeline_stable_diffusion.py │ │ ├── pipeline_stable_diffusion_depth2img.py │ │ ├── pipeline_stable_diffusion_image_variation.py │ │ ├── pipeline_stable_diffusion_img2img.py │ │ ├── pipeline_stable_diffusion_inpaint.py │ │ ├── pipeline_stable_diffusion_instruct_pix2pix.py │ │ ├── pipeline_stable_diffusion_latent_upscale.py │ │ ├── pipeline_stable_diffusion_upscale.py │ │ ├── pipeline_stable_unclip.py │ │ ├── pipeline_stable_unclip_img2img.py │ │ ├── safety_checker.py │ │ ├── safety_checker_flax.py │ │ └── stable_unclip_image_normalizer.py │ ├── stable_diffusion_attend_and_excite │ │ ├── __init__.py │ │ └── pipeline_stable_diffusion_attend_and_excite.py │ ├── stable_diffusion_diffedit │ │ ├── __init__.py │ │ └── pipeline_stable_diffusion_diffedit.py │ ├── stable_diffusion_gligen │ │ ├── __init__.py │ │ ├── pipeline_stable_diffusion_gligen.py │ │ └── pipeline_stable_diffusion_gligen_text_image.py │ ├── stable_diffusion_k_diffusion │ │ ├── __init__.py │ │ ├── pipeline_stable_diffusion_k_diffusion.py │ │ └── pipeline_stable_diffusion_xl_k_diffusion.py │ ├── stable_diffusion_ldm3d │ │ ├── __init__.py │ │ └── pipeline_stable_diffusion_ldm3d.py │ ├── stable_diffusion_panorama │ │ ├── __init__.py │ │ └── pipeline_stable_diffusion_panorama.py │ ├── stable_diffusion_safe │ │ ├── __init__.py │ │ ├── pipeline_output.py │ │ ├── pipeline_stable_diffusion_safe.py │ │ └── safety_checker.py │ ├── stable_diffusion_sag │ │ ├── __init__.py │ │ └── pipeline_stable_diffusion_sag.py │ ├── stable_diffusion_xl │ │ ├── __init__.py │ │ ├── pipeline_flax_stable_diffusion_xl.py │ │ ├── pipeline_output.py │ │ ├── pipeline_stable_diffusion_xl.py │ │ ├── pipeline_stable_diffusion_xl_img2img.py │ │ ├── pipeline_stable_diffusion_xl_inpaint.py │ │ ├── pipeline_stable_diffusion_xl_instruct_pix2pix.py │ │ └── watermark.py │ ├── stable_video_diffusion │ │ ├── __init__.py │ │ └── pipeline_stable_video_diffusion.py │ ├── t2i_adapter │ │ ├── __init__.py │ │ ├── pipeline_stable_diffusion_adapter.py │ │ └── pipeline_stable_diffusion_xl_adapter.py │ ├── text_to_video_synthesis │ │ ├── __init__.py │ │ ├── pipeline_output.py │ │ ├── pipeline_text_to_video_synth.py │ │ ├── pipeline_text_to_video_synth_img2img.py │ │ ├── 
pipeline_text_to_video_zero.py │ │ └── pipeline_text_to_video_zero_sdxl.py │ ├── unclip │ │ ├── __init__.py │ │ ├── pipeline_unclip.py │ │ ├── pipeline_unclip_image_variation.py │ │ └── text_proj.py │ ├── unidiffuser │ │ ├── __init__.py │ │ ├── modeling_text_decoder.py │ │ ├── modeling_uvit.py │ │ └── pipeline_unidiffuser.py │ └── wuerstchen │ │ ├── __init__.py │ │ ├── modeling_paella_vq_model.py │ │ ├── modeling_wuerstchen_common.py │ │ ├── modeling_wuerstchen_diffnext.py │ │ ├── modeling_wuerstchen_prior.py │ │ ├── pipeline_wuerstchen.py │ │ ├── pipeline_wuerstchen_combined.py │ │ └── pipeline_wuerstchen_prior.py │ ├── schedulers │ ├── README.md │ ├── __init__.py │ ├── deprecated │ │ ├── __init__.py │ │ ├── scheduling_karras_ve.py │ │ └── scheduling_sde_vp.py │ ├── scheduling_amused.py │ ├── scheduling_consistency_decoder.py │ ├── scheduling_consistency_models.py │ ├── scheduling_ddim.py │ ├── scheduling_ddim_flax.py │ ├── scheduling_ddim_inverse.py │ ├── scheduling_ddim_parallel.py │ ├── scheduling_ddpm.py │ ├── scheduling_ddpm_flax.py │ ├── scheduling_ddpm_parallel.py │ ├── scheduling_ddpm_wuerstchen.py │ ├── scheduling_deis_multistep.py │ ├── scheduling_dpmsolver_multistep.py │ ├── scheduling_dpmsolver_multistep_flax.py │ ├── scheduling_dpmsolver_multistep_inverse.py │ ├── scheduling_dpmsolver_sde.py │ ├── scheduling_dpmsolver_singlestep.py │ ├── scheduling_edm_dpmsolver_multistep.py │ ├── scheduling_edm_euler.py │ ├── scheduling_euler_ancestral_discrete.py │ ├── scheduling_euler_discrete.py │ ├── scheduling_euler_discrete_flax.py │ ├── scheduling_heun_discrete.py │ ├── scheduling_ipndm.py │ ├── scheduling_k_dpm_2_ancestral_discrete.py │ ├── scheduling_k_dpm_2_discrete.py │ ├── scheduling_karras_ve_flax.py │ ├── scheduling_lcm.py │ ├── scheduling_lms_discrete.py │ ├── scheduling_lms_discrete_flax.py │ ├── scheduling_pndm.py │ ├── scheduling_pndm_flax.py │ ├── scheduling_repaint.py │ ├── scheduling_sasolver.py │ ├── scheduling_sde_ve.py │ ├── scheduling_sde_ve_flax.py │ ├── scheduling_tcd.py │ ├── scheduling_unclip.py │ ├── scheduling_unipc_multistep.py │ ├── scheduling_utils.py │ ├── scheduling_utils_flax.py │ └── scheduling_vq_diffusion.py │ ├── training_utils.py │ └── utils │ ├── __init__.py │ ├── accelerate_utils.py │ ├── constants.py │ ├── deprecation_utils.py │ ├── doc_utils.py │ ├── dummy_flax_and_transformers_objects.py │ ├── dummy_flax_objects.py │ ├── dummy_note_seq_objects.py │ ├── dummy_onnx_objects.py │ ├── dummy_pt_objects.py │ ├── dummy_torch_and_librosa_objects.py │ ├── dummy_torch_and_scipy_objects.py │ ├── dummy_torch_and_torchsde_objects.py │ ├── dummy_torch_and_transformers_and_k_diffusion_objects.py │ ├── dummy_torch_and_transformers_and_onnx_objects.py │ ├── dummy_torch_and_transformers_objects.py │ ├── dummy_transformers_and_torch_and_note_seq_objects.py │ ├── dynamic_modules_utils.py │ ├── export_utils.py │ ├── hub_utils.py │ ├── import_utils.py │ ├── loading_utils.py │ ├── logging.py │ ├── model_card_template.md │ ├── outputs.py │ ├── peft_utils.py │ ├── pil_utils.py │ ├── state_dict_utils.py │ ├── testing_utils.py │ ├── torch_utils.py │ └── versions.py ├── data └── .gitkeep ├── demo_data ├── img_1_2.jpg └── mask_1_2.png ├── demo_out ├── CI_visualization.png ├── all_data_mean.png ├── christmas quokka.png ├── christmas_freecond.png ├── ganster_otter.png ├── github_teaser.jpg ├── halloween_quokka.png ├── paper_teaser.jpg ├── preset.png ├── self_attn.png ├── self_attn_multi.png ├── spy_otter.png └── tokens_mean.png ├── flux ├── controlnet_flux.py ├── 
pipeline_flux_controlnet_inpaint.py └── transformer_flux.py ├── freecond_app.py ├── freecond_demo.gif ├── freecond_demo.ipynb ├── freecond_evaluation.py ├── freecond_src ├── __init__.py ├── freecond.py ├── freecond_controlnet.py ├── freecond_optimizer.py ├── freecond_utils.py ├── freq.py └── vis_attn_control.py ├── freecond_with_optimization.ipynb ├── gp_model.pkl ├── hdpainter_src ├── __init__.py ├── config │ ├── ddpm │ │ ├── v1.yaml │ │ └── v2-upsample.yaml │ ├── encoders │ │ ├── clip.yaml │ │ └── openclip.yaml │ ├── unet │ │ ├── inpainting │ │ │ ├── v1.yaml │ │ │ └── v2.yaml │ │ └── upsample │ │ │ └── v2.yaml │ ├── vae-upsample.yaml │ └── vae.yaml ├── methods │ ├── __init__.py │ ├── fc_rasg.py │ ├── rasg.py │ ├── sd.py │ └── sr.py ├── models │ ├── __init__.py │ ├── common.py │ ├── inpainting.py │ ├── sam.py │ └── sd2_sr.py ├── smplfusion │ ├── __init__.py │ ├── ddim.py │ ├── models │ │ ├── __init__.py │ │ ├── encoders │ │ │ ├── clip_embedder.py │ │ │ └── open_clip_embedder.py │ │ ├── unet.py │ │ ├── util.py │ │ └── vae.py │ ├── modules │ │ ├── __init__.py │ │ ├── attention │ │ │ ├── __init__.py │ │ │ ├── basic_transformer_block.py │ │ │ ├── cross_attention.py │ │ │ ├── feed_forward.py │ │ │ ├── memory_efficient_cross_attention.py │ │ │ └── spatial_transformer.py │ │ ├── autoencoder.py │ │ ├── distributions.py │ │ ├── ema.py │ │ └── util.py │ ├── patches │ │ ├── __init__.py │ │ ├── attentionpatch │ │ │ ├── __init__.py │ │ │ ├── default.py │ │ │ └── painta.py │ │ ├── router.py │ │ └── transformerpatch │ │ │ ├── __init__.py │ │ │ ├── default.py │ │ │ └── painta.py │ ├── scheduler.py │ ├── share.py │ ├── util.py │ └── utils │ │ ├── __init__.py │ │ ├── input_image.py │ │ ├── input_mask.py │ │ └── input_shape.py └── utils │ ├── __init__.py │ ├── convert_diffusers_to_sd.py │ ├── iimage.py │ └── scores.py ├── powerpaint ├── app.py ├── models │ ├── BrushNet_CA.py │ ├── __init__.py │ ├── unet_2d_blocks.py │ └── unet_2d_condition.py ├── pipelines │ ├── __init__.py │ ├── pipeline_PowerPaint.py │ ├── pipeline_PowerPaint_Brushnet_CA.py │ └── pipeline_PowerPaint_ControlNet.py ├── powerpaint_freecond.py └── utils │ ├── __init__.py │ └── utils.py ├── requirements.txt ├── self_attention_visualization.ipynb └── t2v_metrics ├── __init__.py ├── clipscore.py ├── constants.py ├── itmscore.py ├── models ├── __init__.py ├── clipscore_models │ ├── __init__.py │ ├── blip2_itc_model.py │ ├── clip_model.py │ ├── hpsv2_model.py │ └── pickscore_model.py ├── itmscore_models │ ├── __init__.py │ ├── blip2_itm_model.py │ └── image_reward_model.py ├── model.py └── vqascore_models │ ├── __init__.py │ ├── clip_t5 │ ├── __init__.py │ └── model │ │ ├── __init__.py │ │ ├── language_model │ │ └── clip_t5.py │ │ ├── multimodal_encoder │ │ ├── builder.py │ │ └── clip_encoder.py │ │ └── multimodal_projector │ │ └── builder.py │ ├── clip_t5_model.py │ ├── gpt4v_model.py │ ├── instructblip_model.py │ ├── lavis │ ├── __init__.py │ ├── common │ │ ├── config.py │ │ ├── dist_utils.py │ │ ├── gradcam.py │ │ ├── logger.py │ │ ├── optims.py │ │ ├── registry.py │ │ ├── utils.py │ │ └── vqa_tools │ │ │ ├── __init__.py │ │ │ ├── vqa.py │ │ │ └── vqa_eval.py │ ├── configs │ │ ├── datasets │ │ │ ├── aokvqa │ │ │ │ └── defaults.yaml │ │ │ ├── avsd │ │ │ │ └── defaults_dial.yaml │ │ │ ├── coco │ │ │ │ ├── defaults_cap.yaml │ │ │ │ ├── defaults_ret.yaml │ │ │ │ ├── defaults_vqa.yaml │ │ │ │ └── eval_vqa.yaml │ │ │ ├── conceptual_caption │ │ │ │ ├── defaults_12m.yaml │ │ │ │ └── defaults_3m.yaml │ │ │ ├── didemo │ │ │ │ └── defaults_ret.yaml │ │ │ 
├── flickr30k │ │ │ │ └── defaults.yaml │ │ │ ├── gqa │ │ │ │ ├── balanced_testdev.yaml │ │ │ │ ├── balanced_val.yaml │ │ │ │ └── defaults.yaml │ │ │ ├── imagenet │ │ │ │ └── defaults.yaml │ │ │ ├── laion │ │ │ │ └── defaults_2B_multi.yaml │ │ │ ├── msrvtt │ │ │ │ ├── defaults_cap.yaml │ │ │ │ ├── defaults_qa.yaml │ │ │ │ └── defaults_ret.yaml │ │ │ ├── msvd │ │ │ │ ├── defaults_cap.yaml │ │ │ │ └── defaults_qa.yaml │ │ │ ├── nlvr │ │ │ │ └── defaults.yaml │ │ │ ├── nocaps │ │ │ │ └── defaults.yaml │ │ │ ├── okvqa │ │ │ │ └── defaults.yaml │ │ │ ├── sbu_caption │ │ │ │ └── defaults.yaml │ │ │ ├── snli_ve │ │ │ │ └── defaults.yaml │ │ │ ├── vatex │ │ │ │ └── defaults_cap.yaml │ │ │ └── vg │ │ │ │ ├── defaults_caption.yaml │ │ │ │ └── defaults_vqa.yaml │ │ ├── default.yaml │ │ └── models │ │ │ ├── albef_classification_ve.yaml │ │ │ ├── albef_feature_extractor.yaml │ │ │ ├── albef_nlvr.yaml │ │ │ ├── albef_pretrain_base.yaml │ │ │ ├── albef_retrieval_coco.yaml │ │ │ ├── albef_retrieval_flickr.yaml │ │ │ ├── albef_vqav2.yaml │ │ │ ├── alpro_qa_msrvtt.yaml │ │ │ ├── alpro_qa_msvd.yaml │ │ │ ├── alpro_retrieval_didemo.yaml │ │ │ ├── alpro_retrieval_msrvtt.yaml │ │ │ ├── bert_config.json │ │ │ ├── bert_config_alpro.json │ │ │ ├── blip2 │ │ │ ├── blip2_caption_flant5xl.yaml │ │ │ ├── blip2_caption_opt2.7b.yaml │ │ │ ├── blip2_caption_opt6.7b.yaml │ │ │ ├── blip2_coco.yaml │ │ │ ├── blip2_instruct_flant5xl.yaml │ │ │ ├── blip2_instruct_flant5xxl.yaml │ │ │ ├── blip2_instruct_vicuna13b.yaml │ │ │ ├── blip2_instruct_vicuna7b.yaml │ │ │ ├── blip2_pretrain.yaml │ │ │ ├── blip2_pretrain_flant5xl.yaml │ │ │ ├── blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml │ │ │ ├── blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml │ │ │ ├── blip2_pretrain_flant5xl_vitL.yaml │ │ │ ├── blip2_pretrain_flant5xxl.yaml │ │ │ ├── blip2_pretrain_opt2.7b.yaml │ │ │ ├── blip2_pretrain_opt6.7b.yaml │ │ │ ├── blip2_pretrain_vitL.yaml │ │ │ ├── blip2_vicuna13b.yaml │ │ │ └── blip2_vicuna7b.yaml │ │ │ ├── blip_caption_base_coco.yaml │ │ │ ├── blip_caption_large_coco.yaml │ │ │ ├── blip_classification_base.yaml │ │ │ ├── blip_feature_extractor_base.yaml │ │ │ ├── blip_itm_base.yaml │ │ │ ├── blip_itm_large.yaml │ │ │ ├── blip_nlvr.yaml │ │ │ ├── blip_pretrain_base.yaml │ │ │ ├── blip_pretrain_large.yaml │ │ │ ├── blip_retrieval_coco.yaml │ │ │ ├── blip_retrieval_flickr.yaml │ │ │ ├── blip_vqa_aokvqa.yaml │ │ │ ├── blip_vqa_okvqa.yaml │ │ │ ├── blip_vqav2.yaml │ │ │ ├── clip │ │ │ ├── RN101-quickgelu.json │ │ │ ├── RN101.json │ │ │ ├── RN50-quickgelu.json │ │ │ ├── RN50.json │ │ │ ├── RN50x16.json │ │ │ ├── RN50x4.json │ │ │ ├── ViT-B-16-plus-240.json │ │ │ ├── ViT-B-16-plus.json │ │ │ ├── ViT-B-16.json │ │ │ ├── ViT-B-32-plus-256.json │ │ │ ├── ViT-B-32-quickgelu.json │ │ │ ├── ViT-B-32.json │ │ │ ├── ViT-H-14.json │ │ │ ├── ViT-H-16.json │ │ │ ├── ViT-L-14-280.json │ │ │ ├── ViT-L-14-336.json │ │ │ ├── ViT-L-14.json │ │ │ ├── ViT-L-16-320.json │ │ │ ├── ViT-L-16.json │ │ │ ├── ViT-g-14.json │ │ │ ├── timm-efficientnetv2_rw_s.json │ │ │ ├── timm-resnet50d.json │ │ │ ├── timm-resnetaa50d.json │ │ │ ├── timm-resnetblur50.json │ │ │ ├── timm-swin_base_patch4_window7_224.json │ │ │ ├── timm-vit_base_patch16_224.json │ │ │ ├── timm-vit_base_patch32_224.json │ │ │ └── timm-vit_small_patch16_224.json │ │ │ ├── clip_resnet50.yaml │ │ │ ├── clip_vit_base16.yaml │ │ │ ├── clip_vit_base32.yaml │ │ │ ├── clip_vit_large14.yaml │ │ │ ├── clip_vit_large14_336.yaml │ │ │ ├── gpt_dialogue_base.yaml │ │ │ ├── img2prompt-vqa │ │ │ └── 
img2prompt_vqa_base.yaml │ │ │ ├── med_config.json │ │ │ ├── med_config_albef.json │ │ │ ├── med_large_config.json │ │ │ └── pnp-vqa │ │ │ ├── pnp_vqa_3b.yaml │ │ │ ├── pnp_vqa_base.yaml │ │ │ ├── pnp_vqa_large.yaml │ │ │ ├── unifiedqav2_3b_config.json │ │ │ ├── unifiedqav2_base_config.json │ │ │ └── unifiedqav2_large_config.json │ ├── models │ │ ├── __init__.py │ │ ├── albef_models │ │ │ ├── __init__.py │ │ │ ├── albef_classification.py │ │ │ ├── albef_feature_extractor.py │ │ │ ├── albef_nlvr.py │ │ │ ├── albef_outputs.py │ │ │ ├── albef_pretrain.py │ │ │ ├── albef_retrieval.py │ │ │ └── albef_vqa.py │ │ ├── base_model.py │ │ ├── blip2_models │ │ │ ├── Qformer.py │ │ │ ├── __init__.py │ │ │ ├── blip2.py │ │ │ ├── blip2_image_text_matching.py │ │ │ ├── blip2_qformer.py │ │ │ ├── blip2_t5.py │ │ │ ├── blip2_t5_instruct.py │ │ │ ├── blip2_vicuna.py │ │ │ ├── blip2_vicuna_instruct.py │ │ │ ├── modeling_llama.py │ │ │ └── modeling_t5.py │ │ ├── blip_models │ │ │ ├── __init__.py │ │ │ ├── blip.py │ │ │ ├── blip_caption.py │ │ │ ├── blip_classification.py │ │ │ ├── blip_feature_extractor.py │ │ │ ├── blip_image_text_matching.py │ │ │ ├── blip_nlvr.py │ │ │ ├── blip_outputs.py │ │ │ ├── blip_pretrain.py │ │ │ ├── blip_retrieval.py │ │ │ ├── blip_vqa.py │ │ │ └── nlvr_encoder.py │ │ ├── clip_vit.py │ │ ├── eva_vit.py │ │ ├── med.py │ │ └── vit.py │ └── processors │ │ ├── __init__.py │ │ ├── base_processor.py │ │ ├── blip_processors.py │ │ └── randaugment.py │ ├── llava │ ├── __init__.py │ └── model │ │ ├── __init__.py │ │ ├── language_model │ │ └── llava_llama.py │ │ ├── llava_arch.py │ │ ├── multimodal_encoder │ │ ├── builder.py │ │ └── clip_encoder.py │ │ └── multimodal_projector │ │ └── builder.py │ ├── llava16_model.py │ ├── llava_16 │ ├── __init__.py │ └── model │ │ ├── __init__.py │ │ ├── language_model │ │ └── llava_llama.py │ │ ├── llava_arch.py │ │ ├── multimodal_encoder │ │ ├── builder.py │ │ └── clip_encoder.py │ │ └── multimodal_projector │ │ └── builder.py │ ├── llava_model.py │ ├── mm_utils.py │ └── vqa_model.py ├── score.py └── vqascore.py /.gitignore: -------------------------------------------------------------------------------- 1 | /data/* 2 | !/data/.gitkeep 3 | /runs_old 4 | /runs 5 | /masks 6 | new_dataset 7 | *.pyc 8 | *.pth 9 | *.safetensors 10 | *.typed 11 | *.bin 12 | *.bash 13 | DEV* 14 | *.csv 15 | *.sh 16 | /ckpt 17 | /hf_cache 18 | research_script -------------------------------------------------------------------------------- /FreeCondDemo_video.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/basiclab/FreeCond/6aadc496e88a97b8170241059b9b27a75dd050f3/FreeCondDemo_video.mp4 -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/basiclab/FreeCond/6aadc496e88a97b8170241059b9b27a75dd050f3/__init__.py -------------------------------------------------------------------------------- /brushnet/diffusers/commands/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from abc import ABC, abstractmethod 16 | from argparse import ArgumentParser 17 | 18 | 19 | class BaseDiffusersCLICommand(ABC): 20 | @staticmethod 21 | @abstractmethod 22 | def register_subcommand(parser: ArgumentParser): 23 | raise NotImplementedError() 24 | 25 | @abstractmethod 26 | def run(self): 27 | raise NotImplementedError() 28 | -------------------------------------------------------------------------------- /brushnet/diffusers/commands/diffusers_cli.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright 2024 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from argparse import ArgumentParser 17 | 18 | from .env import EnvironmentCommand 19 | from .fp16_safetensors import FP16SafetensorsCommand 20 | 21 | 22 | def main(): 23 | parser = ArgumentParser("Diffusers CLI tool", usage="diffusers-cli <command> [<args>]") 24 | commands_parser = parser.add_subparsers(help="diffusers-cli command helpers") 25 | 26 | # Register commands 27 | EnvironmentCommand.register_subcommand(commands_parser) 28 | FP16SafetensorsCommand.register_subcommand(commands_parser) 29 | 30 | # Let's go 31 | args = parser.parse_args() 32 | 33 | if not hasattr(args, "func"): 34 | parser.print_help() 35 | exit(1) 36 | 37 | # Run 38 | service = args.func(args) 39 | service.run() 40 | 41 | 42 | if __name__ == "__main__": 43 | main() 44 | -------------------------------------------------------------------------------- /brushnet/diffusers/dependency_versions_check.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
14 | 15 | from .dependency_versions_table import deps 16 | from .utils.versions import require_version, require_version_core 17 | 18 | 19 | # define which module versions we always want to check at run time 20 | # (usually the ones defined in `install_requires` in setup.py) 21 | # 22 | # order specific notes: 23 | # - tqdm must be checked before tokenizers 24 | 25 | pkgs_to_check_at_runtime = "python requests filelock numpy".split() 26 | for pkg in pkgs_to_check_at_runtime: 27 | if pkg in deps: 28 | require_version_core(deps[pkg]) 29 | else: 30 | raise ValueError(f"can't find {pkg} in {deps.keys()}, check dependency_versions_table.py") 31 | 32 | 33 | def dep_version_check(pkg, hint=None): 34 | require_version(deps[pkg], hint) 35 | -------------------------------------------------------------------------------- /brushnet/diffusers/dependency_versions_table.py: -------------------------------------------------------------------------------- 1 | # THIS FILE HAS BEEN AUTOGENERATED. To update: 2 | # 1. modify the `_deps` dict in setup.py 3 | # 2. run `make deps_table_update` 4 | deps = { 5 | "Pillow": "Pillow", 6 | "accelerate": "accelerate>=0.11.0", 7 | "compel": "compel==0.1.8", 8 | "datasets": "datasets", 9 | "filelock": "filelock", 10 | "flax": "flax>=0.4.1", 11 | "hf-doc-builder": "hf-doc-builder>=0.3.0", 12 | "huggingface-hub": "huggingface-hub", 13 | "requests-mock": "requests-mock==1.10.0", 14 | "importlib_metadata": "importlib_metadata", 15 | "invisible-watermark": "invisible-watermark>=0.2.0", 16 | "isort": "isort>=5.5.4", 17 | "jax": "jax>=0.4.1", 18 | "jaxlib": "jaxlib>=0.4.1", 19 | "Jinja2": "Jinja2", 20 | "k-diffusion": "k-diffusion>=0.0.12", 21 | "torchsde": "torchsde", 22 | "note_seq": "note_seq", 23 | "librosa": "librosa", 24 | "numpy": "numpy", 25 | "parameterized": "parameterized", 26 | "peft": "peft>=0.6.0", 27 | "protobuf": "protobuf>=3.20.3,<4", 28 | "pytest": "pytest", 29 | "pytest-timeout": "pytest-timeout", 30 | "pytest-xdist": "pytest-xdist", 31 | "python": "python>=3.8.0", 32 | "ruff": "ruff==0.1.5", 33 | "safetensors": "safetensors>=0.3.1", 34 | "sentencepiece": "sentencepiece>=0.1.91,!=0.1.92", 35 | "GitPython": "GitPython<3.1.19", 36 | "scipy": "scipy", 37 | "onnx": "onnx", 38 | "regex": "regex!=2019.12.17", 39 | "requests": "requests", 40 | "tensorboard": "tensorboard", 41 | "torch": "torch>=1.4", 42 | "torchvision": "torchvision", 43 | "transformers": "transformers>=4.25.1", 44 | "urllib3": "urllib3<=2.0.0", 45 | } 46 | -------------------------------------------------------------------------------- /brushnet/diffusers/experimental/README.md: -------------------------------------------------------------------------------- 1 | # 🧨 Diffusers Experimental 2 | 3 | We are adding experimental code to support novel applications and usages of the Diffusers library. 4 | Currently, the following experiments are supported: 5 | * Reinforcement learning via an implementation of the [Diffuser](https://arxiv.org/abs/2205.09991) model. 
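For orientation, here is a minimal usage sketch of the `ValueGuidedRLPipeline` that this experimental module exports. It is not part of this repository: the checkpoint name, the `gym`/`d4rl` environment, and the call arguments are assumptions loosely following the upstream diffusers reinforcement-learning example, so treat them as a sketch rather than a verified recipe.

```python
# Minimal sketch: value-guided planning with the experimental RL pipeline.
# Assumes `gym` and `d4rl` are installed and that the example checkpoint
# "bglick13/hopper-medium-v2-value-function-hor32" is available (both are
# taken from the upstream diffusers example, not from this repository).
import d4rl  # noqa: F401  (importing d4rl registers the hopper-medium-v2 env)
import gym

from diffusers.experimental import ValueGuidedRLPipeline

env = gym.make("hopper-medium-v2")

pipeline = ValueGuidedRLPipeline.from_pretrained(
    "bglick13/hopper-medium-v2-value-function-hor32",
    env=env,
)

obs = env.reset()
for _ in range(100):
    # Plan a trajectory with the diffusion model and take the value-guided action.
    action = pipeline(obs, planning_horizon=32, n_guide_steps=2, scale=0.1)
    obs, reward, done, _ = env.step(action)
    if done:
        break
env.close()
```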
-------------------------------------------------------------------------------- /brushnet/diffusers/experimental/__init__.py: -------------------------------------------------------------------------------- 1 | from .rl import ValueGuidedRLPipeline 2 | -------------------------------------------------------------------------------- /brushnet/diffusers/experimental/rl/__init__.py: -------------------------------------------------------------------------------- 1 | from .value_guided_sampling import ValueGuidedRLPipeline 2 | -------------------------------------------------------------------------------- /brushnet/diffusers/models/README.md: -------------------------------------------------------------------------------- 1 | # Models 2 | 3 | For more detail on the models, please refer to the [docs](https://huggingface.co/docs/diffusers/api/models/overview). -------------------------------------------------------------------------------- /brushnet/diffusers/models/autoencoders/__init__.py: -------------------------------------------------------------------------------- 1 | from .autoencoder_asym_kl import AsymmetricAutoencoderKL 2 | from .autoencoder_kl import AutoencoderKL 3 | from .autoencoder_kl_temporal_decoder import AutoencoderKLTemporalDecoder 4 | from .autoencoder_tiny import AutoencoderTiny 5 | from .consistency_decoder_vae import ConsistencyDecoderVAE 6 | -------------------------------------------------------------------------------- /brushnet/diffusers/models/dual_transformer_2d.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from ..utils import deprecate 15 | from .transformers.dual_transformer_2d import DualTransformer2DModel 16 | 17 | 18 | class DualTransformer2DModel(DualTransformer2DModel): 19 | deprecation_message = "Importing `DualTransformer2DModel` from `diffusers.models.dual_transformer_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.dual_transformer_2d import DualTransformer2DModel`, instead." 20 | deprecate("DualTransformer2DModel", "0.29", deprecation_message) 21 | -------------------------------------------------------------------------------- /brushnet/diffusers/models/modeling_outputs.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | from ..utils import BaseOutput 4 | 5 | 6 | @dataclass 7 | class AutoencoderKLOutput(BaseOutput): 8 | """ 9 | Output of AutoencoderKL encoding method. 10 | 11 | Args: 12 | latent_dist (`DiagonalGaussianDistribution`): 13 | Encoded outputs of `Encoder` represented as the mean and logvar of `DiagonalGaussianDistribution`. 14 | `DiagonalGaussianDistribution` allows for sampling latents from the distribution. 
15 | """ 16 | 17 | latent_dist: "DiagonalGaussianDistribution" # noqa: F821 18 | -------------------------------------------------------------------------------- /brushnet/diffusers/models/prior_transformer.py: -------------------------------------------------------------------------------- 1 | from ..utils import deprecate 2 | from .transformers.prior_transformer import PriorTransformer, PriorTransformerOutput 3 | 4 | 5 | class PriorTransformerOutput(PriorTransformerOutput): 6 | deprecation_message = "Importing `PriorTransformerOutput` from `diffusers.models.prior_transformer` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.prior_transformer import PriorTransformerOutput`, instead." 7 | deprecate("PriorTransformerOutput", "0.29", deprecation_message) 8 | 9 | 10 | class PriorTransformer(PriorTransformer): 11 | deprecation_message = "Importing `PriorTransformer` from `diffusers.models.prior_transformer` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.prior_transformer import PriorTransformer`, instead." 12 | deprecate("PriorTransformer", "0.29", deprecation_message) 13 | -------------------------------------------------------------------------------- /brushnet/diffusers/models/transformer_2d.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from ..utils import deprecate 15 | from .transformers.transformer_2d import Transformer2DModel, Transformer2DModelOutput 16 | 17 | 18 | class Transformer2DModelOutput(Transformer2DModelOutput): 19 | deprecation_message = "Importing `Transformer2DModelOutput` from `diffusers.models.transformer_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.transformer_2d import Transformer2DModelOutput`, instead." 20 | deprecate("Transformer2DModelOutput", "0.29", deprecation_message) 21 | 22 | 23 | class Transformer2DModel(Transformer2DModel): 24 | deprecation_message = "Importing `Transformer2DModel` from `diffusers.models.transformer_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.transformer_2d import Transformer2DModel`, instead." 
25 | deprecate("Transformer2DModel", "0.29", deprecation_message) 26 | -------------------------------------------------------------------------------- /brushnet/diffusers/models/transformers/__init__.py: -------------------------------------------------------------------------------- 1 | from ...utils import is_torch_available 2 | 3 | 4 | if is_torch_available(): 5 | from .dual_transformer_2d import DualTransformer2DModel 6 | from .prior_transformer import PriorTransformer 7 | from .t5_film_transformer import T5FilmDecoder 8 | from .transformer_2d import Transformer2DModel 9 | from .transformer_temporal import TransformerTemporalModel 10 | -------------------------------------------------------------------------------- /brushnet/diffusers/models/unet_1d.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from ..utils import deprecate 16 | from .unets.unet_1d import UNet1DModel, UNet1DOutput 17 | 18 | 19 | class UNet1DOutput(UNet1DOutput): 20 | deprecation_message = "Importing `UNet1DOutput` from `diffusers.models.unet_1d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d import UNet1DOutput`, instead." 21 | deprecate("UNet1DOutput", "0.29", deprecation_message) 22 | 23 | 24 | class UNet1DModel(UNet1DModel): 25 | deprecation_message = "Importing `UNet1DModel` from `diffusers.models.unet_1d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d import UNet1DModel`, instead." 26 | deprecate("UNet1DModel", "0.29", deprecation_message) 27 | -------------------------------------------------------------------------------- /brushnet/diffusers/models/unet_2d.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | from ..utils import deprecate 17 | from .unets.unet_2d import UNet2DModel, UNet2DOutput 18 | 19 | 20 | class UNet2DOutput(UNet2DOutput): 21 | deprecation_message = "Importing `UNet2DOutput` from `diffusers.models.unet_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d import UNet2DOutput`, instead." 
22 | deprecate("UNet2DOutput", "0.29", deprecation_message) 23 | 24 | 25 | class UNet2DModel(UNet2DModel): 26 | deprecation_message = "Importing `UNet2DModel` from `diffusers.models.unet_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d import UNet2DModel`, instead." 27 | deprecate("UNet2DModel", "0.29", deprecation_message) 28 | -------------------------------------------------------------------------------- /brushnet/diffusers/models/unet_2d_condition.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from ..utils import deprecate 15 | from .unets.unet_2d_condition import UNet2DConditionModel, UNet2DConditionOutput 16 | 17 | 18 | class UNet2DConditionOutput(UNet2DConditionOutput): 19 | deprecation_message = "Importing `UNet2DConditionOutput` from `diffusers.models.unet_2d_condition` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_condition import UNet2DConditionOutput`, instead." 20 | deprecate("UNet2DConditionOutput", "0.29", deprecation_message) 21 | 22 | 23 | class UNet2DConditionModel(UNet2DConditionModel): 24 | deprecation_message = "Importing `UNet2DConditionModel` from `diffusers.models.unet_2d_condition` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_condition import UNet2DConditionModel`, instead." 
25 | deprecate("UNet2DConditionModel", "0.29", deprecation_message) 26 | -------------------------------------------------------------------------------- /brushnet/diffusers/models/unets/__init__.py: -------------------------------------------------------------------------------- 1 | from ...utils import is_flax_available, is_torch_available 2 | 3 | 4 | if is_torch_available(): 5 | from .unet_1d import UNet1DModel 6 | from .unet_2d import UNet2DModel 7 | from .unet_2d_condition import UNet2DConditionModel 8 | from .unet_3d_condition import UNet3DConditionModel 9 | from .unet_i2vgen_xl import I2VGenXLUNet 10 | from .unet_kandinsky3 import Kandinsky3UNet 11 | from .unet_motion_model import MotionAdapter, UNetMotionModel 12 | from .unet_spatio_temporal_condition import UNetSpatioTemporalConditionModel 13 | from .unet_stable_cascade import StableCascadeUNet 14 | from .uvit_2d import UVit2DModel 15 | 16 | 17 | if is_flax_available(): 18 | from .unet_2d_condition_flax import FlaxUNet2DConditionModel 19 | -------------------------------------------------------------------------------- /brushnet/diffusers/pipelines/animatediff/pipeline_output.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import List, Union 3 | 4 | import numpy as np 5 | import PIL.Image 6 | import torch 7 | 8 | from ...utils import BaseOutput 9 | 10 | 11 | @dataclass 12 | class AnimateDiffPipelineOutput(BaseOutput): 13 | r""" 14 | Output class for AnimateDiff pipelines. 15 | 16 | Args: 17 | frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]): 18 | List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing denoised 19 | PIL image sequences of length `num_frames.` It can also be a NumPy array or Torch tensor of shape 20 | `(batch_size, num_frames, channels, height, width)` 21 | """ 22 | 23 | frames: Union[torch.Tensor, np.ndarray, List[List[PIL.Image.Image]]] 24 | -------------------------------------------------------------------------------- /brushnet/diffusers/pipelines/audioldm/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING 2 | 3 | from ...utils import ( 4 | DIFFUSERS_SLOW_IMPORT, 5 | OptionalDependencyNotAvailable, 6 | _LazyModule, 7 | is_torch_available, 8 | is_transformers_available, 9 | is_transformers_version, 10 | ) 11 | 12 | 13 | _dummy_objects = {} 14 | _import_structure = {} 15 | 16 | try: 17 | if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.27.0")): 18 | raise OptionalDependencyNotAvailable() 19 | except OptionalDependencyNotAvailable: 20 | from ...utils.dummy_torch_and_transformers_objects import ( 21 | AudioLDMPipeline, 22 | ) 23 | 24 | _dummy_objects.update({"AudioLDMPipeline": AudioLDMPipeline}) 25 | else: 26 | _import_structure["pipeline_audioldm"] = ["AudioLDMPipeline"] 27 | 28 | 29 | if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: 30 | try: 31 | if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.27.0")): 32 | raise OptionalDependencyNotAvailable() 33 | except OptionalDependencyNotAvailable: 34 | from ...utils.dummy_torch_and_transformers_objects import ( 35 | AudioLDMPipeline, 36 | ) 37 | 38 | else: 39 | from .pipeline_audioldm import AudioLDMPipeline 40 | else: 41 | import sys 42 | 43 | sys.modules[__name__] = _LazyModule( 44 | __name__, 45 | globals()["__file__"], 46 | 
_import_structure, 47 | module_spec=__spec__, 48 | ) 49 | 50 | for name, value in _dummy_objects.items(): 51 | setattr(sys.modules[__name__], name, value) 52 | -------------------------------------------------------------------------------- /brushnet/diffusers/pipelines/blip_diffusion/__init__.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import List, Optional, Union 3 | 4 | import numpy as np 5 | import PIL 6 | from PIL import Image 7 | 8 | from ...utils import OptionalDependencyNotAvailable, is_torch_available, is_transformers_available 9 | 10 | 11 | try: 12 | if not (is_transformers_available() and is_torch_available()): 13 | raise OptionalDependencyNotAvailable() 14 | except OptionalDependencyNotAvailable: 15 | from ...utils.dummy_torch_and_transformers_objects import ShapEPipeline 16 | else: 17 | from .blip_image_processing import BlipImageProcessor 18 | from .modeling_blip2 import Blip2QFormerModel 19 | from .modeling_ctx_clip import ContextCLIPTextModel 20 | from .pipeline_blip_diffusion import BlipDiffusionPipeline 21 | -------------------------------------------------------------------------------- /brushnet/diffusers/pipelines/consistency_models/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING 2 | 3 | from ...utils import ( 4 | DIFFUSERS_SLOW_IMPORT, 5 | _LazyModule, 6 | ) 7 | 8 | 9 | _import_structure = { 10 | "pipeline_consistency_models": ["ConsistencyModelPipeline"], 11 | } 12 | 13 | if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: 14 | from .pipeline_consistency_models import ConsistencyModelPipeline 15 | 16 | else: 17 | import sys 18 | 19 | sys.modules[__name__] = _LazyModule( 20 | __name__, 21 | globals()["__file__"], 22 | _import_structure, 23 | module_spec=__spec__, 24 | ) 25 | -------------------------------------------------------------------------------- /brushnet/diffusers/pipelines/dance_diffusion/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING 2 | 3 | from ...utils import DIFFUSERS_SLOW_IMPORT, _LazyModule 4 | 5 | 6 | _import_structure = {"pipeline_dance_diffusion": ["DanceDiffusionPipeline"]} 7 | 8 | if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: 9 | from .pipeline_dance_diffusion import DanceDiffusionPipeline 10 | else: 11 | import sys 12 | 13 | sys.modules[__name__] = _LazyModule( 14 | __name__, 15 | globals()["__file__"], 16 | _import_structure, 17 | module_spec=__spec__, 18 | ) 19 | -------------------------------------------------------------------------------- /brushnet/diffusers/pipelines/ddim/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING 2 | 3 | from ...utils import DIFFUSERS_SLOW_IMPORT, _LazyModule 4 | 5 | 6 | _import_structure = {"pipeline_ddim": ["DDIMPipeline"]} 7 | 8 | if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: 9 | from .pipeline_ddim import DDIMPipeline 10 | else: 11 | import sys 12 | 13 | sys.modules[__name__] = _LazyModule( 14 | __name__, 15 | globals()["__file__"], 16 | _import_structure, 17 | module_spec=__spec__, 18 | ) 19 | -------------------------------------------------------------------------------- /brushnet/diffusers/pipelines/ddpm/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING 2 | 3 | from ...utils import ( 4 | 
DIFFUSERS_SLOW_IMPORT, 5 | _LazyModule, 6 | ) 7 | 8 | 9 | _import_structure = {"pipeline_ddpm": ["DDPMPipeline"]} 10 | 11 | if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: 12 | from .pipeline_ddpm import DDPMPipeline 13 | 14 | else: 15 | import sys 16 | 17 | sys.modules[__name__] = _LazyModule( 18 | __name__, 19 | globals()["__file__"], 20 | _import_structure, 21 | module_spec=__spec__, 22 | ) 23 | -------------------------------------------------------------------------------- /brushnet/diffusers/pipelines/deepfloyd_if/pipeline_output.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import List, Optional, Union 3 | 4 | import numpy as np 5 | import PIL.Image 6 | 7 | from ...utils import BaseOutput 8 | 9 | 10 | @dataclass 11 | class IFPipelineOutput(BaseOutput): 12 | """ 13 | Args: 14 | Output class for Stable Diffusion pipelines. 15 | images (`List[PIL.Image.Image]` or `np.ndarray`) 16 | List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width, 17 | num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline. 18 | nsfw_detected (`List[bool]`) 19 | List of flags denoting whether the corresponding generated image likely represents "not-safe-for-work" 20 | (nsfw) content or a watermark. `None` if safety checking could not be performed. 21 | watermark_detected (`List[bool]`) 22 | List of flags denoting whether the corresponding generated image likely has a watermark. `None` if safety 23 | checking could not be performed. 24 | """ 25 | 26 | images: Union[List[PIL.Image.Image], np.ndarray] 27 | nsfw_detected: Optional[List[bool]] 28 | watermark_detected: Optional[List[bool]] 29 | -------------------------------------------------------------------------------- /brushnet/diffusers/pipelines/deprecated/README.md: -------------------------------------------------------------------------------- 1 | # Deprecated Pipelines 2 | 3 | This folder contains pipelines that have very low usage as measured by model downloads, issues and PRs. While you can still use the pipelines just as before, we will stop testing the pipelines and will not accept any changes to existing files. -------------------------------------------------------------------------------- /brushnet/diffusers/pipelines/deprecated/alt_diffusion/pipeline_output.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import List, Optional, Union 3 | 4 | import numpy as np 5 | import PIL.Image 6 | 7 | from ....utils import ( 8 | BaseOutput, 9 | ) 10 | 11 | 12 | @dataclass 13 | # Copied from diffusers.pipelines.stable_diffusion.pipeline_output.StableDiffusionPipelineOutput with Stable->Alt 14 | class AltDiffusionPipelineOutput(BaseOutput): 15 | """ 16 | Output class for Alt Diffusion pipelines. 17 | 18 | Args: 19 | images (`List[PIL.Image.Image]` or `np.ndarray`) 20 | List of denoised PIL images of length `batch_size` or NumPy array of shape `(batch_size, height, width, 21 | num_channels)`. 22 | nsfw_content_detected (`List[bool]`) 23 | List indicating whether the corresponding generated image contains "not-safe-for-work" (nsfw) content or 24 | `None` if safety checking could not be performed. 
25 | """ 26 | 27 | images: Union[List[PIL.Image.Image], np.ndarray] 28 | nsfw_content_detected: Optional[List[bool]] 29 | -------------------------------------------------------------------------------- /brushnet/diffusers/pipelines/deprecated/audio_diffusion/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING 2 | 3 | from ....utils import DIFFUSERS_SLOW_IMPORT, _LazyModule 4 | 5 | 6 | _import_structure = { 7 | "mel": ["Mel"], 8 | "pipeline_audio_diffusion": ["AudioDiffusionPipeline"], 9 | } 10 | 11 | if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: 12 | from .mel import Mel 13 | from .pipeline_audio_diffusion import AudioDiffusionPipeline 14 | 15 | else: 16 | import sys 17 | 18 | sys.modules[__name__] = _LazyModule( 19 | __name__, 20 | globals()["__file__"], 21 | _import_structure, 22 | module_spec=__spec__, 23 | ) 24 | -------------------------------------------------------------------------------- /brushnet/diffusers/pipelines/deprecated/latent_diffusion_uncond/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING 2 | 3 | from ....utils import DIFFUSERS_SLOW_IMPORT, _LazyModule 4 | 5 | 6 | _import_structure = {"pipeline_latent_diffusion_uncond": ["LDMPipeline"]} 7 | 8 | if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: 9 | from .pipeline_latent_diffusion_uncond import LDMPipeline 10 | else: 11 | import sys 12 | 13 | sys.modules[__name__] = _LazyModule( 14 | __name__, 15 | globals()["__file__"], 16 | _import_structure, 17 | module_spec=__spec__, 18 | ) 19 | -------------------------------------------------------------------------------- /brushnet/diffusers/pipelines/deprecated/pndm/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING 2 | 3 | from ....utils import DIFFUSERS_SLOW_IMPORT, _LazyModule 4 | 5 | 6 | _import_structure = {"pipeline_pndm": ["PNDMPipeline"]} 7 | 8 | if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: 9 | from .pipeline_pndm import PNDMPipeline 10 | else: 11 | import sys 12 | 13 | sys.modules[__name__] = _LazyModule( 14 | __name__, 15 | globals()["__file__"], 16 | _import_structure, 17 | module_spec=__spec__, 18 | ) 19 | -------------------------------------------------------------------------------- /brushnet/diffusers/pipelines/deprecated/repaint/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING 2 | 3 | from ....utils import DIFFUSERS_SLOW_IMPORT, _LazyModule 4 | 5 | 6 | _import_structure = {"pipeline_repaint": ["RePaintPipeline"]} 7 | 8 | if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: 9 | from .pipeline_repaint import RePaintPipeline 10 | 11 | else: 12 | import sys 13 | 14 | sys.modules[__name__] = _LazyModule( 15 | __name__, 16 | globals()["__file__"], 17 | _import_structure, 18 | module_spec=__spec__, 19 | ) 20 | -------------------------------------------------------------------------------- /brushnet/diffusers/pipelines/deprecated/score_sde_ve/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING 2 | 3 | from ....utils import DIFFUSERS_SLOW_IMPORT, _LazyModule 4 | 5 | 6 | _import_structure = {"pipeline_score_sde_ve": ["ScoreSdeVePipeline"]} 7 | 8 | if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: 9 | from .pipeline_score_sde_ve import ScoreSdeVePipeline 10 | 11 | else: 12 | import sys 13 | 14 | 
sys.modules[__name__] = _LazyModule( 15 | __name__, 16 | globals()["__file__"], 17 | _import_structure, 18 | module_spec=__spec__, 19 | ) 20 | -------------------------------------------------------------------------------- /brushnet/diffusers/pipelines/deprecated/stochastic_karras_ve/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING 2 | 3 | from ....utils import DIFFUSERS_SLOW_IMPORT, _LazyModule 4 | 5 | 6 | _import_structure = {"pipeline_stochastic_karras_ve": ["KarrasVePipeline"]} 7 | 8 | if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: 9 | from .pipeline_stochastic_karras_ve import KarrasVePipeline 10 | 11 | else: 12 | import sys 13 | 14 | sys.modules[__name__] = _LazyModule( 15 | __name__, 16 | globals()["__file__"], 17 | _import_structure, 18 | module_spec=__spec__, 19 | ) 20 | -------------------------------------------------------------------------------- /brushnet/diffusers/pipelines/dit/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING 2 | 3 | from ...utils import DIFFUSERS_SLOW_IMPORT, _LazyModule 4 | 5 | 6 | _import_structure = {"pipeline_dit": ["DiTPipeline"]} 7 | 8 | if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: 9 | from .pipeline_dit import DiTPipeline 10 | 11 | else: 12 | import sys 13 | 14 | sys.modules[__name__] = _LazyModule( 15 | __name__, 16 | globals()["__file__"], 17 | _import_structure, 18 | module_spec=__spec__, 19 | ) 20 | -------------------------------------------------------------------------------- /brushnet/diffusers/pipelines/i2vgen_xl/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING 2 | 3 | from ...utils import ( 4 | DIFFUSERS_SLOW_IMPORT, 5 | OptionalDependencyNotAvailable, 6 | _LazyModule, 7 | get_objects_from_module, 8 | is_torch_available, 9 | is_transformers_available, 10 | ) 11 | 12 | 13 | _dummy_objects = {} 14 | _import_structure = {} 15 | 16 | try: 17 | if not (is_transformers_available() and is_torch_available()): 18 | raise OptionalDependencyNotAvailable() 19 | except OptionalDependencyNotAvailable: 20 | from ...utils import dummy_torch_and_transformers_objects # noqa F403 21 | 22 | _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) 23 | else: 24 | _import_structure["pipeline_i2vgen_xl"] = ["I2VGenXLPipeline"] 25 | 26 | 27 | if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: 28 | try: 29 | if not (is_transformers_available() and is_torch_available()): 30 | raise OptionalDependencyNotAvailable() 31 | except OptionalDependencyNotAvailable: 32 | from ...utils.dummy_torch_and_transformers_objects import * # noqa F403 33 | else: 34 | from .pipeline_i2vgen_xl import I2VGenXLPipeline 35 | 36 | else: 37 | import sys 38 | 39 | sys.modules[__name__] = _LazyModule( 40 | __name__, 41 | globals()["__file__"], 42 | _import_structure, 43 | module_spec=__spec__, 44 | ) 45 | for name, value in _dummy_objects.items(): 46 | setattr(sys.modules[__name__], name, value) 47 | -------------------------------------------------------------------------------- /brushnet/diffusers/pipelines/kandinsky/text_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import PreTrainedModel, XLMRobertaConfig, XLMRobertaModel 3 | 4 | 5 | class MCLIPConfig(XLMRobertaConfig): 6 | model_type = "M-CLIP" 7 | 8 | def __init__(self, transformerDimSize=1024, 
imageDimSize=768, **kwargs): 9 | self.transformerDimensions = transformerDimSize 10 | self.numDims = imageDimSize 11 | super().__init__(**kwargs) 12 | 13 | 14 | class MultilingualCLIP(PreTrainedModel): 15 | config_class = MCLIPConfig 16 | 17 | def __init__(self, config, *args, **kwargs): 18 | super().__init__(config, *args, **kwargs) 19 | self.transformer = XLMRobertaModel(config) 20 | self.LinearTransformation = torch.nn.Linear( 21 | in_features=config.transformerDimensions, out_features=config.numDims 22 | ) 23 | 24 | def forward(self, input_ids, attention_mask): 25 | embs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)[0] 26 | embs2 = (embs * attention_mask.unsqueeze(2)).sum(dim=1) / attention_mask.sum(dim=1)[:, None] 27 | return self.LinearTransformation(embs2), embs 28 | -------------------------------------------------------------------------------- /brushnet/diffusers/pipelines/musicldm/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING 2 | 3 | from ...utils import ( 4 | DIFFUSERS_SLOW_IMPORT, 5 | OptionalDependencyNotAvailable, 6 | _LazyModule, 7 | get_objects_from_module, 8 | is_torch_available, 9 | is_transformers_available, 10 | is_transformers_version, 11 | ) 12 | 13 | 14 | _dummy_objects = {} 15 | _import_structure = {} 16 | 17 | try: 18 | if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.27.0")): 19 | raise OptionalDependencyNotAvailable() 20 | except OptionalDependencyNotAvailable: 21 | from ...utils import dummy_torch_and_transformers_objects # noqa F403 22 | 23 | _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) 24 | else: 25 | _import_structure["pipeline_musicldm"] = ["MusicLDMPipeline"] 26 | 27 | 28 | if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: 29 | try: 30 | if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.27.0")): 31 | raise OptionalDependencyNotAvailable() 32 | 33 | except OptionalDependencyNotAvailable: 34 | from ...utils.dummy_torch_and_transformers_objects import * 35 | else: 36 | from .pipeline_musicldm import MusicLDMPipeline 37 | 38 | else: 39 | import sys 40 | 41 | sys.modules[__name__] = _LazyModule( 42 | __name__, 43 | globals()["__file__"], 44 | _import_structure, 45 | module_spec=__spec__, 46 | ) 47 | 48 | for name, value in _dummy_objects.items(): 49 | setattr(sys.modules[__name__], name, value) 50 | -------------------------------------------------------------------------------- /brushnet/diffusers/pipelines/pia/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING 2 | 3 | from ...utils import ( 4 | DIFFUSERS_SLOW_IMPORT, 5 | OptionalDependencyNotAvailable, 6 | _LazyModule, 7 | get_objects_from_module, 8 | is_torch_available, 9 | is_transformers_available, 10 | ) 11 | 12 | 13 | _dummy_objects = {} 14 | _import_structure = {} 15 | 16 | try: 17 | if not (is_transformers_available() and is_torch_available()): 18 | raise OptionalDependencyNotAvailable() 19 | except OptionalDependencyNotAvailable: 20 | from ...utils import dummy_torch_and_transformers_objects 21 | 22 | _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) 23 | else: 24 | _import_structure["pipeline_pia"] = ["PIAPipeline", "PIAPipelineOutput"] 25 | 26 | if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: 27 | try: 28 | if not (is_transformers_available() and 
is_torch_available()): 29 | raise OptionalDependencyNotAvailable() 30 | except OptionalDependencyNotAvailable: 31 | from ...utils.dummy_torch_and_transformers_objects import * 32 | 33 | else: 34 | from .pipeline_pia import PIAPipeline, PIAPipelineOutput 35 | 36 | else: 37 | import sys 38 | 39 | sys.modules[__name__] = _LazyModule( 40 | __name__, 41 | globals()["__file__"], 42 | _import_structure, 43 | module_spec=__spec__, 44 | ) 45 | for name, value in _dummy_objects.items(): 46 | setattr(sys.modules[__name__], name, value) 47 | -------------------------------------------------------------------------------- /brushnet/diffusers/pipelines/pixart_alpha/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING 2 | 3 | from ...utils import ( 4 | DIFFUSERS_SLOW_IMPORT, 5 | OptionalDependencyNotAvailable, 6 | _LazyModule, 7 | get_objects_from_module, 8 | is_torch_available, 9 | is_transformers_available, 10 | ) 11 | 12 | 13 | _dummy_objects = {} 14 | _import_structure = {} 15 | 16 | 17 | try: 18 | if not (is_transformers_available() and is_torch_available()): 19 | raise OptionalDependencyNotAvailable() 20 | except OptionalDependencyNotAvailable: 21 | from ...utils import dummy_torch_and_transformers_objects # noqa F403 22 | 23 | _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) 24 | else: 25 | _import_structure["pipeline_pixart_alpha"] = ["PixArtAlphaPipeline"] 26 | 27 | if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: 28 | try: 29 | if not (is_transformers_available() and is_torch_available()): 30 | raise OptionalDependencyNotAvailable() 31 | 32 | except OptionalDependencyNotAvailable: 33 | from ...utils.dummy_torch_and_transformers_objects import * 34 | else: 35 | from .pipeline_pixart_alpha import PixArtAlphaPipeline 36 | 37 | else: 38 | import sys 39 | 40 | sys.modules[__name__] = _LazyModule( 41 | __name__, 42 | globals()["__file__"], 43 | _import_structure, 44 | module_spec=__spec__, 45 | ) 46 | 47 | for name, value in _dummy_objects.items(): 48 | setattr(sys.modules[__name__], name, value) 49 | -------------------------------------------------------------------------------- /brushnet/diffusers/pipelines/semantic_stable_diffusion/pipeline_output.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import List, Optional, Union 3 | 4 | import numpy as np 5 | import PIL.Image 6 | 7 | from ...utils import BaseOutput 8 | 9 | 10 | @dataclass 11 | class SemanticStableDiffusionPipelineOutput(BaseOutput): 12 | """ 13 | Output class for Stable Diffusion pipelines. 14 | 15 | Args: 16 | images (`List[PIL.Image.Image]` or `np.ndarray`) 17 | List of denoised PIL images of length `batch_size` or NumPy array of shape `(batch_size, height, width, 18 | num_channels)`. 19 | nsfw_content_detected (`List[bool]`) 20 | List indicating whether the corresponding generated image contains “not-safe-for-work” (nsfw) content or 21 | `None` if safety checking could not be performed. 22 | """ 23 | 24 | images: Union[List[PIL.Image.Image], np.ndarray] 25 | nsfw_content_detected: Optional[List[bool]] 26 | -------------------------------------------------------------------------------- /brushnet/diffusers/pipelines/stable_diffusion/clip_image_project_model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The GLIGEN Authors and HuggingFace Team. All rights reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from torch import nn 16 | 17 | from ...configuration_utils import ConfigMixin, register_to_config 18 | from ...models.modeling_utils import ModelMixin 19 | 20 | 21 | class CLIPImageProjection(ModelMixin, ConfigMixin): 22 | @register_to_config 23 | def __init__(self, hidden_size: int = 768): 24 | super().__init__() 25 | self.hidden_size = hidden_size 26 | self.project = nn.Linear(self.hidden_size, self.hidden_size, bias=False) 27 | 28 | def forward(self, x): 29 | return self.project(x) 30 | -------------------------------------------------------------------------------- /brushnet/diffusers/pipelines/stable_diffusion_attend_and_excite/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING 2 | 3 | from ...utils import ( 4 | DIFFUSERS_SLOW_IMPORT, 5 | OptionalDependencyNotAvailable, 6 | _LazyModule, 7 | get_objects_from_module, 8 | is_torch_available, 9 | is_transformers_available, 10 | ) 11 | 12 | 13 | _dummy_objects = {} 14 | _import_structure = {} 15 | 16 | 17 | try: 18 | if not (is_transformers_available() and is_torch_available()): 19 | raise OptionalDependencyNotAvailable() 20 | except OptionalDependencyNotAvailable: 21 | from ...utils import dummy_torch_and_transformers_objects # noqa F403 22 | 23 | _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) 24 | else: 25 | _import_structure["pipeline_stable_diffusion_attend_and_excite"] = ["StableDiffusionAttendAndExcitePipeline"] 26 | 27 | if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: 28 | try: 29 | if not (is_transformers_available() and is_torch_available()): 30 | raise OptionalDependencyNotAvailable() 31 | 32 | except OptionalDependencyNotAvailable: 33 | from ...utils.dummy_torch_and_transformers_objects import * 34 | else: 35 | from .pipeline_stable_diffusion_attend_and_excite import StableDiffusionAttendAndExcitePipeline 36 | 37 | else: 38 | import sys 39 | 40 | sys.modules[__name__] = _LazyModule( 41 | __name__, 42 | globals()["__file__"], 43 | _import_structure, 44 | module_spec=__spec__, 45 | ) 46 | 47 | for name, value in _dummy_objects.items(): 48 | setattr(sys.modules[__name__], name, value) 49 | -------------------------------------------------------------------------------- /brushnet/diffusers/pipelines/stable_diffusion_diffedit/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING 2 | 3 | from ...utils import ( 4 | DIFFUSERS_SLOW_IMPORT, 5 | OptionalDependencyNotAvailable, 6 | _LazyModule, 7 | get_objects_from_module, 8 | is_torch_available, 9 | is_transformers_available, 10 | ) 11 | 12 | 13 | _dummy_objects = {} 14 | _import_structure = {} 15 | 16 | 17 | try: 18 | if not (is_transformers_available() and is_torch_available()): 19 | raise OptionalDependencyNotAvailable() 20 | except OptionalDependencyNotAvailable: 21 | from ...utils import 
dummy_torch_and_transformers_objects # noqa F403 22 | 23 | _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) 24 | else: 25 | _import_structure["pipeline_stable_diffusion_diffedit"] = ["StableDiffusionDiffEditPipeline"] 26 | 27 | if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: 28 | try: 29 | if not (is_transformers_available() and is_torch_available()): 30 | raise OptionalDependencyNotAvailable() 31 | 32 | except OptionalDependencyNotAvailable: 33 | from ...utils.dummy_torch_and_transformers_objects import * 34 | else: 35 | from .pipeline_stable_diffusion_diffedit import StableDiffusionDiffEditPipeline 36 | 37 | else: 38 | import sys 39 | 40 | sys.modules[__name__] = _LazyModule( 41 | __name__, 42 | globals()["__file__"], 43 | _import_structure, 44 | module_spec=__spec__, 45 | ) 46 | 47 | for name, value in _dummy_objects.items(): 48 | setattr(sys.modules[__name__], name, value) 49 | -------------------------------------------------------------------------------- /brushnet/diffusers/pipelines/stable_diffusion_ldm3d/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING 2 | 3 | from ...utils import ( 4 | DIFFUSERS_SLOW_IMPORT, 5 | OptionalDependencyNotAvailable, 6 | _LazyModule, 7 | get_objects_from_module, 8 | is_torch_available, 9 | is_transformers_available, 10 | ) 11 | 12 | 13 | _dummy_objects = {} 14 | _import_structure = {} 15 | 16 | 17 | try: 18 | if not (is_transformers_available() and is_torch_available()): 19 | raise OptionalDependencyNotAvailable() 20 | except OptionalDependencyNotAvailable: 21 | from ...utils import dummy_torch_and_transformers_objects # noqa F403 22 | 23 | _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) 24 | else: 25 | _import_structure["pipeline_stable_diffusion_ldm3d"] = ["StableDiffusionLDM3DPipeline"] 26 | 27 | if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: 28 | try: 29 | if not (is_transformers_available() and is_torch_available()): 30 | raise OptionalDependencyNotAvailable() 31 | 32 | except OptionalDependencyNotAvailable: 33 | from ...utils.dummy_torch_and_transformers_objects import * 34 | else: 35 | from .pipeline_stable_diffusion_ldm3d import StableDiffusionLDM3DPipeline 36 | 37 | else: 38 | import sys 39 | 40 | sys.modules[__name__] = _LazyModule( 41 | __name__, 42 | globals()["__file__"], 43 | _import_structure, 44 | module_spec=__spec__, 45 | ) 46 | 47 | for name, value in _dummy_objects.items(): 48 | setattr(sys.modules[__name__], name, value) 49 | -------------------------------------------------------------------------------- /brushnet/diffusers/pipelines/stable_diffusion_panorama/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING 2 | 3 | from ...utils import ( 4 | DIFFUSERS_SLOW_IMPORT, 5 | OptionalDependencyNotAvailable, 6 | _LazyModule, 7 | get_objects_from_module, 8 | is_torch_available, 9 | is_transformers_available, 10 | ) 11 | 12 | 13 | _dummy_objects = {} 14 | _import_structure = {} 15 | 16 | 17 | try: 18 | if not (is_transformers_available() and is_torch_available()): 19 | raise OptionalDependencyNotAvailable() 20 | except OptionalDependencyNotAvailable: 21 | from ...utils import dummy_torch_and_transformers_objects # noqa F403 22 | 23 | _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) 24 | else: 25 | _import_structure["pipeline_stable_diffusion_panorama"] = 
["StableDiffusionPanoramaPipeline"] 26 | 27 | if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: 28 | try: 29 | if not (is_transformers_available() and is_torch_available()): 30 | raise OptionalDependencyNotAvailable() 31 | 32 | except OptionalDependencyNotAvailable: 33 | from ...utils.dummy_torch_and_transformers_objects import * 34 | else: 35 | from .pipeline_stable_diffusion_panorama import StableDiffusionPanoramaPipeline 36 | 37 | else: 38 | import sys 39 | 40 | sys.modules[__name__] = _LazyModule( 41 | __name__, 42 | globals()["__file__"], 43 | _import_structure, 44 | module_spec=__spec__, 45 | ) 46 | 47 | for name, value in _dummy_objects.items(): 48 | setattr(sys.modules[__name__], name, value) 49 | -------------------------------------------------------------------------------- /brushnet/diffusers/pipelines/stable_diffusion_safe/pipeline_output.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import List, Optional, Union 3 | 4 | import numpy as np 5 | import PIL.Image 6 | 7 | from ...utils import ( 8 | BaseOutput, 9 | ) 10 | 11 | 12 | @dataclass 13 | class StableDiffusionSafePipelineOutput(BaseOutput): 14 | """ 15 | Output class for Safe Stable Diffusion pipelines. 16 | 17 | Args: 18 | images (`List[PIL.Image.Image]` or `np.ndarray`) 19 | List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width, 20 | num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline. 21 | nsfw_content_detected (`List[bool]`) 22 | List of flags denoting whether the corresponding generated image likely represents "not-safe-for-work" 23 | (nsfw) content, or `None` if safety checking could not be performed. 24 | images (`List[PIL.Image.Image]` or `np.ndarray`) 25 | List of denoised PIL images that were flagged by the safety checker any may contain "not-safe-for-work" 26 | (nsfw) content, or `None` if no safety check was performed or no images were flagged. 
27 | applied_safety_concept (`str`) 28 | The safety concept that was applied for safety guidance, or `None` if safety guidance was disabled 29 | """ 30 | 31 | images: Union[List[PIL.Image.Image], np.ndarray] 32 | nsfw_content_detected: Optional[List[bool]] 33 | unsafe_images: Optional[Union[List[PIL.Image.Image], np.ndarray]] 34 | applied_safety_concept: Optional[str] 35 | -------------------------------------------------------------------------------- /brushnet/diffusers/pipelines/stable_diffusion_sag/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING 2 | 3 | from ...utils import ( 4 | DIFFUSERS_SLOW_IMPORT, 5 | OptionalDependencyNotAvailable, 6 | _LazyModule, 7 | get_objects_from_module, 8 | is_torch_available, 9 | is_transformers_available, 10 | ) 11 | 12 | 13 | _dummy_objects = {} 14 | _import_structure = {} 15 | 16 | 17 | try: 18 | if not (is_transformers_available() and is_torch_available()): 19 | raise OptionalDependencyNotAvailable() 20 | except OptionalDependencyNotAvailable: 21 | from ...utils import dummy_torch_and_transformers_objects # noqa F403 22 | 23 | _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) 24 | else: 25 | _import_structure["pipeline_stable_diffusion_sag"] = ["StableDiffusionSAGPipeline"] 26 | 27 | if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: 28 | try: 29 | if not (is_transformers_available() and is_torch_available()): 30 | raise OptionalDependencyNotAvailable() 31 | 32 | except OptionalDependencyNotAvailable: 33 | from ...utils.dummy_torch_and_transformers_objects import * 34 | else: 35 | from .pipeline_stable_diffusion_sag import StableDiffusionSAGPipeline 36 | 37 | else: 38 | import sys 39 | 40 | sys.modules[__name__] = _LazyModule( 41 | __name__, 42 | globals()["__file__"], 43 | _import_structure, 44 | module_spec=__spec__, 45 | ) 46 | 47 | for name, value in _dummy_objects.items(): 48 | setattr(sys.modules[__name__], name, value) 49 | -------------------------------------------------------------------------------- /brushnet/diffusers/pipelines/stable_diffusion_xl/pipeline_output.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import List, Union 3 | 4 | import numpy as np 5 | import PIL.Image 6 | 7 | from ...utils import BaseOutput, is_flax_available 8 | 9 | 10 | @dataclass 11 | class StableDiffusionXLPipelineOutput(BaseOutput): 12 | """ 13 | Output class for Stable Diffusion pipelines. 14 | 15 | Args: 16 | images (`List[PIL.Image.Image]` or `np.ndarray`) 17 | List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width, 18 | num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline. 19 | """ 20 | 21 | images: Union[List[PIL.Image.Image], np.ndarray] 22 | 23 | 24 | if is_flax_available(): 25 | import flax 26 | 27 | @flax.struct.dataclass 28 | class FlaxStableDiffusionXLPipelineOutput(BaseOutput): 29 | """ 30 | Output class for Flax Stable Diffusion XL pipelines. 31 | 32 | Args: 33 | images (`np.ndarray`) 34 | Array of shape `(batch_size, height, width, num_channels)` with images from the diffusion pipeline. 
35 | """ 36 | 37 | images: np.ndarray 38 | -------------------------------------------------------------------------------- /brushnet/diffusers/pipelines/stable_diffusion_xl/watermark.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | from ...utils import is_invisible_watermark_available 5 | 6 | 7 | if is_invisible_watermark_available(): 8 | from imwatermark import WatermarkEncoder 9 | 10 | 11 | # Copied from https://github.com/Stability-AI/generative-models/blob/613af104c6b85184091d42d374fef420eddb356d/scripts/demo/streamlit_helpers.py#L66 12 | WATERMARK_MESSAGE = 0b101100111110110010010000011110111011000110011110 13 | # bin(x)[2:] gives bits of x as str, use int to convert them to 0/1 14 | WATERMARK_BITS = [int(bit) for bit in bin(WATERMARK_MESSAGE)[2:]] 15 | 16 | 17 | class StableDiffusionXLWatermarker: 18 | def __init__(self): 19 | self.watermark = WATERMARK_BITS 20 | self.encoder = WatermarkEncoder() 21 | 22 | self.encoder.set_watermark("bits", self.watermark) 23 | 24 | def apply_watermark(self, images: torch.FloatTensor): 25 | # can't encode images that are smaller than 256 26 | if images.shape[-1] < 256: 27 | return images 28 | 29 | images = (255 * (images / 2 + 0.5)).cpu().permute(0, 2, 3, 1).float().numpy() 30 | 31 | images = [self.encoder.encode(image, "dwtDct") for image in images] 32 | 33 | images = torch.from_numpy(np.array(images)).permute(0, 3, 1, 2) 34 | 35 | images = torch.clamp(2 * (images / 255 - 0.5), min=-1.0, max=1.0) 36 | return images 37 | -------------------------------------------------------------------------------- /brushnet/diffusers/pipelines/text_to_video_synthesis/pipeline_output.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import List, Union 3 | 4 | import numpy as np 5 | import PIL 6 | import torch 7 | 8 | from ...utils import ( 9 | BaseOutput, 10 | ) 11 | 12 | 13 | @dataclass 14 | class TextToVideoSDPipelineOutput(BaseOutput): 15 | """ 16 | Output class for text-to-video pipelines. 17 | 18 | Args: 19 | frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]): 20 | List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing denoised 21 | PIL image sequences of length `num_frames.` It can also be a NumPy array or Torch tensor of shape 22 | `(batch_size, num_frames, channels, height, width)` 23 | """ 24 | 25 | frames: Union[torch.Tensor, np.ndarray, List[List[PIL.Image.Image]]] 26 | -------------------------------------------------------------------------------- /brushnet/diffusers/schedulers/README.md: -------------------------------------------------------------------------------- 1 | # Schedulers 2 | 3 | For more information on the schedulers, please refer to the [docs](https://huggingface.co/docs/diffusers/api/schedulers/overview). 
-------------------------------------------------------------------------------- /brushnet/diffusers/schedulers/deprecated/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING 2 | 3 | from ...utils import ( 4 | DIFFUSERS_SLOW_IMPORT, 5 | OptionalDependencyNotAvailable, 6 | _LazyModule, 7 | get_objects_from_module, 8 | is_torch_available, 9 | is_transformers_available, 10 | ) 11 | 12 | 13 | _dummy_objects = {} 14 | _import_structure = {} 15 | 16 | try: 17 | if not (is_transformers_available() and is_torch_available()): 18 | raise OptionalDependencyNotAvailable() 19 | except OptionalDependencyNotAvailable: 20 | from ...utils import dummy_pt_objects # noqa F403 21 | 22 | _dummy_objects.update(get_objects_from_module(dummy_pt_objects)) 23 | else: 24 | _import_structure["scheduling_karras_ve"] = ["KarrasVeScheduler"] 25 | _import_structure["scheduling_sde_vp"] = ["ScoreSdeVpScheduler"] 26 | 27 | if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: 28 | try: 29 | if not is_torch_available(): 30 | raise OptionalDependencyNotAvailable() 31 | 32 | except OptionalDependencyNotAvailable: 33 | from ..utils.dummy_pt_objects import * # noqa F403 34 | else: 35 | from .scheduling_karras_ve import KarrasVeScheduler 36 | from .scheduling_sde_vp import ScoreSdeVpScheduler 37 | 38 | 39 | else: 40 | import sys 41 | 42 | sys.modules[__name__] = _LazyModule( 43 | __name__, 44 | globals()["__file__"], 45 | _import_structure, 46 | module_spec=__spec__, 47 | ) 48 | 49 | for name, value in _dummy_objects.items(): 50 | setattr(sys.modules[__name__], name, value) 51 | -------------------------------------------------------------------------------- /brushnet/diffusers/utils/doc_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ 15 | Doc utilities: Utilities related to documentation 16 | """ 17 | import re 18 | 19 | 20 | def replace_example_docstring(example_docstring): 21 | def docstring_decorator(fn): 22 | func_doc = fn.__doc__ 23 | lines = func_doc.split("\n") 24 | i = 0 25 | while i < len(lines) and re.search(r"^\s*Examples?:\s*$", lines[i]) is None: 26 | i += 1 27 | if i < len(lines): 28 | lines[i] = example_docstring 29 | func_doc = "\n".join(lines) 30 | else: 31 | raise ValueError( 32 | f"The function {fn} should have an empty 'Examples:' in its docstring as placeholder, " 33 | f"current docstring is:\n{func_doc}" 34 | ) 35 | fn.__doc__ = func_doc 36 | return fn 37 | 38 | return docstring_decorator 39 | -------------------------------------------------------------------------------- /brushnet/diffusers/utils/dummy_note_seq_objects.py: -------------------------------------------------------------------------------- 1 | # This file is autogenerated by the command `make fix-copies`, do not edit. 
2 | from ..utils import DummyObject, requires_backends 3 | 4 | 5 | class MidiProcessor(metaclass=DummyObject): 6 | _backends = ["note_seq"] 7 | 8 | def __init__(self, *args, **kwargs): 9 | requires_backends(self, ["note_seq"]) 10 | 11 | @classmethod 12 | def from_config(cls, *args, **kwargs): 13 | requires_backends(cls, ["note_seq"]) 14 | 15 | @classmethod 16 | def from_pretrained(cls, *args, **kwargs): 17 | requires_backends(cls, ["note_seq"]) 18 | -------------------------------------------------------------------------------- /brushnet/diffusers/utils/dummy_onnx_objects.py: -------------------------------------------------------------------------------- 1 | # This file is autogenerated by the command `make fix-copies`, do not edit. 2 | from ..utils import DummyObject, requires_backends 3 | 4 | 5 | class OnnxRuntimeModel(metaclass=DummyObject): 6 | _backends = ["onnx"] 7 | 8 | def __init__(self, *args, **kwargs): 9 | requires_backends(self, ["onnx"]) 10 | 11 | @classmethod 12 | def from_config(cls, *args, **kwargs): 13 | requires_backends(cls, ["onnx"]) 14 | 15 | @classmethod 16 | def from_pretrained(cls, *args, **kwargs): 17 | requires_backends(cls, ["onnx"]) 18 | -------------------------------------------------------------------------------- /brushnet/diffusers/utils/dummy_torch_and_librosa_objects.py: -------------------------------------------------------------------------------- 1 | # This file is autogenerated by the command `make fix-copies`, do not edit. 2 | from ..utils import DummyObject, requires_backends 3 | 4 | 5 | class AudioDiffusionPipeline(metaclass=DummyObject): 6 | _backends = ["torch", "librosa"] 7 | 8 | def __init__(self, *args, **kwargs): 9 | requires_backends(self, ["torch", "librosa"]) 10 | 11 | @classmethod 12 | def from_config(cls, *args, **kwargs): 13 | requires_backends(cls, ["torch", "librosa"]) 14 | 15 | @classmethod 16 | def from_pretrained(cls, *args, **kwargs): 17 | requires_backends(cls, ["torch", "librosa"]) 18 | 19 | 20 | class Mel(metaclass=DummyObject): 21 | _backends = ["torch", "librosa"] 22 | 23 | def __init__(self, *args, **kwargs): 24 | requires_backends(self, ["torch", "librosa"]) 25 | 26 | @classmethod 27 | def from_config(cls, *args, **kwargs): 28 | requires_backends(cls, ["torch", "librosa"]) 29 | 30 | @classmethod 31 | def from_pretrained(cls, *args, **kwargs): 32 | requires_backends(cls, ["torch", "librosa"]) 33 | -------------------------------------------------------------------------------- /brushnet/diffusers/utils/dummy_torch_and_scipy_objects.py: -------------------------------------------------------------------------------- 1 | # This file is autogenerated by the command `make fix-copies`, do not edit. 2 | from ..utils import DummyObject, requires_backends 3 | 4 | 5 | class LMSDiscreteScheduler(metaclass=DummyObject): 6 | _backends = ["torch", "scipy"] 7 | 8 | def __init__(self, *args, **kwargs): 9 | requires_backends(self, ["torch", "scipy"]) 10 | 11 | @classmethod 12 | def from_config(cls, *args, **kwargs): 13 | requires_backends(cls, ["torch", "scipy"]) 14 | 15 | @classmethod 16 | def from_pretrained(cls, *args, **kwargs): 17 | requires_backends(cls, ["torch", "scipy"]) 18 | -------------------------------------------------------------------------------- /brushnet/diffusers/utils/dummy_torch_and_torchsde_objects.py: -------------------------------------------------------------------------------- 1 | # This file is autogenerated by the command `make fix-copies`, do not edit. 
2 | from ..utils import DummyObject, requires_backends 3 | 4 | 5 | class DPMSolverSDEScheduler(metaclass=DummyObject): 6 | _backends = ["torch", "torchsde"] 7 | 8 | def __init__(self, *args, **kwargs): 9 | requires_backends(self, ["torch", "torchsde"]) 10 | 11 | @classmethod 12 | def from_config(cls, *args, **kwargs): 13 | requires_backends(cls, ["torch", "torchsde"]) 14 | 15 | @classmethod 16 | def from_pretrained(cls, *args, **kwargs): 17 | requires_backends(cls, ["torch", "torchsde"]) 18 | -------------------------------------------------------------------------------- /brushnet/diffusers/utils/dummy_torch_and_transformers_and_k_diffusion_objects.py: -------------------------------------------------------------------------------- 1 | # This file is autogenerated by the command `make fix-copies`, do not edit. 2 | from ..utils import DummyObject, requires_backends 3 | 4 | 5 | class StableDiffusionKDiffusionPipeline(metaclass=DummyObject): 6 | _backends = ["torch", "transformers", "k_diffusion"] 7 | 8 | def __init__(self, *args, **kwargs): 9 | requires_backends(self, ["torch", "transformers", "k_diffusion"]) 10 | 11 | @classmethod 12 | def from_config(cls, *args, **kwargs): 13 | requires_backends(cls, ["torch", "transformers", "k_diffusion"]) 14 | 15 | @classmethod 16 | def from_pretrained(cls, *args, **kwargs): 17 | requires_backends(cls, ["torch", "transformers", "k_diffusion"]) 18 | 19 | 20 | class StableDiffusionXLKDiffusionPipeline(metaclass=DummyObject): 21 | _backends = ["torch", "transformers", "k_diffusion"] 22 | 23 | def __init__(self, *args, **kwargs): 24 | requires_backends(self, ["torch", "transformers", "k_diffusion"]) 25 | 26 | @classmethod 27 | def from_config(cls, *args, **kwargs): 28 | requires_backends(cls, ["torch", "transformers", "k_diffusion"]) 29 | 30 | @classmethod 31 | def from_pretrained(cls, *args, **kwargs): 32 | requires_backends(cls, ["torch", "transformers", "k_diffusion"]) 33 | -------------------------------------------------------------------------------- /brushnet/diffusers/utils/dummy_transformers_and_torch_and_note_seq_objects.py: -------------------------------------------------------------------------------- 1 | # This file is autogenerated by the command `make fix-copies`, do not edit. 
2 | from ..utils import DummyObject, requires_backends 3 | 4 | 5 | class SpectrogramDiffusionPipeline(metaclass=DummyObject): 6 | _backends = ["transformers", "torch", "note_seq"] 7 | 8 | def __init__(self, *args, **kwargs): 9 | requires_backends(self, ["transformers", "torch", "note_seq"]) 10 | 11 | @classmethod 12 | def from_config(cls, *args, **kwargs): 13 | requires_backends(cls, ["transformers", "torch", "note_seq"]) 14 | 15 | @classmethod 16 | def from_pretrained(cls, *args, **kwargs): 17 | requires_backends(cls, ["transformers", "torch", "note_seq"]) 18 | -------------------------------------------------------------------------------- /brushnet/diffusers/utils/model_card_template.md: -------------------------------------------------------------------------------- 1 | --- 2 | {{ card_data }} 3 | --- 4 | 5 | 7 | 8 | {{ model_description }} 9 | 10 | ## Intended uses & limitations 11 | 12 | #### How to use 13 | 14 | ```python 15 | # TODO: add an example code snippet for running this diffusion pipeline 16 | ``` 17 | 18 | #### Limitations and bias 19 | 20 | [TODO: provide examples of latent issues and potential remediations] 21 | 22 | ## Training details 23 | 24 | [TODO: describe the data used to train the model] 25 | -------------------------------------------------------------------------------- /data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/basiclab/FreeCond/6aadc496e88a97b8170241059b9b27a75dd050f3/data/.gitkeep -------------------------------------------------------------------------------- /demo_data/img_1_2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/basiclab/FreeCond/6aadc496e88a97b8170241059b9b27a75dd050f3/demo_data/img_1_2.jpg -------------------------------------------------------------------------------- /demo_data/mask_1_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/basiclab/FreeCond/6aadc496e88a97b8170241059b9b27a75dd050f3/demo_data/mask_1_2.png -------------------------------------------------------------------------------- /demo_out/CI_visualization.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/basiclab/FreeCond/6aadc496e88a97b8170241059b9b27a75dd050f3/demo_out/CI_visualization.png -------------------------------------------------------------------------------- /demo_out/all_data_mean.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/basiclab/FreeCond/6aadc496e88a97b8170241059b9b27a75dd050f3/demo_out/all_data_mean.png -------------------------------------------------------------------------------- /demo_out/christmas quokka.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/basiclab/FreeCond/6aadc496e88a97b8170241059b9b27a75dd050f3/demo_out/christmas quokka.png -------------------------------------------------------------------------------- /demo_out/christmas_freecond.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/basiclab/FreeCond/6aadc496e88a97b8170241059b9b27a75dd050f3/demo_out/christmas_freecond.png -------------------------------------------------------------------------------- /demo_out/ganster_otter.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/basiclab/FreeCond/6aadc496e88a97b8170241059b9b27a75dd050f3/demo_out/ganster_otter.png -------------------------------------------------------------------------------- /demo_out/github_teaser.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/basiclab/FreeCond/6aadc496e88a97b8170241059b9b27a75dd050f3/demo_out/github_teaser.jpg -------------------------------------------------------------------------------- /demo_out/halloween_quokka.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/basiclab/FreeCond/6aadc496e88a97b8170241059b9b27a75dd050f3/demo_out/halloween_quokka.png -------------------------------------------------------------------------------- /demo_out/paper_teaser.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/basiclab/FreeCond/6aadc496e88a97b8170241059b9b27a75dd050f3/demo_out/paper_teaser.jpg -------------------------------------------------------------------------------- /demo_out/preset.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/basiclab/FreeCond/6aadc496e88a97b8170241059b9b27a75dd050f3/demo_out/preset.png -------------------------------------------------------------------------------- /demo_out/self_attn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/basiclab/FreeCond/6aadc496e88a97b8170241059b9b27a75dd050f3/demo_out/self_attn.png -------------------------------------------------------------------------------- /demo_out/self_attn_multi.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/basiclab/FreeCond/6aadc496e88a97b8170241059b9b27a75dd050f3/demo_out/self_attn_multi.png -------------------------------------------------------------------------------- /demo_out/spy_otter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/basiclab/FreeCond/6aadc496e88a97b8170241059b9b27a75dd050f3/demo_out/spy_otter.png -------------------------------------------------------------------------------- /demo_out/tokens_mean.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/basiclab/FreeCond/6aadc496e88a97b8170241059b9b27a75dd050f3/demo_out/tokens_mean.png -------------------------------------------------------------------------------- /freecond_demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/basiclab/FreeCond/6aadc496e88a97b8170241059b9b27a75dd050f3/freecond_demo.gif -------------------------------------------------------------------------------- /freecond_src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/basiclab/FreeCond/6aadc496e88a97b8170241059b9b27a75dd050f3/freecond_src/__init__.py -------------------------------------------------------------------------------- /gp_model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/basiclab/FreeCond/6aadc496e88a97b8170241059b9b27a75dd050f3/gp_model.pkl 
-------------------------------------------------------------------------------- /hdpainter_src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/basiclab/FreeCond/6aadc496e88a97b8170241059b9b27a75dd050f3/hdpainter_src/__init__.py -------------------------------------------------------------------------------- /hdpainter_src/config/ddpm/v1.yaml: -------------------------------------------------------------------------------- 1 | linear_start: 0.00085 2 | linear_end: 0.0120 3 | num_timesteps_cond: 1 4 | log_every_t: 200 5 | timesteps: 1000 6 | first_stage_key: "jpg" 7 | cond_stage_key: "txt" 8 | image_size: 64 9 | channels: 4 10 | cond_stage_trainable: false 11 | conditioning_key: crossattn 12 | monitor: val/loss_simple_ema 13 | scale_factor: 0.18215 14 | use_ema: False # we set this to false because this is an inference only config -------------------------------------------------------------------------------- /hdpainter_src/config/ddpm/v2-upsample.yaml: -------------------------------------------------------------------------------- 1 | parameterization: "v" 2 | low_scale_key: "lr" 3 | linear_start: 0.0001 4 | linear_end: 0.02 5 | num_timesteps_cond: 1 6 | log_every_t: 200 7 | timesteps: 1000 8 | first_stage_key: "jpg" 9 | cond_stage_key: "txt" 10 | image_size: 128 11 | channels: 4 12 | cond_stage_trainable: false 13 | conditioning_key: "hybrid-adm" 14 | monitor: val/loss_simple_ema 15 | scale_factor: 0.08333 16 | use_ema: False 17 | 18 | low_scale_config: 19 | target: ldm.modules.diffusionmodules.upscaling.ImageConcatWithNoiseAugmentation 20 | params: 21 | noise_schedule_config: # image space 22 | linear_start: 0.0001 23 | linear_end: 0.02 24 | max_noise_level: 350 25 | -------------------------------------------------------------------------------- /hdpainter_src/config/encoders/clip.yaml: -------------------------------------------------------------------------------- 1 | __class__: smplfusion.models.encoders.clip_embedder.FrozenCLIPEmbedder -------------------------------------------------------------------------------- /hdpainter_src/config/encoders/openclip.yaml: -------------------------------------------------------------------------------- 1 | __class__: smplfusion.models.encoders.open_clip_embedder.FrozenOpenCLIPEmbedder 2 | __init__: 3 | freeze: True 4 | layer: "penultimate" -------------------------------------------------------------------------------- /hdpainter_src/config/unet/inpainting/v1.yaml: -------------------------------------------------------------------------------- 1 | __class__: smplfusion.models.unet.UNetModel 2 | __init__: 3 | image_size: 32 # unused 4 | in_channels: 9 # 4 data + 4 downscaled image + 1 mask 5 | out_channels: 4 6 | model_channels: 320 7 | attention_resolutions: [ 4, 2, 1 ] 8 | num_res_blocks: 2 9 | channel_mult: [ 1, 2, 4, 4 ] 10 | num_heads: 8 11 | use_spatial_transformer: True 12 | transformer_depth: 1 13 | context_dim: 768 14 | use_checkpoint: False 15 | legacy: False -------------------------------------------------------------------------------- /hdpainter_src/config/unet/inpainting/v2.yaml: -------------------------------------------------------------------------------- 1 | __class__: smplfusion.models.unet.UNetModel 2 | __init__: 3 | use_checkpoint: False 4 | image_size: 32 # unused 5 | in_channels: 9 6 | out_channels: 4 7 | model_channels: 320 8 | attention_resolutions: [ 4, 2, 1 ] 9 | num_res_blocks: 2 10 | channel_mult: [ 1, 2, 4, 4 ] 11 | 
num_head_channels: 64 # need to fix for flash-attn 12 | use_spatial_transformer: True 13 | use_linear_in_transformer: True 14 | transformer_depth: 1 15 | context_dim: 1024 16 | legacy: False -------------------------------------------------------------------------------- /hdpainter_src/config/unet/upsample/v2.yaml: -------------------------------------------------------------------------------- 1 | __class__: smplfusion.models.unet.UNetModel 2 | __init__: 3 | use_checkpoint: False 4 | num_classes: 1000 # timesteps for noise conditioning (here constant, just need one) 5 | image_size: 128 6 | in_channels: 7 7 | out_channels: 4 8 | model_channels: 256 9 | attention_resolutions: [ 2,4,8] 10 | num_res_blocks: 2 11 | channel_mult: [ 1, 2, 2, 4] 12 | disable_self_attentions: [True, True, True, False] 13 | disable_middle_self_attn: False 14 | num_heads: 8 15 | use_spatial_transformer: True 16 | transformer_depth: 1 17 | context_dim: 1024 18 | legacy: False 19 | use_linear_in_transformer: True -------------------------------------------------------------------------------- /hdpainter_src/config/vae-upsample.yaml: -------------------------------------------------------------------------------- 1 | __class__: smplfusion.models.vae.AutoencoderKL 2 | __init__: 3 | embed_dim: 4 4 | ddconfig: 5 | double_z: True 6 | z_channels: 4 7 | resolution: 256 8 | in_channels: 3 9 | out_ch: 3 10 | ch: 128 11 | ch_mult: [ 1,2,4 ] 12 | num_res_blocks: 2 13 | attn_resolutions: [ ] 14 | dropout: 0.0 15 | lossconfig: 16 | target: torch.nn.Identity -------------------------------------------------------------------------------- /hdpainter_src/config/vae.yaml: -------------------------------------------------------------------------------- 1 | __class__: smplfusion.models.vae.AutoencoderKL 2 | __init__: 3 | embed_dim: 4 4 | monitor: val/rec_loss 5 | ddconfig: 6 | double_z: true 7 | z_channels: 4 8 | resolution: 256 9 | in_channels: 3 10 | out_ch: 3 11 | ch: 128 12 | ch_mult: [1,2,4,4] 13 | num_res_blocks: 2 14 | attn_resolutions: [] 15 | dropout: 0.0 16 | lossconfig: 17 | target: torch.nn.Identity -------------------------------------------------------------------------------- /hdpainter_src/methods/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/basiclab/FreeCond/6aadc496e88a97b8170241059b9b27a75dd050f3/hdpainter_src/methods/__init__.py -------------------------------------------------------------------------------- /hdpainter_src/models/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import sd2_sr, sam 2 | from .inpainting import load_inpainting_model, pre_download_inpainting_models 3 | -------------------------------------------------------------------------------- /hdpainter_src/models/sam.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from segment_anything import sam_model_registry, SamPredictor 3 | from .common import * 4 | 5 | MODEL_PATH = f'{MODEL_FOLDER}/sam/sam_vit_h_4b8939.pth' 6 | DOWNLOAD_URL = 'https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth' 7 | 8 | # pre-download 9 | # download_file(DOWNLOAD_URL, MODEL_PATH) 10 | 11 | 12 | def load_model(device='cuda:0'): 13 | download_file(DOWNLOAD_URL, MODEL_PATH) 14 | sam = sam_model_registry["vit_h"](checkpoint=MODEL_PATH) 15 | sam.to(device=device) 16 | sam_predictor = SamPredictor(sam) 17 | return sam_predictor 18 | -------------------------------------------------------------------------------- /hdpainter_src/smplfusion/__init__.py: -------------------------------------------------------------------------------- 1 | from . import share, scheduler 2 | from .ddim import DDIM 3 | from .patches import router, attentionpatch, transformerpatch -------------------------------------------------------------------------------- /hdpainter_src/smplfusion/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/basiclab/FreeCond/6aadc496e88a97b8170241059b9b27a75dd050f3/hdpainter_src/smplfusion/models/__init__.py -------------------------------------------------------------------------------- /hdpainter_src/smplfusion/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/basiclab/FreeCond/6aadc496e88a97b8170241059b9b27a75dd050f3/hdpainter_src/smplfusion/modules/__init__.py -------------------------------------------------------------------------------- /hdpainter_src/smplfusion/modules/attention/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/basiclab/FreeCond/6aadc496e88a97b8170241059b9b27a75dd050f3/hdpainter_src/smplfusion/modules/attention/__init__.py -------------------------------------------------------------------------------- /hdpainter_src/smplfusion/modules/attention/feed_forward.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class GEGLU(nn.Module): 7 | def __init__(self, dim_in, dim_out): 8 | super().__init__() 9 | self.proj = nn.Linear(dim_in, dim_out * 2) 10 | 11 | def forward(self, x): 12 | x, gate = self.proj(x).chunk(2, dim=-1) 13 | return x * F.gelu(gate) 14 | 15 | 16 | class FeedForward(nn.Module): 17 | def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.): 18 | super().__init__() 19 | inner_dim = int(dim * mult) 20 | dim_out = dim_out or dim 21 | project_in = nn.Sequential( 22 | nn.Linear(dim, inner_dim), 23 | nn.GELU() 24 | ) if not glu else GEGLU(dim, inner_dim) 25 | 26 | self.net = nn.Sequential( 27 | project_in, 28 | nn.Dropout(dropout), 29 | nn.Linear(inner_dim, dim_out) 30 | ) 31 | 32 | def forward(self, x): 33 | return self.net(x) -------------------------------------------------------------------------------- /hdpainter_src/smplfusion/patches/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/basiclab/FreeCond/6aadc496e88a97b8170241059b9b27a75dd050f3/hdpainter_src/smplfusion/patches/__init__.py -------------------------------------------------------------------------------- /hdpainter_src/smplfusion/patches/attentionpatch/__init__.py: -------------------------------------------------------------------------------- 1 | from . import default 2 | from . import painta 3 | -------------------------------------------------------------------------------- /hdpainter_src/smplfusion/patches/router.py: -------------------------------------------------------------------------------- 1 | from . import attentionpatch 2 | from . import transformerpatch 3 | 4 | attention_forward = attentionpatch.default.forward 5 | basic_transformer_forward = transformerpatch.default.forward 6 | 7 | def reset(): 8 | global attention_forward, basic_transformer_forward 9 | attention_forward = attentionpatch.default.forward 10 | basic_transformer_forward = transformerpatch.default.forward 11 | -------------------------------------------------------------------------------- /hdpainter_src/smplfusion/patches/transformerpatch/__init__.py: -------------------------------------------------------------------------------- 1 | from . import default 2 | from . import painta 3 | -------------------------------------------------------------------------------- /hdpainter_src/smplfusion/patches/transformerpatch/default.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from ... import share 3 | 4 | def forward(self, x, context=None): 5 | x = x + self.attn1(self.norm1(x), context=context if self.disable_self_attn else None) # Self Attn. 6 | x = x + self.attn2(self.norm2(x), context=context) # Cross Attn. 
7 | x = x + self.ff(self.norm3(x)) 8 | return x 9 | -------------------------------------------------------------------------------- /hdpainter_src/smplfusion/scheduler.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | def linear(n_timestep = 1000, start = 1e-4, end = 2e-2): 4 | return Schedule(torch.linspace(start ** 0.5, end ** 0.5, n_timestep, dtype = torch.float64) ** 2) 5 | 6 | class Schedule: 7 | def __init__(self, betas): 8 | self.betas = betas 9 | self._alphas = 1 - betas 10 | self.alphas = torch.cumprod(self._alphas, 0) 11 | self.one_minus_alphas = 1 - self.alphas 12 | self.sqrt_alphas = torch.sqrt(self.alphas) 13 | self.sqrt_one_minus_alphas = torch.sqrt(1 - self.alphas) 14 | self.sqrt_noise_signal_ratio = self.sqrt_one_minus_alphas / self.sqrt_alphas 15 | self.noise_signal_ratio = (1 - self.alphas) / self.alphas 16 | 17 | def range(self, dt): 18 | return range(len(self.betas)-1, 0, -dt) 19 | 20 | def sigma(self, t, dt): 21 | return torch.sqrt((1 - self._alphas[t - dt]) / (1 - self._alphas[t]) * (1 - self._alphas[t] / self._alphas[t - dt])) 22 | -------------------------------------------------------------------------------- /hdpainter_src/smplfusion/util.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | from ..utils import IImage 3 | 4 | 5 | def instantiate_from_config(config): 6 | if not "target" in config: 7 | if config == '__is_first_stage__': 8 | return None 9 | elif config == "__is_unconditional__": 10 | return None 11 | raise KeyError("Expected key `target` to instantiate.") 12 | return get_obj_from_str(config["target"])(**config.get("params", dict())) 13 | 14 | 15 | def get_obj_from_str(string, reload=False): 16 | module, cls = string.rsplit(".", 1) 17 | if reload: 18 | module_imp = importlib.import_module(module) 19 | importlib.reload(module_imp) 20 | return getattr(importlib.import_module(module, package=None), cls) -------------------------------------------------------------------------------- /hdpainter_src/smplfusion/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .input_image import InputImage 2 | from .input_mask import InputMask 3 | from .input_shape import InputShape 4 | -------------------------------------------------------------------------------- /hdpainter_src/smplfusion/utils/input_shape.py: -------------------------------------------------------------------------------- 1 | class InputShape: 2 | def __init__(self, image_size): 3 | self.h,self.w = image_size[::-1] 4 | self.res = self.h * self.w 5 | self.res64 = self.res // 64 6 | self.res32 = self.res // 64 // 4 7 | self.res16 = self.res // 64 // 16 8 | self.res8 = self.res // 64 // 64 9 | self.shape = [self.h, self.w] 10 | self.shape64 = [self.h // 8, self.w // 8] 11 | self.shape32 = [self.h // 16, self.w // 16] 12 | self.shape16 = [self.h // 32, self.w // 32] 13 | self.shape8 = [self.h // 64, self.w // 64] 14 | 15 | def reshape(self, x): 16 | assert len(x.shape) == 3 17 | if x.shape[1] == self.res64: return x.reshape([x.shape[0]] + self.shape64 + [x.shape[-1]]) 18 | if x.shape[1] == self.res32: return x.reshape([x.shape[0]] + self.shape32 + [x.shape[-1]]) 19 | if x.shape[1] == self.res16: return x.reshape([x.shape[0]] + self.shape16 + [x.shape[-1]]) 20 | if x.shape[1] == self.res8: return x.reshape([x.shape[0]] + self.shape8 + [x.shape[-1]]) 21 | raise Exception("Unknown shape") 22 | 23 | def get_res(self, q, device = 'cpu'): 
24 | if q.shape[1] == self.res64: return 64 25 | if q.shape[1] == self.res32: return 32 26 | if q.shape[1] == self.res16: return 16 27 | if q.shape[1] == self.res8: return 8 -------------------------------------------------------------------------------- /hdpainter_src/utils/scores.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torch.nn.functional as F 4 | 5 | def l1(_crossattn_similarity, mask, token_idx = [1,2]): 6 | similarity = torch.cat(_crossattn_similarity,1)[1] 7 | similarity = similarity.mean(0).permute(2,0,1) 8 | # similarity = similarity.softmax(dim = 0) 9 | 10 | return (similarity[token_idx] * mask.cuda()).sum() 11 | 12 | def bce(_crossattn_similarity, mask, token_idx = [1,2]): 13 | similarity = torch.cat(_crossattn_similarity,1)[1] 14 | similarity = similarity.mean(0).permute(2,0,1) 15 | # similarity = similarity.softmax(dim = 0) 16 | 17 | return -sum([ 18 | F.binary_cross_entropy_with_logits(x - 1.0, mask.cuda()) 19 | for x in similarity[token_idx] 20 | ]) 21 | 22 | def softmax(_crossattn_similarity, mask, token_idx = [1,2]): 23 | similarity = torch.cat(_crossattn_similarity,1)[1] 24 | similarity = similarity.mean(0).permute(2,0,1) 25 | 26 | similarity = similarity[1:].softmax(dim = 0) # Compute the softmax to obtain probability values 27 | token_idx = [x - 1 for x in token_idx] 28 | 29 | score = similarity[token_idx].sum(dim = 0) # Sum up all relevant tokens to get the pixel-wise probability of belonging to the correct class 30 | score = torch.log(score) # Obtain per-pixel log-probabilities 31 | return (score * mask.cuda()).sum() # Sum up log-probabilities (equivalent to multiplying probabilities) for all pixels inside the mask -------------------------------------------------------------------------------- /powerpaint/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .BrushNet_CA import BrushNetModel 2 | from .unet_2d_condition import UNet2DConditionModel 3 | 4 | 5 | __all__ = ["BrushNetModel", "UNet2DConditionModel"] 6 | -------------------------------------------------------------------------------- /powerpaint/pipelines/__init__.py: -------------------------------------------------------------------------------- 1 | from .pipeline_PowerPaint import StableDiffusionInpaintPipeline 2 | from .pipeline_PowerPaint_Brushnet_CA import StableDiffusionPowerPaintBrushNetPipeline 3 | from .pipeline_PowerPaint_ControlNet import StableDiffusionControlNetInpaintPipeline 4 | 5 | 6 | __all__ = [ 7 | "StableDiffusionInpaintPipeline", 8 | "StableDiffusionControlNetInpaintPipeline", 9 | "StableDiffusionPowerPaintBrushNetPipeline", 10 | ] 11 | -------------------------------------------------------------------------------- /powerpaint/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import ImageProjection, TokenizerWrapper 2 | 3 | 4 | __all__ = ["TokenizerWrapper", "ImageProjection"] 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | #--extra-index-url https://download.pytorch.org/whl/cu118 2 | 3 | einops==0.7.0 4 | gradio==3.47.1 5 | mmdet==3.3.0 6 | numpy==1.24.1 7 | omegaconf==2.3.0 8 | open-clip-torch==2.23.0 9 | opencv-python==4.7.* 10 | openmim==0.3.9 11 | mmengine==0.10.2 12 | openai-clip==1.0.1 13 | pandas==2.1.4 14 | Pillow==9.4.0 15 |
pytorch-lightning==2.1.2 16 | PyYAML==6.0.1 17 | safetensors==0.4.5 18 | scipy==1.10.0 19 | segment-anything @ git+https://github.com/facebookresearch/segment-anything.git@6fdee8f2727f4506cfbbe553e23b895e27956588 20 | torch==2.1.1 21 | torchvision==0.16.1 22 | tqdm==4.66.1 23 | transformers==4.45.1 24 | xformers==0.0.23 25 | image-reward==1.5 26 | hpsv2 @ git+https://github.com/tgxs002/HPSv2 27 | kmedoids==0.5.2 28 | # for t2v score 29 | iopath==0.1.10 30 | openai==1.75.0 31 | tiktoken==0.9.0 -------------------------------------------------------------------------------- /t2v_metrics/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | 6 | from .constants import HF_CACHE_DIR 7 | from .vqascore import VQAScore, list_all_vqascore_models 8 | from .clipscore import CLIPScore, list_all_clipscore_models 9 | from .itmscore import ITMScore, list_all_itmscore_models 10 | 11 | def list_all_models(): 12 | return list_all_vqascore_models() + list_all_clipscore_models() + list_all_itmscore_models() 13 | 14 | def get_score_model(model='clip-flant5-xxl', device='cuda', cache_dir=HF_CACHE_DIR, **kwargs): 15 | if model in list_all_vqascore_models(): 16 | return VQAScore(model, device=device, cache_dir=cache_dir, **kwargs) 17 | elif model in list_all_clipscore_models(): 18 | return CLIPScore(model, device=device, cache_dir=cache_dir, **kwargs) 19 | elif model in list_all_itmscore_models(): 20 | return ITMScore(model, device=device, cache_dir=cache_dir, **kwargs) 21 | else: 22 | raise NotImplementedError() -------------------------------------------------------------------------------- /t2v_metrics/clipscore.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from .score import Score 4 | 5 | from .constants import HF_CACHE_DIR 6 | 7 | from .models.clipscore_models import list_all_clipscore_models, get_clipscore_model 8 | 9 | class CLIPScore(Score): 10 | def prepare_scoremodel(self, 11 | model='openai:ViT-L/14', 12 | device='cuda', 13 | cache_dir=HF_CACHE_DIR): 14 | return get_clipscore_model( 15 | model, 16 | device=device, 17 | cache_dir=cache_dir 18 | ) 19 | 20 | def list_all_models(self) -> List[str]: 21 | return list_all_clipscore_models() -------------------------------------------------------------------------------- /t2v_metrics/constants.py: -------------------------------------------------------------------------------- 1 | HF_CACHE_DIR = "./hf_cache/" # TODO: change this to your own cache dir 2 | 3 | # For CLIP-FlanT5 and LLaVA-1.5 (copied from llava) 4 | CONTEXT_LEN = 2048 5 | SYSTEM_MSG = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions." 
6 | IGNORE_INDEX = -100 7 | IMAGE_TOKEN_INDEX = -200 8 | DEFAULT_IMAGE_TOKEN = "<image>" -------------------------------------------------------------------------------- /t2v_metrics/itmscore.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from .score import Score 4 | 5 | from .constants import HF_CACHE_DIR 6 | 7 | from .models.itmscore_models import list_all_itmscore_models, get_itmscore_model 8 | 9 | class ITMScore(Score): 10 | def prepare_scoremodel(self, 11 | model='blip2-itm', 12 | device='cuda', 13 | cache_dir=HF_CACHE_DIR): 14 | return get_itmscore_model( 15 | model, 16 | device=device, 17 | cache_dir=cache_dir 18 | ) 19 | 20 | def list_all_models(self) -> List[str]: 21 | return list_all_itmscore_models() -------------------------------------------------------------------------------- /t2v_metrics/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/basiclab/FreeCond/6aadc496e88a97b8170241059b9b27a75dd050f3/t2v_metrics/models/__init__.py -------------------------------------------------------------------------------- /t2v_metrics/models/clipscore_models/__init__.py: -------------------------------------------------------------------------------- 1 | from .clip_model import CLIP_MODELS, CLIPScoreModel 2 | from .blip2_itc_model import BLIP2_ITC_MODELS, BLIP2ITCScoreModel 3 | from .hpsv2_model import HPSV2_MODELS, HPSV2ScoreModel 4 | from .pickscore_model import PICKSCORE_MODELS, PickScoreModel 5 | from ...constants import HF_CACHE_DIR 6 | 7 | ALL_CLIP_MODELS = [ 8 | CLIP_MODELS, 9 | BLIP2_ITC_MODELS, 10 | HPSV2_MODELS, 11 | PICKSCORE_MODELS, 12 | ] 13 | 14 | def list_all_clipscore_models(): 15 | return [model for models in ALL_CLIP_MODELS for model in models] 16 | 17 | def get_clipscore_model(model_name, device='cuda', cache_dir=HF_CACHE_DIR): 18 | assert model_name in list_all_clipscore_models() 19 | if model_name in CLIP_MODELS: 20 | return CLIPScoreModel(model_name, device=device, cache_dir=cache_dir) 21 | elif model_name in BLIP2_ITC_MODELS: 22 | return BLIP2ITCScoreModel(model_name, device=device, cache_dir=cache_dir) 23 | elif model_name in HPSV2_MODELS: 24 | return HPSV2ScoreModel(model_name, device=device, cache_dir=cache_dir) 25 | elif model_name in PICKSCORE_MODELS: 26 | return PickScoreModel(model_name, device=device, cache_dir=cache_dir) 27 | else: 28 | raise NotImplementedError() -------------------------------------------------------------------------------- /t2v_metrics/models/itmscore_models/__init__.py: -------------------------------------------------------------------------------- 1 | from .blip2_itm_model import BLIP2_ITM_MODELS, BLIP2ITMScoreModel 2 | from .image_reward_model import IMAGE_REWARD_MODELS, ImageRewardScoreModel 3 | from ...constants import HF_CACHE_DIR 4 | 5 | ALL_ITM_MODELS = [ 6 | BLIP2_ITM_MODELS, 7 | IMAGE_REWARD_MODELS, 8 | ] 9 | 10 | def list_all_itmscore_models(): 11 | return [model for models in ALL_ITM_MODELS for model in models] 12 | 13 | def get_itmscore_model(model_name, device='cuda', cache_dir=HF_CACHE_DIR): 14 | assert model_name in list_all_itmscore_models() 15 | if model_name in BLIP2_ITM_MODELS: 16 | return BLIP2ITMScoreModel(model_name, device=device, cache_dir=cache_dir) 17 | elif model_name in IMAGE_REWARD_MODELS: 18 | return ImageRewardScoreModel(model_name, device=device, cache_dir=cache_dir) 19 | else: 20 | raise NotImplementedError()
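
Note on usage (not part of the repository): the factory modules above funnel into the single entry point t2v_metrics.get_score_model(), which dispatches a model name to the matching VQAScore, CLIPScore, or ITMScore family. The sketch below is only an illustration: it assumes the dump's t2v_metrics package is importable, that the Score wrapper in t2v_metrics/score.py (not shown in this listing) forwards the (images, texts) lists to ScoreModel.forward, and the image path and caption are invented examples.

import t2v_metrics

# Enumerate every registered model name across the VQA/CLIP/ITM score families.
print(t2v_metrics.list_all_models())

# get_score_model() dispatches on the model name; "openai:ViT-L/14" is the
# CLIPScore default shown above, and device='cuda' mirrors the defaults in the dump.
clip_score = t2v_metrics.get_score_model(model="openai:ViT-L/14", device="cuda")

# Assumed call pattern via the Score wrapper: one score per (image, text) pair.
scores = clip_score(images=["examples/inpainted.png"],
                    texts=["a red sports car parked on the street"])
print(scores)
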
-------------------------------------------------------------------------------- /t2v_metrics/models/model.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import List 3 | import os 4 | import torch 5 | import numpy as np 6 | from PIL import Image 7 | 8 | from ..constants import HF_CACHE_DIR 9 | 10 | def image_loader(image_path): 11 | if image_path.split('.')[-1] == 'npy': 12 | return Image.fromarray(np.load(image_path)[:, :, [2, 1, 0]], 'RGB') 13 | else: 14 | return Image.open(image_path).convert("RGB") 15 | 16 | class ScoreModel(ABC): 17 | def __init__(self, 18 | model_name='clip-flant5-xxl', 19 | device='cuda', 20 | cache_dir=HF_CACHE_DIR): 21 | self.model_name = model_name 22 | self.device = device 23 | self.cache_dir = cache_dir 24 | if not os.path.exists(self.cache_dir): 25 | os.makedirs(self.cache_dir) 26 | self.image_loader = image_loader 27 | self.load_model() 28 | 29 | @abstractmethod 30 | def load_model(self): 31 | """Load the model, tokenizer, etc. 32 | """ 33 | pass 34 | 35 | @abstractmethod 36 | def load_images(self, 37 | image: List[str]) -> torch.Tensor: 38 | """Load the image(s) and return a preprocessed tensor placed on self.device. 39 | """ 40 | pass 41 | 42 | @abstractmethod 43 | def forward(self, 44 | images: List[str], 45 | texts: List[str]) -> torch.Tensor: 46 | """Forward pass of the model, returning n scores (as a PyTorch tensor) for n (image, text) pairs. 47 | """ 48 | pass -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/__init__.py: -------------------------------------------------------------------------------- 1 | from .clip_t5_model import CLIP_T5_MODELS, CLIPT5Model 2 | from .llava_model import LLAVA_MODELS, LLaVAModel 3 | from .llava16_model import LLAVA16_MODELS, LLaVA16Model 4 | from .instructblip_model import InstructBLIP_MODELS, InstructBLIPModel 5 | from .gpt4v_model import GPT4V_MODELS, GPT4VModel 6 | from ...constants import HF_CACHE_DIR 7 | 8 | ALL_VQA_MODELS = [ 9 | CLIP_T5_MODELS, 10 | LLAVA_MODELS, 11 | LLAVA16_MODELS, 12 | InstructBLIP_MODELS, 13 | GPT4V_MODELS, 14 | ] 15 | 16 | def list_all_vqascore_models(): 17 | return [model for models in ALL_VQA_MODELS for model in models] 18 | 19 | def get_vqascore_model(model_name, device='cuda', cache_dir=HF_CACHE_DIR, **kwargs): 20 | assert model_name in list_all_vqascore_models() 21 | if model_name in CLIP_T5_MODELS: 22 | return CLIPT5Model(model_name, device=device, cache_dir=cache_dir, **kwargs) 23 | elif model_name in LLAVA_MODELS: 24 | return LLaVAModel(model_name, device=device, cache_dir=cache_dir, **kwargs) 25 | elif model_name in LLAVA16_MODELS: 26 | return LLaVA16Model(model_name, device=device, cache_dir=cache_dir, **kwargs) 27 | elif model_name in InstructBLIP_MODELS: 28 | return InstructBLIPModel(model_name, device=device, cache_dir=cache_dir, **kwargs) 29 | elif model_name in GPT4V_MODELS: 30 | return GPT4VModel(model_name, device=device, cache_dir=cache_dir, **kwargs) 31 | else: 32 | raise NotImplementedError() -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/clip_t5/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/basiclab/FreeCond/6aadc496e88a97b8170241059b9b27a75dd050f3/t2v_metrics/models/vqascore_models/clip_t5/__init__.py
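
Note on the backend contract (not part of the repository): model.py above requires every score backend to subclass ScoreModel and implement load_model, load_images, and forward, while __init__ handles the cache directory and attaches the shared image_loader. The sketch below is a hypothetical, do-nothing backend written only to illustrate that contract; DummyScoreModel, its zero scores, and the device='cpu' choice are assumptions, not repository code.

from typing import List

import torch

from t2v_metrics.models.model import ScoreModel  # the ABC shown above


class DummyScoreModel(ScoreModel):
    """Hypothetical backend that returns a constant score for every pair."""

    def load_model(self):
        # A real backend would load weights and a tokenizer here
        # (see CLIPScoreModel, BLIP2ITMScoreModel, CLIPT5Model, ...).
        self.model = None

    def load_images(self, image: List[str]) -> torch.Tensor:
        # Reuse the shared PIL loader, then stand in for real preprocessing.
        pils = [self.image_loader(path) for path in image]
        return torch.zeros(len(pils), 3, 224, 224, device=self.device)

    def forward(self, images: List[str], texts: List[str]) -> torch.Tensor:
        assert len(images) == len(texts)
        _ = self.load_images(images)
        return torch.zeros(len(images), device=self.device)  # one score per pair


# dummy = DummyScoreModel(model_name="dummy", device="cpu")  # 'cpu' avoids needing a GPU

Registering such a backend would additionally require adding its name to one of the model lists consumed by the factories above (e.g. get_vqascore_model); that wiring is omitted here.
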
-------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .language_model.clip_t5 import CLIPT5ForConditionalGeneration, CLIPT5Config, ModelArguments -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .clip_encoder import CLIPVisionTower 3 | 4 | 5 | def build_vision_tower(vision_tower_cfg, **kwargs): 6 | vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None)) 7 | is_absolute_path_exists = os.path.exists(vision_tower) 8 | if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion"): 9 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 10 | 11 | raise ValueError(f'Unknown vision tower: {vision_tower}') 12 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import os 9 | import sys 10 | 11 | from omegaconf import OmegaConf 12 | 13 | from .common.registry import registry 14 | 15 | from .models import * 16 | from .processors import * 17 | 18 | 19 | root_dir = os.path.dirname(os.path.abspath(__file__)) 20 | default_cfg = OmegaConf.load(os.path.join(root_dir, "configs/default.yaml")) 21 | 22 | registry.register_path("library_root", root_dir) 23 | repo_root = os.path.join(root_dir, "..") 24 | registry.register_path("repo_root", repo_root) 25 | cache_root = os.path.join(repo_root, default_cfg.env.cache_root) 26 | registry.register_path("cache_root", cache_root) 27 | 28 | registry.register("MAX_INT", sys.maxsize) 29 | registry.register("SPLIT_NAMES", ["train", "val", "test"]) 30 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/common/gradcam.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from matplotlib import pyplot as plt 3 | from scipy.ndimage import filters 4 | from skimage import transform as skimage_transform 5 | 6 | 7 | def getAttMap(img, attMap, blur=True, overlap=True): 8 | attMap -= attMap.min() 9 | if attMap.max() > 0: 10 | attMap /= attMap.max() 11 | attMap = skimage_transform.resize(attMap, (img.shape[:2]), order=3, mode="constant") 12 | if blur: 13 | attMap = filters.gaussian_filter(attMap, 0.02 * max(img.shape[:2])) 14 | attMap -= attMap.min() 15 | attMap /= attMap.max() 16 | cmap = plt.get_cmap("jet") 17 | attMapV = cmap(attMap) 18 | attMapV = np.delete(attMapV, 3, 2) 19 | if overlap: 20 | attMap = ( 21 | 1 * (1 - attMap**0.7).reshape(attMap.shape + (1,)) * img 22 | + (attMap**0.7).reshape(attMap.shape + (1,)) * attMapV 23 | ) 24 | return attMap 25 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | __author__ = "aagrawal" 9 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/datasets/avsd/defaults_dial.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | avsd_dialogue: # name of the dataset builder 8 | dataset_card: dataset_card/avsd_dialogue.md 9 | data_type: features #extracted features of videos (I3D, VGGish) # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/avsd_dstc7_train.json 16 | storage: avsd/annotations/train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/avsd_dstc7_val.json 19 | storage: avsd/annotations/val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/avsd_dstc7_test.json 22 | storage: avsd/annotations/test.json 23 | features: 24 | storage: avsd/features/ 25 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/datasets/coco/defaults_cap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | coco_caption: # name of the dataset builder 8 | dataset_card: dataset_card/coco_caption.md 9 | # data_dir: ${env.data_dir}/datasets 10 | data_type: images # [images|videos|features] 11 | 12 | build_info: 13 | # Be careful not to append minus sign (-) before split to avoid itemizing 14 | annotations: 15 | train: 16 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_train.json 17 | md5: aa31ac474cf6250ebb81d18348a07ed8 18 | storage: coco/annotations/coco_karpathy_train.json 19 | val: 20 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json 21 | md5: b273847456ef5580e33713b1f7de52a0 22 | storage: coco/annotations/coco_karpathy_val.json 23 | test: 24 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json 25 | md5: 3ff34b0ef2db02d01c37399f6a2a6cd1 26 | storage: coco/annotations/coco_karpathy_test.json 27 | images: 28 | storage: coco/images/ 29 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/datasets/coco/defaults_ret.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | coco_retrieval: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_train.json 16 | md5: aa31ac474cf6250ebb81d18348a07ed8 17 | storage: coco/annotations/coco_karpathy_train.json 18 | val: 19 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json 20 | md5: b273847456ef5580e33713b1f7de52a0 21 | storage: coco/annotations/coco_karpathy_val.json 22 | test: 23 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json 24 | md5: 3ff34b0ef2db02d01c37399f6a2a6cd1 25 | storage: coco/annotations/coco_karpathy_test.json 26 | images: 27 | storage: coco/images/ 28 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/datasets/coco/eval_vqa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | coco_vqa: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | val: 15 | url: 16 | # TODO make this order insensitive 17 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_val_eval.json 18 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/answer_list.json 19 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/v2_OpenEnded_mscoco_val2014_questions.json 20 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/v2_mscoco_val2014_annotations.json 21 | storage: 22 | - coco/annotations/vqa_val_eval.json 23 | - coco/annotations/answer_list.json 24 | - coco/annotations/v2_OpenEnded_mscoco_val2014_questions.json 25 | - coco/annotations/v2_mscoco_val2014_annotations.json 26 | images: 27 | storage: coco/images/ 28 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/datasets/conceptual_caption/defaults_12m.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | conceptual_caption_12m: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: 16 | - /export/home/workspace/datasets/cc12m.json 17 | storage: 18 | - conceptual_caption/annotations/cc12m.json 19 | images: 20 | storage: conceptual_caption/images_12m 21 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/datasets/conceptual_caption/defaults_3m.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | conceptual_caption_3m: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: 16 | - /export/home/workspace/datasets/cc3m.json 17 | storage: 18 | - conceptual_caption/annotations/cc3m.json 19 | images: 20 | storage: conceptual_caption/images 21 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/datasets/didemo/defaults_ret.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | didemo_retrieval: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/didemo/retrieval_train.json 16 | storage: didemo/annotations/retrieval_train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/didemo/retrieval_val.json 19 | storage: didemo/annotations/retrieval_val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/didemo/retrieval_test.json 22 | storage: didemo/annotations/retrieval_test.json 23 | videos: 24 | storage: didemo/videos 25 | # storage: /export/share/dongxuli/data/didemo_retrieval/videos 26 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/datasets/flickr30k/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | flickr30k: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images 10 | 11 | build_info: 12 | annotations: 13 | train: 14 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_train.json 15 | storage: flickr30k/annotations/train.json 16 | val: 17 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_val.json 18 | storage: flickr30k/annotations/val.json 19 | test: 20 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_test.json 21 | storage: flickr30k/annotations/test.json 22 | images: 23 | storage: flickr30k/images 24 | # storage: /export/share/datasets/vision/flickr30k 25 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/datasets/gqa/balanced_testdev.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | gqa: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: 16 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/train_balanced_questions.json 17 | storage: 18 | - gqa/annotations/train_balanced_questions.json 19 | val: 20 | url: 21 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/testdev_balanced_questions.json 22 | storage: 23 | - gqa/annotations/testdev_balanced_questions.json 24 | test: 25 | url: 26 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/test_balanced_questions.json 27 | storage: 28 | - gqa/annotations/test_balanced_questions.json 29 | images: 30 | storage: gqa/images/ 31 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/datasets/gqa/balanced_val.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | gqa: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: 16 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/train_balanced_questions.json 17 | storage: 18 | - gqa/annotations/train_balanced_questions.json 19 | val: 20 | url: 21 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/val_balanced_questions.json 22 | storage: 23 | - gqa/annotations/val_balanced_questions.json 24 | test: 25 | url: 26 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/test_balanced_questions.json 27 | storage: 28 | - gqa/annotations/test_balanced_questions.json 29 | images: 30 | storage: gqa/images/ 31 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/datasets/imagenet/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | imagenet: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | splits: ["val"] 14 | images: 15 | storage: /export/share/datasets/vision/imagenet 16 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/datasets/laion/defaults_2B_multi.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | laion2B_multi: 8 | 9 | data_type: images 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | storage: /export/laion/laion2B-multi/part-00000/{00000..01743}.tar 14 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/datasets/msrvtt/defaults_cap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | msrvtt_cap: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_train.json 16 | storage: msrvtt/annotations/cap_train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_val.json 19 | storage: msrvtt/annotations/cap_val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_test.json 22 | storage: msrvtt/annotations/cap_test.json 23 | videos: 24 | storage: msrvtt/videos 25 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/datasets/msrvtt/defaults_qa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | msrvtt_qa: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_train.json 16 | storage: msrvtt/annotations/qa_train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_val.json 19 | storage: msrvtt/annotations/qa_val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_test.json 22 | storage: msrvtt/annotations/qa_test.json 23 | ans2label: 24 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/train_ans2label.json 25 | storage: msrvtt/annotations/qa_ans2label.json 26 | videos: 27 | storage: msrvtt/videos 28 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/datasets/msrvtt/defaults_ret.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | msrvtt_retrieval: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/retrieval_train.json 16 | storage: msrvtt/annotations/retrieval_train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/retrieval_val.json 19 | storage: msrvtt/annotations/retrieval_val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/retrieval_test.json 22 | storage: msrvtt/annotations/retrieval_test.json 23 | videos: 24 | storage: msrvtt/videos 25 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/datasets/msvd/defaults_cap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | msvd_cap: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_train.json 16 | storage: msvd/annotations/cap_train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_val.json 19 | storage: msvd/annotations/cap_val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_test.json 22 | storage: msvd/annotations/cap_test.json 23 | videos: 24 | storage: msvd/videos 25 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/datasets/msvd/defaults_qa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | msvd_qa: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_train.json 16 | storage: msvd/annotations/qa_train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_val.json 19 | storage: msvd/annotations/qa_val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_test.json 22 | storage: msvd/annotations/qa_test.json 23 | ans2label: 24 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/train_ans2label.json 25 | storage: msvd/annotations/qa_ans2label.json 26 | videos: 27 | storage: msvd/videos 28 | 29 | instance_id_key: question_id 30 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/datasets/nlvr/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | nlvr: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/nlvr/nlvr_train.json 16 | storage: nlvr/annotations/train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/nlvr/nlvr_dev.json 19 | storage: nlvr/annotations/dev.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/nlvr/nlvr_dev.json 22 | storage: nlvr/annotations/test.json 23 | images: 24 | storage: /export/share/datasets/vision/NLVR2/ 25 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/datasets/nocaps/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | nocaps: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | val: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/nocaps_val.json 16 | storage: nocaps/annotations/nocaps_val.json 17 | test: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/nocaps_test.json 19 | storage: nocaps/annotations/nocaps_test.json 20 | images: 21 | storage: nocaps/images 22 | # storage: /export/share/datasets/vision/nocaps/ 23 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/datasets/sbu_caption/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | sbu_caption: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: 16 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/sbu/sbu.json 17 | # - /export/share/dongxuli/data/lavis/sbu/annotation/sbu.json 18 | storage: 19 | - sbu_captions/annotations/sbu.json 20 | images: 21 | storage: sbu_captions/images 22 | # storage: /export/share/datasets/vision_language/sbu_resize 23 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/datasets/snli_ve/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | snli_ve: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: /export/share/dongxuli/data/lavis/snli/annotation/ve_train.json 16 | storage: snli/annotations/ve_train.json 17 | val: 18 | url: /export/share/dongxuli/data/lavis/snli/annotation/ve_dev.json 19 | storage: snli/annotations/ve_dev.json 20 | test: 21 | url: /export/share/dongxuli/data/lavis/snli/annotation/ve_test.json 22 | storage: snli/annotations/ve_test.json 23 | images: 24 | storage: flickr30k/images/flickr30k-images 25 | # storage: /export/share/datasets/vision/flickr30k/flickr30k-images 26 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/datasets/vatex/defaults_cap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | msvd_cap: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_train.json 16 | storage: vatex/annotations/cap_train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_val.json 19 | storage: vatex/annotations/cap_val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_private_test.json 22 | storage: vatex/annotations/cap_test.json 23 | videos: 24 | storage: /export/share/dongxuli/data/vatex 25 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/datasets/vg/defaults_caption.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | vg_caption: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/visual_genome/vg_caption.json 16 | storage: vg/annotations/vg_caption.json 17 | images: 18 | storage: vg/images/ 19 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/datasets/vg/defaults_vqa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | vg_vqa: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/visual_genome/vg_qa.json 16 | storage: vg/annotations/vg_qa.json 17 | images: 18 | storage: vg/images/ 19 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/default.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | env: 7 | # For default users 8 | # cache_root: "cache" 9 | # For internal use with persistent storage 10 | cache_root: "/export/home/.cache/lavis" 11 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/albef_classification_ve.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_classification 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_snli_ve_lavis.pt" 11 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 12 | 13 | num_classes: 3 14 | 15 | use_distill: True 16 | momentum: 0.995 17 | alpha: 0.4 18 | 19 | # vit encoder 20 | vit_type: "base" 21 | vit_grad_ckpt: False 22 | vit_ckpt_layer: 0 23 | vit_layer_norm_epsilon: 1e-6 24 | 25 | image_size: 384 26 | 27 | # bert config 28 | med_config_path: "configs/models/med_config_albef.json" 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | eval: 35 | name: "blip_image_eval" 36 | text_processor: 37 | train: 38 | name: "blip_caption" 39 | eval: 40 | name: "blip_caption" 41 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/albef_feature_extractor.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_pretrain 8 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 9 | 10 | # vit encoder 11 | vit_type: "base" 12 | image_size: 224 13 | vit_ckpt_layer: 0 14 | vit_drop_path_rate: 0 15 | vit_layer_norm_epsilon: 1e-6 16 | vit_grad_ckpt: False 17 | 18 | # bert config 19 | med_config_path: "configs/models/med_config_albef.json" 20 | 21 | embed_dim: 256 22 | 23 | preprocess: 24 | vis_processor: 25 | eval: 26 | name: "blip_image_eval" 27 | image_size: 224 28 | text_processor: 29 | eval: 30 | name: "blip_caption" 31 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/albef_nlvr.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_nlvr 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/pretrain_model_nlvr.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_nlvr_lavis.pt" 12 | 13 | num_classes: 2 14 | 15 | use_distill: True 16 | momentum: 0.995 17 | alpha: 0.4 18 | 19 | # vit encoder 20 | vit_type: "base" 21 | vit_grad_ckpt: False 22 | vit_ckpt_layer: 0 23 | vit_layer_norm_epsilon: 1e-6 24 | 25 | image_size: 384 26 | 27 | # bert config 28 | med_config_path: "configs/models/med_config_albef.json" 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 384 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 384 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/albef_pretrain_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_pretrain 8 | 9 | load_pretrained: True 10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 11 | 12 | # vit encoder 13 | vit_type: "base" 14 | image_size: 224 15 | vit_ckpt_layer: 0 16 | vit_drop_path_rate: 0 17 | vit_layer_norm_epsilon: 1e-6 18 | vit_grad_ckpt: False 19 | 20 | # bert config 21 | med_config_path: "configs/models/med_config_albef.json" 22 | mlm_mask_prob: 0.15 23 | 24 | embed_dim: 256 25 | momentum: 0.995 26 | alpha: 0.4 27 | temp: 0.07 28 | 29 | max_txt_len: 30 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip_image_train" 35 | image_size: 256 36 | text_processor: 37 | train: 38 | name: "blip_caption" 39 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/albef_retrieval_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_retrieval 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_coco_retrieval_lavis.pt" 12 | 13 | queue_size: 65536 14 | 15 | # vit encoder 16 | vit_type: "base" 17 | image_size: 384 18 | vit_ckpt_layer: 0 19 | vit_drop_path_rate: 0 20 | vit_layer_norm_epsilon: 1e-6 21 | vit_grad_ckpt: False 22 | 23 | # bert config 24 | med_config_path: "configs/models/med_config_albef.json" 25 | 26 | embed_dim: 256 27 | momentum: 0.995 28 | alpha: 0.4 29 | temp: 0.07 30 | use_distill: True 31 | 32 | max_txt_len: 30 33 | 34 | preprocess: 35 | vis_processor: 36 | train: 37 | name: "blip_image_train" 38 | image_size: 384 39 | eval: 40 | name: "blip_image_eval" 41 | image_size: 384 42 | text_processor: 43 | train: 44 | name: "blip_caption" 45 | eval: 46 | name: "blip_caption" 47 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/albef_retrieval_flickr.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_retrieval 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 11 | finetuned: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_flickr_retrieval_lavis.pt 12 | 13 | queue_size: 65536 14 | 15 | # vit encoder 16 | vit_type: "base" 17 | image_size: 384 18 | vit_ckpt_layer: 0 19 | vit_drop_path_rate: 0 20 | vit_layer_norm_epsilon: 1e-6 21 | vit_grad_ckpt: False 22 | 23 | # bert config 24 | med_config_path: "configs/models/med_config_albef.json" 25 | 26 | embed_dim: 256 27 | momentum: 0.995 28 | alpha: 0.4 29 | temp: 0.07 30 | use_distill: True 31 | 32 | max_txt_len: 30 33 | 34 | preprocess: 35 | vis_processor: 36 | train: 37 | name: "blip_image_train" 38 | image_size: 384 39 | eval: 40 | name: "blip_image_eval" 41 | image_size: 384 42 | text_processor: 43 | train: 44 | name: "blip_caption" 45 | eval: 46 | name: "blip_caption" 47 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/albef_vqav2.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_vqa 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_vqav2_lavis.pt" 12 | 13 | use_distill: True 14 | momentum: 0.995 15 | alpha: 0.4 16 | 17 | # vit encoder 18 | vit_type: "base" 19 | vit_grad_ckpt: False 20 | vit_ckpt_layer: 0 21 | vit_layer_norm_epsilon: 1e-6 22 | 23 | image_size: 384 24 | 25 | # bert config 26 | med_config_path: "configs/models/med_config_albef.json" 27 | 28 | preprocess: 29 | vis_processor: 30 | train: 31 | name: "blip_image_train" 32 | image_size: 384 33 | eval: 34 | name: "blip_image_eval" 35 | image_size: 384 36 | text_processor: 37 | train: 38 | name: "blip_question" 39 | eval: 40 | name: "blip_question" 41 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/alpro_qa_msrvtt.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_qa 8 | num_classes: 1500 9 | 10 | load_finetuned: True 11 | 12 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_msrvtt_qa.pth" 13 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" 14 | 15 | timesformer: 16 | n_frms: 16 17 | image_size: 224 18 | 19 | patch_size: 16 20 | attn_drop_rate: 0. 21 | drop_rate: 0. 22 | drop_path_rate: 0.1 23 | 24 | use_grad_ckpt: True 25 | ckpt_layer: 12 26 | 27 | # bert config 28 | med_config_path: "configs/models/bert_config_alpro.json" 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "alpro_video_train" 34 | n_frms: 16 35 | image_size: 224 36 | eval: 37 | name: "alpro_video_eval" 38 | n_frms: 16 39 | image_size: 224 40 | text_processor: 41 | train: 42 | name: "blip_caption" 43 | eval: 44 | name: "blip_caption" 45 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/alpro_qa_msvd.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_qa 8 | num_classes: 2423 9 | 10 | load_finetuned: True 11 | 12 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_msvd_qa.pth" 13 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" 14 | 15 | timesformer: 16 | n_frms: 16 17 | image_size: 224 18 | 19 | patch_size: 16 20 | attn_drop_rate: 0. 21 | drop_rate: 0. 
22 | drop_path_rate: 0.1 23 | use_grad_ckpt: True 24 | ckpt_layer: 12 25 | 26 | # bert config 27 | med_config_path: "configs/models/bert_config_alpro.json" 28 | 29 | preprocess: 30 | vis_processor: 31 | train: 32 | name: "alpro_video_train" 33 | n_frms: 16 34 | image_size: 224 35 | eval: 36 | name: "alpro_video_eval" 37 | n_frms: 16 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" 44 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/alpro_retrieval_didemo.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_retrieval 8 | 9 | load_finetuned: True 10 | 11 | finetuned: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_didemo_retrieval.pt 12 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" 13 | 14 | timesformer: 15 | n_frms: 8 16 | image_size: 224 17 | 18 | patch_size: 16 19 | attn_drop_rate: 0. 20 | drop_rate: 0. 21 | drop_path_rate: 0.1 22 | use_grad_ckpt: False 23 | 24 | # bert config 25 | med_config_path: "configs/models/bert_config_alpro.json" 26 | 27 | preprocess: 28 | vis_processor: 29 | eval: 30 | name: "alpro_video_eval" 31 | n_frms: 8 32 | image_size: 224 33 | text_processor: 34 | eval: 35 | name: "blip_caption" 36 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/alpro_retrieval_msrvtt.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_retrieval 8 | 9 | load_finetuned: True 10 | 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_msrvtt_retrieval.pt" 12 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" 13 | 14 | timesformer: 15 | n_frms: 8 16 | image_size: 224 17 | 18 | patch_size: 16 19 | attn_drop_rate: 0. 20 | drop_rate: 0. 
21 | drop_path_rate: 0.1 22 | use_grad_ckpt: False 23 | 24 | # bert config 25 | med_config_path: "configs/models/bert_config_alpro.json" 26 | 27 | preprocess: 28 | vis_processor: 29 | train: 30 | name: "alpro_video_train" 31 | n_frms: 8 32 | image_size: 224 33 | eval: 34 | name: "alpro_video_eval" 35 | n_frms: 8 36 | image_size: 224 37 | text_processor: 38 | train: 39 | name: "blip_caption" 40 | eval: 41 | name: "blip_caption" 42 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/bert_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": false, 18 | "vocab_size": 30522, 19 | "encoder_width": 768, 20 | "add_cross_attention": true 21 | } -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/bert_config_alpro.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": true, 18 | "type_vocab_size": 2, 19 | "vocab_size": 30522, 20 | "encoder_width": 768, 21 | "add_cross_attention": false, 22 | "fusion_layer": 6 23 | } -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
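# BLIP-2 image-captioning config pairing the Q-Former with FlanT5-XL.
# `pretrained` is the BLIP-2 checkpoint pretrained with FlanT5-XL and
# `finetuned` the COCO-caption checkpoint that load_finetuned: True pulls in;
# the ViT runs at 364 px in fp32 and is not frozen, and generation is primed
# with the prompt "a photo of".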
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: caption_coco_flant5xl 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xl.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_caption_flant5xl.pth" 12 | 13 | # vit encoder 14 | image_size: 364 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp32" 18 | freeze_vit: False 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # T5 24 | t5_model: "google/flan-t5-xl" 25 | 26 | # generation configs 27 | prompt: "a photo of" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 364 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 364 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: caption_coco_opt2.7b 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt2.7b.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_caption_opt2.7b.pth" 12 | 13 | # vit encoder 14 | image_size: 364 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp32" 18 | freeze_vit: False 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # OPT 24 | opt_model: "facebook/opt-2.7b" 25 | 26 | # generation configs 27 | prompt: "a photo of" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 364 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 364 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: caption_coco_opt6.7b 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt6.7b.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_caption_opt6.7b.pth" 12 | 13 | # vit encoder 14 | image_size: 364 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp32" 18 | freeze_vit: False 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # OPT 24 | opt_model: "facebook/opt-6.7b" 25 | 26 | # generation configs 27 | prompt: "a photo of" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 364 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 364 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: coco 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_finetune_coco.pth" 12 | 13 | # vit encoder 14 | image_size: 364 15 | drop_path_rate: 0 16 | use_grad_checkpoint: True 17 | vit_precision: "fp32" 18 | freeze_vit: False 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 364 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 364 32 | text_processor: 33 | train: 34 | name: "blip_caption" 35 | eval: 36 | name: "blip_caption" 37 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
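# InstructBLIP config (FlanT5-XL): load_pretrained fetches the trimmed
# instruction-tuned checkpoint while load_finetuned stays False, the ViT is
# frozen and run in fp16 at 224 px, and the default prompt is empty since the
# instruction is normally passed in at inference time. Illustrative call,
# assuming the usual LAVIS generate() interface rather than anything defined in
# this file (raw_image and device are placeholders):
#   image = vis_processors["eval"](raw_image).unsqueeze(0).to(device)
#   out = model.generate({"image": image, "prompt": "Describe the image."})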
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: flant5xl 8 | load_finetuned: False 9 | load_pretrained: True 10 | 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_flanxl_trimmed.pth" 12 | finetuned: "" 13 | 14 | # vit encoder 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # T5 25 | t5_model: "google/flan-t5-xl" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: flant5xxl 8 | load_finetuned: False 9 | load_pretrained: True 10 | 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_flanxxl_trimmed.pth" 12 | finetuned: "" 13 | 14 | # vit encoder 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # T5 25 | t5_model: "google/flan-t5-xxl" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
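# InstructBLIP config (Vicuna-13B). `llm_model` is a Hugging Face model id
# (lmsys/vicuna-13b-v1.1) that is resolved separately from the trimmed
# Q-Former checkpoint in `pretrained`; note the training vis_processor switches
# to "blip2_image_train" while eval keeps "blip_image_eval".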
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: instruct_vicuna13b 8 | load_finetuned: False 9 | load_pretrained: True 10 | 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_vicuna13b_trimmed.pth" 12 | finetuned: "" 13 | 14 | # vit encoder 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # path to Vicuna checkpoint 25 | llm_model: "lmsys/vicuna-13b-v1.1" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip2_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: instruct_vicuna7b 8 | load_finetuned: False 9 | load_pretrained: True 10 | 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_vicuna7b_trimmed.pth" 12 | finetuned: "" 13 | 14 | # vit encoder 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # path to Vicuna checkpoint 25 | llm_model: "lmsys/vicuna-7b-v1.1" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip2_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
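# Base BLIP-2 pretraining config (presumably the Q-Former-only, stage-1 setup):
# only the frozen fp16 ViT and 32 query tokens are configured, there is no
# language-model section, and `finetuned` is left empty with
# load_finetuned: False.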
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 224 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 224 32 | text_processor: 33 | train: 34 | name: "blip_caption" 35 | eval: 36 | name: "blip_caption" 37 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_flant5xl 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xl.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # T5 24 | t5_model: "google/flan-t5-xl" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
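# Variant of blip2_pretrain_flant5xl.yaml whose `pretrained` field points at a
# local run directory (lavis/output/BLIP2/.../checkpoint_80000.pth) instead of a
# released URL, so it only resolves if that experiment's checkpoint exists on
# disk; the remaining keys are unchanged.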
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_flant5xl 8 | load_finetuned: False 9 | 10 | pretrained: "lavis/output/BLIP2/Pretrain_stage2_flant5_xl_batch_80_no_prefix_iter_100000/20231015004/checkpoint_80000.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # T5 24 | t5_model: "google/flan-t5-xl" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_flant5xl 8 | load_finetuned: False 9 | 10 | pretrained: "lavis/output/BLIP2/Pretrain_stage2_flant5_xl_batch_80_prefix_iter_100000/20231015004/checkpoint_80000.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # T5 24 | t5_model: "google/flan-t5-xl" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_flant5xl 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xl_vitL.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | vit_model: "clip_L" 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # T5 25 | t5_model: "google/flan-t5-xl" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" 44 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_flant5xxl 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xxl.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # T5 24 | t5_model: "google/flan-t5-xxl" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
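# BLIP-2 pretraining config with an OPT-2.7B decoder (opt_model:
# "facebook/opt-2.7b"): frozen fp16 ViT at 224 px, 32 query tokens, an empty
# generation prompt, and no task-specific `finetuned` checkpoint.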
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_opt2.7b 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt2.7b.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # OPT 24 | opt_model: "facebook/opt-2.7b" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_opt6.7b 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt6.7b.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # OPT 24 | opt_model: "facebook/opt-6.7b" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
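# ViT-L variant of the base BLIP-2 pretraining config: vit_model "clip_L"
# appears to select a CLIP ViT-L visual backbone in place of the default
# encoder, and `pretrained` points at the matching blip2_pretrained_vitL.pth
# checkpoint.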
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_vitL.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | vit_model: "clip_L" 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | 25 | preprocess: 26 | vis_processor: 27 | train: 28 | name: "blip_image_train" 29 | image_size: 224 30 | eval: 31 | name: "blip_image_eval" 32 | image_size: 224 33 | text_processor: 34 | train: 35 | name: "blip_caption" 36 | eval: 37 | name: "blip_caption" 38 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip2_vicuna13b 8 | load_finetuned: False 9 | load_pretrained: True 10 | 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_vicuna13b.pth" 12 | finetuned: "" 13 | 14 | # vit encoder 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # path to Vicuna checkpoint 25 | llm_model: "lmsys/vicuna-13b-v1.1" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip2_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
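# BLIP-2 + Vicuna-7B config, the non-instruct counterpart of
# blip2_instruct_vicuna7b.yaml: same frozen fp16 ViT, 32 query tokens and empty
# prompt, but arch blip2_vicuna7b with the blip2_pretrained_vicuna7b.pth
# checkpoint.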
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip2_vicuna7b 8 | load_finetuned: False 9 | load_pretrained: True 10 | 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_vicuna7b.pth" 12 | finetuned: "" 13 | 14 | # vit encoder 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # path to Vicuna checkpoint 25 | llm_model: "lmsys/vicuna-7b-v1.1" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip2_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/blip_caption_base_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_coco_caption_base.pth" 12 | 13 | # vit encoder 14 | vit_type: "base" 15 | vit_grad_ckpt: False 16 | vit_ckpt_layer: 0 17 | 18 | image_size: 384 19 | 20 | # bert config 21 | med_config_path: "configs/models/med_config.json" 22 | 23 | # generation configs 24 | prompt: "a picture of " 25 | 26 | 27 | preprocess: 28 | vis_processor: 29 | train: 30 | name: "blip_image_train" 31 | eval: 32 | name: "blip_image_eval" 33 | text_processor: 34 | train: 35 | name: "blip_caption" 36 | prompt: "a picture of " 37 | eval: 38 | name: "blip_caption" 39 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/blip_caption_large_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
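# BLIP (v1) captioning config with the large ViT: gradient checkpointing from
# layer 5, 384 px inputs, the med_large BERT config, the caption prompt
# "a picture of ", and the COCO-caption checkpoint (model_large_caption.pth) as
# the finetuned weights.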
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption.pth" 12 | 13 | vit_type: "large" 14 | vit_grad_ckpt: True 15 | vit_ckpt_layer: 5 16 | 17 | image_size: 384 18 | 19 | # bert config 20 | med_config_path: "configs/models/med_large_config.json" 21 | 22 | # generation configs 23 | prompt: "a picture of " 24 | 25 | 26 | preprocess: 27 | vis_processor: 28 | train: 29 | name: "blip_image_train" 30 | eval: 31 | name: "blip_image_eval" 32 | text_processor: 33 | train: 34 | name: "blip_caption" 35 | prompt: "a picture of " 36 | eval: 37 | name: "blip_caption" 38 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/blip_classification_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_classification 8 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth" 9 | 10 | use_distill: True 11 | momentum: 0.995 12 | alpha: 0.4 13 | 14 | # vit encoder 15 | vit_type: "base" 16 | vit_grad_ckpt: False 17 | vit_ckpt_layer: 0 18 | 19 | image_size: 384 20 | 21 | # bert config 22 | med_config_path: "configs/models/med_config.json" 23 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/blip_feature_extractor_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_pretrain 8 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 9 | 10 | # vit encoder 11 | vit_type: "base" 12 | vit_grad_ckpt: False 13 | vit_ckpt_layer: 0 14 | 15 | image_size: 224 16 | 17 | # bert config 18 | med_config_path: "configs/models/med_config.json" 19 | 20 | embed_dim: 256 21 | 22 | preprocess: 23 | vis_processor: 24 | eval: 25 | name: "blip_image_eval" 26 | image_size: 224 27 | text_processor: 28 | eval: 29 | name: "blip_caption" 30 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/blip_itm_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
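# BLIP image-text matching config (base ViT, 384 px): it loads the COCO
# retrieval checkpoint and a 256-d shared embedding space, and only eval
# processors are defined, which suggests it is meant for scoring image-text
# pairs rather than training.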
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_image_text_matching 8 | 9 | load_finetuned: True 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_retrieval_coco.pth" 11 | 12 | # vit encoder 13 | vit_type: "base" 14 | vit_grad_ckpt: False 15 | vit_ckpt_layer: 0 16 | 17 | image_size: 384 18 | 19 | # bert config 20 | med_config_path: "configs/models/med_config.json" 21 | 22 | embed_dim: 256 23 | 24 | preprocess: 25 | vis_processor: 26 | eval: 27 | name: "blip_image_eval" 28 | image_size: 384 29 | text_processor: 30 | eval: 31 | name: "blip_caption" 32 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/blip_itm_large.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_image_text_matching 8 | 9 | load_finetuned: True 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_retrieval_coco.pth" 11 | 12 | # vit encoder 13 | vit_type: "large" 14 | vit_grad_ckpt: False 15 | vit_ckpt_layer: 0 16 | 17 | image_size: 384 18 | 19 | # bert config 20 | med_config_path: "configs/models/med_large_config.json" 21 | 22 | embed_dim: 256 23 | 24 | preprocess: 25 | vis_processor: 26 | eval: 27 | name: "blip_image_eval" 28 | image_size: 384 29 | text_processor: 30 | eval: 31 | name: "blip_caption" 32 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/blip_nlvr.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_nlvr 8 | model_type: nlvr 9 | load_finetuned: True 10 | 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_nlvr.pth" 12 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 13 | 14 | num_classes: 2 15 | 16 | # vit encoder 17 | vit_type: "base" 18 | vit_grad_ckpt: False 19 | vit_ckpt_layer: 0 20 | vit_layer_norm_epsilon: 1e-6 21 | 22 | image_size: 384 23 | 24 | # bert config 25 | med_config_path: "configs/models/med_config.json" 26 | 27 | preprocess: 28 | vis_processor: 29 | train: 30 | name: "blip_image_train" 31 | image_size: 384 32 | eval: 33 | name: "blip_image_eval" 34 | image_size: 384 35 | text_processor: 36 | train: 37 | name: "blip_caption" 38 | eval: 39 | name: "blip_caption" 40 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/blip_pretrain_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
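# BLIP pretraining config (base ViT, 224 px): the text side uses
# bert_config.json rather than med_config.json, with embed_dim 256, alpha 0.4
# and the caption prompt "a picture of "; only training processors are listed,
# which suggests an eval pipeline is not expected here.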
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_pretrain 8 | 9 | load_pretrained: True 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 11 | 12 | # vit encoder 13 | vit_type: "base" 14 | vit_grad_ckpt: False 15 | vit_ckpt_layer: 0 16 | 17 | image_size: 224 18 | alpha: 0.4 19 | 20 | # bert config 21 | med_config_path: "configs/models/bert_config.json" 22 | 23 | embed_dim: 256 24 | 25 | # generation configs 26 | prompt: "a picture of " 27 | 28 | preprocess: 29 | vis_processor: 30 | train: 31 | name: "blip_image_train" 32 | image_size: 224 33 | text_processor: 34 | train: 35 | name: "blip_caption" 36 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/blip_pretrain_large.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_pretrain 8 | 9 | # vit encoder 10 | vit_type: "large" 11 | vit_grad_ckpt: True 12 | vit_ckpt_layer: 5 13 | 14 | image_size: 224 15 | 16 | # bert config 17 | med_config_path: "configs/models/med_large_config.json" 18 | 19 | embed_dim: 256 20 | 21 | # generation configs 22 | prompt: "a picture of " 23 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/blip_retrieval_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_retrieval 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_coco_retrieval.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 12 | 13 | queue_size: 57600 14 | 15 | # vit encoder 16 | vit_type: "base" 17 | vit_grad_ckpt: True 18 | vit_ckpt_layer: 4 19 | 20 | image_size: 384 21 | 22 | # bert config 23 | med_config_path: "configs/models/med_config.json" 24 | 25 | embed_dim: 256 26 | 27 | preprocess: 28 | vis_processor: 29 | train: 30 | name: "blip_image_train" 31 | image_size: 384 32 | eval: 33 | name: "blip_image_eval" 34 | image_size: 384 35 | text_processor: 36 | train: 37 | name: "blip_caption" 38 | eval: 39 | name: "blip_caption" 40 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/blip_retrieval_flickr.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
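# BLIP retrieval config for Flickr: a 57,600-slot queue for contrastive
# negatives, alpha 0.4, negative_all_rank disabled, and ViT gradient
# checkpointing from layer 4 at 384 px, on top of the Flickr retrieval
# checkpoint.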
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_retrieval 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_flickr_retrieval.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 12 | 13 | queue_size: 57600 14 | alpha: 0.4 15 | 16 | negative_all_rank: False 17 | 18 | # vit encoder 19 | vit_type: "base" 20 | vit_grad_ckpt: True 21 | vit_ckpt_layer: 4 22 | 23 | image_size: 384 24 | 25 | # bert config 26 | med_config_path: "configs/models/med_config.json" 27 | 28 | embed_dim: 256 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 384 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 384 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/blip_vqa_aokvqa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_aokvqa.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth" 12 | 13 | # vit encoder 14 | vit_type: "base" 15 | vit_grad_ckpt: False 16 | vit_ckpt_layer: 0 17 | vit_drop_path_rate: 0.1 18 | 19 | image_size: 480 20 | 21 | # bert config 22 | med_config_path: "configs/models/med_config.json" 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 480 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 480 32 | text_processor: 33 | train: 34 | name: "blip_question" 35 | eval: 36 | name: "blip_question" 37 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/blip_vqa_okvqa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
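# BLIP VQA config for OK-VQA: arch blip_vqa with 480 px inputs and the
# "blip_question" text processor, identical in structure to the A-OKVQA config
# above except that `finetuned` points at blip_okvqa.pth.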
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_okvqa.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth" 12 | 13 | # vit encoder 14 | vit_type: "base" 15 | vit_grad_ckpt: False 16 | vit_ckpt_layer: 0 17 | vit_drop_path_rate: 0.1 18 | 19 | image_size: 480 20 | 21 | # bert config 22 | med_config_path: "configs/models/med_config.json" 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 480 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 480 32 | text_processor: 33 | train: 34 | name: "blip_question" 35 | eval: 36 | name: "blip_question" 37 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/blip_vqav2.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 12 | 13 | # vit encoder 14 | vit_type: "base" 15 | vit_grad_ckpt: False 16 | vit_ckpt_layer: 0 17 | vit_drop_path_rate: 0.1 18 | 19 | image_size: 480 20 | 21 | # bert config 22 | med_config_path: "configs/models/med_config.json" 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 480 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 480 32 | text_processor: 33 | train: 34 | name: "blip_question" 35 | eval: 36 | name: "blip_question" 37 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/clip/RN101-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": [ 7 | 3, 8 | 4, 9 | 23, 10 | 3 11 | ], 12 | "width": 64, 13 | "patch_size": null 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 512, 19 | "heads": 8, 20 | "layers": 12 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/clip/RN101.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": [ 6 | 3, 7 | 4, 8 | 23, 9 | 3 10 | ], 11 | "width": 64, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 512, 18 | "heads": 8, 19 | "layers": 12 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/clip/RN50-quickgelu.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": [ 7 | 3, 8 | 4, 9 | 6, 10 | 3 11 | ], 12 | "width": 64, 13 | "patch_size": null 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 512, 19 | "heads": 8, 20 | "layers": 12 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/clip/RN50.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": [ 6 | 3, 7 | 4, 8 | 6, 9 | 3 10 | ], 11 | "width": 64, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 512, 18 | "heads": 8, 19 | "layers": 12 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/clip/RN50x16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 384, 5 | "layers": [ 6 | 6, 7 | 8, 8 | 18, 9 | 8 10 | ], 11 | "width": 96, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 768, 18 | "heads": 12, 19 | "layers": 12 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/clip/RN50x4.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 288, 5 | "layers": [ 6 | 4, 7 | 6, 8 | 10, 9 | 6 10 | ], 11 | "width": 80, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 640, 18 | "heads": 10, 19 | "layers": 12 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/clip/ViT-B-16-plus-240.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 240, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/clip/ViT-B-16-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/clip/ViT-B-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | 
"context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/clip/ViT-B-32-plus-256.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 256, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/clip/ViT-B-32-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": 12, 7 | "width": 768, 8 | "patch_size": 32 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/clip/ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/clip/ViT-H-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 14 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/clip/ViT-H-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 16 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/clip/ViT-L-14-280.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 280, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/clip/ViT-L-14-336.json: 
--------------------------------------------------------------------------------
{
    "embed_dim": 768,
    "vision_cfg": {
        "image_size": 336,
        "layers": 24,
        "width": 1024,
        "patch_size": 14
    },
    "text_cfg": {
        "context_length": 77,
        "vocab_size": 49408,
        "width": 768,
        "heads": 12,
        "layers": 12
    }
}
--------------------------------------------------------------------------------
/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/ViT-L-14.json:
--------------------------------------------------------------------------------
{
    "embed_dim": 768,
    "vision_cfg": {
        "image_size": 224,
        "layers": 24,
        "width": 1024,
        "patch_size": 14
    },
    "text_cfg": {
        "context_length": 77,
        "vocab_size": 49408,
        "width": 768,
        "heads": 12,
        "layers": 12
    }
}
--------------------------------------------------------------------------------
/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/ViT-L-16-320.json:
--------------------------------------------------------------------------------
{
    "embed_dim": 768,
    "vision_cfg": {
        "image_size": 320,
        "layers": 24,
        "width": 1024,
        "patch_size": 16
    },
    "text_cfg": {
        "context_length": 77,
        "vocab_size": 49408,
        "width": 768,
        "heads": 12,
        "layers": 12
    }
}
--------------------------------------------------------------------------------
/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/ViT-L-16.json:
--------------------------------------------------------------------------------
{
    "embed_dim": 768,
    "vision_cfg": {
        "image_size": 224,
        "layers": 24,
        "width": 1024,
        "patch_size": 16
    },
    "text_cfg": {
        "context_length": 77,
        "vocab_size": 49408,
        "width": 768,
        "heads": 12,
        "layers": 12
    }
}
--------------------------------------------------------------------------------
/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/ViT-g-14.json:
--------------------------------------------------------------------------------
{
    "embed_dim": 1024,
    "vision_cfg": {
        "image_size": 224,
        "layers": 40,
        "width": 1408,
        "head_width": 88,
        "mlp_ratio": 4.3637,
        "patch_size": 14
    },
    "text_cfg": {
        "context_length": 77,
        "vocab_size": 49408,
        "width": 1024,
        "heads": 16,
        "layers": 24
    }
}
--------------------------------------------------------------------------------
/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/timm-efficientnetv2_rw_s.json:
--------------------------------------------------------------------------------
{
    "embed_dim": 768,
    "vision_cfg": {
        "timm_model_name": "efficientnetv2_rw_s",
        "timm_model_pretrained": false,
        "timm_pool": "abs_attn",
        "timm_proj": "",
        "image_size": 288
    },
    "text_cfg": {
        "context_length": 77,
        "vocab_size": 49408,
        "width": 768,
        "heads": 8,
        "layers": 12
    }
}
--------------------------------------------------------------------------------
/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/timm-resnet50d.json:
--------------------------------------------------------------------------------
{
    "embed_dim": 1024,
    "vision_cfg": {
        "timm_model_name": "resnet50d",
        "timm_model_pretrained": false,
        "timm_pool": "abs_attn",
        "timm_proj": "",
        "image_size": 224
    },
    "text_cfg": {
        "context_length": 77,
        "vocab_size": 49408,
        "width": 512,
        "heads": 8,
        "layers": 12
    }
}
--------------------------------------------------------------------------------
/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/timm-resnetaa50d.json:
--------------------------------------------------------------------------------
{
    "embed_dim": 1024,
    "vision_cfg": {
        "timm_model_name": "resnetaa50d",
        "timm_model_pretrained": false,
        "timm_pool": "abs_attn",
        "timm_proj": "",
        "image_size": 224
    },
    "text_cfg": {
        "context_length": 77,
        "vocab_size": 49408,
        "width": 512,
        "heads": 8,
        "layers": 12
    }
}
--------------------------------------------------------------------------------
/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/timm-resnetblur50.json:
--------------------------------------------------------------------------------
{
    "embed_dim": 1024,
    "vision_cfg": {
        "timm_model_name": "resnetblur50",
        "timm_model_pretrained": false,
        "timm_pool": "abs_attn",
        "timm_proj": "",
        "image_size": 224
    },
    "text_cfg": {
        "context_length": 77,
        "vocab_size": 49408,
        "width": 512,
        "heads": 8,
        "layers": 12
    }
}
--------------------------------------------------------------------------------
/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/timm-swin_base_patch4_window7_224.json:
--------------------------------------------------------------------------------
{
    "embed_dim": 512,
    "vision_cfg": {
        "timm_model_name": "swin_base_patch4_window7_224",
        "timm_model_pretrained": false,
        "timm_pool": "",
        "timm_proj": "linear",
        "image_size": 224
    },
    "text_cfg": {
        "context_length": 77,
        "vocab_size": 49408,
        "width": 512,
        "heads": 8,
        "layers": 12
    }
}
--------------------------------------------------------------------------------
/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/timm-vit_base_patch16_224.json:
--------------------------------------------------------------------------------
{
    "embed_dim": 512,
    "vision_cfg": {
        "timm_model_name": "vit_base_patch16_224",
        "timm_model_pretrained": false,
        "timm_pool": "",
        "timm_proj": "linear",
        "image_size": 224
    },
    "text_cfg": {
        "context_length": 77,
        "vocab_size": 49408,
        "width": 512,
        "heads": 8,
        "layers": 12
    }
}
--------------------------------------------------------------------------------
/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/timm-vit_base_patch32_224.json:
--------------------------------------------------------------------------------
{
    "embed_dim": 512,
    "vision_cfg": {
        "timm_model_name": "vit_base_patch32_224",
        "timm_model_pretrained": false,
        "timm_pool": "",
        "timm_proj": "linear",
        "image_size": 224
    },
    "text_cfg": {
        "context_length": 77,
        "vocab_size": 49408,
        "width": 512,
        "heads": 8,
        "layers": 12
    }
}
--------------------------------------------------------------------------------
/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/timm-vit_small_patch16_224.json:
--------------------------------------------------------------------------------
{
    "embed_dim": 512,
    "vision_cfg": {
        "timm_model_name": "vit_small_patch16_224",
"timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/clip_resnet50.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: RN50 10 | 11 | pretrained: openai 12 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/clip_vit_base16.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-B-16 10 | 11 | pretrained: openai 12 | 13 | preprocess: 14 | vis_processor: 15 | eval: 16 | name: "clip_image_eval" 17 | image_size: 224 18 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/clip_vit_base32.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-B-32 10 | # ['RN50', 11 | # 'RN50-quickgelu', 12 | # 'RN50x4', 13 | # 'RN50x16', 14 | # 'RN101', 15 | # 'RN101-quickgelu', 16 | # 'timm-efficientnetv2_rw_s', 17 | # 'timm-resnet50d', 18 | # 'timm-resnetaa50d', 19 | # 'timm-resnetblur50', 20 | # 'timm-swin_base_patch4_window7_224', 21 | # 'timm-vit_base_patch16_224', 22 | # 'timm-vit_base_patch32_224', 23 | # 'timm-vit_small_patch16_224', 24 | # 'ViT-B-16', 25 | # 'ViT-B-16-plus', 26 | # 'ViT-B-16-plus-240', 27 | # 'ViT-B-32', 28 | # 'ViT-B-32-plus-256', 29 | # 'ViT-B-32-quickgelu', 30 | # 'ViT-g-14', 31 | # 'ViT-H-14', 32 | # 'ViT-H-16', 33 | # 'ViT-L-14', 34 | # 'ViT-L-14-280', 35 | # 'ViT-L-14-336', 36 | # 'ViT-L-16', 37 | # 'ViT-L-16-320'] 38 | 39 | pretrained: openai 40 | # "openai" 41 | # following not available for all models 42 | # "yfcc15m" 43 | # "cc12m" 44 | # "laion400m_e31" 45 | # "laion400m_e32" 46 | # "laion400m_avg" 47 | 48 | preprocess: 49 | vis_processor: 50 | eval: 51 | name: "clip_image_eval" 52 | image_size: 224 53 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/clip_vit_large14.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-L-14 10 | # ['RN50', 11 | # 'RN50-quickgelu', 12 | # 'RN50x4', 13 | # 'RN50x16', 14 | # 'RN101', 15 | # 'RN101-quickgelu', 16 | # 'timm-efficientnetv2_rw_s', 17 | # 'timm-resnet50d', 18 | # 'timm-resnetaa50d', 19 | # 'timm-resnetblur50', 20 | # 'timm-swin_base_patch4_window7_224', 21 | # 'timm-vit_base_patch16_224', 22 | # 'timm-vit_base_patch32_224', 23 | # 'timm-vit_small_patch16_224', 24 | # 'ViT-B-16', 25 | # 'ViT-B-16-plus', 26 | # 'ViT-B-16-plus-240', 27 | # 'ViT-B-32', 28 | # 'ViT-B-32-plus-256', 29 | # 'ViT-B-32-quickgelu', 30 | # 'ViT-g-14', 31 | # 'ViT-H-14', 32 | # 'ViT-H-16', 33 | # 'ViT-L-14', 34 | # 'ViT-L-14-280', 35 | # 'ViT-L-14-336', 36 | # 'ViT-L-16', 37 | # 'ViT-L-16-320'] 38 | 39 | pretrained: openai 40 | # "openai" 41 | # following not available for all models 42 | # "yfcc15m" 43 | # "cc12m" 44 | # "laion400m_e31" 45 | # "laion400m_e32" 46 | # "laion400m_avg" 47 | 48 | preprocess: 49 | vis_processor: 50 | eval: 51 | name: "clip_image_eval" 52 | image_size: 224 53 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/clip_vit_large14_336.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-L-14-336 10 | # ['RN50', 11 | # 'RN50-quickgelu', 12 | # 'RN50x4', 13 | # 'RN50x16', 14 | # 'RN101', 15 | # 'RN101-quickgelu', 16 | # 'timm-efficientnetv2_rw_s', 17 | # 'timm-resnet50d', 18 | # 'timm-resnetaa50d', 19 | # 'timm-resnetblur50', 20 | # 'timm-swin_base_patch4_window7_224', 21 | # 'timm-vit_base_patch16_224', 22 | # 'timm-vit_base_patch32_224', 23 | # 'timm-vit_small_patch16_224', 24 | # 'ViT-B-16', 25 | # 'ViT-B-16-plus', 26 | # 'ViT-B-16-plus-240', 27 | # 'ViT-B-32', 28 | # 'ViT-B-32-plus-256', 29 | # 'ViT-B-32-quickgelu', 30 | # 'ViT-g-14', 31 | # 'ViT-H-14', 32 | # 'ViT-H-16', 33 | # 'ViT-L-14', 34 | # 'ViT-L-14-280', 35 | # 'ViT-L-14-336', 36 | # 'ViT-L-16', 37 | # 'ViT-L-16-320'] 38 | 39 | pretrained: openai 40 | # "openai" 41 | # following not available for all models 42 | # "yfcc15m" 43 | # "cc12m" 44 | # "laion400m_e31" 45 | # "laion400m_e32" 46 | # "laion400m_avg" 47 | 48 | preprocess: 49 | vis_processor: 50 | eval: 51 | name: "clip_image_eval" 52 | image_size: 336 53 | -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/gpt_dialogue_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: gpt_dialogue 8 | # pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth" 9 | # pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 10 | 11 | len_tokenizer: 50264 # 50257 tokens from gpt2 default tokenizer + additional special tokens 12 | 13 | len_video_ft: 4224 # i3d_rgb: 2048 i3d_flow: 2048 vggish: 128 14 | 15 | preprocess: 16 | vis_processor: 17 | train: 18 | name: "gpt_video_ft" 19 | eval: 20 | name: "gpt_video_ft" 21 | text_processor: 22 | train: 23 | name: "gpt_dialogue" 24 | eval: 25 | name: "gpt_dialogue" -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": false, 18 | "vocab_size": 30524, 19 | "encoder_width": 768, 20 | "add_cross_attention": true 21 | } -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": false, 18 | "vocab_size": 30522, 19 | "encoder_width": 768, 20 | "add_cross_attention": true, 21 | "fusion_layer": 6 22 | } -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": false, 18 | "vocab_size": 30524, 19 | "encoder_width": 1024, 20 | "add_cross_attention": true 21 | } -------------------------------------------------------------------------------- /t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/basiclab/FreeCond/6aadc496e88a97b8170241059b9b27a75dd050f3/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py
--------------------------------------------------------------------------------
/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py:
--------------------------------------------------------------------------------
"""
Copyright (c) 2022, salesforce.com, inc.
All rights reserved.
SPDX-License-Identifier: BSD-3-Clause
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""

from .base_processor import BaseProcessor

from .blip_processors import (
    BlipImageTrainProcessor,
    Blip2ImageTrainProcessor,
    BlipImageEvalProcessor,
    BlipCaptionProcessor,
)

from ..common.registry import registry

__all__ = [
    "BaseProcessor",
    # BLIP
    "BlipImageTrainProcessor",
    "Blip2ImageTrainProcessor",
    "BlipImageEvalProcessor",
    "BlipCaptionProcessor",
]


def load_processor(name, cfg=None):
    """
    Example

    >>> processor = load_processor("alpro_video_train", cfg=None)
    """
    processor = registry.get_processor_class(name).from_config(cfg)

    return processor
--------------------------------------------------------------------------------
/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py:
--------------------------------------------------------------------------------
"""
Copyright (c) 2022, salesforce.com, inc.
All rights reserved.
SPDX-License-Identifier: BSD-3-Clause
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""

from omegaconf import OmegaConf


class BaseProcessor:
    def __init__(self):
        self.transform = lambda x: x
        return

    def __call__(self, item):
        return self.transform(item)

    @classmethod
    def from_config(cls, cfg=None):
        return cls()

    def build(self, **kwargs):
        cfg = OmegaConf.create(kwargs)

        return self.from_config(cfg)
--------------------------------------------------------------------------------
/t2v_metrics/models/vqascore_models/llava/__init__.py:
--------------------------------------------------------------------------------
from .model import LlavaLlamaForCausalLM, LlavaConfig, ModelArguments
--------------------------------------------------------------------------------
/t2v_metrics/models/vqascore_models/llava/model/__init__.py:
--------------------------------------------------------------------------------
from .language_model.llava_llama import LlavaLlamaForCausalLM, LlavaConfig, ModelArguments
--------------------------------------------------------------------------------
/t2v_metrics/models/vqascore_models/llava/model/multimodal_encoder/builder.py:
--------------------------------------------------------------------------------
import os
from .clip_encoder import CLIPVisionTower


def build_vision_tower(vision_tower_cfg, **kwargs):
    vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None))
    is_absolute_path_exists = os.path.exists(vision_tower)
    if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion") or vision_tower.startswith("Lin-Chen"):
        return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)

    raise ValueError(f'Unknown vision tower: {vision_tower}')
--------------------------------------------------------------------------------
/t2v_metrics/models/vqascore_models/llava_16/__init__.py:
--------------------------------------------------------------------------------
from .model import LlavaLlamaForCausalLM, LlavaConfig
--------------------------------------------------------------------------------
/t2v_metrics/models/vqascore_models/llava_16/model/__init__.py:
--------------------------------------------------------------------------------
from .language_model.llava_llama import LlavaLlamaForCausalLM, LlavaConfig
--------------------------------------------------------------------------------
/t2v_metrics/models/vqascore_models/llava_16/model/multimodal_encoder/builder.py:
--------------------------------------------------------------------------------
import os
from .clip_encoder import CLIPVisionTower


def build_vision_tower(vision_tower_cfg, **kwargs):
    vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None))
    is_absolute_path_exists = os.path.exists(vision_tower)
    if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion") or "ShareGPT4V" in vision_tower:
        return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)

    raise ValueError(f'Unknown vision tower: {vision_tower}')
--------------------------------------------------------------------------------
/t2v_metrics/models/vqascore_models/vqa_model.py:
--------------------------------------------------------------------------------
from abc import abstractmethod
from typing import List
import torch

from ..model import ScoreModel

class VQAScoreModel(ScoreModel):

    @abstractmethod
    def forward(self,
                images: List[str],
                texts: List[str],
                question_template: str,
                answer_template: str) -> torch.Tensor:
        """Forward pass of the model to return n scores for n (image, text) pairs (in PyTorch Tensor)
        question_template: a string with optional {} to be replaced with the 'text'
        answer_template: a string with optional {} to be replaced with the 'text'
        """
        pass
--------------------------------------------------------------------------------
/t2v_metrics/vqascore.py:
--------------------------------------------------------------------------------
from typing import List

from .score import Score

from .constants import HF_CACHE_DIR

from .models.vqascore_models import list_all_vqascore_models, get_vqascore_model

class VQAScore(Score):
    def prepare_scoremodel(self,
                           model='clip-flant5-xxl',
                           device='cuda',
                           cache_dir=HF_CACHE_DIR,
                           **kwargs):
        return get_vqascore_model(
            model,
            device=device,
            cache_dir=cache_dir,
            **kwargs
        )

    def list_all_models(self) -> List[str]:
        return list_all_vqascore_models()
--------------------------------------------------------------------------------
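VQAScore above is the user-facing scorer: prepare_scoremodel loads one of the VQA-based models ('clip-flant5-xxl' by default) and the inherited Score machinery runs VQAScoreModel.forward on each (image, text) pair. A minimal usage sketch follows; it assumes the t2v_metrics package re-exports VQAScore from its top-level __init__ and that the Score base class (defined in score.py, not shown here) makes instances callable on lists of image paths and texts — neither detail appears in the files above, and the image path and caption are placeholders.

# Usage sketch only; assumes `t2v_metrics` exposes VQAScore at the package level
# and that Score implements __call__ over (images, texts) batches.
import t2v_metrics

# Load the default VQA scoring model on the GPU.
scorer = t2v_metrics.VQAScore(model='clip-flant5-xxl', device='cuda')

# Returns a torch.Tensor with one alignment score per (image, text) pair.
scores = scorer(images=['inpainted_result.png'],
                texts=['a cat sitting on a red sofa'])
print(scores)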