├── .gitignore ├── LICENSE ├── README.md ├── asset └── teaser.jpg ├── config ├── compositional_image │ └── rbf.yaml └── quantity_aware │ └── rbf.yaml ├── main.py ├── rbf ├── corrector │ ├── __init__.py │ ├── adaptive_sampler.py │ ├── base.py │ ├── dps.py │ ├── reward_model │ │ ├── __init__.py │ │ ├── base.py │ │ ├── counting.py │ │ ├── human.py │ │ ├── imagereward.py │ │ ├── pickscore.py │ │ ├── stylereward.py │ │ ├── vlm.py │ │ ├── vqa.py │ │ └── vqa_server.py │ └── rgrp_sampler.py ├── logger │ ├── __init__.py │ └── logger.py ├── model │ ├── __init__.py │ ├── base.py │ ├── image.py │ └── image_flux.py ├── prior │ ├── __init__.py │ ├── base.py │ ├── denoise_schedulers │ │ ├── __init__.py │ │ └── scheduler.py │ ├── flux.py │ ├── flux_fill.py │ ├── instaflow.py │ ├── sd.py │ └── sd2.py ├── rbf.py ├── shared_modules.py ├── time_sampler │ ├── __init__.py │ └── base.py └── utils │ ├── camera_utils.py │ ├── config_utils.py │ ├── extra_utils.py │ ├── fs_travel_utils.py │ ├── image_utils.py │ ├── path_utils.py │ ├── print_utils.py │ └── random_utils.py ├── requirements.txt ├── setup.py └── third-party └── t2v_metrics ├── .gitignore ├── LICENSE ├── README.md ├── dataset.py ├── datasets ├── SeeTRUE.csv ├── dsg_tifa160_anns.csv ├── stanfordt23d.json ├── sugar_crepe │ ├── add_att.json │ ├── add_obj.json │ ├── replace_att.json │ ├── replace_obj.json │ ├── replace_rel.json │ ├── swap_att.json │ └── swap_obj.json ├── t2vscore_alignment_score.json ├── t2vscore_quality_score.json ├── t2vscore_results.csv └── tifa160.json ├── eval.py ├── genai_bench ├── evaluate.py ├── generate.py └── model_performance_vqacore.md ├── genai_image_eval.py ├── genai_image_ranking.py ├── genai_video_eval.py ├── gpt4_eval.py ├── images ├── 0 │ ├── DALLE3.png │ ├── DeepFloyd.jpg │ ├── Midjourney.jpg │ └── SDXL.jpg ├── 1 │ ├── DALLE3.png │ ├── DeepFloyd.jpg │ ├── Midjourney.jpg │ └── SDXL.jpg ├── 0.png ├── 1.png └── example.png ├── pyproject.toml ├── requirements.txt ├── t2v_metrics ├── __init__.py ├── clipscore.py ├── constants.py ├── itmscore.py ├── models │ ├── __init__.py │ ├── clipscore_models │ │ ├── __init__.py │ │ ├── blip2_itc_model.py │ │ ├── clip_model.py │ │ ├── hpsv2_model.py │ │ └── pickscore_model.py │ ├── itmscore_models │ │ ├── __init__.py │ │ ├── blip2_itm_model.py │ │ └── image_reward_model.py │ ├── model.py │ └── vqascore_models │ │ ├── __init__.py │ │ ├── clip_t5 │ │ ├── __init__.py │ │ └── model │ │ │ ├── __init__.py │ │ │ ├── language_model │ │ │ └── clip_t5.py │ │ │ ├── multimodal_encoder │ │ │ ├── builder.py │ │ │ └── clip_encoder.py │ │ │ └── multimodal_projector │ │ │ └── builder.py │ │ ├── clip_t5_model.py │ │ ├── gpt4v_model.py │ │ ├── instructblip_model.py │ │ ├── lavis │ │ ├── __init__.py │ │ ├── common │ │ │ ├── config.py │ │ │ ├── dist_utils.py │ │ │ ├── gradcam.py │ │ │ ├── logger.py │ │ │ ├── optims.py │ │ │ ├── registry.py │ │ │ ├── utils.py │ │ │ └── vqa_tools │ │ │ │ ├── __init__.py │ │ │ │ ├── vqa.py │ │ │ │ └── vqa_eval.py │ │ ├── configs │ │ │ ├── datasets │ │ │ │ ├── aokvqa │ │ │ │ │ └── defaults.yaml │ │ │ │ ├── avsd │ │ │ │ │ └── defaults_dial.yaml │ │ │ │ ├── coco │ │ │ │ │ ├── defaults_cap.yaml │ │ │ │ │ ├── defaults_ret.yaml │ │ │ │ │ ├── defaults_vqa.yaml │ │ │ │ │ └── eval_vqa.yaml │ │ │ │ ├── conceptual_caption │ │ │ │ │ ├── defaults_12m.yaml │ │ │ │ │ └── defaults_3m.yaml │ │ │ │ ├── didemo │ │ │ │ │ └── defaults_ret.yaml │ │ │ │ ├── flickr30k │ │ │ │ │ └── defaults.yaml │ │ │ │ ├── gqa │ │ │ │ │ ├── balanced_testdev.yaml │ │ │ │ │ ├── balanced_val.yaml │ │ │ │ │ └── defaults.yaml 
│ │ │ │ ├── imagenet │ │ │ │ │ └── defaults.yaml │ │ │ │ ├── laion │ │ │ │ │ └── defaults_2B_multi.yaml │ │ │ │ ├── msrvtt │ │ │ │ │ ├── defaults_cap.yaml │ │ │ │ │ ├── defaults_qa.yaml │ │ │ │ │ └── defaults_ret.yaml │ │ │ │ ├── msvd │ │ │ │ │ ├── defaults_cap.yaml │ │ │ │ │ └── defaults_qa.yaml │ │ │ │ ├── nlvr │ │ │ │ │ └── defaults.yaml │ │ │ │ ├── nocaps │ │ │ │ │ └── defaults.yaml │ │ │ │ ├── okvqa │ │ │ │ │ └── defaults.yaml │ │ │ │ ├── sbu_caption │ │ │ │ │ └── defaults.yaml │ │ │ │ ├── snli_ve │ │ │ │ │ └── defaults.yaml │ │ │ │ ├── vatex │ │ │ │ │ └── defaults_cap.yaml │ │ │ │ └── vg │ │ │ │ │ ├── defaults_caption.yaml │ │ │ │ │ └── defaults_vqa.yaml │ │ │ ├── default.yaml │ │ │ └── models │ │ │ │ ├── albef_classification_ve.yaml │ │ │ │ ├── albef_feature_extractor.yaml │ │ │ │ ├── albef_nlvr.yaml │ │ │ │ ├── albef_pretrain_base.yaml │ │ │ │ ├── albef_retrieval_coco.yaml │ │ │ │ ├── albef_retrieval_flickr.yaml │ │ │ │ ├── albef_vqav2.yaml │ │ │ │ ├── alpro_qa_msrvtt.yaml │ │ │ │ ├── alpro_qa_msvd.yaml │ │ │ │ ├── alpro_retrieval_didemo.yaml │ │ │ │ ├── alpro_retrieval_msrvtt.yaml │ │ │ │ ├── bert_config.json │ │ │ │ ├── bert_config_alpro.json │ │ │ │ ├── blip2 │ │ │ │ ├── blip2_caption_flant5xl.yaml │ │ │ │ ├── blip2_caption_opt2.7b.yaml │ │ │ │ ├── blip2_caption_opt6.7b.yaml │ │ │ │ ├── blip2_coco.yaml │ │ │ │ ├── blip2_instruct_flant5xl.yaml │ │ │ │ ├── blip2_instruct_flant5xxl.yaml │ │ │ │ ├── blip2_instruct_vicuna13b.yaml │ │ │ │ ├── blip2_instruct_vicuna7b.yaml │ │ │ │ ├── blip2_pretrain.yaml │ │ │ │ ├── blip2_pretrain_flant5xl.yaml │ │ │ │ ├── blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml │ │ │ │ ├── blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml │ │ │ │ ├── blip2_pretrain_flant5xl_vitL.yaml │ │ │ │ ├── blip2_pretrain_flant5xxl.yaml │ │ │ │ ├── blip2_pretrain_opt2.7b.yaml │ │ │ │ ├── blip2_pretrain_opt6.7b.yaml │ │ │ │ ├── blip2_pretrain_vitL.yaml │ │ │ │ ├── blip2_vicuna13b.yaml │ │ │ │ └── blip2_vicuna7b.yaml │ │ │ │ ├── blip_caption_base_coco.yaml │ │ │ │ ├── blip_caption_large_coco.yaml │ │ │ │ ├── blip_classification_base.yaml │ │ │ │ ├── blip_feature_extractor_base.yaml │ │ │ │ ├── blip_itm_base.yaml │ │ │ │ ├── blip_itm_large.yaml │ │ │ │ ├── blip_nlvr.yaml │ │ │ │ ├── blip_pretrain_base.yaml │ │ │ │ ├── blip_pretrain_large.yaml │ │ │ │ ├── blip_retrieval_coco.yaml │ │ │ │ ├── blip_retrieval_flickr.yaml │ │ │ │ ├── blip_vqa_aokvqa.yaml │ │ │ │ ├── blip_vqa_okvqa.yaml │ │ │ │ ├── blip_vqav2.yaml │ │ │ │ ├── clip │ │ │ │ ├── RN101-quickgelu.json │ │ │ │ ├── RN101.json │ │ │ │ ├── RN50-quickgelu.json │ │ │ │ ├── RN50.json │ │ │ │ ├── RN50x16.json │ │ │ │ ├── RN50x4.json │ │ │ │ ├── ViT-B-16-plus-240.json │ │ │ │ ├── ViT-B-16-plus.json │ │ │ │ ├── ViT-B-16.json │ │ │ │ ├── ViT-B-32-plus-256.json │ │ │ │ ├── ViT-B-32-quickgelu.json │ │ │ │ ├── ViT-B-32.json │ │ │ │ ├── ViT-H-14.json │ │ │ │ ├── ViT-H-16.json │ │ │ │ ├── ViT-L-14-280.json │ │ │ │ ├── ViT-L-14-336.json │ │ │ │ ├── ViT-L-14.json │ │ │ │ ├── ViT-L-16-320.json │ │ │ │ ├── ViT-L-16.json │ │ │ │ ├── ViT-g-14.json │ │ │ │ ├── timm-efficientnetv2_rw_s.json │ │ │ │ ├── timm-resnet50d.json │ │ │ │ ├── timm-resnetaa50d.json │ │ │ │ ├── timm-resnetblur50.json │ │ │ │ ├── timm-swin_base_patch4_window7_224.json │ │ │ │ ├── timm-vit_base_patch16_224.json │ │ │ │ ├── timm-vit_base_patch32_224.json │ │ │ │ └── timm-vit_small_patch16_224.json │ │ │ │ ├── clip_resnet50.yaml │ │ │ │ ├── clip_vit_base16.yaml │ │ │ │ ├── clip_vit_base32.yaml │ │ │ │ ├── clip_vit_large14.yaml │ │ │ │ ├── 
clip_vit_large14_336.yaml │ │ │ │ ├── gpt_dialogue_base.yaml │ │ │ │ ├── img2prompt-vqa │ │ │ │ └── img2prompt_vqa_base.yaml │ │ │ │ ├── med_config.json │ │ │ │ ├── med_config_albef.json │ │ │ │ ├── med_large_config.json │ │ │ │ └── pnp-vqa │ │ │ │ ├── pnp_vqa_3b.yaml │ │ │ │ ├── pnp_vqa_base.yaml │ │ │ │ ├── pnp_vqa_large.yaml │ │ │ │ ├── unifiedqav2_3b_config.json │ │ │ │ ├── unifiedqav2_base_config.json │ │ │ │ └── unifiedqav2_large_config.json │ │ ├── models │ │ │ ├── __init__.py │ │ │ ├── albef_models │ │ │ │ ├── __init__.py │ │ │ │ ├── albef_classification.py │ │ │ │ ├── albef_feature_extractor.py │ │ │ │ ├── albef_nlvr.py │ │ │ │ ├── albef_outputs.py │ │ │ │ ├── albef_pretrain.py │ │ │ │ ├── albef_retrieval.py │ │ │ │ └── albef_vqa.py │ │ │ ├── base_model.py │ │ │ ├── blip2_models │ │ │ │ ├── Qformer.py │ │ │ │ ├── __init__.py │ │ │ │ ├── blip2.py │ │ │ │ ├── blip2_image_text_matching.py │ │ │ │ ├── blip2_qformer.py │ │ │ │ ├── blip2_t5.py │ │ │ │ ├── blip2_t5_instruct.py │ │ │ │ ├── blip2_vicuna.py │ │ │ │ ├── blip2_vicuna_instruct.py │ │ │ │ ├── modeling_llama.py │ │ │ │ └── modeling_t5.py │ │ │ ├── blip_models │ │ │ │ ├── __init__.py │ │ │ │ ├── blip.py │ │ │ │ ├── blip_caption.py │ │ │ │ ├── blip_classification.py │ │ │ │ ├── blip_feature_extractor.py │ │ │ │ ├── blip_image_text_matching.py │ │ │ │ ├── blip_nlvr.py │ │ │ │ ├── blip_outputs.py │ │ │ │ ├── blip_pretrain.py │ │ │ │ ├── blip_retrieval.py │ │ │ │ ├── blip_vqa.py │ │ │ │ └── nlvr_encoder.py │ │ │ ├── clip_vit.py │ │ │ ├── eva_vit.py │ │ │ ├── med.py │ │ │ └── vit.py │ │ └── processors │ │ │ ├── __init__.py │ │ │ ├── base_processor.py │ │ │ ├── blip_processors.py │ │ │ └── randaugment.py │ │ ├── llava │ │ ├── __init__.py │ │ └── model │ │ │ ├── __init__.py │ │ │ ├── language_model │ │ │ └── llava_llama.py │ │ │ ├── llava_arch.py │ │ │ ├── multimodal_encoder │ │ │ ├── builder.py │ │ │ └── clip_encoder.py │ │ │ └── multimodal_projector │ │ │ └── builder.py │ │ ├── llava16_model.py │ │ ├── llava_16 │ │ ├── __init__.py │ │ └── model │ │ │ ├── __init__.py │ │ │ ├── language_model │ │ │ └── llava_llama.py │ │ │ ├── llava_arch.py │ │ │ ├── multimodal_encoder │ │ │ ├── builder.py │ │ │ └── clip_encoder.py │ │ │ └── multimodal_projector │ │ │ └── builder.py │ │ ├── llava_model.py │ │ ├── mm_utils.py │ │ └── vqa_model.py ├── score.py └── vqascore.py └── tau_optimization.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 KAIST Visual AI Group 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /asset/teaser.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KAIST-Visual-AI-Group/Flow-Inference-Time-Scaling/72fba24af5ac8d5ce6c90d3b0c68193d188272b3/asset/teaser.jpg -------------------------------------------------------------------------------- /config/compositional_image/rbf.yaml: -------------------------------------------------------------------------------- 1 | batch_size: 2 2 | init_n_particles: 25 3 | 4 | convert_scheduler: vp 5 | sample_method: sde 6 | text_prompt: 7 | 8 | #=============================================================================== 9 | 10 | max_nfe: 500 11 | max_steps: 10 12 | block_size: 1 13 | n_particles: 1 14 | scheduler_n: 1.0 15 | 16 | #=============================================================================== 17 | 18 | root_dir: ./results 19 | tag: ${filtering_method} 20 | save_now: True 21 | 22 | device: 0 23 | seed: 0 24 | 25 | filtering_method: rbf 26 | 27 | #=============================================================================== 28 | # Trainer settings 29 | #=============================================================================== 30 | disable_debug: False 31 | 32 | #=============================================================================== 33 | # Dataset 34 | #=============================================================================== 35 | height: 1024 36 | width: 1024 37 | 38 | #=============================================================================== 39 | # Time sampler 40 | #=============================================================================== 41 | time_sampler: flux_scheduler 42 | time_schedule: exp 43 | t_max: 1000 44 | 45 | #=============================================================================== 46 | # Model 47 | #=============================================================================== 48 | model: flux_image 49 | channels: 4096 # [B 4096 64] 50 | 51 | #=============================================================================== 52 | # Prior 53 | #=============================================================================== 54 | prior: flux 55 | guidance_scale: 3.5 56 | 57 | diffusion_coefficient: square 58 | diffusion_norm: 3.0 59 | 60 | # Only used for "exp" diffusion coefficient 61 | exp_diff_coeff_sigma: 0.1 62 | 63 | model_name: "black-forest-labs/FLUX.1-schnell" 64 | 65 | #=============================================================================== 66 | # Logger 67 | #=============================================================================== 68 | logger: self 69 | log_interval: 1 70 | 71 | #=============================================================================== 72 | # Corrector 73 | #=============================================================================== 74 | corrector: particle 75 | 76 | reward_weight: 0.5 77 | reward_score: vqa 78 | 79 | vqa_model: clip-flant5-xxl 80 | vqa_batch_size: 32 81 | 82 | vqa_device: 1 83 | image_reward_weight: 0.0 84 | -------------------------------------------------------------------------------- /config/quantity_aware/rbf.yaml: 
-------------------------------------------------------------------------------- 1 | max_nfe: 500 2 | batch_size: 2 3 | tau_norm: 0.0 4 | 5 | text_prompt: Six airplanes flying over a desert with seven camels walking below 6 | class_gt_counts: 6, 7 7 | class_names: airplanes, camels 8 | 9 | max_steps: 10 10 | 11 | block_size: 1 12 | n_particles: 1 13 | sample_method: sde 14 | convert_scheduler: vp 15 | scheduler_n: 1 16 | 17 | init_n_particles: 25 18 | 19 | 20 | #=============================================================================== 21 | 22 | root_dir: ./results 23 | tag: ${filtering_method} 24 | save_now: True 25 | 26 | device: 0 27 | seed: 0 28 | 29 | filtering_method: rbf 30 | 31 | #=============================================================================== 32 | # Trainer settings 33 | #=============================================================================== 34 | disable_debug: False 35 | 36 | #=============================================================================== 37 | # Dataset 38 | #=============================================================================== 39 | height: 1024 40 | width: 1024 41 | 42 | #=============================================================================== 43 | # Time sampler 44 | #=============================================================================== 45 | time_sampler: flux_scheduler 46 | time_schedule: exp 47 | t_max: 1000 48 | 49 | #=============================================================================== 50 | # Model 51 | #=============================================================================== 52 | model: flux_image 53 | channels: 4096 # [B 4096 64] 54 | 55 | #=============================================================================== 56 | # Prior 57 | #=============================================================================== 58 | prior: flux 59 | guidance_scale: 3.5 60 | 61 | diffusion_coefficient: square 62 | diffusion_norm: 3.0 63 | 64 | model_name: "black-forest-labs/FLUX.1-schnell" 65 | 66 | #=============================================================================== 67 | # Logger 68 | #=============================================================================== 69 | logger: self 70 | log_interval: 1 71 | 72 | #=============================================================================== 73 | # Corrector 74 | #=============================================================================== 75 | corrector: particle 76 | 77 | reward_score: counting 78 | reward_func: diff 79 | count_reward_model: gdsam -------------------------------------------------------------------------------- /rbf/corrector/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import DDIMCorrector 2 | from .rgrp_sampler import RGRPSampler 3 | from .adaptive_sampler import AdaptiveSampler 4 | from .dps import DiffRewardCorrector 5 | 6 | CORRECTORs = { 7 | "ddim": DDIMCorrector, 8 | "particle": RGRPSampler, 9 | "adaptive": AdaptiveSampler, 10 | "diff": DiffRewardCorrector, 11 | } 12 | 13 | CORRECTOR_REQUIRING_GRADIENT = ["diff"] -------------------------------------------------------------------------------- /rbf/corrector/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from dataclasses import dataclass 3 | from random import randint 4 | 5 | import torch 6 | import torch.nn.functional as F 7 | 8 | from rbf.utils.extra_utils import ignore_kwargs 9 | from 
rbf.utils.print_utils import print_warning, print_error, print_info 10 | from rbf import shared_modules as sm 11 | 12 | 13 | class Corrector(ABC): 14 | @ignore_kwargs 15 | @dataclass 16 | class Config: 17 | correct_steps: int = 1 18 | 19 | 20 | def __init__(self, cfg): 21 | self.cfg = self.Config(**cfg) 22 | self.potentials = [] 23 | 24 | 25 | @abstractmethod 26 | def pre_correct(self, images): 27 | # Correct samples 28 | pass 29 | 30 | 31 | @abstractmethod 32 | def post_correct(self, images): 33 | # Correct samples 34 | pass 35 | 36 | 37 | class DDIMCorrector(Corrector): 38 | def pre_correct( 39 | self, 40 | noisy_sample, 41 | tweedie, 42 | model_pred, 43 | step=None 44 | ): 45 | return noisy_sample, model_pred 46 | 47 | def post_correct( 48 | self, 49 | prev_noisy_sample, 50 | tweedie, 51 | model_pred, 52 | step, 53 | ): 54 | return prev_noisy_sample, tweedie, model_pred 55 | 56 | def final_correct( 57 | self, 58 | noisy_sample, 59 | tweedie, 60 | step=None 61 | ): 62 | return noisy_sample, tweedie 63 | 64 | -------------------------------------------------------------------------------- /rbf/corrector/dps.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from dataclasses import dataclass 3 | from random import randint 4 | 5 | import torch 6 | import torch.nn.functional as F 7 | 8 | from rbf.utils.extra_utils import ignore_kwargs 9 | from rbf.utils.print_utils import print_warning, print_error, print_info 10 | from rbf import shared_modules as sm 11 | from rbf.corrector.base import Corrector 12 | from rbf.corrector.rgrp_sampler import RGRPSampler 13 | 14 | 15 | class DiffRewardCorrector(RGRPSampler): 16 | @ignore_kwargs 17 | @dataclass 18 | class Config(RGRPSampler.Config): 19 | strength: float = 1.0 20 | device: int = 0 21 | batch_size: int = 1 22 | n_particles: int = 1 23 | 24 | reward_score: str = 'style' 25 | guidance_method: str = 'dps' 26 | 27 | disable_debug: bool = False 28 | log_interval: int = 5 29 | 30 | 31 | def __init__(self, cfg): 32 | super().__init__(cfg) 33 | self.cfg = self.Config(**cfg) 34 | 35 | 36 | def adjust_sample_size(self, t_curr, step): 37 | return self.cfg.n_particles 38 | 39 | 40 | def apply_guidance( 41 | self, 42 | noisy_sample, 43 | tweedie, 44 | step, 45 | ): 46 | 47 | if self.cfg.guidance_method == "dps": 48 | weight = self.reward_model(tweedie, step) 49 | 50 | # NOTE: Weight is computed as -loss. 
51 | # Applying guidance with +gradient 52 | 53 | grad = torch.autograd.grad(weight.sum(), noisy_sample)[0] 54 | 55 | prev_latent_noisy = noisy_sample + (self.cfg.strength * grad) / torch.abs(weight.view(-1, * ([1] * (len(noisy_sample.shape) - 1)) )) 56 | 57 | else: 58 | raise NotImplementedError(f"Guidance {self.cfg.guidance_method} not implemented") 59 | 60 | return prev_latent_noisy 61 | 62 | 63 | def post_correct( 64 | self, 65 | noisy_sample, 66 | tweedie, 67 | model_pred, 68 | step, 69 | ): 70 | 71 | rgb_tweedie = sm.prior.decode_latent( 72 | tweedie, convert_to_float=False 73 | ) 74 | 75 | # Apply guidance (DPS/FreeDoM) 76 | prev_latent_noisy = self.apply_guidance( 77 | noisy_sample, 78 | rgb_tweedie, 79 | step, 80 | ) 81 | 82 | ( 83 | resample_noisy_sample, 84 | resample_tweedie, 85 | resample_model_pred 86 | ) = super().post_correct( 87 | prev_latent_noisy, 88 | tweedie, 89 | model_pred, 90 | ) 91 | 92 | return (resample_noisy_sample, resample_tweedie, resample_model_pred) # B x D -------------------------------------------------------------------------------- /rbf/corrector/reward_model/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import AestheticRewardModel, CompressionRewardModel, InpaintingRewardModel 2 | from .counting import CountingRewardModel 3 | from .vlm import VLMRewardModel 4 | from .human import HumanRewardModel 5 | from .pickscore import PickScoreRewardModel 6 | from .vqa import VQARewardModel 7 | from .imagereward import ImageRewardRewardModel -------------------------------------------------------------------------------- /rbf/corrector/reward_model/vqa_server.py: -------------------------------------------------------------------------------- 1 | from multiprocessing.managers import BaseManager 2 | import torch 3 | import argparse 4 | import time 5 | import t2v_metrics 6 | import ImageReward as RM 7 | 8 | class RemoteVQAManager(BaseManager): 9 | pass 10 | 11 | @torch.no_grad() 12 | def process_VQA(input): 13 | start_time = time.time(); 14 | 15 | images = input.get("images"); 16 | text_prompt = input.get("text"); 17 | image_reward_weight = input.get("irw"); 18 | 19 | scores = [] 20 | for idx in range(0, len(images), vqa_batch_size): 21 | cur_batch_size = min(vqa_batch_size, len(images) - idx) 22 | cur_images = images[idx:idx+cur_batch_size] 23 | cur_scores = vqa_reward_model(images=cur_images, texts=[text_prompt]) 24 | 25 | if image_reward_weight > 0.0: 26 | image_reward_score = rm_model.score(text_prompt, cur_images) 27 | if type(image_reward_score) is float: 28 | image_reward_score = [image_reward_score] 29 | image_reward_score = torch.tensor(image_reward_score).to(cur_scores) 30 | cur_scores = cur_scores + image_reward_score[:, None] * image_reward_weight 31 | 32 | scores += cur_scores.reshape(-1).cpu().numpy().tolist(); 33 | 34 | output = { 35 | "scores": scores 36 | }; 37 | 38 | print("Reward calculation took {:.3f}s".format(time.time() - start_time)); 39 | return output; 40 | 41 | 42 | if __name__ == "__main__": 43 | parser = argparse.ArgumentParser(); 44 | 45 | parser.add_argument("--gpu", type = str, default = "0"); 46 | parser.add_argument("--addr", type = int, default = 5000); 47 | parser.add_argument("--vqa_model", type = str, default = "clip-flant5-xxl"); 48 | parser.add_argument("--vqa_batch_size", type = int, default = 32); 49 | 50 | args = parser.parse_args(); 51 | 52 | RemoteVQAManager.register("process_VQA", callable = process_VQA); 53 | 54 | device = "cuda:{}".format(args.gpu); 55
| vqa_batch_size = args.vqa_batch_size; 56 | vqa_model = args.vqa_model; 57 | 58 | vqa_reward_model = t2v_metrics.get_score_model(model = vqa_model, device = device); 59 | rm_model = RM.load("ImageReward-v1.0", device = device); 60 | 61 | manager = RemoteVQAManager(address=("localhost", int(args.addr)), authkey=b"secret") 62 | server = manager.get_server() 63 | print("Server started... Listening for requests.") 64 | server.serve_forever() 65 | -------------------------------------------------------------------------------- /rbf/logger/__init__.py: -------------------------------------------------------------------------------- 1 | from .logger import SelfLogger 2 | 3 | LOGGERs = { 4 | "self": SelfLogger, 5 | } 6 | -------------------------------------------------------------------------------- /rbf/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .image import ImageModel 2 | from .image_flux import FluxImageModel 3 | 4 | MODELs = { 5 | "image": ImageModel, 6 | "flux_image": FluxImageModel, 7 | } 8 | -------------------------------------------------------------------------------- /rbf/prior/__init__.py: -------------------------------------------------------------------------------- 1 | from .sd import ( 2 | StableDiffusionPrior, 3 | ) 4 | 5 | from .flux import FluxPrior 6 | from .instaflow import InstaFlowPrior 7 | from .flux_fill import FluxFillPrior 8 | 9 | from .sd2 import SD2Prior 10 | PRIORs = { 11 | "sd": StableDiffusionPrior, 12 | "flux": FluxPrior, 13 | "flux_fill": FluxFillPrior, 14 | "instaflow": InstaFlowPrior, 15 | "sd2": SD2Prior, 16 | } 17 | -------------------------------------------------------------------------------- /rbf/prior/denoise_schedulers/__init__.py: -------------------------------------------------------------------------------- 1 | from .scheduler import ( 2 | CondOTScheduler, 3 | PolynomialConvexScheduler, 4 | VPScheduler, 5 | LinearVPScheduler, 6 | CosineScheduler, 7 | GeneralConvexScheduler, 8 | ) 9 | 10 | -------------------------------------------------------------------------------- /rbf/shared_modules.py: -------------------------------------------------------------------------------- 1 | dataset = None 2 | background = None 3 | model = None 4 | prior = None 5 | logger = None 6 | time_sampler = None 7 | noise_sampler = None 8 | corrector = None 9 | 10 | OFF_LOG = False 11 | DO_NOT_SAVE_INTERMEDIATE_IMAGES = False 12 | 13 | def assert_initialized(): 14 | assert ( 15 | dataset is not None 16 | and background is not None 17 | and model is not None 18 | and prior is not None 19 | and logger is not None 20 | and time_sampler is not None 21 | and noise_sampler is not None 22 | ), "Please initialize the shared modules before using them." 
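For reference, vqa_server.py above exposes process_VQA through a multiprocessing.managers.BaseManager bound to localhost (port taken from --addr, default 5000) with authkey b"secret". The following is a minimal client sketch under those defaults; the image path and prompt are placeholders, and the request keys ("images", "text", "irw") mirror the ones the server reads:

from multiprocessing.managers import BaseManager
from PIL import Image

class RemoteVQAManager(BaseManager):
    pass

# Register only the name on the client; the callable itself lives in the server process.
RemoteVQAManager.register("process_VQA")

manager = RemoteVQAManager(address=("localhost", 5000), authkey=b"secret")
manager.connect()

request = {
    "images": [Image.open("sample.png").convert("RGB")],  # placeholder image; PIL images pickle across the connection
    "text": "Six airplanes flying over a desert with seven camels walking below",
    "irw": 0.0,  # image_reward_weight; > 0.0 additionally mixes in the ImageReward score
}
result = manager.process_VQA(request)  # the server returns a proxy to its result dict
print(result.get("scores"))            # one VQA score per submitted image

Running the scorer in a separate process like this is presumably what allows the heavyweight VQA model to sit on its own GPU (the vqa_device entry in the compositional_image config) while the sampler occupies another.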
-------------------------------------------------------------------------------- /rbf/time_sampler/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import LinearAnnealingTimeSampler, FluxTimeSampler, SDTimeSampler 2 | 3 | TIME_SAMPLERs = { 4 | "linear_annealing": LinearAnnealingTimeSampler, 5 | "flux_scheduler": FluxTimeSampler, 6 | "sd_scheduler": SDTimeSampler 7 | } -------------------------------------------------------------------------------- /rbf/utils/config_utils.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Optional 2 | from omegaconf import OmegaConf, DictConfig 3 | 4 | 5 | def load_config(*yamls: str, cli_args: Optional[list] = None, from_string=False, **kwargs) -> Any: 6 | if from_string: 7 | yaml_confs = [OmegaConf.create(s) for s in yamls] 8 | else: 9 | yaml_confs = [OmegaConf.load(f) for f in yamls] 10 | cli_conf = OmegaConf.from_cli(cli_args) 11 | cfg = OmegaConf.merge(*yaml_confs, cli_conf, kwargs) 12 | OmegaConf.resolve(cfg) 13 | assert isinstance(cfg, DictConfig) 14 | 15 | return cfg 16 | 17 | def fetch_config(self, cfg): 18 | """ 19 | Fetch dataclass variables to local variables 20 | self: any class object 21 | cfg: any dataclass object 22 | """ 23 | 24 | for key, value in cfg.items(): 25 | setattr(self, key, value) 26 | return self 27 | -------------------------------------------------------------------------------- /rbf/utils/print_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | print_utils.py 3 | 4 | Utility functions for printing fancy messages. 5 | """ 6 | 7 | import textwrap 8 | 9 | class color: 10 | purple = '\033[95m' 11 | cyan = '\033[96m' 12 | darkcyan = '\033[36m' 13 | blue = '\033[94m' 14 | green = '\033[92m' 15 | yellow = '\033[93m' 16 | red = '\033[91m' 17 | bold = '\033[1m' 18 | end = '\033[0m' 19 | 20 | def print_with_box(text: str, box_color: str = color.purple, text_color: str = color.end, title: str = "", max_len = 88) -> None: 21 | """ 22 | Prints a message with a box around it. 23 | """ 24 | lines = text.split("\n") 25 | if len(title) > max_len - 3: 26 | title = title[:max_len - 6] + "..." 
27 | text_len = max([len(line) for line in lines]) 28 | title_len = len(title) 29 | line_len = min(max_len, max(title_len, text_len)) 30 | 31 | # if each line is longer than max_len, break it into multiple lines 32 | new_lines = [] 33 | for line in lines: 34 | while len(line) > line_len: 35 | new_lines.append(line[:line_len]) 36 | line = line[line_len:] 37 | new_lines.append(line) 38 | lines = new_lines 39 | 40 | bar_len = line_len - len(title) 41 | front_bar_len = bar_len // 2 42 | back_bar_len = bar_len - front_bar_len 43 | print(box_color+"╭─" + "─"*front_bar_len + title + "─"*back_bar_len + "─╮"+color.end) 44 | for line in lines: 45 | print(box_color+"│ " + text_color + line.ljust(line_len) + box_color + " │"+color.end) 46 | print(box_color+"╰" + "─" * (line_len + 2) + "╯"+color.end) 47 | 48 | def print_warning(*args) -> None: 49 | text = ' '.join(map(str, args)) 50 | print(color.yellow + color.bold + '[Warning] ' + color.end + color.yellow + text + color.end) 51 | 52 | def print_info(*args) -> None: 53 | text = ' '.join(map(str, args)) 54 | print(color.green + color.bold + '[Info] ' + color.end + color.green + text + color.end) 55 | 56 | def print_error(*args) -> None: 57 | text = ' '.join(map(str, args)) 58 | print(color.red + color.bold + '[Error] ' + color.end + color.red + text + color.end) 59 | 60 | def print_note(*args) -> None: 61 | text = ' '.join(map(str, args)) 62 | print(color.cyan + color.bold + '[NOTE] ' + color.end + color.cyan + text + color.end) 63 | 64 | def print_wrap(text, max_width=100): 65 | wrapped_text = textwrap.fill(text, width=max_width) 66 | print(wrapped_text) 67 | 68 | 69 | def print_qna(question, response): 70 | print("************************************") 71 | print_wrap(f"*** [QUESTION]\n{question}") 72 | print("************************************") 73 | print_wrap(f"*** [RESPONSE]\n{response}") 74 | print("************************************") 75 | -------------------------------------------------------------------------------- /rbf/utils/random_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | random_utils.py 3 | 4 | Utility functions for controlling randomness. 5 | """ 6 | 7 | import os 8 | import numpy as np 9 | import torch 10 | import random 11 | 12 | 13 | def seed_everything(seed=0): 14 | """ 15 | Seeds the random number generators of Python, Numpy and PyTorch. 
16 | """ 17 | os.environ["PYTHONHASHSEED"] = str(seed) 18 | random.seed(seed) 19 | np.random.seed(seed) 20 | torch.manual_seed(seed) 21 | torch.cuda.manual_seed(seed) 22 | torch.cuda.manual_seed_all(seed) 23 | torch.backends.cudnn.deterministic = True 24 | torch.backends.cudnn.benchmark = False -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate==1.4.0 2 | diffusers==0.32.2 3 | einops==0.8.1 4 | huggingface-hub==0.29.1 5 | image-reward==1.5 6 | imageio==2.37.0 7 | matplotlib==3.10.0 8 | natsort==8.4.0 9 | numpy==1.26.3 10 | omegaconf==2.3.0 11 | pillow==11.0.0 12 | scikit-learn==1.6.1 13 | scipy==1.15.2 14 | sentencepiece==0.2.0 15 | supervision==0.25.1 16 | tokenizers==0.21.0 17 | tqdm==4.67.1 18 | transformers==4.49.0 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup( 4 | name="rbf", 5 | version=0.1, 6 | packages=["rbf"], 7 | zip_safe=False, 8 | ) 9 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/eval.py: -------------------------------------------------------------------------------- 1 | # Evaluate on all datasets in VQAScore paper 2 | 3 | import argparse 4 | import os 5 | import t2v_metrics 6 | from dataset import Winoground, NaturalBench_Retrieval, EqBen_Mini, StanfordT23D, TIFA160_DSG, Flickr8K_CF, SeeTrue, Pickapic_v1, T2VScore 7 | 8 | 9 | def config(): 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument("--root_dir", default="./datasets", type=str, 12 | help='Root directory for saving datasets.') 13 | parser.add_argument("--cache_dir", default=t2v_metrics.constants.HF_CACHE_DIR, type=str) 14 | parser.add_argument("--device", default="cuda", type=str) 15 | parser.add_argument("--batch_size", default=16, type=int) 16 | parser.add_argument("--model", default="clip-flant5-xxl", type=str) 17 | parser.add_argument("--question", default=None, type=str) 18 | parser.add_argument("--answer", default=None, type=str) 19 | return parser.parse_args() 20 | 21 | def main(): 22 | args = config() 23 | if not os.path.exists(args.root_dir): 24 | os.makedirs(args.root_dir) 25 | 26 | score_func = t2v_metrics.get_score_model(model=args.model, device=args.device, cache_dir=args.cache_dir) 27 | 28 | kwargs = {} 29 | if args.question is not None: 30 | print(f"Using question template: {args.question}") 31 | kwargs['question_template'] = args.question 32 | if args.answer is not None: 33 | print(f"Using answer template: {args.answer}") 34 | kwargs['answer_template'] = args.answer 35 | 36 | print(f"Performance of {args.model}.") 37 | for dataset_cls in [ 38 | Winoground, 39 | NaturalBench_Retrieval, 40 | EqBen_Mini, 41 | TIFA160_DSG, 42 | Pickapic_v1, 43 | SeeTrue, 44 | StanfordT23D, 45 | T2VScore, 46 | Flickr8K_CF 47 | ]: 48 | 49 | dataset = dataset_cls(root_dir=args.root_dir) 50 | scores = score_func.batch_forward(dataset, batch_size=args.batch_size, **kwargs).cpu() 51 | dataset.evaluate_scores(scores) 52 | 53 | if __name__ == "__main__": 54 | main() 55 | 56 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/genai_video_eval.py: -------------------------------------------------------------------------------- 1 | # Evaluate on GenAI-Bench-Video using a specific model 2 | # 
Example scripts to run: 3 | # VQAScore: python genai_video_eval.py --model clip-flant5-xxl 4 | # CLIPScore: python genai_video_eval.py --model openai:ViT-L-14-336 5 | import argparse 6 | import os 7 | import t2v_metrics 8 | from dataset import GenAIBench_Video 9 | import json 10 | import torch 11 | import numpy as np 12 | from genai_image_eval import show_performance_per_skill 13 | 14 | def config(): 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument("--root_dir", default="./datasets", type=str, 17 | help='Root directory for saving datasets.') 18 | parser.add_argument("--cache_dir", default=t2v_metrics.constants.HF_CACHE_DIR, type=str) 19 | parser.add_argument("--device", default="cuda", type=str) 20 | parser.add_argument("--batch_size", default=16, type=int) 21 | parser.add_argument("--num_prompts", default=800, type=int, choices=[527, 800]) 22 | parser.add_argument("--model", default="clip-flant5-xxl", type=str) 23 | parser.add_argument("--question", default=None, type=str) 24 | parser.add_argument("--answer", default=None, type=str) 25 | parser.add_argument("--result_dir", default="./genai_video_results", type=str) 26 | parser.add_argument("--eval_mode", default="avg_frames", type=str) 27 | return parser.parse_args() 28 | 29 | 30 | def main(): 31 | args = config() 32 | if not os.path.exists(args.root_dir): 33 | os.makedirs(args.root_dir) 34 | 35 | os.makedirs(args.result_dir, exist_ok=True) 36 | result_path = f"{args.result_dir}/{args.model}_{args.eval_mode}_{args.num_prompts}_prompts.pt" 37 | dataset = GenAIBench_Video(root_dir=args.root_dir, eval_mode=args.eval_mode, num_prompts=args.num_prompts) 38 | if os.path.exists(result_path): 39 | print(f"Result file {result_path} already exists. Skipping.") 40 | scores = torch.load(result_path) 41 | else: 42 | score_func = t2v_metrics.get_score_model(model=args.model, device=args.device, cache_dir=args.cache_dir) 43 | 44 | kwargs = {} 45 | if args.question is not None: 46 | print(f"Using question template: {args.question}") 47 | kwargs['question_template'] = args.question 48 | if args.answer is not None: 49 | print(f"Using answer template: {args.answer}") 50 | kwargs['answer_template'] = args.answer 51 | 52 | print(f"Performance of {args.model} on using {args.eval_mode}.") 53 | scores = score_func.batch_forward(dataset, batch_size=args.batch_size, **kwargs).cpu() 54 | torch.save(scores, result_path) 55 | 56 | ### Get performance per skill 57 | our_scores = scores.mean(axis=1) 58 | show_performance_per_skill(our_scores, dataset, items_name='videos', prompt_to_items_name='prompt_to_videos', print_std=True) 59 | 60 | print("Alignment Performance") 61 | ### Alignment performance 62 | dataset.evaluate_scores(scores) 63 | 64 | 65 | 66 | if __name__ == "__main__": 67 | main() 68 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/images/0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KAIST-Visual-AI-Group/Flow-Inference-Time-Scaling/72fba24af5ac8d5ce6c90d3b0c68193d188272b3/third-party/t2v_metrics/images/0.png -------------------------------------------------------------------------------- /third-party/t2v_metrics/images/0/DALLE3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KAIST-Visual-AI-Group/Flow-Inference-Time-Scaling/72fba24af5ac8d5ce6c90d3b0c68193d188272b3/third-party/t2v_metrics/images/0/DALLE3.png 
-------------------------------------------------------------------------------- /third-party/t2v_metrics/images/0/DeepFloyd.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KAIST-Visual-AI-Group/Flow-Inference-Time-Scaling/72fba24af5ac8d5ce6c90d3b0c68193d188272b3/third-party/t2v_metrics/images/0/DeepFloyd.jpg -------------------------------------------------------------------------------- /third-party/t2v_metrics/images/0/Midjourney.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KAIST-Visual-AI-Group/Flow-Inference-Time-Scaling/72fba24af5ac8d5ce6c90d3b0c68193d188272b3/third-party/t2v_metrics/images/0/Midjourney.jpg -------------------------------------------------------------------------------- /third-party/t2v_metrics/images/0/SDXL.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KAIST-Visual-AI-Group/Flow-Inference-Time-Scaling/72fba24af5ac8d5ce6c90d3b0c68193d188272b3/third-party/t2v_metrics/images/0/SDXL.jpg -------------------------------------------------------------------------------- /third-party/t2v_metrics/images/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KAIST-Visual-AI-Group/Flow-Inference-Time-Scaling/72fba24af5ac8d5ce6c90d3b0c68193d188272b3/third-party/t2v_metrics/images/1.png -------------------------------------------------------------------------------- /third-party/t2v_metrics/images/1/DALLE3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KAIST-Visual-AI-Group/Flow-Inference-Time-Scaling/72fba24af5ac8d5ce6c90d3b0c68193d188272b3/third-party/t2v_metrics/images/1/DALLE3.png -------------------------------------------------------------------------------- /third-party/t2v_metrics/images/1/DeepFloyd.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KAIST-Visual-AI-Group/Flow-Inference-Time-Scaling/72fba24af5ac8d5ce6c90d3b0c68193d188272b3/third-party/t2v_metrics/images/1/DeepFloyd.jpg -------------------------------------------------------------------------------- /third-party/t2v_metrics/images/1/Midjourney.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KAIST-Visual-AI-Group/Flow-Inference-Time-Scaling/72fba24af5ac8d5ce6c90d3b0c68193d188272b3/third-party/t2v_metrics/images/1/Midjourney.jpg -------------------------------------------------------------------------------- /third-party/t2v_metrics/images/1/SDXL.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KAIST-Visual-AI-Group/Flow-Inference-Time-Scaling/72fba24af5ac8d5ce6c90d3b0c68193d188272b3/third-party/t2v_metrics/images/1/SDXL.jpg -------------------------------------------------------------------------------- /third-party/t2v_metrics/images/example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KAIST-Visual-AI-Group/Flow-Inference-Time-Scaling/72fba24af5ac8d5ce6c90d3b0c68193d188272b3/third-party/t2v_metrics/images/example.png -------------------------------------------------------------------------------- /third-party/t2v_metrics/pyproject.toml: 
-------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "t2v_metrics" 7 | version = "1.2" 8 | description = "Evaluating Text-to-Visual Generation with Image-to-Text Generation." 9 | authors = [ 10 | {name="Zhiqiu Lin", email="zl279@cornell.edu"}, 11 | ] 12 | readme = "README.md" 13 | requires-python = ">=3.10" 14 | classifiers = [ 15 | "Programming Language :: Python :: 3", 16 | "License :: OSI Approved :: Apache Software License", 17 | ] 18 | dependencies = [ 19 | "ftfy>=6.1.1", 20 | "tqdm>=4.64.1", 21 | "gdown>=4.7.1", 22 | "huggingface-hub>=0.19.4", 23 | "open-clip-torch>=2.23.0", 24 | "openai>=1.29.0", 25 | "opencv-python>=4.11.0.86", 26 | "opencv-python-headless", 27 | "pandas>=2.1.4", 28 | "scipy>=1.11.4", 29 | "sentencepiece>=0.1.99", 30 | "transformers>=4.48.1", 31 | "datasets>=2.15.0", 32 | "tokenizers", 33 | "omegaconf", 34 | "iopath", 35 | "fairscale", 36 | # for clipscore 37 | "scikit-learn", 38 | "pycocoevalcap", 39 | "image-reward", 40 | "hpsv2", 41 | "fire==0.4.0", 42 | "tiktoken>=0.7.0", 43 | ] 44 | 45 | [tool.setuptools] 46 | include-package-data = true 47 | packages = ["t2v_metrics", "t2v_metrics.models"] 48 | 49 | [tool.setuptools.package-data] 50 | 't2v_metrics' = ['**/*.json', '**/*.yaml', '**/*.py'] 51 | 52 | [project.urls] 53 | Home = "https://linzhiqiu.github.io/papers/vqascore" 54 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers 2 | datasets 3 | scipy 4 | sentencepiece 5 | gdown 6 | tqdm 7 | ftfy 8 | regex 9 | git+https://github.com/openai/CLIP.git 10 | open-clip-torch 11 | opencv-python 12 | opencv-python-headless 13 | openai 14 | einops 15 | pandas 16 | # salesforce-lavis -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | 6 | from .constants import HF_CACHE_DIR 7 | from .vqascore import VQAScore, list_all_vqascore_models 8 | from .clipscore import CLIPScore, list_all_clipscore_models 9 | from .itmscore import ITMScore, list_all_itmscore_models 10 | 11 | def list_all_models(): 12 | return list_all_vqascore_models() + list_all_clipscore_models() + list_all_itmscore_models() 13 | 14 | def get_score_model(model='clip-flant5-xxl', device='cuda', cache_dir=HF_CACHE_DIR, **kwargs): 15 | if model in list_all_vqascore_models(): 16 | return VQAScore(model, device=device, cache_dir=cache_dir, **kwargs) 17 | elif model in list_all_clipscore_models(): 18 | return CLIPScore(model, device=device, cache_dir=cache_dir, **kwargs) 19 | elif model in list_all_itmscore_models(): 20 | return ITMScore(model, device=device, cache_dir=cache_dir, **kwargs) 21 | else: 22 | raise NotImplementedError() -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/clipscore.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from .score import Score 4 | 5 | from .constants import HF_CACHE_DIR 6 | 7 | from .models.clipscore_models import list_all_clipscore_models, 
get_clipscore_model 8 | 9 | class CLIPScore(Score): 10 | def prepare_scoremodel(self, 11 | model='openai:ViT-L-14', 12 | device='cuda', 13 | cache_dir=HF_CACHE_DIR): 14 | return get_clipscore_model( 15 | model, 16 | device=device, 17 | cache_dir=cache_dir 18 | ) 19 | 20 | def list_all_models(self) -> List[str]: 21 | return list_all_clipscore_models() -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/constants.py: -------------------------------------------------------------------------------- 1 | HF_CACHE_DIR = "./hf_cache/" # TODO: change this to your own cache dir 2 | 3 | # For CLIP-FlanT5 and LLaVA-1.5 (copied from llava) 4 | CONTEXT_LEN = 2048 5 | SYSTEM_MSG = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions." 6 | IGNORE_INDEX = -100 7 | IMAGE_TOKEN_INDEX = -200 8 | DEFAULT_IMAGE_TOKEN = "<image>" -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/itmscore.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from .score import Score 4 | 5 | from .constants import HF_CACHE_DIR 6 | 7 | from .models.itmscore_models import list_all_itmscore_models, get_itmscore_model 8 | 9 | class ITMScore(Score): 10 | def prepare_scoremodel(self, 11 | model='blip2-itm', 12 | device='cuda', 13 | cache_dir=HF_CACHE_DIR): 14 | return get_itmscore_model( 15 | model, 16 | device=device, 17 | cache_dir=cache_dir 18 | ) 19 | 20 | def list_all_models(self) -> List[str]: 21 | return list_all_itmscore_models() -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KAIST-Visual-AI-Group/Flow-Inference-Time-Scaling/72fba24af5ac8d5ce6c90d3b0c68193d188272b3/third-party/t2v_metrics/t2v_metrics/models/__init__.py -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/clipscore_models/__init__.py: -------------------------------------------------------------------------------- 1 | from .clip_model import CLIP_MODELS, CLIPScoreModel 2 | from .blip2_itc_model import BLIP2_ITC_MODELS, BLIP2ITCScoreModel 3 | from .hpsv2_model import HPSV2_MODELS, HPSV2ScoreModel 4 | from .pickscore_model import PICKSCORE_MODELS, PickScoreModel 5 | from ...constants import HF_CACHE_DIR 6 | 7 | ALL_CLIP_MODELS = [ 8 | CLIP_MODELS, 9 | BLIP2_ITC_MODELS, 10 | HPSV2_MODELS, 11 | PICKSCORE_MODELS, 12 | ] 13 | 14 | def list_all_clipscore_models(): 15 | return [model for models in ALL_CLIP_MODELS for model in models] 16 | 17 | def get_clipscore_model(model_name, device='cuda', cache_dir=HF_CACHE_DIR): 18 | assert model_name in list_all_clipscore_models() 19 | if model_name in CLIP_MODELS: 20 | return CLIPScoreModel(model_name, device=device, cache_dir=cache_dir) 21 | elif model_name in BLIP2_ITC_MODELS: 22 | return BLIP2ITCScoreModel(model_name, device=device, cache_dir=cache_dir) 23 | elif model_name in HPSV2_MODELS: 24 | return HPSV2ScoreModel(model_name, device=device, cache_dir=cache_dir) 25 | elif model_name in PICKSCORE_MODELS: 26 | return PickScoreModel(model_name, device=device, cache_dir=cache_dir) 27 | else: 28 | raise NotImplementedError()
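As a quick illustration of how these factories are consumed (a sketch, not a file from the repository): it assumes t2v_metrics is installed, that the snippet runs from the third-party/t2v_metrics directory so the bundled example image resolves, and that the caption is made up:

import t2v_metrics

# Any name returned by list_all_vqascore_models(), list_all_clipscore_models(),
# or list_all_itmscore_models() is accepted; get_score_model dispatches on it.
score_fn = t2v_metrics.get_score_model(model="clip-flant5-xxl", device="cuda")

# The score object is called directly with image paths (or PIL images) and texts,
# exactly as vqa_server.py does above.
scores = score_fn(
    images=["images/example.png"],                      # bundled example image
    texts=["a made-up caption describing the image"],   # placeholder prompt
)
print(scores)

Swapping model for a CLIPScore name such as "openai:ViT-L-14" or an ITMScore name such as "image-reward-v1" routes construction through get_clipscore_model or get_itmscore_model above instead.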
-------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/clipscore_models/clip_model.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | from typing import List 3 | import torch 4 | import open_clip 5 | 6 | from ..model import ScoreModel 7 | from ...constants import HF_CACHE_DIR 8 | 9 | CLIP_MODELS = [f"{pretrained}:{arch}" for arch, pretrained in open_clip.list_pretrained()] 10 | 11 | class CLIPScoreModel(ScoreModel): 12 | "A wrapper for OpenCLIP models (including openAI's CLIP, OpenCLIP, DatacompCLIP)" 13 | def __init__(self, 14 | model_name='openai:ViT-L-14', 15 | device='cuda', 16 | cache_dir=HF_CACHE_DIR): 17 | assert model_name in CLIP_MODELS 18 | super().__init__(model_name=model_name, 19 | device=device, 20 | cache_dir=cache_dir) 21 | 22 | def load_model(self): 23 | """Load the model, tokenizer, image transform 24 | """ 25 | self.pretrained, self.arch = self.model_name.split(':') 26 | self.model, _, self.preprocess = open_clip.create_model_and_transforms( 27 | self.arch, 28 | pretrained=self.pretrained, 29 | device=self.device, 30 | cache_dir=self.cache_dir 31 | ) 32 | self.tokenizer = open_clip.get_tokenizer(self.arch) 33 | self.model.eval() 34 | 35 | def load_images(self, 36 | image: List[str]) -> torch.Tensor: 37 | """Load the image(s), and return a tensor (after preprocessing) put on self.device 38 | """ 39 | image = [self.image_loader(x) for x in image] 40 | image = [self.preprocess(x) for x in image] 41 | image = torch.stack(image, dim=0).to(self.device) 42 | return image 43 | 44 | @torch.no_grad() 45 | def forward(self, 46 | images: List[str], 47 | texts: List[str]) -> torch.Tensor: 48 | """Forward pass of the model to return n scores for n (image, text) pairs (in PyTorch Tensor) 49 | """ 50 | assert len(images) == len(texts) 51 | image = self.load_images(images) 52 | text = self.tokenizer(texts).to(self.device) 53 | image_features = self.model.encode_image(image) 54 | image_features /= image_features.norm(dim=-1, keepdim=True) 55 | text_features = self.model.encode_text(text) 56 | text_features /= text_features.norm(dim=-1, keepdim=True) 57 | 58 | # return cosine similarity as scores 59 | return (image_features * text_features).sum(dim=-1) -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | import torch 3 | 4 | from ..model import ScoreModel 5 | from ...constants import HF_CACHE_DIR 6 | 7 | HPSV2_MODELS = ['hpsv2'] 8 | 9 | class HPSV2ScoreModel(ScoreModel): 10 | "A wrapper for HPSv2 models " 11 | def __init__(self, 12 | model_name='openai:ViT-L-14', 13 | device='cuda', 14 | cache_dir=HF_CACHE_DIR): 15 | assert model_name in HPSV2_MODELS 16 | super().__init__(model_name=model_name, 17 | device=device, 18 | cache_dir=cache_dir) 19 | 20 | def load_model(self): 21 | """Load the model, tokenizer, image transform 22 | """ 23 | import hpsv2 24 | self.hpsv2 = hpsv2 25 | 26 | def load_images(self, 27 | image: List[str]): 28 | """Load the image(s), and return a tensor (after preprocessing) put on self.device 29 | """ 30 | images = [self.image_loader(x) for x in image] 31 | return images 32 | 33 | @torch.no_grad() 34 | def forward(self, 35 | images: List[str], 36 | texts: List[str]) -> torch.Tensor: 37 | """Forward pass of the model 
to return n scores for n (image, text) pairs (in PyTorch Tensor) 38 | """ 39 | assert len(images) == len(texts) 40 | images = self.load_images(images) 41 | scores = torch.zeros(len(images), dtype=torch.float16).to(self.device) 42 | for i in range(len(images)): 43 | caption = texts[i] 44 | image = images[i] 45 | scores[i] = float(self.hpsv2.score(image, caption)[0]) 46 | 47 | # return cosine similarity as scores 48 | return scores -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/clipscore_models/pickscore_model.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | from typing import List 3 | import torch 4 | from transformers import AutoProcessor, AutoModel 5 | from PIL import Image 6 | 7 | from ..model import ScoreModel 8 | from ...constants import HF_CACHE_DIR 9 | 10 | PICKSCORE_MODELS = ['pickscore-v1'] 11 | 12 | class PickScoreModel(ScoreModel): 13 | "A wrapper for PickScore models" 14 | def __init__(self, 15 | model_name='pickscore-v1', 16 | device='cuda', 17 | cache_dir=HF_CACHE_DIR): 18 | assert model_name in PICKSCORE_MODELS 19 | super().__init__(model_name=model_name, 20 | device=device, 21 | cache_dir=cache_dir) 22 | 23 | def load_model(self): 24 | """Load the model, tokenizer, image transform 25 | """ 26 | assert self.model_name == 'pickscore-v1' 27 | processor_name_or_path = "laion/CLIP-ViT-H-14-laion2B-s32B-b79K" 28 | model_pretrained_name_or_path = "yuvalkirstain/PickScore_v1" 29 | 30 | self.processor = AutoProcessor.from_pretrained(processor_name_or_path) 31 | self.model = AutoModel.from_pretrained(model_pretrained_name_or_path).eval().to(self.device) 32 | 33 | def load_images(self, 34 | image: List[str]) -> torch.Tensor: 35 | """Load the image(s), and return a tensor (no preprocessing!!) 
put on self.device 36 | """ 37 | image = [self.image_loader(x) for x in image] 38 | image = self.processor(images=image, padding=True, truncation=True, max_length=77, return_tensors="pt").to(self.device) 39 | # image = torch.stack(image, dim=0).to(self.device) 40 | return image 41 | 42 | @torch.no_grad() 43 | def forward(self, 44 | images: List[str], 45 | texts: List[str]) -> torch.Tensor: 46 | """Forward pass of the model to return n scores for n (image, text) pairs (in PyTorch Tensor) 47 | """ 48 | assert len(images) == len(texts) 49 | image = self.load_images(images) 50 | text_inputs = self.processor( 51 | text=texts, 52 | padding=True, 53 | truncation=True, 54 | max_length=77, 55 | return_tensors="pt", 56 | ).to(self.device) 57 | 58 | # embed 59 | image_embs = self.model.get_image_features(**image) 60 | image_embs = image_embs / torch.norm(image_embs, dim=-1, keepdim=True) 61 | 62 | text_embs = self.model.get_text_features(**text_inputs) 63 | text_embs = text_embs / torch.norm(text_embs, dim=-1, keepdim=True) 64 | 65 | # score 66 | scores = (image_embs * text_embs).sum(dim=-1) 67 | return scores 68 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/itmscore_models/__init__.py: -------------------------------------------------------------------------------- 1 | from .blip2_itm_model import BLIP2_ITM_MODELS, BLIP2ITMScoreModel 2 | from .image_reward_model import IMAGE_REWARD_MODELS, ImageRewardScoreModel 3 | from ...constants import HF_CACHE_DIR 4 | 5 | ALL_ITM_MODELS = [ 6 | BLIP2_ITM_MODELS, 7 | IMAGE_REWARD_MODELS, 8 | ] 9 | 10 | def list_all_itmscore_models(): 11 | return [model for models in ALL_ITM_MODELS for model in models] 12 | 13 | def get_itmscore_model(model_name, device='cuda', cache_dir=HF_CACHE_DIR): 14 | assert model_name in list_all_itmscore_models() 15 | if model_name in BLIP2_ITM_MODELS: 16 | return BLIP2ITMScoreModel(model_name, device=device, cache_dir=cache_dir) 17 | elif model_name in IMAGE_REWARD_MODELS: 18 | return ImageRewardScoreModel(model_name, device=device, cache_dir=cache_dir) 19 | else: 20 | raise NotImplementedError() -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/itmscore_models/image_reward_model.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | import torch 3 | import os 4 | from torchvision import transforms 5 | 6 | import ImageReward as reward 7 | from ..model import ScoreModel 8 | from ...constants import HF_CACHE_DIR 9 | 10 | IMAGE_REWARD_MODELS = { 11 | 'image-reward-v1': {'variant': "ImageReward-v1.0"}, 12 | } 13 | 14 | class ImageRewardScoreModel(ScoreModel): 15 | "A wrapper for ImageReward ITMScore (finetuned on human preference) models" 16 | def __init__(self, 17 | model_name='image-reward-v1', 18 | device='cuda', 19 | cache_dir=HF_CACHE_DIR): 20 | assert model_name in IMAGE_REWARD_MODELS, f"Model name must be one of {IMAGE_REWARD_MODELS.keys()}" 21 | os.environ['TORCH_HOME'] = cache_dir 22 | import timm.models.hub as timm_hub 23 | super().__init__(model_name=model_name, 24 | device=device, 25 | cache_dir=cache_dir) 26 | 27 | def load_model(self): 28 | """Load the model, tokenizer, image transform 29 | """ 30 | self.variant = IMAGE_REWARD_MODELS[self.model_name]['variant'] 31 | self.model = reward.load(self.variant).to(self.device).eval() 32 | 33 | def load_images(self, 34 | image: List[str]) -> torch.Tensor: 35 | """Load the 
image(s), and return a tensor (after preprocessing) put on self.device 36 | """ 37 | image = [self.image_loader(x) for x in image] 38 | image = [self.model.preprocess(image) for image in image] 39 | assert all(x.shape == image[0].shape for x in image) 40 | image = torch.stack(image, dim=0).to(self.device) 41 | return image 42 | 43 | @torch.no_grad() 44 | def forward(self, 45 | images: List[str], 46 | texts: List[str]) -> torch.Tensor: 47 | """Forward pass of the model to return n scores for n (image, text) pairs (in PyTorch Tensor) 48 | """ 49 | assert len(images) == len(texts), "Number of images and texts must match" 50 | rewards = torch.zeros(len(texts), dtype=torch.float32).to(self.device) 51 | images = self.load_images(images) 52 | for index in range(len(texts)): 53 | text_input = self.model.blip.tokenizer( 54 | texts[index], padding='max_length', 55 | truncation=True, max_length=35, return_tensors="pt").to(self.device) 56 | image_embeds = self.model.blip.visual_encoder(images[index].unsqueeze(0)) 57 | image_atts = torch.ones(image_embeds.size()[:-1],dtype=torch.long).to(self.device) 58 | text_output = self.model.blip.text_encoder( 59 | text_input.input_ids, 60 | attention_mask = text_input.attention_mask, 61 | encoder_hidden_states = image_embeds, 62 | encoder_attention_mask = image_atts, 63 | return_dict = True, 64 | ) 65 | 66 | txt_features = text_output.last_hidden_state[:,0,:].float() # (feature_dim) 67 | reward_score = self.model.mlp(txt_features) 68 | reward_score = (reward_score - self.model.mean) / self.model.std 69 | 70 | rewards[index] = reward_score 71 | 72 | return rewards -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/model.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import List 3 | import os 4 | import torch 5 | import numpy as np 6 | from PIL import Image 7 | 8 | from ..constants import HF_CACHE_DIR 9 | 10 | 11 | 12 | def image_loader(image_path): 13 | # >>> ADDED ######################################################### 14 | if type(image_path) == Image.Image: 15 | return image_path.convert("RGB") 16 | # <<< ADDED ######################################################### 17 | elif image_path.split('.')[-1] == 'npy': 18 | return Image.fromarray(np.load(image_path)[:, :, [2, 1, 0]], 'RGB') 19 | else: 20 | return Image.open(image_path).convert("RGB") 21 | 22 | class ScoreModel(ABC): 23 | def __init__(self, 24 | model_name='clip-flant5-xxl', 25 | device='cuda', 26 | cache_dir=HF_CACHE_DIR): 27 | self.model_name = model_name 28 | self.device = device 29 | self.cache_dir = cache_dir 30 | if not os.path.exists(self.cache_dir): 31 | os.makedirs(self.cache_dir) 32 | self.image_loader = image_loader 33 | self.load_model() 34 | 35 | @abstractmethod 36 | def load_model(self): 37 | """Load the model, tokenizer, and etc. 
38 | """ 39 | pass 40 | 41 | @abstractmethod 42 | def load_images(self, 43 | image: List[str]) -> torch.Tensor: 44 | """Load the image(s), and return a tensor (after preprocessing) put on self.device 45 | """ 46 | pass 47 | 48 | @abstractmethod 49 | def forward(self, 50 | images: List[str], 51 | texts: List[str]) -> torch.Tensor: 52 | """Forward pass of the model to return n scores for n (image, text) pairs (in PyTorch Tensor) 53 | """ 54 | pass -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/__init__.py: -------------------------------------------------------------------------------- 1 | from .clip_t5_model import CLIP_T5_MODELS, CLIPT5Model 2 | from .llava_model import LLAVA_MODELS, LLaVAModel 3 | from .llava16_model import LLAVA16_MODELS, LLaVA16Model 4 | from .instructblip_model import InstructBLIP_MODELS, InstructBLIPModel 5 | from .gpt4v_model import GPT4V_MODELS, GPT4VModel 6 | from ...constants import HF_CACHE_DIR 7 | 8 | ALL_VQA_MODELS = [ 9 | CLIP_T5_MODELS, 10 | LLAVA_MODELS, 11 | LLAVA16_MODELS, 12 | InstructBLIP_MODELS, 13 | GPT4V_MODELS, 14 | ] 15 | 16 | def list_all_vqascore_models(): 17 | return [model for models in ALL_VQA_MODELS for model in models] 18 | 19 | def get_vqascore_model(model_name, device='cuda', cache_dir=HF_CACHE_DIR, **kwargs): 20 | assert model_name in list_all_vqascore_models() 21 | if model_name in CLIP_T5_MODELS: 22 | return CLIPT5Model(model_name, device=device, cache_dir=cache_dir, **kwargs) 23 | elif model_name in LLAVA_MODELS: 24 | return LLaVAModel(model_name, device=device, cache_dir=cache_dir, **kwargs) 25 | elif model_name in LLAVA16_MODELS: 26 | return LLaVA16Model(model_name, device=device, cache_dir=cache_dir, **kwargs) 27 | elif model_name in InstructBLIP_MODELS: 28 | return InstructBLIPModel(model_name, device=device, cache_dir=cache_dir, **kwargs) 29 | elif model_name in GPT4V_MODELS: 30 | return GPT4VModel(model_name, device=device, cache_dir=cache_dir, **kwargs) 31 | else: 32 | raise NotImplementedError() -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KAIST-Visual-AI-Group/Flow-Inference-Time-Scaling/72fba24af5ac8d5ce6c90d3b0c68193d188272b3/third-party/t2v_metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .language_model.clip_t5 import CLIPT5ForConditionalGeneration, CLIPT5Config, ModelArguments -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .clip_encoder import CLIPVisionTower 3 | 4 | 5 | def build_vision_tower(vision_tower_cfg, **kwargs): 6 | vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None)) 7 | is_absolute_path_exists = os.path.exists(vision_tower) 8 | if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion"): 9 | 
return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 10 | 11 | raise ValueError(f'Unknown vision tower: {vision_tower}') 12 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig 5 | 6 | 7 | class CLIPVisionTower(nn.Module): 8 | def __init__(self, vision_tower, args, delay_load=False): 9 | super().__init__() 10 | 11 | self.is_loaded = False 12 | 13 | self.vision_tower_name = vision_tower 14 | self.select_layer = args.mm_vision_select_layer 15 | self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch') 16 | 17 | if not delay_load: 18 | self.load_model() 19 | else: 20 | self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name) 21 | 22 | def load_model(self): 23 | self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name) 24 | self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name) 25 | self.vision_tower.requires_grad_(False) 26 | 27 | self.is_loaded = True 28 | 29 | def feature_select(self, image_forward_outs): 30 | image_features = image_forward_outs.hidden_states[self.select_layer] 31 | if self.select_feature == 'patch': 32 | image_features = image_features[:, 1:] 33 | elif self.select_feature == 'cls_patch': 34 | image_features = image_features 35 | else: 36 | raise ValueError(f'Unexpected select feature: {self.select_feature}') 37 | return image_features 38 | 39 | @torch.no_grad() 40 | def forward(self, images): 41 | if type(images) is list: 42 | image_features = [] 43 | for image in images: 44 | image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True) 45 | image_feature = self.feature_select(image_forward_out).to(image.dtype) 46 | image_features.append(image_feature) 47 | else: 48 | image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True) 49 | image_features = self.feature_select(image_forward_outs).to(images.dtype) 50 | 51 | return image_features 52 | 53 | @property 54 | def dummy_feature(self): 55 | return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype) 56 | 57 | @property 58 | def dtype(self): 59 | return self.vision_tower.dtype 60 | 61 | @property 62 | def device(self): 63 | return self.vision_tower.device 64 | 65 | @property 66 | def config(self): 67 | if self.is_loaded: 68 | return self.vision_tower.config 69 | else: 70 | return self.cfg_only 71 | 72 | @property 73 | def hidden_size(self): 74 | return self.config.hidden_size 75 | 76 | @property 77 | def num_patches(self): 78 | return (self.config.image_size // self.config.patch_size) ** 2 79 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import re 4 | 5 | 6 | class IdentityMap(nn.Module): 7 | def __init__(self): 8 | super().__init__() 9 | 10 | def forward(self, x, *args, **kwargs): 11 | return x 12 | 13 | @property 14 | def config(self): 15 | return {"mm_projector_type": 'identity'} 16 | 
17 | 18 | class SimpleResBlock(nn.Module): 19 | def __init__(self, channels): 20 | super().__init__() 21 | self.pre_norm = nn.LayerNorm(channels) 22 | 23 | self.proj = nn.Sequential( 24 | nn.Linear(channels, channels), 25 | nn.GELU(), 26 | nn.Linear(channels, channels) 27 | ) 28 | def forward(self, x): 29 | x = self.pre_norm(x) 30 | return x + self.proj(x) 31 | 32 | 33 | def build_vision_projector(config, delay_load=False, **kwargs): 34 | projector_type = getattr(config, 'mm_projector_type', 'linear') 35 | 36 | if projector_type == 'linear': 37 | return nn.Linear(config.mm_hidden_size, config.hidden_size) 38 | 39 | mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type) 40 | if mlp_gelu_match: 41 | mlp_depth = int(mlp_gelu_match.group(1)) 42 | modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)] 43 | for _ in range(1, mlp_depth): 44 | modules.append(nn.GELU()) 45 | modules.append(nn.Linear(config.hidden_size, config.hidden_size)) 46 | return nn.Sequential(*modules) 47 | 48 | if projector_type == 'identity': 49 | return IdentityMap() 50 | 51 | raise ValueError(f'Unknown projector type: {projector_type}') 52 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import os 9 | import sys 10 | 11 | from omegaconf import OmegaConf 12 | 13 | from .common.registry import registry 14 | 15 | from .models import * 16 | from .processors import * 17 | 18 | 19 | root_dir = os.path.dirname(os.path.abspath(__file__)) 20 | default_cfg = OmegaConf.load(os.path.join(root_dir, "configs/default.yaml")) 21 | 22 | registry.register_path("library_root", root_dir) 23 | repo_root = os.path.join(root_dir, "..") 24 | registry.register_path("repo_root", repo_root) 25 | cache_root = os.path.join(repo_root, default_cfg.env.cache_root) 26 | registry.register_path("cache_root", cache_root) 27 | 28 | registry.register("MAX_INT", sys.maxsize) 29 | registry.register("SPLIT_NAMES", ["train", "val", "test"]) 30 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from matplotlib import pyplot as plt 3 | from scipy.ndimage import filters 4 | from skimage import transform as skimage_transform 5 | 6 | 7 | def getAttMap(img, attMap, blur=True, overlap=True): 8 | attMap -= attMap.min() 9 | if attMap.max() > 0: 10 | attMap /= attMap.max() 11 | attMap = skimage_transform.resize(attMap, (img.shape[:2]), order=3, mode="constant") 12 | if blur: 13 | attMap = filters.gaussian_filter(attMap, 0.02 * max(img.shape[:2])) 14 | attMap -= attMap.min() 15 | attMap /= attMap.max() 16 | cmap = plt.get_cmap("jet") 17 | attMapV = cmap(attMap) 18 | attMapV = np.delete(attMapV, 3, 2) 19 | if overlap: 20 | attMap = ( 21 | 1 * (1 - attMap**0.7).reshape(attMap.shape + (1,)) * img 22 | + (attMap**0.7).reshape(attMap.shape + (1,)) * attMapV 23 | ) 24 | return attMap 25 | -------------------------------------------------------------------------------- 
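For reference, the mm_projector_type strings handled by build_vision_projector in multimodal_projector/builder.py above map directly to small torch modules via the r'^mlp(\d+)x_gelu$' pattern. The short sketch below illustrates one such call; it is not a file from this repository: the SimpleNamespace stands in for the real model config, the 1024/4096 feature sizes are assumed values, and it presumes the inner t2v_metrics package and its dependencies are importable.

from types import SimpleNamespace
import torch
from t2v_metrics.models.vqascore_models.clip_t5.model.multimodal_projector.builder import build_vision_projector

# 'mlp2x_gelu' matches the mlp(N)x_gelu pattern, so a 2-layer GELU MLP is built.
cfg = SimpleNamespace(mm_projector_type="mlp2x_gelu", mm_hidden_size=1024, hidden_size=4096)
projector = build_vision_projector(cfg)        # Linear(1024 -> 4096) -> GELU -> Linear(4096 -> 4096)
feats = projector(torch.randn(2, 576, 1024))   # vision-tower patch features -> (2, 576, 4096)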
/third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | __author__ = "aagrawal" 9 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/datasets/aokvqa/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | aok_vqa: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: 16 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_train.json 17 | storage: 18 | - aokvqa/annotations/aokvqa_v1p0_train.json 19 | val: 20 | url: 21 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_val.json 22 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/specialized_vocab_train.json 23 | storage: 24 | - aokvqa/annotations/aokvqa_v1p0_val.json 25 | - aokvqa/annotations/specialized_vocab_train_lavis.json 26 | # - aokvqa/annotations/large_vocab_train_lavis.json 27 | test: 28 | url: 29 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_test.json 30 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/specialized_vocab_train.json 31 | storage: 32 | - aokvqa/annotations/aokvqa_v1p0_test.json 33 | - aokvqa/annotations/specialized_vocab_train_lavis.json 34 | images: 35 | storage: coco/images/ 36 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/datasets/avsd/defaults_dial.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | avsd_dialogue: # name of the dataset builder 8 | dataset_card: dataset_card/avsd_dialogue.md 9 | data_type: features #extracted features of videos (I3D, VGGish) # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/avsd_dstc7_train.json 16 | storage: avsd/annotations/train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/avsd_dstc7_val.json 19 | storage: avsd/annotations/val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/avsd_dstc7_test.json 22 | storage: avsd/annotations/test.json 23 | features: 24 | storage: avsd/features/ 25 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/datasets/coco/defaults_cap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | coco_caption: # name of the dataset builder 8 | dataset_card: dataset_card/coco_caption.md 9 | # data_dir: ${env.data_dir}/datasets 10 | data_type: images # [images|videos|features] 11 | 12 | build_info: 13 | # Be careful not to append minus sign (-) before split to avoid itemizing 14 | annotations: 15 | train: 16 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_train.json 17 | md5: aa31ac474cf6250ebb81d18348a07ed8 18 | storage: coco/annotations/coco_karpathy_train.json 19 | val: 20 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json 21 | md5: b273847456ef5580e33713b1f7de52a0 22 | storage: coco/annotations/coco_karpathy_val.json 23 | test: 24 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json 25 | md5: 3ff34b0ef2db02d01c37399f6a2a6cd1 26 | storage: coco/annotations/coco_karpathy_test.json 27 | images: 28 | storage: coco/images/ 29 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/datasets/coco/defaults_ret.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | coco_retrieval: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_train.json 16 | md5: aa31ac474cf6250ebb81d18348a07ed8 17 | storage: coco/annotations/coco_karpathy_train.json 18 | val: 19 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json 20 | md5: b273847456ef5580e33713b1f7de52a0 21 | storage: coco/annotations/coco_karpathy_val.json 22 | test: 23 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json 24 | md5: 3ff34b0ef2db02d01c37399f6a2a6cd1 25 | storage: coco/annotations/coco_karpathy_test.json 26 | images: 27 | storage: coco/images/ 28 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/datasets/coco/defaults_vqa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | coco_vqa: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: 16 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_train.json 17 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_val.json 18 | storage: 19 | - coco/annotations/vqa_train.json 20 | - coco/annotations/vqa_val.json 21 | val: 22 | url: 23 | # TODO make this order insensitive 24 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_val_eval.json 25 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/answer_list.json 26 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/v2_OpenEnded_mscoco_val2014_questions.json 27 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/v2_mscoco_val2014_annotations.json 28 | storage: 29 | - coco/annotations/vqa_val_eval.json 30 | - coco/annotations/answer_list.json 31 | - coco/annotations/v2_OpenEnded_mscoco_val2014_questions.json 32 | - coco/annotations/v2_mscoco_val2014_annotations.json 33 | test: 34 | url: 35 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_test.json 36 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/answer_list.json 37 | storage: 38 | - coco/annotations/vqa_test.json 39 | - coco/annotations/answer_list.json 40 | images: 41 | storage: coco/images/ 42 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/datasets/coco/eval_vqa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, 
salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | coco_vqa: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | val: 15 | url: 16 | # TODO make this order insensitive 17 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_val_eval.json 18 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/answer_list.json 19 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/v2_OpenEnded_mscoco_val2014_questions.json 20 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/v2_mscoco_val2014_annotations.json 21 | storage: 22 | - coco/annotations/vqa_val_eval.json 23 | - coco/annotations/answer_list.json 24 | - coco/annotations/v2_OpenEnded_mscoco_val2014_questions.json 25 | - coco/annotations/v2_mscoco_val2014_annotations.json 26 | images: 27 | storage: coco/images/ 28 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/datasets/conceptual_caption/defaults_12m.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | conceptual_caption_12m: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: 16 | - /export/home/workspace/datasets/cc12m.json 17 | storage: 18 | - conceptual_caption/annotations/cc12m.json 19 | images: 20 | storage: conceptual_caption/images_12m 21 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/datasets/conceptual_caption/defaults_3m.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | conceptual_caption_3m: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: 16 | - /export/home/workspace/datasets/cc3m.json 17 | storage: 18 | - conceptual_caption/annotations/cc3m.json 19 | images: 20 | storage: conceptual_caption/images 21 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/datasets/didemo/defaults_ret.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | didemo_retrieval: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/didemo/retrieval_train.json 16 | storage: didemo/annotations/retrieval_train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/didemo/retrieval_val.json 19 | storage: didemo/annotations/retrieval_val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/didemo/retrieval_test.json 22 | storage: didemo/annotations/retrieval_test.json 23 | videos: 24 | storage: didemo/videos 25 | # storage: /export/share/dongxuli/data/didemo_retrieval/videos 26 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/datasets/flickr30k/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | flickr30k: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images 10 | 11 | build_info: 12 | annotations: 13 | train: 14 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_train.json 15 | storage: flickr30k/annotations/train.json 16 | val: 17 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_val.json 18 | storage: flickr30k/annotations/val.json 19 | test: 20 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_test.json 21 | storage: flickr30k/annotations/test.json 22 | images: 23 | storage: flickr30k/images 24 | # storage: /export/share/datasets/vision/flickr30k 25 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/datasets/gqa/balanced_testdev.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | gqa: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: 16 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/train_balanced_questions.json 17 | storage: 18 | - gqa/annotations/train_balanced_questions.json 19 | val: 20 | url: 21 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/testdev_balanced_questions.json 22 | storage: 23 | - gqa/annotations/testdev_balanced_questions.json 24 | test: 25 | url: 26 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/test_balanced_questions.json 27 | storage: 28 | - gqa/annotations/test_balanced_questions.json 29 | images: 30 | storage: gqa/images/ 31 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/datasets/gqa/balanced_val.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | gqa: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: 16 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/train_balanced_questions.json 17 | storage: 18 | - gqa/annotations/train_balanced_questions.json 19 | val: 20 | url: 21 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/val_balanced_questions.json 22 | storage: 23 | - gqa/annotations/val_balanced_questions.json 24 | test: 25 | url: 26 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/test_balanced_questions.json 27 | storage: 28 | - gqa/annotations/test_balanced_questions.json 29 | images: 30 | storage: gqa/images/ 31 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/datasets/gqa/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | gqa: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: 16 | - /export/share/datasets/vision/GQA/questions1.2/train_all_questions/train_all_questions_0.json 17 | - /export/share/datasets/vision/GQA/questions1.2/val_all_questions.json 18 | storage: 19 | - gqa/annotations/train_all_questions_0.json 20 | - gqa/annotations/val_all_questions.json 21 | val: 22 | url: 23 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_val.json 24 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/large_vocab_train_lavis.json 25 | storage: 26 | - aokvqa/annotations/aokvqa_v1p0_val.json 27 | - aokvqa/annotations/large_vocab_train_lavis.json 28 | test: 29 | url: 30 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_test.json 31 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/large_vocab_train_lavis.json 32 | storage: 33 | - aokvqa/annotations/aokvqa_v1p0_test.json 34 | - aokvqa/annotations/large_vocab_train_lavis.json 35 | images: 36 | storage: gqa/images/ 37 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/datasets/imagenet/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | imagenet: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | splits: ["val"] 14 | images: 15 | storage: /export/share/datasets/vision/imagenet 16 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/datasets/laion/defaults_2B_multi.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | laion2B_multi: 8 | 9 | data_type: images 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | storage: /export/laion/laion2B-multi/part-00000/{00000..01743}.tar 14 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/datasets/msrvtt/defaults_cap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | msrvtt_cap: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_train.json 16 | storage: msrvtt/annotations/cap_train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_val.json 19 | storage: msrvtt/annotations/cap_val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_test.json 22 | storage: msrvtt/annotations/cap_test.json 23 | videos: 24 | storage: msrvtt/videos 25 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/datasets/msrvtt/defaults_qa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | msrvtt_qa: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_train.json 16 | storage: msrvtt/annotations/qa_train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_val.json 19 | storage: msrvtt/annotations/qa_val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/qa_test.json 22 | storage: msrvtt/annotations/qa_test.json 23 | ans2label: 24 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/train_ans2label.json 25 | storage: msrvtt/annotations/qa_ans2label.json 26 | videos: 27 | storage: msrvtt/videos 28 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/datasets/msrvtt/defaults_ret.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | msrvtt_retrieval: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/retrieval_train.json 16 | storage: msrvtt/annotations/retrieval_train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/retrieval_val.json 19 | storage: msrvtt/annotations/retrieval_val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/retrieval_test.json 22 | storage: msrvtt/annotations/retrieval_test.json 23 | videos: 24 | storage: msrvtt/videos 25 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/datasets/msvd/defaults_cap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | msvd_cap: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_train.json 16 | storage: msvd/annotations/cap_train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_val.json 19 | storage: msvd/annotations/cap_val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_test.json 22 | storage: msvd/annotations/cap_test.json 23 | videos: 24 | storage: msvd/videos 25 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/datasets/msvd/defaults_qa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | msvd_qa: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_train.json 16 | storage: msvd/annotations/qa_train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_val.json 19 | storage: msvd/annotations/qa_val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_test.json 22 | storage: msvd/annotations/qa_test.json 23 | ans2label: 24 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/train_ans2label.json 25 | storage: msvd/annotations/qa_ans2label.json 26 | videos: 27 | storage: msvd/videos 28 | 29 | instance_id_key: question_id 30 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/datasets/nlvr/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | nlvr: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/nlvr/nlvr_train.json 16 | storage: nlvr/annotations/train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/nlvr/nlvr_dev.json 19 | storage: nlvr/annotations/dev.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/nlvr/nlvr_dev.json 22 | storage: nlvr/annotations/test.json 23 | images: 24 | storage: /export/share/datasets/vision/NLVR2/ 25 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/datasets/nocaps/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | nocaps: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | val: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/nocaps_val.json 16 | storage: nocaps/annotations/nocaps_val.json 17 | test: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/nocaps_test.json 19 | storage: nocaps/annotations/nocaps_test.json 20 | images: 21 | storage: nocaps/images 22 | # storage: /export/share/datasets/vision/nocaps/ 23 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/datasets/okvqa/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | ok_vqa: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: 16 | # TODO make this order insensitive 17 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_train.json 18 | # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_train2014_questions.json 19 | # - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_train2014_annotations.json 20 | storage: 21 | - okvqa/annotations/okvqa_train.json 22 | # - okvqa/annotations/OpenEnded_mscoco_train2014_questions.json 23 | # - okvqa/annotations/mscoco_train2014_annotations.json 24 | test: 25 | url: 26 | # TODO make this order insensitive 27 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_val_eval.json 28 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_answer_list_train.json 29 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_val2014_questions.json 30 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_val2014_annotations.json 31 | storage: 32 | - okvqa/annotations/vqa_val_eval.json 33 | - okvqa/annotations/answer_list.json 34 | - okvqa/annotations/OpenEnded_mscoco_val2014_questions.json 35 | - okvqa/annotations/mscoco_val2014_annotations.json 36 | images: 37 | storage: coco/images/ 38 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/datasets/sbu_caption/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | sbu_caption: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: 16 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/sbu/sbu.json 17 | # - /export/share/dongxuli/data/lavis/sbu/annotation/sbu.json 18 | storage: 19 | - sbu_captions/annotations/sbu.json 20 | images: 21 | storage: sbu_captions/images 22 | # storage: /export/share/datasets/vision_language/sbu_resize 23 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/datasets/snli_ve/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | snli_ve: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: /export/share/dongxuli/data/lavis/snli/annotation/ve_train.json 16 | storage: snli/annotations/ve_train.json 17 | val: 18 | url: /export/share/dongxuli/data/lavis/snli/annotation/ve_dev.json 19 | storage: snli/annotations/ve_dev.json 20 | test: 21 | url: /export/share/dongxuli/data/lavis/snli/annotation/ve_test.json 22 | storage: snli/annotations/ve_test.json 23 | images: 24 | storage: flickr30k/images/flickr30k-images 25 | # storage: /export/share/datasets/vision/flickr30k/flickr30k-images 26 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/datasets/vatex/defaults_cap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | msvd_cap: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_train.json 16 | storage: vatex/annotations/cap_train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_val.json 19 | storage: vatex/annotations/cap_val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_private_test.json 22 | storage: vatex/annotations/cap_test.json 23 | videos: 24 | storage: /export/share/dongxuli/data/vatex 25 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/datasets/vg/defaults_caption.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | vg_caption: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/visual_genome/vg_caption.json 16 | storage: vg/annotations/vg_caption.json 17 | images: 18 | storage: vg/images/ 19 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/datasets/vg/defaults_vqa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | vg_vqa: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/visual_genome/vg_qa.json 16 | storage: vg/annotations/vg_qa.json 17 | images: 18 | storage: vg/images/ 19 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | env: 7 | # For default users 8 | # cache_root: "cache" 9 | # For internal use with persistent storage 10 | cache_root: "/export/home/.cache/lavis" 11 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/albef_classification_ve.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_classification 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_snli_ve_lavis.pt" 11 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 12 | 13 | num_classes: 3 14 | 15 | use_distill: True 16 | momentum: 0.995 17 | alpha: 0.4 18 | 19 | # vit encoder 20 | vit_type: "base" 21 | vit_grad_ckpt: False 22 | vit_ckpt_layer: 0 23 | vit_layer_norm_epsilon: 1e-6 24 | 25 | image_size: 384 26 | 27 | # bert config 28 | med_config_path: "configs/models/med_config_albef.json" 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | eval: 35 | name: "blip_image_eval" 36 | text_processor: 37 | train: 38 | name: "blip_caption" 39 | eval: 40 | name: "blip_caption" 41 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/albef_feature_extractor.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_pretrain 8 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 9 | 10 | # vit encoder 11 | vit_type: "base" 12 | image_size: 224 13 | vit_ckpt_layer: 0 14 | vit_drop_path_rate: 0 15 | vit_layer_norm_epsilon: 1e-6 16 | vit_grad_ckpt: False 17 | 18 | # bert config 19 | med_config_path: "configs/models/med_config_albef.json" 20 | 21 | embed_dim: 256 22 | 23 | preprocess: 24 | vis_processor: 25 | eval: 26 | name: "blip_image_eval" 27 | image_size: 224 28 | text_processor: 29 | eval: 30 | name: "blip_caption" 31 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/albef_nlvr.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_nlvr 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/pretrain_model_nlvr.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_nlvr_lavis.pt" 12 | 13 | num_classes: 2 14 | 15 | use_distill: True 16 | momentum: 0.995 17 | alpha: 0.4 18 | 19 | # vit encoder 20 | vit_type: "base" 21 | vit_grad_ckpt: False 22 | vit_ckpt_layer: 0 23 | vit_layer_norm_epsilon: 1e-6 24 | 25 | image_size: 384 26 | 27 | # bert config 28 | med_config_path: "configs/models/med_config_albef.json" 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 384 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 384 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/albef_pretrain_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_pretrain 8 | 9 | load_pretrained: True 10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 11 | 12 | # vit encoder 13 | vit_type: "base" 14 | image_size: 224 15 | vit_ckpt_layer: 0 16 | vit_drop_path_rate: 0 17 | vit_layer_norm_epsilon: 1e-6 18 | vit_grad_ckpt: False 19 | 20 | # bert config 21 | med_config_path: "configs/models/med_config_albef.json" 22 | mlm_mask_prob: 0.15 23 | 24 | embed_dim: 256 25 | momentum: 0.995 26 | alpha: 0.4 27 | temp: 0.07 28 | 29 | max_txt_len: 30 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip_image_train" 35 | image_size: 256 36 | text_processor: 37 | train: 38 | name: "blip_caption" 39 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/albef_retrieval_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_retrieval 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_coco_retrieval_lavis.pt" 12 | 13 | queue_size: 65536 14 | 15 | # vit encoder 16 | vit_type: "base" 17 | image_size: 384 18 | vit_ckpt_layer: 0 19 | vit_drop_path_rate: 0 20 | vit_layer_norm_epsilon: 1e-6 21 | vit_grad_ckpt: False 22 | 23 | # bert config 24 | med_config_path: "configs/models/med_config_albef.json" 25 | 26 | embed_dim: 256 27 | momentum: 0.995 28 | alpha: 0.4 29 | temp: 0.07 30 | use_distill: True 31 | 32 | max_txt_len: 30 33 | 34 | preprocess: 35 | vis_processor: 36 | train: 37 | name: "blip_image_train" 38 | image_size: 384 39 | eval: 40 | name: "blip_image_eval" 41 | image_size: 384 42 | text_processor: 43 | train: 44 | name: "blip_caption" 45 | eval: 46 | name: "blip_caption" 47 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/albef_retrieval_flickr.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_retrieval 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 11 | finetuned: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_flickr_retrieval_lavis.pt 12 | 13 | queue_size: 65536 14 | 15 | # vit encoder 16 | vit_type: "base" 17 | image_size: 384 18 | vit_ckpt_layer: 0 19 | vit_drop_path_rate: 0 20 | vit_layer_norm_epsilon: 1e-6 21 | vit_grad_ckpt: False 22 | 23 | # bert config 24 | med_config_path: "configs/models/med_config_albef.json" 25 | 26 | embed_dim: 256 27 | momentum: 0.995 28 | alpha: 0.4 29 | temp: 0.07 30 | use_distill: True 31 | 32 | max_txt_len: 30 33 | 34 | preprocess: 35 | vis_processor: 36 | train: 37 | name: "blip_image_train" 38 | image_size: 384 39 | eval: 40 | name: "blip_image_eval" 41 | image_size: 384 42 | text_processor: 43 | train: 44 | name: "blip_caption" 45 | eval: 46 | name: "blip_caption" 47 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/albef_vqav2.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_vqa 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_vqav2_lavis.pt" 12 | 13 | use_distill: True 14 | momentum: 0.995 15 | alpha: 0.4 16 | 17 | # vit encoder 18 | vit_type: "base" 19 | vit_grad_ckpt: False 20 | vit_ckpt_layer: 0 21 | vit_layer_norm_epsilon: 1e-6 22 | 23 | image_size: 384 24 | 25 | # bert config 26 | med_config_path: "configs/models/med_config_albef.json" 27 | 28 | preprocess: 29 | vis_processor: 30 | train: 31 | name: "blip_image_train" 32 | image_size: 384 33 | eval: 34 | name: "blip_image_eval" 35 | image_size: 384 36 | text_processor: 37 | train: 38 | name: "blip_question" 39 | eval: 40 | name: "blip_question" 41 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/alpro_qa_msrvtt.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_qa 8 | num_classes: 1500 9 | 10 | load_finetuned: True 11 | 12 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_msrvtt_qa.pth" 13 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" 14 | 15 | timesformer: 16 | n_frms: 16 17 | image_size: 224 18 | 19 | patch_size: 16 20 | attn_drop_rate: 0. 21 | drop_rate: 0. 22 | drop_path_rate: 0.1 23 | 24 | use_grad_ckpt: True 25 | ckpt_layer: 12 26 | 27 | # bert config 28 | med_config_path: "configs/models/bert_config_alpro.json" 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "alpro_video_train" 34 | n_frms: 16 35 | image_size: 224 36 | eval: 37 | name: "alpro_video_eval" 38 | n_frms: 16 39 | image_size: 224 40 | text_processor: 41 | train: 42 | name: "blip_caption" 43 | eval: 44 | name: "blip_caption" 45 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/alpro_qa_msvd.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_qa 8 | num_classes: 2423 9 | 10 | load_finetuned: True 11 | 12 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_msvd_qa.pth" 13 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" 14 | 15 | timesformer: 16 | n_frms: 16 17 | image_size: 224 18 | 19 | patch_size: 16 20 | attn_drop_rate: 0. 21 | drop_rate: 0. 
22 | drop_path_rate: 0.1 23 | use_grad_ckpt: True 24 | ckpt_layer: 12 25 | 26 | # bert config 27 | med_config_path: "configs/models/bert_config_alpro.json" 28 | 29 | preprocess: 30 | vis_processor: 31 | train: 32 | name: "alpro_video_train" 33 | n_frms: 16 34 | image_size: 224 35 | eval: 36 | name: "alpro_video_eval" 37 | n_frms: 16 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" 44 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/alpro_retrieval_didemo.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_retrieval 8 | 9 | load_finetuned: True 10 | 11 | finetuned: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_didemo_retrieval.pt 12 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" 13 | 14 | timesformer: 15 | n_frms: 8 16 | image_size: 224 17 | 18 | patch_size: 16 19 | attn_drop_rate: 0. 20 | drop_rate: 0. 21 | drop_path_rate: 0.1 22 | use_grad_ckpt: False 23 | 24 | # bert config 25 | med_config_path: "configs/models/bert_config_alpro.json" 26 | 27 | preprocess: 28 | vis_processor: 29 | eval: 30 | name: "alpro_video_eval" 31 | n_frms: 8 32 | image_size: 224 33 | text_processor: 34 | eval: 35 | name: "blip_caption" 36 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/alpro_retrieval_msrvtt.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_retrieval 8 | 9 | load_finetuned: True 10 | 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_msrvtt_retrieval.pt" 12 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" 13 | 14 | timesformer: 15 | n_frms: 8 16 | image_size: 224 17 | 18 | patch_size: 16 19 | attn_drop_rate: 0. 20 | drop_rate: 0. 
21 | drop_path_rate: 0.1 22 | use_grad_ckpt: False 23 | 24 | # bert config 25 | med_config_path: "configs/models/bert_config_alpro.json" 26 | 27 | preprocess: 28 | vis_processor: 29 | train: 30 | name: "alpro_video_train" 31 | n_frms: 8 32 | image_size: 224 33 | eval: 34 | name: "alpro_video_eval" 35 | n_frms: 8 36 | image_size: 224 37 | text_processor: 38 | train: 39 | name: "blip_caption" 40 | eval: 41 | name: "blip_caption" 42 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/bert_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": false, 18 | "vocab_size": 30522, 19 | "encoder_width": 768, 20 | "add_cross_attention": true 21 | } -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/bert_config_alpro.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": true, 18 | "type_vocab_size": 2, 19 | "vocab_size": 30522, 20 | "encoder_width": 768, 21 | "add_cross_attention": false, 22 | "fusion_layer": 6 23 | } -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: caption_coco_flant5xl 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xl.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_caption_flant5xl.pth" 12 | 13 | # vit encoder 14 | image_size: 364 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp32" 18 | freeze_vit: False 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # T5 24 | t5_model: "google/flan-t5-xl" 25 | 26 | # generation configs 27 | prompt: "a photo of" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 364 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 364 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: caption_coco_opt2.7b 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt2.7b.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_caption_opt2.7b.pth" 12 | 13 | # vit encoder 14 | image_size: 364 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp32" 18 | freeze_vit: False 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # OPT 24 | opt_model: "facebook/opt-2.7b" 25 | 26 | # generation configs 27 | prompt: "a photo of" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 364 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 364 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: caption_coco_opt6.7b 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt6.7b.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_caption_opt6.7b.pth" 12 | 13 | # vit encoder 14 | image_size: 364 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp32" 18 | freeze_vit: False 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # OPT 24 | opt_model: "facebook/opt-6.7b" 25 | 26 | # generation configs 27 | prompt: "a photo of" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 364 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 364 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: coco 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_finetune_coco.pth" 12 | 13 | # vit encoder 14 | image_size: 364 15 | drop_path_rate: 0 16 | use_grad_checkpoint: True 17 | vit_precision: "fp32" 18 | freeze_vit: False 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 364 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 364 32 | text_processor: 33 | train: 34 | name: "blip_caption" 35 | eval: 36 | name: "blip_caption" 37 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: flant5xl 8 | load_finetuned: False 9 | load_pretrained: True 10 | 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_flanxl_trimmed.pth" 12 | finetuned: "" 13 | 14 | # vit encoder 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # T5 25 | t5_model: "google/flan-t5-xl" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: flant5xxl 8 | load_finetuned: False 9 | load_pretrained: True 10 | 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_flanxxl_trimmed.pth" 12 | finetuned: "" 13 | 14 | # vit encoder 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # T5 25 | t5_model: "google/flan-t5-xxl" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: instruct_vicuna13b 8 | load_finetuned: False 9 | load_pretrained: True 10 | 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_vicuna13b_trimmed.pth" 12 | finetuned: "" 13 | 14 | # vit encoder 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # path to Vicuna checkpoint 25 | llm_model: "lmsys/vicuna-13b-v1.1" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip2_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: instruct_vicuna7b 8 | load_finetuned: False 9 | load_pretrained: True 10 | 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_vicuna7b_trimmed.pth" 12 | finetuned: "" 13 | 14 | # vit encoder 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # path to Vicuna checkpoint 25 | llm_model: "lmsys/vicuna-7b-v1.1" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip2_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 224 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 224 32 | text_processor: 33 | train: 34 | name: "blip_caption" 35 | eval: 36 | name: "blip_caption" 37 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_flant5xl 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xl.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # T5 24 | t5_model: "google/flan-t5-xl" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_flant5xl 8 | load_finetuned: False 9 | 10 | pretrained: "lavis/output/BLIP2/Pretrain_stage2_flant5_xl_batch_80_no_prefix_iter_100000/20231015004/checkpoint_80000.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # T5 24 | t5_model: "google/flan-t5-xl" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_flant5xl 8 | load_finetuned: False 9 | 10 | pretrained: "lavis/output/BLIP2/Pretrain_stage2_flant5_xl_batch_80_prefix_iter_100000/20231015004/checkpoint_80000.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # T5 24 | t5_model: "google/flan-t5-xl" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_flant5xl 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xl_vitL.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | vit_model: "clip_L" 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # T5 25 | t5_model: "google/flan-t5-xl" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" 44 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_flant5xxl 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xxl.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # T5 24 | t5_model: "google/flan-t5-xxl" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_opt2.7b 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt2.7b.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # OPT 24 | opt_model: "facebook/opt-2.7b" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_opt6.7b 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt6.7b.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # OPT 24 | opt_model: "facebook/opt-6.7b" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_vitL.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | vit_model: "clip_L" 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | 25 | preprocess: 26 | vis_processor: 27 | train: 28 | name: "blip_image_train" 29 | image_size: 224 30 | eval: 31 | name: "blip_image_eval" 32 | image_size: 224 33 | text_processor: 34 | train: 35 | name: "blip_caption" 36 | eval: 37 | name: "blip_caption" 38 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip2_vicuna13b 8 | load_finetuned: False 9 | load_pretrained: True 10 | 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_vicuna13b.pth" 12 | finetuned: "" 13 | 14 | # vit encoder 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # path to Vicuna checkpoint 25 | llm_model: "lmsys/vicuna-13b-v1.1" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip2_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip2_vicuna7b 8 | load_finetuned: False 9 | load_pretrained: True 10 | 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_vicuna7b.pth" 12 | finetuned: "" 13 | 14 | # vit encoder 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # path to Vicuna checkpoint 25 | llm_model: "lmsys/vicuna-7b-v1.1" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip2_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip_caption_base_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_coco_caption_base.pth" 12 | 13 | # vit encoder 14 | vit_type: "base" 15 | vit_grad_ckpt: False 16 | vit_ckpt_layer: 0 17 | 18 | image_size: 384 19 | 20 | # bert config 21 | med_config_path: "configs/models/med_config.json" 22 | 23 | # generation configs 24 | prompt: "a picture of " 25 | 26 | 27 | preprocess: 28 | vis_processor: 29 | train: 30 | name: "blip_image_train" 31 | eval: 32 | name: "blip_image_eval" 33 | text_processor: 34 | train: 35 | name: "blip_caption" 36 | prompt: "a picture of " 37 | eval: 38 | name: "blip_caption" 39 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip_caption_large_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption.pth" 12 | 13 | vit_type: "large" 14 | vit_grad_ckpt: True 15 | vit_ckpt_layer: 5 16 | 17 | image_size: 384 18 | 19 | # bert config 20 | med_config_path: "configs/models/med_large_config.json" 21 | 22 | # generation configs 23 | prompt: "a picture of " 24 | 25 | 26 | preprocess: 27 | vis_processor: 28 | train: 29 | name: "blip_image_train" 30 | eval: 31 | name: "blip_image_eval" 32 | text_processor: 33 | train: 34 | name: "blip_caption" 35 | prompt: "a picture of " 36 | eval: 37 | name: "blip_caption" 38 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip_classification_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_classification 8 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth" 9 | 10 | use_distill: True 11 | momentum: 0.995 12 | alpha: 0.4 13 | 14 | # vit encoder 15 | vit_type: "base" 16 | vit_grad_ckpt: False 17 | vit_ckpt_layer: 0 18 | 19 | image_size: 384 20 | 21 | # bert config 22 | med_config_path: "configs/models/med_config.json" 23 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip_feature_extractor_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_pretrain 8 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 9 | 10 | # vit encoder 11 | vit_type: "base" 12 | vit_grad_ckpt: False 13 | vit_ckpt_layer: 0 14 | 15 | image_size: 224 16 | 17 | # bert config 18 | med_config_path: "configs/models/med_config.json" 19 | 20 | embed_dim: 256 21 | 22 | preprocess: 23 | vis_processor: 24 | eval: 25 | name: "blip_image_eval" 26 | image_size: 224 27 | text_processor: 28 | eval: 29 | name: "blip_caption" 30 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip_itm_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_image_text_matching 8 | 9 | load_finetuned: True 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_retrieval_coco.pth" 11 | 12 | # vit encoder 13 | vit_type: "base" 14 | vit_grad_ckpt: False 15 | vit_ckpt_layer: 0 16 | 17 | image_size: 384 18 | 19 | # bert config 20 | med_config_path: "configs/models/med_config.json" 21 | 22 | embed_dim: 256 23 | 24 | preprocess: 25 | vis_processor: 26 | eval: 27 | name: "blip_image_eval" 28 | image_size: 384 29 | text_processor: 30 | eval: 31 | name: "blip_caption" 32 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip_itm_large.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_image_text_matching 8 | 9 | load_finetuned: True 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_retrieval_coco.pth" 11 | 12 | # vit encoder 13 | vit_type: "large" 14 | vit_grad_ckpt: False 15 | vit_ckpt_layer: 0 16 | 17 | image_size: 384 18 | 19 | # bert config 20 | med_config_path: "configs/models/med_large_config.json" 21 | 22 | embed_dim: 256 23 | 24 | preprocess: 25 | vis_processor: 26 | eval: 27 | name: "blip_image_eval" 28 | image_size: 384 29 | text_processor: 30 | eval: 31 | name: "blip_caption" 32 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip_nlvr.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_nlvr 8 | model_type: nlvr 9 | load_finetuned: True 10 | 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_nlvr.pth" 12 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 13 | 14 | num_classes: 2 15 | 16 | # vit encoder 17 | vit_type: "base" 18 | vit_grad_ckpt: False 19 | vit_ckpt_layer: 0 20 | vit_layer_norm_epsilon: 1e-6 21 | 22 | image_size: 384 23 | 24 | # bert config 25 | med_config_path: "configs/models/med_config.json" 26 | 27 | preprocess: 28 | vis_processor: 29 | train: 30 | name: "blip_image_train" 31 | image_size: 384 32 | eval: 33 | name: "blip_image_eval" 34 | image_size: 384 35 | text_processor: 36 | train: 37 | name: "blip_caption" 38 | eval: 39 | name: "blip_caption" 40 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip_pretrain_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_pretrain 8 | 9 | load_pretrained: True 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 11 | 12 | # vit encoder 13 | vit_type: "base" 14 | vit_grad_ckpt: False 15 | vit_ckpt_layer: 0 16 | 17 | image_size: 224 18 | alpha: 0.4 19 | 20 | # bert config 21 | med_config_path: "configs/models/bert_config.json" 22 | 23 | embed_dim: 256 24 | 25 | # generation configs 26 | prompt: "a picture of " 27 | 28 | preprocess: 29 | vis_processor: 30 | train: 31 | name: "blip_image_train" 32 | image_size: 224 33 | text_processor: 34 | train: 35 | name: "blip_caption" 36 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip_pretrain_large.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_pretrain 8 | 9 | # vit encoder 10 | vit_type: "large" 11 | vit_grad_ckpt: True 12 | vit_ckpt_layer: 5 13 | 14 | image_size: 224 15 | 16 | # bert config 17 | med_config_path: "configs/models/med_large_config.json" 18 | 19 | embed_dim: 256 20 | 21 | # generation configs 22 | prompt: "a picture of " 23 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip_retrieval_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_retrieval 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_coco_retrieval.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 12 | 13 | queue_size: 57600 14 | 15 | # vit encoder 16 | vit_type: "base" 17 | vit_grad_ckpt: True 18 | vit_ckpt_layer: 4 19 | 20 | image_size: 384 21 | 22 | # bert config 23 | med_config_path: "configs/models/med_config.json" 24 | 25 | embed_dim: 256 26 | 27 | preprocess: 28 | vis_processor: 29 | train: 30 | name: "blip_image_train" 31 | image_size: 384 32 | eval: 33 | name: "blip_image_eval" 34 | image_size: 384 35 | text_processor: 36 | train: 37 | name: "blip_caption" 38 | eval: 39 | name: "blip_caption" 40 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip_retrieval_flickr.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_retrieval 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_flickr_retrieval.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 12 | 13 | queue_size: 57600 14 | alpha: 0.4 15 | 16 | negative_all_rank: False 17 | 18 | # vit encoder 19 | vit_type: "base" 20 | vit_grad_ckpt: True 21 | vit_ckpt_layer: 4 22 | 23 | image_size: 384 24 | 25 | # bert config 26 | med_config_path: "configs/models/med_config.json" 27 | 28 | embed_dim: 256 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 384 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 384 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip_vqa_aokvqa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_aokvqa.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth" 12 | 13 | # vit encoder 14 | vit_type: "base" 15 | vit_grad_ckpt: False 16 | vit_ckpt_layer: 0 17 | vit_drop_path_rate: 0.1 18 | 19 | image_size: 480 20 | 21 | # bert config 22 | med_config_path: "configs/models/med_config.json" 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 480 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 480 32 | text_processor: 33 | train: 34 | name: "blip_question" 35 | eval: 36 | name: "blip_question" 37 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip_vqa_okvqa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_okvqa.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth" 12 | 13 | # vit encoder 14 | vit_type: "base" 15 | vit_grad_ckpt: False 16 | vit_ckpt_layer: 0 17 | vit_drop_path_rate: 0.1 18 | 19 | image_size: 480 20 | 21 | # bert config 22 | med_config_path: "configs/models/med_config.json" 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 480 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 480 32 | text_processor: 33 | train: 34 | name: "blip_question" 35 | eval: 36 | name: "blip_question" 37 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip_vqav2.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 12 | 13 | # vit encoder 14 | vit_type: "base" 15 | vit_grad_ckpt: False 16 | vit_ckpt_layer: 0 17 | vit_drop_path_rate: 0.1 18 | 19 | image_size: 480 20 | 21 | # bert config 22 | med_config_path: "configs/models/med_config.json" 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 480 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 480 32 | text_processor: 33 | train: 34 | name: "blip_question" 35 | eval: 36 | name: "blip_question" 37 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/RN101-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": [ 7 | 3, 8 | 4, 9 | 23, 10 | 3 11 | ], 12 | "width": 64, 13 | "patch_size": null 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 512, 19 | "heads": 8, 20 | "layers": 12 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/RN101.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": [ 6 | 3, 7 | 4, 8 | 23, 9 | 3 10 | ], 11 | "width": 64, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 512, 18 | "heads": 8, 19 | "layers": 12 20 | } 21 | } 22 | -------------------------------------------------------------------------------- 
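Note: the CLIP *.json files in this configs/models/clip directory describe only model geometry (embedding dimension, vision-tower and text-tower shapes); they carry no weights. As a rough, hypothetical sketch of how one of them could be inspected (this is not the loader used by the vendored lavis/open_clip code, and the relative path below is an assumption about your checkout), plain Python is enough:

    import json

    # Hypothetical path, relative to the repository root; adjust to your checkout.
    cfg_path = "third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/RN101.json"

    with open(cfg_path) as f:
        cfg = json.load(f)

    vision, text = cfg["vision_cfg"], cfg["text_cfg"]
    # ResNet towers list blocks-per-stage under "layers" and leave "patch_size" null;
    # ViT towers use an integer depth and set a concrete "patch_size".
    print("embed_dim:", cfg["embed_dim"])
    print("vision layers:", vision["layers"], "image_size:", vision["image_size"])
    print("text:", text["layers"], "layers,", text["heads"], "heads, width", text["width"])

The *-quickgelu variants differ from their base files only in setting "quick_gelu": true, which selects the QuickGELU activation used by the original OpenAI CLIP checkpoints.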
/third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/RN50-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": [ 7 | 3, 8 | 4, 9 | 6, 10 | 3 11 | ], 12 | "width": 64, 13 | "patch_size": null 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 512, 19 | "heads": 8, 20 | "layers": 12 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/RN50.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": [ 6 | 3, 7 | 4, 8 | 6, 9 | 3 10 | ], 11 | "width": 64, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 512, 18 | "heads": 8, 19 | "layers": 12 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/RN50x16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 384, 5 | "layers": [ 6 | 6, 7 | 8, 8 | 18, 9 | 8 10 | ], 11 | "width": 96, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 768, 18 | "heads": 12, 19 | "layers": 12 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/RN50x4.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 288, 5 | "layers": [ 6 | 4, 7 | 6, 8 | 10, 9 | 6 10 | ], 11 | "width": 80, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 640, 18 | "heads": 10, 19 | "layers": 12 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/ViT-B-16-plus-240.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 240, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/ViT-B-16-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/ViT-B-16.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/ViT-B-32-plus-256.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 256, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/ViT-B-32-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": 12, 7 | "width": 768, 8 | "patch_size": 32 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/ViT-H-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 14 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/ViT-H-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 16 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/ViT-L-14-280.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 280, 5 | "layers": 24, 6 | "width": 1024, 7 | 
"patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/ViT-L-14-336.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 336, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/ViT-L-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/ViT-L-16-320.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 320, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/ViT-L-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/ViT-g-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 40, 6 | "width": 1408, 7 | "head_width": 88, 8 | "mlp_ratio": 4.3637, 9 | "patch_size": 14 10 | }, 11 | "text_cfg": { 12 | "context_length": 77, 13 | "vocab_size": 49408, 14 | "width": 1024, 15 | "heads": 16, 16 | "layers": 24 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/timm-efficientnetv2_rw_s.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "timm_model_name": "efficientnetv2_rw_s", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "abs_attn", 7 | "timm_proj": "", 8 | "image_size": 288 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 768, 14 | "heads": 8, 15 | 
"layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/timm-resnet50d.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "resnet50d", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "abs_attn", 7 | "timm_proj": "", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/timm-resnetaa50d.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "resnetaa50d", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "abs_attn", 7 | "timm_proj": "", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/timm-resnetblur50.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "resnetblur50", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "abs_attn", 7 | "timm_proj": "", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/timm-swin_base_patch4_window7_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "swin_base_patch4_window7_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/timm-vit_base_patch16_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_base_patch16_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/timm-vit_base_patch32_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_base_patch32_224", 5 | 
"timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip/timm-vit_small_patch16_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_small_patch16_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip_resnet50.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: RN50 10 | 11 | pretrained: openai 12 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip_vit_base16.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-B-16 10 | 11 | pretrained: openai 12 | 13 | preprocess: 14 | vis_processor: 15 | eval: 16 | name: "clip_image_eval" 17 | image_size: 224 18 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip_vit_base32.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-B-32 10 | # ['RN50', 11 | # 'RN50-quickgelu', 12 | # 'RN50x4', 13 | # 'RN50x16', 14 | # 'RN101', 15 | # 'RN101-quickgelu', 16 | # 'timm-efficientnetv2_rw_s', 17 | # 'timm-resnet50d', 18 | # 'timm-resnetaa50d', 19 | # 'timm-resnetblur50', 20 | # 'timm-swin_base_patch4_window7_224', 21 | # 'timm-vit_base_patch16_224', 22 | # 'timm-vit_base_patch32_224', 23 | # 'timm-vit_small_patch16_224', 24 | # 'ViT-B-16', 25 | # 'ViT-B-16-plus', 26 | # 'ViT-B-16-plus-240', 27 | # 'ViT-B-32', 28 | # 'ViT-B-32-plus-256', 29 | # 'ViT-B-32-quickgelu', 30 | # 'ViT-g-14', 31 | # 'ViT-H-14', 32 | # 'ViT-H-16', 33 | # 'ViT-L-14', 34 | # 'ViT-L-14-280', 35 | # 'ViT-L-14-336', 36 | # 'ViT-L-16', 37 | # 'ViT-L-16-320'] 38 | 39 | pretrained: openai 40 | # "openai" 41 | # following not available for all models 42 | # "yfcc15m" 43 | # "cc12m" 44 | # "laion400m_e31" 45 | # "laion400m_e32" 46 | # "laion400m_avg" 47 | 48 | preprocess: 49 | vis_processor: 50 | eval: 51 | name: "clip_image_eval" 52 | image_size: 224 53 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip_vit_large14.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-L-14 10 | # ['RN50', 11 | # 'RN50-quickgelu', 12 | # 'RN50x4', 13 | # 'RN50x16', 14 | # 'RN101', 15 | # 'RN101-quickgelu', 16 | # 'timm-efficientnetv2_rw_s', 17 | # 'timm-resnet50d', 18 | # 'timm-resnetaa50d', 19 | # 'timm-resnetblur50', 20 | # 'timm-swin_base_patch4_window7_224', 21 | # 'timm-vit_base_patch16_224', 22 | # 'timm-vit_base_patch32_224', 23 | # 'timm-vit_small_patch16_224', 24 | # 'ViT-B-16', 25 | # 'ViT-B-16-plus', 26 | # 'ViT-B-16-plus-240', 27 | # 'ViT-B-32', 28 | # 'ViT-B-32-plus-256', 29 | # 'ViT-B-32-quickgelu', 30 | # 'ViT-g-14', 31 | # 'ViT-H-14', 32 | # 'ViT-H-16', 33 | # 'ViT-L-14', 34 | # 'ViT-L-14-280', 35 | # 'ViT-L-14-336', 36 | # 'ViT-L-16', 37 | # 'ViT-L-16-320'] 38 | 39 | pretrained: openai 40 | # "openai" 41 | # following not available for all models 42 | # "yfcc15m" 43 | # "cc12m" 44 | # "laion400m_e31" 45 | # "laion400m_e32" 46 | # "laion400m_avg" 47 | 48 | preprocess: 49 | vis_processor: 50 | eval: 51 | name: "clip_image_eval" 52 | image_size: 224 53 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/clip_vit_large14_336.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-L-14-336 10 | # ['RN50', 11 | # 'RN50-quickgelu', 12 | # 'RN50x4', 13 | # 'RN50x16', 14 | # 'RN101', 15 | # 'RN101-quickgelu', 16 | # 'timm-efficientnetv2_rw_s', 17 | # 'timm-resnet50d', 18 | # 'timm-resnetaa50d', 19 | # 'timm-resnetblur50', 20 | # 'timm-swin_base_patch4_window7_224', 21 | # 'timm-vit_base_patch16_224', 22 | # 'timm-vit_base_patch32_224', 23 | # 'timm-vit_small_patch16_224', 24 | # 'ViT-B-16', 25 | # 'ViT-B-16-plus', 26 | # 'ViT-B-16-plus-240', 27 | # 'ViT-B-32', 28 | # 'ViT-B-32-plus-256', 29 | # 'ViT-B-32-quickgelu', 30 | # 'ViT-g-14', 31 | # 'ViT-H-14', 32 | # 'ViT-H-16', 33 | # 'ViT-L-14', 34 | # 'ViT-L-14-280', 35 | # 'ViT-L-14-336', 36 | # 'ViT-L-16', 37 | # 'ViT-L-16-320'] 38 | 39 | pretrained: openai 40 | # "openai" 41 | # following not available for all models 42 | # "yfcc15m" 43 | # "cc12m" 44 | # "laion400m_e31" 45 | # "laion400m_e32" 46 | # "laion400m_avg" 47 | 48 | preprocess: 49 | vis_processor: 50 | eval: 51 | name: "clip_image_eval" 52 | image_size: 336 53 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/gpt_dialogue_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: gpt_dialogue 8 | # pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth" 9 | # pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 10 | 11 | len_tokenizer: 50264 # 50257 tokens from gpt2 default tokenizer + additional special tokens 12 | 13 | len_video_ft: 4224 # i3d_rgb: 2048 i3d_flow: 2048 vggish: 128 14 | 15 | preprocess: 16 | vis_processor: 17 | train: 18 | name: "gpt_video_ft" 19 | eval: 20 | name: "gpt_video_ft" 21 | text_processor: 22 | train: 23 | name: "gpt_dialogue" 24 | eval: 25 | name: "gpt_dialogue" -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/img2prompt-vqa/img2prompt_vqa_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: img2prompt_vqa 8 | model_type: base 9 | 10 | image_question_matching_model: 11 | arch: blip_image_text_matching 12 | load_finetuned: True 13 | 14 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_retrieval_coco_train2014.pth" 15 | 16 | # vit encoder 17 | vit_type: "large" 18 | vit_grad_ckpt: False 19 | vit_ckpt_layer: 0 20 | 21 | image_size: 384 22 | 23 | # bert config 24 | med_config_path: "configs/models/med_large_config.json" 25 | 26 | embed_dim: 256 27 | 28 | image_captioning_model: 29 | arch: blip_caption 30 | load_finetuned: True 31 | 32 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption_coco_train2014.pth" 33 | 34 | vit_type: "large" 35 | vit_grad_ckpt: True 36 | vit_ckpt_layer: 5 37 | 38 | image_size: 384 39 | 40 | # bert config 41 | med_config_path: "configs/models/med_large_config.json" 42 | 43 | # generation configs 44 | prompt: "a picture of " 45 | 46 | question_generation_moodel: 47 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/projects/img2prompt/T5_large_QG.pth" 48 | 49 | 50 | 51 | preprocess: 52 | vis_processor: 53 | eval: 54 | name: "blip_image_eval" 55 | image_size: 384 56 | text_processor: 57 | eval: 58 | name: "blip_caption" 59 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": false, 18 | "vocab_size": 30524, 19 | "encoder_width": 768, 20 | "add_cross_attention": true 21 | } -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": false, 18 | "vocab_size": 30522, 19 | "encoder_width": 768, 20 | "add_cross_attention": true, 21 | "fusion_layer": 6 22 | } -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | 
"hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": false, 18 | "vocab_size": 30524, 19 | "encoder_width": 1024, 20 | "add_cross_attention": true 21 | } -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/pnp-vqa/pnp_vqa_3b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pnp_vqa 8 | model_type: 3b 9 | 10 | image_question_matching_model: 11 | arch: blip_image_text_matching 12 | load_finetuned: True 13 | 14 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_retrieval_coco_train2014.pth" 15 | 16 | # vit encoder 17 | vit_type: "large" 18 | vit_grad_ckpt: False 19 | vit_ckpt_layer: 0 20 | 21 | image_size: 384 22 | 23 | # bert config 24 | med_config_path: "configs/models/med_large_config.json" 25 | 26 | embed_dim: 256 27 | 28 | image_captioning_model: 29 | arch: blip_caption 30 | load_finetuned: True 31 | 32 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption_coco_train2014.pth" 33 | 34 | vit_type: "large" 35 | vit_grad_ckpt: True 36 | vit_ckpt_layer: 5 37 | 38 | image_size: 384 39 | 40 | # bert config 41 | med_config_path: "configs/models/med_large_config.json" 42 | 43 | # generation configs 44 | prompt: "a picture of " 45 | 46 | question_answering_model: 47 | arch: pnp_unifiedqav2_fid 48 | 49 | pretrained: "allenai/unifiedqa-v2-t5-3b-1363200" 50 | 51 | t5_config_path: "configs/models/pnp-vqa/unifiedqav2_3b_config.json" 52 | 53 | preprocess: 54 | vis_processor: 55 | eval: 56 | name: "blip_image_eval" 57 | image_size: 384 58 | text_processor: 59 | eval: 60 | name: "blip_caption" 61 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/pnp-vqa/pnp_vqa_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pnp_vqa 8 | model_type: base 9 | 10 | image_question_matching_model: 11 | arch: blip_image_text_matching 12 | load_finetuned: True 13 | 14 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_retrieval_coco_train2014.pth" 15 | 16 | # vit encoder 17 | vit_type: "large" 18 | vit_grad_ckpt: False 19 | vit_ckpt_layer: 0 20 | 21 | image_size: 384 22 | 23 | # bert config 24 | med_config_path: "configs/models/med_large_config.json" 25 | 26 | embed_dim: 256 27 | 28 | image_captioning_model: 29 | arch: blip_caption 30 | load_finetuned: True 31 | 32 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption_coco_train2014.pth" 33 | 34 | vit_type: "large" 35 | vit_grad_ckpt: True 36 | vit_ckpt_layer: 5 37 | 38 | image_size: 384 39 | 40 | # bert config 41 | med_config_path: "configs/models/med_large_config.json" 42 | 43 | # generation configs 44 | prompt: "a picture of " 45 | question_answering_model: 46 | arch: pnp_unifiedqav2_fid 47 | 48 | pretrained: "allenai/unifiedqa-v2-t5-base-1363200" 49 | 50 | t5_config_path: "configs/models/pnp-vqa/unifiedqav2_base_config.json" 51 | 52 | preprocess: 53 | vis_processor: 54 | eval: 55 | name: "blip_image_eval" 56 | image_size: 384 57 | text_processor: 58 | eval: 59 | name: "blip_caption" 60 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/pnp-vqa/pnp_vqa_large.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pnp_vqa 8 | model_type: large 9 | 10 | image_question_matching_model: 11 | arch: blip_image_text_matching 12 | load_finetuned: True 13 | 14 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_retrieval_coco_train2014.pth" 15 | 16 | # vit encoder 17 | vit_type: "large" 18 | vit_grad_ckpt: False 19 | vit_ckpt_layer: 0 20 | 21 | image_size: 384 22 | 23 | # bert config 24 | med_config_path: "configs/models/med_large_config.json" 25 | 26 | embed_dim: 256 27 | 28 | image_captioning_model: 29 | arch: blip_caption 30 | load_finetuned: True 31 | 32 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption_coco_train2014.pth" 33 | 34 | vit_type: "large" 35 | vit_grad_ckpt: True 36 | vit_ckpt_layer: 5 37 | 38 | image_size: 384 39 | 40 | # bert config 41 | med_config_path: "configs/models/med_large_config.json" 42 | 43 | # generation configs 44 | prompt: "a picture of " 45 | 46 | question_answering_model: 47 | arch: pnp_unifiedqav2_fid 48 | 49 | pretrained: "allenai/unifiedqa-v2-t5-large-1363200" 50 | 51 | t5_config_path: "configs/models/pnp-vqa/unifiedqav2_large_config.json" 52 | 53 | preprocess: 54 | vis_processor: 55 | eval: 56 | name: "blip_image_eval" 57 | image_size: 384 58 | text_processor: 59 | eval: 60 | name: "blip_caption" 61 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/pnp-vqa/unifiedqav2_3b_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "T5ForConditionalGeneration" 4 | ], 5 | "d_ff": 16384, 6 | "d_kv": 128, 7 | "d_model": 1024, 8 | "decoder_start_token_id": 0, 9 | "dense_act_fn": "relu", 10 | "dropout_rate": 0.1, 11 | "eos_token_id": 1, 12 | "feed_forward_proj": "relu", 13 | "gradient_checkpointing": false, 14 | "initializer_factor": 1.0, 15 | "is_encoder_decoder": true, 16 | "is_gated_act": false, 17 | "layer_norm_epsilon": 1e-06, 18 | "model_type": "t5", 19 | "n_positions": 512, 20 | "num_decoder_layers": 24, 21 | "num_heads": 32, 22 | "num_layers": 24, 23 | "output_past": true, 24 | "pad_token_id": 0, 25 | "relative_attention_max_distance": 128, 26 | "relative_attention_num_buckets": 32, 27 | "task_specific_params": { 28 | "summarization": { 29 | "early_stopping": true, 30 | "length_penalty": 2.0, 31 | "max_length": 200, 32 | "min_length": 30, 33 | "no_repeat_ngram_size": 3, 34 | "num_beams": 4, 35 | "prefix": "summarize: " 36 | }, 37 | "translation_en_to_de": { 38 | "early_stopping": true, 39 | "max_length": 300, 40 | "num_beams": 4, 41 | "prefix": "translate English to German: " 42 | }, 43 | "translation_en_to_fr": { 44 | "early_stopping": true, 45 | "max_length": 300, 46 | "num_beams": 4, 47 | "prefix": "translate English to French: " 48 | }, 49 | "translation_en_to_ro": { 50 | "early_stopping": true, 51 | "max_length": 300, 52 | "num_beams": 4, 53 | "prefix": "translate English to Romanian: " 54 | } 55 | }, 56 | "torch_dtype": "float32", 57 | "transformers_version": "4.21.3", 58 | "use_cache": true, 59 | "vocab_size": 32128 60 | } -------------------------------------------------------------------------------- 
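The pnp-vqa YAMLs above point their question-answering model at UnifiedQA-v2 T5 checkpoints ("allenai/unifiedqa-v2-t5-{base,large,3b}-1363200") and at matching unifiedqav2_*_config.json files, of which the 3b variant appears directly above (the base and large variants follow). A minimal sketch, for illustration only and not how LAVIS itself builds its question-answering model, that loads the 3b config with Hugging Face transformers and checks a few of its dimensions against the JSON:

from transformers import T5Config, T5ForConditionalGeneration

# Repo-relative path as shown in the file header above; pnp_vqa_3b.yaml refers
# to the same file via its package-relative t5_config_path.
cfg = T5Config.from_json_file(
    "third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/pnp-vqa/unifiedqav2_3b_config.json"
)
print(cfg.d_model, cfg.num_layers, cfg.num_heads)  # 1024 24 32, matching the JSON above

# The weights themselves come from the Hub checkpoint named in the YAML
# (a multi-gigabyte download; shown here only to connect the two settings).
model = T5ForConditionalGeneration.from_pretrained("allenai/unifiedqa-v2-t5-3b-1363200")
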
/third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/pnp-vqa/unifiedqav2_base_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "T5ForConditionalGeneration" 4 | ], 5 | "d_ff": 3072, 6 | "d_kv": 64, 7 | "d_model": 768, 8 | "decoder_start_token_id": 0, 9 | "dense_act_fn": "relu", 10 | "dropout_rate": 0.1, 11 | "eos_token_id": 1, 12 | "feed_forward_proj": "relu", 13 | "gradient_checkpointing": false, 14 | "initializer_factor": 1.0, 15 | "is_encoder_decoder": true, 16 | "is_gated_act": false, 17 | "layer_norm_epsilon": 1e-06, 18 | "model_type": "t5", 19 | "n_positions": 512, 20 | "num_decoder_layers": 12, 21 | "num_heads": 12, 22 | "num_layers": 12, 23 | "output_past": true, 24 | "pad_token_id": 0, 25 | "relative_attention_max_distance": 128, 26 | "relative_attention_num_buckets": 32, 27 | "task_specific_params": { 28 | "summarization": { 29 | "early_stopping": true, 30 | "length_penalty": 2.0, 31 | "max_length": 200, 32 | "min_length": 30, 33 | "no_repeat_ngram_size": 3, 34 | "num_beams": 4, 35 | "prefix": "summarize: " 36 | }, 37 | "translation_en_to_de": { 38 | "early_stopping": true, 39 | "max_length": 300, 40 | "num_beams": 4, 41 | "prefix": "translate English to German: " 42 | }, 43 | "translation_en_to_fr": { 44 | "early_stopping": true, 45 | "max_length": 300, 46 | "num_beams": 4, 47 | "prefix": "translate English to French: " 48 | }, 49 | "translation_en_to_ro": { 50 | "early_stopping": true, 51 | "max_length": 300, 52 | "num_beams": 4, 53 | "prefix": "translate English to Romanian: " 54 | } 55 | }, 56 | "transformers_version": "4.21.3", 57 | "use_cache": true, 58 | "vocab_size": 32128 59 | } -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/pnp-vqa/unifiedqav2_large_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "T5ForConditionalGeneration" 4 | ], 5 | "d_ff": 4096, 6 | "d_kv": 64, 7 | "d_model": 1024, 8 | "decoder_start_token_id": 0, 9 | "dense_act_fn": "relu", 10 | "dropout_rate": 0.1, 11 | "eos_token_id": 1, 12 | "feed_forward_proj": "relu", 13 | "gradient_checkpointing": false, 14 | "initializer_factor": 1.0, 15 | "is_encoder_decoder": true, 16 | "is_gated_act": false, 17 | "layer_norm_epsilon": 1e-06, 18 | "model_type": "t5", 19 | "n_positions": 512, 20 | "num_decoder_layers": 24, 21 | "num_heads": 16, 22 | "num_layers": 24, 23 | "output_past": true, 24 | "pad_token_id": 0, 25 | "relative_attention_max_distance": 128, 26 | "relative_attention_num_buckets": 32, 27 | "task_specific_params": { 28 | "summarization": { 29 | "early_stopping": true, 30 | "length_penalty": 2.0, 31 | "max_length": 200, 32 | "min_length": 30, 33 | "no_repeat_ngram_size": 3, 34 | "num_beams": 4, 35 | "prefix": "summarize: " 36 | }, 37 | "translation_en_to_de": { 38 | "early_stopping": true, 39 | "max_length": 300, 40 | "num_beams": 4, 41 | "prefix": "translate English to German: " 42 | }, 43 | "translation_en_to_fr": { 44 | "early_stopping": true, 45 | "max_length": 300, 46 | "num_beams": 4, 47 | "prefix": "translate English to French: " 48 | }, 49 | "translation_en_to_ro": { 50 | "early_stopping": true, 51 | "max_length": 300, 52 | "num_beams": 4, 53 | "prefix": "translate English to Romanian: " 54 | } 55 | }, 56 | "transformers_version": "4.21.3", 57 | "use_cache": true, 58 | "vocab_size": 32128 59 | } 
-------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/models/albef_models/albef_outputs.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from dataclasses import dataclass 9 | from typing import Optional 10 | 11 | import torch 12 | from transformers.modeling_outputs import ( 13 | BaseModelOutputWithPoolingAndCrossAttentions, 14 | CausalLMOutputWithCrossAttentions, 15 | ModelOutput, 16 | ) 17 | 18 | 19 | @dataclass 20 | class AlbefSimilarity(ModelOutput): 21 | sim_i2t: torch.FloatTensor = None 22 | sim_t2i: torch.FloatTensor = None 23 | 24 | sim_i2t_m: Optional[torch.FloatTensor] = None 25 | sim_t2i_m: Optional[torch.FloatTensor] = None 26 | 27 | sim_i2t_targets: Optional[torch.FloatTensor] = None 28 | sim_t2i_targets: Optional[torch.FloatTensor] = None 29 | 30 | 31 | @dataclass 32 | class AlbefIntermediateOutput(ModelOutput): 33 | # uni-modal features 34 | image_embeds: torch.FloatTensor = None 35 | text_embeds: Optional[torch.FloatTensor] = None 36 | 37 | image_embeds_m: Optional[torch.FloatTensor] = None 38 | text_embeds_m: Optional[torch.FloatTensor] = None 39 | 40 | # intermediate outputs of multimodal encoder 41 | encoder_output: Optional[BaseModelOutputWithPoolingAndCrossAttentions] = None 42 | encoder_output_m: Optional[BaseModelOutputWithPoolingAndCrossAttentions] = None 43 | encoder_output_neg: Optional[BaseModelOutputWithPoolingAndCrossAttentions] = None 44 | 45 | itm_logits: Optional[torch.FloatTensor] = None 46 | itm_labels: Optional[torch.LongTensor] = None 47 | 48 | # intermediate outputs of multimodal decoder 49 | decoder_output: Optional[CausalLMOutputWithCrossAttentions] = None 50 | decoder_labels: Optional[torch.LongTensor] = None 51 | 52 | 53 | @dataclass 54 | class AlbefOutput(ModelOutput): 55 | # some finetuned models (e.g. BlipVQA) do not compute similarity, thus optional. 56 | sims: Optional[AlbefSimilarity] = None 57 | 58 | intermediate_output: AlbefIntermediateOutput = None 59 | 60 | loss: Optional[torch.FloatTensor] = None 61 | 62 | loss_itc: Optional[torch.FloatTensor] = None 63 | 64 | loss_itm: Optional[torch.FloatTensor] = None 65 | 66 | loss_mlm: Optional[torch.FloatTensor] = None 67 | 68 | 69 | @dataclass 70 | class AlbefOutputWithLogits(AlbefOutput): 71 | logits: torch.FloatTensor = None 72 | logits_m: torch.FloatTensor = None 73 | 74 | 75 | @dataclass 76 | class AlbefOutputFeatures(ModelOutput): 77 | """ 78 | Data class of features from AlbefFeatureExtractor. 79 | 80 | Args: 81 | image_embeds: `torch.FloatTensor` of shape `(batch_size, num_patches+1, embed_dim)`, `optional` 82 | image_features: `torch.FloatTensor` of shape `(batch_size, num_patches+1, feature_dim)`, `optional` 83 | text_embeds: `torch.FloatTensor` of shape `(batch_size, sequence_length+1, embed_dim)`, `optional` 84 | text_features: `torch.FloatTensor` of shape `(batch_size, sequence_length+1, feature_dim)`, `optional` 85 | 86 | The first embedding or feature is for the [CLS] token. 87 | 88 | Features are obtained by projecting the corresponding embedding into a normalized low-dimensional space. 
89 | """ 90 | 91 | image_embeds: Optional[torch.FloatTensor] = None 92 | image_embeds_proj: Optional[torch.FloatTensor] = None 93 | 94 | text_embeds: Optional[torch.FloatTensor] = None 95 | text_embeds_proj: Optional[torch.FloatTensor] = None 96 | 97 | multimodal_embeds: Optional[torch.FloatTensor] = None 98 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KAIST-Visual-AI-Group/Flow-Inference-Time-Scaling/72fba24af5ac8d5ce6c90d3b0c68193d188272b3/third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import logging 9 | import os 10 | 11 | import torch 12 | from ...common.dist_utils import download_cached_file 13 | from ...common.utils import is_url 14 | from ...models.base_model import BaseModel 15 | from ...models.vit import interpolate_pos_embed 16 | from transformers import BertTokenizer 17 | 18 | 19 | class BlipBase(BaseModel): 20 | @classmethod 21 | def init_tokenizer(cls): 22 | tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") 23 | tokenizer.add_special_tokens({"bos_token": "[DEC]"}) 24 | tokenizer.add_special_tokens({"additional_special_tokens": ["[ENC]"]}) 25 | tokenizer.enc_token_id = tokenizer.additional_special_tokens_ids[0] 26 | return tokenizer 27 | 28 | def load_from_pretrained(self, url_or_filename): 29 | if is_url(url_or_filename): 30 | cached_file = download_cached_file( 31 | url_or_filename, check_hash=False, progress=True 32 | ) 33 | checkpoint = torch.load(cached_file, map_location="cpu") 34 | elif os.path.isfile(url_or_filename): 35 | checkpoint = torch.load(url_or_filename, map_location="cpu") 36 | else: 37 | raise RuntimeError("checkpoint url or path is invalid") 38 | 39 | state_dict = checkpoint["model"] 40 | 41 | state_dict["visual_encoder.pos_embed"] = interpolate_pos_embed( 42 | state_dict["visual_encoder.pos_embed"], self.visual_encoder 43 | ) 44 | if "visual_encoder_m.pos_embed" in self.state_dict().keys(): 45 | state_dict["visual_encoder_m.pos_embed"] = interpolate_pos_embed( 46 | state_dict["visual_encoder_m.pos_embed"], self.visual_encoder_m 47 | ) 48 | 49 | for key in self.state_dict().keys(): 50 | if key in state_dict.keys(): 51 | if state_dict[key].shape != self.state_dict()[key].shape: 52 | del state_dict[key] 53 | 54 | msg = self.load_state_dict(state_dict, strict=False) 55 | 56 | logging.info("Missing keys {}".format(msg.missing_keys)) 57 | logging.info("load checkpoint from %s" % url_or_filename) 58 | 59 | return msg 60 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from .base_processor import BaseProcessor 9 | 10 | from .blip_processors import ( 11 | BlipImageTrainProcessor, 12 | Blip2ImageTrainProcessor, 13 | BlipImageEvalProcessor, 14 | BlipCaptionProcessor, 15 | ) 16 | 17 | from ..common.registry import registry 18 | 19 | __all__ = [ 20 | "BaseProcessor", 21 | # BLIP 22 | "BlipImageTrainProcessor", 23 | "Blip2ImageTrainProcessor", 24 | "BlipImageEvalProcessor", 25 | "BlipCaptionProcessor", 26 | ] 27 | 28 | 29 | def load_processor(name, cfg=None): 30 | """ 31 | Example 32 | 33 | >>> processor = load_processor("alpro_video_train", cfg=None) 34 | """ 35 | processor = registry.get_processor_class(name).from_config(cfg) 36 | 37 | return processor 38 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from omegaconf import OmegaConf 9 | 10 | 11 | class BaseProcessor: 12 | def __init__(self): 13 | self.transform = lambda x: x 14 | return 15 | 16 | def __call__(self, item): 17 | return self.transform(item) 18 | 19 | @classmethod 20 | def from_config(cls, cfg=None): 21 | return cls() 22 | 23 | def build(self, **kwargs): 24 | cfg = OmegaConf.create(kwargs) 25 | 26 | return self.from_config(cfg) 27 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/llava/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import LlavaLlamaForCausalLM, LlavaConfig, ModelArguments 2 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/llava/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .language_model.llava_llama import LlavaLlamaForCausalLM, LlavaConfig, ModelArguments -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/llava/model/multimodal_encoder/builder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .clip_encoder import CLIPVisionTower 3 | 4 | 5 | def build_vision_tower(vision_tower_cfg, **kwargs): 6 | vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None)) 7 | is_absolute_path_exists = os.path.exists(vision_tower) 8 | if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion") or vision_tower.startswith("Lin-Chen"): 9 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 10 | 11 | raise ValueError(f'Unknown vision tower: {vision_tower}') 12 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/llava/model/multimodal_encoder/clip_encoder.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig 5 | 6 | 7 | class CLIPVisionTower(nn.Module): 8 | def __init__(self, vision_tower, args, delay_load=False): 9 | super().__init__() 10 | 11 | self.is_loaded = False 12 | 13 | self.vision_tower_name = vision_tower 14 | self.select_layer = args.mm_vision_select_layer 15 | self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch') 16 | 17 | if not delay_load: 18 | self.load_model() 19 | else: 20 | self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name) 21 | 22 | def load_model(self): 23 | self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name) 24 | self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name) 25 | self.vision_tower.requires_grad_(False) 26 | 27 | self.is_loaded = True 28 | 29 | def feature_select(self, image_forward_outs): 30 | image_features = image_forward_outs.hidden_states[self.select_layer] 31 | if self.select_feature == 'patch': 32 | image_features = image_features[:, 1:] 33 | elif self.select_feature == 'cls_patch': 34 | image_features = image_features 35 | else: 36 | raise ValueError(f'Unexpected select feature: {self.select_feature}') 37 | return image_features 38 | 39 | @torch.no_grad() 40 | def forward(self, images): 41 | if type(images) is list: 42 | image_features = [] 43 | for image in images: 44 | image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True) 45 | image_feature = self.feature_select(image_forward_out).to(image.dtype) 46 | image_features.append(image_feature) 47 | else: 48 | image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True) 49 | image_features = self.feature_select(image_forward_outs).to(images.dtype) 50 | 51 | return image_features 52 | 53 | @property 54 | def dummy_feature(self): 55 | return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype) 56 | 57 | @property 58 | def dtype(self): 59 | return self.vision_tower.dtype 60 | 61 | @property 62 | def device(self): 63 | return self.vision_tower.device 64 | 65 | @property 66 | def config(self): 67 | if self.is_loaded: 68 | return self.vision_tower.config 69 | else: 70 | return self.cfg_only 71 | 72 | @property 73 | def hidden_size(self): 74 | return self.config.hidden_size 75 | 76 | @property 77 | def num_patches(self): 78 | return (self.config.image_size // self.config.patch_size) ** 2 79 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/llava/model/multimodal_projector/builder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import re 4 | 5 | 6 | class IdentityMap(nn.Module): 7 | def __init__(self): 8 | super().__init__() 9 | 10 | def forward(self, x, *args, **kwargs): 11 | return x 12 | 13 | @property 14 | def config(self): 15 | return {"mm_projector_type": 'identity'} 16 | 17 | 18 | class SimpleResBlock(nn.Module): 19 | def __init__(self, channels): 20 | super().__init__() 21 | self.pre_norm = nn.LayerNorm(channels) 22 | 23 | self.proj = nn.Sequential( 24 | nn.Linear(channels, channels), 25 | nn.GELU(), 26 | nn.Linear(channels, channels) 27 | ) 28 | def forward(self, x): 29 | x = self.pre_norm(x) 30 | 
return x + self.proj(x) 31 | 32 | 33 | def build_vision_projector(config, delay_load=False, **kwargs): 34 | projector_type = getattr(config, 'mm_projector_type', 'linear') 35 | 36 | if projector_type == 'linear': 37 | return nn.Linear(config.mm_hidden_size, config.hidden_size) 38 | 39 | mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type) 40 | if mlp_gelu_match: 41 | mlp_depth = int(mlp_gelu_match.group(1)) 42 | modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)] 43 | for _ in range(1, mlp_depth): 44 | modules.append(nn.GELU()) 45 | modules.append(nn.Linear(config.hidden_size, config.hidden_size)) 46 | return nn.Sequential(*modules) 47 | 48 | if projector_type == 'identity': 49 | return IdentityMap() 50 | 51 | raise ValueError(f'Unknown projector type: {projector_type}') 52 | -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/llava_16/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import LlavaLlamaForCausalLM, LlavaConfig -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/llava_16/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .language_model.llava_llama import LlavaLlamaForCausalLM, LlavaConfig -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/llava_16/model/multimodal_encoder/builder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .clip_encoder import CLIPVisionTower 3 | 4 | 5 | def build_vision_tower(vision_tower_cfg, **kwargs): 6 | vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None)) 7 | is_absolute_path_exists = os.path.exists(vision_tower) 8 | if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion") or "ShareGPT4V" in vision_tower: 9 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 10 | 11 | raise ValueError(f'Unknown vision tower: {vision_tower}') -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/llava_16/model/multimodal_encoder/clip_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig 5 | 6 | 7 | class CLIPVisionTower(nn.Module): 8 | def __init__(self, vision_tower, args, delay_load=False): 9 | super().__init__() 10 | 11 | self.is_loaded = False 12 | 13 | self.vision_tower_name = vision_tower 14 | self.select_layer = args.mm_vision_select_layer 15 | self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch') 16 | 17 | if not delay_load: 18 | self.load_model() 19 | elif getattr(args, 'unfreeze_mm_vision_tower', False): 20 | self.load_model() 21 | else: 22 | self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name) 23 | 24 | def load_model(self): 25 | self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name) 26 | self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name) 27 | self.vision_tower.requires_grad_(False) 28 | 29 | self.is_loaded = True 30 | 31 | def 
feature_select(self, image_forward_outs): 32 | image_features = image_forward_outs.hidden_states[self.select_layer] 33 | if self.select_feature == 'patch': 34 | image_features = image_features[:, 1:] 35 | elif self.select_feature == 'cls_patch': 36 | image_features = image_features 37 | else: 38 | raise ValueError(f'Unexpected select feature: {self.select_feature}') 39 | return image_features 40 | 41 | @torch.no_grad() 42 | def forward(self, images): 43 | if type(images) is list: 44 | image_features = [] 45 | for image in images: 46 | image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True) 47 | image_feature = self.feature_select(image_forward_out).to(image.dtype) 48 | image_features.append(image_feature) 49 | else: 50 | image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True) 51 | image_features = self.feature_select(image_forward_outs).to(images.dtype) 52 | 53 | return image_features 54 | 55 | @property 56 | def dummy_feature(self): 57 | return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype) 58 | 59 | @property 60 | def dtype(self): 61 | return self.vision_tower.dtype 62 | 63 | @property 64 | def device(self): 65 | return self.vision_tower.device 66 | 67 | @property 68 | def config(self): 69 | if self.is_loaded: 70 | return self.vision_tower.config 71 | else: 72 | return self.cfg_only 73 | 74 | @property 75 | def hidden_size(self): 76 | return self.config.hidden_size 77 | 78 | @property 79 | def num_patches_per_side(self): 80 | return self.config.image_size // self.config.patch_size 81 | 82 | @property 83 | def num_patches(self): 84 | return (self.config.image_size // self.config.patch_size) ** 2 -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/llava_16/model/multimodal_projector/builder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import re 4 | 5 | 6 | class IdentityMap(nn.Module): 7 | def __init__(self): 8 | super().__init__() 9 | 10 | def forward(self, x, *args, **kwargs): 11 | return x 12 | 13 | @property 14 | def config(self): 15 | return {"mm_projector_type": 'identity'} 16 | 17 | 18 | class SimpleResBlock(nn.Module): 19 | def __init__(self, channels): 20 | super().__init__() 21 | self.pre_norm = nn.LayerNorm(channels) 22 | 23 | self.proj = nn.Sequential( 24 | nn.Linear(channels, channels), 25 | nn.GELU(), 26 | nn.Linear(channels, channels) 27 | ) 28 | def forward(self, x): 29 | x = self.pre_norm(x) 30 | return x + self.proj(x) 31 | 32 | 33 | def build_vision_projector(config, delay_load=False, **kwargs): 34 | projector_type = getattr(config, 'mm_projector_type', 'linear') 35 | 36 | if projector_type == 'linear': 37 | return nn.Linear(config.mm_hidden_size, config.hidden_size) 38 | 39 | mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type) 40 | if mlp_gelu_match: 41 | mlp_depth = int(mlp_gelu_match.group(1)) 42 | modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)] 43 | for _ in range(1, mlp_depth): 44 | modules.append(nn.GELU()) 45 | modules.append(nn.Linear(config.hidden_size, config.hidden_size)) 46 | return nn.Sequential(*modules) 47 | 48 | if projector_type == 'identity': 49 | return IdentityMap() 50 | 51 | raise ValueError(f'Unknown projector type: {projector_type}') 52 | 
-------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/models/vqascore_models/vqa_model.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | from typing import List 3 | import torch 4 | 5 | from ..model import ScoreModel 6 | 7 | class VQAScoreModel(ScoreModel): 8 | 9 | @abstractmethod 10 | def forward(self, 11 | images: List[str], 12 | texts: List[str], 13 | question_template: str, 14 | answer_template: str) -> torch.Tensor: 15 | """Forward pass of the model to return n scores for n (image, text) pairs (in PyTorch Tensor) 16 | question_template: a string with optional {} to be replaced with the 'text' 17 | answer_template: a string with optional {} to be replaced with the 'text' 18 | """ 19 | pass -------------------------------------------------------------------------------- /third-party/t2v_metrics/t2v_metrics/vqascore.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from .score import Score 4 | 5 | from .constants import HF_CACHE_DIR 6 | 7 | from .models.vqascore_models import list_all_vqascore_models, get_vqascore_model 8 | 9 | class VQAScore(Score): 10 | def prepare_scoremodel(self, 11 | model='clip-flant5-xxl', 12 | device='cuda', 13 | cache_dir=HF_CACHE_DIR, 14 | **kwargs): 15 | return get_vqascore_model( 16 | model, 17 | device=device, 18 | cache_dir=cache_dir, 19 | **kwargs 20 | ) 21 | 22 | def list_all_models(self) -> List[str]: 23 | return list_all_vqascore_models() --------------------------------------------------------------------------------
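vqa_model.py fixes the contract that every VQA-based backend must implement (a forward pass over lists of image paths and texts, parameterized by question/answer templates), and vqascore.py exposes those backends through the VQAScore wrapper. A minimal usage sketch, assuming the t2v_metrics package is installed and that the Score base class (defined in score.py, not included in this excerpt) makes instances callable on images=/texts= lists; the image path and caption below are placeholders:

import t2v_metrics

# 'clip-flant5-xxl' is the default backend named in VQAScore.prepare_scoremodel;
# list_all_models() enumerates the other registered VQAScore backends.
scorer = t2v_metrics.VQAScore(model='clip-flant5-xxl', device='cuda')
print(scorer.list_all_models())

# One alignment score per (image, text) pair; images are file paths,
# matching the VQAScoreModel.forward signature above.
scores = scorer(images=['path/to/generated_image.png'],
                texts=['a photo of a corgi wearing a red hat'])
print(scores)
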